1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" 18 19 #include "defines.h" 20 #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" 21 #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" 22 #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" 23 24 namespace latinime { 25 26 typedef PatriciaTrieReadingUtils PtReadingUtils; 27 28 const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; 29 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; 30 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; 31 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; 32 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; 33 34 // Flag for single/multiple char group 35 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; 36 // Flag for terminal PtNodes 37 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; 38 // Flag for shortcut targets presence 39 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; 40 // Flag for bigram presence 41 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; 42 // Flag for non-words (typically, shortcut only entries) 43 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; 44 // Flag for blacklist 45 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; 46 47 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( 48 const uint8_t *const buffer, int *const pos) { 49 const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 50 if (firstByte < 0x80) { 51 return firstByte; 52 } else { 53 return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( 54 buffer, pos); 55 } 56 } 57 58 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( 59 const uint8_t *const buffer, int *const pos) { 60 return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 61 } 62 63 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, 64 int *const pos) { 65 return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos); 66 } 67 68 // Returns the number of read characters. 69 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, 70 const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) { 71 int length = 0; 72 if (hasMultipleChars(flags)) { 73 length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, 74 pos); 75 } else { 76 const int codePoint = getCodePointAndAdvancePosition(buffer, pos); 77 if (codePoint == NOT_A_CODE_POINT) { 78 // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is 79 // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR 80 // when the PtNode has a single code point. 81 length = 0; 82 AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", 83 *pos - 1, codePoint, buffer[*pos - 1]); 84 ASSERT(false); 85 } else if (maxLength > 0) { 86 outBuffer[0] = codePoint; 87 length = 1; 88 } 89 } 90 return length; 91 } 92 93 // Returns the number of skipped characters. 94 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, 95 const int maxLength, int *const pos) { 96 if (hasMultipleChars(flags)) { 97 return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); 98 } else { 99 if (maxLength > 0) { 100 getCodePointAndAdvancePosition(buffer, pos); 101 return 1; 102 } else { 103 return 0; 104 } 105 } 106 } 107 108 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, 109 int *const pos) { 110 return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 111 } 112 113 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( 114 const uint8_t *const buffer, const NodeFlags flags, int *const pos) { 115 const int base = *pos; 116 int offset = 0; 117 switch (MASK_CHILDREN_POSITION_TYPE & flags) { 118 case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: 119 offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 120 break; 121 case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: 122 offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); 123 break; 124 case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: 125 offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); 126 break; 127 default: 128 // If we come here, it means we asked for the children of a word with 129 // no children. 130 return NOT_A_DICT_POS; 131 } 132 return base + offset; 133 } 134 135 /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, 136 const DictionaryShortcutsStructurePolicy *const shortcutPolicy, 137 const DictionaryBigramsStructurePolicy *const bigramPolicy, 138 NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, 139 int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, 140 int *const outBigramPos, int *const outSiblingPos) { 141 int readingPos = ptNodePos; 142 const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); 143 *outFlags = flags; 144 *outCodePointCount = getCharsAndAdvancePosition( 145 dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos); 146 *outProbability = isTerminal(flags) ? 147 readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; 148 *outChildrenPos = hasChildrenInFlags(flags) ? 149 readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS; 150 *outShortcutPos = NOT_A_DICT_POS; 151 if (hasShortcutTargets(flags)) { 152 *outShortcutPos = readingPos; 153 shortcutPolicy->skipAllShortcuts(&readingPos); 154 } 155 *outBigramPos = NOT_A_DICT_POS; 156 if (hasBigrams(flags)) { 157 *outBigramPos = readingPos; 158 bigramPolicy->skipAllBigrams(&readingPos); 159 } 160 *outSiblingPos = readingPos; 161 } 162 163 } // namespace latinime 164