1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" 18 19 #include "defines.h" 20 #include "dictionary/interface/dictionary_bigrams_structure_policy.h" 21 #include "dictionary/interface/dictionary_shortcuts_structure_policy.h" 22 #include "dictionary/utils/byte_array_utils.h" 23 24 namespace latinime { 25 26 typedef PatriciaTrieReadingUtils PtReadingUtils; 27 28 const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; 29 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; 30 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; 31 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; 32 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; 33 34 // Flag for single/multiple char group 35 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; 36 // Flag for terminal PtNodes 37 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; 38 // Flag for shortcut targets presence 39 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; 40 // Flag for bigram presence 41 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; 42 // Flag for non-words (typically, shortcut only entries) 43 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; 44 // Flag for possibly offensive words 45 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; 46 47 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( 48 const uint8_t *const buffer, int *const pos) { 49 const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 50 if (firstByte < 0x80) { 51 return firstByte; 52 } else { 53 return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( 54 buffer, pos); 55 } 56 } 57 58 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( 59 const uint8_t *const buffer, int *const pos) { 60 return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 61 } 62 63 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, 64 const int *const codePointTable, int *const pos) { 65 return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos); 66 } 67 68 // Returns the number of read characters. 69 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, 70 const NodeFlags flags, const int maxLength, const int *const codePointTable, 71 int *const outBuffer, int *const pos) { 72 int length = 0; 73 if (hasMultipleChars(flags)) { 74 length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable, 75 outBuffer, pos); 76 } else { 77 const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos); 78 if (codePoint == NOT_A_CODE_POINT) { 79 // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is 80 // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR 81 // when the PtNode has a single code point. 82 length = 0; 83 AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", 84 *pos - 1, codePoint, buffer[*pos - 1]); 85 ASSERT(false); 86 } else if (maxLength > 0) { 87 outBuffer[0] = codePoint; 88 length = 1; 89 } 90 } 91 return length; 92 } 93 94 // Returns the number of skipped characters. 95 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, 96 const int maxLength, const int *const codePointTable, int *const pos) { 97 if (hasMultipleChars(flags)) { 98 return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); 99 } else { 100 if (maxLength > 0) { 101 getCodePointAndAdvancePosition(buffer, codePointTable, pos); 102 return 1; 103 } else { 104 return 0; 105 } 106 } 107 } 108 109 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, 110 int *const pos) { 111 return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 112 } 113 114 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( 115 const uint8_t *const buffer, const NodeFlags flags, int *const pos) { 116 const int base = *pos; 117 int offset = 0; 118 switch (MASK_CHILDREN_POSITION_TYPE & flags) { 119 case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: 120 offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 121 break; 122 case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: 123 offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); 124 break; 125 case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: 126 offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); 127 break; 128 default: 129 // If we come here, it means we asked for the children of a word with 130 // no children. 131 return NOT_A_DICT_POS; 132 } 133 return base + offset; 134 } 135 136 /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, 137 const DictionaryShortcutsStructurePolicy *const shortcutPolicy, 138 const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable, 139 NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, 140 int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, 141 int *const outBigramPos, int *const outSiblingPos) { 142 int readingPos = ptNodePos; 143 const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); 144 *outFlags = flags; 145 *outCodePointCount = getCharsAndAdvancePosition( 146 dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos); 147 *outProbability = isTerminal(flags) ? 148 readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; 149 *outChildrenPos = hasChildrenInFlags(flags) ? 150 readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS; 151 *outShortcutPos = NOT_A_DICT_POS; 152 if (hasShortcutTargets(flags)) { 153 *outShortcutPos = readingPos; 154 shortcutPolicy->skipAllShortcuts(&readingPos); 155 } 156 *outBigramPos = NOT_A_DICT_POS; 157 if (hasBigrams(flags)) { 158 *outBigramPos = readingPos; 159 bigramPolicy->skipAllBigrams(&readingPos); 160 } 161 *outSiblingPos = readingPos; 162 } 163 164 } // namespace latinime 165