1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 18 #include "suggest/policyimpl/dictionary/patricia_trie_policy.h" 19 20 #include "defines.h" 21 #include "suggest/core/dicnode/dic_node.h" 22 #include "suggest/core/dicnode/dic_node_vector.h" 23 #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" 24 #include "suggest/policyimpl/dictionary/utils/probability_utils.h" 25 26 namespace latinime { 27 28 void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, 29 DicNodeVector *const childDicNodes) const { 30 if (!dicNode->hasChildren()) { 31 return; 32 } 33 int nextPos = dicNode->getChildrenPos(); 34 if (nextPos < 0 || nextPos >= mDictBufferSize) { 35 AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", 36 nextPos, mDictBufferSize); 37 ASSERT(false); 38 return; 39 } 40 const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( 41 mDictRoot, &nextPos); 42 for (int i = 0; i < childCount; i++) { 43 if (nextPos < 0 || nextPos >= mDictBufferSize) { 44 AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d", 45 nextPos, mDictBufferSize, i, childCount); 46 ASSERT(false); 47 return; 48 } 49 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); 50 } 51 } 52 53 // This retrieves code points and the probability of the word by its terminal position. 54 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, 55 // it is possible to check for this with advantageous complexity. For each node, we search 56 // for PtNodes with children and compare the children position with the position we look for. 57 // When we shoot the position we look for, it means the word we look for is in the children 58 // of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a 59 // PtNode array with the last PtNode's children position still less than what we are searching for, 60 // we must descend the last PtNode's children (for example, if the word we are searching for starts 61 // with a z, it's the last PtNode of the root array, so all children addresses will be smaller 62 // than the position we look for, and we have to descend the z node). 63 /* Parameters : 64 * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is 65 * what is stored as the "bigram position" in each bigram) 66 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. 67 * outUnigramProbability: a pointer to an int to write the probability into. 68 * Return value : the code point count, of 0 if the word was not found. 69 */ 70 // TODO: Split this function to be more readable 71 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 72 const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, 73 int *const outUnigramProbability) const { 74 int pos = getRootPosition(); 75 int wordPos = 0; 76 // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will 77 // only traverse nodes that are actually a part of the terminal we are searching, so each time 78 // we enter this loop we are one depth level further than last time. 79 // The only reason we count nodes is because we want to reduce the probability of infinite 80 // looping in case there is a bug. Since we know there is an upper bound to the depth we are 81 // supposed to traverse, it does not hurt to count iterations. 82 for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { 83 int lastCandidatePtNodePos = 0; 84 // Let's loop through PtNodes in this PtNode array searching for either the terminal 85 // or one of its ascendants. 86 for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( 87 mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { 88 const int startPos = pos; 89 const PatriciaTrieReadingUtils::NodeFlags flags = 90 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); 91 const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 92 mDictRoot, &pos); 93 if (ptNodePos == startPos) { 94 // We found the position. Copy the rest of the code points in the buffer and return 95 // the length. 96 outCodePoints[wordPos] = character; 97 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { 98 int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 99 mDictRoot, &pos); 100 // We count code points in order to avoid infinite loops if the file is broken 101 // or if there is some other bug 102 int charCount = maxCodePointCount; 103 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { 104 outCodePoints[++wordPos] = nextChar; 105 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 106 mDictRoot, &pos); 107 } 108 } 109 *outUnigramProbability = 110 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, 111 &pos); 112 return ++wordPos; 113 } 114 // We need to skip past this PtNode, so skip any remaining code points after the 115 // first and possibly the probability. 116 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { 117 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); 118 } 119 if (PatriciaTrieReadingUtils::isTerminal(flags)) { 120 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); 121 } 122 // The fact that this PtNode has children is very important. Since we already know 123 // that this PtNode does not match, if it has no children we know it is irrelevant 124 // to what we are searching for. 125 const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); 126 // We will write in `found' whether we have passed the children position we are 127 // searching for. For example if we search for "beer", the children of b are less 128 // than the address we are searching for and the children of c are greater. When we 129 // come here for c, we realize this is too big, and that we should descend b. 130 bool found; 131 if (hasChildren) { 132 int currentPos = pos; 133 // Here comes the tricky part. First, read the children position. 134 const int childrenPos = PatriciaTrieReadingUtils 135 ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, ¤tPos); 136 if (childrenPos > ptNodePos) { 137 // If the children pos is greater than the position, it means the previous 138 // PtNode, which position is stored in lastCandidatePtNodePos, was the right 139 // one. 140 found = true; 141 } else if (1 >= ptNodeCount) { 142 // However if we are on the LAST PtNode of this array, and we have NOT shot the 143 // position we should descend THIS node. So we trick the lastCandidatePtNodePos 144 // so that we will descend this PtNode, not the previous one. 145 lastCandidatePtNodePos = startPos; 146 found = true; 147 } else { 148 // Else, we should continue looking. 149 found = false; 150 } 151 } else { 152 // Even if we don't have children here, we could still be on the last PtNode of / 153 // this array. If this is the case, we should descend the last PtNode that had 154 // children, and their position is already in lastCandidatePtNodePos. 155 found = (1 >= ptNodeCount); 156 } 157 158 if (found) { 159 // Okay, we found the PtNode we should descend. Its position is in 160 // the lastCandidatePtNodePos variable, so we just re-read it. 161 if (0 != lastCandidatePtNodePos) { 162 const PatriciaTrieReadingUtils::NodeFlags lastFlags = 163 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( 164 mDictRoot, &lastCandidatePtNodePos); 165 const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 166 mDictRoot, &lastCandidatePtNodePos); 167 // We copy all the characters in this PtNode to the buffer 168 outCodePoints[wordPos] = lastChar; 169 if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { 170 int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 171 mDictRoot, &lastCandidatePtNodePos); 172 int charCount = maxCodePointCount; 173 while (-1 != nextChar && --charCount > 0) { 174 outCodePoints[++wordPos] = nextChar; 175 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 176 mDictRoot, &lastCandidatePtNodePos); 177 } 178 } 179 ++wordPos; 180 // Now we only need to branch to the children address. Skip the probability if 181 // it's there, read pos, and break to resume the search at pos. 182 if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { 183 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, 184 &lastCandidatePtNodePos); 185 } 186 pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 187 mDictRoot, lastFlags, &lastCandidatePtNodePos); 188 break; 189 } else { 190 // Here is a little tricky part: we come here if we found out that all children 191 // addresses in this PtNode are bigger than the address we are searching for. 192 // Should we conclude the word is not in the dictionary? No! It could still be 193 // one of the remaining PtNodes in this array, so we have to keep looking in 194 // this array until we find it (or we realize it's not there either, in which 195 // case it's actually not in the dictionary). Pass the end of this PtNode, 196 // ready to start the next one. 197 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 198 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 199 mDictRoot, flags, &pos); 200 } 201 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 202 mShortcutListPolicy.skipAllShortcuts(&pos); 203 } 204 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { 205 mBigramListPolicy.skipAllBigrams(&pos); 206 } 207 } 208 } else { 209 // If we did not find it, we should record the last children address for the next 210 // iteration. 211 if (hasChildren) lastCandidatePtNodePos = startPos; 212 // Now skip the end of this PtNode (children pos and the attributes if any) so that 213 // our pos is after the end of this PtNode, at the start of the next one. 214 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 215 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 216 mDictRoot, flags, &pos); 217 } 218 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 219 mShortcutListPolicy.skipAllShortcuts(&pos); 220 } 221 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { 222 mBigramListPolicy.skipAllBigrams(&pos); 223 } 224 } 225 226 } 227 } 228 // If we have looked through all the PtNodes and found no match, the ptNodePos is 229 // not the position of a terminal in this dictionary. 230 return 0; 231 } 232 233 // This function gets the position of the terminal node of the exact matching word in the 234 // dictionary. If no match is found, it returns NOT_A_DICT_POS. 235 int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, 236 const int length, const bool forceLowerCaseSearch) const { 237 int pos = getRootPosition(); 238 int wordPos = 0; 239 240 while (true) { 241 // If we already traversed the tree further than the word is long, there means 242 // there was no match (or we would have found it). 243 if (wordPos >= length) return NOT_A_DICT_POS; 244 int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot, 245 &pos); 246 const int wChar = forceLowerCaseSearch 247 ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; 248 while (true) { 249 // If there are no more PtNodes in this array, it means we could not 250 // find a matching character for this depth, therefore there is no match. 251 if (0 >= ptNodeCount) return NOT_A_DICT_POS; 252 const int ptNodePos = pos; 253 const PatriciaTrieReadingUtils::NodeFlags flags = 254 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); 255 int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, 256 &pos); 257 if (character == wChar) { 258 // This is the correct PtNode. Only one PtNode may start with the same char within 259 // a PtNode array, so either we found our match in this array, or there is 260 // no match and we can return NOT_A_DICT_POS. So we will check all the 261 // characters in this PtNode indeed does match. 262 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { 263 character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, 264 &pos); 265 while (NOT_A_CODE_POINT != character) { 266 ++wordPos; 267 // If we shoot the length of the word we search for, or if we find a single 268 // character that does not match, as explained above, it means the word is 269 // not in the dictionary (by virtue of this PtNode being the only one to 270 // match the word on the first character, but not matching the whole word). 271 if (wordPos >= length) return NOT_A_DICT_POS; 272 if (inWord[wordPos] != character) return NOT_A_DICT_POS; 273 character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 274 mDictRoot, &pos); 275 } 276 } 277 // If we come here we know that so far, we do match. Either we are on a terminal 278 // and we match the length, in which case we found it, or we traverse children. 279 // If we don't match the length AND don't have children, then a word in the 280 // dictionary fully matches a prefix of the searched word but not the full word. 281 ++wordPos; 282 if (PatriciaTrieReadingUtils::isTerminal(flags)) { 283 if (wordPos == length) { 284 return ptNodePos; 285 } 286 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); 287 } 288 if (!PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 289 return NOT_A_DICT_POS; 290 } 291 // We have children and we are still shorter than the word we are searching for, so 292 // we need to traverse children. Put the pointer on the children position, and 293 // break 294 pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, 295 flags, &pos); 296 break; 297 } else { 298 // This PtNode does not match, so skip the remaining part and go to the next. 299 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { 300 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, 301 &pos); 302 } 303 if (PatriciaTrieReadingUtils::isTerminal(flags)) { 304 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); 305 } 306 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 307 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, 308 flags, &pos); 309 } 310 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 311 mShortcutListPolicy.skipAllShortcuts(&pos); 312 } 313 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { 314 mBigramListPolicy.skipAllBigrams(&pos); 315 } 316 } 317 --ptNodeCount; 318 } 319 } 320 } 321 322 int PatriciaTriePolicy::getProbability(const int unigramProbability, 323 const int bigramProbability) const { 324 if (unigramProbability == NOT_A_PROBABILITY) { 325 return NOT_A_PROBABILITY; 326 } else if (bigramProbability == NOT_A_PROBABILITY) { 327 return ProbabilityUtils::backoff(unigramProbability); 328 } else { 329 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, 330 bigramProbability); 331 } 332 } 333 334 int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { 335 if (ptNodePos == NOT_A_DICT_POS) { 336 return NOT_A_PROBABILITY; 337 } 338 int pos = ptNodePos; 339 const PatriciaTrieReadingUtils::NodeFlags flags = 340 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); 341 if (!PatriciaTrieReadingUtils::isTerminal(flags)) { 342 return NOT_A_PROBABILITY; 343 } 344 if (PatriciaTrieReadingUtils::isNotAWord(flags) 345 || PatriciaTrieReadingUtils::isBlacklisted(flags)) { 346 // If this is not a word, or if it's a blacklisted entry, it should behave as 347 // having no probability outside of the suggestion process (where it should be used 348 // for shortcuts). 349 return NOT_A_PROBABILITY; 350 } 351 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); 352 return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( 353 mDictRoot, &pos), NOT_A_PROBABILITY); 354 } 355 356 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 357 if (ptNodePos == NOT_A_DICT_POS) { 358 return NOT_A_DICT_POS; 359 } 360 int pos = ptNodePos; 361 const PatriciaTrieReadingUtils::NodeFlags flags = 362 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); 363 if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 364 return NOT_A_DICT_POS; 365 } 366 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); 367 if (PatriciaTrieReadingUtils::isTerminal(flags)) { 368 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); 369 } 370 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 371 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); 372 } 373 return pos; 374 } 375 376 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 377 if (ptNodePos == NOT_A_DICT_POS) { 378 return NOT_A_DICT_POS; 379 } 380 int pos = ptNodePos; 381 const PatriciaTrieReadingUtils::NodeFlags flags = 382 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); 383 if (!PatriciaTrieReadingUtils::hasBigrams(flags)) { 384 return NOT_A_DICT_POS; 385 } 386 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); 387 if (PatriciaTrieReadingUtils::isTerminal(flags)) { 388 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); 389 } 390 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 391 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); 392 } 393 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 394 mShortcutListPolicy.skipAllShortcuts(&pos);; 395 } 396 return pos; 397 } 398 399 int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, 400 const int ptNodePos, DicNodeVector *childDicNodes) const { 401 int pos = ptNodePos; 402 const PatriciaTrieReadingUtils::NodeFlags flags = 403 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); 404 int mergedNodeCodePoints[MAX_WORD_LENGTH]; 405 const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( 406 mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); 407 const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))? 408 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos) 409 : NOT_A_PROBABILITY; 410 const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ? 411 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 412 mDictRoot, flags, &pos) : NOT_A_DICT_POS; 413 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 414 getShortcutsStructurePolicy()->skipAllShortcuts(&pos); 415 } 416 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { 417 getBigramsStructurePolicy()->skipAllBigrams(&pos); 418 } 419 if (mergedNodeCodePointCount <= 0) { 420 AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); 421 ASSERT(false); 422 return pos; 423 } 424 childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, 425 PatriciaTrieReadingUtils::isTerminal(flags), 426 PatriciaTrieReadingUtils::hasChildrenInFlags(flags), 427 PatriciaTrieReadingUtils::isBlacklisted(flags) || 428 PatriciaTrieReadingUtils::isNotAWord(flags), 429 mergedNodeCodePointCount, mergedNodeCodePoints); 430 return pos; 431 } 432 433 } // namespace latinime 434