1 /* 2 * Copyright (C) 2010, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <cstring> 18 19 #define LOG_TAG "LatinIME: unigram_dictionary.cpp" 20 21 #include "binary_format.h" 22 #include "char_utils.h" 23 #include "defines.h" 24 #include "dictionary.h" 25 #include "digraph_utils.h" 26 #include "proximity_info.h" 27 #include "terminal_attributes.h" 28 #include "unigram_dictionary.h" 29 #include "words_priority_queue.h" 30 #include "words_priority_queue_pool.h" 31 32 namespace latinime { 33 34 // TODO: check the header 35 UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags) 36 : DICT_ROOT(streamStart), ROOT_POS(0), 37 MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) { 38 if (DEBUG_DICT) { 39 AKLOGI("UnigramDictionary - constructor"); 40 } 41 } 42 43 UnigramDictionary::~UnigramDictionary() { 44 } 45 46 // TODO: This needs to take a const int* and not tinker with its contents 47 static void addWord(int *word, int length, int probability, WordsPriorityQueue *queue, int type) { 48 queue->push(probability, word, length, type); 49 } 50 51 // Return the replacement code point for a digraph, or 0 if none. 52 int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int inputSize, 53 const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const { 54 55 // There can't be a digraph if we don't have at least 2 characters to examine 56 if (i + 2 > inputSize) return false; 57 58 // Search for the first char of some digraph 59 int lastDigraphIndex = -1; 60 const int thisChar = codes[i]; 61 for (lastDigraphIndex = digraphsSize - 1; lastDigraphIndex >= 0; --lastDigraphIndex) { 62 if (thisChar == digraphs[lastDigraphIndex].first) break; 63 } 64 // No match: return early 65 if (lastDigraphIndex < 0) return 0; 66 67 // It's an interesting digraph if the second char matches too. 68 if (digraphs[lastDigraphIndex].second == codes[i + 1]) { 69 return digraphs[lastDigraphIndex].compositeGlyph; 70 } else { 71 return 0; 72 } 73 } 74 75 // Mostly the same arguments as the non-recursive version, except: 76 // codes is the original value. It points to the start of the work buffer, and gets passed as is. 77 // inputSize is the size of the user input (thus, it is the size of codesSrc). 78 // codesDest is the current point in the work buffer. 79 // codesSrc is the current point in the user-input, original, content-unmodified buffer. 80 // codesRemain is the remaining size in codesSrc. 81 void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, 82 const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, 83 int *xCoordinatesBuffer, int *yCoordinatesBuffer, 84 const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 85 const bool useFullEditDistance, const int *codesSrc, 86 const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, 87 WordsPriorityQueuePool *queuePool, 88 const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const { 89 ASSERT(sizeof(codesDest[0]) == sizeof(codesSrc[0])); 90 ASSERT(sizeof(xCoordinatesBuffer[0]) == sizeof(xcoordinates[0])); 91 ASSERT(sizeof(yCoordinatesBuffer[0]) == sizeof(ycoordinates[0])); 92 93 const int startIndex = static_cast<int>(codesDest - codesBuffer); 94 if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) { 95 for (int i = 0; i < codesRemain; ++i) { 96 xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i]; 97 yCoordinatesBuffer[startIndex + i] = ycoordinates[codesBufferSize - codesRemain + i]; 98 const int replacementCodePoint = 99 getDigraphReplacement(codesSrc, i, codesRemain, digraphs, digraphsSize); 100 if (0 != replacementCodePoint) { 101 // Found a digraph. We will try both spellings. eg. the word is "pruefen" 102 103 // Copy the word up to the first char of the digraph, including proximity chars, 104 // and overwrite the primary code with the replacement code point. Then, continue 105 // processing on the remaining part of the word, skipping the second char of the 106 // digraph. 107 // In our example, copy "pru", replace "u" with the version with the diaeresis and 108 // continue running on "fen". 109 // Make i the index of the second char of the digraph for simplicity. Forgetting 110 // to do that results in an infinite recursion so take care! 111 ++i; 112 memcpy(codesDest, codesSrc, i * sizeof(codesDest[0])); 113 codesDest[i - 1] = replacementCodePoint; 114 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, 115 codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, 116 bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1, 117 codesRemain - i - 1, currentDepth + 1, codesDest + i, correction, 118 queuePool, digraphs, digraphsSize); 119 120 // Copy the second char of the digraph in place, then continue processing on 121 // the remaining part of the word. 122 // In our example, after "pru" in the buffer copy the "e", and continue on "fen" 123 memcpy(codesDest + i, codesSrc + i, sizeof(codesDest[0])); 124 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, 125 codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, 126 bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i, 127 currentDepth + 1, codesDest + i, correction, queuePool, digraphs, 128 digraphsSize); 129 return; 130 } 131 } 132 } 133 134 // If we come here, we hit the end of the word: let's check it against the dictionary. 135 // In our example, we'll come here once for "prufen" and then once for "pruefen". 136 // If the word contains several digraphs, we'll come it for the product of them. 137 // eg. if the word is "ueberpruefen" we'll test, in order, against 138 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". 139 const unsigned int remainingBytes = sizeof(codesDest[0]) * codesRemain; 140 if (0 != remainingBytes) { 141 memcpy(codesDest, codesSrc, remainingBytes); 142 memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain], 143 sizeof(xCoordinatesBuffer[0]) * codesRemain); 144 memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain], 145 sizeof(yCoordinatesBuffer[0]) * codesRemain); 146 } 147 148 getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, 149 startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction, 150 queuePool); 151 } 152 153 // bigramMap contains the association <bigram address> -> <bigram probability> 154 // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter 155 // in bigram_dictionary.cpp 156 int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 157 const int *ycoordinates, const int *inputCodePoints, const int inputSize, 158 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 159 const bool useFullEditDistance, int *outWords, int *frequencies, int *outputTypes) const { 160 WordsPriorityQueuePool queuePool(MAX_RESULTS, SUB_QUEUE_MAX_WORDS); 161 queuePool.clearAll(); 162 Correction masterCorrection; 163 masterCorrection.resetCorrection(); 164 const DigraphUtils::digraph_t *digraphs = 0; 165 const int digraphsSize = 166 DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(DICT_FLAGS, &digraphs); 167 if (digraphsSize > 0) 168 { // Incrementally tune the word and try all possibilities 169 int codesBuffer[sizeof(*inputCodePoints) * inputSize]; 170 int xCoordinatesBuffer[inputSize]; 171 int yCoordinatesBuffer[inputSize]; 172 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, 173 xCoordinatesBuffer, yCoordinatesBuffer, inputSize, bigramMap, bigramFilter, 174 useFullEditDistance, inputCodePoints, inputSize, 0, codesBuffer, &masterCorrection, 175 &queuePool, digraphs, digraphsSize); 176 } else { // Normal processing 177 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, inputSize, 178 bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool); 179 } 180 181 PROF_START(20); 182 if (DEBUG_DICT) { 183 float ns = queuePool.getMasterQueue()->getHighestNormalizedScore( 184 masterCorrection.getPrimaryInputWord(), inputSize, 0, 0, 0); 185 ns += 0; 186 AKLOGI("Max normalized score = %f", ns); 187 } 188 const int suggestedWordsCount = 189 queuePool.getMasterQueue()->outputSuggestions(masterCorrection.getPrimaryInputWord(), 190 inputSize, frequencies, outWords, outputTypes); 191 192 if (DEBUG_DICT) { 193 float ns = queuePool.getMasterQueue()->getHighestNormalizedScore( 194 masterCorrection.getPrimaryInputWord(), inputSize, 0, 0, 0); 195 ns += 0; 196 AKLOGI("Returning %d words", suggestedWordsCount); 197 /// Print the returned words 198 for (int j = 0; j < suggestedWordsCount; ++j) { 199 int *w = outWords + j * MAX_WORD_LENGTH; 200 char s[MAX_WORD_LENGTH]; 201 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; 202 (void)s; // To suppress compiler warning 203 AKLOGI("%s %i", s, frequencies[j]); 204 } 205 } 206 PROF_END(20); 207 PROF_CLOSE; 208 return suggestedWordsCount; 209 } 210 211 void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 212 const int *ycoordinates, const int *inputCodePoints, const int inputSize, 213 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 214 const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) 215 const { 216 PROF_OPEN; 217 PROF_START(0); 218 PROF_END(0); 219 220 PROF_START(1); 221 getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, bigramMap, 222 bigramFilter, useFullEditDistance, inputSize, correction, queuePool); 223 PROF_END(1); 224 225 PROF_START(2); 226 // Note: This line is intentionally left blank 227 PROF_END(2); 228 229 PROF_START(3); 230 // Note: This line is intentionally left blank 231 PROF_END(3); 232 233 PROF_START(4); 234 bool hasAutoCorrectionCandidate = false; 235 WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); 236 if (masterQueue->size() > 0) { 237 float nsForMaster = masterQueue->getHighestNormalizedScore( 238 correction->getPrimaryInputWord(), inputSize, 0, 0, 0); 239 hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); 240 } 241 PROF_END(4); 242 243 PROF_START(5); 244 // Multiple word suggestions 245 if (SUGGEST_MULTIPLE_WORDS 246 && inputSize >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { 247 getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, 248 useFullEditDistance, inputSize, correction, queuePool, 249 hasAutoCorrectionCandidate); 250 } 251 PROF_END(5); 252 253 PROF_START(6); 254 // Note: This line is intentionally left blank 255 PROF_END(6); 256 257 if (DEBUG_DICT) { 258 queuePool->dumpSubQueue1TopSuggestions(); 259 for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { 260 WordsPriorityQueue *queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); 261 if (queue->size() > 0) { 262 WordsPriorityQueue::SuggestedWord *sw = queue->top(); 263 const int score = sw->mScore; 264 const int *word = sw->mWord; 265 const int wordLength = sw->mWordLength; 266 float ns = Correction::RankingAlgorithm::calcNormalizedScore( 267 correction->getPrimaryInputWord(), i, word, wordLength, score); 268 ns += 0; 269 AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, 270 (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD)); 271 DUMP_WORD(correction->getPrimaryInputWord(), i); 272 DUMP_WORD(word, wordLength); 273 } 274 } 275 } 276 } 277 278 void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, 279 const int *yCoordinates, const int *codes, const int inputSize, 280 Correction *correction) const { 281 if (DEBUG_DICT) { 282 AKLOGI("initSuggest"); 283 DUMP_WORD(codes, inputSize); 284 } 285 correction->initInputParams(proximityInfo, codes, inputSize, xCoordinates, yCoordinates); 286 const int maxDepth = min(inputSize * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); 287 correction->initCorrection(proximityInfo, inputSize, maxDepth); 288 } 289 290 void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, 291 const int *xcoordinates, const int *ycoordinates, const int *codes, 292 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 293 const bool useFullEditDistance, const int inputSize, 294 Correction *correction, WordsPriorityQueuePool *queuePool) const { 295 initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputSize, correction); 296 getSuggestionCandidates(useFullEditDistance, inputSize, bigramMap, bigramFilter, correction, 297 queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); 298 } 299 300 void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, 301 const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 302 Correction *correction, WordsPriorityQueuePool *queuePool, 303 const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) const { 304 uint8_t totalTraverseCount = correction->pushAndGetTotalTraverseCount(); 305 if (DEBUG_DICT) { 306 AKLOGI("Traverse count %d", totalTraverseCount); 307 } 308 if (totalTraverseCount > MULTIPLE_WORDS_SUGGESTION_MAX_TOTAL_TRAVERSE_COUNT) { 309 if (DEBUG_DICT) { 310 AKLOGI("Abort traversing %d", totalTraverseCount); 311 } 312 return; 313 } 314 // TODO: Remove setCorrectionParams 315 correction->setCorrectionParams(0, 0, 0, 316 -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, 317 doAutoCompletion, maxErrors); 318 int rootPosition = ROOT_POS; 319 // Get the number of children of root, then increment the position 320 int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); 321 int outputIndex = 0; 322 323 correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0)); 324 325 // Depth first search 326 while (outputIndex >= 0) { 327 if (correction->initProcessState(outputIndex)) { 328 int siblingPos = correction->getTreeSiblingPos(outputIndex); 329 int firstChildPos; 330 331 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, 332 bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos, 333 queuePool, currentWordIndex); 334 // Update next sibling pos 335 correction->setTreeSiblingPos(outputIndex, siblingPos); 336 337 if (needsToTraverseChildrenNodes) { 338 // Goes to child node 339 outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos); 340 } 341 } else { 342 // Goes to parent sibling node 343 outputIndex = correction->getTreeParentIndex(outputIndex); 344 } 345 } 346 } 347 348 void UnigramDictionary::onTerminal(const int probability, 349 const TerminalAttributes &terminalAttributes, Correction *correction, 350 WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, 351 const int currentWordIndex) const { 352 const int inputIndex = correction->getInputIndex(); 353 const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; 354 355 int wordLength; 356 int *wordPointer; 357 358 if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { 359 WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); 360 const int finalProbability = 361 correction->getFinalProbability(probability, &wordPointer, &wordLength); 362 363 if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) { 364 // If the probability is 0, we don't want to add this word. However we still 365 // want to add its shortcuts (including a possible whitelist entry) if any. 366 // Furthermore, if this is not a word (shortcut only for example) or a blacklisted 367 // entry then we never want to suggest this. 368 addWord(wordPointer, wordLength, finalProbability, masterQueue, 369 Dictionary::KIND_CORRECTION); 370 } 371 372 const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0; 373 // Please note that the shortcut candidates will be added to the master queue only. 374 TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); 375 while (iterator.hasNextShortcutTarget()) { 376 // TODO: addWord only supports weak ordering, meaning we have no means 377 // to control the order of the shortcuts relative to one another or to the word. 378 // We need to either modulate the probability of each shortcut according 379 // to its own shortcut probability or to make the queue 380 // so that the insert order is protected inside the queue for words 381 // with the same score. For the moment we use -1 to make sure the shortcut will 382 // never be in front of the word. 383 int shortcutTarget[MAX_WORD_LENGTH]; 384 int shortcutFrequency; 385 const int shortcutTargetStringLength = iterator.getNextShortcutTarget( 386 MAX_WORD_LENGTH, shortcutTarget, &shortcutFrequency); 387 int shortcutScore; 388 int kind; 389 if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_PROBABILITY 390 && correction->sameAsTyped()) { 391 shortcutScore = S_INT_MAX; 392 kind = Dictionary::KIND_WHITELIST; 393 } else { 394 shortcutScore = shortcutProbability; 395 kind = Dictionary::KIND_CORRECTION; 396 } 397 addWord(shortcutTarget, shortcutTargetStringLength, shortcutScore, 398 masterQueue, kind); 399 } 400 } 401 402 // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH 403 // or more length. 404 if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { 405 WordsPriorityQueue *subQueue; 406 subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex); 407 if (!subQueue) { 408 return; 409 } 410 const int finalProbability = correction->getFinalProbabilityForSubQueue( 411 probability, &wordPointer, &wordLength, inputIndex); 412 addWord(wordPointer, wordLength, finalProbability, subQueue, Dictionary::KIND_CORRECTION); 413 } 414 } 415 416 int UnigramDictionary::getSubStringSuggestion( 417 ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, 418 const int *codes, const bool useFullEditDistance, Correction *correction, 419 WordsPriorityQueuePool *queuePool, const int inputSize, 420 const bool hasAutoCorrectionCandidate, const int currentWordIndex, 421 const int inputWordStartPos, const int inputWordLength, 422 const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, 423 int *wordLengthArray, int *outputWord, int *outputWordLength) const { 424 if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) { 425 return FLAG_MULTIPLE_SUGGEST_ABORT; 426 } 427 428 ///////////////////////////////////////////// 429 // safety net for multiple word suggestion // 430 // TODO: Remove this safety net // 431 ///////////////////////////////////////////// 432 int smallWordCount = 0; 433 int singleLetterWordCount = 0; 434 if (inputWordLength == 1) { 435 ++singleLetterWordCount; 436 } 437 if (inputWordLength <= 2) { 438 // small word == single letter or 2-letter word 439 ++smallWordCount; 440 } 441 for (int i = 0; i < currentWordIndex; ++i) { 442 const int length = wordLengthArray[i]; 443 if (length == 1) { 444 ++singleLetterWordCount; 445 // Safety net to avoid suggesting sequential single letter words 446 if (i < (currentWordIndex - 1)) { 447 if (wordLengthArray[i + 1] == 1) { 448 return FLAG_MULTIPLE_SUGGEST_ABORT; 449 } 450 } else if (inputWordLength == 1) { 451 return FLAG_MULTIPLE_SUGGEST_ABORT; 452 } 453 } 454 if (length <= 2) { 455 ++smallWordCount; 456 } 457 // Safety net to avoid suggesting multiple words with many (4 or more, for now) small words 458 if (singleLetterWordCount >= 3 || smallWordCount >= 4) { 459 return FLAG_MULTIPLE_SUGGEST_ABORT; 460 } 461 } 462 ////////////////////////////////////////////// 463 // TODO: Remove the safety net above // 464 ////////////////////////////////////////////// 465 466 int *tempOutputWord = 0; 467 int nextWordLength = 0; 468 // TODO: Optimize init suggestion 469 initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, 470 inputSize, correction); 471 472 int word[MAX_WORD_LENGTH]; 473 int freq = getMostProbableWordLike( 474 inputWordStartPos, inputWordLength, correction, word); 475 if (freq > 0) { 476 nextWordLength = inputWordLength; 477 tempOutputWord = word; 478 } else if (!hasAutoCorrectionCandidate) { 479 if (inputWordStartPos > 0) { 480 const int offset = inputWordStartPos; 481 initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], 482 codes + offset, inputWordLength, correction); 483 queuePool->clearSubQueue(currentWordIndex); 484 // TODO: pass the bigram list for substring suggestion 485 getSuggestionCandidates(useFullEditDistance, inputWordLength, 486 0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool, 487 false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); 488 if (DEBUG_DICT) { 489 if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { 490 AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); 491 for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { 492 queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord(); 493 } 494 } 495 } 496 } 497 WordsPriorityQueue *queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); 498 // TODO: Return the correct value depending on doAutoCompletion 499 if (!queue || queue->size() <= 0) { 500 return FLAG_MULTIPLE_SUGGEST_ABORT; 501 } 502 int score = 0; 503 const float ns = queue->getHighestNormalizedScore( 504 correction->getPrimaryInputWord(), inputWordLength, 505 &tempOutputWord, &score, &nextWordLength); 506 if (DEBUG_DICT) { 507 AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score); 508 } 509 // Two words correction won't be done if the score of the first word doesn't exceed the 510 // threshold. 511 if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 512 || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { 513 return FLAG_MULTIPLE_SUGGEST_SKIP; 514 } 515 freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); 516 } 517 if (DEBUG_DICT) { 518 AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)", 519 currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, 520 (currentWordIndex > 0) ? wordLengthArray[0] : 0); 521 } 522 if (freq <= 0 || nextWordLength <= 0 523 || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) { 524 return FLAG_MULTIPLE_SUGGEST_SKIP; 525 } 526 for (int i = 0; i < nextWordLength; ++i) { 527 outputWord[outputWordStartPos + i] = tempOutputWord[i]; 528 } 529 530 // Put output values 531 freqArray[currentWordIndex] = freq; 532 // TODO: put output length instead of input length 533 wordLengthArray[currentWordIndex] = inputWordLength; 534 const int tempOutputWordLength = outputWordStartPos + nextWordLength; 535 if (outputWordLength) { 536 *outputWordLength = tempOutputWordLength; 537 } 538 539 if ((inputWordStartPos + inputWordLength) < inputSize) { 540 if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { 541 return FLAG_MULTIPLE_SUGGEST_SKIP; 542 } 543 outputWord[tempOutputWordLength] = KEYCODE_SPACE; 544 if (outputWordLength) { 545 ++*outputWordLength; 546 } 547 } else if (currentWordIndex >= 1) { 548 // TODO: Handle 3 or more words 549 const int pairFreq = correction->getFreqForSplitMultipleWords( 550 freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord); 551 if (DEBUG_DICT) { 552 DUMP_WORD(outputWord, tempOutputWordLength); 553 for (int i = 0; i < currentWordIndex + 1; ++i) { 554 AKLOGI("Split %d,%d words: freq = %d, length = %d", i, currentWordIndex + 1, 555 freqArray[i], wordLengthArray[i]); 556 } 557 AKLOGI("Split two words: freq = %d, length = %d, %d, isSpace ? %d", pairFreq, 558 inputSize, tempOutputWordLength, isSpaceProximity); 559 } 560 addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue(), 561 Dictionary::KIND_CORRECTION); 562 } 563 return FLAG_MULTIPLE_SUGGEST_CONTINUE; 564 } 565 566 void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, 567 const int *xcoordinates, const int *ycoordinates, const int *codes, 568 const bool useFullEditDistance, const int inputSize, Correction *correction, 569 WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate, 570 const int startInputPos, const int startWordIndex, const int outputWordLength, 571 int *freqArray, int *wordLengthArray, int *outputWord) const { 572 if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { 573 // Return if the last word index 574 return; 575 } 576 if (startWordIndex >= 1 577 && (hasAutoCorrectionCandidate 578 || inputSize < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { 579 // Do not suggest 3+ words if already has auto correction candidate 580 return; 581 } 582 for (int i = startInputPos + 1; i < inputSize; ++i) { 583 if (DEBUG_CORRECTION_FREQ) { 584 AKLOGI("Multi words(%d), start in %d sep %d start out %d", 585 startWordIndex, startInputPos, i, outputWordLength); 586 DUMP_WORD(outputWord, outputWordLength); 587 } 588 int tempOutputWordLength = 0; 589 // Current word 590 int inputWordStartPos = startInputPos; 591 int inputWordLength = i - startInputPos; 592 const int suggestionFlag = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, 593 codes, useFullEditDistance, correction, queuePool, inputSize, 594 hasAutoCorrectionCandidate, startWordIndex, inputWordStartPos, inputWordLength, 595 outputWordLength, true /* not used */, freqArray, wordLengthArray, outputWord, 596 &tempOutputWordLength); 597 if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_ABORT) { 598 // TODO: break here 599 continue; 600 } else if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_SKIP) { 601 continue; 602 } 603 604 if (DEBUG_CORRECTION_FREQ) { 605 AKLOGI("Do missing space correction"); 606 } 607 // Next word 608 // Missing space 609 inputWordStartPos = i; 610 inputWordLength = inputSize - i; 611 if (getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, 612 useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate, 613 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, 614 false /* missing space */, freqArray, wordLengthArray, outputWord, 0) 615 != FLAG_MULTIPLE_SUGGEST_CONTINUE) { 616 getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, 617 useFullEditDistance, inputSize, correction, queuePool, 618 hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1, 619 tempOutputWordLength, freqArray, wordLengthArray, outputWord); 620 } 621 622 // Mistyped space 623 ++inputWordStartPos; 624 --inputWordLength; 625 626 if (inputWordLength <= 0) { 627 continue; 628 } 629 630 const int x = xcoordinates[inputWordStartPos - 1]; 631 const int y = ycoordinates[inputWordStartPos - 1]; 632 if (!proximityInfo->hasSpaceProximity(x, y)) { 633 continue; 634 } 635 636 if (DEBUG_CORRECTION_FREQ) { 637 AKLOGI("Do mistyped space correction"); 638 } 639 getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, 640 useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate, 641 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, 642 true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0); 643 } 644 } 645 646 void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, 647 const int *xcoordinates, const int *ycoordinates, const int *codes, 648 const bool useFullEditDistance, const int inputSize, 649 Correction *correction, WordsPriorityQueuePool *queuePool, 650 const bool hasAutoCorrectionCandidate) const { 651 if (inputSize >= MAX_WORD_LENGTH) return; 652 if (DEBUG_DICT) { 653 AKLOGI("--- Suggest multiple words"); 654 } 655 656 // Allocating fixed length array on stack 657 int outputWord[MAX_WORD_LENGTH]; 658 int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; 659 int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; 660 const int outputWordLength = 0; 661 const int startInputPos = 0; 662 const int startWordIndex = 0; 663 getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, 664 useFullEditDistance, inputSize, correction, queuePool, hasAutoCorrectionCandidate, 665 startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray, 666 outputWord); 667 } 668 669 // Wrapper for getMostProbableWordLikeInner, which matches it to the previous 670 // interface. 671 int UnigramDictionary::getMostProbableWordLike(const int startInputIndex, const int inputSize, 672 Correction *correction, int *word) const { 673 int inWord[inputSize]; 674 for (int i = 0; i < inputSize; ++i) { 675 inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i); 676 } 677 return getMostProbableWordLikeInner(inWord, inputSize, word); 678 } 679 680 // This function will take the position of a character array within a CharGroup, 681 // and check it actually like-matches the word in inWord starting at startInputIndex, 682 // that is, it matches it with case and accents squashed. 683 // The function returns true if there was a full match, false otherwise. 684 // The function will copy on-the-fly the characters in the CharGroup to outNewWord. 685 // It will also place the end position of the array in outPos; in outInputIndex, 686 // it will place the index of the first char AFTER the match if there was a match, 687 // and the initial position if there was not. It makes sense because if there was 688 // a match we want to continue searching, but if there was not, we want to go to 689 // the next CharGroup. 690 // In and out parameters may point to the same location. This function takes care 691 // not to use any input parameters after it wrote into its outputs. 692 static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, 693 const uint8_t *const root, const int startPos, const int *const inWord, 694 const int startInputIndex, const int inputSize, int *outNewWord, int *outInputIndex, 695 int *outPos) { 696 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); 697 int pos = startPos; 698 int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); 699 int baseChar = toBaseLowerCase(codePoint); 700 const int wChar = toBaseLowerCase(inWord[startInputIndex]); 701 702 if (baseChar != wChar) { 703 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; 704 *outInputIndex = startInputIndex; 705 return false; 706 } 707 int inputIndex = startInputIndex; 708 outNewWord[inputIndex] = codePoint; 709 if (hasMultipleChars) { 710 codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); 711 while (NOT_A_CODE_POINT != codePoint) { 712 baseChar = toBaseLowerCase(codePoint); 713 if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) { 714 *outPos = BinaryFormat::skipOtherCharacters(root, pos); 715 *outInputIndex = startInputIndex; 716 return false; 717 } 718 outNewWord[inputIndex] = codePoint; 719 codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); 720 } 721 } 722 *outInputIndex = inputIndex + 1; 723 *outPos = pos; 724 return true; 725 } 726 727 // This function is invoked when a word like the word searched for is found. 728 // It will compare the probability to the max probability, and if greater, will 729 // copy the word into the output buffer. In output value maxFreq, it will 730 // write the new maximum probability if it changed. 731 static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord, 732 int *maxFreq) { 733 if (freq > *maxFreq) { 734 for (int q = 0; q < length; ++q) { 735 outWord[q] = newWord[q]; 736 } 737 outWord[length] = 0; 738 *maxFreq = freq; 739 } 740 } 741 742 // Will find the highest probability of the words like the one passed as an argument, 743 // that is, everything that only differs by case/accents. 744 int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, const int inputSize, 745 int *outWord) const { 746 int newWord[MAX_WORD_LENGTH]; 747 int depth = 0; 748 int maxFreq = -1; 749 const uint8_t *const root = DICT_ROOT; 750 int stackChildCount[MAX_WORD_LENGTH]; 751 int stackInputIndex[MAX_WORD_LENGTH]; 752 int stackSiblingPos[MAX_WORD_LENGTH]; 753 754 int startPos = 0; 755 stackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); 756 stackInputIndex[0] = 0; 757 stackSiblingPos[0] = startPos; 758 while (depth >= 0) { 759 const int charGroupCount = stackChildCount[depth]; 760 int pos = stackSiblingPos[depth]; 761 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) { 762 int inputIndex = stackInputIndex[depth]; 763 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 764 // Test whether all chars in this group match with the word we are searching for. If so, 765 // we want to traverse its children (or if the inputSize match, evaluate its 766 // probability). Note that this function will output the position regardless, but will 767 // only write into inputIndex if there is a match. 768 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, 769 inputIndex, inputSize, newWord, &inputIndex, &pos); 770 if (isAlike && (!(BinaryFormat::FLAG_IS_NOT_A_WORD & flags)) 771 && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) { 772 const int probability = 773 BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); 774 onTerminalWordLike(probability, newWord, inputIndex, outWord, &maxFreq); 775 } 776 pos = BinaryFormat::skipProbability(flags, pos); 777 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); 778 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); 779 // If we had a match and the word has children, we want to traverse them. We don't have 780 // to traverse words longer than the one we are searching for, since they will not match 781 // anyway, so don't traverse unless inputIndex < inputSize. 782 if (isAlike && (-1 != childrenNodePos) && (inputIndex < inputSize)) { 783 // Save position for this depth, to get back to this once children are done 784 stackChildCount[depth] = charGroupIndex; 785 stackSiblingPos[depth] = siblingPos; 786 // Prepare stack values for next depth 787 ++depth; 788 int childrenPos = childrenNodePos; 789 stackChildCount[depth] = 790 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos); 791 stackSiblingPos[depth] = childrenPos; 792 stackInputIndex[depth] = inputIndex; 793 pos = childrenPos; 794 // Go to the next depth level. 795 ++depth; 796 break; 797 } else { 798 // No match, or no children, or word too long to ever match: go the next sibling. 799 pos = siblingPos; 800 } 801 } 802 --depth; 803 } 804 return maxFreq; 805 } 806 807 int UnigramDictionary::getProbability(const int *const inWord, const int length) const { 808 const uint8_t *const root = DICT_ROOT; 809 int pos = BinaryFormat::getTerminalPosition(root, inWord, length, 810 false /* forceLowerCaseSearch */); 811 if (NOT_VALID_WORD == pos) { 812 return NOT_A_PROBABILITY; 813 } 814 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 815 if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) { 816 // If this is not a word, or if it's a blacklisted entry, it should behave as 817 // having no probability outside of the suggestion process (where it should be used 818 // for shortcuts). 819 return NOT_A_PROBABILITY; 820 } 821 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); 822 if (hasMultipleChars) { 823 pos = BinaryFormat::skipOtherCharacters(root, pos); 824 } else { 825 BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); 826 } 827 const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); 828 return unigramProbability; 829 } 830 831 // TODO: remove this function. 832 int UnigramDictionary::getBigramPosition(int pos, int *word, int offset, int length) const { 833 return -1; 834 } 835 836 // ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. 837 // If the return value is false, then the caller should read in the output "nextSiblingPosition" 838 // to find out the address of the next sibling node and pass it to a new call of processCurrentNode. 839 // It is worthy to note that when false is returned, the output values other than 840 // nextSiblingPosition are undefined. 841 // If the return value is true, then the caller must proceed to traverse the children of this 842 // node. processCurrentNode will output the information about the children: their count in 843 // newCount, their position in newChildrenPosition, the traverseAllNodes flag in 844 // newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the 845 // diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into 846 // newOutputIndex. Please also note the following caveat: processCurrentNode does not know when 847 // there aren't any more nodes at this level, it merely returns the address of the first byte after 848 // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any 849 // given level, as output into newCount when traversing this level's parent. 850 bool UnigramDictionary::processCurrentNode(const int initialPos, 851 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, 852 int *newCount, int *newChildrenPosition, int *nextSiblingPosition, 853 WordsPriorityQueuePool *queuePool, const int currentWordIndex) const { 854 if (DEBUG_DICT) { 855 correction->checkState(); 856 } 857 int pos = initialPos; 858 859 // Flags contain the following information: 860 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: 861 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address 862 // is on the specified number of bytes. 863 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. 864 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. 865 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) 866 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not 867 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); 868 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); 869 const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); 870 871 bool needsToInvokeOnTerminal = false; 872 873 // This gets only ONE character from the stream. Next there will be: 874 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node 875 // else if FLAG_IS_TERMINAL: the probability 876 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address 877 // Note that you can't have a node that both is not a terminal and has no children. 878 int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); 879 ASSERT(NOT_A_CODE_POINT != c); 880 881 // We are going to loop through each character and make it look like it's a different 882 // node each time. To do that, we will process characters in this node in order until 883 // we find the character terminator. This is signalled by getCodePoint* returning 884 // NOT_A_CODE_POINT. 885 // As a special case, if there is only one character in this node, we must not read the 886 // next bytes so we will simulate the NOT_A_CODE_POINT return by testing the flags. 887 // This way, each loop run will look like a "virtual node". 888 do { 889 // We prefetch the next char. If 'c' is the last char of this node, we will have 890 // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node 891 // should behave as a terminal or not and whether we have children. 892 const int nextc = hasMultipleChars 893 ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT; 894 const bool isLastChar = (NOT_A_CODE_POINT == nextc); 895 // If there are more chars in this nodes, then this virtual node is not a terminal. 896 // If we are on the last char, this virtual node is a terminal if this node is. 897 const bool isTerminal = isLastChar && isTerminalNode; 898 899 Correction::CorrectionType stateType = correction->processCharAndCalcState( 900 c, isTerminal); 901 if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL 902 || stateType == Correction::ON_TERMINAL) { 903 needsToInvokeOnTerminal = true; 904 } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) { 905 // We found that this is an unrelated character, so we should give up traversing 906 // this node and its children entirely. 907 // However we may not be on the last virtual node yet so we skip the remaining 908 // characters in this node, the probability if it's there, read the next sibling 909 // position to output it, then return false. 910 // We don't have to output other values because we return false, as in 911 // "don't traverse children". 912 if (!isLastChar) { 913 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); 914 } 915 pos = BinaryFormat::skipProbability(flags, pos); 916 *nextSiblingPosition = 917 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 918 return false; 919 } 920 921 // Prepare for the next character. Promote the prefetched char to current char - the loop 922 // will take care of prefetching the next. If we finally found our last char, nextc will 923 // contain NOT_A_CODE_POINT. 924 c = nextc; 925 } while (NOT_A_CODE_POINT != c); 926 927 if (isTerminalNode) { 928 // The probability should be here, because we come here only if this is actually 929 // a terminal node, and we are on its last char. 930 const int unigramProbability = 931 BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos); 932 const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos); 933 const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); 934 TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); 935 // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. 936 // bigramFilter is a bloom filter of said frequencies for even faster rejection. 937 const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, 938 unigramProbability); 939 onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, 940 currentWordIndex); 941 942 // If there are more chars in this node, then this virtual node has children. 943 // If we are on the last char, this virtual node has children if this node has. 944 const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); 945 946 // This character matched the typed character (enough to traverse the node at least) 947 // so we just evaluated it. Now we should evaluate this virtual node's children - that 948 // is, if it has any. If it has no children, we're done here - so we skip the end of 949 // the node, output the siblings position, and return false "don't traverse children". 950 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any 951 // remaining char in this group for there can't be any. 952 if (!hasChildren) { 953 pos = BinaryFormat::skipProbability(flags, pos); 954 *nextSiblingPosition = 955 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 956 return false; 957 } 958 959 // Optimization: Prune out words that are too long compared to how much was typed. 960 if (correction->needsToPrune()) { 961 pos = BinaryFormat::skipProbability(flags, pos); 962 *nextSiblingPosition = 963 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 964 if (DEBUG_DICT_FULL) { 965 AKLOGI("Traversing was pruned."); 966 } 967 return false; 968 } 969 } 970 971 // Now we finished processing this node, and we want to traverse children. If there are no 972 // children, we can't come here. 973 ASSERT(BinaryFormat::hasChildrenInFlags(flags)); 974 975 // If this node was a terminal it still has the probability under the pointer (it may have been 976 // read, but not skipped - see readProbabilityWithoutMovingPointer). 977 // Next come the children position, then possibly attributes (attributes are bigrams only for 978 // now, maybe something related to shortcuts in the future). 979 // Once this is read, we still need to output the number of nodes in the immediate children of 980 // this node, so we read and output it before returning true, as in "please traverse children". 981 pos = BinaryFormat::skipProbability(flags, pos); 982 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); 983 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 984 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); 985 *newChildrenPosition = childrenPos; 986 return true; 987 } 988 } // namespace latinime 989