1 /* 2 ** 3 ** Copyright 2010, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18 #include <assert.h> 19 #include <string.h> 20 21 #define LOG_TAG "LatinIME: unigram_dictionary.cpp" 22 23 #include "char_utils.h" 24 #include "defines.h" 25 #include "dictionary.h" 26 #include "unigram_dictionary.h" 27 28 #include "binary_format.h" 29 #include "terminal_attributes.h" 30 31 namespace latinime { 32 33 const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = 34 { { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS 35 { 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS 36 { 'u', 'e', 0x00FC } }; // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS 37 38 const UnigramDictionary::digraph_t UnigramDictionary::FRENCH_LIGATURES_DIGRAPHS[] = 39 { { 'a', 'e', 0x00E6 }, // U+00E6 : LATIN SMALL LETTER AE 40 { 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE 41 42 // TODO: check the header 43 UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, 44 int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags) 45 : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), 46 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), 47 // TODO : remove this variable. 48 ROOT_POS(0), 49 BYTES_IN_ONE_CHAR(sizeof(int)), 50 MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), FLAGS(flags) { 51 if (DEBUG_DICT) { 52 AKLOGI("UnigramDictionary - constructor"); 53 } 54 } 55 56 UnigramDictionary::~UnigramDictionary() { 57 } 58 59 static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize) { 60 return sizeof(*codes) * codesSize; 61 } 62 63 // TODO: This needs to take a const unsigned short* and not tinker with its contents 64 static inline void addWord( 65 unsigned short *word, int length, int frequency, WordsPriorityQueue *queue) { 66 queue->push(frequency, word, length); 67 } 68 69 // Return the replacement code point for a digraph, or 0 if none. 70 int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int codesSize, 71 const digraph_t* const digraphs, const unsigned int digraphsSize) const { 72 73 // There can't be a digraph if we don't have at least 2 characters to examine 74 if (i + 2 > codesSize) return false; 75 76 // Search for the first char of some digraph 77 int lastDigraphIndex = -1; 78 const int thisChar = codes[i]; 79 for (lastDigraphIndex = digraphsSize - 1; lastDigraphIndex >= 0; --lastDigraphIndex) { 80 if (thisChar == digraphs[lastDigraphIndex].first) break; 81 } 82 // No match: return early 83 if (lastDigraphIndex < 0) return 0; 84 85 // It's an interesting digraph if the second char matches too. 86 if (digraphs[lastDigraphIndex].second == codes[i + 1]) { 87 return digraphs[lastDigraphIndex].replacement; 88 } else { 89 return 0; 90 } 91 } 92 93 // Mostly the same arguments as the non-recursive version, except: 94 // codes is the original value. It points to the start of the work buffer, and gets passed as is. 95 // codesSize is the size of the user input (thus, it is the size of codesSrc). 96 // codesDest is the current point in the work buffer. 97 // codesSrc is the current point in the user-input, original, content-unmodified buffer. 98 // codesRemain is the remaining size in codesSrc. 99 void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, 100 const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, 101 int *xCoordinatesBuffer, int *yCoordinatesBuffer, 102 const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 103 const bool useFullEditDistance, const int *codesSrc, 104 const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, 105 WordsPriorityQueuePool *queuePool, 106 const digraph_t* const digraphs, const unsigned int digraphsSize) { 107 108 const int startIndex = codesDest - codesBuffer; 109 if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) { 110 for (int i = 0; i < codesRemain; ++i) { 111 xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i]; 112 yCoordinatesBuffer[startIndex + i] = ycoordinates[codesBufferSize - codesRemain + i]; 113 const int replacementCodePoint = 114 getDigraphReplacement(codesSrc, i, codesRemain, digraphs, digraphsSize); 115 if (0 != replacementCodePoint) { 116 // Found a digraph. We will try both spellings. eg. the word is "pruefen" 117 118 // Copy the word up to the first char of the digraph, including proximity chars, 119 // and overwrite the primary code with the replacement code point. Then, continue 120 // processing on the remaining part of the word, skipping the second char of the 121 // digraph. 122 // In our example, copy "pru", replace "u" with the version with the diaeresis and 123 // continue running on "fen". 124 // Make i the index of the second char of the digraph for simplicity. Forgetting 125 // to do that results in an infinite recursion so take care! 126 ++i; 127 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR); 128 codesDest[(i - 1) * (BYTES_IN_ONE_CHAR / sizeof(codesDest[0]))] = 129 replacementCodePoint; 130 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, 131 codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, 132 bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1, 133 codesRemain - i - 1, currentDepth + 1, codesDest + i, correction, 134 queuePool, digraphs, digraphsSize); 135 136 // Copy the second char of the digraph in place, then continue processing on 137 // the remaining part of the word. 138 // In our example, after "pru" in the buffer copy the "e", and continue on "fen" 139 memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); 140 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, 141 codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, 142 bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i, 143 currentDepth + 1, codesDest + i, correction, queuePool, digraphs, 144 digraphsSize); 145 return; 146 } 147 } 148 } 149 150 // If we come here, we hit the end of the word: let's check it against the dictionary. 151 // In our example, we'll come here once for "prufen" and then once for "pruefen". 152 // If the word contains several digraphs, we'll come it for the product of them. 153 // eg. if the word is "ueberpruefen" we'll test, in order, against 154 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". 155 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain; 156 if (0 != remainingBytes) { 157 memcpy(codesDest, codesSrc, remainingBytes); 158 memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain], 159 sizeof(int) * codesRemain); 160 memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain], 161 sizeof(int) * codesRemain); 162 } 163 164 getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, 165 startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction, 166 queuePool); 167 } 168 169 // bigramMap contains the association <bigram address> -> <bigram frequency> 170 // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter 171 // in bigram_dictionary.cpp 172 int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, 173 WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, 174 const int *ycoordinates, const int *codes, const int codesSize, 175 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 176 const bool useFullEditDistance, unsigned short *outWords, int *frequencies) { 177 178 queuePool->clearAll(); 179 Correction* masterCorrection = correction; 180 correction->resetCorrection(); 181 if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & FLAGS) 182 { // Incrementally tune the word and try all possibilities 183 int codesBuffer[getCodesBufferSize(codes, codesSize)]; 184 int xCoordinatesBuffer[codesSize]; 185 int yCoordinatesBuffer[codesSize]; 186 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, 187 xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, 188 useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, 189 queuePool, GERMAN_UMLAUT_DIGRAPHS, 190 sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); 191 } else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) { 192 int codesBuffer[getCodesBufferSize(codes, codesSize)]; 193 int xCoordinatesBuffer[codesSize]; 194 int yCoordinatesBuffer[codesSize]; 195 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, 196 xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, 197 useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, 198 queuePool, FRENCH_LIGATURES_DIGRAPHS, 199 sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); 200 } else { // Normal processing 201 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, 202 bigramMap, bigramFilter, useFullEditDistance, masterCorrection, queuePool); 203 } 204 205 PROF_START(20); 206 if (DEBUG_DICT) { 207 float ns = queuePool->getMasterQueue()->getHighestNormalizedScore( 208 proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); 209 ns += 0; 210 AKLOGI("Max normalized score = %f", ns); 211 } 212 const int suggestedWordsCount = 213 queuePool->getMasterQueue()->outputSuggestions( 214 proximityInfo->getPrimaryInputWord(), codesSize, frequencies, outWords); 215 216 if (DEBUG_DICT) { 217 float ns = queuePool->getMasterQueue()->getHighestNormalizedScore( 218 proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); 219 ns += 0; 220 AKLOGI("Returning %d words", suggestedWordsCount); 221 /// Print the returned words 222 for (int j = 0; j < suggestedWordsCount; ++j) { 223 short unsigned int* w = outWords + j * MAX_WORD_LENGTH; 224 char s[MAX_WORD_LENGTH]; 225 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; 226 AKLOGI("%s %i", s, frequencies[j]); 227 } 228 } 229 PROF_END(20); 230 PROF_CLOSE; 231 return suggestedWordsCount; 232 } 233 234 void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, 235 const int *xcoordinates, const int *ycoordinates, const int *codes, 236 const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 237 const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) { 238 239 PROF_OPEN; 240 PROF_START(0); 241 PROF_END(0); 242 243 PROF_START(1); 244 getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter, 245 useFullEditDistance, inputLength, correction, queuePool); 246 PROF_END(1); 247 248 PROF_START(2); 249 // Note: This line is intentionally left blank 250 PROF_END(2); 251 252 PROF_START(3); 253 // Note: This line is intentionally left blank 254 PROF_END(3); 255 256 PROF_START(4); 257 bool hasAutoCorrectionCandidate = false; 258 WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); 259 if (masterQueue->size() > 0) { 260 float nsForMaster = masterQueue->getHighestNormalizedScore( 261 proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); 262 hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); 263 } 264 PROF_END(4); 265 266 PROF_START(5); 267 // Multiple word suggestions 268 if (SUGGEST_MULTIPLE_WORDS 269 && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { 270 getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, 271 useFullEditDistance, inputLength, correction, queuePool, 272 hasAutoCorrectionCandidate); 273 } 274 PROF_END(5); 275 276 PROF_START(6); 277 // Note: This line is intentionally left blank 278 PROF_END(6); 279 280 if (DEBUG_DICT) { 281 queuePool->dumpSubQueue1TopSuggestions(); 282 for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { 283 WordsPriorityQueue* queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); 284 if (queue->size() > 0) { 285 WordsPriorityQueue::SuggestedWord* sw = queue->top(); 286 const int score = sw->mScore; 287 const unsigned short* word = sw->mWord; 288 const int wordLength = sw->mWordLength; 289 float ns = Correction::RankingAlgorithm::calcNormalizedScore( 290 proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); 291 ns += 0; 292 AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, 293 (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD)); 294 DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); 295 DUMP_WORD(word, wordLength); 296 } 297 } 298 } 299 } 300 301 void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, 302 const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { 303 if (DEBUG_DICT) { 304 AKLOGI("initSuggest"); 305 DUMP_WORD_INT(codes, inputLength); 306 } 307 proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); 308 const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); 309 correction->initCorrection(proximityInfo, inputLength, maxDepth); 310 } 311 312 static const char QUOTE = '\''; 313 static const char SPACE = ' '; 314 315 void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, 316 const int *xcoordinates, const int *ycoordinates, const int *codes, 317 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 318 const bool useFullEditDistance, const int inputLength, 319 Correction *correction, WordsPriorityQueuePool *queuePool) { 320 initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); 321 getSuggestionCandidates(useFullEditDistance, inputLength, bigramMap, bigramFilter, correction, 322 queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); 323 } 324 325 void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, 326 const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 327 Correction *correction, WordsPriorityQueuePool *queuePool, 328 const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { 329 uint8_t totalTraverseCount = correction->pushAndGetTotalTraverseCount(); 330 if (DEBUG_DICT) { 331 AKLOGI("Traverse count %d", totalTraverseCount); 332 } 333 if (totalTraverseCount > MULTIPLE_WORDS_SUGGESTION_MAX_TOTAL_TRAVERSE_COUNT) { 334 if (DEBUG_DICT) { 335 AKLOGI("Abort traversing %d", totalTraverseCount); 336 } 337 return; 338 } 339 // TODO: Remove setCorrectionParams 340 correction->setCorrectionParams(0, 0, 0, 341 -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, 342 doAutoCompletion, maxErrors); 343 int rootPosition = ROOT_POS; 344 // Get the number of children of root, then increment the position 345 int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); 346 int outputIndex = 0; 347 348 correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); 349 350 // Depth first search 351 while (outputIndex >= 0) { 352 if (correction->initProcessState(outputIndex)) { 353 int siblingPos = correction->getTreeSiblingPos(outputIndex); 354 int firstChildPos; 355 356 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, 357 bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos, 358 queuePool, currentWordIndex); 359 // Update next sibling pos 360 correction->setTreeSiblingPos(outputIndex, siblingPos); 361 362 if (needsToTraverseChildrenNodes) { 363 // Goes to child node 364 outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos); 365 } 366 } else { 367 // Goes to parent sibling node 368 outputIndex = correction->getTreeParentIndex(outputIndex); 369 } 370 } 371 } 372 373 inline void UnigramDictionary::onTerminal(const int probability, 374 const TerminalAttributes& terminalAttributes, Correction *correction, 375 WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, 376 const int currentWordIndex) { 377 const int inputIndex = correction->getInputIndex(); 378 const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; 379 380 int wordLength; 381 unsigned short* wordPointer; 382 383 if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { 384 WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); 385 const int finalProbability = 386 correction->getFinalProbability(probability, &wordPointer, &wordLength); 387 if (finalProbability != NOT_A_PROBABILITY) { 388 addWord(wordPointer, wordLength, finalProbability, masterQueue); 389 390 const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0; 391 // Please note that the shortcut candidates will be added to the master queue only. 392 TerminalAttributes::ShortcutIterator iterator = 393 terminalAttributes.getShortcutIterator(); 394 while (iterator.hasNextShortcutTarget()) { 395 // TODO: addWord only supports weak ordering, meaning we have no means 396 // to control the order of the shortcuts relative to one another or to the word. 397 // We need to either modulate the probability of each shortcut according 398 // to its own shortcut probability or to make the queue 399 // so that the insert order is protected inside the queue for words 400 // with the same score. For the moment we use -1 to make sure the shortcut will 401 // never be in front of the word. 402 uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; 403 const int shortcutTargetStringLength = iterator.getNextShortcutTarget( 404 MAX_WORD_LENGTH_INTERNAL, shortcutTarget); 405 addWord(shortcutTarget, shortcutTargetStringLength, shortcutProbability, 406 masterQueue); 407 } 408 } 409 } 410 411 // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH 412 // or more length. 413 if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { 414 WordsPriorityQueue *subQueue; 415 subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex); 416 if (!subQueue) { 417 return; 418 } 419 const int finalProbability = correction->getFinalProbabilityForSubQueue( 420 probability, &wordPointer, &wordLength, inputIndex); 421 addWord(wordPointer, wordLength, finalProbability, subQueue); 422 } 423 } 424 425 int UnigramDictionary::getSubStringSuggestion( 426 ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, 427 const int *codes, const bool useFullEditDistance, Correction *correction, 428 WordsPriorityQueuePool* queuePool, const int inputLength, 429 const bool hasAutoCorrectionCandidate, const int currentWordIndex, 430 const int inputWordStartPos, const int inputWordLength, 431 const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, 432 int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) { 433 if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) { 434 return FLAG_MULTIPLE_SUGGEST_ABORT; 435 } 436 437 ///////////////////////////////////////////// 438 // safety net for multiple word suggestion // 439 // TODO: Remove this safety net // 440 ///////////////////////////////////////////// 441 int smallWordCount = 0; 442 int singleLetterWordCount = 0; 443 if (inputWordLength == 1) { 444 ++singleLetterWordCount; 445 } 446 if (inputWordLength <= 2) { 447 // small word == single letter or 2-letter word 448 ++smallWordCount; 449 } 450 for (int i = 0; i < currentWordIndex; ++i) { 451 const int length = wordLengthArray[i]; 452 if (length == 1) { 453 ++singleLetterWordCount; 454 // Safety net to avoid suggesting sequential single letter words 455 if (i < (currentWordIndex - 1)) { 456 if (wordLengthArray[i + 1] == 1) { 457 return FLAG_MULTIPLE_SUGGEST_ABORT; 458 } 459 } else if (inputWordLength == 1) { 460 return FLAG_MULTIPLE_SUGGEST_ABORT; 461 } 462 } 463 if (length <= 2) { 464 ++smallWordCount; 465 } 466 // Safety net to avoid suggesting multiple words with many (4 or more, for now) small words 467 if (singleLetterWordCount >= 3 || smallWordCount >= 4) { 468 return FLAG_MULTIPLE_SUGGEST_ABORT; 469 } 470 } 471 ////////////////////////////////////////////// 472 // TODO: Remove the safety net above // 473 ////////////////////////////////////////////// 474 475 unsigned short* tempOutputWord = 0; 476 int nextWordLength = 0; 477 // TODO: Optimize init suggestion 478 initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, 479 inputLength, correction); 480 481 int freq = getMostFrequentWordLike( 482 inputWordStartPos, inputWordLength, proximityInfo, mWord); 483 if (freq > 0) { 484 nextWordLength = inputWordLength; 485 tempOutputWord = mWord; 486 } else if (!hasAutoCorrectionCandidate) { 487 if (inputWordStartPos > 0) { 488 const int offset = inputWordStartPos; 489 initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], 490 codes + offset, inputWordLength, correction); 491 queuePool->clearSubQueue(currentWordIndex); 492 // TODO: pass the bigram list for substring suggestion 493 getSuggestionCandidates(useFullEditDistance, inputWordLength, 494 0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool, 495 false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); 496 if (DEBUG_DICT) { 497 if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { 498 AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); 499 for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { 500 queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord(); 501 } 502 } 503 } 504 } 505 WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); 506 // TODO: Return the correct value depending on doAutoCompletion 507 if (!queue || queue->size() <= 0) { 508 return FLAG_MULTIPLE_SUGGEST_ABORT; 509 } 510 int score = 0; 511 const float ns = queue->getHighestNormalizedScore( 512 proximityInfo->getPrimaryInputWord(), inputWordLength, 513 &tempOutputWord, &score, &nextWordLength); 514 if (DEBUG_DICT) { 515 AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score); 516 } 517 // Two words correction won't be done if the score of the first word doesn't exceed the 518 // threshold. 519 if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 520 || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { 521 return FLAG_MULTIPLE_SUGGEST_SKIP; 522 } 523 freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); 524 } 525 if (DEBUG_DICT) { 526 AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)" 527 , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, 528 wordLengthArray[0]); 529 } 530 if (freq <= 0 || nextWordLength <= 0 531 || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) { 532 return FLAG_MULTIPLE_SUGGEST_SKIP; 533 } 534 for (int i = 0; i < nextWordLength; ++i) { 535 outputWord[outputWordStartPos + i] = tempOutputWord[i]; 536 } 537 538 // Put output values 539 freqArray[currentWordIndex] = freq; 540 // TODO: put output length instead of input length 541 wordLengthArray[currentWordIndex] = inputWordLength; 542 const int tempOutputWordLength = outputWordStartPos + nextWordLength; 543 if (outputWordLength) { 544 *outputWordLength = tempOutputWordLength; 545 } 546 547 if ((inputWordStartPos + inputWordLength) < inputLength) { 548 if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { 549 return FLAG_MULTIPLE_SUGGEST_SKIP; 550 } 551 outputWord[tempOutputWordLength] = SPACE; 552 if (outputWordLength) { 553 ++*outputWordLength; 554 } 555 } else if (currentWordIndex >= 1) { 556 // TODO: Handle 3 or more words 557 const int pairFreq = correction->getFreqForSplitMultipleWords( 558 freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord); 559 if (DEBUG_DICT) { 560 DUMP_WORD(outputWord, tempOutputWordLength); 561 for (int i = 0; i < currentWordIndex + 1; ++i) { 562 AKLOGI("Split %d,%d words: freq = %d, length = %d", i, currentWordIndex + 1, 563 freqArray[i], wordLengthArray[i]); 564 } 565 AKLOGI("Split two words: freq = %d, length = %d, %d, isSpace ? %d", pairFreq, 566 inputLength, tempOutputWordLength, isSpaceProximity); 567 } 568 addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue()); 569 } 570 return FLAG_MULTIPLE_SUGGEST_CONTINUE; 571 } 572 573 void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, 574 const int *xcoordinates, const int *ycoordinates, const int *codes, 575 const bool useFullEditDistance, const int inputLength, 576 Correction *correction, WordsPriorityQueuePool* queuePool, 577 const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, 578 const int outputWordLength, int *freqArray, int* wordLengthArray, 579 unsigned short* outputWord) { 580 if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { 581 // Return if the last word index 582 return; 583 } 584 if (startWordIndex >= 1 585 && (hasAutoCorrectionCandidate 586 || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { 587 // Do not suggest 3+ words if already has auto correction candidate 588 return; 589 } 590 for (int i = startInputPos + 1; i < inputLength; ++i) { 591 if (DEBUG_CORRECTION_FREQ) { 592 AKLOGI("Multi words(%d), start in %d sep %d start out %d", 593 startWordIndex, startInputPos, i, outputWordLength); 594 DUMP_WORD(outputWord, outputWordLength); 595 } 596 int tempOutputWordLength = 0; 597 // Current word 598 int inputWordStartPos = startInputPos; 599 int inputWordLength = i - startInputPos; 600 const int suggestionFlag = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, 601 codes, useFullEditDistance, correction, queuePool, inputLength, 602 hasAutoCorrectionCandidate, startWordIndex, inputWordStartPos, inputWordLength, 603 outputWordLength, true /* not used */, freqArray, wordLengthArray, outputWord, 604 &tempOutputWordLength); 605 if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_ABORT) { 606 // TODO: break here 607 continue; 608 } else if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_SKIP) { 609 continue; 610 } 611 612 if (DEBUG_CORRECTION_FREQ) { 613 AKLOGI("Do missing space correction"); 614 } 615 // Next word 616 // Missing space 617 inputWordStartPos = i; 618 inputWordLength = inputLength - i; 619 if(getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, 620 useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, 621 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, 622 false /* missing space */, freqArray, wordLengthArray, outputWord, 0) 623 != FLAG_MULTIPLE_SUGGEST_CONTINUE) { 624 getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, 625 useFullEditDistance, inputLength, correction, queuePool, 626 hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1, 627 tempOutputWordLength, freqArray, wordLengthArray, outputWord); 628 } 629 630 // Mistyped space 631 ++inputWordStartPos; 632 --inputWordLength; 633 634 if (inputWordLength <= 0) { 635 continue; 636 } 637 638 const int x = xcoordinates[inputWordStartPos - 1]; 639 const int y = ycoordinates[inputWordStartPos - 1]; 640 if (!proximityInfo->hasSpaceProximity(x, y)) { 641 continue; 642 } 643 644 if (DEBUG_CORRECTION_FREQ) { 645 AKLOGI("Do mistyped space correction"); 646 } 647 getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, 648 useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, 649 startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, 650 true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0); 651 } 652 } 653 654 void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, 655 const int *xcoordinates, const int *ycoordinates, const int *codes, 656 const bool useFullEditDistance, const int inputLength, 657 Correction *correction, WordsPriorityQueuePool* queuePool, 658 const bool hasAutoCorrectionCandidate) { 659 if (inputLength >= MAX_WORD_LENGTH) return; 660 if (DEBUG_DICT) { 661 AKLOGI("--- Suggest multiple words"); 662 } 663 664 // Allocating fixed length array on stack 665 unsigned short outputWord[MAX_WORD_LENGTH]; 666 int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; 667 int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; 668 const int outputWordLength = 0; 669 const int startInputPos = 0; 670 const int startWordIndex = 0; 671 getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, 672 useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate, 673 startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray, 674 outputWord); 675 } 676 677 // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous 678 // interface. 679 inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, 680 const int inputLength, ProximityInfo *proximityInfo, unsigned short *word) { 681 uint16_t inWord[inputLength]; 682 683 for (int i = 0; i < inputLength; ++i) { 684 inWord[i] = (uint16_t)proximityInfo->getPrimaryCharAt(startInputIndex + i); 685 } 686 return getMostFrequentWordLikeInner(inWord, inputLength, word); 687 } 688 689 // This function will take the position of a character array within a CharGroup, 690 // and check it actually like-matches the word in inWord starting at startInputIndex, 691 // that is, it matches it with case and accents squashed. 692 // The function returns true if there was a full match, false otherwise. 693 // The function will copy on-the-fly the characters in the CharGroup to outNewWord. 694 // It will also place the end position of the array in outPos; in outInputIndex, 695 // it will place the index of the first char AFTER the match if there was a match, 696 // and the initial position if there was not. It makes sense because if there was 697 // a match we want to continue searching, but if there was not, we want to go to 698 // the next CharGroup. 699 // In and out parameters may point to the same location. This function takes care 700 // not to use any input parameters after it wrote into its outputs. 701 static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, 702 const uint8_t* const root, const int startPos, 703 const uint16_t* const inWord, const int startInputIndex, 704 int32_t* outNewWord, int* outInputIndex, int* outPos) { 705 const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); 706 int pos = startPos; 707 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 708 int32_t baseChar = toBaseLowerCase(character); 709 const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); 710 711 if (baseChar != wChar) { 712 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; 713 *outInputIndex = startInputIndex; 714 return false; 715 } 716 int inputIndex = startInputIndex; 717 outNewWord[inputIndex] = character; 718 if (hasMultipleChars) { 719 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 720 while (NOT_A_CHARACTER != character) { 721 baseChar = toBaseLowerCase(character); 722 if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { 723 *outPos = BinaryFormat::skipOtherCharacters(root, pos); 724 *outInputIndex = startInputIndex; 725 return false; 726 } 727 outNewWord[inputIndex] = character; 728 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 729 } 730 } 731 *outInputIndex = inputIndex + 1; 732 *outPos = pos; 733 return true; 734 } 735 736 // This function is invoked when a word like the word searched for is found. 737 // It will compare the frequency to the max frequency, and if greater, will 738 // copy the word into the output buffer. In output value maxFreq, it will 739 // write the new maximum frequency if it changed. 740 static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length, 741 short unsigned int* outWord, int* maxFreq) { 742 if (freq > *maxFreq) { 743 for (int q = 0; q < length; ++q) 744 outWord[q] = newWord[q]; 745 outWord[length] = 0; 746 *maxFreq = freq; 747 } 748 } 749 750 // Will find the highest frequency of the words like the one passed as an argument, 751 // that is, everything that only differs by case/accents. 752 int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord, 753 const int length, short unsigned int* outWord) { 754 int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; 755 int depth = 0; 756 int maxFreq = -1; 757 const uint8_t* const root = DICT_ROOT; 758 759 int startPos = 0; 760 mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); 761 mStackInputIndex[0] = 0; 762 mStackSiblingPos[0] = startPos; 763 while (depth >= 0) { 764 const int charGroupCount = mStackChildCount[depth]; 765 int pos = mStackSiblingPos[depth]; 766 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) { 767 int inputIndex = mStackInputIndex[depth]; 768 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 769 // Test whether all chars in this group match with the word we are searching for. If so, 770 // we want to traverse its children (or if the length match, evaluate its frequency). 771 // Note that this function will output the position regardless, but will only write 772 // into inputIndex if there is a match. 773 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, 774 inputIndex, newWord, &inputIndex, &pos); 775 if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { 776 const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); 777 onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); 778 } 779 pos = BinaryFormat::skipFrequency(flags, pos); 780 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); 781 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); 782 // If we had a match and the word has children, we want to traverse them. We don't have 783 // to traverse words longer than the one we are searching for, since they will not match 784 // anyway, so don't traverse unless inputIndex < length. 785 if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) { 786 // Save position for this depth, to get back to this once children are done 787 mStackChildCount[depth] = charGroupIndex; 788 mStackSiblingPos[depth] = siblingPos; 789 // Prepare stack values for next depth 790 ++depth; 791 int childrenPos = childrenNodePos; 792 mStackChildCount[depth] = 793 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos); 794 mStackSiblingPos[depth] = childrenPos; 795 mStackInputIndex[depth] = inputIndex; 796 pos = childrenPos; 797 // Go to the next depth level. 798 ++depth; 799 break; 800 } else { 801 // No match, or no children, or word too long to ever match: go the next sibling. 802 pos = siblingPos; 803 } 804 } 805 --depth; 806 } 807 return maxFreq; 808 } 809 810 int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const { 811 const uint8_t* const root = DICT_ROOT; 812 int pos = BinaryFormat::getTerminalPosition(root, inWord, length); 813 if (NOT_VALID_WORD == pos) { 814 return NOT_A_PROBABILITY; 815 } 816 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 817 const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); 818 if (hasMultipleChars) { 819 pos = BinaryFormat::skipOtherCharacters(root, pos); 820 } else { 821 BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); 822 } 823 const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); 824 return unigramFreq; 825 } 826 827 // TODO: remove this function. 828 int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, 829 int length) const { 830 return -1; 831 } 832 833 // ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. 834 // If the return value is false, then the caller should read in the output "nextSiblingPosition" 835 // to find out the address of the next sibling node and pass it to a new call of processCurrentNode. 836 // It is worthy to note that when false is returned, the output values other than 837 // nextSiblingPosition are undefined. 838 // If the return value is true, then the caller must proceed to traverse the children of this 839 // node. processCurrentNode will output the information about the children: their count in 840 // newCount, their position in newChildrenPosition, the traverseAllNodes flag in 841 // newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the 842 // diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into 843 // newOutputIndex. Please also note the following caveat: processCurrentNode does not know when 844 // there aren't any more nodes at this level, it merely returns the address of the first byte after 845 // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any 846 // given level, as output into newCount when traversing this level's parent. 847 inline bool UnigramDictionary::processCurrentNode(const int initialPos, 848 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, 849 int *newCount, int *newChildrenPosition, int *nextSiblingPosition, 850 WordsPriorityQueuePool *queuePool, const int currentWordIndex) { 851 if (DEBUG_DICT) { 852 correction->checkState(); 853 } 854 int pos = initialPos; 855 856 // Flags contain the following information: 857 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: 858 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address 859 // is on the specified number of bytes. 860 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. 861 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. 862 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) 863 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not 864 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); 865 const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); 866 const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); 867 868 bool needsToInvokeOnTerminal = false; 869 870 // This gets only ONE character from the stream. Next there will be: 871 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node 872 // else if FLAG_IS_TERMINAL: the frequency 873 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address 874 // Note that you can't have a node that both is not a terminal and has no children. 875 int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); 876 assert(NOT_A_CHARACTER != c); 877 878 // We are going to loop through each character and make it look like it's a different 879 // node each time. To do that, we will process characters in this node in order until 880 // we find the character terminator. This is signalled by getCharCode* returning 881 // NOT_A_CHARACTER. 882 // As a special case, if there is only one character in this node, we must not read the 883 // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. 884 // This way, each loop run will look like a "virtual node". 885 do { 886 // We prefetch the next char. If 'c' is the last char of this node, we will have 887 // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node 888 // should behave as a terminal or not and whether we have children. 889 const int32_t nextc = hasMultipleChars 890 ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; 891 const bool isLastChar = (NOT_A_CHARACTER == nextc); 892 // If there are more chars in this nodes, then this virtual node is not a terminal. 893 // If we are on the last char, this virtual node is a terminal if this node is. 894 const bool isTerminal = isLastChar && isTerminalNode; 895 896 Correction::CorrectionType stateType = correction->processCharAndCalcState( 897 c, isTerminal); 898 if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL 899 || stateType == Correction::ON_TERMINAL) { 900 needsToInvokeOnTerminal = true; 901 } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) { 902 // We found that this is an unrelated character, so we should give up traversing 903 // this node and its children entirely. 904 // However we may not be on the last virtual node yet so we skip the remaining 905 // characters in this node, the frequency if it's there, read the next sibling 906 // position to output it, then return false. 907 // We don't have to output other values because we return false, as in 908 // "don't traverse children". 909 if (!isLastChar) { 910 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); 911 } 912 pos = BinaryFormat::skipFrequency(flags, pos); 913 *nextSiblingPosition = 914 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 915 return false; 916 } 917 918 // Prepare for the next character. Promote the prefetched char to current char - the loop 919 // will take care of prefetching the next. If we finally found our last char, nextc will 920 // contain NOT_A_CHARACTER. 921 c = nextc; 922 } while (NOT_A_CHARACTER != c); 923 924 if (isTerminalNode) { 925 // The frequency should be here, because we come here only if this is actually 926 // a terminal node, and we are on its last char. 927 const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); 928 const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); 929 const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); 930 TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); 931 // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. 932 // bigramFilter is a bloom filter of said frequencies for even faster rejection. 933 const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, 934 unigramFreq); 935 onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, 936 currentWordIndex); 937 938 // If there are more chars in this node, then this virtual node has children. 939 // If we are on the last char, this virtual node has children if this node has. 940 const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); 941 942 // This character matched the typed character (enough to traverse the node at least) 943 // so we just evaluated it. Now we should evaluate this virtual node's children - that 944 // is, if it has any. If it has no children, we're done here - so we skip the end of 945 // the node, output the siblings position, and return false "don't traverse children". 946 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any 947 // remaining char in this group for there can't be any. 948 if (!hasChildren) { 949 pos = BinaryFormat::skipFrequency(flags, pos); 950 *nextSiblingPosition = 951 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 952 return false; 953 } 954 955 // Optimization: Prune out words that are too long compared to how much was typed. 956 if (correction->needsToPrune()) { 957 pos = BinaryFormat::skipFrequency(flags, pos); 958 *nextSiblingPosition = 959 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 960 if (DEBUG_DICT_FULL) { 961 AKLOGI("Traversing was pruned."); 962 } 963 return false; 964 } 965 } 966 967 // Now we finished processing this node, and we want to traverse children. If there are no 968 // children, we can't come here. 969 assert(BinaryFormat::hasChildrenInFlags(flags)); 970 971 // If this node was a terminal it still has the frequency under the pointer (it may have been 972 // read, but not skipped - see readFrequencyWithoutMovingPointer). 973 // Next come the children position, then possibly attributes (attributes are bigrams only for 974 // now, maybe something related to shortcuts in the future). 975 // Once this is read, we still need to output the number of nodes in the immediate children of 976 // this node, so we read and output it before returning true, as in "please traverse children". 977 pos = BinaryFormat::skipFrequency(flags, pos); 978 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); 979 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); 980 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); 981 *newChildrenPosition = childrenPos; 982 return true; 983 } 984 985 } // namespace latinime 986