1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" 18 19 #include <vector> 20 21 #include "suggest/core/dicnode/dic_node.h" 22 #include "suggest/core/dicnode/dic_node_vector.h" 23 #include "suggest/core/dictionary/ngram_listener.h" 24 #include "suggest/core/dictionary/property/bigram_property.h" 25 #include "suggest/core/dictionary/property/unigram_property.h" 26 #include "suggest/core/dictionary/property/word_property.h" 27 #include "suggest/core/session/prev_words_info.h" 28 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 29 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 30 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 31 #include "suggest/policyimpl/dictionary/utils/probability_utils.h" 32 33 namespace latinime { 34 35 // Note that there are corresponding definitions in Java side in BinaryDictionaryTests and 36 // BinaryDictionaryDecayingTests. 37 const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 38 const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 39 const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 40 const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 41 const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; 42 const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 43 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 44 45 void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 46 DicNodeVector *const childDicNodes) const { 47 if (!dicNode->hasChildren()) { 48 return; 49 } 50 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 51 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); 52 while (!readingHelper.isEnd()) { 53 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); 54 if (!ptNodeParams.isValid()) { 55 break; 56 } 57 bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); 58 if (isTerminal && mHeaderPolicy->isDecayingDict()) { 59 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose 60 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a 61 // valid terminal DicNode. 62 isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; 63 } 64 readingHelper.readNextSiblingNode(ptNodeParams); 65 if (ptNodeParams.representsNonWordInfo()) { 66 // Skip PtNodes that represent non-word information. 67 continue; 68 } 69 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), 70 ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, 71 ptNodeParams.hasChildren(), 72 ptNodeParams.isBlacklisted() 73 || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, 74 ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); 75 } 76 if (readingHelper.isError()) { 77 mIsCorrupted = true; 78 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 79 } 80 } 81 82 int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 83 const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, 84 int *const outUnigramProbability) const { 85 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 86 readingHelper.initWithPtNodePos(ptNodePos); 87 const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( 88 maxCodePointCount, outCodePoints, outUnigramProbability); 89 if (readingHelper.isError()) { 90 mIsCorrupted = true; 91 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); 92 } 93 return codePointCount; 94 } 95 96 int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, 97 const int length, const bool forceLowerCaseSearch) const { 98 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 99 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 100 const int ptNodePos = 101 readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); 102 if (readingHelper.isError()) { 103 mIsCorrupted = true; 104 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 105 } 106 return ptNodePos; 107 } 108 109 int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, 110 const int bigramProbability) const { 111 if (mHeaderPolicy->isDecayingDict()) { 112 // Both probabilities are encoded. Decode them and get probability. 113 return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); 114 } else { 115 if (unigramProbability == NOT_A_PROBABILITY) { 116 return NOT_A_PROBABILITY; 117 } else if (bigramProbability == NOT_A_PROBABILITY) { 118 return ProbabilityUtils::backoff(unigramProbability); 119 } else { 120 return bigramProbability; 121 } 122 } 123 } 124 125 int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos, 126 const int ptNodePos) const { 127 if (ptNodePos == NOT_A_DICT_POS) { 128 return NOT_A_PROBABILITY; 129 } 130 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 131 if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { 132 return NOT_A_PROBABILITY; 133 } 134 if (prevWordsPtNodePos) { 135 const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); 136 BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); 137 while (bigramsIt.hasNext()) { 138 bigramsIt.next(); 139 if (bigramsIt.getBigramPos() == ptNodePos 140 && bigramsIt.getProbability() != NOT_A_PROBABILITY) { 141 return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); 142 } 143 } 144 return NOT_A_PROBABILITY; 145 } 146 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 147 } 148 149 void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos, 150 NgramListener *const listener) const { 151 if (!prevWordsPtNodePos) { 152 return; 153 } 154 const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); 155 BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); 156 while (bigramsIt.hasNext()) { 157 bigramsIt.next(); 158 listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos()); 159 } 160 } 161 162 int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 163 if (ptNodePos == NOT_A_DICT_POS) { 164 return NOT_A_DICT_POS; 165 } 166 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 167 if (ptNodeParams.isDeleted()) { 168 return NOT_A_DICT_POS; 169 } 170 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( 171 ptNodeParams.getTerminalId()); 172 } 173 174 int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 175 if (ptNodePos == NOT_A_DICT_POS) { 176 return NOT_A_DICT_POS; 177 } 178 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 179 if (ptNodeParams.isDeleted()) { 180 return NOT_A_DICT_POS; 181 } 182 return mBuffers->getBigramDictContent()->getBigramListHeadPos( 183 ptNodeParams.getTerminalId()); 184 } 185 186 bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, 187 const UnigramProperty *const unigramProperty) { 188 if (!mBuffers->isUpdatable()) { 189 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 190 return false; 191 } 192 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 193 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 194 mDictBuffer->getTailPosition()); 195 return false; 196 } 197 if (length > MAX_WORD_LENGTH) { 198 AKLOGE("The word is too long to insert to the dictionary, length: %d", length); 199 return false; 200 } 201 for (const auto &shortcut : unigramProperty->getShortcuts()) { 202 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 203 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", 204 shortcut.getTargetCodePoints()->size()); 205 return false; 206 } 207 } 208 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 209 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 210 bool addedNewUnigram = false; 211 int codePointsToAdd[MAX_WORD_LENGTH]; 212 int codePointCountToAdd = length; 213 memmove(codePointsToAdd, word, sizeof(int) * length); 214 if (unigramProperty->representsBeginningOfSentence()) { 215 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, 216 codePointCountToAdd, MAX_WORD_LENGTH); 217 } 218 if (codePointCountToAdd <= 0) { 219 return false; 220 } 221 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, 222 unigramProperty, &addedNewUnigram)) { 223 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { 224 mUnigramCount++; 225 } 226 if (unigramProperty->getShortcuts().size() > 0) { 227 // Add shortcut target. 228 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 229 false /* forceLowerCaseSearch */); 230 if (wordPos == NOT_A_DICT_POS) { 231 AKLOGE("Cannot find terminal PtNode position to add shortcut target."); 232 return false; 233 } 234 for (const auto &shortcut : unigramProperty->getShortcuts()) { 235 if (!mUpdatingHelper.addShortcutTarget(wordPos, 236 shortcut.getTargetCodePoints()->data(), 237 shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { 238 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " 239 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), 240 shortcut.getProbability()); 241 return false; 242 } 243 } 244 } 245 return true; 246 } else { 247 return false; 248 } 249 } 250 251 bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { 252 if (!mBuffers->isUpdatable()) { 253 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); 254 return false; 255 } 256 const int ptNodePos = getTerminalPtNodePositionOfWord(word, length, 257 false /* forceLowerCaseSearch */); 258 if (ptNodePos == NOT_A_DICT_POS) { 259 return false; 260 } 261 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 262 if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { 263 AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); 264 return false; 265 } 266 if (!ptNodeParams.representsNonWordInfo()) { 267 mUnigramCount--; 268 } 269 return true; 270 } 271 272 bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, 273 const BigramProperty *const bigramProperty) { 274 if (!mBuffers->isUpdatable()) { 275 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 276 return false; 277 } 278 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 279 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 280 mDictBuffer->getTailPosition()); 281 return false; 282 } 283 if (!prevWordsInfo->isValid()) { 284 AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); 285 return false; 286 } 287 if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 288 AKLOGE("The word is too long to insert the ngram to the dictionary. " 289 "length: %d", bigramProperty->getTargetCodePoints()->size()); 290 return false; 291 } 292 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 293 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 294 false /* tryLowerCaseSearch */); 295 const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos); 296 // TODO: Support N-gram. 297 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 298 if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { 299 const std::vector<UnigramProperty::ShortcutProperty> shortcuts; 300 const UnigramProperty beginningOfSentenceUnigramProperty( 301 true /* representsBeginningOfSentence */, true /* isNotAWord */, 302 false /* isBlacklisted */, MAX_PROBABILITY /* probability */, 303 NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); 304 if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), 305 prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), 306 &beginningOfSentenceUnigramProperty)) { 307 AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); 308 return false; 309 } 310 // Refresh Terminal PtNode positions. 311 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 312 false /* tryLowerCaseSearch */); 313 } else { 314 return false; 315 } 316 } 317 const int word1Pos = getTerminalPtNodePositionOfWord( 318 bigramProperty->getTargetCodePoints()->data(), 319 bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); 320 if (word1Pos == NOT_A_DICT_POS) { 321 return false; 322 } 323 bool addedNewEntry = false; 324 if (mUpdatingHelper.addNgramEntry(prevWordsPtNodePosView, word1Pos, bigramProperty, 325 &addedNewEntry)) { 326 if (addedNewEntry) { 327 mBigramCount++; 328 } 329 return true; 330 } else { 331 return false; 332 } 333 } 334 335 bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, 336 const int *const word, const int length) { 337 if (!mBuffers->isUpdatable()) { 338 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 339 return false; 340 } 341 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 342 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 343 mDictBuffer->getTailPosition()); 344 return false; 345 } 346 if (!prevWordsInfo->isValid()) { 347 AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); 348 return false; 349 } 350 if (length > MAX_WORD_LENGTH) { 351 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); 352 } 353 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 354 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 355 false /* tryLowerCaseSerch */); 356 const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos); 357 // TODO: Support N-gram. 358 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 359 return false; 360 } 361 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 362 false /* forceLowerCaseSearch */); 363 if (wordPos == NOT_A_DICT_POS) { 364 return false; 365 } 366 if (mUpdatingHelper.removeNgramEntry(prevWordsPtNodePosView, wordPos)) { 367 mBigramCount--; 368 return true; 369 } else { 370 return false; 371 } 372 } 373 374 bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { 375 if (!mBuffers->isUpdatable()) { 376 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); 377 return false; 378 } 379 if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { 380 AKLOGE("Cannot flush the dictionary to file."); 381 mIsCorrupted = true; 382 return false; 383 } 384 return true; 385 } 386 387 bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { 388 if (!mBuffers->isUpdatable()) { 389 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 390 return false; 391 } 392 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { 393 AKLOGE("Cannot flush the dictionary to file with GC."); 394 mIsCorrupted = true; 395 return false; 396 } 397 return true; 398 } 399 400 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 401 if (!mBuffers->isUpdatable()) { 402 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 403 return false; 404 } 405 if (mBuffers->isNearSizeLimit()) { 406 // Additional buffer size is near the limit. 407 return true; 408 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() 409 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { 410 // Total extended region size of the trie exceeds the limit. 411 return true; 412 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 413 && mDictBuffer->getUsedAdditionalBufferSize() > 0) { 414 // Needs to reduce dictionary size. 415 return true; 416 } else if (mHeaderPolicy->isDecayingDict()) { 417 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, 418 mHeaderPolicy); 419 } 420 return false; 421 } 422 423 void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, 424 char *const outResult, const int maxResultLength) { 425 const int compareLength = queryLength + 1 /* terminator */; 426 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { 427 snprintf(outResult, maxResultLength, "%d", mUnigramCount); 428 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { 429 snprintf(outResult, maxResultLength, "%d", mBigramCount); 430 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { 431 snprintf(outResult, maxResultLength, "%d", 432 mHeaderPolicy->isDecayingDict() ? 433 ForgettingCurveUtils::getUnigramCountHardLimit( 434 mHeaderPolicy->getMaxUnigramCount()) : 435 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 436 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { 437 snprintf(outResult, maxResultLength, "%d", 438 mHeaderPolicy->isDecayingDict() ? 439 ForgettingCurveUtils::getBigramCountHardLimit( 440 mHeaderPolicy->getMaxBigramCount()) : 441 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 442 } 443 } 444 445 const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, 446 const int codePointCount) const { 447 const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, 448 false /* forceLowerCaseSearch */); 449 if (ptNodePos == NOT_A_DICT_POS) { 450 AKLOGE("getWordProperty is called for invalid word."); 451 return WordProperty(); 452 } 453 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 454 std::vector<int> codePointVector(ptNodeParams.getCodePoints(), 455 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); 456 const ProbabilityEntry probabilityEntry = 457 mBuffers->getLanguageModelDictContent()->getProbabilityEntry( 458 ptNodeParams.getTerminalId()); 459 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 460 // Fetch bigram information. 461 std::vector<BigramProperty> bigrams; 462 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); 463 if (bigramListPos != NOT_A_DICT_POS) { 464 int bigramWord1CodePoints[MAX_WORD_LENGTH]; 465 const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); 466 const TerminalPositionLookupTable *const terminalPositionLookupTable = 467 mBuffers->getTerminalPositionLookupTable(); 468 bool hasNext = true; 469 int readingPos = bigramListPos; 470 while (hasNext) { 471 const BigramEntry bigramEntry = 472 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 473 hasNext = bigramEntry.hasNext(); 474 const int word1TerminalId = bigramEntry.getTargetTerminalId(); 475 const int word1TerminalPtNodePos = 476 terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); 477 if (word1TerminalPtNodePos == NOT_A_DICT_POS) { 478 continue; 479 } 480 // Word (unigram) probability 481 int word1Probability = NOT_A_PROBABILITY; 482 const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 483 word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, 484 &word1Probability); 485 const std::vector<int> word1(bigramWord1CodePoints, 486 bigramWord1CodePoints + codePointCount); 487 const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); 488 const int probability = bigramEntry.hasHistoricalInfo() ? 489 ForgettingCurveUtils::decodeProbability( 490 bigramEntry.getHistoricalInfo(), mHeaderPolicy) : 491 bigramEntry.getProbability(); 492 bigrams.emplace_back(&word1, probability, 493 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 494 historicalInfo->getCount()); 495 } 496 } 497 // Fetch shortcut information. 498 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 499 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); 500 if (shortcutPos != NOT_A_DICT_POS) { 501 int shortcutTarget[MAX_WORD_LENGTH]; 502 const ShortcutDictContent *const shortcutDictContent = 503 mBuffers->getShortcutDictContent(); 504 bool hasNext = true; 505 while (hasNext) { 506 int shortcutTargetLength = 0; 507 int shortcutProbability = NOT_A_PROBABILITY; 508 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, 509 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); 510 const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); 511 shortcuts.emplace_back(&target, shortcutProbability); 512 } 513 } 514 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), 515 ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), 516 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 517 historicalInfo->getCount(), &shortcuts); 518 return WordProperty(&codePointVector, &unigramProperty, &bigrams); 519 } 520 521 int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 522 int *const outCodePointCount) { 523 *outCodePointCount = 0; 524 if (token == 0) { 525 mTerminalPtNodePositionsForIteratingWords.clear(); 526 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 527 &mTerminalPtNodePositionsForIteratingWords); 528 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 529 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 530 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 531 } 532 const int terminalPtNodePositionsVectorSize = 533 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 534 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 535 AKLOGE("Given token %d is invalid.", token); 536 return 0; 537 } 538 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 539 int unigramProbability = NOT_A_PROBABILITY; 540 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 541 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); 542 const int nextToken = token + 1; 543 if (nextToken >= terminalPtNodePositionsVectorSize) { 544 // All words have been iterated. 545 mTerminalPtNodePositionsForIteratingWords.clear(); 546 return 0; 547 } 548 return nextToken; 549 } 550 551 } // namespace latinime 552