1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dictionary/structure/v4/ver4_patricia_trie_policy.h" 18 19 #include <array> 20 #include <vector> 21 22 #include "suggest/core/dicnode/dic_node.h" 23 #include "suggest/core/dicnode/dic_node_vector.h" 24 #include "dictionary/interface/ngram_listener.h" 25 #include "dictionary/property/ngram_context.h" 26 #include "dictionary/property/ngram_property.h" 27 #include "dictionary/property/unigram_property.h" 28 #include "dictionary/property/word_property.h" 29 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 30 #include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 31 #include "dictionary/utils/forgetting_curve_utils.h" 32 #include "dictionary/utils/multi_bigram_map.h" 33 #include "dictionary/utils/probability_utils.h" 34 #include "utils/ngram_utils.h" 35 36 namespace latinime { 37 38 // Note that there are corresponding definitions in Java side in BinaryDictionaryTests and 39 // BinaryDictionaryDecayingTests. 40 const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 41 const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 42 const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 43 const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 44 const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; 45 const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 46 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 47 48 void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 49 DicNodeVector *const childDicNodes) const { 50 if (!dicNode->hasChildren()) { 51 return; 52 } 53 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 54 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); 55 while (!readingHelper.isEnd()) { 56 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); 57 if (!ptNodeParams.isValid()) { 58 break; 59 } 60 const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); 61 const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID; 62 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), 63 wordId, ptNodeParams.getCodePointArrayView()); 64 readingHelper.readNextSiblingNode(ptNodeParams); 65 } 66 if (readingHelper.isError()) { 67 mIsCorrupted = true; 68 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 69 } 70 } 71 72 int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, 73 const int maxCodePointCount, int *const outCodePoints) const { 74 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 75 const int ptNodePos = 76 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); 77 readingHelper.initWithPtNodePos(ptNodePos); 78 const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( 79 maxCodePointCount, outCodePoints); 80 if (readingHelper.isError()) { 81 mIsCorrupted = true; 82 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); 83 } 84 return codePointCount; 85 } 86 87 int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, 88 const bool forceLowerCaseSearch) const { 89 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 90 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 91 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), 92 wordCodePoints.size(), forceLowerCaseSearch); 93 if (readingHelper.isError()) { 94 mIsCorrupted = true; 95 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 96 } 97 if (ptNodePos == NOT_A_DICT_POS) { 98 return NOT_A_WORD_ID; 99 } 100 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 101 if (ptNodeParams.isDeleted()) { 102 return NOT_A_WORD_ID; 103 } 104 return ptNodeParams.getTerminalId(); 105 } 106 107 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( 108 const WordIdArrayView prevWordIds, const int wordId, 109 MultiBigramMap *const multiBigramMap) const { 110 if (wordId == NOT_A_WORD_ID) { 111 return WordAttributes(); 112 } 113 return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, 114 false /* mustMatchAllPrevWords */, mHeaderPolicy); 115 } 116 117 int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, 118 const int wordId) const { 119 if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { 120 return NOT_A_PROBABILITY; 121 } 122 const WordAttributes wordAttributes = 123 mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, 124 true /* mustMatchAllPrevWords */, mHeaderPolicy); 125 if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { 126 return NOT_A_PROBABILITY; 127 } 128 return wordAttributes.getProbability(); 129 } 130 131 BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( 132 const int wordId) const { 133 const int shortcutPos = getShortcutPositionOfWord(wordId); 134 return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); 135 } 136 137 void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, 138 NgramListener *const listener) const { 139 if (prevWordIds.empty()) { 140 return; 141 } 142 const auto languageModelDictContent = mBuffers->getLanguageModelDictContent(); 143 for (size_t i = 1; i <= prevWordIds.size(); ++i) { 144 for (const auto entry : languageModelDictContent->getProbabilityEntries( 145 prevWordIds.limit(i))) { 146 const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry(); 147 if (!probabilityEntry.isValid()) { 148 continue; 149 } 150 int probability = NOT_A_PROBABILITY; 151 if (probabilityEntry.hasHistoricalInfo()) { 152 // TODO: Quit checking count here. 153 // If count <= 1, the word can be an invaild word. The actual probability should 154 // be checked using getWordAttributesInContext() in onVisitEntry(). 155 probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ? 156 NOT_A_PROBABILITY : 0; 157 } else { 158 probability = probabilityEntry.getProbability(); 159 } 160 listener->onVisitEntry(probability, entry.getWordId()); 161 } 162 } 163 } 164 165 int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const { 166 if (wordId == NOT_A_WORD_ID) { 167 return NOT_A_DICT_POS; 168 } 169 const int ptNodePos = 170 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); 171 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 172 if (ptNodeParams.isDeleted()) { 173 return NOT_A_DICT_POS; 174 } 175 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( 176 ptNodeParams.getTerminalId()); 177 } 178 179 bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, 180 const UnigramProperty *const unigramProperty) { 181 if (!mBuffers->isUpdatable()) { 182 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 183 return false; 184 } 185 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 186 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 187 mDictBuffer->getTailPosition()); 188 return false; 189 } 190 if (wordCodePoints.size() > MAX_WORD_LENGTH) { 191 AKLOGE("The word is too long to insert to the dictionary, length: %zd", 192 wordCodePoints.size()); 193 return false; 194 } 195 for (const auto &shortcut : unigramProperty->getShortcuts()) { 196 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 197 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", 198 shortcut.getTargetCodePoints()->size()); 199 return false; 200 } 201 } 202 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 203 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 204 bool addedNewUnigram = false; 205 int codePointsToAdd[MAX_WORD_LENGTH]; 206 int codePointCountToAdd = wordCodePoints.size(); 207 memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); 208 if (unigramProperty->representsBeginningOfSentence()) { 209 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, 210 codePointCountToAdd, MAX_WORD_LENGTH); 211 } 212 if (codePointCountToAdd <= 0) { 213 return false; 214 } 215 const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); 216 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, 217 &addedNewUnigram)) { 218 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { 219 mEntryCounters.incrementNgramCount(NgramType::Unigram); 220 } 221 if (unigramProperty->getShortcuts().size() > 0) { 222 // Add shortcut target. 223 const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); 224 if (wordId == NOT_A_WORD_ID) { 225 AKLOGE("Cannot find word id to add shortcut target."); 226 return false; 227 } 228 const int wordPos = 229 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); 230 for (const auto &shortcut : unigramProperty->getShortcuts()) { 231 if (!mUpdatingHelper.addShortcutTarget(wordPos, 232 CodePointArrayView(*shortcut.getTargetCodePoints()), 233 shortcut.getProbability())) { 234 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " 235 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), 236 shortcut.getProbability()); 237 return false; 238 } 239 } 240 } 241 return true; 242 } else { 243 return false; 244 } 245 } 246 247 bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { 248 if (!mBuffers->isUpdatable()) { 249 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); 250 return false; 251 } 252 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); 253 if (wordId == NOT_A_WORD_ID) { 254 return false; 255 } 256 const int ptNodePos = 257 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); 258 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 259 if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { 260 AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); 261 return false; 262 } 263 if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { 264 return false; 265 } 266 if (!ptNodeParams.representsNonWordInfo()) { 267 mEntryCounters.decrementNgramCount(NgramType::Unigram); 268 } 269 return true; 270 } 271 272 bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { 273 if (!mBuffers->isUpdatable()) { 274 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 275 return false; 276 } 277 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 278 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 279 mDictBuffer->getTailPosition()); 280 return false; 281 } 282 const NgramContext *const ngramContext = ngramProperty->getNgramContext(); 283 if (!ngramContext->isValid()) { 284 AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); 285 return false; 286 } 287 if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 288 AKLOGE("The word is too long to insert the ngram to the dictionary. " 289 "length: %zd", ngramProperty->getTargetCodePoints()->size()); 290 return false; 291 } 292 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; 293 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, 294 false /* tryLowerCaseSearch */); 295 if (prevWordIds.empty()) { 296 return false; 297 } 298 for (size_t i = 0; i < prevWordIds.size(); ++i) { 299 if (prevWordIds[i] != NOT_A_WORD_ID) { 300 continue; 301 } 302 if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { 303 return false; 304 } 305 const UnigramProperty beginningOfSentenceUnigramProperty( 306 true /* representsBeginningOfSentence */, true /* isNotAWord */, 307 false /* isBlacklisted */, false /* isPossiblyOffensive */, 308 MAX_PROBABILITY /* probability */, HistoricalInfo()); 309 if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), 310 &beginningOfSentenceUnigramProperty)) { 311 AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); 312 return false; 313 } 314 // Refresh word ids. 315 ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); 316 } 317 const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), 318 false /* forceLowerCaseSearch */); 319 if (wordId == NOT_A_WORD_ID) { 320 return false; 321 } 322 bool addedNewEntry = false; 323 if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { 324 if (addedNewEntry) { 325 mEntryCounters.incrementNgramCount( 326 NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); 327 } 328 return true; 329 } else { 330 return false; 331 } 332 } 333 334 bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, 335 const CodePointArrayView wordCodePoints) { 336 if (!mBuffers->isUpdatable()) { 337 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 338 return false; 339 } 340 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 341 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 342 mDictBuffer->getTailPosition()); 343 return false; 344 } 345 if (!ngramContext->isValid()) { 346 AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); 347 return false; 348 } 349 if (wordCodePoints.size() > MAX_WORD_LENGTH) { 350 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", 351 wordCodePoints.size()); 352 } 353 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; 354 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, 355 false /* tryLowerCaseSerch */); 356 if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) { 357 return false; 358 } 359 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); 360 if (wordId == NOT_A_WORD_ID) { 361 return false; 362 } 363 if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { 364 mEntryCounters.decrementNgramCount( 365 NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); 366 return true; 367 } else { 368 return false; 369 } 370 } 371 372 bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( 373 const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, 374 const bool isValidWord, const HistoricalInfo historicalInfo) { 375 if (!mBuffers->isUpdatable()) { 376 AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " 377 "dictionary."); 378 return false; 379 } 380 const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? 381 false : isValidWord; 382 int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); 383 if (wordId == NOT_A_WORD_ID) { 384 // The word is not in the dictionary. 385 const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, 386 false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, 387 NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 388 0 /* count */)); 389 if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { 390 AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); 391 return false; 392 } 393 if (!isValidWord) { 394 return true; 395 } 396 wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); 397 } 398 399 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; 400 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, 401 false /* tryLowerCaseSearch */); 402 if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { 403 if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { 404 const UnigramProperty beginningOfSentenceUnigramProperty( 405 true /* representsBeginningOfSentence */, 406 true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, 407 HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); 408 if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), 409 &beginningOfSentenceUnigramProperty)) { 410 AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); 411 return false; 412 } 413 // Refresh word ids. 414 ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); 415 } 416 // Update entries for beginning of sentence. 417 if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( 418 prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, 419 mHeaderPolicy, &mEntryCounters)) { 420 return false; 421 } 422 } 423 if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, 424 wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { 425 return false; 426 } 427 return true; 428 } 429 430 bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { 431 if (!mBuffers->isUpdatable()) { 432 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); 433 return false; 434 } 435 if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { 436 AKLOGE("Cannot flush the dictionary to file."); 437 mIsCorrupted = true; 438 return false; 439 } 440 return true; 441 } 442 443 bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { 444 if (!mBuffers->isUpdatable()) { 445 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 446 return false; 447 } 448 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { 449 AKLOGE("Cannot flush the dictionary to file with GC."); 450 mIsCorrupted = true; 451 return false; 452 } 453 return true; 454 } 455 456 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 457 if (!mBuffers->isUpdatable()) { 458 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 459 return false; 460 } 461 if (mBuffers->isNearSizeLimit()) { 462 // Additional buffer size is near the limit. 463 return true; 464 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() 465 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { 466 // Total extended region size of the trie exceeds the limit. 467 return true; 468 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 469 && mDictBuffer->getUsedAdditionalBufferSize() > 0) { 470 // Needs to reduce dictionary size. 471 return true; 472 } else if (mHeaderPolicy->isDecayingDict()) { 473 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), 474 mHeaderPolicy); 475 } 476 return false; 477 } 478 479 void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, 480 char *const outResult, const int maxResultLength) { 481 const int compareLength = queryLength + 1 /* terminator */; 482 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { 483 snprintf(outResult, maxResultLength, "%d", 484 mEntryCounters.getNgramCount(NgramType::Unigram)); 485 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { 486 snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); 487 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { 488 snprintf(outResult, maxResultLength, "%d", 489 mHeaderPolicy->isDecayingDict() ? 490 ForgettingCurveUtils::getEntryCountHardLimit( 491 mHeaderPolicy->getMaxNgramCounts().getNgramCount( 492 NgramType::Unigram)) : 493 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 494 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { 495 snprintf(outResult, maxResultLength, "%d", 496 mHeaderPolicy->isDecayingDict() ? 497 ForgettingCurveUtils::getEntryCountHardLimit( 498 mHeaderPolicy->getMaxNgramCounts().getNgramCount( 499 NgramType::Bigram)) : 500 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 501 } 502 } 503 504 const WordProperty Ver4PatriciaTriePolicy::getWordProperty( 505 const CodePointArrayView wordCodePoints) const { 506 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); 507 if (wordId == NOT_A_WORD_ID) { 508 AKLOGE("getWordProperty is called for invalid word."); 509 return WordProperty(); 510 } 511 const LanguageModelDictContent *const languageModelDictContent = 512 mBuffers->getLanguageModelDictContent(); 513 // Fetch ngram information. 514 std::vector<NgramProperty> ngrams; 515 int ngramTargetCodePoints[MAX_WORD_LENGTH]; 516 int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; 517 int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 518 bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 519 for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord( 520 mHeaderPolicy, wordId)) { 521 const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(), 522 MAX_WORD_LENGTH, ngramTargetCodePoints); 523 const WordIdArrayView prevWordIds = entry.getPrevWordIds(); 524 for (size_t i = 0; i < prevWordIds.size(); ++i) { 525 ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i], 526 MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]); 527 ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry( 528 prevWordIds[i]).representsBeginningOfSentence(); 529 if (ngramPrevWordIsBeginningOfSentense[i]) { 530 ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker( 531 ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]); 532 } 533 } 534 const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount, 535 ngramPrevWordIsBeginningOfSentense, prevWordIds.size()); 536 const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry(); 537 const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo(); 538 // TODO: Output flags in WordAttributes. 539 ngrams.emplace_back(ngramContext, 540 CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(), 541 entry.getWordAttributes().getProbability(), *historicalInfo); 542 } 543 // Fetch shortcut information. 544 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 545 int shortcutPos = getShortcutPositionOfWord(wordId); 546 if (shortcutPos != NOT_A_DICT_POS) { 547 int shortcutTarget[MAX_WORD_LENGTH]; 548 const ShortcutDictContent *const shortcutDictContent = 549 mBuffers->getShortcutDictContent(); 550 bool hasNext = true; 551 while (hasNext) { 552 int shortcutTargetLength = 0; 553 int shortcutProbability = NOT_A_PROBABILITY; 554 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, 555 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); 556 shortcuts.emplace_back( 557 CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), 558 shortcutProbability); 559 } 560 } 561 const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( 562 WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy); 563 const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); 564 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 565 const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), 566 wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(), 567 wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(), 568 *historicalInfo, std::move(shortcuts)); 569 return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); 570 } 571 572 int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 573 int *const outCodePointCount) { 574 *outCodePointCount = 0; 575 if (token == 0) { 576 mTerminalPtNodePositionsForIteratingWords.clear(); 577 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 578 &mTerminalPtNodePositionsForIteratingWords); 579 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 580 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 581 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 582 } 583 const int terminalPtNodePositionsVectorSize = 584 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 585 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 586 AKLOGE("Given token %d is invalid.", token); 587 return 0; 588 } 589 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 590 const PtNodeParams ptNodeParams = 591 mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos); 592 *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(), 593 MAX_WORD_LENGTH, outCodePoints); 594 const int nextToken = token + 1; 595 if (nextToken >= terminalPtNodePositionsVectorSize) { 596 // All words have been iterated. 597 mTerminalPtNodePositionsForIteratingWords.clear(); 598 return 0; 599 } 600 return nextToken; 601 } 602 603 } // namespace latinime 604