1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dictionary/structure/v2/patricia_trie_policy.h" 18 19 #include "defines.h" 20 #include "suggest/core/dicnode/dic_node.h" 21 #include "suggest/core/dicnode/dic_node_vector.h" 22 #include "dictionary/interface/ngram_listener.h" 23 #include "dictionary/property/ngram_context.h" 24 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 25 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" 26 #include "dictionary/utils/binary_dictionary_bigrams_iterator.h" 27 #include "dictionary/utils/multi_bigram_map.h" 28 #include "dictionary/utils/probability_utils.h" 29 #include "utils/char_utils.h" 30 31 namespace latinime { 32 33 void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 34 DicNodeVector *const childDicNodes) const { 35 if (!dicNode->hasChildren()) { 36 return; 37 } 38 int nextPos = dicNode->getChildrenPtNodeArrayPos(); 39 if (!isValidPos(nextPos)) { 40 AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd", 41 nextPos, mBuffer.size()); 42 mIsCorrupted = true; 43 ASSERT(false); 44 return; 45 } 46 const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( 47 mBuffer.data(), &nextPos); 48 for (int i = 0; i < childCount; i++) { 49 if (!isValidPos(nextPos)) { 50 AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d", 51 nextPos, mBuffer.size(), i, childCount); 52 mIsCorrupted = true; 53 ASSERT(false); 54 return; 55 } 56 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); 57 } 58 } 59 60 int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, 61 const int maxCodePointCount, int *const outCodePoints) const { 62 return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount, 63 outCodePoints, nullptr /* outUnigramProbability */); 64 } 65 // This retrieves code points and the probability of the word by its id. 66 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, 67 // it is possible to check for this with advantageous complexity. For each PtNode array, we search 68 // for PtNodes with children and compare the children position with the position we look for. 69 // When we shoot the position we look for, it means the word we look for is in the children 70 // of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a 71 // PtNode array with the last PtNode's children position still less than what we are searching for, 72 // we must descend the last PtNode's children (for example, if the word we are searching for starts 73 // with a z, it's the last PtNode of the root array, so all children addresses will be smaller 74 // than the position we look for, and we have to descend the z PtNode). 75 /* Parameters : 76 * wordId: Id of the word we are searching for. 77 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. 78 * outUnigramProbability: a pointer to an int to write the probability into. 79 * Return value : the code point count, of 0 if the word was not found. 80 */ 81 // TODO: Split this function to be more readable 82 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 83 const int wordId, const int maxCodePointCount, int *const outCodePoints, 84 int *const outUnigramProbability) const { 85 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 86 int pos = getRootPosition(); 87 int wordPos = 0; 88 const int *const codePointTable = mHeaderPolicy.getCodePointTable(); 89 if (outUnigramProbability) { 90 *outUnigramProbability = NOT_A_PROBABILITY; 91 } 92 // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will 93 // only traverse PtNodes that are actually a part of the terminal we are searching, so each 94 // time we enter this loop we are one depth level further than last time. 95 // The only reason we count PtNodes is because we want to reduce the probability of infinite 96 // looping in case there is a bug. Since we know there is an upper bound to the depth we are 97 // supposed to traverse, it does not hurt to count iterations. 98 for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { 99 int lastCandidatePtNodePos = 0; 100 // Let's loop through PtNodes in this PtNode array searching for either the terminal 101 // or one of its ascendants. 102 if (!isValidPos(pos)) { 103 AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd", 104 pos, mBuffer.size()); 105 mIsCorrupted = true; 106 ASSERT(false); 107 return 0; 108 } 109 for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( 110 mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) { 111 const int startPos = pos; 112 if (!isValidPos(pos)) { 113 AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size()); 114 mIsCorrupted = true; 115 ASSERT(false); 116 return 0; 117 } 118 const PatriciaTrieReadingUtils::NodeFlags flags = 119 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos); 120 const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 121 mBuffer.data(), codePointTable, &pos); 122 if (ptNodePos == startPos) { 123 // We found the position. Copy the rest of the code points in the buffer and return 124 // the length. 125 outCodePoints[wordPos] = character; 126 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { 127 int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 128 mBuffer.data(), codePointTable, &pos); 129 // We count code points in order to avoid infinite loops if the file is broken 130 // or if there is some other bug 131 int charCount = maxCodePointCount; 132 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { 133 outCodePoints[++wordPos] = nextChar; 134 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 135 mBuffer.data(), codePointTable, &pos); 136 } 137 } 138 if (outUnigramProbability) { 139 *outUnigramProbability = 140 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( 141 mBuffer.data(), &pos); 142 } 143 return ++wordPos; 144 } 145 // We need to skip past this PtNode, so skip any remaining code points after the 146 // first and possibly the probability. 147 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { 148 PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH, 149 codePointTable, &pos); 150 } 151 if (PatriciaTrieReadingUtils::isTerminal(flags)) { 152 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos); 153 } 154 // The fact that this PtNode has children is very important. Since we already know 155 // that this PtNode does not match, if it has no children we know it is irrelevant 156 // to what we are searching for. 157 const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); 158 // We will write in `found' whether we have passed the children position we are 159 // searching for. For example if we search for "beer", the children of b are less 160 // than the address we are searching for and the children of c are greater. When we 161 // come here for c, we realize this is too big, and that we should descend b. 162 bool found; 163 if (hasChildren) { 164 int currentPos = pos; 165 // Here comes the tricky part. First, read the children position. 166 const int childrenPos = PatriciaTrieReadingUtils 167 ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags, 168 ¤tPos); 169 if (childrenPos > ptNodePos) { 170 // If the children pos is greater than the position, it means the previous 171 // PtNode, which position is stored in lastCandidatePtNodePos, was the right 172 // one. 173 found = true; 174 } else if (1 >= ptNodeCount) { 175 // However if we are on the LAST PtNode of this array, and we have NOT shot the 176 // position we should descend THIS PtNode. So we trick the 177 // lastCandidatePtNodePos so that we will descend this PtNode, not the previous 178 // one. 179 lastCandidatePtNodePos = startPos; 180 found = true; 181 } else { 182 // Else, we should continue looking. 183 found = false; 184 } 185 } else { 186 // Even if we don't have children here, we could still be on the last PtNode of 187 // this array. If this is the case, we should descend the last PtNode that had 188 // children, and their position is already in lastCandidatePtNodePos. 189 found = (1 >= ptNodeCount); 190 } 191 192 if (found) { 193 // Okay, we found the PtNode we should descend. Its position is in 194 // the lastCandidatePtNodePos variable, so we just re-read it. 195 if (0 != lastCandidatePtNodePos) { 196 const PatriciaTrieReadingUtils::NodeFlags lastFlags = 197 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( 198 mBuffer.data(), &lastCandidatePtNodePos); 199 const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 200 mBuffer.data(), codePointTable, &lastCandidatePtNodePos); 201 // We copy all the characters in this PtNode to the buffer 202 outCodePoints[wordPos] = lastChar; 203 if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { 204 int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 205 mBuffer.data(), codePointTable, &lastCandidatePtNodePos); 206 int charCount = maxCodePointCount; 207 while (-1 != nextChar && --charCount > 0) { 208 outCodePoints[++wordPos] = nextChar; 209 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( 210 mBuffer.data(), codePointTable, &lastCandidatePtNodePos); 211 } 212 } 213 ++wordPos; 214 // Now we only need to branch to the children address. Skip the probability if 215 // it's there, read pos, and break to resume the search at pos. 216 if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { 217 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), 218 &lastCandidatePtNodePos); 219 } 220 pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 221 mBuffer.data(), lastFlags, &lastCandidatePtNodePos); 222 break; 223 } else { 224 // Here is a little tricky part: we come here if we found out that all children 225 // addresses in this PtNode are bigger than the address we are searching for. 226 // Should we conclude the word is not in the dictionary? No! It could still be 227 // one of the remaining PtNodes in this array, so we have to keep looking in 228 // this array until we find it (or we realize it's not there either, in which 229 // case it's actually not in the dictionary). Pass the end of this PtNode, 230 // ready to start the next one. 231 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 232 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 233 mBuffer.data(), flags, &pos); 234 } 235 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 236 mShortcutListPolicy.skipAllShortcuts(&pos); 237 } 238 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { 239 if (!mBigramListPolicy.skipAllBigrams(&pos)) { 240 AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), 241 pos); 242 mIsCorrupted = true; 243 ASSERT(false); 244 return 0; 245 } 246 } 247 } 248 } else { 249 // If we did not find it, we should record the last children address for the next 250 // iteration. 251 if (hasChildren) lastCandidatePtNodePos = startPos; 252 // Now skip the end of this PtNode (children pos and the attributes if any) so that 253 // our pos is after the end of this PtNode, at the start of the next one. 254 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { 255 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( 256 mBuffer.data(), flags, &pos); 257 } 258 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { 259 mShortcutListPolicy.skipAllShortcuts(&pos); 260 } 261 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { 262 if (!mBigramListPolicy.skipAllBigrams(&pos)) { 263 AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos); 264 mIsCorrupted = true; 265 ASSERT(false); 266 return 0; 267 } 268 } 269 } 270 271 } 272 } 273 // If we have looked through all the PtNodes and found no match, the ptNodePos is 274 // not the position of a terminal in this dictionary. 275 return 0; 276 } 277 278 // This function gets the position of the terminal PtNode of the exact matching word in the 279 // dictionary. If no match is found, it returns NOT_A_WORD_ID. 280 int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, 281 const bool forceLowerCaseSearch) const { 282 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); 283 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 284 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), 285 wordCodePoints.size(), forceLowerCaseSearch); 286 if (readingHelper.isError()) { 287 mIsCorrupted = true; 288 AKLOGE("Dictionary reading error in getWordId()."); 289 } 290 return getWordIdFromTerminalPtNodePos(ptNodePos); 291 } 292 293 const WordAttributes PatriciaTriePolicy::getWordAttributesInContext( 294 const WordIdArrayView prevWordIds, const int wordId, 295 MultiBigramMap *const multiBigramMap) const { 296 if (wordId == NOT_A_WORD_ID) { 297 return WordAttributes(); 298 } 299 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 300 const PtNodeParams ptNodeParams = 301 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 302 if (multiBigramMap) { 303 const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, 304 prevWordIds, wordId, ptNodeParams.getProbability()); 305 return getWordAttributes(probability, ptNodeParams); 306 } 307 if (!prevWordIds.empty()) { 308 const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId); 309 if (bigramProbability != NOT_A_PROBABILITY) { 310 return getWordAttributes(bigramProbability, ptNodeParams); 311 } 312 } 313 return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), 314 ptNodeParams); 315 } 316 317 const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, 318 const PtNodeParams &ptNodeParams) const { 319 return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), 320 ptNodeParams.isPossiblyOffensive()); 321 } 322 323 int PatriciaTriePolicy::getProbability(const int unigramProbability, 324 const int bigramProbability) const { 325 // Due to space constraints, the probability for bigrams is approximate - the lower the unigram 326 // probability, the worse the precision. The theoritical maximum error in resulting probability 327 // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means 328 // that sometimes, we'll see some bigrams interverted here, but it can't get too bad. 329 if (unigramProbability == NOT_A_PROBABILITY) { 330 return NOT_A_PROBABILITY; 331 } else if (bigramProbability == NOT_A_PROBABILITY) { 332 return ProbabilityUtils::backoff(unigramProbability); 333 } else { 334 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, 335 bigramProbability); 336 } 337 } 338 339 int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, 340 const int wordId) const { 341 if (wordId == NOT_A_WORD_ID) { 342 return NOT_A_PROBABILITY; 343 } 344 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 345 const PtNodeParams ptNodeParams = 346 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 347 if (ptNodeParams.isNotAWord()) { 348 // If this is not a word, it should behave as having no probability outside of the 349 // suggestion process (where it should be used for shortcuts). 350 return NOT_A_PROBABILITY; 351 } 352 if (!prevWordIds.empty()) { 353 const int bigramsPosition = getBigramsPositionOfPtNode( 354 getTerminalPtNodePosFromWordId(prevWordIds[0])); 355 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); 356 while (bigramsIt.hasNext()) { 357 bigramsIt.next(); 358 if (bigramsIt.getBigramPos() == ptNodePos 359 && bigramsIt.getProbability() != NOT_A_PROBABILITY) { 360 return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); 361 } 362 } 363 return NOT_A_PROBABILITY; 364 } 365 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 366 } 367 368 void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, 369 NgramListener *const listener) const { 370 if (prevWordIds.empty()) { 371 return; 372 } 373 const int bigramsPosition = getBigramsPositionOfPtNode( 374 getTerminalPtNodePosFromWordId(prevWordIds[0])); 375 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); 376 while (bigramsIt.hasNext()) { 377 bigramsIt.next(); 378 listener->onVisitEntry(bigramsIt.getProbability(), 379 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); 380 } 381 } 382 383 BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const { 384 const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); 385 return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos); 386 } 387 388 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 389 if (ptNodePos == NOT_A_DICT_POS) { 390 return NOT_A_DICT_POS; 391 } 392 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos(); 393 } 394 395 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 396 if (ptNodePos == NOT_A_DICT_POS) { 397 return NOT_A_DICT_POS; 398 } 399 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos(); 400 } 401 402 int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, 403 const int ptNodePos, DicNodeVector *childDicNodes) const { 404 PatriciaTrieReadingUtils::NodeFlags flags; 405 int mergedNodeCodePointCount = 0; 406 int mergedNodeCodePoints[MAX_WORD_LENGTH]; 407 int probability = NOT_A_PROBABILITY; 408 int childrenPos = NOT_A_DICT_POS; 409 int shortcutPos = NOT_A_DICT_POS; 410 int bigramPos = NOT_A_DICT_POS; 411 int siblingPos = NOT_A_DICT_POS; 412 const int *const codePointTable = mHeaderPolicy.getCodePointTable(); 413 PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy, 414 &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount, 415 mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, 416 &siblingPos); 417 // Skip PtNodes don't start with Unicode code point because they represent non-word information. 418 if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { 419 const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID; 420 childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId, 421 CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount)); 422 } 423 return siblingPos; 424 } 425 426 const WordProperty PatriciaTriePolicy::getWordProperty( 427 const CodePointArrayView wordCodePoints) const { 428 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); 429 if (wordId == NOT_A_WORD_ID) { 430 AKLOGE("getWordProperty was called for invalid word."); 431 return WordProperty(); 432 } 433 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 434 const PtNodeParams ptNodeParams = 435 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 436 // Fetch bigram information. 437 std::vector<NgramProperty> ngrams; 438 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); 439 int bigramWord1CodePoints[MAX_WORD_LENGTH]; 440 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos); 441 while (bigramsIt.hasNext()) { 442 // Fetch the next bigram information and forward the iterator. 443 bigramsIt.next(); 444 // Skip the entry if the entry has been deleted. This never happens for ver2 dicts. 445 if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { 446 int word1Probability = NOT_A_PROBABILITY; 447 const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 448 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH, 449 bigramWord1CodePoints, &word1Probability); 450 const int probability = getProbability(word1Probability, bigramsIt.getProbability()); 451 ngrams.emplace_back( 452 NgramContext(wordCodePoints.data(), wordCodePoints.size(), 453 ptNodeParams.representsBeginningOfSentence()), 454 CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(), 455 probability, HistoricalInfo()); 456 } 457 } 458 // Fetch shortcut information. 459 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 460 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); 461 if (shortcutPos != NOT_A_DICT_POS) { 462 int shortcutTargetCodePoints[MAX_WORD_LENGTH]; 463 ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos); 464 bool hasNext = true; 465 while (hasNext) { 466 const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = 467 ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos); 468 hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); 469 const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( 470 mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); 471 const int shortcutProbability = 472 ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); 473 shortcuts.emplace_back( 474 CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(), 475 shortcutProbability); 476 } 477 } 478 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), 479 ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), 480 ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts)); 481 return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); 482 } 483 484 int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 485 int *const outCodePointCount) { 486 *outCodePointCount = 0; 487 if (token == 0) { 488 // Start iterating the dictionary. 489 mTerminalPtNodePositionsForIteratingWords.clear(); 490 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 491 &mTerminalPtNodePositionsForIteratingWords); 492 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); 493 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 494 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 495 } 496 const int terminalPtNodePositionsVectorSize = 497 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 498 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 499 AKLOGE("Given token %d is invalid.", token); 500 return 0; 501 } 502 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 503 *outCodePointCount = getCodePointsAndReturnCodePointCount( 504 getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); 505 const int nextToken = token + 1; 506 if (nextToken >= terminalPtNodePositionsVectorSize) { 507 // All words have been iterated. 508 mTerminalPtNodePositionsForIteratingWords.clear(); 509 return 0; 510 } 511 return nextToken; 512 } 513 514 int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { 515 return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; 516 } 517 518 int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { 519 return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; 520 } 521 522 bool PatriciaTriePolicy::isValidPos(const int pos) const { 523 return pos >= 0 && pos < static_cast<int>(mBuffer.size()); 524 } 525 526 } // namespace latinime 527