Home | History | Annotate | Download | only in v2
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/structure/v2/patricia_trie_policy.h"
     18 
     19 #include "defines.h"
     20 #include "suggest/core/dicnode/dic_node.h"
     21 #include "suggest/core/dicnode/dic_node_vector.h"
     22 #include "dictionary/interface/ngram_listener.h"
     23 #include "dictionary/property/ngram_context.h"
     24 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
     25 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
     26 #include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
     27 #include "dictionary/utils/multi_bigram_map.h"
     28 #include "dictionary/utils/probability_utils.h"
     29 #include "utils/char_utils.h"
     30 
     31 namespace latinime {
     32 
     33 void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
     34         DicNodeVector *const childDicNodes) const {
     35     if (!dicNode->hasChildren()) {
     36         return;
     37     }
     38     int nextPos = dicNode->getChildrenPtNodeArrayPos();
     39     if (!isValidPos(nextPos)) {
     40         AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
     41                 nextPos, mBuffer.size());
     42         mIsCorrupted = true;
     43         ASSERT(false);
     44         return;
     45     }
     46     const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
     47             mBuffer.data(), &nextPos);
     48     for (int i = 0; i < childCount; i++) {
     49         if (!isValidPos(nextPos)) {
     50             AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
     51                     nextPos, mBuffer.size(), i, childCount);
     52             mIsCorrupted = true;
     53             ASSERT(false);
     54             return;
     55         }
     56         nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
     57     }
     58 }
     59 
     60 int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
     61         const int maxCodePointCount, int *const outCodePoints) const {
     62     return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount,
     63             outCodePoints, nullptr /* outUnigramProbability */);
     64 }
     65 // This retrieves code points and the probability of the word by its id.
     66 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
     67 // it is possible to check for this with advantageous complexity. For each PtNode array, we search
     68 // for PtNodes with children and compare the children position with the position we look for.
     69 // When we shoot the position we look for, it means the word we look for is in the children
     70 // of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a
     71 // PtNode array with the last PtNode's children position still less than what we are searching for,
     72 // we must descend the last PtNode's children (for example, if the word we are searching for starts
     73 // with a z, it's the last PtNode of the root array, so all children addresses will be smaller
     74 // than the position we look for, and we have to descend the z PtNode).
     75 /* Parameters :
     76  * wordId: Id of the word we are searching for.
     77  * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
     78  * outUnigramProbability: a pointer to an int to write the probability into.
     79  * Return value : the code point count, of 0 if the word was not found.
     80  */
     81 // TODO: Split this function to be more readable
     82 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
     83         const int wordId, const int maxCodePointCount, int *const outCodePoints,
     84         int *const outUnigramProbability) const {
     85     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
     86     int pos = getRootPosition();
     87     int wordPos = 0;
     88     const int *const codePointTable = mHeaderPolicy.getCodePointTable();
     89     if (outUnigramProbability) {
     90         *outUnigramProbability = NOT_A_PROBABILITY;
     91     }
     92     // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
     93     // only traverse PtNodes that are actually a part of the terminal we are searching, so each
     94     // time we enter this loop we are one depth level further than last time.
     95     // The only reason we count PtNodes is because we want to reduce the probability of infinite
     96     // looping in case there is a bug. Since we know there is an upper bound to the depth we are
     97     // supposed to traverse, it does not hurt to count iterations.
     98     for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
     99         int lastCandidatePtNodePos = 0;
    100         // Let's loop through PtNodes in this PtNode array searching for either the terminal
    101         // or one of its ascendants.
    102         if (!isValidPos(pos)) {
    103             AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
    104                     pos, mBuffer.size());
    105             mIsCorrupted = true;
    106             ASSERT(false);
    107             return 0;
    108         }
    109         for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
    110                 mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
    111             const int startPos = pos;
    112             if (!isValidPos(pos)) {
    113                 AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
    114                 mIsCorrupted = true;
    115                 ASSERT(false);
    116                 return 0;
    117             }
    118             const PatriciaTrieReadingUtils::NodeFlags flags =
    119                     PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
    120             const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    121                     mBuffer.data(), codePointTable, &pos);
    122             if (ptNodePos == startPos) {
    123                 // We found the position. Copy the rest of the code points in the buffer and return
    124                 // the length.
    125                 outCodePoints[wordPos] = character;
    126                 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
    127                     int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    128                             mBuffer.data(), codePointTable, &pos);
    129                     // We count code points in order to avoid infinite loops if the file is broken
    130                     // or if there is some other bug
    131                     int charCount = maxCodePointCount;
    132                     while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
    133                         outCodePoints[++wordPos] = nextChar;
    134                         nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    135                                 mBuffer.data(), codePointTable, &pos);
    136                     }
    137                 }
    138                 if (outUnigramProbability) {
    139                     *outUnigramProbability =
    140                             PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
    141                                     mBuffer.data(), &pos);
    142                 }
    143                 return ++wordPos;
    144             }
    145             // We need to skip past this PtNode, so skip any remaining code points after the
    146             // first and possibly the probability.
    147             if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
    148                 PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
    149                         codePointTable, &pos);
    150             }
    151             if (PatriciaTrieReadingUtils::isTerminal(flags)) {
    152                 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
    153             }
    154             // The fact that this PtNode has children is very important. Since we already know
    155             // that this PtNode does not match, if it has no children we know it is irrelevant
    156             // to what we are searching for.
    157             const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
    158             // We will write in `found' whether we have passed the children position we are
    159             // searching for. For example if we search for "beer", the children of b are less
    160             // than the address we are searching for and the children of c are greater. When we
    161             // come here for c, we realize this is too big, and that we should descend b.
    162             bool found;
    163             if (hasChildren) {
    164                 int currentPos = pos;
    165                 // Here comes the tricky part. First, read the children position.
    166                 const int childrenPos = PatriciaTrieReadingUtils
    167                         ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
    168                                 &currentPos);
    169                 if (childrenPos > ptNodePos) {
    170                     // If the children pos is greater than the position, it means the previous
    171                     // PtNode, which position is stored in lastCandidatePtNodePos, was the right
    172                     // one.
    173                     found = true;
    174                 } else if (1 >= ptNodeCount) {
    175                     // However if we are on the LAST PtNode of this array, and we have NOT shot the
    176                     // position we should descend THIS PtNode. So we trick the
    177                     // lastCandidatePtNodePos so that we will descend this PtNode, not the previous
    178                     // one.
    179                     lastCandidatePtNodePos = startPos;
    180                     found = true;
    181                 } else {
    182                     // Else, we should continue looking.
    183                     found = false;
    184                 }
    185             } else {
    186                 // Even if we don't have children here, we could still be on the last PtNode of
    187                 // this array. If this is the case, we should descend the last PtNode that had
    188                 // children, and their position is already in lastCandidatePtNodePos.
    189                 found = (1 >= ptNodeCount);
    190             }
    191 
    192             if (found) {
    193                 // Okay, we found the PtNode we should descend. Its position is in
    194                 // the lastCandidatePtNodePos variable, so we just re-read it.
    195                 if (0 != lastCandidatePtNodePos) {
    196                     const PatriciaTrieReadingUtils::NodeFlags lastFlags =
    197                             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
    198                                     mBuffer.data(), &lastCandidatePtNodePos);
    199                     const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    200                             mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
    201                     // We copy all the characters in this PtNode to the buffer
    202                     outCodePoints[wordPos] = lastChar;
    203                     if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
    204                         int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    205                                 mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
    206                         int charCount = maxCodePointCount;
    207                         while (-1 != nextChar && --charCount > 0) {
    208                             outCodePoints[++wordPos] = nextChar;
    209                             nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    210                                     mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
    211                         }
    212                     }
    213                     ++wordPos;
    214                     // Now we only need to branch to the children address. Skip the probability if
    215                     // it's there, read pos, and break to resume the search at pos.
    216                     if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
    217                         PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
    218                                 &lastCandidatePtNodePos);
    219                     }
    220                     pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    221                             mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
    222                     break;
    223                 } else {
    224                     // Here is a little tricky part: we come here if we found out that all children
    225                     // addresses in this PtNode are bigger than the address we are searching for.
    226                     // Should we conclude the word is not in the dictionary? No! It could still be
    227                     // one of the remaining PtNodes in this array, so we have to keep looking in
    228                     // this array until we find it (or we realize it's not there either, in which
    229                     // case it's actually not in the dictionary). Pass the end of this PtNode,
    230                     // ready to start the next one.
    231                     if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    232                         PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    233                                 mBuffer.data(), flags, &pos);
    234                     }
    235                     if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    236                         mShortcutListPolicy.skipAllShortcuts(&pos);
    237                     }
    238                     if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
    239                         if (!mBigramListPolicy.skipAllBigrams(&pos)) {
    240                             AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
    241                                     pos);
    242                             mIsCorrupted = true;
    243                             ASSERT(false);
    244                             return 0;
    245                         }
    246                     }
    247                 }
    248             } else {
    249                 // If we did not find it, we should record the last children address for the next
    250                 // iteration.
    251                 if (hasChildren) lastCandidatePtNodePos = startPos;
    252                 // Now skip the end of this PtNode (children pos and the attributes if any) so that
    253                 // our pos is after the end of this PtNode, at the start of the next one.
    254                 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    255                     PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    256                             mBuffer.data(), flags, &pos);
    257                 }
    258                 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    259                     mShortcutListPolicy.skipAllShortcuts(&pos);
    260                 }
    261                 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
    262                     if (!mBigramListPolicy.skipAllBigrams(&pos)) {
    263                         AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
    264                         mIsCorrupted = true;
    265                         ASSERT(false);
    266                         return 0;
    267                     }
    268                 }
    269             }
    270 
    271         }
    272     }
    273     // If we have looked through all the PtNodes and found no match, the ptNodePos is
    274     // not the position of a terminal in this dictionary.
    275     return 0;
    276 }
    277 
    278 // This function gets the position of the terminal PtNode of the exact matching word in the
    279 // dictionary. If no match is found, it returns NOT_A_WORD_ID.
    280 int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
    281         const bool forceLowerCaseSearch) const {
    282     DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
    283     readingHelper.initWithPtNodeArrayPos(getRootPosition());
    284     const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
    285             wordCodePoints.size(), forceLowerCaseSearch);
    286     if (readingHelper.isError()) {
    287         mIsCorrupted = true;
    288         AKLOGE("Dictionary reading error in getWordId().");
    289     }
    290     return getWordIdFromTerminalPtNodePos(ptNodePos);
    291 }
    292 
    293 const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
    294         const WordIdArrayView prevWordIds, const int wordId,
    295         MultiBigramMap *const multiBigramMap) const {
    296     if (wordId == NOT_A_WORD_ID) {
    297         return WordAttributes();
    298     }
    299     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
    300     const PtNodeParams ptNodeParams =
    301             mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    302     if (multiBigramMap) {
    303         const int probability =  multiBigramMap->getBigramProbability(this /* structurePolicy */,
    304                 prevWordIds, wordId, ptNodeParams.getProbability());
    305         return getWordAttributes(probability, ptNodeParams);
    306     }
    307     if (!prevWordIds.empty()) {
    308         const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
    309         if (bigramProbability != NOT_A_PROBABILITY) {
    310             return getWordAttributes(bigramProbability, ptNodeParams);
    311         }
    312     }
    313     return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
    314             ptNodeParams);
    315 }
    316 
    317 const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
    318         const PtNodeParams &ptNodeParams) const {
    319     return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
    320             ptNodeParams.isPossiblyOffensive());
    321 }
    322 
    323 int PatriciaTriePolicy::getProbability(const int unigramProbability,
    324         const int bigramProbability) const {
    325     // Due to space constraints, the probability for bigrams is approximate - the lower the unigram
    326     // probability, the worse the precision. The theoritical maximum error in resulting probability
    327     // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means
    328     // that sometimes, we'll see some bigrams interverted here, but it can't get too bad.
    329     if (unigramProbability == NOT_A_PROBABILITY) {
    330         return NOT_A_PROBABILITY;
    331     } else if (bigramProbability == NOT_A_PROBABILITY) {
    332         return ProbabilityUtils::backoff(unigramProbability);
    333     } else {
    334         return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
    335                 bigramProbability);
    336     }
    337 }
    338 
    339 int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
    340         const int wordId) const {
    341     if (wordId == NOT_A_WORD_ID) {
    342         return NOT_A_PROBABILITY;
    343     }
    344     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
    345     const PtNodeParams ptNodeParams =
    346             mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    347     if (ptNodeParams.isNotAWord()) {
    348         // If this is not a word, it should behave as having no probability outside of the
    349         // suggestion process (where it should be used for shortcuts).
    350         return NOT_A_PROBABILITY;
    351     }
    352     if (!prevWordIds.empty()) {
    353         const int bigramsPosition = getBigramsPositionOfPtNode(
    354                 getTerminalPtNodePosFromWordId(prevWordIds[0]));
    355         BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
    356         while (bigramsIt.hasNext()) {
    357             bigramsIt.next();
    358             if (bigramsIt.getBigramPos() == ptNodePos
    359                     && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
    360                 return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
    361             }
    362         }
    363         return NOT_A_PROBABILITY;
    364     }
    365     return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
    366 }
    367 
    368 void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
    369         NgramListener *const listener) const {
    370     if (prevWordIds.empty()) {
    371         return;
    372     }
    373     const int bigramsPosition = getBigramsPositionOfPtNode(
    374             getTerminalPtNodePosFromWordId(prevWordIds[0]));
    375     BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
    376     while (bigramsIt.hasNext()) {
    377         bigramsIt.next();
    378         listener->onVisitEntry(bigramsIt.getProbability(),
    379                 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
    380     }
    381 }
    382 
    383 BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
    384     const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
    385     return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos);
    386 }
    387 
    388 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
    389     if (ptNodePos == NOT_A_DICT_POS) {
    390         return NOT_A_DICT_POS;
    391     }
    392     return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
    393 }
    394 
    395 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
    396     if (ptNodePos == NOT_A_DICT_POS) {
    397         return NOT_A_DICT_POS;
    398     }
    399     return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos();
    400 }
    401 
    402 int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
    403         const int ptNodePos, DicNodeVector *childDicNodes) const {
    404     PatriciaTrieReadingUtils::NodeFlags flags;
    405     int mergedNodeCodePointCount = 0;
    406     int mergedNodeCodePoints[MAX_WORD_LENGTH];
    407     int probability = NOT_A_PROBABILITY;
    408     int childrenPos = NOT_A_DICT_POS;
    409     int shortcutPos = NOT_A_DICT_POS;
    410     int bigramPos = NOT_A_DICT_POS;
    411     int siblingPos = NOT_A_DICT_POS;
    412     const int *const codePointTable = mHeaderPolicy.getCodePointTable();
    413     PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
    414             &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
    415             mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
    416             &siblingPos);
    417     // Skip PtNodes don't start with Unicode code point because they represent non-word information.
    418     if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
    419         const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
    420         childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
    421                 CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
    422     }
    423     return siblingPos;
    424 }
    425 
    426 const WordProperty PatriciaTriePolicy::getWordProperty(
    427         const CodePointArrayView wordCodePoints) const {
    428     const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    429     if (wordId == NOT_A_WORD_ID) {
    430         AKLOGE("getWordProperty was called for invalid word.");
    431         return WordProperty();
    432     }
    433     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
    434     const PtNodeParams ptNodeParams =
    435             mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    436     // Fetch bigram information.
    437     std::vector<NgramProperty> ngrams;
    438     const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
    439     int bigramWord1CodePoints[MAX_WORD_LENGTH];
    440     BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
    441     while (bigramsIt.hasNext()) {
    442         // Fetch the next bigram information and forward the iterator.
    443         bigramsIt.next();
    444         // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
    445         if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
    446             int word1Probability = NOT_A_PROBABILITY;
    447             const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
    448                     getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
    449                     bigramWord1CodePoints, &word1Probability);
    450             const int probability = getProbability(word1Probability, bigramsIt.getProbability());
    451             ngrams.emplace_back(
    452                     NgramContext(wordCodePoints.data(), wordCodePoints.size(),
    453                             ptNodeParams.representsBeginningOfSentence()),
    454                     CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
    455                     probability, HistoricalInfo());
    456         }
    457     }
    458     // Fetch shortcut information.
    459     std::vector<UnigramProperty::ShortcutProperty> shortcuts;
    460     int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
    461     if (shortcutPos != NOT_A_DICT_POS) {
    462         int shortcutTargetCodePoints[MAX_WORD_LENGTH];
    463         ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
    464         bool hasNext = true;
    465         while (hasNext) {
    466             const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
    467                     ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
    468             hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
    469             const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
    470                     mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
    471             const int shortcutProbability =
    472                     ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
    473             shortcuts.emplace_back(
    474                     CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
    475                     shortcutProbability);
    476         }
    477     }
    478     const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
    479             ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
    480             ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
    481     return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
    482 }
    483 
    484 int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
    485         int *const outCodePointCount) {
    486     *outCodePointCount = 0;
    487     if (token == 0) {
    488         // Start iterating the dictionary.
    489         mTerminalPtNodePositionsForIteratingWords.clear();
    490         DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
    491                 &mTerminalPtNodePositionsForIteratingWords);
    492         DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
    493         readingHelper.initWithPtNodeArrayPos(getRootPosition());
    494         readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
    495     }
    496     const int terminalPtNodePositionsVectorSize =
    497             static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
    498     if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
    499         AKLOGE("Given token %d is invalid.", token);
    500         return 0;
    501     }
    502     const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
    503     *outCodePointCount = getCodePointsAndReturnCodePointCount(
    504             getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
    505     const int nextToken = token + 1;
    506     if (nextToken >= terminalPtNodePositionsVectorSize) {
    507         // All words have been iterated.
    508         mTerminalPtNodePositionsForIteratingWords.clear();
    509         return 0;
    510     }
    511     return nextToken;
    512 }
    513 
    514 int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
    515     return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
    516 }
    517 
    518 int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
    519     return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
    520 }
    521 
    522 bool PatriciaTriePolicy::isValidPos(const int pos) const {
    523     return pos >= 0 && pos < static_cast<int>(mBuffer.size());
    524 }
    525 
    526 } // namespace latinime
    527