Home | History | Annotate | Download | only in dictionary
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 
     18 #include "suggest/policyimpl/dictionary/patricia_trie_policy.h"
     19 
     20 #include "defines.h"
     21 #include "suggest/core/dicnode/dic_node.h"
     22 #include "suggest/core/dicnode/dic_node_vector.h"
     23 #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
     24 #include "suggest/policyimpl/dictionary/utils/probability_utils.h"
     25 
     26 namespace latinime {
     27 
     28 void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
     29         DicNodeVector *const childDicNodes) const {
     30     if (!dicNode->hasChildren()) {
     31         return;
     32     }
     33     int nextPos = dicNode->getChildrenPos();
     34     if (nextPos < 0 || nextPos >= mDictBufferSize) {
     35         AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d",
     36                 nextPos, mDictBufferSize);
     37         ASSERT(false);
     38         return;
     39     }
     40     const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
     41             mDictRoot, &nextPos);
     42     for (int i = 0; i < childCount; i++) {
     43         if (nextPos < 0 || nextPos >= mDictBufferSize) {
     44             AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d",
     45                     nextPos, mDictBufferSize, i, childCount);
     46             ASSERT(false);
     47             return;
     48         }
     49         nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
     50     }
     51 }
     52 
     53 // This retrieves code points and the probability of the word by its terminal position.
     54 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
     55 // it is possible to check for this with advantageous complexity. For each node, we search
     56 // for PtNodes with children and compare the children position with the position we look for.
     57 // When we shoot the position we look for, it means the word we look for is in the children
     58 // of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a
     59 // PtNode array with the last PtNode's children position still less than what we are searching for,
     60 // we must descend the last PtNode's children (for example, if the word we are searching for starts
     61 // with a z, it's the last PtNode of the root array, so all children addresses will be smaller
     62 // than the position we look for, and we have to descend the z node).
     63 /* Parameters :
     64  * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is
     65  *   what is stored as the "bigram position" in each bigram)
     66  * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
     67  * outUnigramProbability: a pointer to an int to write the probability into.
     68  * Return value : the code point count, of 0 if the word was not found.
     69  */
     70 // TODO: Split this function to be more readable
     71 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
     72         const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
     73         int *const outUnigramProbability) const {
     74     int pos = getRootPosition();
     75     int wordPos = 0;
     76     // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
     77     // only traverse nodes that are actually a part of the terminal we are searching, so each time
     78     // we enter this loop we are one depth level further than last time.
     79     // The only reason we count nodes is because we want to reduce the probability of infinite
     80     // looping in case there is a bug. Since we know there is an upper bound to the depth we are
     81     // supposed to traverse, it does not hurt to count iterations.
     82     for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
     83         int lastCandidatePtNodePos = 0;
     84         // Let's loop through PtNodes in this PtNode array searching for either the terminal
     85         // or one of its ascendants.
     86         for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
     87                 mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) {
     88             const int startPos = pos;
     89             const PatriciaTrieReadingUtils::NodeFlags flags =
     90                     PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
     91             const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
     92                     mDictRoot, &pos);
     93             if (ptNodePos == startPos) {
     94                 // We found the position. Copy the rest of the code points in the buffer and return
     95                 // the length.
     96                 outCodePoints[wordPos] = character;
     97                 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
     98                     int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
     99                             mDictRoot, &pos);
    100                     // We count code points in order to avoid infinite loops if the file is broken
    101                     // or if there is some other bug
    102                     int charCount = maxCodePointCount;
    103                     while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
    104                         outCodePoints[++wordPos] = nextChar;
    105                         nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    106                                 mDictRoot, &pos);
    107                     }
    108                 }
    109                 *outUnigramProbability =
    110                         PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot,
    111                                 &pos);
    112                 return ++wordPos;
    113             }
    114             // We need to skip past this PtNode, so skip any remaining code points after the
    115             // first and possibly the probability.
    116             if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
    117                 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
    118             }
    119             if (PatriciaTrieReadingUtils::isTerminal(flags)) {
    120                 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
    121             }
    122             // The fact that this PtNode has children is very important. Since we already know
    123             // that this PtNode does not match, if it has no children we know it is irrelevant
    124             // to what we are searching for.
    125             const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
    126             // We will write in `found' whether we have passed the children position we are
    127             // searching for. For example if we search for "beer", the children of b are less
    128             // than the address we are searching for and the children of c are greater. When we
    129             // come here for c, we realize this is too big, and that we should descend b.
    130             bool found;
    131             if (hasChildren) {
    132                 int currentPos = pos;
    133                 // Here comes the tricky part. First, read the children position.
    134                 const int childrenPos = PatriciaTrieReadingUtils
    135                         ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &currentPos);
    136                 if (childrenPos > ptNodePos) {
    137                     // If the children pos is greater than the position, it means the previous
    138                     // PtNode, which position is stored in lastCandidatePtNodePos, was the right
    139                     // one.
    140                     found = true;
    141                 } else if (1 >= ptNodeCount) {
    142                     // However if we are on the LAST PtNode of this array, and we have NOT shot the
    143                     // position we should descend THIS node. So we trick the lastCandidatePtNodePos
    144                     // so that we will descend this PtNode, not the previous one.
    145                     lastCandidatePtNodePos = startPos;
    146                     found = true;
    147                 } else {
    148                     // Else, we should continue looking.
    149                     found = false;
    150                 }
    151             } else {
    152                 // Even if we don't have children here, we could still be on the last PtNode of /
    153                 // this array. If this is the case, we should descend the last PtNode that had
    154                 // children, and their position is already in lastCandidatePtNodePos.
    155                 found = (1 >= ptNodeCount);
    156             }
    157 
    158             if (found) {
    159                 // Okay, we found the PtNode we should descend. Its position is in
    160                 // the lastCandidatePtNodePos variable, so we just re-read it.
    161                 if (0 != lastCandidatePtNodePos) {
    162                     const PatriciaTrieReadingUtils::NodeFlags lastFlags =
    163                             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
    164                                     mDictRoot, &lastCandidatePtNodePos);
    165                     const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    166                             mDictRoot, &lastCandidatePtNodePos);
    167                     // We copy all the characters in this PtNode to the buffer
    168                     outCodePoints[wordPos] = lastChar;
    169                     if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
    170                         int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    171                                 mDictRoot, &lastCandidatePtNodePos);
    172                         int charCount = maxCodePointCount;
    173                         while (-1 != nextChar && --charCount > 0) {
    174                             outCodePoints[++wordPos] = nextChar;
    175                             nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    176                                     mDictRoot, &lastCandidatePtNodePos);
    177                         }
    178                     }
    179                     ++wordPos;
    180                     // Now we only need to branch to the children address. Skip the probability if
    181                     // it's there, read pos, and break to resume the search at pos.
    182                     if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
    183                         PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot,
    184                                 &lastCandidatePtNodePos);
    185                     }
    186                     pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    187                             mDictRoot, lastFlags, &lastCandidatePtNodePos);
    188                     break;
    189                 } else {
    190                     // Here is a little tricky part: we come here if we found out that all children
    191                     // addresses in this PtNode are bigger than the address we are searching for.
    192                     // Should we conclude the word is not in the dictionary? No! It could still be
    193                     // one of the remaining PtNodes in this array, so we have to keep looking in
    194                     // this array until we find it (or we realize it's not there either, in which
    195                     // case it's actually not in the dictionary). Pass the end of this PtNode,
    196                     // ready to start the next one.
    197                     if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    198                         PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    199                                 mDictRoot, flags, &pos);
    200                     }
    201                     if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    202                         mShortcutListPolicy.skipAllShortcuts(&pos);
    203                     }
    204                     if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
    205                         mBigramListPolicy.skipAllBigrams(&pos);
    206                     }
    207                 }
    208             } else {
    209                 // If we did not find it, we should record the last children address for the next
    210                 // iteration.
    211                 if (hasChildren) lastCandidatePtNodePos = startPos;
    212                 // Now skip the end of this PtNode (children pos and the attributes if any) so that
    213                 // our pos is after the end of this PtNode, at the start of the next one.
    214                 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    215                     PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    216                             mDictRoot, flags, &pos);
    217                 }
    218                 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    219                     mShortcutListPolicy.skipAllShortcuts(&pos);
    220                 }
    221                 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
    222                     mBigramListPolicy.skipAllBigrams(&pos);
    223                 }
    224             }
    225 
    226         }
    227     }
    228     // If we have looked through all the PtNodes and found no match, the ptNodePos is
    229     // not the position of a terminal in this dictionary.
    230     return 0;
    231 }
    232 
    233 // This function gets the position of the terminal node of the exact matching word in the
    234 // dictionary. If no match is found, it returns NOT_A_DICT_POS.
    235 int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
    236         const int length, const bool forceLowerCaseSearch) const {
    237     int pos = getRootPosition();
    238     int wordPos = 0;
    239 
    240     while (true) {
    241         // If we already traversed the tree further than the word is long, there means
    242         // there was no match (or we would have found it).
    243         if (wordPos >= length) return NOT_A_DICT_POS;
    244         int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot,
    245                 &pos);
    246         const int wChar = forceLowerCaseSearch
    247                 ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos];
    248         while (true) {
    249             // If there are no more PtNodes in this array, it means we could not
    250             // find a matching character for this depth, therefore there is no match.
    251             if (0 >= ptNodeCount) return NOT_A_DICT_POS;
    252             const int ptNodePos = pos;
    253             const PatriciaTrieReadingUtils::NodeFlags flags =
    254                     PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
    255             int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot,
    256                     &pos);
    257             if (character == wChar) {
    258                 // This is the correct PtNode. Only one PtNode may start with the same char within
    259                 // a PtNode array, so either we found our match in this array, or there is
    260                 // no match and we can return NOT_A_DICT_POS. So we will check all the
    261                 // characters in this PtNode indeed does match.
    262                 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
    263                     character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot,
    264                             &pos);
    265                     while (NOT_A_CODE_POINT != character) {
    266                         ++wordPos;
    267                         // If we shoot the length of the word we search for, or if we find a single
    268                         // character that does not match, as explained above, it means the word is
    269                         // not in the dictionary (by virtue of this PtNode being the only one to
    270                         // match the word on the first character, but not matching the whole word).
    271                         if (wordPos >= length) return NOT_A_DICT_POS;
    272                         if (inWord[wordPos] != character) return NOT_A_DICT_POS;
    273                         character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
    274                                 mDictRoot, &pos);
    275                     }
    276                 }
    277                 // If we come here we know that so far, we do match. Either we are on a terminal
    278                 // and we match the length, in which case we found it, or we traverse children.
    279                 // If we don't match the length AND don't have children, then a word in the
    280                 // dictionary fully matches a prefix of the searched word but not the full word.
    281                 ++wordPos;
    282                 if (PatriciaTrieReadingUtils::isTerminal(flags)) {
    283                     if (wordPos == length) {
    284                         return ptNodePos;
    285                     }
    286                     PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
    287                 }
    288                 if (!PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    289                     return NOT_A_DICT_POS;
    290                 }
    291                 // We have children and we are still shorter than the word we are searching for, so
    292                 // we need to traverse children. Put the pointer on the children position, and
    293                 // break
    294                 pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot,
    295                         flags, &pos);
    296                 break;
    297             } else {
    298                 // This PtNode does not match, so skip the remaining part and go to the next.
    299                 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
    300                     PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH,
    301                             &pos);
    302                 }
    303                 if (PatriciaTrieReadingUtils::isTerminal(flags)) {
    304                     PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
    305                 }
    306                 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    307                     PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot,
    308                             flags, &pos);
    309                 }
    310                 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    311                     mShortcutListPolicy.skipAllShortcuts(&pos);
    312                 }
    313                 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
    314                     mBigramListPolicy.skipAllBigrams(&pos);
    315                 }
    316             }
    317             --ptNodeCount;
    318         }
    319     }
    320 }
    321 
    322 int PatriciaTriePolicy::getProbability(const int unigramProbability,
    323         const int bigramProbability) const {
    324     if (unigramProbability == NOT_A_PROBABILITY) {
    325         return NOT_A_PROBABILITY;
    326     } else if (bigramProbability == NOT_A_PROBABILITY) {
    327         return ProbabilityUtils::backoff(unigramProbability);
    328     } else {
    329         return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
    330                 bigramProbability);
    331     }
    332 }
    333 
    334 int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const {
    335     if (ptNodePos == NOT_A_DICT_POS) {
    336         return NOT_A_PROBABILITY;
    337     }
    338     int pos = ptNodePos;
    339     const PatriciaTrieReadingUtils::NodeFlags flags =
    340             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
    341     if (!PatriciaTrieReadingUtils::isTerminal(flags)) {
    342         return NOT_A_PROBABILITY;
    343     }
    344     if (PatriciaTrieReadingUtils::isNotAWord(flags)
    345             || PatriciaTrieReadingUtils::isBlacklisted(flags)) {
    346         // If this is not a word, or if it's a blacklisted entry, it should behave as
    347         // having no probability outside of the suggestion process (where it should be used
    348         // for shortcuts).
    349         return NOT_A_PROBABILITY;
    350     }
    351     PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
    352     return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
    353             mDictRoot, &pos), NOT_A_PROBABILITY);
    354 }
    355 
    356 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
    357     if (ptNodePos == NOT_A_DICT_POS) {
    358         return NOT_A_DICT_POS;
    359     }
    360     int pos = ptNodePos;
    361     const PatriciaTrieReadingUtils::NodeFlags flags =
    362             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
    363     if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    364         return NOT_A_DICT_POS;
    365     }
    366     PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
    367     if (PatriciaTrieReadingUtils::isTerminal(flags)) {
    368         PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
    369     }
    370     if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    371         PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos);
    372     }
    373     return pos;
    374 }
    375 
    376 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
    377     if (ptNodePos == NOT_A_DICT_POS) {
    378         return NOT_A_DICT_POS;
    379     }
    380     int pos = ptNodePos;
    381     const PatriciaTrieReadingUtils::NodeFlags flags =
    382             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
    383     if (!PatriciaTrieReadingUtils::hasBigrams(flags)) {
    384         return NOT_A_DICT_POS;
    385     }
    386     PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
    387     if (PatriciaTrieReadingUtils::isTerminal(flags)) {
    388         PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
    389     }
    390     if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
    391         PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos);
    392     }
    393     if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    394         mShortcutListPolicy.skipAllShortcuts(&pos);;
    395     }
    396     return pos;
    397 }
    398 
    399 int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
    400         const int ptNodePos, DicNodeVector *childDicNodes) const {
    401     int pos = ptNodePos;
    402     const PatriciaTrieReadingUtils::NodeFlags flags =
    403             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
    404     int mergedNodeCodePoints[MAX_WORD_LENGTH];
    405     const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
    406             mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos);
    407     const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))?
    408             PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos)
    409                     : NOT_A_PROBABILITY;
    410     const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ?
    411             PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
    412                     mDictRoot, flags, &pos) : NOT_A_DICT_POS;
    413     if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
    414         getShortcutsStructurePolicy()->skipAllShortcuts(&pos);
    415     }
    416     if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
    417         getBigramsStructurePolicy()->skipAllBigrams(&pos);
    418     }
    419     if (mergedNodeCodePointCount <= 0) {
    420         AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
    421         ASSERT(false);
    422         return pos;
    423     }
    424     childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
    425             PatriciaTrieReadingUtils::isTerminal(flags),
    426             PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
    427             PatriciaTrieReadingUtils::isBlacklisted(flags) ||
    428                     PatriciaTrieReadingUtils::isNotAWord(flags),
    429             mergedNodeCodePointCount, mergedNodeCodePoints);
    430     return pos;
    431 }
    432 
    433 } // namespace latinime
    434