Home | History | Annotate | Download | only in dictionary
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
     18 
     19 #include <cstdio>
     20 #include <cstring>
     21 #include <ctime>
     22 
     23 #include "defines.h"
     24 #include "suggest/core/dicnode/dic_node.h"
     25 #include "suggest/core/dicnode/dic_node_vector.h"
     26 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
     27 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
     28 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
     29 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
     30 #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
     31 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
     32 #include "suggest/policyimpl/dictionary/utils/probability_utils.h"
     33 
     34 namespace latinime {
     35 
     36 // Note that these are corresponding definitions in Java side in BinaryDictionaryTests and
     37 // BinaryDictionaryDecayingTests.
     38 const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
     39 const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
     40 const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
     41 const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
     42 const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
     43         "SET_NEEDS_TO_DECAY_FOR_TESTING";
     44 const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
     45 const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
     46         DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024;
     47 
     48 void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
     49         DicNodeVector *const childDicNodes) const {
     50     if (!dicNode->hasChildren()) {
     51         return;
     52     }
     53     DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
     54             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
     55     readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos());
     56     const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
     57     while (!readingHelper.isEnd()) {
     58         bool isTerminal = nodeReader->isTerminal() && !nodeReader->isDeleted();
     59         if (isTerminal && mHeaderPolicy.isDecayingDict()) {
     60             // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
     61             // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
     62             // valid terminal DicNode.
     63             isTerminal = getProbability(nodeReader->getProbability(), NOT_A_PROBABILITY)
     64                     != NOT_A_PROBABILITY;
     65         }
     66         childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(),
     67                 nodeReader->getChildrenPos(), nodeReader->getProbability(), isTerminal,
     68                 nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(),
     69                 nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints());
     70         readingHelper.readNextSiblingNode();
     71     }
     72 }
     73 
     74 int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
     75         const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
     76         int *const outUnigramProbability) const {
     77     // This method traverses parent nodes from the terminal by following parent pointers; thus,
     78     // node code points are stored in the buffer in the reverse order.
     79     int reverseCodePoints[maxCodePointCount];
     80     DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
     81             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
     82     // First, read the terminal node and get its probability.
     83     readingHelper.initWithPtNodePos(ptNodePos);
     84     if (!readingHelper.isValidTerminalNode()) {
     85         // Node at the ptNodePos is not a valid terminal node.
     86         *outUnigramProbability = NOT_A_PROBABILITY;
     87         return 0;
     88     }
     89     // Store terminal node probability.
     90     *outUnigramProbability = readingHelper.getNodeReader()->getProbability();
     91     // Then, following parent node link to the dictionary root and fetch node code points.
     92     while (!readingHelper.isEnd()) {
     93         if (readingHelper.getTotalCodePointCount() > maxCodePointCount) {
     94             // The ptNodePos is not a valid terminal node position in the dictionary.
     95             *outUnigramProbability = NOT_A_PROBABILITY;
     96             return 0;
     97         }
     98         // Store node code points to buffer in the reverse order.
     99         readingHelper.fetchMergedNodeCodePointsInReverseOrder(
    100                 readingHelper.getPrevTotalCodePointCount(), reverseCodePoints);
    101         // Follow parent node toward the root node.
    102         readingHelper.readParentNode();
    103     }
    104     if (readingHelper.isError()) {
    105         // The node position or the dictionary is invalid.
    106         *outUnigramProbability = NOT_A_PROBABILITY;
    107         return 0;
    108     }
    109     // Reverse the stored code points to output them.
    110     const int codePointCount = readingHelper.getTotalCodePointCount();
    111     for (int i = 0; i < codePointCount; ++i) {
    112         outCodePoints[i] = reverseCodePoints[codePointCount - i - 1];
    113     }
    114     return codePointCount;
    115 }
    116 
    117 int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
    118         const int length, const bool forceLowerCaseSearch) const {
    119     int searchCodePoints[length];
    120     for (int i = 0; i < length; ++i) {
    121         searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
    122     }
    123     DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
    124             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    125     readingHelper.initWithPtNodeArrayPos(getRootPosition());
    126     const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
    127     while (!readingHelper.isEnd()) {
    128         const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount();
    129         if (readingHelper.getTotalCodePointCount() > length
    130                 || !readingHelper.isMatchedCodePoint(0 /* index */,
    131                         searchCodePoints[matchedCodePointCount])) {
    132             // Current node has too many code points or its first code point is different from
    133             // target code point. Skip this node and read the next sibling node.
    134             readingHelper.readNextSiblingNode();
    135             continue;
    136         }
    137         // Check following merged node code points.
    138         const int nodeCodePointCount = nodeReader->getCodePointCount();
    139         for (int j = 1; j < nodeCodePointCount; ++j) {
    140             if (!readingHelper.isMatchedCodePoint(
    141                     j, searchCodePoints[matchedCodePointCount + j])) {
    142                 // Different code point is found. The given word is not included in the dictionary.
    143                 return NOT_A_DICT_POS;
    144             }
    145         }
    146         // All characters are matched.
    147         if (length == readingHelper.getTotalCodePointCount()) {
    148             // Terminal position is found.
    149             return nodeReader->getHeadPos();
    150         }
    151         if (!nodeReader->hasChildren()) {
    152             return NOT_A_DICT_POS;
    153         }
    154         // Advance to the children nodes.
    155         readingHelper.readChildNode();
    156     }
    157     // If we already traversed the tree further than the word is long, there means
    158     // there was no match (or we would have found it).
    159     return NOT_A_DICT_POS;
    160 }
    161 
    162 int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
    163         const int bigramProbability) const {
    164     if (mHeaderPolicy.isDecayingDict()) {
    165         return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
    166     } else {
    167         if (unigramProbability == NOT_A_PROBABILITY) {
    168             return NOT_A_PROBABILITY;
    169         } else if (bigramProbability == NOT_A_PROBABILITY) {
    170             return ProbabilityUtils::backoff(unigramProbability);
    171         } else {
    172             return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
    173                     bigramProbability);
    174         }
    175     }
    176 }
    177 
    178 int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const {
    179     if (ptNodePos == NOT_A_DICT_POS) {
    180         return NOT_A_PROBABILITY;
    181     }
    182     DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer,
    183             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    184     nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
    185     if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
    186         return NOT_A_PROBABILITY;
    187     }
    188     return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY);
    189 }
    190 
    191 int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
    192     if (ptNodePos == NOT_A_DICT_POS) {
    193         return NOT_A_DICT_POS;
    194     }
    195     DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer,
    196             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    197     nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
    198     if (nodeReader.isDeleted()) {
    199         return NOT_A_DICT_POS;
    200     }
    201     return nodeReader.getShortcutPos();
    202 }
    203 
    204 int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
    205     if (ptNodePos == NOT_A_DICT_POS) {
    206         return NOT_A_DICT_POS;
    207     }
    208     DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer,
    209             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    210     nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
    211     if (nodeReader.isDeleted()) {
    212         return NOT_A_DICT_POS;
    213     }
    214     return nodeReader.getBigramsPos();
    215 }
    216 
    217 bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
    218         const int probability) {
    219     if (!mBuffer->isUpdatable()) {
    220         AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
    221         return false;
    222     }
    223     if (mBufferWithExtendableBuffer.getTailPosition()
    224             >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
    225         AKLOGE("The dictionary is too large to dynamically update.");
    226         return false;
    227     }
    228     DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
    229             getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    230     readingHelper.initWithPtNodeArrayPos(getRootPosition());
    231     DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
    232             &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
    233     bool addedNewUnigram = false;
    234     if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
    235             &addedNewUnigram)) {
    236         if (addedNewUnigram) {
    237             mUnigramCount++;
    238         }
    239         return true;
    240     } else {
    241         return false;
    242     }
    243 }
    244 
    245 bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
    246         const int *const word1, const int length1, const int probability) {
    247     if (!mBuffer->isUpdatable()) {
    248         AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
    249         return false;
    250     }
    251     if (mBufferWithExtendableBuffer.getTailPosition()
    252             >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
    253         AKLOGE("The dictionary is too large to dynamically update.");
    254         return false;
    255     }
    256     const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
    257             false /* forceLowerCaseSearch */);
    258     if (word0Pos == NOT_A_DICT_POS) {
    259         return false;
    260     }
    261     const int word1Pos = getTerminalNodePositionOfWord(word1, length1,
    262             false /* forceLowerCaseSearch */);
    263     if (word1Pos == NOT_A_DICT_POS) {
    264         return false;
    265     }
    266     DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
    267             &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
    268     bool addedNewBigram = false;
    269     if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
    270         if (addedNewBigram) {
    271             mBigramCount++;
    272         }
    273         return true;
    274     } else {
    275         return false;
    276     }
    277 }
    278 
    279 bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
    280         const int *const word1, const int length1) {
    281     if (!mBuffer->isUpdatable()) {
    282         AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
    283         return false;
    284     }
    285     if (mBufferWithExtendableBuffer.getTailPosition()
    286             >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
    287         AKLOGE("The dictionary is too large to dynamically update.");
    288         return false;
    289     }
    290     const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
    291             false /* forceLowerCaseSearch */);
    292     if (word0Pos == NOT_A_DICT_POS) {
    293         return false;
    294     }
    295     const int word1Pos = getTerminalNodePositionOfWord(word1, length1,
    296             false /* forceLowerCaseSearch */);
    297     if (word1Pos == NOT_A_DICT_POS) {
    298         return false;
    299     }
    300     DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
    301             &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
    302     if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
    303         mBigramCount--;
    304         return true;
    305     } else {
    306         return false;
    307     }
    308 }
    309 
    310 void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
    311     if (!mBuffer->isUpdatable()) {
    312         AKLOGI("Warning: flush() is called for non-updatable dictionary.");
    313         return;
    314     }
    315     DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
    316             &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */);
    317     writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
    318 }
    319 
    320 void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
    321     if (!mBuffer->isUpdatable()) {
    322         AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
    323         return;
    324     }
    325     const bool needsToDecay = mHeaderPolicy.isDecayingDict()
    326             && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
    327                     false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
    328     DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer,
    329             &mShortcutListPolicy, needsToDecay);
    330     DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
    331             &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay);
    332     writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy);
    333     mNeedsToDecayForTesting = false;
    334 }
    335 
    336 bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
    337     if (!mBuffer->isUpdatable()) {
    338         AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
    339         return false;
    340     }
    341     if (mBufferWithExtendableBuffer.isNearSizeLimit()) {
    342         // Additional buffer size is near the limit.
    343         return true;
    344     } else if (mHeaderPolicy.getExtendedRegionSize()
    345             + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize()
    346                     > MAX_DICT_EXTENDED_REGION_SIZE) {
    347         // Total extended region size exceeds the limit.
    348         return true;
    349     } else if (mBufferWithExtendableBuffer.getTailPosition()
    350             >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
    351                     && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) {
    352         // Needs to reduce dictionary size.
    353         return true;
    354     } else if (mHeaderPolicy.isDecayingDict()) {
    355         return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
    356                 mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy);
    357     }
    358     return false;
    359 }
    360 
    361 void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
    362         const int maxResultLength) {
    363     if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
    364         snprintf(outResult, maxResultLength, "%d", mUnigramCount);
    365     } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
    366         snprintf(outResult, maxResultLength, "%d", mBigramCount);
    367     } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
    368         snprintf(outResult, maxResultLength, "%d",
    369                 mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
    370                         static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE));
    371     } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
    372         snprintf(outResult, maxResultLength, "%d",
    373                 mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
    374                         static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE));
    375     } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) {
    376         mNeedsToDecayForTesting = true;
    377     }
    378 }
    379 
    380 } // namespace latinime
    381