Home | History | Annotate | Download | only in pt_common
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
     18 
     19 #include "suggest/core/dictionary/property/unigram_property.h"
     20 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
     21 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
     22 #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
     23 #include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
     24 #include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
     25 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
     26 
     27 namespace latinime {
     28 
     29 const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
     30 
     31 bool DynamicPtUpdatingHelper::addUnigramWord(
     32         DynamicPtReadingHelper *const readingHelper,
     33         const int *const wordCodePoints, const int codePointCount,
     34         const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
     35     int parentPos = NOT_A_DICT_POS;
     36     while (!readingHelper->isEnd()) {
     37         const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
     38         if (!ptNodeParams.isValid()) {
     39             break;
     40         }
     41         const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
     42         if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
     43                 wordCodePoints[matchedCodePointCount])) {
     44             // The first code point is different from target code point. Skip this node and read
     45             // the next sibling node.
     46             readingHelper->readNextSiblingNode(ptNodeParams);
     47             continue;
     48         }
     49         // Check following merged node code points.
     50         const int nodeCodePointCount = ptNodeParams.getCodePointCount();
     51         for (int j = 1; j < nodeCodePointCount; ++j) {
     52             const int nextIndex = matchedCodePointCount + j;
     53             if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
     54                     wordCodePoints[matchedCodePointCount + j])) {
     55                 *outAddedNewUnigram = true;
     56                 return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
     57                         wordCodePoints + matchedCodePointCount,
     58                         codePointCount - matchedCodePointCount);
     59             }
     60         }
     61         // All characters are matched.
     62         if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) {
     63             return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
     64         }
     65         if (!ptNodeParams.hasChildren()) {
     66             *outAddedNewUnigram = true;
     67             return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
     68                     wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams),
     69                     codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams));
     70         }
     71         // Advance to the children nodes.
     72         parentPos = ptNodeParams.getHeadPos();
     73         readingHelper->readChildNode(ptNodeParams);
     74     }
     75     if (readingHelper->isError()) {
     76         // The dictionary is invalid.
     77         return false;
     78     }
     79     int pos = readingHelper->getPosOfLastForwardLinkField();
     80     *outAddedNewUnigram = true;
     81     return createAndInsertNodeIntoPtNodeArray(parentPos,
     82             wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
     83             codePointCount - readingHelper->getPrevTotalCodePointCount(),
     84             unigramProperty, &pos);
     85 }
     86 
     87 bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
     88         const int wordPos, const BigramProperty *const bigramProperty,
     89         bool *const outAddedNewEntry) {
     90     if (prevWordsPtNodePos.empty()) {
     91         return false;
     92     }
     93     ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
     94     int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
     95     for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
     96         prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
     97                 prevWordsPtNodePos[i]).getTerminalId();
     98     }
     99     const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
    100     const int wordId =
    101             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
    102     return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, outAddedNewEntry);
    103 }
    104 
    105 bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
    106         const int wordPos) {
    107     if (prevWordsPtNodePos.empty()) {
    108         return false;
    109     }
    110     ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
    111     int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    112     for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
    113         prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
    114                 prevWordsPtNodePos[i]).getTerminalId();
    115     }
    116     const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
    117     const int wordId =
    118             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
    119     return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
    120 }
    121 
    122 bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
    123         const int *const targetCodePoints, const int targetCodePointCount,
    124         const int shortcutProbability) {
    125     const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
    126     return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount,
    127             shortcutProbability);
    128 }
    129 
    130 bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
    131         const int *const nodeCodePoints, const int nodeCodePointCount,
    132         const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) {
    133     const int newPtNodeArrayPos = mBuffer->getTailPosition();
    134     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
    135             newPtNodeArrayPos, forwardLinkFieldPos)) {
    136         return false;
    137     }
    138     return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount,
    139             unigramProperty);
    140 }
    141 
    142 bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
    143         const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
    144     if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
    145         // Overwrites the probability.
    146         *outAddedNewUnigram = false;
    147         return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
    148     } else {
    149         // Make the node terminal and write the probability.
    150         *outAddedNewUnigram = true;
    151         const int movedPos = mBuffer->getTailPosition();
    152         int writingPos = movedPos;
    153         const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
    154                 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
    155                 true /* isTerminal */, originalPtNodeParams->getParentPos(),
    156                 originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(),
    157                 unigramProperty->getProbability()));
    158         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
    159                 unigramProperty, &writingPos)) {
    160             return false;
    161         }
    162         if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
    163             return false;
    164         }
    165     }
    166     return true;
    167 }
    168 
    169 bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
    170         const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
    171         const int *const codePoints, const int codePointCount) {
    172     const int newPtNodeArrayPos = mBuffer->getTailPosition();
    173     if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
    174         return false;
    175     }
    176     return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
    177             codePointCount, unigramProperty);
    178 }
    179 
    180 bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
    181         const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount,
    182         const UnigramProperty *const unigramProperty) {
    183     int writingPos = mBuffer->getTailPosition();
    184     if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
    185             1 /* arraySize */, &writingPos)) {
    186         return false;
    187     }
    188     const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
    189             unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */,
    190             parentPtNodePos, nodeCodePointCount, nodeCodePoints,
    191             unigramProperty->getProbability()));
    192     if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
    193             unigramProperty, &writingPos)) {
    194         return false;
    195     }
    196     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
    197             NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
    198         return false;
    199     }
    200     return true;
    201 }
    202 
    203 // Returns whether the dictionary updating was succeeded or not.
    204 bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
    205         const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount,
    206         const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints,
    207         const int newNodeCodePointCount) {
    208     // When addsExtraChild is true, split the reallocating PtNode and add new child.
    209     // Reallocating PtNode: abcde, newNode: abcxy.
    210     // abc (1st, not terminal) __ de (2nd)
    211     //                         \_ xy (extra child, terminal)
    212     // Otherwise, this method makes 1st part terminal and write information in unigramProperty.
    213     // Reallocating PtNode: abcde, newNode: abc.
    214     // abc (1st, terminal) __ de (2nd)
    215     const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount;
    216     const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
    217     int writingPos = firstPartOfReallocatedPtNodePos;
    218     // Write the 1st part of the reallocating node. The children position will be updated later
    219     // with actual children position.
    220     if (addsExtraChild) {
    221         const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
    222                 false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */,
    223                 reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount,
    224                 reallocatingPtNodeParams->getCodePoints(), NOT_A_PROBABILITY));
    225         if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
    226             return false;
    227         }
    228     } else {
    229         const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
    230                 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
    231                 true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
    232                 overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(),
    233                 unigramProperty->getProbability()));
    234         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
    235                 unigramProperty, &writingPos)) {
    236             return false;
    237         }
    238     }
    239     const int actualChildrenPos = writingPos;
    240     // Create new children PtNode array.
    241     const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
    242     if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
    243             newPtNodeCount, &writingPos)) {
    244         return false;
    245     }
    246     // Write the 2nd part of the reallocating node.
    247     const int secondPartOfReallocatedPtNodePos = writingPos;
    248     const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
    249             reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(),
    250             reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
    251             reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount,
    252             reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount,
    253             reallocatingPtNodeParams->getProbability()));
    254     if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
    255         return false;
    256     }
    257     if (addsExtraChild) {
    258         const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
    259                 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
    260                 true /* isTerminal */, firstPartOfReallocatedPtNodePos,
    261                 newNodeCodePointCount - overlappingCodePointCount,
    262                 newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability()));
    263         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
    264                 unigramProperty, &writingPos)) {
    265             return false;
    266         }
    267     }
    268     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
    269             NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
    270         return false;
    271     }
    272     // Update original reallocating PtNode as moved.
    273     if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
    274             secondPartOfReallocatedPtNodePos)) {
    275         return false;
    276     }
    277     // Load node info. Information of the 1st part will be fetched.
    278     const PtNodeParams ptNodeParams(
    279             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
    280     // Update children position.
    281     return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
    282 }
    283 
    284 const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
    285         const PtNodeParams *const originalPtNodeParams,
    286         const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos,
    287         const int codePointCount, const int *const codePoints, const int probability) const {
    288     const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
    289             isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
    290             false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */,
    291             CHILDREN_POSITION_FIELD_SIZE);
    292     return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
    293             probability);
    294 }
    295 
    296 const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(
    297         const bool isNotAWord, const bool isBlacklisted, const bool isTerminal,
    298         const int parentPos, const int codePointCount, const int *const codePoints,
    299         const int probability) const {
    300     const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
    301             isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
    302             false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */,
    303             CHILDREN_POSITION_FIELD_SIZE);
    304     return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability);
    305 }
    306 
    307 } // namespace latinime
    308