Home | History | Annotate | Download | only in pt_common
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
     18 
     19 #include "dictionary/property/unigram_property.h"
     20 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
     21 #include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
     22 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
     23 #include "dictionary/structure/pt_common/pt_node_reader.h"
     24 #include "dictionary/structure/pt_common/pt_node_writer.h"
     25 #include "dictionary/utils/buffer_with_extendable_buffer.h"
     26 
     27 namespace latinime {
     28 
     29 const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
     30 
     31 bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper,
     32         const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
     33         bool *const outAddedNewUnigram) {
     34     int parentPos = NOT_A_DICT_POS;
     35     while (!readingHelper->isEnd()) {
     36         const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
     37         if (!ptNodeParams.isValid()) {
     38             break;
     39         }
     40         const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
     41         if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
     42                 wordCodePoints[matchedCodePointCount])) {
     43             // The first code point is different from target code point. Skip this node and read
     44             // the next sibling node.
     45             readingHelper->readNextSiblingNode(ptNodeParams);
     46             continue;
     47         }
     48         // Check following merged node code points.
     49         const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size();
     50         for (size_t j = 1; j < nodeCodePointCount; ++j) {
     51             const size_t nextIndex = matchedCodePointCount + j;
     52             if (nextIndex >= wordCodePoints.size()
     53                     || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
     54                             wordCodePoints[matchedCodePointCount + j])) {
     55                 *outAddedNewUnigram = true;
     56                 return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
     57                         wordCodePoints.skip(matchedCodePointCount));
     58             }
     59         }
     60         // All characters are matched.
     61         if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) {
     62             return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
     63         }
     64         if (!ptNodeParams.hasChildren()) {
     65             *outAddedNewUnigram = true;
     66             return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
     67                     wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams)));
     68         }
     69         // Advance to the children nodes.
     70         parentPos = ptNodeParams.getHeadPos();
     71         readingHelper->readChildNode(ptNodeParams);
     72     }
     73     if (readingHelper->isError()) {
     74         // The dictionary is invalid.
     75         return false;
     76     }
     77     int pos = readingHelper->getPosOfLastForwardLinkField();
     78     *outAddedNewUnigram = true;
     79     return createAndInsertNodeIntoPtNodeArray(parentPos,
     80             wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty,
     81             &pos);
     82 }
     83 
     84 bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
     85         const int wordPos, const NgramProperty *const ngramProperty,
     86         bool *const outAddedNewEntry) {
     87     if (prevWordsPtNodePos.empty()) {
     88         return false;
     89     }
     90     ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
     91     int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
     92     for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
     93         prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
     94                 prevWordsPtNodePos[i]).getTerminalId();
     95     }
     96     const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
     97     const int wordId =
     98             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
     99     return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
    100 }
    101 
    102 bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
    103         const int wordPos) {
    104     if (prevWordsPtNodePos.empty()) {
    105         return false;
    106     }
    107     ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
    108     int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    109     for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
    110         prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
    111                 prevWordsPtNodePos[i]).getTerminalId();
    112     }
    113     const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
    114     const int wordId =
    115             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
    116     return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
    117 }
    118 
    119 bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
    120         const CodePointArrayView targetCodePoints, const int shortcutProbability) {
    121     const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
    122     return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(),
    123             targetCodePoints.size(), shortcutProbability);
    124 }
    125 
    126 bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
    127         const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
    128         int *const forwardLinkFieldPos) {
    129     const int newPtNodeArrayPos = mBuffer->getTailPosition();
    130     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
    131             newPtNodeArrayPos, forwardLinkFieldPos)) {
    132         return false;
    133     }
    134     return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty);
    135 }
    136 
    137 bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
    138         const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
    139     if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
    140         // Overwrites the probability.
    141         *outAddedNewUnigram = false;
    142         return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
    143     } else {
    144         // Make the node terminal and write the probability.
    145         *outAddedNewUnigram = true;
    146         const int movedPos = mBuffer->getTailPosition();
    147         int writingPos = movedPos;
    148         const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
    149                 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
    150                 true /* isTerminal */, originalPtNodeParams->getParentPos(),
    151                 originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
    152         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
    153                 unigramProperty, &writingPos)) {
    154             return false;
    155         }
    156         if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
    157             return false;
    158         }
    159     }
    160     return true;
    161 }
    162 
    163 bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
    164         const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
    165         const CodePointArrayView codePoints) {
    166     const int newPtNodeArrayPos = mBuffer->getTailPosition();
    167     if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
    168         return false;
    169     }
    170     return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
    171             unigramProperty);
    172 }
    173 
    174 bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
    175         const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints,
    176         const UnigramProperty *const unigramProperty) {
    177     int writingPos = mBuffer->getTailPosition();
    178     if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
    179             1 /* arraySize */, &writingPos)) {
    180         return false;
    181     }
    182     const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
    183             unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
    184             true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
    185             unigramProperty->getProbability()));
    186     if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
    187             unigramProperty, &writingPos)) {
    188         return false;
    189     }
    190     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
    191             NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
    192         return false;
    193     }
    194     return true;
    195 }
    196 
    197 // Returns whether the dictionary updating was succeeded or not.
    198 bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
    199         const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount,
    200         const UnigramProperty *const unigramProperty,
    201         const CodePointArrayView newPtNodeCodePoints) {
    202     // When addsExtraChild is true, split the reallocating PtNode and add new child.
    203     // Reallocating PtNode: abcde, newNode: abcxy.
    204     // abc (1st, not terminal) __ de (2nd)
    205     //                         \_ xy (extra child, terminal)
    206     // Otherwise, this method makes 1st part terminal and write information in unigramProperty.
    207     // Reallocating PtNode: abcde, newNode: abc.
    208     // abc (1st, terminal) __ de (2nd)
    209     const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount;
    210     const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
    211     int writingPos = firstPartOfReallocatedPtNodePos;
    212     // Write the 1st part of the reallocating node. The children position will be updated later
    213     // with actual children position.
    214     const CodePointArrayView firstPtNodeCodePoints =
    215             reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
    216     if (addsExtraChild) {
    217         const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
    218                 false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
    219                 reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
    220                 NOT_A_PROBABILITY));
    221         if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
    222             return false;
    223         }
    224     } else {
    225         const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
    226                 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
    227                 true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
    228                 firstPtNodeCodePoints, unigramProperty->getProbability()));
    229         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
    230                 unigramProperty, &writingPos)) {
    231             return false;
    232         }
    233     }
    234     const int actualChildrenPos = writingPos;
    235     // Create new children PtNode array.
    236     const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
    237     if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
    238             newPtNodeCount, &writingPos)) {
    239         return false;
    240     }
    241     // Write the 2nd part of the reallocating node.
    242     const int secondPartOfReallocatedPtNodePos = writingPos;
    243     const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
    244             reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
    245             reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
    246             reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
    247             reallocatingPtNodeParams->getProbability()));
    248     if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
    249         return false;
    250     }
    251     if (addsExtraChild) {
    252         const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
    253                 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
    254                 true /* isTerminal */, firstPartOfReallocatedPtNodePos,
    255                 newPtNodeCodePoints.skip(overlappingCodePointCount),
    256                 unigramProperty->getProbability()));
    257         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
    258                 unigramProperty, &writingPos)) {
    259             return false;
    260         }
    261     }
    262     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
    263             NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
    264         return false;
    265     }
    266     // Update original reallocating PtNode as moved.
    267     if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
    268             secondPartOfReallocatedPtNodePos)) {
    269         return false;
    270     }
    271     // Load node info. Information of the 1st part will be fetched.
    272     const PtNodeParams ptNodeParams(
    273             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
    274     // Update children position.
    275     return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
    276 }
    277 
    278 const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
    279         const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
    280         const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
    281         const CodePointArrayView codePoints, const int probability) const {
    282     const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
    283             isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
    284             false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
    285             CHILDREN_POSITION_FIELD_SIZE);
    286     return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
    287 }
    288 
    289 const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
    290         const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
    291         const CodePointArrayView codePoints, const int probability) const {
    292     const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
    293             isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
    294             false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
    295             CHILDREN_POSITION_FIELD_SIZE);
    296     return PtNodeParams(flags, parentPos, codePoints, probability);
    297 }
    298 
    299 } // namespace latinime
    300