Home | History | Annotate | Download | only in v4
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
     18 
     19 #include "suggest/core/dictionary/property/unigram_property.h"
     20 #include "suggest/policyimpl/dictionary/header/header_policy.h"
     21 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
     22 #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
     23 #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
     24 #include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
     25 #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
     26 #include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
     27 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
     28 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
     29 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
     30 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
     31 
     32 namespace latinime {
     33 
     34 const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
     35 
     36 bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
     37         const PtNodeParams *const toBeUpdatedPtNodeParams) {
     38     int pos = toBeUpdatedPtNodeParams->getHeadPos();
     39     const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
     40     const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
     41     if (usesAdditionalBuffer) {
     42         pos -= mTrieBuffer->getOriginalBufferSize();
     43     }
     44     // Read original flags
     45     const PatriciaTrieReadingUtils::NodeFlags originalFlags =
     46             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
     47     const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
     48             DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
     49                     true /* isDeleted */, false /* willBecomeNonTerminal */);
     50     int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
     51     // Update flags.
     52     if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
     53             &writingPos)) {
     54         return false;
     55     }
     56     if (toBeUpdatedPtNodeParams->isTerminal()) {
     57         // The PtNode is a terminal. Delete entry from the terminal position lookup table.
     58         return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
     59                 toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
     60     } else {
     61         return true;
     62     }
     63 }
     64 
     65 bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
     66         const PtNodeParams *const toBeUpdatedPtNodeParams,
     67         const int movedPos, const int bigramLinkedNodePos) {
     68     int pos = toBeUpdatedPtNodeParams->getHeadPos();
     69     const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
     70     const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
     71     if (usesAdditionalBuffer) {
     72         pos -= mTrieBuffer->getOriginalBufferSize();
     73     }
     74     // Read original flags
     75     const PatriciaTrieReadingUtils::NodeFlags originalFlags =
     76             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
     77     const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
     78             DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
     79                     false /* isDeleted */, false /* willBecomeNonTerminal */);
     80     int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
     81     // Update flags.
     82     if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
     83             &writingPos)) {
     84         return false;
     85     }
     86     // Update moved position, which is stored in the parent offset field.
     87     if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
     88             mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
     89         return false;
     90     }
     91     if (toBeUpdatedPtNodeParams->hasChildren()) {
     92         // Update children's parent position.
     93         mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
     94         while (!mReadingHelper.isEnd()) {
     95             const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
     96             int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
     97                     + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
     98             if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
     99                     mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
    100                     &parentOffsetFieldPos)) {
    101                 // Parent offset cannot be written because of a bug or a broken dictionary; thus,
    102                 // we give up to update dictionary.
    103                 return false;
    104             }
    105             mReadingHelper.readNextSiblingNode(childPtNodeParams);
    106         }
    107     }
    108     return true;
    109 }
    110 
    111 bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
    112         const PtNodeParams *const toBeUpdatedPtNodeParams) {
    113     int pos = toBeUpdatedPtNodeParams->getHeadPos();
    114     const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
    115     const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
    116     if (usesAdditionalBuffer) {
    117         pos -= mTrieBuffer->getOriginalBufferSize();
    118     }
    119     // Read original flags
    120     const PatriciaTrieReadingUtils::NodeFlags originalFlags =
    121             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
    122     const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
    123             DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
    124                     false /* isDeleted */, true /* willBecomeNonTerminal */);
    125     if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
    126             toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
    127         AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
    128                 toBeUpdatedPtNodeParams->getTerminalId());
    129         return false;
    130     }
    131     // Update flags.
    132     int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
    133     return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
    134             &writingPos);
    135 }
    136 
    137 bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
    138         const PtNodeParams *const toBeUpdatedPtNodeParams,
    139         const UnigramProperty *const unigramProperty) {
    140     // Update probability and historical information.
    141     // TODO: Update other information in the unigram property.
    142     if (!toBeUpdatedPtNodeParams->isTerminal()) {
    143         return false;
    144     }
    145     const ProbabilityEntry originalProbabilityEntry =
    146             mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
    147                     toBeUpdatedPtNodeParams->getTerminalId());
    148     const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry,
    149             unigramProperty);
    150     return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
    151             toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
    152 }
    153 
    154 bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
    155         const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
    156     if (!toBeUpdatedPtNodeParams->isTerminal()) {
    157         AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
    158         return false;
    159     }
    160     const ProbabilityEntry originalProbabilityEntry =
    161             mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
    162                     toBeUpdatedPtNodeParams->getTerminalId());
    163     if (originalProbabilityEntry.hasHistoricalInfo()) {
    164         const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
    165                 originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
    166         const ProbabilityEntry probabilityEntry =
    167                 originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
    168         if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
    169                 toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
    170             AKLOGE("Cannot write updated probability entry. terminalId: %d",
    171                     toBeUpdatedPtNodeParams->getTerminalId());
    172             return false;
    173         }
    174         const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
    175         if (!isValid) {
    176             if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
    177                 AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
    178                 return false;
    179             }
    180         }
    181         *outNeedsToKeepPtNode = isValid;
    182     } else {
    183         // No need to update probability.
    184         *outNeedsToKeepPtNode = true;
    185     }
    186     return true;
    187 }
    188 
    189 bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
    190         const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
    191     int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
    192     return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
    193             newChildrenPosition, &childrenPosFieldPos);
    194 }
    195 
    196 bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
    197         const int newTerminalId) {
    198     return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
    199             toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
    200 }
    201 
    202 bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
    203         const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
    204     return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
    205             ptNodeWritingPos);
    206 }
    207 
    208 
    209 bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
    210         const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
    211         int *const ptNodeWritingPos) {
    212     int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
    213     if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
    214             ptNodeWritingPos)) {
    215         return false;
    216     }
    217     // Write probability.
    218     ProbabilityEntry newProbabilityEntry;
    219     const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom(
    220             &newProbabilityEntry, unigramProperty);
    221     return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
    222             terminalId, &probabilityEntryToWrite);
    223 }
    224 
    225 bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
    226         const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
    227     if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty, outAddedNewBigram)) {
    228         AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
    229                 prevWordIds[0], wordId);
    230         return false;
    231     }
    232     return true;
    233 }
    234 
    235 bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
    236         const int wordId) {
    237     return mBigramPolicy->removeEntry(prevWordIds[0], wordId);
    238 }
    239 
    240 bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
    241             const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
    242     return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
    243             sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
    244 }
    245 
    246 bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
    247         const PtNodeParams *const toBeUpdatedPtNodeParams,
    248         const DictPositionRelocationMap *const dictPositionRelocationMap,
    249         int *const outBigramEntryCount) {
    250     int parentPos = toBeUpdatedPtNodeParams->getParentPos();
    251     if (parentPos != NOT_A_DICT_POS) {
    252         PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
    253                 dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
    254         if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
    255             parentPos = it->second;
    256         }
    257     }
    258     int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
    259             + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
    260     // Write updated parent offset.
    261     if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
    262             parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
    263         return false;
    264     }
    265 
    266     // Updates children position.
    267     int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
    268     if (childrenPos != NOT_A_DICT_POS) {
    269         PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
    270                 dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
    271         if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
    272             childrenPos = it->second;
    273         }
    274     }
    275     if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
    276         return false;
    277     }
    278 
    279     // Counts bigram entries.
    280     if (outBigramEntryCount) {
    281         *outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
    282                 toBeUpdatedPtNodeParams->getTerminalId());
    283     }
    284     return true;
    285 }
    286 
    287 bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
    288         const int *const targetCodePoints, const int targetCodePointCount,
    289         const int shortcutProbability) {
    290     if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
    291             targetCodePoints, targetCodePointCount, shortcutProbability)) {
    292         AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId());
    293         return false;
    294     }
    295     return true;
    296 }
    297 
    298 bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
    299         const PtNodeParams *const ptNodeParams, int *const outTerminalId,
    300         int *const ptNodeWritingPos) {
    301     const int nodePos = *ptNodeWritingPos;
    302     // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the
    303     // PtNode writing.
    304     if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
    305             0 /* nodeFlags */, ptNodeWritingPos)) {
    306         return false;
    307     }
    308     // Calculate a parent offset and write the offset.
    309     if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
    310             ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
    311         return false;
    312     }
    313     // Write code points
    314     if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
    315             ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
    316         return false;
    317     }
    318     int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
    319     if (!ptNodeParams->willBecomeNonTerminal()) {
    320         if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
    321             terminalId = ptNodeParams->getTerminalId();
    322         } else if (ptNodeParams->isTerminal()) {
    323             // Write terminal information using a new terminal id.
    324             // Get a new unused terminal id.
    325             terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
    326         }
    327     }
    328     const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
    329     if (isTerminal) {
    330         // Update the lookup table.
    331         if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
    332                 terminalId, nodePos)) {
    333             return false;
    334         }
    335         // Write terminal Id.
    336         if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
    337                 Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
    338             return false;
    339         }
    340         if (outTerminalId) {
    341             *outTerminalId = terminalId;
    342         }
    343     }
    344     // Write children position
    345     if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
    346             ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
    347         return false;
    348     }
    349     return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
    350             isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
    351 }
    352 
    353 const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
    354         const ProbabilityEntry *const originalProbabilityEntry,
    355         const UnigramProperty *const unigramProperty) const {
    356     // TODO: Consolidate historical info and probability.
    357     if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
    358         const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(),
    359                 unigramProperty->getLevel(), unigramProperty->getCount());
    360         const HistoricalInfo updatedHistoricalInfo =
    361                 ForgettingCurveUtils::createUpdatedHistoricalInfo(
    362                         originalProbabilityEntry->getHistoricalInfo(),
    363                         unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy);
    364         return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
    365                 &updatedHistoricalInfo);
    366     } else {
    367         return originalProbabilityEntry->createEntryWithUpdatedProbability(
    368                 unigramProperty->getProbability());
    369     }
    370 }
    371 
    372 bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
    373         const bool isBlacklisted, const bool isNotAWord, const bool isTerminal,
    374         const bool hasMultipleChars) {
    375     // Create node flags and write them.
    376     PatriciaTrieReadingUtils::NodeFlags nodeFlags =
    377             PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal,
    378                     false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars,
    379                     CHILDREN_POSITION_FIELD_SIZE);
    380     if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
    381         AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
    382         return false;
    383     }
    384     return true;
    385 }
    386 
    387 } // namespace latinime
    388