Home | History | Annotate | Download | only in bigram
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h"
     18 
     19 #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
     20 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
     21 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
     22 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
     23 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
     24 
     25 namespace latinime {
     26 
     27 const int DynamicBigramListPolicy::CONTINUING_BIGRAM_LINK_COUNT_LIMIT = 10000;
     28 const int DynamicBigramListPolicy::BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT = 100000;
     29 
     30 void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability,
     31         bool *const outHasNext, int *const bigramEntryPos) const {
     32     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramEntryPos);
     33     const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer);
     34     if (usesAdditionalBuffer) {
     35         *bigramEntryPos -= mBuffer->getOriginalBufferSize();
     36     }
     37     BigramListReadWriteUtils::BigramFlags bigramFlags;
     38     int originalBigramPos;
     39     BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(buffer, &bigramFlags,
     40             &originalBigramPos, bigramEntryPos);
     41     if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) {
     42         originalBigramPos += mBuffer->getOriginalBufferSize();
     43     }
     44     *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags);
     45     *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags);
     46     if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) {
     47         // This bigram is too weak to output.
     48         *outBigramPos = NOT_A_DICT_POS;
     49     } else {
     50         *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
     51     }
     52     if (usesAdditionalBuffer) {
     53         *bigramEntryPos += mBuffer->getOriginalBufferSize();
     54     }
     55 }
     56 
     57 void DynamicBigramListPolicy::skipAllBigrams(int *const bigramListPos) const {
     58     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
     59     const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer);
     60     if (usesAdditionalBuffer) {
     61         *bigramListPos -= mBuffer->getOriginalBufferSize();
     62     }
     63     BigramListReadWriteUtils::skipExistingBigrams(buffer, bigramListPos);
     64     if (usesAdditionalBuffer) {
     65         *bigramListPos += mBuffer->getOriginalBufferSize();
     66     }
     67 }
     68 
     69 bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite,
     70         int *const fromPos, int *const toPos, int *const outBigramsCount) const {
     71     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos);
     72     if (usesAdditionalBuffer) {
     73         *fromPos -= mBuffer->getOriginalBufferSize();
     74     }
     75     *outBigramsCount = 0;
     76     BigramListReadWriteUtils::BigramFlags bigramFlags;
     77     int bigramEntryCount = 0;
     78     int lastWrittenEntryPos = NOT_A_DICT_POS;
     79     do {
     80         if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) {
     81             AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d",
     82                     bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT);
     83             ASSERT(false);
     84             return false;
     85         }
     86         // The buffer address can be changed after calling buffer writing methods.
     87         int originalBigramPos;
     88         BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
     89                 mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos,
     90                 fromPos);
     91         if (originalBigramPos == NOT_A_DICT_POS) {
     92             // skip invalid bigram entry.
     93             continue;
     94         }
     95         if (usesAdditionalBuffer) {
     96             originalBigramPos += mBuffer->getOriginalBufferSize();
     97         }
     98         const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
     99         if (bigramPos == NOT_A_DICT_POS) {
    100             // Target PtNode has been invalidated.
    101             continue;
    102         }
    103         lastWrittenEntryPos = *toPos;
    104         if (!BigramListReadWriteUtils::createAndWriteBigramEntry(bufferToWrite, bigramPos,
    105                 BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags),
    106                 BigramListReadWriteUtils::hasNext(bigramFlags), toPos)) {
    107             return false;
    108         }
    109         (*outBigramsCount)++;
    110     } while(BigramListReadWriteUtils::hasNext(bigramFlags));
    111     // Makes the last entry the terminal of the list. Updates the flags.
    112     if (lastWrittenEntryPos != NOT_A_DICT_POS) {
    113         if (!BigramListReadWriteUtils::setHasNextFlag(bufferToWrite, false /* hasNext */,
    114                 lastWrittenEntryPos)) {
    115             return false;
    116         }
    117     }
    118     if (usesAdditionalBuffer) {
    119         *fromPos += mBuffer->getOriginalBufferSize();
    120     }
    121     return true;
    122 }
    123 
    124 // Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode
    125 // has been deleted or is not a valid terminal.
    126 bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
    127         int *const bigramListPos, int *const outValidBigramEntryCount) {
    128     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
    129     if (usesAdditionalBuffer) {
    130         *bigramListPos -= mBuffer->getOriginalBufferSize();
    131     }
    132     DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy);
    133     BigramListReadWriteUtils::BigramFlags bigramFlags;
    134     int bigramEntryCount = 0;
    135     do {
    136         if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) {
    137             AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d",
    138                     bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT);
    139             ASSERT(false);
    140             return false;
    141         }
    142         int bigramEntryPos = *bigramListPos;
    143         int originalBigramPos;
    144         // The buffer address can be changed after calling buffer writing methods.
    145         BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
    146                 mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos,
    147                 bigramListPos);
    148         if (usesAdditionalBuffer) {
    149             bigramEntryPos += mBuffer->getOriginalBufferSize();
    150         }
    151         if (originalBigramPos == NOT_A_DICT_POS) {
    152             // This entry has already been removed.
    153             continue;
    154         }
    155         if (usesAdditionalBuffer) {
    156             originalBigramPos += mBuffer->getOriginalBufferSize();
    157         }
    158         const int bigramTargetNodePos =
    159                 followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
    160         nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos);
    161         if (nodeReader.isDeleted() || !nodeReader.isTerminal()
    162                 || bigramTargetNodePos == NOT_A_DICT_POS) {
    163             // The target is no longer valid terminal. Invalidate the current bigram entry.
    164             if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
    165                     NOT_A_DICT_POS /* targetPtNodePos */, &bigramEntryPos)) {
    166                 return false;
    167             }
    168             continue;
    169         }
    170         bool isRemoved = false;
    171         if (!updateProbabilityForDecay(bigramFlags, bigramTargetNodePos, &bigramEntryPos,
    172                 &isRemoved)) {
    173             return false;
    174         }
    175         if (!isRemoved) {
    176             (*outValidBigramEntryCount) += 1;
    177         }
    178     } while(BigramListReadWriteUtils::hasNext(bigramFlags));
    179     return true;
    180 }
    181 
    182 // Updates bigram target PtNode positions in the list after the placing step in GC.
    183 bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
    184         const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
    185                 ptNodePositionRelocationMap, int *const outBigramEntryCount) {
    186     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
    187     if (usesAdditionalBuffer) {
    188         *bigramListPos -= mBuffer->getOriginalBufferSize();
    189     }
    190     BigramListReadWriteUtils::BigramFlags bigramFlags;
    191     int bigramEntryCount = 0;
    192     do {
    193         if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) {
    194             AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d",
    195                     bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT);
    196             ASSERT(false);
    197             return false;
    198         }
    199         int bigramEntryPos = *bigramListPos;
    200         if (usesAdditionalBuffer) {
    201             bigramEntryPos += mBuffer->getOriginalBufferSize();
    202         }
    203         int bigramTargetPtNodePos;
    204         // The buffer address can be changed after calling buffer writing methods.
    205         BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
    206                 mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &bigramTargetPtNodePos,
    207                 bigramListPos);
    208         if (bigramTargetPtNodePos == NOT_A_DICT_POS) {
    209             continue;
    210         }
    211         if (usesAdditionalBuffer) {
    212             bigramTargetPtNodePos += mBuffer->getOriginalBufferSize();
    213         }
    214 
    215         DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it =
    216                 ptNodePositionRelocationMap->find(bigramTargetPtNodePos);
    217         if (it != ptNodePositionRelocationMap->end()) {
    218             bigramTargetPtNodePos = it->second;
    219         } else {
    220             bigramTargetPtNodePos = NOT_A_DICT_POS;
    221         }
    222         if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
    223                 bigramTargetPtNodePos, &bigramEntryPos)) {
    224             return false;
    225         }
    226     } while(BigramListReadWriteUtils::hasNext(bigramFlags));
    227     (*outBigramEntryCount) = bigramEntryCount;
    228     return true;
    229 }
    230 
    231 bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
    232         const int probability, int *const bigramListPos, bool *const outAddedNewBigram) {
    233     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
    234     if (usesAdditionalBuffer) {
    235         *bigramListPos -= mBuffer->getOriginalBufferSize();
    236     }
    237     BigramListReadWriteUtils::BigramFlags bigramFlags;
    238     int bigramEntryCount = 0;
    239     do {
    240         if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) {
    241             AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d",
    242                     bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT);
    243             ASSERT(false);
    244             return false;
    245         }
    246         int entryPos = *bigramListPos;
    247         if (usesAdditionalBuffer) {
    248             entryPos += mBuffer->getOriginalBufferSize();
    249         }
    250         int originalBigramPos;
    251         // The buffer address can be changed after calling buffer writing methods.
    252         BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
    253                 mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos,
    254                 bigramListPos);
    255         if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) {
    256             originalBigramPos += mBuffer->getOriginalBufferSize();
    257         }
    258         if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
    259             // Update this bigram entry.
    260             *outAddedNewBigram = false;
    261             const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags(
    262                     bigramFlags);
    263             const int probabilityToWrite = mIsDecayingDict ?
    264                     ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
    265                             probability) : probability;
    266             const BigramListReadWriteUtils::BigramFlags updatedFlags =
    267                     BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags,
    268                             probabilityToWrite);
    269             return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
    270                     originalBigramPos, &entryPos);
    271         }
    272         if (BigramListReadWriteUtils::hasNext(bigramFlags)) {
    273             continue;
    274         }
    275         // The current last entry is found.
    276         // First, update the flags of the last entry.
    277         if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
    278             *outAddedNewBigram = false;
    279             return false;
    280         }
    281         if (usesAdditionalBuffer) {
    282             *bigramListPos += mBuffer->getOriginalBufferSize();
    283         }
    284         // Then, add a new entry after the last entry.
    285         *outAddedNewBigram = true;
    286         return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
    287     } while(BigramListReadWriteUtils::hasNext(bigramFlags));
    288     // We return directly from the while loop.
    289     ASSERT(false);
    290     return false;
    291 }
    292 
    293 bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability,
    294         int *const writingPos) {
    295     // hasNext is false because we are adding a new bigram entry at the end of the bigram list.
    296     const int probabilityToWrite = mIsDecayingDict ?
    297             ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) :
    298                     probability;
    299     return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos,
    300             probabilityToWrite, false /* hasNext */, writingPos);
    301 }
    302 
    303 bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) {
    304     const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos);
    305     int pos = bigramListPos;
    306     if (usesAdditionalBuffer) {
    307         pos -= mBuffer->getOriginalBufferSize();
    308     }
    309     BigramListReadWriteUtils::BigramFlags bigramFlags;
    310     int bigramEntryCount = 0;
    311     do {
    312         if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) {
    313             AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d",
    314                     bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT);
    315             ASSERT(false);
    316             return false;
    317         }
    318         int bigramEntryPos = pos;
    319         int originalBigramPos;
    320         // The buffer address can be changed after calling buffer writing methods.
    321         BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
    322                 mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, &pos);
    323         if (usesAdditionalBuffer) {
    324             bigramEntryPos += mBuffer->getOriginalBufferSize();
    325         }
    326         if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) {
    327             originalBigramPos += mBuffer->getOriginalBufferSize();
    328         }
    329         const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
    330         if (bigramPos != bigramTargetPos) {
    331             continue;
    332         }
    333         // Target entry is found. Write an invalid target position to mark the bigram invalid.
    334         return BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
    335                 NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos);
    336     } while(BigramListReadWriteUtils::hasNext(bigramFlags));
    337     return false;
    338 }
    339 
    340 int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos(
    341         const int originalBigramPos) const {
    342     if (originalBigramPos == NOT_A_DICT_POS) {
    343         return NOT_A_DICT_POS;
    344     }
    345     int currentPos = originalBigramPos;
    346     DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy);
    347     nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos);
    348     int bigramLinkCount = 0;
    349     while (nodeReader.getBigramLinkedNodePos() != NOT_A_DICT_POS) {
    350         currentPos = nodeReader.getBigramLinkedNodePos();
    351         nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos);
    352         bigramLinkCount++;
    353         if (bigramLinkCount > CONTINUING_BIGRAM_LINK_COUNT_LIMIT) {
    354             AKLOGE("Bigram link is invalid. start position: %d", originalBigramPos);
    355             ASSERT(false);
    356             return NOT_A_DICT_POS;
    357         }
    358     }
    359     return currentPos;
    360 }
    361 
    362 bool DynamicBigramListPolicy::updateProbabilityForDecay(
    363         const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
    364         int *const bigramEntryPos, bool *const outRemoved) const {
    365     *outRemoved = false;
    366     if (mIsDecayingDict) {
    367         // Update bigram probability for decaying.
    368         const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
    369                 BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy);
    370         if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
    371             // Write new probability.
    372             const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =
    373                     BigramListReadWriteUtils::setProbabilityInFlags(
    374                             bigramFlags, newProbability);
    375             if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedBigramFlags,
    376                     targetPtNodePos, bigramEntryPos)) {
    377                 return false;
    378             }
    379         } else {
    380             // Remove current bigram entry.
    381             *outRemoved = true;
    382             if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
    383                     NOT_A_DICT_POS /* targetPtNodePos */, bigramEntryPos)) {
    384                 return false;
    385             }
    386         }
    387     }
    388     return true;
    389 }
    390 
    391 } // namespace latinime
    392