Home | History | Annotate | Download | only in v4
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
     18 
     19 #include <cstring>
     20 #include <queue>
     21 
     22 #include "suggest/policyimpl/dictionary/header/header_policy.h"
     23 #include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
     24 #include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
     25 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
     26 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
     27 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
     28 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
     29 #include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
     30 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
     31 #include "suggest/policyimpl/dictionary/utils/file_utils.h"
     32 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
     33 
     34 namespace latinime {
     35 
     36 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
     37         const int unigramCount, const int bigramCount) const {
     38     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
     39     BufferWithExtendableBuffer headerBuffer(
     40             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
     41     const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
     42             + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
     43     if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
     44             unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) {
     45         AKLOGE("Cannot write header structure to buffer. "
     46                 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
     47                 "extendedRegionSize: %d", false, unigramCount, bigramCount,
     48                 extendedRegionSize);
     49         return false;
     50     }
     51     return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
     52 }
     53 
     54 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
     55         const char *const dictDirPath) {
     56     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
     57     Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
     58             Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
     59                     Ver4DictConstants::MAX_DICTIONARY_SIZE));
     60     int unigramCount = 0;
     61     int bigramCount = 0;
     62     if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
     63         return false;
     64     }
     65     BufferWithExtendableBuffer headerBuffer(
     66             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
     67     if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
     68             unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) {
     69         return false;
     70     }
     71     return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
     72 }
     73 
     74 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
     75         const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
     76         int *const outUnigramCount, int *const outBigramCount) {
     77     Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
     78             mBuffers->getLanguageModelDictContent(), headerPolicy);
     79     Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
     80     Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
     81             mBuffers->getTerminalPositionLookupTable(), headerPolicy);
     82     Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
     83             mBuffers->getTerminalPositionLookupTable());
     84     Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
     85             mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
     86             &shortcutPolicy);
     87 
     88     DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
     89     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
     90     DynamicPtGcEventListeners
     91             ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
     92                     traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
     93                             &ptNodeWriter);
     94     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
     95             &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
     96         return false;
     97     }
     98     const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
     99             .getValidUnigramCount();
    100     const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
    101     if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
    102         if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
    103             AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
    104                     maxUnigramCount);
    105             return false;
    106         }
    107     }
    108 
    109     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    110     DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
    111             traversePolicyToUpdateBigramProbability(&ptNodeWriter);
    112     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
    113             &traversePolicyToUpdateBigramProbability)) {
    114         return false;
    115     }
    116     const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
    117     const int maxBigramCount = headerPolicy->getMaxBigramCount();
    118     if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
    119         if (!truncateBigrams(maxBigramCount)) {
    120             AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
    121             return false;
    122         }
    123     }
    124 
    125     // Mapping from positions in mBuffer to positions in bufferToWrite.
    126     PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
    127     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    128     Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
    129             buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
    130             &shortcutPolicy);
    131     DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
    132             traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
    133                     buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
    134     if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
    135             &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
    136         return false;
    137     }
    138 
    139     // Create policy instances for the GCed dictionary.
    140     Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
    141             buffersToWrite->getLanguageModelDictContent(), headerPolicy);
    142     Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
    143     Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
    144             buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
    145     Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
    146             buffersToWrite->getTerminalPositionLookupTable());
    147     Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
    148             buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
    149             &newShortcutPolicy);
    150     // Re-assign terminal IDs for valid terminal PtNodes.
    151     TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
    152     if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
    153             &terminalIdMap)) {
    154         return false;
    155     }
    156     // Run GC for probability dict content.
    157     if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
    158             mBuffers->getLanguageModelDictContent(), nullptr /* outNgramCount */)) {
    159         return false;
    160     }
    161     // Run GC for bigram dict content.
    162     if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
    163             mBuffers->getBigramDictContent(), outBigramCount)) {
    164         return false;
    165     }
    166     // Run GC for shortcut dict content.
    167     if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
    168             mBuffers->getShortcutDictContent())) {
    169         return false;
    170     }
    171     DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
    172     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    173     DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
    174             traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
    175     if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
    176             &traversePolicyToUpdateAllPositionFields)) {
    177         return false;
    178     }
    179     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    180     TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
    181             traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
    182     if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
    183             &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
    184         return false;
    185     }
    186     *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
    187     return true;
    188 }
    189 
    190 bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
    191         const Ver4PatriciaTrieNodeReader *const ptNodeReader,
    192         Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
    193     const TerminalPositionLookupTable *const terminalPosLookupTable =
    194             mBuffers->getTerminalPositionLookupTable();
    195     const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
    196     std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
    197             priorityQueue;
    198     for (int i = 0; i < nextTerminalId; ++i) {
    199         const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
    200         if (terminalPos == NOT_A_DICT_POS) {
    201             continue;
    202         }
    203         const ProbabilityEntry probabilityEntry =
    204                 mBuffers->getLanguageModelDictContent()->getProbabilityEntry(i);
    205         const int probability = probabilityEntry.hasHistoricalInfo() ?
    206                 ForgettingCurveUtils::decodeProbability(
    207                         probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
    208                 probabilityEntry.getProbability();
    209         priorityQueue.push(DictProbability(terminalPos, probability,
    210                 probabilityEntry.getHistoricalInfo()->getTimeStamp()));
    211     }
    212 
    213     // Delete unigrams.
    214     while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
    215         const int ptNodePos = priorityQueue.top().getDictPos();
    216         priorityQueue.pop();
    217         const PtNodeParams ptNodeParams =
    218                 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    219         if (ptNodeParams.representsNonWordInfo()) {
    220             continue;
    221         }
    222         if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
    223             AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
    224             return false;
    225         }
    226     }
    227     return true;
    228 }
    229 
    230 bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
    231     const TerminalPositionLookupTable *const terminalPosLookupTable =
    232             mBuffers->getTerminalPositionLookupTable();
    233     const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
    234     std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
    235             priorityQueue;
    236     BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
    237     for (int i = 0; i < nextTerminalId; ++i) {
    238         const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
    239         if (bigramListPos == NOT_A_DICT_POS) {
    240             continue;
    241         }
    242         bool hasNext = true;
    243         int readingPos = bigramListPos;
    244         while (hasNext) {
    245             const BigramEntry bigramEntry =
    246                     bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
    247             const int entryPos = readingPos - bigramDictContent->getBigramEntrySize();
    248             hasNext = bigramEntry.hasNext();
    249             if (!bigramEntry.isValid()) {
    250                 continue;
    251             }
    252             const int probability = bigramEntry.hasHistoricalInfo() ?
    253                     ForgettingCurveUtils::decodeProbability(
    254                             bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
    255                     bigramEntry.getProbability();
    256             priorityQueue.push(DictProbability(entryPos, probability,
    257                     bigramEntry.getHistoricalInfo()->getTimeStamp()));
    258         }
    259     }
    260 
    261     // Delete bigrams.
    262     while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
    263         const int entryPos = priorityQueue.top().getDictPos();
    264         const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
    265         const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
    266         if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
    267             AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
    268             return false;
    269         }
    270         priorityQueue.pop();
    271     }
    272     return true;
    273 }
    274 
    275 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
    276         ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
    277     if (!ptNodeParams->isTerminal()) {
    278         return true;
    279     }
    280     TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
    281             mTerminalIdMap->find(ptNodeParams->getTerminalId());
    282     if (it == mTerminalIdMap->end()) {
    283         AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
    284                 ptNodeParams->getTerminalId(), mTerminalIdMap->size());
    285         return false;
    286     }
    287     if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
    288         AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
    289         return false;
    290     }
    291     return true;
    292 }
    293 
    294 } // namespace latinime
    295