Home | History | Annotate | Download | only in v402
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18  * !!!!! DO NOT EDIT THIS FILE !!!!!
     19  *
     20  * This file was generated from
     21  *   dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
     22  */
     23 
     24 #include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
     25 
     26 #include <cstring>
     27 #include <queue>
     28 
     29 #include "dictionary/header/header_policy.h"
     30 #include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
     31 #include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
     32 #include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
     33 #include "dictionary/structure/backward/v402/ver4_dict_constants.h"
     34 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
     35 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
     36 #include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
     37 #include "dictionary/utils/buffer_with_extendable_buffer.h"
     38 #include "dictionary/utils/file_utils.h"
     39 #include "dictionary/utils/forgetting_curve_utils.h"
     40 
     41 namespace latinime {
     42 namespace backward {
     43 namespace v402 {
     44 
     45 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
     46         const EntryCounts &entryCounts) const {
     47     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
     48     BufferWithExtendableBuffer headerBuffer(
     49             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
     50     const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
     51             + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
     52     if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
     53             entryCounts, extendedRegionSize, &headerBuffer)) {
     54         AKLOGE("Cannot write header structure to buffer. "
     55                 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
     56                 "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
     57                 entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
     58         return false;
     59     }
     60     return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
     61 }
     62 
     63 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
     64         const char *const dictDirPath) {
     65     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
     66     Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
     67             Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
     68                     Ver4DictConstants::MAX_DICTIONARY_SIZE));
     69     int unigramCount = 0;
     70     int bigramCount = 0;
     71     if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
     72         return false;
     73     }
     74     BufferWithExtendableBuffer headerBuffer(
     75             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
     76     MutableEntryCounters entryCounters;
     77     entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
     78     entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
     79     if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
     80             entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
     81         return false;
     82     }
     83     return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
     84 }
     85 
     86 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
     87         const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
     88         int *const outUnigramCount, int *const outBigramCount) {
     89     Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
     90             mBuffers->getProbabilityDictContent(), headerPolicy);
     91     Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
     92     Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
     93             mBuffers->getTerminalPositionLookupTable(), headerPolicy);
     94     Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
     95             mBuffers->getTerminalPositionLookupTable());
     96     Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
     97             mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
     98             &shortcutPolicy);
     99 
    100     DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
    101     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    102     DynamicPtGcEventListeners
    103             ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
    104                     traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
    105                             &ptNodeWriter);
    106     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
    107             &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
    108         return false;
    109     }
    110     const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
    111             .getValidUnigramCount();
    112     const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
    113     if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
    114         if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
    115             AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
    116                     maxUnigramCount);
    117             return false;
    118         }
    119     }
    120 
    121     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    122     DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
    123             traversePolicyToUpdateBigramProbability(&ptNodeWriter);
    124     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
    125             &traversePolicyToUpdateBigramProbability)) {
    126         return false;
    127     }
    128     const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
    129     const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
    130     if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
    131         if (!truncateBigrams(maxBigramCount)) {
    132             AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
    133             return false;
    134         }
    135     }
    136 
    137     // Mapping from positions in mBuffer to positions in bufferToWrite.
    138     PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
    139     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    140     Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
    141             buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
    142             &shortcutPolicy);
    143     DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
    144             traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
    145                     buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
    146     if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
    147             &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
    148         return false;
    149     }
    150 
    151     // Create policy instances for the GCed dictionary.
    152     Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
    153             buffersToWrite->getProbabilityDictContent(), headerPolicy);
    154     Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
    155     Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
    156             buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
    157     Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
    158             buffersToWrite->getTerminalPositionLookupTable());
    159     Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
    160             buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
    161             &newShortcutPolicy);
    162     // Re-assign terminal IDs for valid terminal PtNodes.
    163     TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
    164     if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
    165             &terminalIdMap)) {
    166         return false;
    167     }
    168     // Run GC for probability dict content.
    169     if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
    170             mBuffers->getProbabilityDictContent())) {
    171         return false;
    172     }
    173     // Run GC for bigram dict content.
    174     if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
    175             mBuffers->getBigramDictContent(), outBigramCount)) {
    176         return false;
    177     }
    178     // Run GC for shortcut dict content.
    179     if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
    180             mBuffers->getShortcutDictContent())) {
    181         return false;
    182     }
    183     DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
    184     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    185     DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
    186             traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
    187     if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
    188             &traversePolicyToUpdateAllPositionFields)) {
    189         return false;
    190     }
    191     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
    192     TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
    193             traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
    194     if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
    195             &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
    196         return false;
    197     }
    198     *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
    199     return true;
    200 }
    201 
    202 bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
    203         const Ver4PatriciaTrieNodeReader *const ptNodeReader,
    204         Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
    205     const TerminalPositionLookupTable *const terminalPosLookupTable =
    206             mBuffers->getTerminalPositionLookupTable();
    207     const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
    208     std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
    209             priorityQueue;
    210     for (int i = 0; i < nextTerminalId; ++i) {
    211         const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
    212         if (terminalPos == NOT_A_DICT_POS) {
    213             continue;
    214         }
    215         const ProbabilityEntry probabilityEntry =
    216                 mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
    217         const int probability = probabilityEntry.hasHistoricalInfo() ?
    218                 ForgettingCurveUtils::decodeProbability(
    219                         probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
    220                 probabilityEntry.getProbability();
    221         priorityQueue.push(DictProbability(terminalPos, probability,
    222                 probabilityEntry.getHistoricalInfo()->getTimestamp()));
    223     }
    224 
    225     // Delete unigrams.
    226     while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
    227         const int ptNodePos = priorityQueue.top().getDictPos();
    228         priorityQueue.pop();
    229         const PtNodeParams ptNodeParams =
    230                 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    231         if (ptNodeParams.representsNonWordInfo()) {
    232             continue;
    233         }
    234         if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
    235             AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
    236             return false;
    237         }
    238     }
    239     return true;
    240 }
    241 
    242 bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
    243     const TerminalPositionLookupTable *const terminalPosLookupTable =
    244             mBuffers->getTerminalPositionLookupTable();
    245     const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
    246     std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
    247             priorityQueue;
    248     BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
    249     for (int i = 0; i < nextTerminalId; ++i) {
    250         const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
    251         if (bigramListPos == NOT_A_DICT_POS) {
    252             continue;
    253         }
    254         bool hasNext = true;
    255         int readingPos = bigramListPos;
    256         while (hasNext) {
    257             const int entryPos = readingPos;
    258             const BigramEntry bigramEntry =
    259                     bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
    260             hasNext = bigramEntry.hasNext();
    261             if (!bigramEntry.isValid()) {
    262                 continue;
    263             }
    264             const int probability = bigramEntry.hasHistoricalInfo() ?
    265                     ForgettingCurveUtils::decodeProbability(
    266                             bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
    267                     bigramEntry.getProbability();
    268             priorityQueue.push(DictProbability(entryPos, probability,
    269                     bigramEntry.getHistoricalInfo()->getTimestamp()));
    270         }
    271     }
    272 
    273     // Delete bigrams.
    274     while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
    275         const int entryPos = priorityQueue.top().getDictPos();
    276         const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
    277         const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
    278         if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
    279             AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
    280             return false;
    281         }
    282         priorityQueue.pop();
    283     }
    284     return true;
    285 }
    286 
    287 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
    288         ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
    289     if (!ptNodeParams->isTerminal()) {
    290         return true;
    291     }
    292     TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
    293             mTerminalIdMap->find(ptNodeParams->getTerminalId());
    294     if (it == mTerminalIdMap->end()) {
    295         AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
    296                 ptNodeParams->getTerminalId(), mTerminalIdMap->size());
    297         return false;
    298     }
    299     if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
    300         AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
    301     }
    302     return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
    303 }
    304 
    305 } // namespace v402
    306 } // namespace backward
    307 } // namespace latinime
    308