1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* 18 * !!!!! DO NOT EDIT THIS FILE !!!!! 19 * 20 * This file was generated from 21 * dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp 22 */ 23 24 #include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" 25 26 #include <cstring> 27 #include <queue> 28 29 #include "dictionary/header/header_policy.h" 30 #include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" 31 #include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" 32 #include "dictionary/structure/backward/v402/ver4_dict_buffers.h" 33 #include "dictionary/structure/backward/v402/ver4_dict_constants.h" 34 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" 35 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" 36 #include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" 37 #include "dictionary/utils/buffer_with_extendable_buffer.h" 38 #include "dictionary/utils/file_utils.h" 39 #include "dictionary/utils/forgetting_curve_utils.h" 40 41 namespace latinime { 42 namespace backward { 43 namespace v402 { 44 45 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, 46 const EntryCounts &entryCounts) const { 47 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); 48 BufferWithExtendableBuffer headerBuffer( 49 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); 50 const int extendedRegionSize = headerPolicy->getExtendedRegionSize() 51 + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); 52 if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, 53 entryCounts, extendedRegionSize, &headerBuffer)) { 54 AKLOGE("Cannot write header structure to buffer. " 55 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " 56 "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), 57 entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize); 58 return false; 59 } 60 return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); 61 } 62 63 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, 64 const char *const dictDirPath) { 65 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); 66 Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( 67 Ver4DictBuffers::createVer4DictBuffers(headerPolicy, 68 Ver4DictConstants::MAX_DICTIONARY_SIZE)); 69 int unigramCount = 0; 70 int bigramCount = 0; 71 if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { 72 return false; 73 } 74 BufferWithExtendableBuffer headerBuffer( 75 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); 76 MutableEntryCounters entryCounters; 77 entryCounters.setNgramCount(NgramType::Unigram, unigramCount); 78 entryCounters.setNgramCount(NgramType::Bigram, bigramCount); 79 if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, 80 entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { 81 return false; 82 } 83 return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); 84 } 85 86 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, 87 const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, 88 int *const outUnigramCount, int *const outBigramCount) { 89 Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), 90 mBuffers->getProbabilityDictContent(), headerPolicy); 91 Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); 92 Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), 93 mBuffers->getTerminalPositionLookupTable(), headerPolicy); 94 Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), 95 mBuffers->getTerminalPositionLookupTable()); 96 Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), 97 mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, 98 &shortcutPolicy); 99 100 DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); 101 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 102 DynamicPtGcEventListeners 103 ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted 104 traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( 105 &ptNodeWriter); 106 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( 107 &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { 108 return false; 109 } 110 const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted 111 .getValidUnigramCount(); 112 const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram); 113 if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { 114 if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { 115 AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, 116 maxUnigramCount); 117 return false; 118 } 119 } 120 121 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 122 DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability 123 traversePolicyToUpdateBigramProbability(&ptNodeWriter); 124 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( 125 &traversePolicyToUpdateBigramProbability)) { 126 return false; 127 } 128 const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); 129 const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram); 130 if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { 131 if (!truncateBigrams(maxBigramCount)) { 132 AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); 133 return false; 134 } 135 } 136 137 // Mapping from positions in mBuffer to positions in bufferToWrite. 138 PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; 139 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 140 Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), 141 buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, 142 &shortcutPolicy); 143 DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer 144 traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, 145 buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); 146 if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( 147 &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { 148 return false; 149 } 150 151 // Create policy instances for the GCed dictionary. 152 Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), 153 buffersToWrite->getProbabilityDictContent(), headerPolicy); 154 Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); 155 Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), 156 buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); 157 Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), 158 buffersToWrite->getTerminalPositionLookupTable()); 159 Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), 160 buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, 161 &newShortcutPolicy); 162 // Re-assign terminal IDs for valid terminal PtNodes. 163 TerminalPositionLookupTable::TerminalIdMap terminalIdMap; 164 if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( 165 &terminalIdMap)) { 166 return false; 167 } 168 // Run GC for probability dict content. 169 if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, 170 mBuffers->getProbabilityDictContent())) { 171 return false; 172 } 173 // Run GC for bigram dict content. 174 if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, 175 mBuffers->getBigramDictContent(), outBigramCount)) { 176 return false; 177 } 178 // Run GC for shortcut dict content. 179 if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, 180 mBuffers->getShortcutDictContent())) { 181 return false; 182 } 183 DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); 184 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 185 DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields 186 traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); 187 if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( 188 &traversePolicyToUpdateAllPositionFields)) { 189 return false; 190 } 191 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 192 TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds 193 traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); 194 if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( 195 &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { 196 return false; 197 } 198 *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); 199 return true; 200 } 201 202 bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( 203 const Ver4PatriciaTrieNodeReader *const ptNodeReader, 204 Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { 205 const TerminalPositionLookupTable *const terminalPosLookupTable = 206 mBuffers->getTerminalPositionLookupTable(); 207 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); 208 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> 209 priorityQueue; 210 for (int i = 0; i < nextTerminalId; ++i) { 211 const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); 212 if (terminalPos == NOT_A_DICT_POS) { 213 continue; 214 } 215 const ProbabilityEntry probabilityEntry = 216 mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); 217 const int probability = probabilityEntry.hasHistoricalInfo() ? 218 ForgettingCurveUtils::decodeProbability( 219 probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : 220 probabilityEntry.getProbability(); 221 priorityQueue.push(DictProbability(terminalPos, probability, 222 probabilityEntry.getHistoricalInfo()->getTimestamp())); 223 } 224 225 // Delete unigrams. 226 while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { 227 const int ptNodePos = priorityQueue.top().getDictPos(); 228 priorityQueue.pop(); 229 const PtNodeParams ptNodeParams = 230 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 231 if (ptNodeParams.representsNonWordInfo()) { 232 continue; 233 } 234 if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { 235 AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); 236 return false; 237 } 238 } 239 return true; 240 } 241 242 bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { 243 const TerminalPositionLookupTable *const terminalPosLookupTable = 244 mBuffers->getTerminalPositionLookupTable(); 245 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); 246 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> 247 priorityQueue; 248 BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); 249 for (int i = 0; i < nextTerminalId; ++i) { 250 const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); 251 if (bigramListPos == NOT_A_DICT_POS) { 252 continue; 253 } 254 bool hasNext = true; 255 int readingPos = bigramListPos; 256 while (hasNext) { 257 const int entryPos = readingPos; 258 const BigramEntry bigramEntry = 259 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 260 hasNext = bigramEntry.hasNext(); 261 if (!bigramEntry.isValid()) { 262 continue; 263 } 264 const int probability = bigramEntry.hasHistoricalInfo() ? 265 ForgettingCurveUtils::decodeProbability( 266 bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : 267 bigramEntry.getProbability(); 268 priorityQueue.push(DictProbability(entryPos, probability, 269 bigramEntry.getHistoricalInfo()->getTimestamp())); 270 } 271 } 272 273 // Delete bigrams. 274 while (static_cast<int>(priorityQueue.size()) > maxBigramCount) { 275 const int entryPos = priorityQueue.top().getDictPos(); 276 const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); 277 const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); 278 if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { 279 AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); 280 return false; 281 } 282 priorityQueue.pop(); 283 } 284 return true; 285 } 286 287 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds 288 ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { 289 if (!ptNodeParams->isTerminal()) { 290 return true; 291 } 292 TerminalPositionLookupTable::TerminalIdMap::const_iterator it = 293 mTerminalIdMap->find(ptNodeParams->getTerminalId()); 294 if (it == mTerminalIdMap->end()) { 295 AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", 296 ptNodeParams->getTerminalId(), mTerminalIdMap->size()); 297 return false; 298 } 299 if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { 300 AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); 301 } 302 return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); 303 } 304 305 } // namespace v402 306 } // namespace backward 307 } // namespace latinime 308