1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* 18 * !!!!! DO NOT EDIT THIS FILE !!!!! 19 * 20 * This file was generated from 21 * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp 22 */ 23 24 #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" 25 26 #include <cstring> 27 #include <queue> 28 29 #include "suggest/policyimpl/dictionary/header/header_policy.h" 30 #include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" 31 #include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" 32 #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" 33 #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" 34 #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" 35 #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" 36 #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" 37 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" 38 #include "suggest/policyimpl/dictionary/utils/file_utils.h" 39 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 40 41 namespace latinime { 42 namespace backward { 43 namespace v402 { 44 45 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, 46 const int unigramCount, const int bigramCount) const { 47 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); 48 BufferWithExtendableBuffer headerBuffer( 49 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); 50 const int extendedRegionSize = headerPolicy->getExtendedRegionSize() 51 + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); 52 if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, 53 unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { 54 AKLOGE("Cannot write header structure to buffer. " 55 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " 56 "extendedRegionSize: %d", false, unigramCount, bigramCount, 57 extendedRegionSize); 58 return false; 59 } 60 return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); 61 } 62 63 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, 64 const char *const dictDirPath) { 65 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); 66 Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( 67 Ver4DictBuffers::createVer4DictBuffers(headerPolicy, 68 Ver4DictConstants::MAX_DICTIONARY_SIZE)); 69 int unigramCount = 0; 70 int bigramCount = 0; 71 if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { 72 return false; 73 } 74 BufferWithExtendableBuffer headerBuffer( 75 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); 76 if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, 77 unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { 78 return false; 79 } 80 return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); 81 } 82 83 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, 84 const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, 85 int *const outUnigramCount, int *const outBigramCount) { 86 Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), 87 mBuffers->getProbabilityDictContent(), headerPolicy); 88 Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); 89 Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), 90 mBuffers->getTerminalPositionLookupTable(), headerPolicy); 91 Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), 92 mBuffers->getTerminalPositionLookupTable()); 93 Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), 94 mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, 95 &shortcutPolicy); 96 97 DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); 98 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 99 DynamicPtGcEventListeners 100 ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted 101 traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( 102 &ptNodeWriter); 103 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( 104 &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { 105 return false; 106 } 107 const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted 108 .getValidUnigramCount(); 109 const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); 110 if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { 111 if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { 112 AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, 113 maxUnigramCount); 114 return false; 115 } 116 } 117 118 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 119 DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability 120 traversePolicyToUpdateBigramProbability(&ptNodeWriter); 121 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( 122 &traversePolicyToUpdateBigramProbability)) { 123 return false; 124 } 125 const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); 126 const int maxBigramCount = headerPolicy->getMaxBigramCount(); 127 if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { 128 if (!truncateBigrams(maxBigramCount)) { 129 AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); 130 return false; 131 } 132 } 133 134 // Mapping from positions in mBuffer to positions in bufferToWrite. 135 PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; 136 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 137 Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), 138 buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, 139 &shortcutPolicy); 140 DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer 141 traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, 142 buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); 143 if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( 144 &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { 145 return false; 146 } 147 148 // Create policy instances for the GCed dictionary. 149 Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), 150 buffersToWrite->getProbabilityDictContent(), headerPolicy); 151 Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); 152 Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), 153 buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); 154 Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), 155 buffersToWrite->getTerminalPositionLookupTable()); 156 Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), 157 buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, 158 &newShortcutPolicy); 159 // Re-assign terminal IDs for valid terminal PtNodes. 160 TerminalPositionLookupTable::TerminalIdMap terminalIdMap; 161 if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( 162 &terminalIdMap)) { 163 return false; 164 } 165 // Run GC for probability dict content. 166 if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, 167 mBuffers->getProbabilityDictContent())) { 168 return false; 169 } 170 // Run GC for bigram dict content. 171 if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, 172 mBuffers->getBigramDictContent(), outBigramCount)) { 173 return false; 174 } 175 // Run GC for shortcut dict content. 176 if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, 177 mBuffers->getShortcutDictContent())) { 178 return false; 179 } 180 DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); 181 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 182 DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields 183 traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); 184 if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( 185 &traversePolicyToUpdateAllPositionFields)) { 186 return false; 187 } 188 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); 189 TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds 190 traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); 191 if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( 192 &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { 193 return false; 194 } 195 *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); 196 return true; 197 } 198 199 bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( 200 const Ver4PatriciaTrieNodeReader *const ptNodeReader, 201 Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { 202 const TerminalPositionLookupTable *const terminalPosLookupTable = 203 mBuffers->getTerminalPositionLookupTable(); 204 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); 205 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> 206 priorityQueue; 207 for (int i = 0; i < nextTerminalId; ++i) { 208 const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); 209 if (terminalPos == NOT_A_DICT_POS) { 210 continue; 211 } 212 const ProbabilityEntry probabilityEntry = 213 mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); 214 const int probability = probabilityEntry.hasHistoricalInfo() ? 215 ForgettingCurveUtils::decodeProbability( 216 probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : 217 probabilityEntry.getProbability(); 218 priorityQueue.push(DictProbability(terminalPos, probability, 219 probabilityEntry.getHistoricalInfo()->getTimeStamp())); 220 } 221 222 // Delete unigrams. 223 while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { 224 const int ptNodePos = priorityQueue.top().getDictPos(); 225 priorityQueue.pop(); 226 const PtNodeParams ptNodeParams = 227 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 228 if (ptNodeParams.representsNonWordInfo()) { 229 continue; 230 } 231 if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { 232 AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); 233 return false; 234 } 235 } 236 return true; 237 } 238 239 bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { 240 const TerminalPositionLookupTable *const terminalPosLookupTable = 241 mBuffers->getTerminalPositionLookupTable(); 242 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); 243 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> 244 priorityQueue; 245 BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); 246 for (int i = 0; i < nextTerminalId; ++i) { 247 const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); 248 if (bigramListPos == NOT_A_DICT_POS) { 249 continue; 250 } 251 bool hasNext = true; 252 int readingPos = bigramListPos; 253 while (hasNext) { 254 const int entryPos = readingPos; 255 const BigramEntry bigramEntry = 256 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 257 hasNext = bigramEntry.hasNext(); 258 if (!bigramEntry.isValid()) { 259 continue; 260 } 261 const int probability = bigramEntry.hasHistoricalInfo() ? 262 ForgettingCurveUtils::decodeProbability( 263 bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : 264 bigramEntry.getProbability(); 265 priorityQueue.push(DictProbability(entryPos, probability, 266 bigramEntry.getHistoricalInfo()->getTimeStamp())); 267 } 268 } 269 270 // Delete bigrams. 271 while (static_cast<int>(priorityQueue.size()) > maxBigramCount) { 272 const int entryPos = priorityQueue.top().getDictPos(); 273 const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); 274 const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); 275 if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { 276 AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); 277 return false; 278 } 279 priorityQueue.pop(); 280 } 281 return true; 282 } 283 284 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds 285 ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { 286 if (!ptNodeParams->isTerminal()) { 287 return true; 288 } 289 TerminalPositionLookupTable::TerminalIdMap::const_iterator it = 290 mTerminalIdMap->find(ptNodeParams->getTerminalId()); 291 if (it == mTerminalIdMap->end()) { 292 AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", 293 ptNodeParams->getTerminalId(), mTerminalIdMap->size()); 294 return false; 295 } 296 if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { 297 AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); 298 } 299 return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); 300 } 301 302 } // namespace v402 303 } // namespace backward 304 } // namespace latinime 305