1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" 18 19 #include <cstdio> 20 #include <cstring> 21 #include <ctime> 22 23 #include "defines.h" 24 #include "suggest/core/dicnode/dic_node.h" 25 #include "suggest/core/dicnode/dic_node_vector.h" 26 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" 27 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" 28 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" 29 #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" 30 #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" 31 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 32 #include "suggest/policyimpl/dictionary/utils/probability_utils.h" 33 34 namespace latinime { 35 36 // Note that these are corresponding definitions in Java side in BinaryDictionaryTests and 37 // BinaryDictionaryDecayingTests. 38 const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 39 const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 40 const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 41 const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 42 const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = 43 "SET_NEEDS_TO_DECAY_FOR_TESTING"; 44 const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; 45 const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 46 DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; 47 48 void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, 49 DicNodeVector *const childDicNodes) const { 50 if (!dicNode->hasChildren()) { 51 return; 52 } 53 DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, 54 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 55 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos()); 56 const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); 57 while (!readingHelper.isEnd()) { 58 bool isTerminal = nodeReader->isTerminal() && !nodeReader->isDeleted(); 59 if (isTerminal && mHeaderPolicy.isDecayingDict()) { 60 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose 61 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a 62 // valid terminal DicNode. 63 isTerminal = getProbability(nodeReader->getProbability(), NOT_A_PROBABILITY) 64 != NOT_A_PROBABILITY; 65 } 66 childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(), 67 nodeReader->getChildrenPos(), nodeReader->getProbability(), isTerminal, 68 nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), 69 nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); 70 readingHelper.readNextSiblingNode(); 71 } 72 } 73 74 int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 75 const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, 76 int *const outUnigramProbability) const { 77 // This method traverses parent nodes from the terminal by following parent pointers; thus, 78 // node code points are stored in the buffer in the reverse order. 79 int reverseCodePoints[maxCodePointCount]; 80 DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, 81 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 82 // First, read the terminal node and get its probability. 83 readingHelper.initWithPtNodePos(ptNodePos); 84 if (!readingHelper.isValidTerminalNode()) { 85 // Node at the ptNodePos is not a valid terminal node. 86 *outUnigramProbability = NOT_A_PROBABILITY; 87 return 0; 88 } 89 // Store terminal node probability. 90 *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); 91 // Then, following parent node link to the dictionary root and fetch node code points. 92 while (!readingHelper.isEnd()) { 93 if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { 94 // The ptNodePos is not a valid terminal node position in the dictionary. 95 *outUnigramProbability = NOT_A_PROBABILITY; 96 return 0; 97 } 98 // Store node code points to buffer in the reverse order. 99 readingHelper.fetchMergedNodeCodePointsInReverseOrder( 100 readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); 101 // Follow parent node toward the root node. 102 readingHelper.readParentNode(); 103 } 104 if (readingHelper.isError()) { 105 // The node position or the dictionary is invalid. 106 *outUnigramProbability = NOT_A_PROBABILITY; 107 return 0; 108 } 109 // Reverse the stored code points to output them. 110 const int codePointCount = readingHelper.getTotalCodePointCount(); 111 for (int i = 0; i < codePointCount; ++i) { 112 outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; 113 } 114 return codePointCount; 115 } 116 117 int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, 118 const int length, const bool forceLowerCaseSearch) const { 119 int searchCodePoints[length]; 120 for (int i = 0; i < length; ++i) { 121 searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; 122 } 123 DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, 124 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 125 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 126 const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); 127 while (!readingHelper.isEnd()) { 128 const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); 129 if (readingHelper.getTotalCodePointCount() > length 130 || !readingHelper.isMatchedCodePoint(0 /* index */, 131 searchCodePoints[matchedCodePointCount])) { 132 // Current node has too many code points or its first code point is different from 133 // target code point. Skip this node and read the next sibling node. 134 readingHelper.readNextSiblingNode(); 135 continue; 136 } 137 // Check following merged node code points. 138 const int nodeCodePointCount = nodeReader->getCodePointCount(); 139 for (int j = 1; j < nodeCodePointCount; ++j) { 140 if (!readingHelper.isMatchedCodePoint( 141 j, searchCodePoints[matchedCodePointCount + j])) { 142 // Different code point is found. The given word is not included in the dictionary. 143 return NOT_A_DICT_POS; 144 } 145 } 146 // All characters are matched. 147 if (length == readingHelper.getTotalCodePointCount()) { 148 // Terminal position is found. 149 return nodeReader->getHeadPos(); 150 } 151 if (!nodeReader->hasChildren()) { 152 return NOT_A_DICT_POS; 153 } 154 // Advance to the children nodes. 155 readingHelper.readChildNode(); 156 } 157 // If we already traversed the tree further than the word is long, there means 158 // there was no match (or we would have found it). 159 return NOT_A_DICT_POS; 160 } 161 162 int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, 163 const int bigramProbability) const { 164 if (mHeaderPolicy.isDecayingDict()) { 165 return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); 166 } else { 167 if (unigramProbability == NOT_A_PROBABILITY) { 168 return NOT_A_PROBABILITY; 169 } else if (bigramProbability == NOT_A_PROBABILITY) { 170 return ProbabilityUtils::backoff(unigramProbability); 171 } else { 172 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, 173 bigramProbability); 174 } 175 } 176 } 177 178 int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { 179 if (ptNodePos == NOT_A_DICT_POS) { 180 return NOT_A_PROBABILITY; 181 } 182 DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, 183 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 184 nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); 185 if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { 186 return NOT_A_PROBABILITY; 187 } 188 return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY); 189 } 190 191 int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 192 if (ptNodePos == NOT_A_DICT_POS) { 193 return NOT_A_DICT_POS; 194 } 195 DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, 196 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 197 nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); 198 if (nodeReader.isDeleted()) { 199 return NOT_A_DICT_POS; 200 } 201 return nodeReader.getShortcutPos(); 202 } 203 204 int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 205 if (ptNodePos == NOT_A_DICT_POS) { 206 return NOT_A_DICT_POS; 207 } 208 DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, 209 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 210 nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); 211 if (nodeReader.isDeleted()) { 212 return NOT_A_DICT_POS; 213 } 214 return nodeReader.getBigramsPos(); 215 } 216 217 bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, 218 const int probability) { 219 if (!mBuffer->isUpdatable()) { 220 AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); 221 return false; 222 } 223 if (mBufferWithExtendableBuffer.getTailPosition() 224 >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 225 AKLOGE("The dictionary is too large to dynamically update."); 226 return false; 227 } 228 DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, 229 getBigramsStructurePolicy(), getShortcutsStructurePolicy()); 230 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 231 DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, 232 &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); 233 bool addedNewUnigram = false; 234 if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, 235 &addedNewUnigram)) { 236 if (addedNewUnigram) { 237 mUnigramCount++; 238 } 239 return true; 240 } else { 241 return false; 242 } 243 } 244 245 bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, 246 const int *const word1, const int length1, const int probability) { 247 if (!mBuffer->isUpdatable()) { 248 AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); 249 return false; 250 } 251 if (mBufferWithExtendableBuffer.getTailPosition() 252 >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 253 AKLOGE("The dictionary is too large to dynamically update."); 254 return false; 255 } 256 const int word0Pos = getTerminalNodePositionOfWord(word0, length0, 257 false /* forceLowerCaseSearch */); 258 if (word0Pos == NOT_A_DICT_POS) { 259 return false; 260 } 261 const int word1Pos = getTerminalNodePositionOfWord(word1, length1, 262 false /* forceLowerCaseSearch */); 263 if (word1Pos == NOT_A_DICT_POS) { 264 return false; 265 } 266 DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, 267 &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); 268 bool addedNewBigram = false; 269 if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { 270 if (addedNewBigram) { 271 mBigramCount++; 272 } 273 return true; 274 } else { 275 return false; 276 } 277 } 278 279 bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, 280 const int *const word1, const int length1) { 281 if (!mBuffer->isUpdatable()) { 282 AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); 283 return false; 284 } 285 if (mBufferWithExtendableBuffer.getTailPosition() 286 >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 287 AKLOGE("The dictionary is too large to dynamically update."); 288 return false; 289 } 290 const int word0Pos = getTerminalNodePositionOfWord(word0, length0, 291 false /* forceLowerCaseSearch */); 292 if (word0Pos == NOT_A_DICT_POS) { 293 return false; 294 } 295 const int word1Pos = getTerminalNodePositionOfWord(word1, length1, 296 false /* forceLowerCaseSearch */); 297 if (word1Pos == NOT_A_DICT_POS) { 298 return false; 299 } 300 DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, 301 &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); 302 if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { 303 mBigramCount--; 304 return true; 305 } else { 306 return false; 307 } 308 } 309 310 void DynamicPatriciaTriePolicy::flush(const char *const filePath) { 311 if (!mBuffer->isUpdatable()) { 312 AKLOGI("Warning: flush() is called for non-updatable dictionary."); 313 return; 314 } 315 DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, 316 &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */); 317 writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount); 318 } 319 320 void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { 321 if (!mBuffer->isUpdatable()) { 322 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 323 return; 324 } 325 const bool needsToDecay = mHeaderPolicy.isDecayingDict() 326 && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( 327 false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); 328 DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, 329 &mShortcutListPolicy, needsToDecay); 330 DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, 331 &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); 332 writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); 333 mNeedsToDecayForTesting = false; 334 } 335 336 bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 337 if (!mBuffer->isUpdatable()) { 338 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 339 return false; 340 } 341 if (mBufferWithExtendableBuffer.isNearSizeLimit()) { 342 // Additional buffer size is near the limit. 343 return true; 344 } else if (mHeaderPolicy.getExtendedRegionSize() 345 + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() 346 > MAX_DICT_EXTENDED_REGION_SIZE) { 347 // Total extended region size exceeds the limit. 348 return true; 349 } else if (mBufferWithExtendableBuffer.getTailPosition() 350 >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 351 && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) { 352 // Needs to reduce dictionary size. 353 return true; 354 } else if (mHeaderPolicy.isDecayingDict()) { 355 return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( 356 mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); 357 } 358 return false; 359 } 360 361 void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult, 362 const int maxResultLength) { 363 if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { 364 snprintf(outResult, maxResultLength, "%d", mUnigramCount); 365 } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { 366 snprintf(outResult, maxResultLength, "%d", mBigramCount); 367 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { 368 snprintf(outResult, maxResultLength, "%d", 369 mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : 370 static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); 371 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) { 372 snprintf(outResult, maxResultLength, "%d", 373 mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : 374 static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); 375 } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { 376 mNeedsToDecayForTesting = true; 377 } 378 } 379 380 } // namespace latinime 381