1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" 18 19 #include "dictionary/property/unigram_property.h" 20 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 21 #include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" 22 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" 23 #include "dictionary/structure/pt_common/pt_node_reader.h" 24 #include "dictionary/structure/pt_common/pt_node_writer.h" 25 #include "dictionary/utils/buffer_with_extendable_buffer.h" 26 27 namespace latinime { 28 29 const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; 30 31 bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper, 32 const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, 33 bool *const outAddedNewUnigram) { 34 int parentPos = NOT_A_DICT_POS; 35 while (!readingHelper->isEnd()) { 36 const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); 37 if (!ptNodeParams.isValid()) { 38 break; 39 } 40 const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); 41 if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, 42 wordCodePoints[matchedCodePointCount])) { 43 // The first code point is different from target code point. Skip this node and read 44 // the next sibling node. 45 readingHelper->readNextSiblingNode(ptNodeParams); 46 continue; 47 } 48 // Check following merged node code points. 49 const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size(); 50 for (size_t j = 1; j < nodeCodePointCount; ++j) { 51 const size_t nextIndex = matchedCodePointCount + j; 52 if (nextIndex >= wordCodePoints.size() 53 || !readingHelper->isMatchedCodePoint(ptNodeParams, j, 54 wordCodePoints[matchedCodePointCount + j])) { 55 *outAddedNewUnigram = true; 56 return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, 57 wordCodePoints.skip(matchedCodePointCount)); 58 } 59 } 60 // All characters are matched. 61 if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) { 62 return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); 63 } 64 if (!ptNodeParams.hasChildren()) { 65 *outAddedNewUnigram = true; 66 return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, 67 wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams))); 68 } 69 // Advance to the children nodes. 70 parentPos = ptNodeParams.getHeadPos(); 71 readingHelper->readChildNode(ptNodeParams); 72 } 73 if (readingHelper->isError()) { 74 // The dictionary is invalid. 75 return false; 76 } 77 int pos = readingHelper->getPosOfLastForwardLinkField(); 78 *outAddedNewUnigram = true; 79 return createAndInsertNodeIntoPtNodeArray(parentPos, 80 wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty, 81 &pos); 82 } 83 84 bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, 85 const int wordPos, const NgramProperty *const ngramProperty, 86 bool *const outAddedNewEntry) { 87 if (prevWordsPtNodePos.empty()) { 88 return false; 89 } 90 ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); 91 int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 92 for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { 93 prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( 94 prevWordsPtNodePos[i]).getTerminalId(); 95 } 96 const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); 97 const int wordId = 98 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); 99 return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry); 100 } 101 102 bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, 103 const int wordPos) { 104 if (prevWordsPtNodePos.empty()) { 105 return false; 106 } 107 ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); 108 int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 109 for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { 110 prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( 111 prevWordsPtNodePos[i]).getTerminalId(); 112 } 113 const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); 114 const int wordId = 115 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); 116 return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); 117 } 118 119 bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, 120 const CodePointArrayView targetCodePoints, const int shortcutProbability) { 121 const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); 122 return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(), 123 targetCodePoints.size(), shortcutProbability); 124 } 125 126 bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, 127 const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, 128 int *const forwardLinkFieldPos) { 129 const int newPtNodeArrayPos = mBuffer->getTailPosition(); 130 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, 131 newPtNodeArrayPos, forwardLinkFieldPos)) { 132 return false; 133 } 134 return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty); 135 } 136 137 bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, 138 const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { 139 if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { 140 // Overwrites the probability. 141 *outAddedNewUnigram = false; 142 return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); 143 } else { 144 // Make the node terminal and write the probability. 145 *outAddedNewUnigram = true; 146 const int movedPos = mBuffer->getTailPosition(); 147 int writingPos = movedPos; 148 const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, 149 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), 150 true /* isTerminal */, originalPtNodeParams->getParentPos(), 151 originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability())); 152 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, 153 unigramProperty, &writingPos)) { 154 return false; 155 } 156 if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { 157 return false; 158 } 159 } 160 return true; 161 } 162 163 bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( 164 const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, 165 const CodePointArrayView codePoints) { 166 const int newPtNodeArrayPos = mBuffer->getTailPosition(); 167 if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { 168 return false; 169 } 170 return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, 171 unigramProperty); 172 } 173 174 bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( 175 const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints, 176 const UnigramProperty *const unigramProperty) { 177 int writingPos = mBuffer->getTailPosition(); 178 if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 179 1 /* arraySize */, &writingPos)) { 180 return false; 181 } 182 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( 183 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), 184 true /* isTerminal */, parentPtNodePos, ptNodeCodePoints, 185 unigramProperty->getProbability())); 186 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, 187 unigramProperty, &writingPos)) { 188 return false; 189 } 190 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, 191 NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { 192 return false; 193 } 194 return true; 195 } 196 197 // Returns whether the dictionary updating was succeeded or not. 198 bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( 199 const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount, 200 const UnigramProperty *const unigramProperty, 201 const CodePointArrayView newPtNodeCodePoints) { 202 // When addsExtraChild is true, split the reallocating PtNode and add new child. 203 // Reallocating PtNode: abcde, newNode: abcxy. 204 // abc (1st, not terminal) __ de (2nd) 205 // \_ xy (extra child, terminal) 206 // Otherwise, this method makes 1st part terminal and write information in unigramProperty. 207 // Reallocating PtNode: abcde, newNode: abc. 208 // abc (1st, terminal) __ de (2nd) 209 const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount; 210 const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); 211 int writingPos = firstPartOfReallocatedPtNodePos; 212 // Write the 1st part of the reallocating node. The children position will be updated later 213 // with actual children position. 214 const CodePointArrayView firstPtNodeCodePoints = 215 reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount); 216 if (addsExtraChild) { 217 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( 218 false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */, 219 reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, 220 NOT_A_PROBABILITY)); 221 if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { 222 return false; 223 } 224 } else { 225 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( 226 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), 227 true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), 228 firstPtNodeCodePoints, unigramProperty->getProbability())); 229 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, 230 unigramProperty, &writingPos)) { 231 return false; 232 } 233 } 234 const int actualChildrenPos = writingPos; 235 // Create new children PtNode array. 236 const size_t newPtNodeCount = addsExtraChild ? 2 : 1; 237 if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 238 newPtNodeCount, &writingPos)) { 239 return false; 240 } 241 // Write the 2nd part of the reallocating node. 242 const int secondPartOfReallocatedPtNodePos = writingPos; 243 const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, 244 reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(), 245 reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, 246 reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount), 247 reallocatingPtNodeParams->getProbability())); 248 if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { 249 return false; 250 } 251 if (addsExtraChild) { 252 const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( 253 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), 254 true /* isTerminal */, firstPartOfReallocatedPtNodePos, 255 newPtNodeCodePoints.skip(overlappingCodePointCount), 256 unigramProperty->getProbability())); 257 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, 258 unigramProperty, &writingPos)) { 259 return false; 260 } 261 } 262 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, 263 NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { 264 return false; 265 } 266 // Update original reallocating PtNode as moved. 267 if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, 268 secondPartOfReallocatedPtNodePos)) { 269 return false; 270 } 271 // Load node info. Information of the 1st part will be fetched. 272 const PtNodeParams ptNodeParams( 273 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); 274 // Update children position. 275 return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); 276 } 277 278 const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( 279 const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, 280 const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, 281 const CodePointArrayView codePoints, const int probability) const { 282 const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( 283 isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, 284 false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, 285 CHILDREN_POSITION_FIELD_SIZE); 286 return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability); 287 } 288 289 const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord, 290 const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, 291 const CodePointArrayView codePoints, const int probability) const { 292 const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( 293 isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, 294 false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, 295 CHILDREN_POSITION_FIELD_SIZE); 296 return PtNodeParams(flags, parentPos, codePoints, probability); 297 } 298 299 } // namespace latinime 300