1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; 22 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; 23 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 25 26 import java.io.IOException; 27 import java.io.OutputStream; 28 import java.util.ArrayList; 29 import java.util.Arrays; 30 31 /** 32 * The utility class to help dynamic updates on the binary dictionary. 33 * 34 * All the methods in this class are static. 35 */ 36 @UsedForTesting 37 public final class DynamicBinaryDictIOUtils { 38 private static final boolean DBG = false; 39 private static final int MAX_JUMPS = 10000; 40 41 private DynamicBinaryDictIOUtils() { 42 // This utility class is not publicly instantiable. 43 } 44 45 /* package */ static int markAsDeleted(final int flags) { 46 return (flags & (~FormatSpec.MASK_CHILDREN_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; 47 } 48 49 /** 50 * Update a parent address in a PtNode that is referred to by ptNodeOriginAddress. 51 * 52 * @param dictUpdater the DictUpdater to write. 53 * @param ptNodeOriginAddress the address of the PtNode. 54 * @param newParentAddress the absolute address of the parent. 55 * @param formatOptions file format options. 56 */ 57 private static void updateParentAddress(final Ver3DictUpdater dictUpdater, 58 final int ptNodeOriginAddress, final int newParentAddress, 59 final FormatOptions formatOptions) { 60 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 61 final int originalPosition = dictBuffer.position(); 62 dictBuffer.position(ptNodeOriginAddress); 63 if (!formatOptions.mSupportsDynamicUpdate) { 64 throw new RuntimeException("this file format does not support parent addresses"); 65 } 66 final int flags = dictBuffer.readUnsignedByte(); 67 if (BinaryDictIOUtils.isMovedPtNode(flags, formatOptions)) { 68 // If the node is moved, the parent address is stored in the destination node. 69 // We are guaranteed to process the destination node later, so there is no need to 70 // update anything here. 71 dictBuffer.position(originalPosition); 72 return; 73 } 74 if (DBG) { 75 MakedictLog.d("update parent address flags=" + flags + ", " + ptNodeOriginAddress); 76 } 77 final int parentOffset = newParentAddress - ptNodeOriginAddress; 78 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, parentOffset); 79 dictBuffer.position(originalPosition); 80 } 81 82 /** 83 * Update parent addresses in a node array stored at ptNodeOriginAddress. 84 * 85 * @param dictUpdater the DictUpdater to be modified. 86 * @param ptNodeOriginAddress the address of the node array to update. 87 * @param newParentAddress the address to be written. 88 * @param formatOptions file format options. 89 */ 90 private static void updateParentAddresses(final Ver3DictUpdater dictUpdater, 91 final int ptNodeOriginAddress, final int newParentAddress, 92 final FormatOptions formatOptions) { 93 final int originalPosition = dictUpdater.getPosition(); 94 dictUpdater.setPosition(ptNodeOriginAddress); 95 do { 96 final int count = dictUpdater.readPtNodeCount(); 97 for (int i = 0; i < count; ++i) { 98 updateParentAddress(dictUpdater, dictUpdater.getPosition(), newParentAddress, 99 formatOptions); 100 dictUpdater.skipPtNode(formatOptions); 101 } 102 if (!dictUpdater.readAndFollowForwardLink()) break; 103 if (dictUpdater.getPosition() == FormatSpec.NO_FORWARD_LINK_ADDRESS) break; 104 } while (formatOptions.mSupportsDynamicUpdate); 105 dictUpdater.setPosition(originalPosition); 106 } 107 108 /** 109 * Update a children address in a PtNode that is addressed by ptNodeOriginAddress. 110 * 111 * @param dictUpdater the DictUpdater to write. 112 * @param ptNodeOriginAddress the address of the PtNode. 113 * @param newChildrenAddress the absolute address of the child. 114 * @param formatOptions file format options. 115 */ 116 private static void updateChildrenAddress(final Ver3DictUpdater dictUpdater, 117 final int ptNodeOriginAddress, final int newChildrenAddress, 118 final FormatOptions formatOptions) { 119 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 120 final int originalPosition = dictBuffer.position(); 121 dictBuffer.position(ptNodeOriginAddress); 122 final int flags = dictBuffer.readUnsignedByte(); 123 BinaryDictDecoderUtils.readParentAddress(dictBuffer, formatOptions); 124 BinaryDictIOUtils.skipString(dictBuffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); 125 if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) dictBuffer.readUnsignedByte(); 126 final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS 127 ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - dictBuffer.position(); 128 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, childrenOffset); 129 dictBuffer.position(originalPosition); 130 } 131 132 /** 133 * Helper method to move a PtNode to the tail of the file. 134 */ 135 private static int movePtNode(final OutputStream destination, 136 final Ver3DictUpdater dictUpdater, final PtNodeInfo info, 137 final int nodeArrayOriginAddress, final int oldNodeAddress, 138 final FormatOptions formatOptions) throws IOException { 139 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 140 updateParentAddress(dictUpdater, oldNodeAddress, dictBuffer.limit() + 1, formatOptions); 141 dictBuffer.position(oldNodeAddress); 142 final int currentFlags = dictBuffer.readUnsignedByte(); 143 dictBuffer.position(oldNodeAddress); 144 dictBuffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags 145 & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); 146 int size = FormatSpec.PTNODE_FLAGS_SIZE; 147 updateForwardLink(dictUpdater, nodeArrayOriginAddress, dictBuffer.limit(), formatOptions); 148 size += BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { info }); 149 return size; 150 } 151 152 @SuppressWarnings("unused") 153 private static void updateForwardLink(final Ver3DictUpdater dictUpdater, 154 final int nodeArrayOriginAddress, final int newNodeArrayAddress, 155 final FormatOptions formatOptions) { 156 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 157 dictUpdater.setPosition(nodeArrayOriginAddress); 158 int jumpCount = 0; 159 while (jumpCount++ < MAX_JUMPS) { 160 final int count = dictUpdater.readPtNodeCount(); 161 for (int i = 0; i < count; ++i) { 162 dictUpdater.readPtNode(dictUpdater.getPosition(), formatOptions); 163 } 164 final int forwardLinkAddress = dictBuffer.readUnsignedInt24(); 165 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 166 dictBuffer.position(dictBuffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); 167 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, newNodeArrayAddress); 168 return; 169 } 170 dictBuffer.position(forwardLinkAddress); 171 } 172 if (DBG && jumpCount >= MAX_JUMPS) { 173 throw new RuntimeException("too many jumps, probably a bug."); 174 } 175 } 176 177 /** 178 * Move a PtNode that is referred to by oldPtNodeOrigin to the tail of the file, and set the 179 * children address to the byte after the PtNode. 180 * 181 * @param fileEndAddress the address of the tail of the file. 182 * @param codePoints the characters to put inside the PtNode. 183 * @param length how many code points to read from codePoints. 184 * @param flags the flags for this PtNode. 185 * @param frequency the frequency of this terminal. 186 * @param parentAddress the address of the parent PtNode of this PtNode. 187 * @param shortcutTargets the shortcut targets for this PtNode. 188 * @param bigrams the bigrams for this PtNode. 189 * @param destination the stream representing the tail of the file. 190 * @param dictUpdater the DictUpdater. 191 * @param oldPtNodeArrayOrigin the origin of the old PtNode array this PtNode was a part of. 192 * @param oldPtNodeOrigin the old origin where this PtNode used to be stored. 193 * @param formatOptions format options for this dictionary. 194 * @return the size written, in bytes. 195 * @throws IOException if the file can't be accessed 196 */ 197 private static int movePtNode(final int fileEndAddress, final int[] codePoints, 198 final int length, final int flags, final int frequency, final int parentAddress, 199 final ArrayList<WeightedString> shortcutTargets, 200 final ArrayList<PendingAttribute> bigrams, final OutputStream destination, 201 final Ver3DictUpdater dictUpdater, final int oldPtNodeArrayOrigin, 202 final int oldPtNodeOrigin, final FormatOptions formatOptions) throws IOException { 203 int size = 0; 204 final int newPtNodeOrigin = fileEndAddress + 1; 205 final int[] writtenCharacters = Arrays.copyOfRange(codePoints, 0, length); 206 final PtNodeInfo tmpInfo = new PtNodeInfo(newPtNodeOrigin, -1 /* endAddress */, 207 flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, 208 shortcutTargets, bigrams); 209 size = BinaryDictIOUtils.computePtNodeSize(tmpInfo, formatOptions); 210 final PtNodeInfo newInfo = new PtNodeInfo(newPtNodeOrigin, newPtNodeOrigin + size, 211 flags, writtenCharacters, frequency, parentAddress, 212 fileEndAddress + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, 213 bigrams); 214 movePtNode(destination, dictUpdater, newInfo, oldPtNodeArrayOrigin, oldPtNodeOrigin, 215 formatOptions); 216 return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; 217 } 218 219 /** 220 * Insert a word into a binary dictionary. 221 * 222 * @param dictUpdater the dict updater. 223 * @param destination a stream to the underlying file, with the pointer at the end of the file. 224 * @param word the word to insert. 225 * @param frequency the frequency of the new word. 226 * @param bigramStrings bigram list, or null if none. 227 * @param shortcuts shortcut list, or null if none. 228 * @param isBlackListEntry whether this should be a blacklist entry. 229 * @throws IOException if the file can't be accessed. 230 * @throws UnsupportedFormatException if the existing dictionary is in an unexpected format. 231 */ 232 // TODO: Support batch insertion. 233 // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. 234 @UsedForTesting 235 public static void insertWord(final Ver3DictUpdater dictUpdater, 236 final OutputStream destination, final String word, final int frequency, 237 final ArrayList<WeightedString> bigramStrings, 238 final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, 239 final boolean isBlackListEntry) 240 throws IOException, UnsupportedFormatException { 241 final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>(); 242 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 243 if (bigramStrings != null) { 244 for (final WeightedString bigram : bigramStrings) { 245 int position = dictUpdater.getTerminalPosition(bigram.mWord); 246 if (position == FormatSpec.NOT_VALID_WORD) { 247 // TODO: figure out what is the correct thing to do here. 248 } else { 249 bigrams.add(new PendingAttribute(bigram.mFrequency, position)); 250 } 251 } 252 } 253 254 final boolean isTerminal = true; 255 final boolean hasBigrams = !bigrams.isEmpty(); 256 final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); 257 258 // find the insert position of the word. 259 if (dictBuffer.position() != 0) dictBuffer.position(0); 260 final FileHeader fileHeader = dictUpdater.readHeader(); 261 262 int wordPos = 0, address = dictBuffer.position(), nodeOriginAddress = dictBuffer.position(); 263 final int[] codePoints = FusionDictionary.getCodePoints(word); 264 final int wordLen = codePoints.length; 265 266 for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { 267 if (wordPos >= wordLen) break; 268 nodeOriginAddress = dictBuffer.position(); 269 int nodeParentAddress = -1; 270 final int ptNodeCount = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer); 271 boolean foundNextNode = false; 272 273 for (int i = 0; i < ptNodeCount; ++i) { 274 address = dictBuffer.position(); 275 final PtNodeInfo currentInfo = dictUpdater.readPtNode(address, 276 fileHeader.mFormatOptions); 277 final boolean isMovedNode = BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, 278 fileHeader.mFormatOptions); 279 if (isMovedNode) continue; 280 nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) 281 ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; 282 boolean matched = true; 283 for (int p = 0; p < currentInfo.mCharacters.length; ++p) { 284 if (wordPos + p >= wordLen) { 285 /* 286 * splitting 287 * before 288 * abcd - ef 289 * 290 * insert "abc" 291 * 292 * after 293 * abc - d - ef 294 */ 295 final int newNodeAddress = dictBuffer.limit(); 296 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(p > 1, 297 isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, 298 false /* isBlackListEntry */, fileHeader.mFormatOptions); 299 int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, flags, 300 frequency, nodeParentAddress, shortcuts, bigrams, destination, 301 dictUpdater, nodeOriginAddress, address, fileHeader.mFormatOptions); 302 303 final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, 304 currentInfo.mCharacters.length); 305 if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 306 updateParentAddresses(dictUpdater, currentInfo.mChildrenAddress, 307 newNodeAddress + written + 1, fileHeader.mFormatOptions); 308 } 309 final PtNodeInfo newInfo2 = new PtNodeInfo( 310 newNodeAddress + written + 1, -1 /* endAddress */, 311 currentInfo.mFlags, characters2, currentInfo.mFrequency, 312 newNodeAddress + 1, currentInfo.mChildrenAddress, 313 currentInfo.mShortcutTargets, currentInfo.mBigrams); 314 BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { newInfo2 }); 315 return; 316 } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { 317 if (p > 0) { 318 /* 319 * splitting 320 * before 321 * ab - cd 322 * 323 * insert "ac" 324 * 325 * after 326 * a - b - cd 327 * | 328 * - c 329 */ 330 331 final int newNodeAddress = dictBuffer.limit(); 332 final int childrenAddress = currentInfo.mChildrenAddress; 333 334 // move prefix 335 final int prefixFlags = BinaryDictEncoderUtils.makePtNodeFlags(p > 1, 336 false /* isTerminal */, 0 /* childrenAddressSize*/, 337 false /* hasShortcut */, false /* hasBigrams */, 338 false /* isNotAWord */, false /* isBlackListEntry */, 339 fileHeader.mFormatOptions); 340 int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, 341 prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, 342 destination, dictUpdater, nodeOriginAddress, address, 343 fileHeader.mFormatOptions); 344 345 final int[] suffixCharacters = Arrays.copyOfRange( 346 currentInfo.mCharacters, p, currentInfo.mCharacters.length); 347 if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 348 updateParentAddresses(dictUpdater, currentInfo.mChildrenAddress, 349 newNodeAddress + written + 1, fileHeader.mFormatOptions); 350 } 351 final int suffixFlags = BinaryDictEncoderUtils.makePtNodeFlags( 352 suffixCharacters.length > 1, 353 (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, 354 0 /* childrenAddressSize */, 355 (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) 356 != 0, 357 (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, 358 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 359 final PtNodeInfo suffixInfo = new PtNodeInfo( 360 newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, 361 suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, 362 currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, 363 currentInfo.mBigrams); 364 written += BinaryDictIOUtils.computePtNodeSize(suffixInfo, 365 fileHeader.mFormatOptions) + 1; 366 367 final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, 368 codePoints.length); 369 final int flags = BinaryDictEncoderUtils.makePtNodeFlags( 370 newCharacters.length > 1, isTerminal, 371 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 372 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 373 final PtNodeInfo newInfo = new PtNodeInfo( 374 newNodeAddress + written, -1 /* endAddress */, flags, 375 newCharacters, frequency, newNodeAddress + 1, 376 FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); 377 BinaryDictIOUtils.writeNodes(destination, 378 new PtNodeInfo[] { suffixInfo, newInfo }); 379 return; 380 } 381 matched = false; 382 break; 383 } 384 } 385 386 if (matched) { 387 if (wordPos + currentInfo.mCharacters.length == wordLen) { 388 // the word exists in the dictionary. 389 // only update the PtNode. 390 final int newNodeAddress = dictBuffer.limit(); 391 final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; 392 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars, 393 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 394 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 395 final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress + 1, 396 -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, 397 nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, 398 bigrams); 399 movePtNode(destination, dictUpdater, newInfo, nodeOriginAddress, address, 400 fileHeader.mFormatOptions); 401 return; 402 } 403 wordPos += currentInfo.mCharacters.length; 404 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 405 /* 406 * found the prefix of the word. 407 * make new PtNode and link to the PtNode from this PtNode. 408 * 409 * before 410 * ab - cd 411 * 412 * insert "abcde" 413 * 414 * after 415 * ab - cd - e 416 */ 417 final int newNodeArrayAddress = dictBuffer.limit(); 418 updateChildrenAddress(dictUpdater, address, newNodeArrayAddress, 419 fileHeader.mFormatOptions); 420 final int newNodeAddress = newNodeArrayAddress + 1; 421 final boolean hasMultipleChars = (wordLen - wordPos) > 1; 422 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars, 423 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 424 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 425 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); 426 final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress, -1, flags, 427 characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, 428 shortcuts, bigrams); 429 BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { newInfo }); 430 return; 431 } 432 dictBuffer.position(currentInfo.mChildrenAddress); 433 foundNextNode = true; 434 break; 435 } 436 } 437 438 if (foundNextNode) continue; 439 440 // reached the end of the array. 441 final int linkAddressPosition = dictBuffer.position(); 442 int nextLink = dictBuffer.readUnsignedInt24(); 443 if ((nextLink & FormatSpec.MSB24) != 0) { 444 nextLink = -(nextLink & FormatSpec.SINT24_MAX); 445 } 446 if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 447 /* 448 * expand this node. 449 * 450 * before 451 * ab - cd 452 * 453 * insert "abef" 454 * 455 * after 456 * ab - cd 457 * | 458 * - ef 459 */ 460 461 // change the forward link address. 462 final int newNodeAddress = dictBuffer.limit(); 463 dictBuffer.position(linkAddressPosition); 464 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, newNodeAddress); 465 466 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); 467 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(characters.length > 1, 468 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 469 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 470 final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress + 1, 471 -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, 472 FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); 473 BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[]{ newInfo }); 474 return; 475 } else { 476 depth--; 477 dictBuffer.position(nextLink); 478 } 479 } 480 } 481 } 482