1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; 22 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; 23 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 25 import com.android.inputmethod.latin.utils.CollectionUtils; 26 27 import java.io.IOException; 28 import java.io.OutputStream; 29 import java.util.ArrayList; 30 import java.util.Arrays; 31 32 /** 33 * The utility class to help dynamic updates on the binary dictionary. 34 * 35 * All the methods in this class are static. 36 */ 37 @UsedForTesting 38 public final class DynamicBinaryDictIOUtils { 39 private static final boolean DBG = false; 40 private static final int MAX_JUMPS = 10000; 41 42 private DynamicBinaryDictIOUtils() { 43 // This utility class is not publicly instantiable. 44 } 45 46 /* package */ static int markAsDeleted(final int flags) { 47 return (flags & (~FormatSpec.MASK_CHILDREN_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; 48 } 49 50 /** 51 * Update a parent address in a PtNode that is referred to by ptNodeOriginAddress. 52 * 53 * @param dictUpdater the DictUpdater to write. 54 * @param ptNodeOriginAddress the address of the PtNode. 55 * @param newParentAddress the absolute address of the parent. 56 * @param formatOptions file format options. 57 */ 58 private static void updateParentAddress(final Ver3DictUpdater dictUpdater, 59 final int ptNodeOriginAddress, final int newParentAddress, 60 final FormatOptions formatOptions) { 61 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 62 final int originalPosition = dictBuffer.position(); 63 dictBuffer.position(ptNodeOriginAddress); 64 if (!formatOptions.mSupportsDynamicUpdate) { 65 throw new RuntimeException("this file format does not support parent addresses"); 66 } 67 final int flags = dictBuffer.readUnsignedByte(); 68 if (BinaryDictIOUtils.isMovedPtNode(flags, formatOptions)) { 69 // If the node is moved, the parent address is stored in the destination node. 70 // We are guaranteed to process the destination node later, so there is no need to 71 // update anything here. 72 dictBuffer.position(originalPosition); 73 return; 74 } 75 if (DBG) { 76 MakedictLog.d("update parent address flags=" + flags + ", " + ptNodeOriginAddress); 77 } 78 final int parentOffset = newParentAddress - ptNodeOriginAddress; 79 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, parentOffset); 80 dictBuffer.position(originalPosition); 81 } 82 83 /** 84 * Update parent addresses in a node array stored at ptNodeOriginAddress. 85 * 86 * @param dictUpdater the DictUpdater to be modified. 87 * @param ptNodeOriginAddress the address of the node array to update. 88 * @param newParentAddress the address to be written. 89 * @param formatOptions file format options. 90 */ 91 private static void updateParentAddresses(final Ver3DictUpdater dictUpdater, 92 final int ptNodeOriginAddress, final int newParentAddress, 93 final FormatOptions formatOptions) { 94 final int originalPosition = dictUpdater.getPosition(); 95 dictUpdater.setPosition(ptNodeOriginAddress); 96 do { 97 final int count = dictUpdater.readPtNodeCount(); 98 for (int i = 0; i < count; ++i) { 99 updateParentAddress(dictUpdater, dictUpdater.getPosition(), newParentAddress, 100 formatOptions); 101 dictUpdater.skipPtNode(formatOptions); 102 } 103 if (!dictUpdater.readAndFollowForwardLink()) break; 104 if (dictUpdater.getPosition() == FormatSpec.NO_FORWARD_LINK_ADDRESS) break; 105 } while (formatOptions.mSupportsDynamicUpdate); 106 dictUpdater.setPosition(originalPosition); 107 } 108 109 /** 110 * Update a children address in a PtNode that is addressed by ptNodeOriginAddress. 111 * 112 * @param dictUpdater the DictUpdater to write. 113 * @param ptNodeOriginAddress the address of the PtNode. 114 * @param newChildrenAddress the absolute address of the child. 115 * @param formatOptions file format options. 116 */ 117 private static void updateChildrenAddress(final Ver3DictUpdater dictUpdater, 118 final int ptNodeOriginAddress, final int newChildrenAddress, 119 final FormatOptions formatOptions) { 120 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 121 final int originalPosition = dictBuffer.position(); 122 dictBuffer.position(ptNodeOriginAddress); 123 final int flags = dictBuffer.readUnsignedByte(); 124 BinaryDictDecoderUtils.readParentAddress(dictBuffer, formatOptions); 125 BinaryDictIOUtils.skipString(dictBuffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); 126 if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) dictBuffer.readUnsignedByte(); 127 final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS 128 ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - dictBuffer.position(); 129 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, childrenOffset); 130 dictBuffer.position(originalPosition); 131 } 132 133 /** 134 * Helper method to move a PtNode to the tail of the file. 135 */ 136 private static int movePtNode(final OutputStream destination, 137 final Ver3DictUpdater dictUpdater, final PtNodeInfo info, 138 final int nodeArrayOriginAddress, final int oldNodeAddress, 139 final FormatOptions formatOptions) throws IOException { 140 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 141 updateParentAddress(dictUpdater, oldNodeAddress, dictBuffer.limit() + 1, formatOptions); 142 dictBuffer.position(oldNodeAddress); 143 final int currentFlags = dictBuffer.readUnsignedByte(); 144 dictBuffer.position(oldNodeAddress); 145 dictBuffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags 146 & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); 147 int size = FormatSpec.PTNODE_FLAGS_SIZE; 148 updateForwardLink(dictUpdater, nodeArrayOriginAddress, dictBuffer.limit(), formatOptions); 149 size += BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { info }); 150 return size; 151 } 152 153 @SuppressWarnings("unused") 154 private static void updateForwardLink(final Ver3DictUpdater dictUpdater, 155 final int nodeArrayOriginAddress, final int newNodeArrayAddress, 156 final FormatOptions formatOptions) { 157 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 158 dictUpdater.setPosition(nodeArrayOriginAddress); 159 int jumpCount = 0; 160 while (jumpCount++ < MAX_JUMPS) { 161 final int count = dictUpdater.readPtNodeCount(); 162 for (int i = 0; i < count; ++i) { 163 dictUpdater.readPtNode(dictUpdater.getPosition(), formatOptions); 164 } 165 final int forwardLinkAddress = dictBuffer.readUnsignedInt24(); 166 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 167 dictBuffer.position(dictBuffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); 168 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, newNodeArrayAddress); 169 return; 170 } 171 dictBuffer.position(forwardLinkAddress); 172 } 173 if (DBG && jumpCount >= MAX_JUMPS) { 174 throw new RuntimeException("too many jumps, probably a bug."); 175 } 176 } 177 178 /** 179 * Move a PtNode that is referred to by oldPtNodeOrigin to the tail of the file, and set the 180 * children address to the byte after the PtNode. 181 * 182 * @param fileEndAddress the address of the tail of the file. 183 * @param codePoints the characters to put inside the PtNode. 184 * @param length how many code points to read from codePoints. 185 * @param flags the flags for this PtNode. 186 * @param frequency the frequency of this terminal. 187 * @param parentAddress the address of the parent PtNode of this PtNode. 188 * @param shortcutTargets the shortcut targets for this PtNode. 189 * @param bigrams the bigrams for this PtNode. 190 * @param destination the stream representing the tail of the file. 191 * @param dictUpdater the DictUpdater. 192 * @param oldPtNodeArrayOrigin the origin of the old PtNode array this PtNode was a part of. 193 * @param oldPtNodeOrigin the old origin where this PtNode used to be stored. 194 * @param formatOptions format options for this dictionary. 195 * @return the size written, in bytes. 196 * @throws IOException if the file can't be accessed 197 */ 198 private static int movePtNode(final int fileEndAddress, final int[] codePoints, 199 final int length, final int flags, final int frequency, final int parentAddress, 200 final ArrayList<WeightedString> shortcutTargets, 201 final ArrayList<PendingAttribute> bigrams, final OutputStream destination, 202 final Ver3DictUpdater dictUpdater, final int oldPtNodeArrayOrigin, 203 final int oldPtNodeOrigin, final FormatOptions formatOptions) throws IOException { 204 int size = 0; 205 final int newPtNodeOrigin = fileEndAddress + 1; 206 final int[] writtenCharacters = Arrays.copyOfRange(codePoints, 0, length); 207 final PtNodeInfo tmpInfo = new PtNodeInfo(newPtNodeOrigin, -1 /* endAddress */, 208 flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, 209 shortcutTargets, bigrams); 210 size = BinaryDictIOUtils.computePtNodeSize(tmpInfo, formatOptions); 211 final PtNodeInfo newInfo = new PtNodeInfo(newPtNodeOrigin, newPtNodeOrigin + size, 212 flags, writtenCharacters, frequency, parentAddress, 213 fileEndAddress + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, 214 bigrams); 215 movePtNode(destination, dictUpdater, newInfo, oldPtNodeArrayOrigin, oldPtNodeOrigin, 216 formatOptions); 217 return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; 218 } 219 220 /** 221 * Converts a list of WeightedString to a list of PendingAttribute. 222 */ 223 public static ArrayList<PendingAttribute> resolveBigramPositions(final DictUpdater dictUpdater, 224 final ArrayList<WeightedString> bigramStrings) 225 throws IOException, UnsupportedFormatException { 226 if (bigramStrings == null) return CollectionUtils.newArrayList(); 227 final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList(); 228 for (final WeightedString bigram : bigramStrings) { 229 final int pos = dictUpdater.getTerminalPosition(bigram.mWord); 230 if (pos == FormatSpec.NOT_VALID_WORD) { 231 // TODO: figure out what is the correct thing to do here. 232 } else { 233 bigrams.add(new PendingAttribute(bigram.mFrequency, pos)); 234 } 235 } 236 return bigrams; 237 } 238 239 /** 240 * Insert a word into a binary dictionary. 241 * 242 * @param dictUpdater the dict updater. 243 * @param destination a stream to the underlying file, with the pointer at the end of the file. 244 * @param word the word to insert. 245 * @param frequency the frequency of the new word. 246 * @param bigramStrings bigram list, or null if none. 247 * @param shortcuts shortcut list, or null if none. 248 * @param isBlackListEntry whether this should be a blacklist entry. 249 * @throws IOException if the file can't be accessed. 250 * @throws UnsupportedFormatException if the existing dictionary is in an unexpected format. 251 */ 252 // TODO: Support batch insertion. 253 // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. 254 @UsedForTesting 255 public static void insertWord(final Ver3DictUpdater dictUpdater, 256 final OutputStream destination, final String word, final int frequency, 257 final ArrayList<WeightedString> bigramStrings, 258 final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, 259 final boolean isBlackListEntry) 260 throws IOException, UnsupportedFormatException { 261 final ArrayList<PendingAttribute> bigrams = resolveBigramPositions(dictUpdater, 262 bigramStrings); 263 final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); 264 265 final boolean isTerminal = true; 266 final boolean hasBigrams = !bigrams.isEmpty(); 267 final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); 268 269 // find the insert position of the word. 270 if (dictBuffer.position() != 0) dictBuffer.position(0); 271 final FileHeader fileHeader = dictUpdater.readHeader(); 272 273 int wordPos = 0, address = dictBuffer.position(), nodeOriginAddress = dictBuffer.position(); 274 final int[] codePoints = FusionDictionary.getCodePoints(word); 275 final int wordLen = codePoints.length; 276 277 for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { 278 if (wordPos >= wordLen) break; 279 nodeOriginAddress = dictBuffer.position(); 280 int nodeParentAddress = -1; 281 final int ptNodeCount = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer); 282 boolean foundNextNode = false; 283 284 for (int i = 0; i < ptNodeCount; ++i) { 285 address = dictBuffer.position(); 286 final PtNodeInfo currentInfo = dictUpdater.readPtNode(address, 287 fileHeader.mFormatOptions); 288 final boolean isMovedNode = BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, 289 fileHeader.mFormatOptions); 290 if (isMovedNode) continue; 291 nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) 292 ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; 293 boolean matched = true; 294 for (int p = 0; p < currentInfo.mCharacters.length; ++p) { 295 if (wordPos + p >= wordLen) { 296 /* 297 * splitting 298 * before 299 * abcd - ef 300 * 301 * insert "abc" 302 * 303 * after 304 * abc - d - ef 305 */ 306 final int newNodeAddress = dictBuffer.limit(); 307 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(p > 1, 308 isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, 309 false /* isBlackListEntry */, fileHeader.mFormatOptions); 310 int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, flags, 311 frequency, nodeParentAddress, shortcuts, bigrams, destination, 312 dictUpdater, nodeOriginAddress, address, fileHeader.mFormatOptions); 313 314 final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, 315 currentInfo.mCharacters.length); 316 if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 317 updateParentAddresses(dictUpdater, currentInfo.mChildrenAddress, 318 newNodeAddress + written + 1, fileHeader.mFormatOptions); 319 } 320 final PtNodeInfo newInfo2 = new PtNodeInfo( 321 newNodeAddress + written + 1, -1 /* endAddress */, 322 currentInfo.mFlags, characters2, currentInfo.mFrequency, 323 newNodeAddress + 1, currentInfo.mChildrenAddress, 324 currentInfo.mShortcutTargets, currentInfo.mBigrams); 325 BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { newInfo2 }); 326 return; 327 } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { 328 if (p > 0) { 329 /* 330 * splitting 331 * before 332 * ab - cd 333 * 334 * insert "ac" 335 * 336 * after 337 * a - b - cd 338 * | 339 * - c 340 */ 341 342 final int newNodeAddress = dictBuffer.limit(); 343 final int childrenAddress = currentInfo.mChildrenAddress; 344 345 // move prefix 346 final int prefixFlags = BinaryDictEncoderUtils.makePtNodeFlags(p > 1, 347 false /* isTerminal */, 0 /* childrenAddressSize*/, 348 false /* hasShortcut */, false /* hasBigrams */, 349 false /* isNotAWord */, false /* isBlackListEntry */, 350 fileHeader.mFormatOptions); 351 int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, 352 prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, 353 destination, dictUpdater, nodeOriginAddress, address, 354 fileHeader.mFormatOptions); 355 356 final int[] suffixCharacters = Arrays.copyOfRange( 357 currentInfo.mCharacters, p, currentInfo.mCharacters.length); 358 if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 359 updateParentAddresses(dictUpdater, currentInfo.mChildrenAddress, 360 newNodeAddress + written + 1, fileHeader.mFormatOptions); 361 } 362 final int suffixFlags = BinaryDictEncoderUtils.makePtNodeFlags( 363 suffixCharacters.length > 1, 364 (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, 365 0 /* childrenAddressSize */, 366 (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) 367 != 0, 368 (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, 369 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 370 final PtNodeInfo suffixInfo = new PtNodeInfo( 371 newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, 372 suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, 373 currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, 374 currentInfo.mBigrams); 375 written += BinaryDictIOUtils.computePtNodeSize(suffixInfo, 376 fileHeader.mFormatOptions) + 1; 377 378 final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, 379 codePoints.length); 380 final int flags = BinaryDictEncoderUtils.makePtNodeFlags( 381 newCharacters.length > 1, isTerminal, 382 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 383 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 384 final PtNodeInfo newInfo = new PtNodeInfo( 385 newNodeAddress + written, -1 /* endAddress */, flags, 386 newCharacters, frequency, newNodeAddress + 1, 387 FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); 388 BinaryDictIOUtils.writeNodes(destination, 389 new PtNodeInfo[] { suffixInfo, newInfo }); 390 return; 391 } 392 matched = false; 393 break; 394 } 395 } 396 397 if (matched) { 398 if (wordPos + currentInfo.mCharacters.length == wordLen) { 399 // the word exists in the dictionary. 400 // only update the PtNode. 401 final int newNodeAddress = dictBuffer.limit(); 402 final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; 403 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars, 404 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 405 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 406 final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress + 1, 407 -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, 408 nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, 409 bigrams); 410 movePtNode(destination, dictUpdater, newInfo, nodeOriginAddress, address, 411 fileHeader.mFormatOptions); 412 return; 413 } 414 wordPos += currentInfo.mCharacters.length; 415 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 416 /* 417 * found the prefix of the word. 418 * make new PtNode and link to the PtNode from this PtNode. 419 * 420 * before 421 * ab - cd 422 * 423 * insert "abcde" 424 * 425 * after 426 * ab - cd - e 427 */ 428 final int newNodeArrayAddress = dictBuffer.limit(); 429 updateChildrenAddress(dictUpdater, address, newNodeArrayAddress, 430 fileHeader.mFormatOptions); 431 final int newNodeAddress = newNodeArrayAddress + 1; 432 final boolean hasMultipleChars = (wordLen - wordPos) > 1; 433 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars, 434 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 435 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 436 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); 437 final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress, -1, flags, 438 characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, 439 shortcuts, bigrams); 440 BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { newInfo }); 441 return; 442 } 443 dictBuffer.position(currentInfo.mChildrenAddress); 444 foundNextNode = true; 445 break; 446 } 447 } 448 449 if (foundNextNode) continue; 450 451 // reached the end of the array. 452 final int linkAddressPosition = dictBuffer.position(); 453 int nextLink = dictBuffer.readUnsignedInt24(); 454 if ((nextLink & FormatSpec.MSB24) != 0) { 455 nextLink = -(nextLink & FormatSpec.SINT24_MAX); 456 } 457 if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 458 /* 459 * expand this node. 460 * 461 * before 462 * ab - cd 463 * 464 * insert "abef" 465 * 466 * after 467 * ab - cd 468 * | 469 * - ef 470 */ 471 472 // change the forward link address. 473 final int newNodeAddress = dictBuffer.limit(); 474 dictBuffer.position(linkAddressPosition); 475 BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, newNodeAddress); 476 477 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); 478 final int flags = BinaryDictEncoderUtils.makePtNodeFlags(characters.length > 1, 479 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 480 isNotAWord, isBlackListEntry, fileHeader.mFormatOptions); 481 final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress + 1, 482 -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, 483 FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); 484 BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[]{ newInfo }); 485 return; 486 } else { 487 depth--; 488 dictBuffer.position(nextLink); 489 } 490 } 491 } 492 } 493