1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding; 22 import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; 23 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; 24 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 25 import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 26 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 27 28 import java.io.File; 29 import java.io.FileInputStream; 30 import java.io.FileNotFoundException; 31 import java.io.IOException; 32 import java.io.OutputStream; 33 import java.nio.channels.FileChannel; 34 import java.util.ArrayList; 35 import java.util.Arrays; 36 import java.util.Iterator; 37 import java.util.Map; 38 import java.util.Stack; 39 40 public final class BinaryDictIOUtils { 41 private static final boolean DBG = false; 42 private static final int MSB24 = 0x800000; 43 private static final int SINT24_MAX = 0x7FFFFF; 44 private static final int MAX_JUMPS = 10000; 45 46 private BinaryDictIOUtils() { 47 // This utility class is not publicly instantiable. 48 } 49 50 private static final class Position { 51 public static final int NOT_READ_GROUPCOUNT = -1; 52 53 public int mAddress; 54 public int mNumOfCharGroup; 55 public int mPosition; 56 public int mLength; 57 58 public Position(int address, int length) { 59 mAddress = address; 60 mLength = length; 61 mNumOfCharGroup = NOT_READ_GROUPCOUNT; 62 } 63 } 64 65 /** 66 * Tours all node without recursive call. 67 */ 68 private static void readUnigramsAndBigramsBinaryInner( 69 final FusionDictionaryBufferInterface buffer, final int headerSize, 70 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 71 final Map<Integer, ArrayList<PendingAttribute>> bigrams, 72 final FormatOptions formatOptions) { 73 int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; 74 75 Stack<Position> stack = new Stack<Position>(); 76 int index = 0; 77 78 Position initPos = new Position(headerSize, 0); 79 stack.push(initPos); 80 81 while (!stack.empty()) { 82 Position p = stack.peek(); 83 84 if (DBG) { 85 MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" + 86 p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength); 87 } 88 89 if (buffer.position() != p.mAddress) buffer.position(p.mAddress); 90 if (index != p.mLength) index = p.mLength; 91 92 if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) { 93 p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer); 94 p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup); 95 p.mPosition = 0; 96 } 97 if (p.mNumOfCharGroup == 0) { 98 stack.pop(); 99 continue; 100 } 101 CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer, 102 p.mAddress - headerSize, formatOptions); 103 for (int i = 0; i < info.mCharacters.length; ++i) { 104 pushedChars[index++] = info.mCharacters[i]; 105 } 106 p.mPosition++; 107 108 final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, 109 formatOptions); 110 final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags, 111 formatOptions); 112 if (!isMovedGroup && !isDeletedGroup 113 && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word 114 words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); 115 frequencies.put(info.mOriginalAddress, info.mFrequency); 116 if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); 117 } 118 119 if (p.mPosition == p.mNumOfCharGroup) { 120 if (formatOptions.mSupportsDynamicUpdate) { 121 final int forwardLinkAddress = buffer.readUnsignedInt24(); 122 if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) { 123 // the node has a forward link. 124 p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT; 125 p.mAddress = forwardLinkAddress; 126 } else { 127 stack.pop(); 128 } 129 } else { 130 stack.pop(); 131 } 132 } else { 133 // the node has more groups. 134 p.mAddress = buffer.position(); 135 } 136 137 if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { 138 Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); 139 stack.push(childrenPos); 140 } 141 } 142 } 143 144 /** 145 * Reads unigrams and bigrams from the binary file. 146 * Doesn't make the memory representation of the dictionary. 147 * 148 * @param buffer the buffer to read. 149 * @param words the map to store the address as a key and the word as a value. 150 * @param frequencies the map to store the address as a key and the frequency as a value. 151 * @param bigrams the map to store the address as a key and the list of address as a value. 152 * @throws IOException 153 * @throws UnsupportedFormatException 154 */ 155 public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer, 156 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 157 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 158 UnsupportedFormatException { 159 // Read header 160 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 161 readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams, 162 header.mFormatOptions); 163 } 164 165 /** 166 * Gets the address of the last CharGroup of the exact matching word in the dictionary. 167 * If no match is found, returns NOT_VALID_WORD. 168 * 169 * @param buffer the buffer to read. 170 * @param word the word we search for. 171 * @return the address of the terminal node. 172 * @throws IOException 173 * @throws UnsupportedFormatException 174 */ 175 @UsedForTesting 176 public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer, 177 final String word) throws IOException, UnsupportedFormatException { 178 if (word == null) return FormatSpec.NOT_VALID_WORD; 179 if (buffer.position() != 0) buffer.position(0); 180 181 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 182 int wordPos = 0; 183 final int wordLen = word.codePointCount(0, word.length()); 184 for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { 185 if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; 186 187 do { 188 final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); 189 boolean foundNextCharGroup = false; 190 for (int i = 0; i < charGroupCount; ++i) { 191 final int charGroupPos = buffer.position(); 192 final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, 193 buffer.position(), header.mFormatOptions); 194 final boolean isMovedGroup = 195 BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, 196 header.mFormatOptions); 197 final boolean isDeletedGroup = 198 BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags, 199 header.mFormatOptions); 200 if (isMovedGroup) continue; 201 boolean same = true; 202 for (int p = 0, j = word.offsetByCodePoints(0, wordPos); 203 p < currentInfo.mCharacters.length; 204 ++p, j = word.offsetByCodePoints(j, 1)) { 205 if (wordPos + p >= wordLen 206 || word.codePointAt(j) != currentInfo.mCharacters[p]) { 207 same = false; 208 break; 209 } 210 } 211 212 if (same) { 213 // found the group matches the word. 214 if (wordPos + currentInfo.mCharacters.length == wordLen) { 215 if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL 216 || isDeletedGroup) { 217 return FormatSpec.NOT_VALID_WORD; 218 } else { 219 return charGroupPos; 220 } 221 } 222 wordPos += currentInfo.mCharacters.length; 223 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 224 return FormatSpec.NOT_VALID_WORD; 225 } 226 foundNextCharGroup = true; 227 buffer.position(currentInfo.mChildrenAddress); 228 break; 229 } 230 } 231 232 // If we found the next char group, it is under the file pointer. 233 // But if not, we are at the end of this node so we expect to have 234 // a forward link address that we need to consult and possibly resume 235 // search on the next node in the linked list. 236 if (foundNextCharGroup) break; 237 if (!header.mFormatOptions.mSupportsDynamicUpdate) { 238 return FormatSpec.NOT_VALID_WORD; 239 } 240 241 final int forwardLinkAddress = buffer.readUnsignedInt24(); 242 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 243 return FormatSpec.NOT_VALID_WORD; 244 } 245 buffer.position(forwardLinkAddress); 246 } while(true); 247 } 248 return FormatSpec.NOT_VALID_WORD; 249 } 250 251 private static int markAsDeleted(final int flags) { 252 return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; 253 } 254 255 /** 256 * Delete the word from the binary file. 257 * 258 * @param buffer the buffer to write. 259 * @param word the word we delete 260 * @throws IOException 261 * @throws UnsupportedFormatException 262 */ 263 @UsedForTesting 264 public static void deleteWord(final FusionDictionaryBufferInterface buffer, 265 final String word) throws IOException, UnsupportedFormatException { 266 buffer.position(0); 267 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 268 final int wordPosition = getTerminalPosition(buffer, word); 269 if (wordPosition == FormatSpec.NOT_VALID_WORD) return; 270 271 buffer.position(wordPosition); 272 final int flags = buffer.readUnsignedByte(); 273 buffer.position(wordPosition); 274 buffer.put((byte)markAsDeleted(flags)); 275 } 276 277 /** 278 * @return the size written, in bytes. Always 3 bytes. 279 */ 280 private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, 281 final int value) { 282 final int absValue = Math.abs(value); 283 buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); 284 buffer.put((byte)((absValue >> 8) & 0xFF)); 285 buffer.put((byte)(absValue & 0xFF)); 286 return 3; 287 } 288 289 /** 290 * @return the size written, in bytes. Always 3 bytes. 291 */ 292 private static int writeSInt24ToStream(final OutputStream destination, final int value) 293 throws IOException { 294 final int absValue = Math.abs(value); 295 destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); 296 destination.write((byte)((absValue >> 8) & 0xFF)); 297 destination.write((byte)(absValue & 0xFF)); 298 return 3; 299 } 300 301 /** 302 * @return the size written, in bytes. 1, 2, or 3 bytes. 303 */ 304 private static int writeVariableAddress(final OutputStream destination, final int value) 305 throws IOException { 306 switch (BinaryDictInputOutput.getByteSize(value)) { 307 case 1: 308 destination.write((byte)value); 309 break; 310 case 2: 311 destination.write((byte)(0xFF & (value >> 8))); 312 destination.write((byte)(0xFF & value)); 313 break; 314 case 3: 315 destination.write((byte)(0xFF & (value >> 16))); 316 destination.write((byte)(0xFF & (value >> 8))); 317 destination.write((byte)(0xFF & value)); 318 break; 319 } 320 return BinaryDictInputOutput.getByteSize(value); 321 } 322 323 /** 324 * Update a parent address in a CharGroup that is referred to by groupOriginAddress. 325 * 326 * @param buffer the buffer to write. 327 * @param groupOriginAddress the address of the group. 328 * @param newParentAddress the absolute address of the parent. 329 * @param formatOptions file format options. 330 */ 331 public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, 332 final int groupOriginAddress, final int newParentAddress, 333 final FormatOptions formatOptions) { 334 final int originalPosition = buffer.position(); 335 buffer.position(groupOriginAddress); 336 if (!formatOptions.mSupportsDynamicUpdate) { 337 throw new RuntimeException("this file format does not support parent addresses"); 338 } 339 final int flags = buffer.readUnsignedByte(); 340 if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { 341 // if the group is moved, the parent address is stored in the destination group. 342 // We are guaranteed to process the destination group later, so there is no need to 343 // update anything here. 344 buffer.position(originalPosition); 345 return; 346 } 347 if (DBG) { 348 MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); 349 } 350 final int parentOffset = newParentAddress - groupOriginAddress; 351 writeSInt24ToBuffer(buffer, parentOffset); 352 buffer.position(originalPosition); 353 } 354 355 private static void skipCharGroup(final FusionDictionaryBufferInterface buffer, 356 final FormatOptions formatOptions) { 357 final int flags = buffer.readUnsignedByte(); 358 BinaryDictInputOutput.readParentAddress(buffer, formatOptions); 359 skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); 360 BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions); 361 if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); 362 if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { 363 final int shortcutsSize = buffer.readUnsignedShort(); 364 buffer.position(buffer.position() + shortcutsSize 365 - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE); 366 } 367 if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { 368 int bigramCount = 0; 369 while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { 370 final int bigramFlags = buffer.readUnsignedByte(); 371 switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) { 372 case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 373 buffer.readUnsignedByte(); 374 break; 375 case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 376 buffer.readUnsignedShort(); 377 break; 378 case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 379 buffer.readUnsignedInt24(); 380 break; 381 } 382 if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break; 383 } 384 if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { 385 throw new RuntimeException("Too many bigrams in a group."); 386 } 387 } 388 } 389 390 /** 391 * Update parent addresses in a Node that is referred to by nodeOriginAddress. 392 * 393 * @param buffer the buffer to be modified. 394 * @param nodeOriginAddress the address of a modified Node. 395 * @param newParentAddress the address to be written. 396 * @param formatOptions file format options. 397 */ 398 public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, 399 final int nodeOriginAddress, final int newParentAddress, 400 final FormatOptions formatOptions) { 401 final int originalPosition = buffer.position(); 402 buffer.position(nodeOriginAddress); 403 do { 404 final int count = BinaryDictInputOutput.readCharGroupCount(buffer); 405 for (int i = 0; i < count; ++i) { 406 updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); 407 skipCharGroup(buffer, formatOptions); 408 } 409 final int forwardLinkAddress = buffer.readUnsignedInt24(); 410 buffer.position(forwardLinkAddress); 411 } while (formatOptions.mSupportsDynamicUpdate 412 && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); 413 buffer.position(originalPosition); 414 } 415 416 private static void skipString(final FusionDictionaryBufferInterface buffer, 417 final boolean hasMultipleChars) { 418 if (hasMultipleChars) { 419 int character = CharEncoding.readChar(buffer); 420 while (character != FormatSpec.INVALID_CHARACTER) { 421 character = CharEncoding.readChar(buffer); 422 } 423 } else { 424 CharEncoding.readChar(buffer); 425 } 426 } 427 428 /** 429 * Write a string to a stream. 430 * 431 * @param destination the stream to write. 432 * @param word the string to be written. 433 * @return the size written, in bytes. 434 * @throws IOException 435 */ 436 private static int writeString(final OutputStream destination, final String word) 437 throws IOException { 438 int size = 0; 439 final int length = word.length(); 440 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 441 final int codePoint = word.codePointAt(i); 442 if (CharEncoding.getCharSize(codePoint) == 1) { 443 destination.write((byte)codePoint); 444 size++; 445 } else { 446 destination.write((byte)(0xFF & (codePoint >> 16))); 447 destination.write((byte)(0xFF & (codePoint >> 8))); 448 destination.write((byte)(0xFF & codePoint)); 449 size += 3; 450 } 451 } 452 destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); 453 size += FormatSpec.GROUP_TERMINATOR_SIZE; 454 return size; 455 } 456 457 /** 458 * Update a children address in a CharGroup that is addressed by groupOriginAddress. 459 * 460 * @param buffer the buffer to write. 461 * @param groupOriginAddress the address of the group. 462 * @param newChildrenAddress the absolute address of the child. 463 * @param formatOptions file format options. 464 */ 465 public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, 466 final int groupOriginAddress, final int newChildrenAddress, 467 final FormatOptions formatOptions) { 468 final int originalPosition = buffer.position(); 469 buffer.position(groupOriginAddress); 470 final int flags = buffer.readUnsignedByte(); 471 final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); 472 skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); 473 if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); 474 final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS 475 ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); 476 writeSInt24ToBuffer(buffer, childrenOffset); 477 buffer.position(originalPosition); 478 } 479 480 /** 481 * Write a char group to an output stream. 482 * A char group is an in-memory representation of a node in trie. 483 * A char group info is an on-disk representation of a node. 484 * 485 * @param destination the stream to write. 486 * @param info the char group info to be written. 487 * @return the size written, in bytes. 488 */ 489 public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info) 490 throws IOException { 491 int size = FormatSpec.GROUP_FLAGS_SIZE; 492 destination.write((byte)info.mFlags); 493 final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ? 494 FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress; 495 size += writeSInt24ToStream(destination, parentOffset); 496 497 for (int i = 0; i < info.mCharacters.length; ++i) { 498 if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) { 499 destination.write((byte)info.mCharacters[i]); 500 size++; 501 } else { 502 size += writeSInt24ToStream(destination, info.mCharacters[i]); 503 } 504 } 505 if (info.mCharacters.length > 1) { 506 destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); 507 size++; 508 } 509 510 if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { 511 destination.write((byte)info.mFrequency); 512 size++; 513 } 514 515 if (DBG) { 516 MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size 517 + ", child=" + info.mChildrenAddress + ", characters =" 518 + new String(info.mCharacters, 0, info.mCharacters.length)); 519 } 520 final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? 521 0 : info.mChildrenAddress - (info.mOriginalAddress + size); 522 writeSInt24ToStream(destination, childrenOffset); 523 size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; 524 525 if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) { 526 final int shortcutListSize = 527 BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); 528 destination.write((byte)(shortcutListSize >> 8)); 529 destination.write((byte)(shortcutListSize & 0xFF)); 530 size += 2; 531 final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator(); 532 while (shortcutIterator.hasNext()) { 533 final WeightedString target = shortcutIterator.next(); 534 destination.write((byte)BinaryDictInputOutput.makeShortcutFlags( 535 shortcutIterator.hasNext(), target.mFrequency)); 536 size++; 537 size += writeString(destination, target.mWord); 538 } 539 } 540 541 if (info.mBigrams != null) { 542 // TODO: Consolidate this code with the code that computes the size of the bigram list 543 // in BinaryDictionaryInputOutput#computeActualNodeSize 544 for (int i = 0; i < info.mBigrams.size(); ++i) { 545 546 final int bigramFrequency = info.mBigrams.get(i).mFrequency; 547 int bigramFlags = (i < info.mBigrams.size() - 1) 548 ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0; 549 size++; 550 final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress 551 + size); 552 bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0; 553 switch (BinaryDictInputOutput.getByteSize(bigramOffset)) { 554 case 1: 555 bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 556 break; 557 case 2: 558 bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 559 break; 560 case 3: 561 bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 562 break; 563 } 564 bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY; 565 destination.write((byte)bigramFlags); 566 size += writeVariableAddress(destination, Math.abs(bigramOffset)); 567 } 568 } 569 return size; 570 } 571 572 @SuppressWarnings("unused") 573 private static void updateForwardLink(final FusionDictionaryBufferInterface buffer, 574 final int nodeOriginAddress, final int newNodeAddress, 575 final FormatOptions formatOptions) { 576 buffer.position(nodeOriginAddress); 577 int jumpCount = 0; 578 while (jumpCount++ < MAX_JUMPS) { 579 final int count = BinaryDictInputOutput.readCharGroupCount(buffer); 580 for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions); 581 final int forwardLinkAddress = buffer.readUnsignedInt24(); 582 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 583 buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); 584 writeSInt24ToBuffer(buffer, newNodeAddress); 585 return; 586 } 587 buffer.position(forwardLinkAddress); 588 } 589 if (DBG && jumpCount >= MAX_JUMPS) { 590 throw new RuntimeException("too many jumps, probably a bug."); 591 } 592 } 593 594 /** 595 * Helper method to move a char group to the tail of the file. 596 */ 597 private static int moveCharGroup(final OutputStream destination, 598 final FusionDictionaryBufferInterface buffer, final CharGroupInfo info, 599 final int nodeOriginAddress, final int oldGroupAddress, 600 final FormatOptions formatOptions) throws IOException { 601 updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions); 602 buffer.position(oldGroupAddress); 603 final int currentFlags = buffer.readUnsignedByte(); 604 buffer.position(oldGroupAddress); 605 buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags 606 & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); 607 int size = FormatSpec.GROUP_FLAGS_SIZE; 608 updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions); 609 size += writeNode(destination, new CharGroupInfo[] { info }); 610 return size; 611 } 612 613 /** 614 * Compute the size of the char group. 615 */ 616 private static int computeGroupSize(final CharGroupInfo info, 617 final FormatOptions formatOptions) { 618 int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE 619 + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters) 620 + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions); 621 if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { 622 size += FormatSpec.GROUP_FREQUENCY_SIZE; 623 } 624 if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) { 625 size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); 626 } 627 if (info.mBigrams != null) { 628 for (final PendingAttribute attr : info.mBigrams) { 629 size += FormatSpec.GROUP_FLAGS_SIZE; 630 size += BinaryDictInputOutput.getByteSize(attr.mAddress); 631 } 632 } 633 return size; 634 } 635 636 /** 637 * Write a node to the stream. 638 * 639 * @param destination the stream to write. 640 * @param infos groups to be written. 641 * @return the size written, in bytes. 642 * @throws IOException 643 */ 644 private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos) 645 throws IOException { 646 int size = BinaryDictInputOutput.getGroupCountSize(infos.length); 647 switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) { 648 case 1: 649 destination.write((byte)infos.length); 650 break; 651 case 2: 652 destination.write((byte)(infos.length >> 8)); 653 destination.write((byte)(infos.length & 0xFF)); 654 break; 655 default: 656 throw new RuntimeException("Invalid group count size."); 657 } 658 for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info); 659 writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS); 660 return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; 661 } 662 663 /** 664 * Move a group that is referred to by oldGroupOrigin to the tail of the file. 665 * And set the children address to the byte after the group. 666 * 667 * @param nodeOrigin the address of the tail of the file. 668 * @param characters 669 * @param length 670 * @param flags 671 * @param frequency 672 * @param parentAddress 673 * @param shortcutTargets 674 * @param bigrams 675 * @param destination the stream representing the tail of the file. 676 * @param buffer the buffer representing the (constant-size) body of the file. 677 * @param oldNodeOrigin 678 * @param oldGroupOrigin 679 * @param formatOptions 680 * @return the size written, in bytes. 681 * @throws IOException 682 */ 683 private static int moveGroup(final int nodeOrigin, final int[] characters, final int length, 684 final int flags, final int frequency, final int parentAddress, 685 final ArrayList<WeightedString> shortcutTargets, 686 final ArrayList<PendingAttribute> bigrams, final OutputStream destination, 687 final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin, 688 final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException { 689 int size = 0; 690 final int newGroupOrigin = nodeOrigin + 1; 691 final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length); 692 final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */, 693 flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, 694 shortcutTargets, bigrams); 695 size = computeGroupSize(tmpInfo, formatOptions); 696 final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size, 697 flags, writtenCharacters, frequency, parentAddress, 698 nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, 699 bigrams); 700 moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions); 701 return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; 702 } 703 704 /** 705 * Insert a word into a binary dictionary. 706 * 707 * @param buffer 708 * @param destination 709 * @param word 710 * @param frequency 711 * @param bigramStrings 712 * @param shortcuts 713 * @throws IOException 714 * @throws UnsupportedFormatException 715 */ 716 // TODO: Support batch insertion. 717 // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. 718 @UsedForTesting 719 public static void insertWord(final FusionDictionaryBufferInterface buffer, 720 final OutputStream destination, final String word, final int frequency, 721 final ArrayList<WeightedString> bigramStrings, 722 final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, 723 final boolean isBlackListEntry) 724 throws IOException, UnsupportedFormatException { 725 final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>(); 726 if (bigramStrings != null) { 727 for (final WeightedString bigram : bigramStrings) { 728 int position = getTerminalPosition(buffer, bigram.mWord); 729 if (position == FormatSpec.NOT_VALID_WORD) { 730 // TODO: figure out what is the correct thing to do here. 731 } else { 732 bigrams.add(new PendingAttribute(bigram.mFrequency, position)); 733 } 734 } 735 } 736 737 final boolean isTerminal = true; 738 final boolean hasBigrams = !bigrams.isEmpty(); 739 final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); 740 741 // find the insert position of the word. 742 if (buffer.position() != 0) buffer.position(0); 743 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 744 745 int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position(); 746 final int[] codePoints = FusionDictionary.getCodePoints(word); 747 final int wordLen = codePoints.length; 748 749 for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { 750 if (wordPos >= wordLen) break; 751 nodeOriginAddress = buffer.position(); 752 int nodeParentAddress = -1; 753 final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); 754 boolean foundNextGroup = false; 755 756 for (int i = 0; i < charGroupCount; ++i) { 757 address = buffer.position(); 758 final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, 759 buffer.position(), header.mFormatOptions); 760 final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, 761 header.mFormatOptions); 762 if (isMovedGroup) continue; 763 nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) 764 ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; 765 boolean matched = true; 766 for (int p = 0; p < currentInfo.mCharacters.length; ++p) { 767 if (wordPos + p >= wordLen) { 768 /* 769 * splitting 770 * before 771 * abcd - ef 772 * 773 * insert "abc" 774 * 775 * after 776 * abc - d - ef 777 */ 778 final int newNodeAddress = buffer.limit(); 779 final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, 780 isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, 781 false /* isBlackListEntry */, header.mFormatOptions); 782 int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags, 783 frequency, nodeParentAddress, shortcuts, bigrams, destination, 784 buffer, nodeOriginAddress, address, header.mFormatOptions); 785 786 final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, 787 currentInfo.mCharacters.length); 788 if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 789 updateParentAddresses(buffer, currentInfo.mChildrenAddress, 790 newNodeAddress + written + 1, header.mFormatOptions); 791 } 792 final CharGroupInfo newInfo2 = new CharGroupInfo( 793 newNodeAddress + written + 1, -1 /* endAddress */, 794 currentInfo.mFlags, characters2, currentInfo.mFrequency, 795 newNodeAddress + 1, currentInfo.mChildrenAddress, 796 currentInfo.mShortcutTargets, currentInfo.mBigrams); 797 writeNode(destination, new CharGroupInfo[] { newInfo2 }); 798 return; 799 } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { 800 if (p > 0) { 801 /* 802 * splitting 803 * before 804 * ab - cd 805 * 806 * insert "ac" 807 * 808 * after 809 * a - b - cd 810 * | 811 * - c 812 */ 813 814 final int newNodeAddress = buffer.limit(); 815 final int childrenAddress = currentInfo.mChildrenAddress; 816 817 // move prefix 818 final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, 819 false /* isTerminal */, 0 /* childrenAddressSize*/, 820 false /* hasShortcut */, false /* hasBigrams */, 821 false /* isNotAWord */, false /* isBlackListEntry */, 822 header.mFormatOptions); 823 int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, 824 prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, 825 destination, buffer, nodeOriginAddress, address, 826 header.mFormatOptions); 827 828 final int[] suffixCharacters = Arrays.copyOfRange( 829 currentInfo.mCharacters, p, currentInfo.mCharacters.length); 830 if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 831 updateParentAddresses(buffer, currentInfo.mChildrenAddress, 832 newNodeAddress + written + 1, header.mFormatOptions); 833 } 834 final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags( 835 suffixCharacters.length > 1, 836 (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, 837 0 /* childrenAddressSize */, 838 (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) 839 != 0, 840 (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, 841 isNotAWord, isBlackListEntry, header.mFormatOptions); 842 final CharGroupInfo suffixInfo = new CharGroupInfo( 843 newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, 844 suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, 845 currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, 846 currentInfo.mBigrams); 847 written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1; 848 849 final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, 850 codePoints.length); 851 final int flags = BinaryDictInputOutput.makeCharGroupFlags( 852 newCharacters.length > 1, isTerminal, 853 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 854 isNotAWord, isBlackListEntry, header.mFormatOptions); 855 final CharGroupInfo newInfo = new CharGroupInfo( 856 newNodeAddress + written, -1 /* endAddress */, flags, 857 newCharacters, frequency, newNodeAddress + 1, 858 FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); 859 writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo }); 860 return; 861 } 862 matched = false; 863 break; 864 } 865 } 866 867 if (matched) { 868 if (wordPos + currentInfo.mCharacters.length == wordLen) { 869 // the word exists in the dictionary. 870 // only update group. 871 final int newNodeAddress = buffer.limit(); 872 final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; 873 final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, 874 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 875 isNotAWord, isBlackListEntry, header.mFormatOptions); 876 final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, 877 -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, 878 nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, 879 bigrams); 880 moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address, 881 header.mFormatOptions); 882 return; 883 } 884 wordPos += currentInfo.mCharacters.length; 885 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 886 /* 887 * found the prefix of the word. 888 * make new node and link to the node from this group. 889 * 890 * before 891 * ab - cd 892 * 893 * insert "abcde" 894 * 895 * after 896 * ab - cd - e 897 */ 898 final int newNodeAddress = buffer.limit(); 899 updateChildrenAddress(buffer, address, newNodeAddress, 900 header.mFormatOptions); 901 final int newGroupAddress = newNodeAddress + 1; 902 final boolean hasMultipleChars = (wordLen - wordPos) > 1; 903 final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, 904 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 905 isNotAWord, isBlackListEntry, header.mFormatOptions); 906 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); 907 final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags, 908 characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, 909 shortcuts, bigrams); 910 writeNode(destination, new CharGroupInfo[] { newInfo }); 911 return; 912 } 913 buffer.position(currentInfo.mChildrenAddress); 914 foundNextGroup = true; 915 break; 916 } 917 } 918 919 if (foundNextGroup) continue; 920 921 // reached the end of the array. 922 final int linkAddressPosition = buffer.position(); 923 int nextLink = buffer.readUnsignedInt24(); 924 if ((nextLink & MSB24) != 0) { 925 nextLink = -(nextLink & SINT24_MAX); 926 } 927 if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 928 /* 929 * expand this node. 930 * 931 * before 932 * ab - cd 933 * 934 * insert "abef" 935 * 936 * after 937 * ab - cd 938 * | 939 * - ef 940 */ 941 942 // change the forward link address. 943 final int newNodeAddress = buffer.limit(); 944 buffer.position(linkAddressPosition); 945 writeSInt24ToBuffer(buffer, newNodeAddress); 946 947 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); 948 final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1, 949 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, 950 isNotAWord, isBlackListEntry, header.mFormatOptions); 951 final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, 952 -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, 953 FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); 954 writeNode(destination, new CharGroupInfo[]{ newInfo }); 955 return; 956 } else { 957 depth--; 958 buffer.position(nextLink); 959 } 960 } 961 } 962 963 /** 964 * Find a word from the buffer. 965 * 966 * @param buffer the buffer representing the body of the dictionary file. 967 * @param word the word searched 968 * @return the found group 969 * @throws IOException 970 * @throws UnsupportedFormatException 971 */ 972 @UsedForTesting 973 public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer, 974 final String word) throws IOException, UnsupportedFormatException { 975 int position = getTerminalPosition(buffer, word); 976 if (position != FormatSpec.NOT_VALID_WORD) { 977 buffer.position(0); 978 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 979 buffer.position(position); 980 return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions); 981 } 982 return null; 983 } 984 985 /** 986 * Convenience method to read the header of a binary file. 987 * 988 * This is quite resource intensive - don't call when performance is critical. 989 * 990 * @param file The file to read. 991 * @param offset The offset in the file where to start reading the data. 992 * @param length The length of the data file. 993 */ 994 private static final int HEADER_READING_BUFFER_SIZE = 16384; 995 public static FileHeader getDictionaryFileHeader( 996 final File file, final long offset, final long length) 997 throws FileNotFoundException, IOException, UnsupportedFormatException { 998 final byte[] buffer = new byte[HEADER_READING_BUFFER_SIZE]; 999 final FileInputStream inStream = new FileInputStream(file); 1000 try { 1001 inStream.read(buffer); 1002 final BinaryDictInputOutput.ByteBufferWrapper wrapper = 1003 new BinaryDictInputOutput.ByteBufferWrapper(inStream.getChannel().map( 1004 FileChannel.MapMode.READ_ONLY, offset, length)); 1005 return BinaryDictInputOutput.readHeader(wrapper); 1006 } finally { 1007 inStream.close(); 1008 } 1009 } 1010 1011 public static FileHeader getDictionaryFileHeaderOrNull(final File file, final long offset, 1012 final long length) { 1013 try { 1014 final FileHeader header = getDictionaryFileHeader(file, offset, length); 1015 return header; 1016 } catch (UnsupportedFormatException e) { 1017 return null; 1018 } catch (IOException e) { 1019 return null; 1020 } 1021 } 1022 } 1023