1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; 20 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 21 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; 22 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; 23 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; 24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 25 26 import java.io.ByteArrayOutputStream; 27 import java.io.IOException; 28 import java.io.OutputStream; 29 import java.util.ArrayList; 30 31 /** 32 * Encodes binary files for a FusionDictionary. 33 * 34 * All the methods in this class are static. 35 * 36 * TODO: Rename this class to DictEncoderUtils. 37 */ 38 public class BinaryDictEncoderUtils { 39 40 private static final boolean DBG = MakedictLog.DBG; 41 42 private BinaryDictEncoderUtils() { 43 // This utility class is not publicly instantiable. 44 } 45 46 // Arbitrary limit to how much passes we consider address size compression should 47 // terminate in. At the time of this writing, our largest dictionary completes 48 // compression in five passes. 49 // If the number of passes exceeds this number, makedict bails with an exception on 50 // suspicion that a bug might be causing an infinite loop. 51 private static final int MAX_PASSES = 24; 52 53 /** 54 * Compute the binary size of the character array. 55 * 56 * If only one character, this is the size of this character. If many, it's the sum of their 57 * sizes + 1 byte for the terminator. 58 * 59 * @param characters the character array 60 * @return the size of the char array, including the terminator if any 61 */ 62 static int getPtNodeCharactersSize(final int[] characters) { 63 int size = CharEncoding.getCharArraySize(characters); 64 if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; 65 return size; 66 } 67 68 /** 69 * Compute the binary size of the character array in a PtNode 70 * 71 * If only one character, this is the size of this character. If many, it's the sum of their 72 * sizes + 1 byte for the terminator. 73 * 74 * @param ptNode the PtNode 75 * @return the size of the char array, including the terminator if any 76 */ 77 private static int getPtNodeCharactersSize(final PtNode ptNode) { 78 return getPtNodeCharactersSize(ptNode.mChars); 79 } 80 81 /** 82 * Compute the binary size of the PtNode count for a node array. 83 * @param nodeArray the nodeArray 84 * @return the size of the PtNode count, either 1 or 2 bytes. 85 */ 86 private static int getPtNodeCountSize(final PtNodeArray nodeArray) { 87 return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size()); 88 } 89 90 /** 91 * Compute the size of a shortcut in bytes. 92 */ 93 private static int getShortcutSize(final WeightedString shortcut) { 94 int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; 95 final String word = shortcut.mWord; 96 final int length = word.length(); 97 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 98 final int codePoint = word.codePointAt(i); 99 size += CharEncoding.getCharSize(codePoint); 100 } 101 size += FormatSpec.PTNODE_TERMINATOR_SIZE; 102 return size; 103 } 104 105 /** 106 * Compute the size of a shortcut list in bytes. 107 * 108 * This is known in advance and does not change according to position in the file 109 * like address lists do. 110 */ 111 static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { 112 if (null == shortcutList || shortcutList.isEmpty()) return 0; 113 int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; 114 for (final WeightedString shortcut : shortcutList) { 115 size += getShortcutSize(shortcut); 116 } 117 return size; 118 } 119 120 /** 121 * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything. 122 * 123 * @param ptNode the PtNode to compute the size of. 124 * @param options file format options. 125 * @return the maximum size of the PtNode. 126 */ 127 private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) { 128 int size = getNodeHeaderSize(ptNode, options); 129 if (ptNode.isTerminal()) { 130 // If terminal, one byte for the frequency or four bytes for the terminal id. 131 if (options.mHasTerminalId) { 132 size += FormatSpec.PTNODE_TERMINAL_ID_SIZE; 133 } else { 134 size += FormatSpec.PTNODE_FREQUENCY_SIZE; 135 } 136 } 137 size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address 138 size += getShortcutListSize(ptNode.mShortcutTargets); 139 if (null != ptNode.mBigrams) { 140 size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE 141 + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) 142 * ptNode.mBigrams.size(); 143 } 144 return size; 145 } 146 147 /** 148 * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for 149 * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of 150 * the containing node array, and cache it it its 'mCachedSize' member. 151 * 152 * @param ptNodeArray the node array to compute the maximum size of. 153 * @param options file format options. 154 */ 155 private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray, 156 final FormatOptions options) { 157 int size = getPtNodeCountSize(ptNodeArray); 158 for (PtNode node : ptNodeArray.mData) { 159 final int nodeSize = getPtNodeMaximumSize(node, options); 160 node.mCachedSize = nodeSize; 161 size += nodeSize; 162 } 163 if (options.mSupportsDynamicUpdate) { 164 size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; 165 } 166 ptNodeArray.mCachedSize = size; 167 } 168 169 /** 170 * Compute the size of the header (flag + [parent address] + characters size) of a PtNode. 171 * 172 * @param ptNode the PtNode of which to compute the size of the header 173 * @param options file format options. 174 */ 175 private static int getNodeHeaderSize(final PtNode ptNode, final FormatOptions options) { 176 if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { 177 return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE 178 + getPtNodeCharactersSize(ptNode); 179 } else { 180 return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode); 181 } 182 } 183 184 /** 185 * Compute the size, in bytes, that an address will occupy. 186 * 187 * This can be used either for children addresses (which are always positive) or for 188 * attribute, which may be positive or negative but 189 * store their sign bit separately. 190 * 191 * @param address the address 192 * @return the byte size. 193 */ 194 static int getByteSize(final int address) { 195 assert(address <= FormatSpec.UINT24_MAX); 196 if (!BinaryDictIOUtils.hasChildrenAddress(address)) { 197 return 0; 198 } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) { 199 return 1; 200 } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) { 201 return 2; 202 } else { 203 return 3; 204 } 205 } 206 207 static int writeUIntToBuffer(final byte[] buffer, int position, final int value, 208 final int size) { 209 switch(size) { 210 case 4: 211 buffer[position++] = (byte) ((value >> 24) & 0xFF); 212 /* fall through */ 213 case 3: 214 buffer[position++] = (byte) ((value >> 16) & 0xFF); 215 /* fall through */ 216 case 2: 217 buffer[position++] = (byte) ((value >> 8) & 0xFF); 218 /* fall through */ 219 case 1: 220 buffer[position++] = (byte) (value & 0xFF); 221 break; 222 default: 223 /* nop */ 224 } 225 return position; 226 } 227 228 static void writeUIntToStream(final OutputStream stream, final int value, final int size) 229 throws IOException { 230 switch(size) { 231 case 4: 232 stream.write((value >> 24) & 0xFF); 233 /* fall through */ 234 case 3: 235 stream.write((value >> 16) & 0xFF); 236 /* fall through */ 237 case 2: 238 stream.write((value >> 8) & 0xFF); 239 /* fall through */ 240 case 1: 241 stream.write(value & 0xFF); 242 break; 243 default: 244 /* nop */ 245 } 246 } 247 248 // End utility methods 249 250 // This method is responsible for finding a nice ordering of the nodes that favors run-time 251 // cache performance and dictionary size. 252 /* package for tests */ static ArrayList<PtNodeArray> flattenTree( 253 final PtNodeArray rootNodeArray) { 254 final int treeSize = FusionDictionary.countPtNodes(rootNodeArray); 255 MakedictLog.i("Counted nodes : " + treeSize); 256 final ArrayList<PtNodeArray> flatTree = new ArrayList<PtNodeArray>(treeSize); 257 return flattenTreeInner(flatTree, rootNodeArray); 258 } 259 260 private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list, 261 final PtNodeArray ptNodeArray) { 262 // Removing the node is necessary if the tails are merged, because we would then 263 // add the same node several times when we only want it once. A number of places in 264 // the code also depends on any node being only once in the list. 265 // Merging tails can only be done if there are no attributes. Searching for attributes 266 // in LatinIME code depends on a total breadth-first ordering, which merging tails 267 // breaks. If there are no attributes, it should be fine (and reduce the file size) 268 // to merge tails, and removing the node from the list would be necessary. However, 269 // we don't merge tails because breaking the breadth-first ordering would result in 270 // extreme overhead at bigram lookup time (it would make the search function O(n) instead 271 // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty 272 // high). 273 // If no nodes are ever merged, we can't have the same node twice in the list, hence 274 // searching for duplicates in unnecessary. It is also very performance consuming, 275 // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making 276 // this simple list.remove operation O(n*n) overall. On Android this overhead is very 277 // high. 278 // For future reference, the code to remove duplicate is a simple : list.remove(node); 279 list.add(ptNodeArray); 280 final ArrayList<PtNode> branches = ptNodeArray.mData; 281 for (PtNode ptNode : branches) { 282 if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren); 283 } 284 return list; 285 } 286 287 /** 288 * Get the offset from a position inside a current node array to a target node array, during 289 * update. 290 * 291 * If the current node array is before the target node array, the target node array has not 292 * been updated yet, so we should return the offset from the old position of the current node 293 * array to the old position of the target node array. If on the other hand the target is 294 * before the current node array, it already has been updated, so we should return the offset 295 * from the new position in the current node array to the new position in the target node 296 * array. 297 * 298 * @param currentNodeArray node array containing the PtNode where the offset will be written 299 * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray 300 * @param targetNodeArray the target node array to get the offset to 301 * @return the offset to the target node array 302 */ 303 private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray, 304 final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) { 305 final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate 306 < currentNodeArray.mCachedAddressBeforeUpdate); 307 if (isTargetBeforeCurrent) { 308 return targetNodeArray.mCachedAddressAfterUpdate 309 - (currentNodeArray.mCachedAddressAfterUpdate 310 + offsetFromStartOfCurrentNodeArray); 311 } else { 312 return targetNodeArray.mCachedAddressBeforeUpdate 313 - (currentNodeArray.mCachedAddressBeforeUpdate 314 + offsetFromStartOfCurrentNodeArray); 315 } 316 } 317 318 /** 319 * Get the offset from a position inside a current node array to a target PtNode, during 320 * update. 321 * 322 * @param currentNodeArray node array containing the PtNode where the offset will be written 323 * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray 324 * @param targetPtNode the target PtNode to get the offset to 325 * @return the offset to the target PtNode 326 */ 327 // TODO: is there any way to factorize this method with the one above? 328 private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray, 329 final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) { 330 final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate 331 + offsetFromStartOfCurrentNodeArray; 332 final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate 333 < oldOffsetBasePoint); 334 // If the target is before the current node array, then its address has already been 335 // updated. We can use the AfterUpdate member, and compare it to our own member after 336 // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the 337 // BeforeUpdate member, and of course we have to compare this to our own address before 338 // update. 339 if (isTargetBeforeCurrent) { 340 final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate 341 + offsetFromStartOfCurrentNodeArray; 342 return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint; 343 } else { 344 return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint; 345 } 346 } 347 348 /** 349 * Computes the actual node array size, based on the cached addresses of the children nodes. 350 * 351 * Each node array stores its tentative address. During dictionary address computing, these 352 * are not final, but they can be used to compute the node array size (the node array size 353 * depends on the address of the children because the number of bytes necessary to store an 354 * address depends on its numeric value. The return value indicates whether the node array 355 * contents (as in, any of the addresses stored in the cache fields) have changed with 356 * respect to their previous value. 357 * 358 * @param ptNodeArray the node array to compute the size of. 359 * @param dict the dictionary in which the word/attributes are to be found. 360 * @param formatOptions file format options. 361 * @return false if none of the cached addresses inside the node array changed, true otherwise. 362 */ 363 private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, 364 final FusionDictionary dict, final FormatOptions formatOptions) { 365 boolean changed = false; 366 int size = getPtNodeCountSize(ptNodeArray); 367 for (PtNode ptNode : ptNodeArray.mData) { 368 ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size; 369 if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { 370 changed = true; 371 } 372 int nodeSize = getNodeHeaderSize(ptNode, formatOptions); 373 if (ptNode.isTerminal()) { 374 if (formatOptions.mHasTerminalId) { 375 nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE; 376 } else { 377 nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; 378 } 379 } 380 if (formatOptions.mSupportsDynamicUpdate) { 381 nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; 382 } else if (null != ptNode.mChildren) { 383 nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, 384 nodeSize + size, ptNode.mChildren)); 385 } 386 if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { 387 nodeSize += getShortcutListSize(ptNode.mShortcutTargets); 388 if (null != ptNode.mBigrams) { 389 for (WeightedString bigram : ptNode.mBigrams) { 390 final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, 391 nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, 392 FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); 393 nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; 394 } 395 } 396 } 397 ptNode.mCachedSize = nodeSize; 398 size += nodeSize; 399 } 400 if (formatOptions.mSupportsDynamicUpdate) { 401 size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; 402 } 403 if (ptNodeArray.mCachedSize != size) { 404 ptNodeArray.mCachedSize = size; 405 changed = true; 406 } 407 return changed; 408 } 409 410 /** 411 * Initializes the cached addresses of node arrays and their containing nodes from their size. 412 * 413 * @param flatNodes the list of node arrays. 414 * @param formatOptions file format options. 415 * @return the byte size of the entire stack. 416 */ 417 private static int initializePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes, 418 final FormatOptions formatOptions) { 419 int nodeArrayOffset = 0; 420 for (final PtNodeArray nodeArray : flatNodes) { 421 nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset; 422 int nodeCountSize = getPtNodeCountSize(nodeArray); 423 int nodeffset = 0; 424 for (final PtNode ptNode : nodeArray.mData) { 425 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate = 426 nodeCountSize + nodeArrayOffset + nodeffset; 427 nodeffset += ptNode.mCachedSize; 428 } 429 nodeArrayOffset += nodeArray.mCachedSize; 430 } 431 return nodeArrayOffset; 432 } 433 434 /** 435 * Updates the cached addresses of node arrays after recomputing their new positions. 436 * 437 * @param flatNodes the list of node arrays. 438 */ 439 private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) { 440 for (final PtNodeArray nodeArray : flatNodes) { 441 nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate; 442 for (final PtNode ptNode : nodeArray.mData) { 443 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate; 444 } 445 } 446 } 447 448 /** 449 * Compute the cached parent addresses after all has been updated. 450 * 451 * The parent addresses are used by some binary formats at write-to-disk time. Not all formats 452 * need them. In particular, version 2 does not need them, and version 3 does. 453 * 454 * @param flatNodes the flat array of node arrays to fill in 455 */ 456 private static void computeParentAddresses(final ArrayList<PtNodeArray> flatNodes) { 457 for (final PtNodeArray nodeArray : flatNodes) { 458 for (final PtNode ptNode : nodeArray.mData) { 459 if (null != ptNode.mChildren) { 460 // Assign my address to children's parent address 461 // Here BeforeUpdate and AfterUpdate addresses have the same value, so it 462 // does not matter which we use. 463 ptNode.mChildren.mCachedParentAddress = ptNode.mCachedAddressAfterUpdate 464 - ptNode.mChildren.mCachedAddressAfterUpdate; 465 } 466 } 467 } 468 } 469 470 /** 471 * Compute the addresses and sizes of an ordered list of PtNode arrays. 472 * 473 * This method takes a list of PtNode arrays and will update their cached address and size 474 * values so that they can be written into a file. It determines the smallest size each of the 475 * PtNode arrays can be given the addresses of its children and attributes, and store that into 476 * each PtNode. 477 * The order of the PtNode is given by the order of the array. This method makes no effort 478 * to find a good order; it only mechanically computes the size this order results in. 479 * 480 * @param dict the dictionary 481 * @param flatNodes the ordered list of PtNode arrays 482 * @param formatOptions file format options. 483 * @return the same array it was passed. The nodes have been updated for address and size. 484 */ 485 /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, 486 final ArrayList<PtNodeArray> flatNodes, final FormatOptions formatOptions) { 487 // First get the worst possible sizes and offsets 488 for (final PtNodeArray n : flatNodes) calculatePtNodeArrayMaximumSize(n, formatOptions); 489 final int offset = initializePtNodeArraysCachedAddresses(flatNodes, formatOptions); 490 491 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 492 MakedictLog.i("(Recursively seen size : " + offset + ")"); 493 494 int passes = 0; 495 boolean changesDone = false; 496 do { 497 changesDone = false; 498 int ptNodeArrayStartOffset = 0; 499 for (final PtNodeArray ptNodeArray : flatNodes) { 500 ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; 501 final int oldNodeArraySize = ptNodeArray.mCachedSize; 502 final boolean changed = 503 computeActualPtNodeArraySize(ptNodeArray, dict, formatOptions); 504 final int newNodeArraySize = ptNodeArray.mCachedSize; 505 if (oldNodeArraySize < newNodeArraySize) { 506 throw new RuntimeException("Increased size ?!"); 507 } 508 ptNodeArrayStartOffset += newNodeArraySize; 509 changesDone |= changed; 510 } 511 updatePtNodeArraysCachedAddresses(flatNodes); 512 ++passes; 513 if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); 514 } while (changesDone); 515 516 if (formatOptions.mSupportsDynamicUpdate) { 517 computeParentAddresses(flatNodes); 518 } 519 final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1); 520 MakedictLog.i("Compression complete in " + passes + " passes."); 521 MakedictLog.i("After address compression : " 522 + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize)); 523 524 return flatNodes; 525 } 526 527 /** 528 * Sanity-checking method. 529 * 530 * This method checks a list of PtNode arrays for juxtaposition, that is, it will do 531 * nothing if each node array's cached address is actually the previous node array's address 532 * plus the previous node's size. 533 * If this is not the case, it will throw an exception. 534 * 535 * @param arrays the list of node arrays to check 536 */ 537 /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) { 538 int offset = 0; 539 int index = 0; 540 for (final PtNodeArray ptNodeArray : arrays) { 541 // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter 542 // which we use. 543 if (ptNodeArray.mCachedAddressAfterUpdate != offset) { 544 throw new RuntimeException("Wrong address for node " + index 545 + " : expected " + offset + ", got " + 546 ptNodeArray.mCachedAddressAfterUpdate); 547 } 548 ++index; 549 offset += ptNodeArray.mCachedSize; 550 } 551 } 552 553 /** 554 * Helper method to write a children position to a file. 555 * 556 * @param buffer the buffer to write to. 557 * @param index the index in the buffer to write the address to. 558 * @param position the position to write. 559 * @return the size in bytes the address actually took. 560 */ 561 /* package */ static int writeChildrenPosition(final byte[] buffer, int index, 562 final int position) { 563 switch (getByteSize(position)) { 564 case 1: 565 buffer[index++] = (byte)position; 566 return 1; 567 case 2: 568 buffer[index++] = (byte)(0xFF & (position >> 8)); 569 buffer[index++] = (byte)(0xFF & position); 570 return 2; 571 case 3: 572 buffer[index++] = (byte)(0xFF & (position >> 16)); 573 buffer[index++] = (byte)(0xFF & (position >> 8)); 574 buffer[index++] = (byte)(0xFF & position); 575 return 3; 576 case 0: 577 return 0; 578 default: 579 throw new RuntimeException("Position " + position + " has a strange size"); 580 } 581 } 582 583 /** 584 * Helper method to write a signed children position to a file. 585 * 586 * @param buffer the buffer to write to. 587 * @param index the index in the buffer to write the address to. 588 * @param position the position to write. 589 * @return the size in bytes the address actually took. 590 */ 591 /* package */ static int writeSignedChildrenPosition(final byte[] buffer, int index, 592 final int position) { 593 if (!BinaryDictIOUtils.hasChildrenAddress(position)) { 594 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; 595 } else { 596 final int absPosition = Math.abs(position); 597 buffer[index++] = 598 (byte)((position < 0 ? FormatSpec.MSB8 : 0) | (0xFF & (absPosition >> 16))); 599 buffer[index++] = (byte)(0xFF & (absPosition >> 8)); 600 buffer[index++] = (byte)(0xFF & absPosition); 601 } 602 return 3; 603 } 604 605 /** 606 * Makes the flag value for a PtNode. 607 * 608 * @param hasMultipleChars whether the PtNode has multiple chars. 609 * @param isTerminal whether the PtNode is terminal. 610 * @param childrenAddressSize the size of a children address. 611 * @param hasShortcuts whether the PtNode has shortcuts. 612 * @param hasBigrams whether the PtNode has bigrams. 613 * @param isNotAWord whether the PtNode is not a word. 614 * @param isBlackListEntry whether the PtNode is a blacklist entry. 615 * @param formatOptions file format options. 616 * @return the flags 617 */ 618 static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal, 619 final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, 620 final boolean isNotAWord, final boolean isBlackListEntry, 621 final FormatOptions formatOptions) { 622 byte flags = 0; 623 if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; 624 if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; 625 if (formatOptions.mSupportsDynamicUpdate) { 626 flags |= FormatSpec.FLAG_IS_NOT_MOVED; 627 } else if (true) { 628 switch (childrenAddressSize) { 629 case 1: 630 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE; 631 break; 632 case 2: 633 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES; 634 break; 635 case 3: 636 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES; 637 break; 638 case 0: 639 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS; 640 break; 641 default: 642 throw new RuntimeException("Node with a strange address"); 643 } 644 } 645 if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; 646 if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; 647 if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; 648 if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED; 649 return flags; 650 } 651 652 /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset, 653 final FormatOptions formatOptions) { 654 return (byte) makePtNodeFlags(node.mChars.length > 1, node.mFrequency >= 0, 655 getByteSize(childrenOffset), 656 node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(), 657 node.mBigrams != null, node.mIsNotAWord, node.mIsBlacklistEntry, formatOptions); 658 } 659 660 /** 661 * Makes the flag value for a bigram. 662 * 663 * @param more whether there are more bigrams after this one. 664 * @param offset the offset of the bigram. 665 * @param bigramFrequency the frequency of the bigram, 0..255. 666 * @param unigramFrequency the unigram frequency of the same word, 0..255. 667 * @param word the second bigram, for debugging purposes 668 * @return the flags 669 */ 670 /* package */ static final int makeBigramFlags(final boolean more, final int offset, 671 int bigramFrequency, final int unigramFrequency, final String word) { 672 int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) 673 + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0); 674 switch (getByteSize(offset)) { 675 case 1: 676 bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE; 677 break; 678 case 2: 679 bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES; 680 break; 681 case 3: 682 bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES; 683 break; 684 default: 685 throw new RuntimeException("Strange offset size"); 686 } 687 if (unigramFrequency > bigramFrequency) { 688 MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word 689 + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " 690 + word + " is " + unigramFrequency); 691 bigramFrequency = unigramFrequency; 692 } 693 // We compute the difference between 255 (which means probability = 1) and the 694 // unigram score. We split this into a number of discrete steps. 695 // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 696 // represents an increase of 16 steps: a value of 15 will be interpreted as the median 697 // value of the 16th step. In all justice, if the bigram frequency is low enough to be 698 // rounded below the first step (which means it is less than half a step higher than the 699 // unigram frequency) then the unigram frequency itself is the best approximation of the 700 // bigram freq that we could possibly supply, hence we should *not* include this bigram 701 // in the file at all. 702 // until this is done, we'll write 0 and slightly overestimate this case. 703 // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step 704 // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to 705 // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the 706 // step size. Then we compute the start of the first step (the one where value 0 starts) 707 // by adding half-a-step to the unigramFrequency. From there, we compute the integer 708 // number of steps to the bigramFrequency. One last thing: we want our steps to include 709 // their lower bound and exclude their higher bound so we need to have the first step 710 // start at exactly 1 unit higher than floor(unigramFreq + half a step). 711 // Note : to reconstruct the score, the dictionary reader will need to divide 712 // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, 713 // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best 714 // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the 715 // step pointed by the discretized frequency. 716 final float stepSize = 717 (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) 718 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); 719 final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); 720 final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); 721 // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 722 // here. The best approximation would be the unigram freq itself, so we should not 723 // include this bigram in the dictionary. For now, register as 0, and live with the 724 // small over-estimation that we get in this case. TODO: actually remove this bigram 725 // if discretizedFrequency < 0. 726 final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0; 727 bigramFlags += finalBigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; 728 return bigramFlags; 729 } 730 731 /** 732 * Makes the 2-byte value for options flags. 733 */ 734 private static final int makeOptionsValue(final FusionDictionary dictionary, 735 final FormatOptions formatOptions) { 736 final DictionaryOptions options = dictionary.mOptions; 737 final boolean hasBigrams = dictionary.hasBigrams(); 738 return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0) 739 + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0) 740 + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0) 741 + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0); 742 } 743 744 /** 745 * Makes the flag value for a shortcut. 746 * 747 * @param more whether there are more attributes after this one. 748 * @param frequency the frequency of the attribute, 0..15 749 * @return the flags 750 */ 751 static final int makeShortcutFlags(final boolean more, final int frequency) { 752 return (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) 753 + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY); 754 } 755 756 /* package */ static final int writeParentAddress(final byte[] buffer, final int index, 757 final int address, final FormatOptions formatOptions) { 758 if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { 759 if (address == FormatSpec.NO_PARENT_ADDRESS) { 760 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; 761 } else { 762 final int absAddress = Math.abs(address); 763 assert(absAddress <= FormatSpec.SINT24_MAX); 764 buffer[index] = (byte)((address < 0 ? FormatSpec.MSB8 : 0) 765 | ((absAddress >> 16) & 0xFF)); 766 buffer[index + 1] = (byte)((absAddress >> 8) & 0xFF); 767 buffer[index + 2] = (byte)(absAddress & 0xFF); 768 } 769 return index + 3; 770 } else { 771 return index; 772 } 773 } 774 775 /* package */ static final int getChildrenPosition(final PtNode ptNode, 776 final FormatOptions formatOptions) { 777 int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate 778 + getNodeHeaderSize(ptNode, formatOptions); 779 if (ptNode.isTerminal()) { 780 // A terminal node has either the terminal id or the frequency. 781 // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children 782 // position. 783 if (formatOptions.mHasTerminalId) { 784 positionOfChildrenPosField += FormatSpec.PTNODE_TERMINAL_ID_SIZE; 785 } else { 786 positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; 787 } 788 } 789 return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS 790 : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField; 791 } 792 793 /** 794 * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached. 795 * 796 * @param dict the dictionary the node array is a part of (for relative offsets). 797 * @param dictEncoder the dictionary encoder. 798 * @param ptNodeArray the node array to write. 799 * @param formatOptions file format options. 800 */ 801 @SuppressWarnings("unused") 802 /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, 803 final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, 804 final FormatOptions formatOptions) { 805 // TODO: Make the code in common with BinaryDictIOUtils#writePtNode 806 dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); 807 808 final int ptNodeCount = ptNodeArray.mData.size(); 809 dictEncoder.writePtNodeCount(ptNodeCount); 810 final int parentPosition = 811 (ptNodeArray.mCachedParentAddress == FormatSpec.NO_PARENT_ADDRESS) 812 ? FormatSpec.NO_PARENT_ADDRESS 813 : ptNodeArray.mCachedParentAddress + ptNodeArray.mCachedAddressAfterUpdate; 814 for (int i = 0; i < ptNodeCount; ++i) { 815 final PtNode ptNode = ptNodeArray.mData.get(i); 816 if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) { 817 throw new RuntimeException("Bug: write index is not the same as the cached address " 818 + "of the node : " + dictEncoder.getPosition() + " <> " 819 + ptNode.mCachedAddressAfterUpdate); 820 } 821 // Sanity checks. 822 if (DBG && ptNode.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) { 823 throw new RuntimeException("A node has a frequency > " 824 + FormatSpec.MAX_TERMINAL_FREQUENCY 825 + " : " + ptNode.mFrequency); 826 } 827 dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict); 828 } 829 if (formatOptions.mSupportsDynamicUpdate) { 830 dictEncoder.writeForwardLinkAddress(FormatSpec.NO_FORWARD_LINK_ADDRESS); 831 } 832 if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate 833 + ptNodeArray.mCachedSize) { 834 throw new RuntimeException("Not the same size : written " 835 + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate) 836 + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes"); 837 } 838 } 839 840 /** 841 * Dumps a collection of useful statistics about a list of PtNode arrays. 842 * 843 * This prints purely informative stuff, like the total estimated file size, the 844 * number of PtNode arrays, of PtNodes, the repartition of each address size, etc 845 * 846 * @param ptNodeArrays the list of PtNode arrays. 847 */ 848 /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) { 849 int firstTerminalAddress = Integer.MAX_VALUE; 850 int lastTerminalAddress = Integer.MIN_VALUE; 851 int size = 0; 852 int ptNodes = 0; 853 int maxNodes = 0; 854 int maxRuns = 0; 855 for (final PtNodeArray ptNodeArray : ptNodeArrays) { 856 if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size(); 857 for (final PtNode ptNode : ptNodeArray.mData) { 858 ++ptNodes; 859 if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length; 860 if (ptNode.mFrequency >= 0) { 861 if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress) 862 firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; 863 if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress) 864 lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; 865 } 866 } 867 if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) { 868 size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize; 869 } 870 } 871 final int[] ptNodeCounts = new int[maxNodes + 1]; 872 final int[] runCounts = new int[maxRuns + 1]; 873 for (final PtNodeArray ptNodeArray : ptNodeArrays) { 874 ++ptNodeCounts[ptNodeArray.mData.size()]; 875 for (final PtNode ptNode : ptNodeArray.mData) { 876 ++runCounts[ptNode.mChars.length]; 877 } 878 } 879 880 MakedictLog.i("Statistics:\n" 881 + " total file size " + size + "\n" 882 + " " + ptNodeArrays.size() + " node arrays\n" 883 + " " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size()) 884 + " PtNodes per node)\n" 885 + " first terminal at " + firstTerminalAddress + "\n" 886 + " last terminal at " + lastTerminalAddress + "\n" 887 + " PtNode stats : max = " + maxNodes); 888 for (int i = 0; i < ptNodeCounts.length; ++i) { 889 MakedictLog.i(" " + i + " : " + ptNodeCounts[i]); 890 } 891 MakedictLog.i(" Character run stats : max = " + maxRuns); 892 for (int i = 0; i < runCounts.length; ++i) { 893 MakedictLog.i(" " + i + " : " + runCounts[i]); 894 } 895 } 896 897 /** 898 * Writes a file header to an output stream. 899 * 900 * @param destination the stream to write the file header to. 901 * @param dict the dictionary to write. 902 * @param formatOptions file format options. 903 * @return the size of the header. 904 */ 905 /* package */ static int writeDictionaryHeader(final OutputStream destination, 906 final FusionDictionary dict, final FormatOptions formatOptions) 907 throws IOException, UnsupportedFormatException { 908 final int version = formatOptions.mVersion; 909 if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION 910 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { 911 throw new UnsupportedFormatException("Requested file format version " + version 912 + ", but this implementation only supports versions " 913 + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through " 914 + FormatSpec.MAXIMUM_SUPPORTED_VERSION); 915 } 916 917 ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); 918 919 // The magic number in big-endian order. 920 // Magic number for all versions. 921 headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24))); 922 headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16))); 923 headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8))); 924 headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER)); 925 // Dictionary version. 926 headerBuffer.write((byte) (0xFF & (version >> 8))); 927 headerBuffer.write((byte) (0xFF & version)); 928 929 // Options flags 930 final int options = makeOptionsValue(dict, formatOptions); 931 headerBuffer.write((byte) (0xFF & (options >> 8))); 932 headerBuffer.write((byte) (0xFF & options)); 933 final int headerSizeOffset = headerBuffer.size(); 934 // Placeholder to be written later with header size. 935 for (int i = 0; i < 4; ++i) { 936 headerBuffer.write(0); 937 } 938 // Write out the options. 939 for (final String key : dict.mOptions.mAttributes.keySet()) { 940 final String value = dict.mOptions.mAttributes.get(key); 941 CharEncoding.writeString(headerBuffer, key); 942 CharEncoding.writeString(headerBuffer, value); 943 } 944 final int size = headerBuffer.size(); 945 final byte[] bytes = headerBuffer.toByteArray(); 946 // Write out the header size. 947 bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); 948 bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); 949 bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); 950 bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); 951 destination.write(bytes); 952 953 headerBuffer.close(); 954 return size; 955 } 956 } 957