1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import com.android.inputmethod.latin.FusionDictionary.CharGroup; 20 import com.android.inputmethod.latin.FusionDictionary.Node; 21 import com.android.inputmethod.latin.FusionDictionary.WeightedString; 22 23 import java.io.FileNotFoundException; 24 import java.io.IOException; 25 import java.io.OutputStream; 26 import java.io.RandomAccessFile; 27 import java.util.ArrayList; 28 import java.util.Arrays; 29 import java.util.Map; 30 import java.util.TreeMap; 31 32 /** 33 * Reads and writes XML files for a FusionDictionary. 34 * 35 * All the methods in this class are static. 36 */ 37 public class BinaryDictInputOutput { 38 39 /* Node layout is as follows: 40 * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE 41 * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 42 * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE 43 * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES 44 * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES 45 * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS 46 * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL 47 * | reserved 1 bit, 1 = yes, 0 = no 48 * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS 49 * 50 * c | IF FLAG_HAS_MULTIPLE_CHARS 51 * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers 52 * a | end 1 byte, = 0 53 * r | ELSE 54 * s | char 1 or 3 bytes 55 * | END 56 * 57 * f | 58 * r | IF FLAG_IS_TERMINAL 59 * e | frequency 1 byte 60 * q | 61 * 62 * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType 63 * h | // nothing 64 * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType 65 * l | children address, 1 byte 66 * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType 67 * r | children address, 2 bytes 68 * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType 69 * n | children address, 3 bytes 70 * A | END 71 * d 72 * dress 73 * 74 * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS 75 * | bigrams address list 76 * 77 * Char format is: 78 * 1 byte = bbbbbbbb match 79 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 80 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 81 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 82 * 00011111 would be outside unicode. 83 * else: iso-latin-1 code 84 * This allows for the whole unicode range to be encoded, including chars outside of 85 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 86 * characters which should never happen anyway (and still work, but take 3 bytes). 87 * 88 * bigram and shortcut address list is: 89 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 90 * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE 91 * | 1 = must take -address, 0 = must take +address 92 * | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE 93 * | addressFormat = 2 bits, 00 = unused : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 94 * | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 95 * | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES 96 * | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES 97 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 98 * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat) 99 * | read 1 byte, add top 4 bits 100 * | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat) 101 * | read 2 bytes, add top 4 bits 102 * | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat 103 * | read 3 bytes, add top 4 bits 104 * | END 105 * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address 106 * if (FLAG_ATTRIBUTE_HAS_NET) goto bigram_and_shortcut_address_list_is 107 * 108 */ 109 110 private static final int MAGIC_NUMBER = 0x78B1; 111 private static final int VERSION = 1; 112 private static final int MAXIMUM_SUPPORTED_VERSION = VERSION; 113 // No options yet, reserved for future use. 114 private static final int OPTIONS = 0; 115 116 // TODO: Make this value adaptative to content data, store it in the header, and 117 // use it in the reading code. 118 private static final int MAX_WORD_LENGTH = 48; 119 120 private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; 121 private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 122 private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 123 private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 124 private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 125 126 private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; 127 128 private static final int FLAG_IS_TERMINAL = 0x10; 129 private static final int FLAG_HAS_BIGRAMS = 0x04; 130 131 private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 132 private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 133 private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 134 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 135 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 136 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 137 private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F; 138 139 private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F; 140 141 private static final int GROUP_COUNT_SIZE = 1; 142 private static final int GROUP_TERMINATOR_SIZE = 1; 143 private static final int GROUP_FLAGS_SIZE = 1; 144 private static final int GROUP_FREQUENCY_SIZE = 1; 145 private static final int GROUP_MAX_ADDRESS_SIZE = 3; 146 private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1; 147 private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; 148 149 private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; 150 private static final int INVALID_CHARACTER = -1; 151 152 // Limiting to 127 for upward compatibility 153 // TODO: implement a scheme to be able to shoot 256 chargroups in a node 154 private static final int MAX_CHARGROUPS_IN_A_NODE = 127; 155 156 private static final int MAX_TERMINAL_FREQUENCY = 255; 157 158 /** 159 * A class grouping utility function for our specific character encoding. 160 */ 161 private static class CharEncoding { 162 163 private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 164 private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; 165 166 /** 167 * Helper method to find out whether this code fits on one byte 168 */ 169 private static boolean fitsOnOneByte(int character) { 170 return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE 171 && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; 172 } 173 174 /** 175 * Compute the size of a character given its character code. 176 * 177 * Char format is: 178 * 1 byte = bbbbbbbb match 179 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 180 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 181 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 182 * 00011111 would be outside unicode. 183 * else: iso-latin-1 code 184 * This allows for the whole unicode range to be encoded, including chars outside of 185 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 186 * characters which should never happen anyway (and still work, but take 3 bytes). 187 * 188 * @param character the character code. 189 * @return the size in binary encoded-form, either 1 or 3 bytes. 190 */ 191 private static int getCharSize(int character) { 192 // See char encoding in FusionDictionary.java 193 if (fitsOnOneByte(character)) return 1; 194 if (INVALID_CHARACTER == character) return 1; 195 return 3; 196 } 197 198 /** 199 * Compute the byte size of a character array. 200 */ 201 private static int getCharArraySize(final int[] chars) { 202 int size = 0; 203 for (int character : chars) size += getCharSize(character); 204 return size; 205 } 206 207 /** 208 * Writes a char array to a byte buffer. 209 * 210 * @param characters the character array to write. 211 * @param buffer the byte buffer to write to. 212 * @param index the index in buffer to write the character array to. 213 * @return the index after the last character. 214 */ 215 private static int writeCharArray(int[] characters, byte[] buffer, int index) { 216 for (int character : characters) { 217 if (1 == getCharSize(character)) { 218 buffer[index++] = (byte)character; 219 } else { 220 buffer[index++] = (byte)(0xFF & (character >> 16)); 221 buffer[index++] = (byte)(0xFF & (character >> 8)); 222 buffer[index++] = (byte)(0xFF & character); 223 } 224 } 225 return index; 226 } 227 228 /** 229 * Reads a character from the file. 230 * 231 * This follows the character format documented earlier in this source file. 232 * 233 * @param source the file, positioned over an encoded character. 234 * @return the character code. 235 */ 236 private static int readChar(RandomAccessFile source) throws IOException { 237 int character = source.readUnsignedByte(); 238 if (!fitsOnOneByte(character)) { 239 if (GROUP_CHARACTERS_TERMINATOR == character) 240 return INVALID_CHARACTER; 241 character <<= 16; 242 character += source.readUnsignedShort(); 243 } 244 return character; 245 } 246 } 247 248 /** 249 * Compute the binary size of the character array in a group 250 * 251 * If only one character, this is the size of this character. If many, it's the sum of their 252 * sizes + 1 byte for the terminator. 253 * 254 * @param group the group 255 * @return the size of the char array, including the terminator if any 256 */ 257 private static int getGroupCharactersSize(CharGroup group) { 258 int size = CharEncoding.getCharArraySize(group.mChars); 259 if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE; 260 return size; 261 } 262 263 /** 264 * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. 265 * 266 * @param group the CharGroup to compute the size of. 267 * @return the maximum size of the group. 268 */ 269 private static int getCharGroupMaximumSize(CharGroup group) { 270 int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE; 271 // If terminal, one byte for the frequency 272 if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; 273 size += GROUP_MAX_ADDRESS_SIZE; // For children address 274 if (null != group.mBigrams) { 275 for (WeightedString bigram : group.mBigrams) { 276 size += GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE; 277 } 278 } 279 return size; 280 } 281 282 /** 283 * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches 284 * it in the 'actualSize' member of the node. 285 * 286 * @param node the node to compute the maximum size of. 287 */ 288 private static void setNodeMaximumSize(Node node) { 289 int size = GROUP_COUNT_SIZE; 290 for (CharGroup g : node.mData) { 291 final int groupSize = getCharGroupMaximumSize(g); 292 g.mCachedSize = groupSize; 293 size += groupSize; 294 } 295 node.mCachedSize = size; 296 } 297 298 /** 299 * Helper method to hide the actual value of the no children address. 300 */ 301 private static boolean hasChildrenAddress(int address) { 302 return NO_CHILDREN_ADDRESS != address; 303 } 304 305 /** 306 * Compute the size, in bytes, that an address will occupy. 307 * 308 * This can be used either for children addresses (which are always positive) or for 309 * attribute, which may be positive or negative but 310 * store their sign bit separately. 311 * 312 * @param address the address 313 * @return the byte size. 314 */ 315 private static int getByteSize(int address) { 316 assert(address < 0x1000000); 317 if (!hasChildrenAddress(address)) { 318 return 0; 319 } else if (Math.abs(address) < 0x100) { 320 return 1; 321 } else if (Math.abs(address) < 0x10000) { 322 return 2; 323 } else { 324 return 3; 325 } 326 } 327 // End utility methods. 328 329 // This method is responsible for finding a nice ordering of the nodes that favors run-time 330 // cache performance and dictionary size. 331 /* package for tests */ static ArrayList<Node> flattenTree(Node root) { 332 final int treeSize = FusionDictionary.countCharGroups(root); 333 MakedictLog.i("Counted nodes : " + treeSize); 334 final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize); 335 return flattenTreeInner(flatTree, root); 336 } 337 338 private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) { 339 // Removing the node is necessary if the tails are merged, because we would then 340 // add the same node several times when we only want it once. A number of places in 341 // the code also depends on any node being only once in the list. 342 // Merging tails can only be done if there are no attributes. Searching for attributes 343 // in LatinIME code depends on a total breadth-first ordering, which merging tails 344 // breaks. If there are no attributes, it should be fine (and reduce the file size) 345 // to merge tails, and the following step would be necessary. 346 // If eventually the code runs on Android, searching through the whole array each time 347 // may be a performance concern. 348 list.remove(node); 349 list.add(node); 350 final ArrayList<CharGroup> branches = node.mData; 351 final int nodeSize = branches.size(); 352 for (CharGroup group : branches) { 353 if (null != group.mChildren) flattenTreeInner(list, group.mChildren); 354 } 355 return list; 356 } 357 358 /** 359 * Finds the absolute address of a word in the dictionary. 360 * 361 * @param dict the dictionary in which to search. 362 * @param word the word we are searching for. 363 * @return the word address. If it is not found, an exception is thrown. 364 */ 365 private static int findAddressOfWord(final FusionDictionary dict, final String word) { 366 return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress; 367 } 368 369 /** 370 * Computes the actual node size, based on the cached addresses of the children nodes. 371 * 372 * Each node stores its tentative address. During dictionary address computing, these 373 * are not final, but they can be used to compute the node size (the node size depends 374 * on the address of the children because the number of bytes necessary to store an 375 * address depends on its numeric value. 376 * 377 * @param node the node to compute the size of. 378 * @param dict the dictionary in which the word/attributes are to be found. 379 */ 380 private static void computeActualNodeSize(Node node, FusionDictionary dict) { 381 int size = GROUP_COUNT_SIZE; 382 for (CharGroup group : node.mData) { 383 int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 384 if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; 385 if (null != group.mChildren) { 386 final int offsetBasePoint= groupSize + node.mCachedAddress + size; 387 final int offset = group.mChildren.mCachedAddress - offsetBasePoint; 388 groupSize += getByteSize(offset); 389 } 390 if (null != group.mBigrams) { 391 for (WeightedString bigram : group.mBigrams) { 392 final int offsetBasePoint = groupSize + node.mCachedAddress + size 393 + GROUP_FLAGS_SIZE; 394 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 395 final int offset = addressOfBigram - offsetBasePoint; 396 groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; 397 } 398 } 399 group.mCachedSize = groupSize; 400 size += groupSize; 401 } 402 node.mCachedSize = size; 403 } 404 405 /** 406 * Computes the byte size of a list of nodes and updates each node cached position. 407 * 408 * @param flatNodes the array of nodes. 409 * @return the byte size of the entire stack. 410 */ 411 private static int stackNodes(ArrayList<Node> flatNodes) { 412 int nodeOffset = 0; 413 for (Node n : flatNodes) { 414 n.mCachedAddress = nodeOffset; 415 int groupOffset = 0; 416 for (CharGroup g : n.mData) { 417 g.mCachedAddress = GROUP_COUNT_SIZE + nodeOffset + groupOffset; 418 groupOffset += g.mCachedSize; 419 } 420 if (groupOffset + GROUP_COUNT_SIZE != n.mCachedSize) { 421 throw new RuntimeException("Bug : Stored and computed node size differ"); 422 } 423 nodeOffset += n.mCachedSize; 424 } 425 return nodeOffset; 426 } 427 428 /** 429 * Compute the addresses and sizes of an ordered node array. 430 * 431 * This method takes a node array and will update its cached address and size values 432 * so that they can be written into a file. It determines the smallest size each of the 433 * nodes can be given the addresses of its children and attributes, and store that into 434 * each node. 435 * The order of the node is given by the order of the array. This method makes no effort 436 * to find a good order; it only mechanically computes the size this order results in. 437 * 438 * @param dict the dictionary 439 * @param flatNodes the ordered array of nodes 440 * @return the same array it was passed. The nodes have been updated for address and size. 441 */ 442 private static ArrayList<Node> computeAddresses(FusionDictionary dict, 443 ArrayList<Node> flatNodes) { 444 // First get the worst sizes and offsets 445 for (Node n : flatNodes) setNodeMaximumSize(n); 446 final int offset = stackNodes(flatNodes); 447 448 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 449 MakedictLog.i("(Recursively seen size : " + offset + ")"); 450 451 int passes = 0; 452 boolean changesDone = false; 453 do { 454 changesDone = false; 455 for (Node n : flatNodes) { 456 final int oldNodeSize = n.mCachedSize; 457 computeActualNodeSize(n, dict); 458 final int newNodeSize = n.mCachedSize; 459 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); 460 if (oldNodeSize != newNodeSize) changesDone = true; 461 } 462 stackNodes(flatNodes); 463 ++passes; 464 } while (changesDone); 465 466 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 467 MakedictLog.i("Compression complete in " + passes + " passes."); 468 MakedictLog.i("After address compression : " 469 + (lastNode.mCachedAddress + lastNode.mCachedSize)); 470 471 return flatNodes; 472 } 473 474 /** 475 * Sanity-checking method. 476 * 477 * This method checks an array of node for juxtaposition, that is, it will do 478 * nothing if each node's cached address is actually the previous node's address 479 * plus the previous node's size. 480 * If this is not the case, it will throw an exception. 481 * 482 * @param array the array node to check 483 */ 484 private static void checkFlatNodeArray(ArrayList<Node> array) { 485 int offset = 0; 486 int index = 0; 487 for (Node n : array) { 488 if (n.mCachedAddress != offset) { 489 throw new RuntimeException("Wrong address for node " + index 490 + " : expected " + offset + ", got " + n.mCachedAddress); 491 } 492 ++index; 493 offset += n.mCachedSize; 494 } 495 } 496 497 /** 498 * Helper method to write a variable-size address to a file. 499 * 500 * @param buffer the buffer to write to. 501 * @param index the index in the buffer to write the address to. 502 * @param address the address to write. 503 * @return the size in bytes the address actually took. 504 */ 505 private static int writeVariableAddress(byte[] buffer, int index, int address) { 506 switch (getByteSize(address)) { 507 case 1: 508 buffer[index++] = (byte)address; 509 return 1; 510 case 2: 511 buffer[index++] = (byte)(0xFF & (address >> 8)); 512 buffer[index++] = (byte)(0xFF & address); 513 return 2; 514 case 3: 515 buffer[index++] = (byte)(0xFF & (address >> 16)); 516 buffer[index++] = (byte)(0xFF & (address >> 8)); 517 buffer[index++] = (byte)(0xFF & address); 518 return 3; 519 case 0: 520 return 0; 521 default: 522 throw new RuntimeException("Address " + address + " has a strange size"); 523 } 524 } 525 526 private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, 527 final int childrenOffset) { 528 byte flags = 0; 529 if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS; 530 if (group.mFrequency >= 0) { 531 flags |= FLAG_IS_TERMINAL; 532 } 533 if (null != group.mChildren) { 534 switch (getByteSize(childrenOffset)) { 535 case 1: 536 flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; 537 break; 538 case 2: 539 flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; 540 break; 541 case 3: 542 flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; 543 break; 544 default: 545 throw new RuntimeException("Node with a strange address"); 546 } 547 } 548 if (null != group.mBigrams) flags |= FLAG_HAS_BIGRAMS; 549 return flags; 550 } 551 552 /** 553 * Makes the flag value for an attribute. 554 * 555 * @param more whether there are more attributes after this one. 556 * @param offset the offset of the attribute. 557 * @param frequency the frequency of the attribute, 0..15 558 * @return the flags 559 */ 560 private static final int makeAttributeFlags(final boolean more, final int offset, 561 final int frequency) { 562 int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) 563 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); 564 switch (getByteSize(offset)) { 565 case 1: 566 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 567 break; 568 case 2: 569 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 570 break; 571 case 3: 572 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 573 break; 574 default: 575 throw new RuntimeException("Strange offset size"); 576 } 577 bigramFlags += frequency & FLAG_ATTRIBUTE_FREQUENCY; 578 return bigramFlags; 579 } 580 581 /** 582 * Write a node to memory. The node is expected to have its final position cached. 583 * 584 * This can be an empty map, but the more is inside the faster the lookups will be. It can 585 * be carried on as long as nodes do not move. 586 * 587 * @param dict the dictionary the node is a part of (for relative offsets). 588 * @param buffer the memory buffer to write to. 589 * @param node the node to write. 590 * @return the address of the END of the node. 591 */ 592 private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) { 593 int index = node.mCachedAddress; 594 595 final int size = node.mData.size(); 596 if (size > MAX_CHARGROUPS_IN_A_NODE) 597 throw new RuntimeException("A node has a group count over 127 (" + size + ")."); 598 599 buffer[index++] = (byte)size; 600 int groupAddress = index; 601 for (int i = 0; i < size; ++i) { 602 CharGroup group = node.mData.get(i); 603 if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " 604 + "the same as the cached address of the group"); 605 groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 606 // Sanity checks. 607 if (group.mFrequency > MAX_TERMINAL_FREQUENCY) { 608 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY 609 + " : " + group.mFrequency); 610 } 611 if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE; 612 final int childrenOffset = null == group.mChildren 613 ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; 614 byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); 615 buffer[index++] = flags; 616 index = CharEncoding.writeCharArray(group.mChars, buffer, index); 617 if (group.hasSeveralChars()) { 618 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 619 } 620 if (group.mFrequency >= 0) { 621 buffer[index++] = (byte) group.mFrequency; 622 } 623 final int shift = writeVariableAddress(buffer, index, childrenOffset); 624 index += shift; 625 groupAddress += shift; 626 627 // Write bigrams 628 if (null != group.mBigrams) { 629 int remainingBigrams = group.mBigrams.size(); 630 for (WeightedString bigram : group.mBigrams) { 631 boolean more = remainingBigrams > 1; 632 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 633 ++groupAddress; 634 final int offset = addressOfBigram - groupAddress; 635 int bigramFlags = makeAttributeFlags(more, offset, bigram.mFrequency); 636 buffer[index++] = (byte)bigramFlags; 637 final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); 638 index += bigramShift; 639 groupAddress += bigramShift; 640 --remainingBigrams; 641 } 642 } 643 644 } 645 if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( 646 "Not the same size : written " 647 + (index - node.mCachedAddress) + " bytes out of a node that should have " 648 + node.mCachedSize + " bytes"); 649 return index; 650 } 651 652 /** 653 * Dumps a collection of useful statistics about a node array. 654 * 655 * This prints purely informative stuff, like the total estimated file size, the 656 * number of nodes, of character groups, the repartition of each address size, etc 657 * 658 * @param nodes the node array. 659 */ 660 private static void showStatistics(ArrayList<Node> nodes) { 661 int firstTerminalAddress = Integer.MAX_VALUE; 662 int lastTerminalAddress = Integer.MIN_VALUE; 663 int size = 0; 664 int charGroups = 0; 665 int maxGroups = 0; 666 int maxRuns = 0; 667 for (Node n : nodes) { 668 if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); 669 for (CharGroup cg : n.mData) { 670 ++charGroups; 671 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; 672 if (cg.mFrequency >= 0) { 673 if (n.mCachedAddress < firstTerminalAddress) 674 firstTerminalAddress = n.mCachedAddress; 675 if (n.mCachedAddress > lastTerminalAddress) 676 lastTerminalAddress = n.mCachedAddress; 677 } 678 } 679 if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; 680 } 681 final int[] groupCounts = new int[maxGroups + 1]; 682 final int[] runCounts = new int[maxRuns + 1]; 683 for (Node n : nodes) { 684 ++groupCounts[n.mData.size()]; 685 for (CharGroup cg : n.mData) { 686 ++runCounts[cg.mChars.length]; 687 } 688 } 689 690 MakedictLog.i("Statistics:\n" 691 + " total file size " + size + "\n" 692 + " " + nodes.size() + " nodes\n" 693 + " " + charGroups + " groups (" + ((float)charGroups / nodes.size()) 694 + " groups per node)\n" 695 + " first terminal at " + firstTerminalAddress + "\n" 696 + " last terminal at " + lastTerminalAddress + "\n" 697 + " Group stats : max = " + maxGroups); 698 for (int i = 0; i < groupCounts.length; ++i) { 699 MakedictLog.i(" " + i + " : " + groupCounts[i]); 700 } 701 MakedictLog.i(" Character run stats : max = " + maxRuns); 702 for (int i = 0; i < runCounts.length; ++i) { 703 MakedictLog.i(" " + i + " : " + runCounts[i]); 704 } 705 } 706 707 /** 708 * Dumps a FusionDictionary to a file. 709 * 710 * This is the public entry point to write a dictionary to a file. 711 * 712 * @param destination the stream to write the binary data to. 713 * @param dict the dictionary to write. 714 */ 715 public static void writeDictionaryBinary(OutputStream destination, FusionDictionary dict) 716 throws IOException { 717 718 // Addresses are limited to 3 bytes, so we'll just make a 16MB buffer. Since addresses 719 // can be relative to each node, the structure itself is not limited to 16MB at all, but 720 // I doubt this will ever be shot. If it is, deciding the order of the nodes becomes 721 // a quite complicated problem, because though the dictionary itself does not have a 722 // size limit, each node must still be within 16MB of all its children and parents. 723 // As long as this is ensured, the dictionary file may grow to any size. 724 // Anyway, to make a dictionary bigger than 16MB just increase the size of this buffer. 725 final byte[] buffer = new byte[1 << 24]; 726 int index = 0; 727 728 // Magic number in big-endian order. 729 buffer[index++] = (byte) (0xFF & (MAGIC_NUMBER >> 8)); 730 buffer[index++] = (byte) (0xFF & MAGIC_NUMBER); 731 // Dictionary version. 732 buffer[index++] = (byte) (0xFF & VERSION); 733 // Options flags 734 buffer[index++] = (byte) (0xFF & (OPTIONS >> 8)); 735 buffer[index++] = (byte) (0xFF & OPTIONS); 736 737 // Should we include the locale and title of the dictionary ? 738 739 destination.write(buffer, 0, index); 740 index = 0; 741 742 // Leave the choice of the optimal node order to the flattenTree function. 743 MakedictLog.i("Flattening the tree..."); 744 ArrayList<Node> flatNodes = flattenTree(dict.mRoot); 745 746 MakedictLog.i("Computing addresses..."); 747 computeAddresses(dict, flatNodes); 748 MakedictLog.i("Checking array..."); 749 checkFlatNodeArray(flatNodes); 750 751 MakedictLog.i("Writing file..."); 752 int dataEndOffset = 0; 753 for (Node n : flatNodes) { 754 dataEndOffset = writePlacedNode(dict, buffer, n); 755 } 756 757 showStatistics(flatNodes); 758 759 destination.write(buffer, 0, dataEndOffset); 760 761 destination.close(); 762 MakedictLog.i("Done"); 763 } 764 765 766 // Input methods: Read a binary dictionary to memory. 767 // readDictionaryBinary is the public entry point for them. 768 769 static final int[] characterBuffer = new int[MAX_WORD_LENGTH]; 770 private static CharGroupInfo readCharGroup(RandomAccessFile source, 771 final int originalGroupAddress) throws IOException { 772 int addressPointer = originalGroupAddress; 773 final int flags = source.readUnsignedByte(); 774 ++addressPointer; 775 final int characters[]; 776 if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { 777 int index = 0; 778 int character = CharEncoding.readChar(source); 779 addressPointer += CharEncoding.getCharSize(character); 780 while (-1 != character) { 781 characterBuffer[index++] = character; 782 character = CharEncoding.readChar(source); 783 addressPointer += CharEncoding.getCharSize(character); 784 } 785 characters = Arrays.copyOfRange(characterBuffer, 0, index); 786 } else { 787 final int character = CharEncoding.readChar(source); 788 addressPointer += CharEncoding.getCharSize(character); 789 characters = new int[] { character }; 790 } 791 final int frequency; 792 if (0 != (FLAG_IS_TERMINAL & flags)) { 793 ++addressPointer; 794 frequency = source.readUnsignedByte(); 795 } else { 796 frequency = CharGroup.NOT_A_TERMINAL; 797 } 798 int childrenAddress = addressPointer; 799 switch (flags & MASK_GROUP_ADDRESS_TYPE) { 800 case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 801 childrenAddress += source.readUnsignedByte(); 802 addressPointer += 1; 803 break; 804 case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 805 childrenAddress += source.readUnsignedShort(); 806 addressPointer += 2; 807 break; 808 case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 809 childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort(); 810 addressPointer += 3; 811 break; 812 case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: 813 default: 814 childrenAddress = NO_CHILDREN_ADDRESS; 815 break; 816 } 817 ArrayList<PendingAttribute> bigrams = null; 818 if (0 != (flags & FLAG_HAS_BIGRAMS)) { 819 bigrams = new ArrayList<PendingAttribute>(); 820 boolean more = true; 821 while (more) { 822 int bigramFlags = source.readUnsignedByte(); 823 ++addressPointer; 824 more = (0 != (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)); 825 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; 826 int bigramAddress = addressPointer; 827 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { 828 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 829 bigramAddress += sign * source.readUnsignedByte(); 830 addressPointer += 1; 831 break; 832 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 833 bigramAddress += sign * source.readUnsignedShort(); 834 addressPointer += 2; 835 break; 836 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 837 final int offset = ((source.readUnsignedByte() << 16) 838 + source.readUnsignedShort()); 839 bigramAddress += sign * offset; 840 addressPointer += 3; 841 break; 842 default: 843 throw new RuntimeException("Has attribute with no address"); 844 } 845 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, 846 bigramAddress)); 847 } 848 } 849 return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, 850 childrenAddress, bigrams); 851 } 852 853 /** 854 * Finds, as a string, the word at the address passed as an argument. 855 * 856 * @param source the file to read from. 857 * @param headerSize the size of the header. 858 * @param address the address to seek. 859 * @return the word, as a string. 860 * @throws IOException if the file can't be read. 861 */ 862 private static String getWordAtAddress(RandomAccessFile source, long headerSize, 863 int address) throws IOException { 864 final long originalPointer = source.getFilePointer(); 865 source.seek(headerSize); 866 final int count = source.readUnsignedByte(); 867 int groupOffset = 1; // 1 for the group count 868 final StringBuilder builder = new StringBuilder(); 869 String result = null; 870 871 CharGroupInfo last = null; 872 for (int i = count - 1; i >= 0; --i) { 873 CharGroupInfo info = readCharGroup(source, groupOffset); 874 groupOffset = info.mEndAddress; 875 if (info.mOriginalAddress == address) { 876 builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); 877 result = builder.toString(); 878 break; // and return 879 } 880 if (hasChildrenAddress(info.mChildrenAddress)) { 881 if (info.mChildrenAddress > address) { 882 if (null == last) continue; 883 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 884 source.seek(last.mChildrenAddress + headerSize); 885 groupOffset = last.mChildrenAddress + 1; 886 i = source.readUnsignedByte(); 887 last = null; 888 continue; 889 } 890 last = info; 891 } 892 if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { 893 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 894 source.seek(last.mChildrenAddress + headerSize); 895 groupOffset = last.mChildrenAddress + 1; 896 i = source.readUnsignedByte(); 897 last = null; 898 continue; 899 } 900 } 901 source.seek(originalPointer); 902 return result; 903 } 904 905 /** 906 * Reads a single node from a binary file. 907 * 908 * This methods reads the file at the current position of its file pointer. A node is 909 * fully expected to start at the current position. 910 * This will recursively read other nodes into the structure, populating the reverse 911 * maps on the fly and using them to keep track of already read nodes. 912 * 913 * @param source the data file, correctly positioned at the start of a node. 914 * @param headerSize the size, in bytes, of the file header. 915 * @param reverseNodeMap a mapping from addresses to already read nodes. 916 * @param reverseGroupMap a mapping from addresses to already read character groups. 917 * @return the read node with all his children already read. 918 */ 919 private static Node readNode(RandomAccessFile source, long headerSize, 920 Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap) 921 throws IOException { 922 final int nodeOrigin = (int)(source.getFilePointer() - headerSize); 923 final int count = source.readUnsignedByte(); 924 final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>(); 925 int groupOffset = nodeOrigin + 1; // 1 byte for the group count 926 for (int i = count; i > 0; --i) { 927 CharGroupInfo info = readCharGroup(source, groupOffset); 928 ArrayList<WeightedString> bigrams = null; 929 if (null != info.mBigrams) { 930 bigrams = new ArrayList<WeightedString>(); 931 for (PendingAttribute bigram : info.mBigrams) { 932 final String word = getWordAtAddress(source, headerSize, bigram.mAddress); 933 bigrams.add(new WeightedString(word, bigram.mFrequency)); 934 } 935 } 936 if (hasChildrenAddress(info.mChildrenAddress)) { 937 Node children = reverseNodeMap.get(info.mChildrenAddress); 938 if (null == children) { 939 final long currentPosition = source.getFilePointer(); 940 source.seek(info.mChildrenAddress + headerSize); 941 children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap); 942 source.seek(currentPosition); 943 } 944 nodeContents.add( 945 new CharGroup(info.mCharacters, bigrams, info.mFrequency, 946 children)); 947 } else { 948 nodeContents.add( 949 new CharGroup(info.mCharacters, bigrams, info.mFrequency)); 950 } 951 groupOffset = info.mEndAddress; 952 } 953 final Node node = new Node(nodeContents); 954 node.mCachedAddress = nodeOrigin; 955 reverseNodeMap.put(node.mCachedAddress, node); 956 return node; 957 } 958 959 /** 960 * Reads a random access file and returns the memory representation of the dictionary. 961 * 962 * This high-level method takes a binary file and reads its contents, populating a 963 * FusionDictionary structure. The optional dict argument is an existing dictionary to 964 * which words from the file should be added. If it is null, a new dictionary is created. 965 * 966 * @param source the file to read. 967 * @param dict an optional dictionary to add words to, or null. 968 * @return the created (or merged) dictionary. 969 */ 970 public static FusionDictionary readDictionaryBinary(RandomAccessFile source, 971 FusionDictionary dict) throws IOException, UnsupportedFormatException { 972 // Check magic number 973 final int magic = source.readUnsignedShort(); 974 if (MAGIC_NUMBER != magic) { 975 throw new UnsupportedFormatException("The magic number in this file does not match " 976 + "the expected value"); 977 } 978 979 // Check file version 980 final int version = source.readUnsignedByte(); 981 if (version > MAXIMUM_SUPPORTED_VERSION) { 982 throw new UnsupportedFormatException("This file has version " + version 983 + ", but this implementation does not support versions above " 984 + MAXIMUM_SUPPORTED_VERSION); 985 } 986 987 // Read options 988 source.readUnsignedShort(); 989 990 long headerSize = source.getFilePointer(); 991 Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>(); 992 Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>(); 993 final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); 994 995 FusionDictionary newDict = new FusionDictionary(root, 996 new FusionDictionary.DictionaryOptions()); 997 if (null != dict) { 998 for (Word w : dict) { 999 newDict.add(w.mWord, w.mFrequency, w.mBigrams); 1000 } 1001 } 1002 1003 return newDict; 1004 } 1005 1006 /** 1007 * Basic test to find out whether the file is a binary dictionary or not. 1008 * 1009 * Concretely this only tests the magic number. 1010 * 1011 * @param filename The name of the file to test. 1012 * @return true if it's a binary dictionary, false otherwise 1013 */ 1014 public static boolean isBinaryDictionary(String filename) { 1015 try { 1016 RandomAccessFile f = new RandomAccessFile(filename, "r"); 1017 return MAGIC_NUMBER == f.readUnsignedShort(); 1018 } catch (FileNotFoundException e) { 1019 return false; 1020 } catch (IOException e) { 1021 return false; 1022 } 1023 } 1024 } 1025