Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
     20 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
     21 import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     23 
     24 import java.io.ByteArrayOutputStream;
     25 import java.io.FileNotFoundException;
     26 import java.io.IOException;
     27 import java.io.OutputStream;
     28 import java.io.RandomAccessFile;
     29 import java.util.ArrayList;
     30 import java.util.Arrays;
     31 import java.util.HashMap;
     32 import java.util.Iterator;
     33 import java.util.Map;
     34 import java.util.TreeMap;
     35 
     36 /**
     37  * Reads and writes XML files for a FusionDictionary.
     38  *
     39  * All the methods in this class are static.
     40  */
     41 public class BinaryDictInputOutput {
     42 
     43     final static boolean DBG = MakedictLog.DBG;
     44 
     45     /* Node layout is as follows:
     46      *   | addressType                         xx     : mask with MASK_GROUP_ADDRESS_TYPE
     47      *                                 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
     48      * f |                                     01 = 1 byte      : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
     49      * l |                                     10 = 2 bytes     : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
     50      * a |                                     11 = 3 bytes     : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
     51      * g | has several chars ?         1 bit, 1 = yes, 0 = no   : FLAG_HAS_MULTIPLE_CHARS
     52      * s | has a terminal ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_TERMINAL
     53      *   | has shortcut targets ?      1 bit, 1 = yes, 0 = no   : FLAG_HAS_SHORTCUT_TARGETS
     54      *   | has bigrams ?               1 bit, 1 = yes, 0 = no   : FLAG_HAS_BIGRAMS
     55      *
     56      * c | IF FLAG_HAS_MULTIPLE_CHARS
     57      * h |   char, char, char, char    n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
     58      * a |   end                       1 byte, = 0
     59      * r | ELSE
     60      * s |   char                      1 or 3 bytes
     61      *   | END
     62      *
     63      * f |
     64      * r | IF FLAG_IS_TERMINAL
     65      * e |   frequency                 1 byte
     66      * q |
     67      *
     68      * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType
     69      * h |   // nothing
     70      * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
     71      * l |   children address, 1 byte
     72      * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
     73      * r |   children address, 2 bytes
     74      * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType
     75      * n |   children address, 3 bytes
     76      * A | END
     77      * d
     78      * dress
     79      *
     80      *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
     81      *   | shortcut string list
     82      *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
     83      *   | bigrams address list
     84      *
     85      * Char format is:
     86      * 1 byte = bbbbbbbb match
     87      * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
     88      * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
     89      *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
     90      *       00011111 would be outside unicode.
     91      * else: iso-latin-1 code
     92      * This allows for the whole unicode range to be encoded, including chars outside of
     93      * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
     94      * characters which should never happen anyway (and still work, but take 3 bytes).
     95      *
     96      * bigram address list is:
     97      * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_ATTRIBUTE_HAS_NEXT
     98      *           | addressSign = 1 bit,                 : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
     99      *           |                      1 = must take -address, 0 = must take +address
    100      *           |                         xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
    101      *           | addressFormat = 2 bits, 00 = unused  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
    102      *           |                         01 = 1 byte  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
    103      *           |                         10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
    104      *           |                         11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
    105      *           | 4 bits : frequency         : mask with FLAG_ATTRIBUTE_FREQUENCY
    106      * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
    107      *           |   read 1 byte, add top 4 bits
    108      *           | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
    109      *           |   read 2 bytes, add top 4 bits
    110      *           | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
    111      *           |   read 3 bytes, add top 4 bits
    112      *           | END
    113      *           | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
    114      * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is
    115      *
    116      * shortcut string list is:
    117      * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
    118      * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
    119      *               | reserved = 3 bits, must be 0
    120      *               | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
    121      * <shortcut>  = | string of characters at the char format described above, with the terminator
    122      *               | used to signal the end of the string.
    123      * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags
    124      */
    125 
    126     private static final int VERSION_1_MAGIC_NUMBER = 0x78B1;
    127     private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
    128     private static final int MINIMUM_SUPPORTED_VERSION = 1;
    129     private static final int MAXIMUM_SUPPORTED_VERSION = 2;
    130     private static final int NOT_A_VERSION_NUMBER = -1;
    131     private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2;
    132 
    133     // These options need to be the same numeric values as the one in the native reading code.
    134     private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
    135     private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
    136     private static final int CONTAINS_BIGRAMS_FLAG = 0x8;
    137 
    138     // TODO: Make this value adaptative to content data, store it in the header, and
    139     // use it in the reading code.
    140     private static final int MAX_WORD_LENGTH = 48;
    141 
    142     private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
    143     private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
    144     private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
    145     private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
    146     private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
    147 
    148     private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
    149 
    150     private static final int FLAG_IS_TERMINAL = 0x10;
    151     private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
    152     private static final int FLAG_HAS_BIGRAMS = 0x04;
    153 
    154     private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
    155     private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
    156     private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
    157     private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
    158     private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
    159     private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
    160     private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
    161 
    162     private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
    163 
    164     private static final int GROUP_TERMINATOR_SIZE = 1;
    165     private static final int GROUP_FLAGS_SIZE = 1;
    166     private static final int GROUP_FREQUENCY_SIZE = 1;
    167     private static final int GROUP_MAX_ADDRESS_SIZE = 3;
    168     private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
    169     private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
    170     private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
    171 
    172     private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
    173     private static final int INVALID_CHARACTER = -1;
    174 
    175     private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
    176     private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
    177 
    178     private static final int MAX_TERMINAL_FREQUENCY = 255;
    179     private static final int MAX_BIGRAM_FREQUENCY = 15;
    180 
    181     // Arbitrary limit to how much passes we consider address size compression should
    182     // terminate in. At the time of this writing, our largest dictionary completes
    183     // compression in five passes.
    184     // If the number of passes exceeds this number, makedict bails with an exception on
    185     // suspicion that a bug might be causing an infinite loop.
    186     private static final int MAX_PASSES = 24;
    187 
    188     /**
    189      * A class grouping utility function for our specific character encoding.
    190      */
    191     private static class CharEncoding {
    192 
    193         private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    194         private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
    195 
    196         /**
    197          * Helper method to find out whether this code fits on one byte
    198          */
    199         private static boolean fitsOnOneByte(int character) {
    200             return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
    201                     && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
    202         }
    203 
    204         /**
    205          * Compute the size of a character given its character code.
    206          *
    207          * Char format is:
    208          * 1 byte = bbbbbbbb match
    209          * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
    210          * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
    211          *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
    212          *       00011111 would be outside unicode.
    213          * else: iso-latin-1 code
    214          * This allows for the whole unicode range to be encoded, including chars outside of
    215          * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
    216          * characters which should never happen anyway (and still work, but take 3 bytes).
    217          *
    218          * @param character the character code.
    219          * @return the size in binary encoded-form, either 1 or 3 bytes.
    220          */
    221         private static int getCharSize(int character) {
    222             // See char encoding in FusionDictionary.java
    223             if (fitsOnOneByte(character)) return 1;
    224             if (INVALID_CHARACTER == character) return 1;
    225             return 3;
    226         }
    227 
    228         /**
    229          * Compute the byte size of a character array.
    230          */
    231         private static int getCharArraySize(final int[] chars) {
    232             int size = 0;
    233             for (int character : chars) size += getCharSize(character);
    234             return size;
    235         }
    236 
    237         /**
    238          * Writes a char array to a byte buffer.
    239          *
    240          * @param codePoints the code point array to write.
    241          * @param buffer the byte buffer to write to.
    242          * @param index the index in buffer to write the character array to.
    243          * @return the index after the last character.
    244          */
    245         private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
    246             for (int codePoint : codePoints) {
    247                 if (1 == getCharSize(codePoint)) {
    248                     buffer[index++] = (byte)codePoint;
    249                 } else {
    250                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    251                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    252                     buffer[index++] = (byte)(0xFF & codePoint);
    253                 }
    254             }
    255             return index;
    256         }
    257 
    258         /**
    259          * Writes a string with our character format to a byte buffer.
    260          *
    261          * This will also write the terminator byte.
    262          *
    263          * @param buffer the byte buffer to write to.
    264          * @param origin the offset to write from.
    265          * @param word the string to write.
    266          * @return the size written, in bytes.
    267          */
    268         private static int writeString(final byte[] buffer, final int origin,
    269                 final String word) {
    270             final int length = word.length();
    271             int index = origin;
    272             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    273                 final int codePoint = word.codePointAt(i);
    274                 if (1 == getCharSize(codePoint)) {
    275                     buffer[index++] = (byte)codePoint;
    276                 } else {
    277                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    278                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    279                     buffer[index++] = (byte)(0xFF & codePoint);
    280                 }
    281             }
    282             buffer[index++] = GROUP_CHARACTERS_TERMINATOR;
    283             return index - origin;
    284         }
    285 
    286         /**
    287          * Writes a string with our character format to a ByteArrayOutputStream.
    288          *
    289          * This will also write the terminator byte.
    290          *
    291          * @param buffer the ByteArrayOutputStream to write to.
    292          * @param word the string to write.
    293          */
    294         private static void writeString(ByteArrayOutputStream buffer, final String word) {
    295             final int length = word.length();
    296             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    297                 final int codePoint = word.codePointAt(i);
    298                 if (1 == getCharSize(codePoint)) {
    299                     buffer.write((byte) codePoint);
    300                 } else {
    301                     buffer.write((byte) (0xFF & (codePoint >> 16)));
    302                     buffer.write((byte) (0xFF & (codePoint >> 8)));
    303                     buffer.write((byte) (0xFF & codePoint));
    304                 }
    305             }
    306             buffer.write(GROUP_CHARACTERS_TERMINATOR);
    307         }
    308 
    309         /**
    310          * Reads a string from a RandomAccessFile. This is the converse of the above method.
    311          */
    312         private static String readString(final RandomAccessFile source) throws IOException {
    313             final StringBuilder s = new StringBuilder();
    314             int character = readChar(source);
    315             while (character != INVALID_CHARACTER) {
    316                 s.appendCodePoint(character);
    317                 character = readChar(source);
    318             }
    319             return s.toString();
    320         }
    321 
    322         /**
    323          * Reads a character from the file.
    324          *
    325          * This follows the character format documented earlier in this source file.
    326          *
    327          * @param source the file, positioned over an encoded character.
    328          * @return the character code.
    329          */
    330         private static int readChar(RandomAccessFile source) throws IOException {
    331             int character = source.readUnsignedByte();
    332             if (!fitsOnOneByte(character)) {
    333                 if (GROUP_CHARACTERS_TERMINATOR == character)
    334                     return INVALID_CHARACTER;
    335                 character <<= 16;
    336                 character += source.readUnsignedShort();
    337             }
    338             return character;
    339         }
    340     }
    341 
    342     /**
    343      * Compute the binary size of the character array in a group
    344      *
    345      * If only one character, this is the size of this character. If many, it's the sum of their
    346      * sizes + 1 byte for the terminator.
    347      *
    348      * @param group the group
    349      * @return the size of the char array, including the terminator if any
    350      */
    351     private static int getGroupCharactersSize(CharGroup group) {
    352         int size = CharEncoding.getCharArraySize(group.mChars);
    353         if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE;
    354         return size;
    355     }
    356 
    357     /**
    358      * Compute the binary size of the group count
    359      * @param count the group count
    360      * @return the size of the group count, either 1 or 2 bytes.
    361      */
    362     private static int getGroupCountSize(final int count) {
    363         if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) {
    364             return 1;
    365         } else if (MAX_CHARGROUPS_IN_A_NODE >= count) {
    366             return 2;
    367         } else {
    368             throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE
    369                     + " groups in a node (found " + count +")");
    370         }
    371     }
    372 
    373     /**
    374      * Compute the binary size of the group count for a node
    375      * @param node the node
    376      * @return the size of the group count, either 1 or 2 bytes.
    377      */
    378     private static int getGroupCountSize(final Node node) {
    379         return getGroupCountSize(node.mData.size());
    380     }
    381 
    382     /**
    383      * Compute the size of a shortcut in bytes.
    384      */
    385     private static int getShortcutSize(final WeightedString shortcut) {
    386         int size = GROUP_ATTRIBUTE_FLAGS_SIZE;
    387         final String word = shortcut.mWord;
    388         final int length = word.length();
    389         for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    390             final int codePoint = word.codePointAt(i);
    391             size += CharEncoding.getCharSize(codePoint);
    392         }
    393         size += GROUP_TERMINATOR_SIZE;
    394         return size;
    395     }
    396 
    397     /**
    398      * Compute the size of a shortcut list in bytes.
    399      *
    400      * This is known in advance and does not change according to position in the file
    401      * like address lists do.
    402      */
    403     private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
    404         if (null == shortcutList) return 0;
    405         int size = GROUP_SHORTCUT_LIST_SIZE_SIZE;
    406         for (final WeightedString shortcut : shortcutList) {
    407             size += getShortcutSize(shortcut);
    408         }
    409         return size;
    410     }
    411 
    412     /**
    413      * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything.
    414      *
    415      * @param group the CharGroup to compute the size of.
    416      * @return the maximum size of the group.
    417      */
    418     private static int getCharGroupMaximumSize(CharGroup group) {
    419         int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE;
    420         // If terminal, one byte for the frequency
    421         if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE;
    422         size += GROUP_MAX_ADDRESS_SIZE; // For children address
    423         size += getShortcutListSize(group.mShortcutTargets);
    424         if (null != group.mBigrams) {
    425             size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
    426                     * group.mBigrams.size();
    427         }
    428         return size;
    429     }
    430 
    431     /**
    432      * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
    433      * it in the 'actualSize' member of the node.
    434      *
    435      * @param node the node to compute the maximum size of.
    436      */
    437     private static void setNodeMaximumSize(Node node) {
    438         int size = getGroupCountSize(node);
    439         for (CharGroup g : node.mData) {
    440             final int groupSize = getCharGroupMaximumSize(g);
    441             g.mCachedSize = groupSize;
    442             size += groupSize;
    443         }
    444         node.mCachedSize = size;
    445     }
    446 
    447     /**
    448      * Helper method to hide the actual value of the no children address.
    449      */
    450     private static boolean hasChildrenAddress(int address) {
    451         return NO_CHILDREN_ADDRESS != address;
    452     }
    453 
    454     /**
    455      * Compute the size, in bytes, that an address will occupy.
    456      *
    457      * This can be used either for children addresses (which are always positive) or for
    458      * attribute, which may be positive or negative but
    459      * store their sign bit separately.
    460      *
    461      * @param address the address
    462      * @return the byte size.
    463      */
    464     private static int getByteSize(int address) {
    465         assert(address < 0x1000000);
    466         if (!hasChildrenAddress(address)) {
    467             return 0;
    468         } else if (Math.abs(address) < 0x100) {
    469             return 1;
    470         } else if (Math.abs(address) < 0x10000) {
    471             return 2;
    472         } else {
    473             return 3;
    474         }
    475     }
    476     // End utility methods.
    477 
    478     // This method is responsible for finding a nice ordering of the nodes that favors run-time
    479     // cache performance and dictionary size.
    480     /* package for tests */ static ArrayList<Node> flattenTree(Node root) {
    481         final int treeSize = FusionDictionary.countCharGroups(root);
    482         MakedictLog.i("Counted nodes : " + treeSize);
    483         final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize);
    484         return flattenTreeInner(flatTree, root);
    485     }
    486 
    487     private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) {
    488         // Removing the node is necessary if the tails are merged, because we would then
    489         // add the same node several times when we only want it once. A number of places in
    490         // the code also depends on any node being only once in the list.
    491         // Merging tails can only be done if there are no attributes. Searching for attributes
    492         // in LatinIME code depends on a total breadth-first ordering, which merging tails
    493         // breaks. If there are no attributes, it should be fine (and reduce the file size)
    494         // to merge tails, and removing the node from the list would be necessary. However,
    495         // we don't merge tails because breaking the breadth-first ordering would result in
    496         // extreme overhead at bigram lookup time (it would make the search function O(n) instead
    497         // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty
    498         // high).
    499         // If no nodes are ever merged, we can't have the same node twice in the list, hence
    500         // searching for duplicates in unnecessary. It is also very performance consuming,
    501         // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making
    502         // this simple list.remove operation O(n*n) overall. On Android this overhead is very
    503         // high.
    504         // For future reference, the code to remove duplicate is a simple : list.remove(node);
    505         list.add(node);
    506         final ArrayList<CharGroup> branches = node.mData;
    507         final int nodeSize = branches.size();
    508         for (CharGroup group : branches) {
    509             if (null != group.mChildren) flattenTreeInner(list, group.mChildren);
    510         }
    511         return list;
    512     }
    513 
    514     /**
    515      * Finds the absolute address of a word in the dictionary.
    516      *
    517      * @param dict the dictionary in which to search.
    518      * @param word the word we are searching for.
    519      * @return the word address. If it is not found, an exception is thrown.
    520      */
    521     private static int findAddressOfWord(final FusionDictionary dict, final String word) {
    522         return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress;
    523     }
    524 
    525     /**
    526      * Computes the actual node size, based on the cached addresses of the children nodes.
    527      *
    528      * Each node stores its tentative address. During dictionary address computing, these
    529      * are not final, but they can be used to compute the node size (the node size depends
    530      * on the address of the children because the number of bytes necessary to store an
    531      * address depends on its numeric value. The return value indicates whether the node
    532      * contents (as in, any of the addresses stored in the cache fields) have changed with
    533      * respect to their previous value.
    534      *
    535      * @param node the node to compute the size of.
    536      * @param dict the dictionary in which the word/attributes are to be found.
    537      * @return false if none of the cached addresses inside the node changed, true otherwise.
    538      */
    539     private static boolean computeActualNodeSize(Node node, FusionDictionary dict) {
    540         boolean changed = false;
    541         int size = getGroupCountSize(node);
    542         for (CharGroup group : node.mData) {
    543             if (group.mCachedAddress != node.mCachedAddress + size) {
    544                 changed = true;
    545                 group.mCachedAddress = node.mCachedAddress + size;
    546             }
    547             int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
    548             if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE;
    549             if (null != group.mChildren) {
    550                 final int offsetBasePoint= groupSize + node.mCachedAddress + size;
    551                 final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
    552                 groupSize += getByteSize(offset);
    553             }
    554             groupSize += getShortcutListSize(group.mShortcutTargets);
    555             if (null != group.mBigrams) {
    556                 for (WeightedString bigram : group.mBigrams) {
    557                     final int offsetBasePoint = groupSize + node.mCachedAddress + size
    558                             + GROUP_FLAGS_SIZE;
    559                     final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
    560                     final int offset = addressOfBigram - offsetBasePoint;
    561                     groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE;
    562                 }
    563             }
    564             group.mCachedSize = groupSize;
    565             size += groupSize;
    566         }
    567         if (node.mCachedSize != size) {
    568             node.mCachedSize = size;
    569             changed = true;
    570         }
    571         return changed;
    572     }
    573 
    574     /**
    575      * Computes the byte size of a list of nodes and updates each node cached position.
    576      *
    577      * @param flatNodes the array of nodes.
    578      * @return the byte size of the entire stack.
    579      */
    580     private static int stackNodes(ArrayList<Node> flatNodes) {
    581         int nodeOffset = 0;
    582         for (Node n : flatNodes) {
    583             n.mCachedAddress = nodeOffset;
    584             int groupCountSize = getGroupCountSize(n);
    585             int groupOffset = 0;
    586             for (CharGroup g : n.mData) {
    587                 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset;
    588                 groupOffset += g.mCachedSize;
    589             }
    590             if (groupOffset + groupCountSize != n.mCachedSize) {
    591                 throw new RuntimeException("Bug : Stored and computed node size differ");
    592             }
    593             nodeOffset += n.mCachedSize;
    594         }
    595         return nodeOffset;
    596     }
    597 
    598     /**
    599      * Compute the addresses and sizes of an ordered node array.
    600      *
    601      * This method takes a node array and will update its cached address and size values
    602      * so that they can be written into a file. It determines the smallest size each of the
    603      * nodes can be given the addresses of its children and attributes, and store that into
    604      * each node.
    605      * The order of the node is given by the order of the array. This method makes no effort
    606      * to find a good order; it only mechanically computes the size this order results in.
    607      *
    608      * @param dict the dictionary
    609      * @param flatNodes the ordered array of nodes
    610      * @return the same array it was passed. The nodes have been updated for address and size.
    611      */
    612     private static ArrayList<Node> computeAddresses(FusionDictionary dict,
    613             ArrayList<Node> flatNodes) {
    614         // First get the worst sizes and offsets
    615         for (Node n : flatNodes) setNodeMaximumSize(n);
    616         final int offset = stackNodes(flatNodes);
    617 
    618         MakedictLog.i("Compressing the array addresses. Original size : " + offset);
    619         MakedictLog.i("(Recursively seen size : " + offset + ")");
    620 
    621         int passes = 0;
    622         boolean changesDone = false;
    623         do {
    624             changesDone = false;
    625             for (Node n : flatNodes) {
    626                 final int oldNodeSize = n.mCachedSize;
    627                 final boolean changed = computeActualNodeSize(n, dict);
    628                 final int newNodeSize = n.mCachedSize;
    629                 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
    630                 changesDone |= changed;
    631             }
    632             stackNodes(flatNodes);
    633             ++passes;
    634             if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
    635         } while (changesDone);
    636 
    637         final Node lastNode = flatNodes.get(flatNodes.size() - 1);
    638         MakedictLog.i("Compression complete in " + passes + " passes.");
    639         MakedictLog.i("After address compression : "
    640                 + (lastNode.mCachedAddress + lastNode.mCachedSize));
    641 
    642         return flatNodes;
    643     }
    644 
    645     /**
    646      * Sanity-checking method.
    647      *
    648      * This method checks an array of node for juxtaposition, that is, it will do
    649      * nothing if each node's cached address is actually the previous node's address
    650      * plus the previous node's size.
    651      * If this is not the case, it will throw an exception.
    652      *
    653      * @param array the array node to check
    654      */
    655     private static void checkFlatNodeArray(ArrayList<Node> array) {
    656         int offset = 0;
    657         int index = 0;
    658         for (Node n : array) {
    659             if (n.mCachedAddress != offset) {
    660                 throw new RuntimeException("Wrong address for node " + index
    661                         + " : expected " + offset + ", got " + n.mCachedAddress);
    662             }
    663             ++index;
    664             offset += n.mCachedSize;
    665         }
    666     }
    667 
    668     /**
    669      * Helper method to write a variable-size address to a file.
    670      *
    671      * @param buffer the buffer to write to.
    672      * @param index the index in the buffer to write the address to.
    673      * @param address the address to write.
    674      * @return the size in bytes the address actually took.
    675      */
    676     private static int writeVariableAddress(final byte[] buffer, int index, final int address) {
    677         switch (getByteSize(address)) {
    678         case 1:
    679             buffer[index++] = (byte)address;
    680             return 1;
    681         case 2:
    682             buffer[index++] = (byte)(0xFF & (address >> 8));
    683             buffer[index++] = (byte)(0xFF & address);
    684             return 2;
    685         case 3:
    686             buffer[index++] = (byte)(0xFF & (address >> 16));
    687             buffer[index++] = (byte)(0xFF & (address >> 8));
    688             buffer[index++] = (byte)(0xFF & address);
    689             return 3;
    690         case 0:
    691             return 0;
    692         default:
    693             throw new RuntimeException("Address " + address + " has a strange size");
    694         }
    695     }
    696 
    697     private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
    698             final int childrenOffset) {
    699         byte flags = 0;
    700         if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS;
    701         if (group.mFrequency >= 0) {
    702             flags |= FLAG_IS_TERMINAL;
    703         }
    704         if (null != group.mChildren) {
    705             switch (getByteSize(childrenOffset)) {
    706              case 1:
    707                  flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
    708                  break;
    709              case 2:
    710                  flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
    711                  break;
    712              case 3:
    713                  flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
    714                  break;
    715              default:
    716                  throw new RuntimeException("Node with a strange address");
    717              }
    718         }
    719         if (null != group.mShortcutTargets) {
    720             if (DBG && 0 == group.mShortcutTargets.size()) {
    721                 throw new RuntimeException("0-sized shortcut list must be null");
    722             }
    723             flags |= FLAG_HAS_SHORTCUT_TARGETS;
    724         }
    725         if (null != group.mBigrams) {
    726             if (DBG && 0 == group.mBigrams.size()) {
    727                 throw new RuntimeException("0-sized bigram list must be null");
    728             }
    729             flags |= FLAG_HAS_BIGRAMS;
    730         }
    731         return flags;
    732     }
    733 
    734     /**
    735      * Makes the flag value for a bigram.
    736      *
    737      * @param more whether there are more bigrams after this one.
    738      * @param offset the offset of the bigram.
    739      * @param bigramFrequency the frequency of the bigram, 0..255.
    740      * @param unigramFrequency the unigram frequency of the same word, 0..255.
    741      * @param word the second bigram, for debugging purposes
    742      * @return the flags
    743      */
    744     private static final int makeBigramFlags(final boolean more, final int offset,
    745             int bigramFrequency, final int unigramFrequency, final String word) {
    746         int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0)
    747                 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
    748         switch (getByteSize(offset)) {
    749         case 1:
    750             bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
    751             break;
    752         case 2:
    753             bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
    754             break;
    755         case 3:
    756             bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
    757             break;
    758         default:
    759             throw new RuntimeException("Strange offset size");
    760         }
    761         if (unigramFrequency > bigramFrequency) {
    762             MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word
    763                     + "\". Bigram freq is " + bigramFrequency + ", unigram freq for "
    764                     + word + " is " + unigramFrequency);
    765             bigramFrequency = unigramFrequency;
    766         }
    767         // We compute the difference between 255 (which means probability = 1) and the
    768         // unigram score. We split this into a number of discrete steps.
    769         // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15
    770         // represents an increase of 16 steps: a value of 15 will be interpreted as the median
    771         // value of the 16th step. In all justice, if the bigram frequency is low enough to be
    772         // rounded below the first step (which means it is less than half a step higher than the
    773         // unigram frequency) then the unigram frequency itself is the best approximation of the
    774         // bigram freq that we could possibly supply, hence we should *not* include this bigram
    775         // in the file at all.
    776         // until this is done, we'll write 0 and slightly overestimate this case.
    777         // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step
    778         // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to
    779         // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the
    780         // step size. Then we compute the start of the first step (the one where value 0 starts)
    781         // by adding half-a-step to the unigramFrequency. From there, we compute the integer
    782         // number of steps to the bigramFrequency. One last thing: we want our steps to include
    783         // their lower bound and exclude their higher bound so we need to have the first step
    784         // start at exactly 1 unit higher than floor(unigramFreq + half a step).
    785         // Note : to reconstruct the score, the dictionary reader will need to divide
    786         // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise, and add
    787         // (discretizedFrequency + 0.5) times this value to get the median value of the step,
    788         // which is the best approximation. This is how we get the most precise result with
    789         // only four bits.
    790         final double stepSize =
    791                 (double)(MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY);
    792         final double firstStepStart = 1 + unigramFrequency + (stepSize / 2.0);
    793         final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize);
    794         // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1
    795         // here. The best approximation would be the unigram freq itself, so we should not
    796         // include this bigram in the dictionary. For now, register as 0, and live with the
    797         // small over-estimation that we get in this case. TODO: actually remove this bigram
    798         // if discretizedFrequency < 0.
    799         final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
    800         bigramFlags += finalBigramFrequency & FLAG_ATTRIBUTE_FREQUENCY;
    801         return bigramFlags;
    802     }
    803 
    804     /**
    805      * Makes the 2-byte value for options flags.
    806      */
    807     private static final int makeOptionsValue(final FusionDictionary dictionary) {
    808         final DictionaryOptions options = dictionary.mOptions;
    809         final boolean hasBigrams = dictionary.hasBigrams();
    810         return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0)
    811                 + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0)
    812                 + (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0);
    813     }
    814 
    815     /**
    816      * Makes the flag value for a shortcut.
    817      *
    818      * @param more whether there are more attributes after this one.
    819      * @param frequency the frequency of the attribute, 0..15
    820      * @return the flags
    821      */
    822     private static final int makeShortcutFlags(final boolean more, final int frequency) {
    823         return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY);
    824     }
    825 
    826     /**
    827      * Write a node to memory. The node is expected to have its final position cached.
    828      *
    829      * This can be an empty map, but the more is inside the faster the lookups will be. It can
    830      * be carried on as long as nodes do not move.
    831      *
    832      * @param dict the dictionary the node is a part of (for relative offsets).
    833      * @param buffer the memory buffer to write to.
    834      * @param node the node to write.
    835      * @return the address of the END of the node.
    836      */
    837     private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) {
    838         int index = node.mCachedAddress;
    839 
    840         final int groupCount = node.mData.size();
    841         final int countSize = getGroupCountSize(node);
    842         if (1 == countSize) {
    843             buffer[index++] = (byte)groupCount;
    844         } else if (2 == countSize) {
    845             // We need to signal 2-byte size by setting the top bit of the MSB to 1, so
    846             // we | 0x80 to do this.
    847             buffer[index++] = (byte)((groupCount >> 8) | 0x80);
    848             buffer[index++] = (byte)(groupCount & 0xFF);
    849         } else {
    850             throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
    851         }
    852         int groupAddress = index;
    853         for (int i = 0; i < groupCount; ++i) {
    854             CharGroup group = node.mData.get(i);
    855             if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not "
    856                     + "the same as the cached address of the group : "
    857                     + index + " <> " + group.mCachedAddress);
    858             groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
    859             // Sanity checks.
    860             if (DBG && group.mFrequency > MAX_TERMINAL_FREQUENCY) {
    861                 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY
    862                         + " : " + group.mFrequency);
    863             }
    864             if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE;
    865             final int childrenOffset = null == group.mChildren
    866                     ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress;
    867             byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset);
    868             buffer[index++] = flags;
    869             index = CharEncoding.writeCharArray(group.mChars, buffer, index);
    870             if (group.hasSeveralChars()) {
    871                 buffer[index++] = GROUP_CHARACTERS_TERMINATOR;
    872             }
    873             if (group.mFrequency >= 0) {
    874                 buffer[index++] = (byte) group.mFrequency;
    875             }
    876             final int shift = writeVariableAddress(buffer, index, childrenOffset);
    877             index += shift;
    878             groupAddress += shift;
    879 
    880             // Write shortcuts
    881             if (null != group.mShortcutTargets) {
    882                 final int indexOfShortcutByteSize = index;
    883                 index += GROUP_SHORTCUT_LIST_SIZE_SIZE;
    884                 groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE;
    885                 final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator();
    886                 while (shortcutIterator.hasNext()) {
    887                     final WeightedString target = shortcutIterator.next();
    888                     ++groupAddress;
    889                     int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(),
    890                             target.mFrequency);
    891                     buffer[index++] = (byte)shortcutFlags;
    892                     final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord);
    893                     index += shortcutShift;
    894                     groupAddress += shortcutShift;
    895                 }
    896                 final int shortcutByteSize = index - indexOfShortcutByteSize;
    897                 if (shortcutByteSize > 0xFFFF) {
    898                     throw new RuntimeException("Shortcut list too large");
    899                 }
    900                 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8);
    901                 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF);
    902             }
    903             // Write bigrams
    904             if (null != group.mBigrams) {
    905                 final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator();
    906                 while (bigramIterator.hasNext()) {
    907                     final WeightedString bigram = bigramIterator.next();
    908                     final CharGroup target =
    909                             FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord);
    910                     final int addressOfBigram = target.mCachedAddress;
    911                     final int unigramFrequencyForThisWord = target.mFrequency;
    912                     ++groupAddress;
    913                     final int offset = addressOfBigram - groupAddress;
    914                     int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset,
    915                             bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
    916                     buffer[index++] = (byte)bigramFlags;
    917                     final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
    918                     index += bigramShift;
    919                     groupAddress += bigramShift;
    920                 }
    921             }
    922 
    923         }
    924         if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException(
    925                 "Not the same size : written "
    926                 + (index - node.mCachedAddress) + " bytes out of a node that should have "
    927                 + node.mCachedSize + " bytes");
    928         return index;
    929     }
    930 
    931     /**
    932      * Dumps a collection of useful statistics about a node array.
    933      *
    934      * This prints purely informative stuff, like the total estimated file size, the
    935      * number of nodes, of character groups, the repartition of each address size, etc
    936      *
    937      * @param nodes the node array.
    938      */
    939     private static void showStatistics(ArrayList<Node> nodes) {
    940         int firstTerminalAddress = Integer.MAX_VALUE;
    941         int lastTerminalAddress = Integer.MIN_VALUE;
    942         int size = 0;
    943         int charGroups = 0;
    944         int maxGroups = 0;
    945         int maxRuns = 0;
    946         for (Node n : nodes) {
    947             if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
    948             for (CharGroup cg : n.mData) {
    949                 ++charGroups;
    950                 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
    951                 if (cg.mFrequency >= 0) {
    952                     if (n.mCachedAddress < firstTerminalAddress)
    953                         firstTerminalAddress = n.mCachedAddress;
    954                     if (n.mCachedAddress > lastTerminalAddress)
    955                         lastTerminalAddress = n.mCachedAddress;
    956                 }
    957             }
    958             if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize;
    959         }
    960         final int[] groupCounts = new int[maxGroups + 1];
    961         final int[] runCounts = new int[maxRuns + 1];
    962         for (Node n : nodes) {
    963             ++groupCounts[n.mData.size()];
    964             for (CharGroup cg : n.mData) {
    965                 ++runCounts[cg.mChars.length];
    966             }
    967         }
    968 
    969         MakedictLog.i("Statistics:\n"
    970                 + "  total file size " + size + "\n"
    971                 + "  " + nodes.size() + " nodes\n"
    972                 + "  " + charGroups + " groups (" + ((float)charGroups / nodes.size())
    973                         + " groups per node)\n"
    974                 + "  first terminal at " + firstTerminalAddress + "\n"
    975                 + "  last terminal at " + lastTerminalAddress + "\n"
    976                 + "  Group stats : max = " + maxGroups);
    977         for (int i = 0; i < groupCounts.length; ++i) {
    978             MakedictLog.i("    " + i + " : " + groupCounts[i]);
    979         }
    980         MakedictLog.i("  Character run stats : max = " + maxRuns);
    981         for (int i = 0; i < runCounts.length; ++i) {
    982             MakedictLog.i("    " + i + " : " + runCounts[i]);
    983         }
    984     }
    985 
    986     /**
    987      * Dumps a FusionDictionary to a file.
    988      *
    989      * This is the public entry point to write a dictionary to a file.
    990      *
    991      * @param destination the stream to write the binary data to.
    992      * @param dict the dictionary to write.
    993      * @param version the version of the format to write, currently either 1 or 2.
    994      */
    995     public static void writeDictionaryBinary(final OutputStream destination,
    996             final FusionDictionary dict, final int version)
    997             throws IOException, UnsupportedFormatException {
    998 
    999         // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the
   1000         // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order
   1001         // of the nodes becomes a quite complicated problem, because though the dictionary itself
   1002         // does not have a size limit, each node must still be within 16MB of all its children and
   1003         // parents. As long as this is ensured, the dictionary file may grow to any size.
   1004 
   1005         if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) {
   1006             throw new UnsupportedFormatException("Requested file format version " + version
   1007                     + ", but this implementation only supports versions "
   1008                     + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION);
   1009         }
   1010 
   1011         ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256);
   1012 
   1013         // The magic number in big-endian order.
   1014         if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
   1015             // Magic number for version 2+.
   1016             headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24)));
   1017             headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16)));
   1018             headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8)));
   1019             headerBuffer.write((byte) (0xFF & VERSION_2_MAGIC_NUMBER));
   1020             // Dictionary version.
   1021             headerBuffer.write((byte) (0xFF & (version >> 8)));
   1022             headerBuffer.write((byte) (0xFF & version));
   1023         } else {
   1024             // Magic number for version 1.
   1025             headerBuffer.write((byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8)));
   1026             headerBuffer.write((byte) (0xFF & VERSION_1_MAGIC_NUMBER));
   1027             // Dictionary version.
   1028             headerBuffer.write((byte) (0xFF & version));
   1029         }
   1030         // Options flags
   1031         final int options = makeOptionsValue(dict);
   1032         headerBuffer.write((byte) (0xFF & (options >> 8)));
   1033         headerBuffer.write((byte) (0xFF & options));
   1034         if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
   1035             final int headerSizeOffset = headerBuffer.size();
   1036             // Placeholder to be written later with header size.
   1037             for (int i = 0; i < 4; ++i) {
   1038                 headerBuffer.write(0);
   1039             }
   1040             // Write out the options.
   1041             for (final String key : dict.mOptions.mAttributes.keySet()) {
   1042                 final String value = dict.mOptions.mAttributes.get(key);
   1043                 CharEncoding.writeString(headerBuffer, key);
   1044                 CharEncoding.writeString(headerBuffer, value);
   1045             }
   1046             final int size = headerBuffer.size();
   1047             final byte[] bytes = headerBuffer.toByteArray();
   1048             // Write out the header size.
   1049             bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24));
   1050             bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16));
   1051             bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8));
   1052             bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0));
   1053             destination.write(bytes);
   1054         } else {
   1055             headerBuffer.writeTo(destination);
   1056         }
   1057 
   1058         headerBuffer.close();
   1059 
   1060         // Leave the choice of the optimal node order to the flattenTree function.
   1061         MakedictLog.i("Flattening the tree...");
   1062         ArrayList<Node> flatNodes = flattenTree(dict.mRoot);
   1063 
   1064         MakedictLog.i("Computing addresses...");
   1065         computeAddresses(dict, flatNodes);
   1066         MakedictLog.i("Checking array...");
   1067         if (DBG) checkFlatNodeArray(flatNodes);
   1068 
   1069         // Create a buffer that matches the final dictionary size.
   1070         final Node lastNode = flatNodes.get(flatNodes.size() - 1);
   1071         final int bufferSize =(lastNode.mCachedAddress + lastNode.mCachedSize);
   1072         final byte[] buffer = new byte[bufferSize];
   1073         int index = 0;
   1074 
   1075         MakedictLog.i("Writing file...");
   1076         int dataEndOffset = 0;
   1077         for (Node n : flatNodes) {
   1078             dataEndOffset = writePlacedNode(dict, buffer, n);
   1079         }
   1080 
   1081         if (DBG) showStatistics(flatNodes);
   1082 
   1083         destination.write(buffer, 0, dataEndOffset);
   1084 
   1085         destination.close();
   1086         MakedictLog.i("Done");
   1087     }
   1088 
   1089 
   1090     // Input methods: Read a binary dictionary to memory.
   1091     // readDictionaryBinary is the public entry point for them.
   1092 
   1093     static final int[] characterBuffer = new int[MAX_WORD_LENGTH];
   1094     private static CharGroupInfo readCharGroup(RandomAccessFile source,
   1095             final int originalGroupAddress) throws IOException {
   1096         int addressPointer = originalGroupAddress;
   1097         final int flags = source.readUnsignedByte();
   1098         ++addressPointer;
   1099         final int characters[];
   1100         if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) {
   1101             int index = 0;
   1102             int character = CharEncoding.readChar(source);
   1103             addressPointer += CharEncoding.getCharSize(character);
   1104             while (-1 != character) {
   1105                 characterBuffer[index++] = character;
   1106                 character = CharEncoding.readChar(source);
   1107                 addressPointer += CharEncoding.getCharSize(character);
   1108             }
   1109             characters = Arrays.copyOfRange(characterBuffer, 0, index);
   1110         } else {
   1111             final int character = CharEncoding.readChar(source);
   1112             addressPointer += CharEncoding.getCharSize(character);
   1113             characters = new int[] { character };
   1114         }
   1115         final int frequency;
   1116         if (0 != (FLAG_IS_TERMINAL & flags)) {
   1117             ++addressPointer;
   1118             frequency = source.readUnsignedByte();
   1119         } else {
   1120             frequency = CharGroup.NOT_A_TERMINAL;
   1121         }
   1122         int childrenAddress = addressPointer;
   1123         switch (flags & MASK_GROUP_ADDRESS_TYPE) {
   1124         case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
   1125             childrenAddress += source.readUnsignedByte();
   1126             addressPointer += 1;
   1127             break;
   1128         case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
   1129             childrenAddress += source.readUnsignedShort();
   1130             addressPointer += 2;
   1131             break;
   1132         case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
   1133             childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort();
   1134             addressPointer += 3;
   1135             break;
   1136         case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
   1137         default:
   1138             childrenAddress = NO_CHILDREN_ADDRESS;
   1139             break;
   1140         }
   1141         ArrayList<WeightedString> shortcutTargets = null;
   1142         if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) {
   1143             final long pointerBefore = source.getFilePointer();
   1144             shortcutTargets = new ArrayList<WeightedString>();
   1145             source.readUnsignedShort(); // Skip the size
   1146             while (true) {
   1147                 final int targetFlags = source.readUnsignedByte();
   1148                 final String word = CharEncoding.readString(source);
   1149                 shortcutTargets.add(new WeightedString(word,
   1150                         targetFlags & FLAG_ATTRIBUTE_FREQUENCY));
   1151                 if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
   1152             }
   1153             addressPointer += (source.getFilePointer() - pointerBefore);
   1154         }
   1155         ArrayList<PendingAttribute> bigrams = null;
   1156         if (0 != (flags & FLAG_HAS_BIGRAMS)) {
   1157             bigrams = new ArrayList<PendingAttribute>();
   1158             while (true) {
   1159                 final int bigramFlags = source.readUnsignedByte();
   1160                 ++addressPointer;
   1161                 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
   1162                 int bigramAddress = addressPointer;
   1163                 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
   1164                 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
   1165                     bigramAddress += sign * source.readUnsignedByte();
   1166                     addressPointer += 1;
   1167                     break;
   1168                 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
   1169                     bigramAddress += sign * source.readUnsignedShort();
   1170                     addressPointer += 2;
   1171                     break;
   1172                 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
   1173                     final int offset = ((source.readUnsignedByte() << 16)
   1174                             + source.readUnsignedShort());
   1175                     bigramAddress += sign * offset;
   1176                     addressPointer += 3;
   1177                     break;
   1178                 default:
   1179                     throw new RuntimeException("Has bigrams with no address");
   1180                 }
   1181                 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY,
   1182                         bigramAddress));
   1183                 if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
   1184             }
   1185         }
   1186         return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
   1187                 childrenAddress, shortcutTargets, bigrams);
   1188     }
   1189 
   1190     /**
   1191      * Reads and returns the char group count out of a file and forwards the pointer.
   1192      */
   1193     private static int readCharGroupCount(RandomAccessFile source) throws IOException {
   1194         final int msb = source.readUnsignedByte();
   1195         if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) {
   1196             return msb;
   1197         } else {
   1198             return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8)
   1199                     + source.readUnsignedByte();
   1200         }
   1201     }
   1202 
   1203     // The word cache here is a stopgap bandaid to help the catastrophic performance
   1204     // of this method. Since it performs direct, unbuffered random access to the file and
   1205     // may be called hundreds of thousands of times, the resulting performance is not
   1206     // reasonable without some kind of cache. Thus:
   1207     // TODO: perform buffered I/O here and in other places in the code.
   1208     private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>();
   1209     /**
   1210      * Finds, as a string, the word at the address passed as an argument.
   1211      *
   1212      * @param source the file to read from.
   1213      * @param headerSize the size of the header.
   1214      * @param address the address to seek.
   1215      * @return the word, as a string.
   1216      * @throws IOException if the file can't be read.
   1217      */
   1218     private static String getWordAtAddress(final RandomAccessFile source, final long headerSize,
   1219             int address) throws IOException {
   1220         final String cachedString = wordCache.get(address);
   1221         if (null != cachedString) return cachedString;
   1222         final long originalPointer = source.getFilePointer();
   1223         source.seek(headerSize);
   1224         final int count = readCharGroupCount(source);
   1225         int groupOffset = getGroupCountSize(count);
   1226         final StringBuilder builder = new StringBuilder();
   1227         String result = null;
   1228 
   1229         CharGroupInfo last = null;
   1230         for (int i = count - 1; i >= 0; --i) {
   1231             CharGroupInfo info = readCharGroup(source, groupOffset);
   1232             groupOffset = info.mEndAddress;
   1233             if (info.mOriginalAddress == address) {
   1234                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
   1235                 result = builder.toString();
   1236                 break; // and return
   1237             }
   1238             if (hasChildrenAddress(info.mChildrenAddress)) {
   1239                 if (info.mChildrenAddress > address) {
   1240                     if (null == last) continue;
   1241                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
   1242                     source.seek(last.mChildrenAddress + headerSize);
   1243                     groupOffset = last.mChildrenAddress + 1;
   1244                     i = source.readUnsignedByte();
   1245                     last = null;
   1246                     continue;
   1247                 }
   1248                 last = info;
   1249             }
   1250             if (0 == i && hasChildrenAddress(last.mChildrenAddress)) {
   1251                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
   1252                 source.seek(last.mChildrenAddress + headerSize);
   1253                 groupOffset = last.mChildrenAddress + 1;
   1254                 i = source.readUnsignedByte();
   1255                 last = null;
   1256                 continue;
   1257             }
   1258         }
   1259         source.seek(originalPointer);
   1260         wordCache.put(address, result);
   1261         return result;
   1262     }
   1263 
   1264     /**
   1265      * Reads a single node from a binary file.
   1266      *
   1267      * This methods reads the file at the current position of its file pointer. A node is
   1268      * fully expected to start at the current position.
   1269      * This will recursively read other nodes into the structure, populating the reverse
   1270      * maps on the fly and using them to keep track of already read nodes.
   1271      *
   1272      * @param source the data file, correctly positioned at the start of a node.
   1273      * @param headerSize the size, in bytes, of the file header.
   1274      * @param reverseNodeMap a mapping from addresses to already read nodes.
   1275      * @param reverseGroupMap a mapping from addresses to already read character groups.
   1276      * @return the read node with all his children already read.
   1277      */
   1278     private static Node readNode(RandomAccessFile source, long headerSize,
   1279             Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap)
   1280             throws IOException {
   1281         final int nodeOrigin = (int)(source.getFilePointer() - headerSize);
   1282         final int count = readCharGroupCount(source);
   1283         final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>();
   1284         int groupOffset = nodeOrigin + getGroupCountSize(count);
   1285         for (int i = count; i > 0; --i) {
   1286             CharGroupInfo info = readCharGroup(source, groupOffset);
   1287             ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
   1288             ArrayList<WeightedString> bigrams = null;
   1289             if (null != info.mBigrams) {
   1290                 bigrams = new ArrayList<WeightedString>();
   1291                 for (PendingAttribute bigram : info.mBigrams) {
   1292                     final String word = getWordAtAddress(source, headerSize, bigram.mAddress);
   1293                     bigrams.add(new WeightedString(word, bigram.mFrequency));
   1294                 }
   1295             }
   1296             if (hasChildrenAddress(info.mChildrenAddress)) {
   1297                 Node children = reverseNodeMap.get(info.mChildrenAddress);
   1298                 if (null == children) {
   1299                     final long currentPosition = source.getFilePointer();
   1300                     source.seek(info.mChildrenAddress + headerSize);
   1301                     children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap);
   1302                     source.seek(currentPosition);
   1303                 }
   1304                 nodeContents.add(
   1305                         new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
   1306                                 children));
   1307             } else {
   1308                 nodeContents.add(
   1309                         new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency));
   1310             }
   1311             groupOffset = info.mEndAddress;
   1312         }
   1313         final Node node = new Node(nodeContents);
   1314         node.mCachedAddress = nodeOrigin;
   1315         reverseNodeMap.put(node.mCachedAddress, node);
   1316         return node;
   1317     }
   1318 
   1319     /**
   1320      * Helper function to get the binary format version from the header.
   1321      */
   1322     private static int getFormatVersion(final RandomAccessFile source) throws IOException {
   1323         final int magic_v1 = source.readUnsignedShort();
   1324         if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte();
   1325         final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort();
   1326         if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort();
   1327         return NOT_A_VERSION_NUMBER;
   1328     }
   1329 
   1330     /**
   1331      * Reads a random access file and returns the memory representation of the dictionary.
   1332      *
   1333      * This high-level method takes a binary file and reads its contents, populating a
   1334      * FusionDictionary structure. The optional dict argument is an existing dictionary to
   1335      * which words from the file should be added. If it is null, a new dictionary is created.
   1336      *
   1337      * @param source the file to read.
   1338      * @param dict an optional dictionary to add words to, or null.
   1339      * @return the created (or merged) dictionary.
   1340      */
   1341     public static FusionDictionary readDictionaryBinary(final RandomAccessFile source,
   1342             final FusionDictionary dict) throws IOException, UnsupportedFormatException {
   1343         // Check file version
   1344         final int version = getFormatVersion(source);
   1345         if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) {
   1346             throw new UnsupportedFormatException("This file has version " + version
   1347                     + ", but this implementation does not support versions above "
   1348                     + MAXIMUM_SUPPORTED_VERSION);
   1349         }
   1350 
   1351         // Read options
   1352         final int optionsFlags = source.readUnsignedShort();
   1353 
   1354         final long headerSize;
   1355         final HashMap<String, String> options = new HashMap<String, String>();
   1356         if (version < FIRST_VERSION_WITH_HEADER_SIZE) {
   1357             headerSize = source.getFilePointer();
   1358         } else {
   1359             headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16)
   1360                     + (source.readUnsignedByte() << 8) + source.readUnsignedByte();
   1361             while (source.getFilePointer() < headerSize) {
   1362                 final String key = CharEncoding.readString(source);
   1363                 final String value = CharEncoding.readString(source);
   1364                 options.put(key, value);
   1365             }
   1366             source.seek(headerSize);
   1367         }
   1368 
   1369         Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>();
   1370         Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>();
   1371         final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping);
   1372 
   1373         FusionDictionary newDict = new FusionDictionary(root,
   1374                 new FusionDictionary.DictionaryOptions(options,
   1375                         0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
   1376                         0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
   1377         if (null != dict) {
   1378             for (final Word w : dict) {
   1379                 newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
   1380             }
   1381             for (final Word w : dict) {
   1382                 // By construction a binary dictionary may not have bigrams pointing to
   1383                 // words that are not also registered as unigrams so we don't have to avoid
   1384                 // them explicitly here.
   1385                 for (final WeightedString bigram : w.mBigrams) {
   1386                     newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
   1387                 }
   1388             }
   1389         }
   1390 
   1391         return newDict;
   1392     }
   1393 
   1394     /**
   1395      * Basic test to find out whether the file is a binary dictionary or not.
   1396      *
   1397      * Concretely this only tests the magic number.
   1398      *
   1399      * @param filename The name of the file to test.
   1400      * @return true if it's a binary dictionary, false otherwise
   1401      */
   1402     public static boolean isBinaryDictionary(final String filename) {
   1403         try {
   1404             RandomAccessFile f = new RandomAccessFile(filename, "r");
   1405             final int version = getFormatVersion(f);
   1406             return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION);
   1407         } catch (FileNotFoundException e) {
   1408             return false;
   1409         } catch (IOException e) {
   1410             return false;
   1411         }
   1412     }
   1413 }
   1414