Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
     20 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     21 import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
     23 import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     25 
     26 import java.io.ByteArrayOutputStream;
     27 import java.io.File;
     28 import java.io.FileInputStream;
     29 import java.io.FileNotFoundException;
     30 import java.io.IOException;
     31 import java.io.OutputStream;
     32 import java.nio.ByteBuffer;
     33 import java.nio.channels.FileChannel;
     34 import java.util.ArrayList;
     35 import java.util.Arrays;
     36 import java.util.HashMap;
     37 import java.util.Iterator;
     38 import java.util.Map;
     39 import java.util.TreeMap;
     40 
     41 /**
     42  * Reads and writes XML files for a FusionDictionary.
     43  *
     44  * All the methods in this class are static.
     45  */
     46 public final class BinaryDictInputOutput {
     47 
     48     private static final boolean DBG = MakedictLog.DBG;
     49 
     50     // Arbitrary limit to how much passes we consider address size compression should
     51     // terminate in. At the time of this writing, our largest dictionary completes
     52     // compression in five passes.
     53     // If the number of passes exceeds this number, makedict bails with an exception on
     54     // suspicion that a bug might be causing an infinite loop.
     55     private static final int MAX_PASSES = 24;
     56     private static final int MAX_JUMPS = 12;
     57 
     58     public interface FusionDictionaryBufferInterface {
     59         public int readUnsignedByte();
     60         public int readUnsignedShort();
     61         public int readUnsignedInt24();
     62         public int readInt();
     63         public int position();
     64         public void position(int newPosition);
     65         public void put(final byte b);
     66         public int limit();
     67         public int capacity();
     68     }
     69 
     70     public static final class ByteBufferWrapper implements FusionDictionaryBufferInterface {
     71         private ByteBuffer mBuffer;
     72 
     73         public ByteBufferWrapper(final ByteBuffer buffer) {
     74             mBuffer = buffer;
     75         }
     76 
     77         @Override
     78         public int readUnsignedByte() {
     79             return ((int)mBuffer.get()) & 0xFF;
     80         }
     81 
     82         @Override
     83         public int readUnsignedShort() {
     84             return ((int)mBuffer.getShort()) & 0xFFFF;
     85         }
     86 
     87         @Override
     88         public int readUnsignedInt24() {
     89             final int retval = readUnsignedByte();
     90             return (retval << 16) + readUnsignedShort();
     91         }
     92 
     93         @Override
     94         public int readInt() {
     95             return mBuffer.getInt();
     96         }
     97 
     98         @Override
     99         public int position() {
    100             return mBuffer.position();
    101         }
    102 
    103         @Override
    104         public void position(int newPos) {
    105             mBuffer.position(newPos);
    106         }
    107 
    108         @Override
    109         public void put(final byte b) {
    110             mBuffer.put(b);
    111         }
    112 
    113         @Override
    114         public int limit() {
    115             return mBuffer.limit();
    116         }
    117 
    118         @Override
    119         public int capacity() {
    120             return mBuffer.capacity();
    121         }
    122     }
    123 
    124     /**
    125      * A class grouping utility function for our specific character encoding.
    126      */
    127     private static final class CharEncoding {
    128 
    129         private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    130         private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
    131 
    132         /**
    133          * Helper method to find out whether this code fits on one byte
    134          */
    135         private static boolean fitsOnOneByte(final int character) {
    136             return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
    137                     && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
    138         }
    139 
    140         /**
    141          * Compute the size of a character given its character code.
    142          *
    143          * Char format is:
    144          * 1 byte = bbbbbbbb match
    145          * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
    146          * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
    147          *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
    148          *       00011111 would be outside unicode.
    149          * else: iso-latin-1 code
    150          * This allows for the whole unicode range to be encoded, including chars outside of
    151          * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
    152          * characters which should never happen anyway (and still work, but take 3 bytes).
    153          *
    154          * @param character the character code.
    155          * @return the size in binary encoded-form, either 1 or 3 bytes.
    156          */
    157         private static int getCharSize(final int character) {
    158             // See char encoding in FusionDictionary.java
    159             if (fitsOnOneByte(character)) return 1;
    160             if (FormatSpec.INVALID_CHARACTER == character) return 1;
    161             return 3;
    162         }
    163 
    164         /**
    165          * Compute the byte size of a character array.
    166          */
    167         private static int getCharArraySize(final int[] chars) {
    168             int size = 0;
    169             for (int character : chars) size += getCharSize(character);
    170             return size;
    171         }
    172 
    173         /**
    174          * Writes a char array to a byte buffer.
    175          *
    176          * @param codePoints the code point array to write.
    177          * @param buffer the byte buffer to write to.
    178          * @param index the index in buffer to write the character array to.
    179          * @return the index after the last character.
    180          */
    181         private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
    182             for (int codePoint : codePoints) {
    183                 if (1 == getCharSize(codePoint)) {
    184                     buffer[index++] = (byte)codePoint;
    185                 } else {
    186                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    187                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    188                     buffer[index++] = (byte)(0xFF & codePoint);
    189                 }
    190             }
    191             return index;
    192         }
    193 
    194         /**
    195          * Writes a string with our character format to a byte buffer.
    196          *
    197          * This will also write the terminator byte.
    198          *
    199          * @param buffer the byte buffer to write to.
    200          * @param origin the offset to write from.
    201          * @param word the string to write.
    202          * @return the size written, in bytes.
    203          */
    204         private static int writeString(final byte[] buffer, final int origin,
    205                 final String word) {
    206             final int length = word.length();
    207             int index = origin;
    208             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    209                 final int codePoint = word.codePointAt(i);
    210                 if (1 == getCharSize(codePoint)) {
    211                     buffer[index++] = (byte)codePoint;
    212                 } else {
    213                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    214                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    215                     buffer[index++] = (byte)(0xFF & codePoint);
    216                 }
    217             }
    218             buffer[index++] = FormatSpec.GROUP_CHARACTERS_TERMINATOR;
    219             return index - origin;
    220         }
    221 
    222         /**
    223          * Writes a string with our character format to a ByteArrayOutputStream.
    224          *
    225          * This will also write the terminator byte.
    226          *
    227          * @param buffer the ByteArrayOutputStream to write to.
    228          * @param word the string to write.
    229          */
    230         private static void writeString(final ByteArrayOutputStream buffer, final String word) {
    231             final int length = word.length();
    232             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    233                 final int codePoint = word.codePointAt(i);
    234                 if (1 == getCharSize(codePoint)) {
    235                     buffer.write((byte) codePoint);
    236                 } else {
    237                     buffer.write((byte) (0xFF & (codePoint >> 16)));
    238                     buffer.write((byte) (0xFF & (codePoint >> 8)));
    239                     buffer.write((byte) (0xFF & codePoint));
    240                 }
    241             }
    242             buffer.write(FormatSpec.GROUP_CHARACTERS_TERMINATOR);
    243         }
    244 
    245         /**
    246          * Reads a string from a buffer. This is the converse of the above method.
    247          */
    248         private static String readString(final FusionDictionaryBufferInterface buffer) {
    249             final StringBuilder s = new StringBuilder();
    250             int character = readChar(buffer);
    251             while (character != FormatSpec.INVALID_CHARACTER) {
    252                 s.appendCodePoint(character);
    253                 character = readChar(buffer);
    254             }
    255             return s.toString();
    256         }
    257 
    258         /**
    259          * Reads a character from the buffer.
    260          *
    261          * This follows the character format documented earlier in this source file.
    262          *
    263          * @param buffer the buffer, positioned over an encoded character.
    264          * @return the character code.
    265          */
    266         private static int readChar(final FusionDictionaryBufferInterface buffer) {
    267             int character = buffer.readUnsignedByte();
    268             if (!fitsOnOneByte(character)) {
    269                 if (FormatSpec.GROUP_CHARACTERS_TERMINATOR == character) {
    270                     return FormatSpec.INVALID_CHARACTER;
    271                 }
    272                 character <<= 16;
    273                 character += buffer.readUnsignedShort();
    274             }
    275             return character;
    276         }
    277     }
    278 
    279     /**
    280      * Compute the binary size of the character array in a group
    281      *
    282      * If only one character, this is the size of this character. If many, it's the sum of their
    283      * sizes + 1 byte for the terminator.
    284      *
    285      * @param group the group
    286      * @return the size of the char array, including the terminator if any
    287      */
    288     private static int getGroupCharactersSize(final CharGroup group) {
    289         int size = CharEncoding.getCharArraySize(group.mChars);
    290         if (group.hasSeveralChars()) size += FormatSpec.GROUP_TERMINATOR_SIZE;
    291         return size;
    292     }
    293 
    294     /**
    295      * Compute the binary size of the group count
    296      * @param count the group count
    297      * @return the size of the group count, either 1 or 2 bytes.
    298      */
    299     public static int getGroupCountSize(final int count) {
    300         if (FormatSpec.MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) {
    301             return 1;
    302         } else if (FormatSpec.MAX_CHARGROUPS_IN_A_NODE >= count) {
    303             return 2;
    304         } else {
    305             throw new RuntimeException("Can't have more than "
    306                     + FormatSpec.MAX_CHARGROUPS_IN_A_NODE + " groups in a node (found " + count
    307                     + ")");
    308         }
    309     }
    310 
    311     /**
    312      * Compute the binary size of the group count for a node
    313      * @param node the node
    314      * @return the size of the group count, either 1 or 2 bytes.
    315      */
    316     private static int getGroupCountSize(final Node node) {
    317         return getGroupCountSize(node.mData.size());
    318     }
    319 
    320     /**
    321      * Compute the size of a shortcut in bytes.
    322      */
    323     private static int getShortcutSize(final WeightedString shortcut) {
    324         int size = FormatSpec.GROUP_ATTRIBUTE_FLAGS_SIZE;
    325         final String word = shortcut.mWord;
    326         final int length = word.length();
    327         for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    328             final int codePoint = word.codePointAt(i);
    329             size += CharEncoding.getCharSize(codePoint);
    330         }
    331         size += FormatSpec.GROUP_TERMINATOR_SIZE;
    332         return size;
    333     }
    334 
    335     /**
    336      * Compute the size of a shortcut list in bytes.
    337      *
    338      * This is known in advance and does not change according to position in the file
    339      * like address lists do.
    340      */
    341     private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
    342         if (null == shortcutList) return 0;
    343         int size = FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
    344         for (final WeightedString shortcut : shortcutList) {
    345             size += getShortcutSize(shortcut);
    346         }
    347         return size;
    348     }
    349 
    350     /**
    351      * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything.
    352      *
    353      * @param group the CharGroup to compute the size of.
    354      * @param options file format options.
    355      * @return the maximum size of the group.
    356      */
    357     private static int getCharGroupMaximumSize(final CharGroup group, final FormatOptions options) {
    358         int size = getGroupHeaderSize(group, options);
    359         // If terminal, one byte for the frequency
    360         if (group.isTerminal()) size += FormatSpec.GROUP_FREQUENCY_SIZE;
    361         size += FormatSpec.GROUP_MAX_ADDRESS_SIZE; // For children address
    362         size += getShortcutListSize(group.mShortcutTargets);
    363         if (null != group.mBigrams) {
    364             size += (FormatSpec.GROUP_ATTRIBUTE_FLAGS_SIZE
    365                     + FormatSpec.GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
    366                     * group.mBigrams.size();
    367         }
    368         return size;
    369     }
    370 
    371     /**
    372      * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
    373      * it in the 'actualSize' member of the node.
    374      *
    375      * @param node the node to compute the maximum size of.
    376      * @param options file format options.
    377      */
    378     private static void setNodeMaximumSize(final Node node, final FormatOptions options) {
    379         int size = getGroupCountSize(node);
    380         for (CharGroup g : node.mData) {
    381             final int groupSize = getCharGroupMaximumSize(g, options);
    382             g.mCachedSize = groupSize;
    383             size += groupSize;
    384         }
    385         if (options.mSupportsDynamicUpdate) {
    386             size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
    387         }
    388         node.mCachedSize = size;
    389     }
    390 
    391     /**
    392      * Helper method to hide the actual value of the no children address.
    393      */
    394     public static boolean hasChildrenAddress(final int address) {
    395         return FormatSpec.NO_CHILDREN_ADDRESS != address;
    396     }
    397 
    398     /**
    399      * Helper method to check whether the group is moved.
    400      */
    401     public static boolean isMovedGroup(final int flags, final FormatOptions options) {
    402         return options.mSupportsDynamicUpdate && ((flags & FormatSpec.FLAG_IS_MOVED) == 1);
    403     }
    404 
    405     /**
    406      * Helper method to check whether the dictionary can be updated dynamically.
    407      */
    408     public static boolean supportsDynamicUpdate(final FormatOptions options) {
    409         return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_DYNAMIC_UPDATE
    410                 && options.mSupportsDynamicUpdate;
    411     }
    412 
    413     /**
    414      * Compute the size of the header (flag + [parent address] + characters size) of a CharGroup.
    415      *
    416      * @param group the group of which to compute the size of the header
    417      * @param options file format options.
    418      */
    419     private static int getGroupHeaderSize(final CharGroup group, final FormatOptions options) {
    420         if (supportsDynamicUpdate(options)) {
    421             return FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
    422                     + getGroupCharactersSize(group);
    423         } else {
    424             return FormatSpec.GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
    425         }
    426     }
    427 
    428     private static final int UINT8_MAX = 0xFF;
    429     private static final int UINT16_MAX = 0xFFFF;
    430     private static final int UINT24_MAX = 0xFFFFFF;
    431 
    432     /**
    433      * Compute the size, in bytes, that an address will occupy.
    434      *
    435      * This can be used either for children addresses (which are always positive) or for
    436      * attribute, which may be positive or negative but
    437      * store their sign bit separately.
    438      *
    439      * @param address the address
    440      * @return the byte size.
    441      */
    442     private static int getByteSize(final int address) {
    443         assert(address <= UINT24_MAX);
    444         if (!hasChildrenAddress(address)) {
    445             return 0;
    446         } else if (Math.abs(address) <= UINT8_MAX) {
    447             return 1;
    448         } else if (Math.abs(address) <= UINT16_MAX) {
    449             return 2;
    450         } else {
    451             return 3;
    452         }
    453     }
    454 
    455     private static final int SINT8_MAX = 0x7F;
    456     private static final int SINT16_MAX = 0x7FFF;
    457     private static final int SINT24_MAX = 0x7FFFFF;
    458     private static final int MSB8 = 0x80;
    459     private static final int MSB16 = 0x8000;
    460     private static final int MSB24 = 0x800000;
    461 
    462     // End utility methods.
    463 
    464     // This method is responsible for finding a nice ordering of the nodes that favors run-time
    465     // cache performance and dictionary size.
    466     /* package for tests */ static ArrayList<Node> flattenTree(final Node root) {
    467         final int treeSize = FusionDictionary.countCharGroups(root);
    468         MakedictLog.i("Counted nodes : " + treeSize);
    469         final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize);
    470         return flattenTreeInner(flatTree, root);
    471     }
    472 
    473     private static ArrayList<Node> flattenTreeInner(final ArrayList<Node> list, final Node node) {
    474         // Removing the node is necessary if the tails are merged, because we would then
    475         // add the same node several times when we only want it once. A number of places in
    476         // the code also depends on any node being only once in the list.
    477         // Merging tails can only be done if there are no attributes. Searching for attributes
    478         // in LatinIME code depends on a total breadth-first ordering, which merging tails
    479         // breaks. If there are no attributes, it should be fine (and reduce the file size)
    480         // to merge tails, and removing the node from the list would be necessary. However,
    481         // we don't merge tails because breaking the breadth-first ordering would result in
    482         // extreme overhead at bigram lookup time (it would make the search function O(n) instead
    483         // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty
    484         // high).
    485         // If no nodes are ever merged, we can't have the same node twice in the list, hence
    486         // searching for duplicates in unnecessary. It is also very performance consuming,
    487         // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making
    488         // this simple list.remove operation O(n*n) overall. On Android this overhead is very
    489         // high.
    490         // For future reference, the code to remove duplicate is a simple : list.remove(node);
    491         list.add(node);
    492         final ArrayList<CharGroup> branches = node.mData;
    493         final int nodeSize = branches.size();
    494         for (CharGroup group : branches) {
    495             if (null != group.mChildren) flattenTreeInner(list, group.mChildren);
    496         }
    497         return list;
    498     }
    499 
    500     /**
    501      * Finds the absolute address of a word in the dictionary.
    502      *
    503      * @param dict the dictionary in which to search.
    504      * @param word the word we are searching for.
    505      * @return the word address. If it is not found, an exception is thrown.
    506      */
    507     private static int findAddressOfWord(final FusionDictionary dict, final String word) {
    508         return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress;
    509     }
    510 
    511     /**
    512      * Computes the actual node size, based on the cached addresses of the children nodes.
    513      *
    514      * Each node stores its tentative address. During dictionary address computing, these
    515      * are not final, but they can be used to compute the node size (the node size depends
    516      * on the address of the children because the number of bytes necessary to store an
    517      * address depends on its numeric value. The return value indicates whether the node
    518      * contents (as in, any of the addresses stored in the cache fields) have changed with
    519      * respect to their previous value.
    520      *
    521      * @param node the node to compute the size of.
    522      * @param dict the dictionary in which the word/attributes are to be found.
    523      * @param formatOptions file format options.
    524      * @return false if none of the cached addresses inside the node changed, true otherwise.
    525      */
    526     private static boolean computeActualNodeSize(final Node node, final FusionDictionary dict,
    527             final FormatOptions formatOptions) {
    528         boolean changed = false;
    529         int size = getGroupCountSize(node);
    530         for (CharGroup group : node.mData) {
    531             if (group.mCachedAddress != node.mCachedAddress + size) {
    532                 changed = true;
    533                 group.mCachedAddress = node.mCachedAddress + size;
    534             }
    535             int groupSize = getGroupHeaderSize(group, formatOptions);
    536             if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
    537             if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
    538                 groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
    539             } else if (null != group.mChildren) {
    540                 final int offsetBasePoint = groupSize + node.mCachedAddress + size;
    541                 final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
    542                 // assign my address to children's parent address
    543                 group.mChildren.mCachedParentAddress = group.mCachedAddress
    544                         - group.mChildren.mCachedAddress;
    545                 if (formatOptions.mSupportsDynamicUpdate) {
    546                     groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
    547                 } else {
    548                     groupSize += getByteSize(offset);
    549                 }
    550             }
    551             groupSize += getShortcutListSize(group.mShortcutTargets);
    552             if (null != group.mBigrams) {
    553                 for (WeightedString bigram : group.mBigrams) {
    554                     final int offsetBasePoint = groupSize + node.mCachedAddress + size
    555                             + FormatSpec.GROUP_FLAGS_SIZE;
    556                     final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
    557                     final int offset = addressOfBigram - offsetBasePoint;
    558                     groupSize += getByteSize(offset) + FormatSpec.GROUP_FLAGS_SIZE;
    559                 }
    560             }
    561             group.mCachedSize = groupSize;
    562             size += groupSize;
    563         }
    564         if (formatOptions.mSupportsDynamicUpdate) {
    565             size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
    566         }
    567         if (node.mCachedSize != size) {
    568             node.mCachedSize = size;
    569             changed = true;
    570         }
    571         return changed;
    572     }
    573 
    574     /**
    575      * Computes the byte size of a list of nodes and updates each node cached position.
    576      *
    577      * @param flatNodes the array of nodes.
    578      * @param formatOptions file format options.
    579      * @return the byte size of the entire stack.
    580      */
    581     private static int stackNodes(final ArrayList<Node> flatNodes,
    582             final FormatOptions formatOptions) {
    583         int nodeOffset = 0;
    584         for (Node n : flatNodes) {
    585             n.mCachedAddress = nodeOffset;
    586             int groupCountSize = getGroupCountSize(n);
    587             int groupOffset = 0;
    588             for (CharGroup g : n.mData) {
    589                 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset;
    590                 groupOffset += g.mCachedSize;
    591             }
    592             final int nodeSize = groupCountSize + groupOffset
    593                     + (formatOptions.mSupportsDynamicUpdate
    594                             ? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0);
    595             if (nodeSize != n.mCachedSize) {
    596                 throw new RuntimeException("Bug : Stored and computed node size differ");
    597             }
    598             nodeOffset += n.mCachedSize;
    599         }
    600         return nodeOffset;
    601     }
    602 
    603     /**
    604      * Compute the addresses and sizes of an ordered node array.
    605      *
    606      * This method takes a node array and will update its cached address and size values
    607      * so that they can be written into a file. It determines the smallest size each of the
    608      * nodes can be given the addresses of its children and attributes, and store that into
    609      * each node.
    610      * The order of the node is given by the order of the array. This method makes no effort
    611      * to find a good order; it only mechanically computes the size this order results in.
    612      *
    613      * @param dict the dictionary
    614      * @param flatNodes the ordered array of nodes
    615      * @param formatOptions file format options.
    616      * @return the same array it was passed. The nodes have been updated for address and size.
    617      */
    618     private static ArrayList<Node> computeAddresses(final FusionDictionary dict,
    619             final ArrayList<Node> flatNodes, final FormatOptions formatOptions) {
    620         // First get the worst sizes and offsets
    621         for (Node n : flatNodes) setNodeMaximumSize(n, formatOptions);
    622         final int offset = stackNodes(flatNodes, formatOptions);
    623 
    624         MakedictLog.i("Compressing the array addresses. Original size : " + offset);
    625         MakedictLog.i("(Recursively seen size : " + offset + ")");
    626 
    627         int passes = 0;
    628         boolean changesDone = false;
    629         do {
    630             changesDone = false;
    631             for (Node n : flatNodes) {
    632                 final int oldNodeSize = n.mCachedSize;
    633                 final boolean changed = computeActualNodeSize(n, dict, formatOptions);
    634                 final int newNodeSize = n.mCachedSize;
    635                 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
    636                 changesDone |= changed;
    637             }
    638             stackNodes(flatNodes, formatOptions);
    639             ++passes;
    640             if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
    641         } while (changesDone);
    642 
    643         final Node lastNode = flatNodes.get(flatNodes.size() - 1);
    644         MakedictLog.i("Compression complete in " + passes + " passes.");
    645         MakedictLog.i("After address compression : "
    646                 + (lastNode.mCachedAddress + lastNode.mCachedSize));
    647 
    648         return flatNodes;
    649     }
    650 
    651     /**
    652      * Sanity-checking method.
    653      *
    654      * This method checks an array of node for juxtaposition, that is, it will do
    655      * nothing if each node's cached address is actually the previous node's address
    656      * plus the previous node's size.
    657      * If this is not the case, it will throw an exception.
    658      *
    659      * @param array the array node to check
    660      */
    661     private static void checkFlatNodeArray(final ArrayList<Node> array) {
    662         int offset = 0;
    663         int index = 0;
    664         for (Node n : array) {
    665             if (n.mCachedAddress != offset) {
    666                 throw new RuntimeException("Wrong address for node " + index
    667                         + " : expected " + offset + ", got " + n.mCachedAddress);
    668             }
    669             ++index;
    670             offset += n.mCachedSize;
    671         }
    672     }
    673 
    674     /**
    675      * Helper method to write a variable-size address to a file.
    676      *
    677      * @param buffer the buffer to write to.
    678      * @param index the index in the buffer to write the address to.
    679      * @param address the address to write.
    680      * @return the size in bytes the address actually took.
    681      */
    682     private static int writeVariableAddress(final byte[] buffer, int index, final int address) {
    683         switch (getByteSize(address)) {
    684         case 1:
    685             buffer[index++] = (byte)address;
    686             return 1;
    687         case 2:
    688             buffer[index++] = (byte)(0xFF & (address >> 8));
    689             buffer[index++] = (byte)(0xFF & address);
    690             return 2;
    691         case 3:
    692             buffer[index++] = (byte)(0xFF & (address >> 16));
    693             buffer[index++] = (byte)(0xFF & (address >> 8));
    694             buffer[index++] = (byte)(0xFF & address);
    695             return 3;
    696         case 0:
    697             return 0;
    698         default:
    699             throw new RuntimeException("Address " + address + " has a strange size");
    700         }
    701     }
    702 
    703     /**
    704      * Helper method to write a variable-size signed address to a file.
    705      *
    706      * @param buffer the buffer to write to.
    707      * @param index the index in the buffer to write the address to.
    708      * @param address the address to write.
    709      * @return the size in bytes the address actually took.
    710      */
    711     private static int writeVariableSignedAddress(final byte[] buffer, int index,
    712             final int address) {
    713         if (!hasChildrenAddress(address)) {
    714             buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
    715         } else {
    716             final int absAddress = Math.abs(address);
    717             buffer[index++] = (byte)((address < 0 ? MSB8 : 0) | (0xFF & (absAddress >> 16)));
    718             buffer[index++] = (byte)(0xFF & (absAddress >> 8));
    719             buffer[index++] = (byte)(0xFF & absAddress);
    720         }
    721         return 3;
    722     }
    723 
    724     private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
    725             final int childrenOffset, final FormatOptions formatOptions) {
    726         byte flags = 0;
    727         if (group.mChars.length > 1) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
    728         if (group.mFrequency >= 0) {
    729             flags |= FormatSpec.FLAG_IS_TERMINAL;
    730         }
    731         if (null != group.mChildren) {
    732             final int byteSize = formatOptions.mSupportsDynamicUpdate
    733                     ? FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE : getByteSize(childrenOffset);
    734             switch (byteSize) {
    735             case 1:
    736                 flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
    737                 break;
    738             case 2:
    739                 flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
    740                 break;
    741             case 3:
    742                 flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
    743                 break;
    744             default:
    745                 throw new RuntimeException("Node with a strange address");
    746             }
    747         } else if (formatOptions.mSupportsDynamicUpdate) {
    748             flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
    749         }
    750         if (null != group.mShortcutTargets) {
    751             if (DBG && 0 == group.mShortcutTargets.size()) {
    752                 throw new RuntimeException("0-sized shortcut list must be null");
    753             }
    754             flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
    755         }
    756         if (null != group.mBigrams) {
    757             if (DBG && 0 == group.mBigrams.size()) {
    758                 throw new RuntimeException("0-sized bigram list must be null");
    759             }
    760             flags |= FormatSpec.FLAG_HAS_BIGRAMS;
    761         }
    762         if (group.mIsNotAWord) {
    763             flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
    764         }
    765         if (group.mIsBlacklistEntry) {
    766             flags |= FormatSpec.FLAG_IS_BLACKLISTED;
    767         }
    768         return flags;
    769     }
    770 
    771     /**
    772      * Makes the flag value for a bigram.
    773      *
    774      * @param more whether there are more bigrams after this one.
    775      * @param offset the offset of the bigram.
    776      * @param bigramFrequency the frequency of the bigram, 0..255.
    777      * @param unigramFrequency the unigram frequency of the same word, 0..255.
    778      * @param word the second bigram, for debugging purposes
    779      * @return the flags
    780      */
    781     private static final int makeBigramFlags(final boolean more, final int offset,
    782             int bigramFrequency, final int unigramFrequency, final String word) {
    783         int bigramFlags = (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0)
    784                 + (offset < 0 ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
    785         switch (getByteSize(offset)) {
    786         case 1:
    787             bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
    788             break;
    789         case 2:
    790             bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
    791             break;
    792         case 3:
    793             bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
    794             break;
    795         default:
    796             throw new RuntimeException("Strange offset size");
    797         }
    798         if (unigramFrequency > bigramFrequency) {
    799             MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word
    800                     + "\". Bigram freq is " + bigramFrequency + ", unigram freq for "
    801                     + word + " is " + unigramFrequency);
    802             bigramFrequency = unigramFrequency;
    803         }
    804         // We compute the difference between 255 (which means probability = 1) and the
    805         // unigram score. We split this into a number of discrete steps.
    806         // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15
    807         // represents an increase of 16 steps: a value of 15 will be interpreted as the median
    808         // value of the 16th step. In all justice, if the bigram frequency is low enough to be
    809         // rounded below the first step (which means it is less than half a step higher than the
    810         // unigram frequency) then the unigram frequency itself is the best approximation of the
    811         // bigram freq that we could possibly supply, hence we should *not* include this bigram
    812         // in the file at all.
    813         // until this is done, we'll write 0 and slightly overestimate this case.
    814         // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step
    815         // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to
    816         // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the
    817         // step size. Then we compute the start of the first step (the one where value 0 starts)
    818         // by adding half-a-step to the unigramFrequency. From there, we compute the integer
    819         // number of steps to the bigramFrequency. One last thing: we want our steps to include
    820         // their lower bound and exclude their higher bound so we need to have the first step
    821         // start at exactly 1 unit higher than floor(unigramFreq + half a step).
    822         // Note : to reconstruct the score, the dictionary reader will need to divide
    823         // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step,
    824         // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best
    825         // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the
    826         // step pointed by the discretized frequency.
    827         final float stepSize =
    828                 (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
    829                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
    830         final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f);
    831         final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize);
    832         // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1
    833         // here. The best approximation would be the unigram freq itself, so we should not
    834         // include this bigram in the dictionary. For now, register as 0, and live with the
    835         // small over-estimation that we get in this case. TODO: actually remove this bigram
    836         // if discretizedFrequency < 0.
    837         final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
    838         bigramFlags += finalBigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
    839         return bigramFlags;
    840     }
    841 
    842     /**
    843      * Makes the 2-byte value for options flags.
    844      */
    845     private static final int makeOptionsValue(final FusionDictionary dictionary,
    846             final FormatOptions formatOptions) {
    847         final DictionaryOptions options = dictionary.mOptions;
    848         final boolean hasBigrams = dictionary.hasBigrams();
    849         return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0)
    850                 + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0)
    851                 + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0)
    852                 + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0);
    853     }
    854 
    855     /**
    856      * Makes the flag value for a shortcut.
    857      *
    858      * @param more whether there are more attributes after this one.
    859      * @param frequency the frequency of the attribute, 0..15
    860      * @return the flags
    861      */
    862     private static final int makeShortcutFlags(final boolean more, final int frequency) {
    863         return (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0)
    864                 + (frequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY);
    865     }
    866 
    867     private static final int writeParentAddress(final byte[] buffer, final int index,
    868             final int address, final FormatOptions formatOptions) {
    869         if (supportsDynamicUpdate(formatOptions)) {
    870             if (address == FormatSpec.NO_PARENT_ADDRESS) {
    871                 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
    872             } else {
    873                 final int absAddress = Math.abs(address);
    874                 assert(absAddress <= SINT24_MAX);
    875                 buffer[index] = (byte)((address < 0 ? MSB8 : 0)
    876                         | ((absAddress >> 16) & 0xFF));
    877                 buffer[index + 1] = (byte)((absAddress >> 8) & 0xFF);
    878                 buffer[index + 2] = (byte)(absAddress & 0xFF);
    879             }
    880             return index + 3;
    881         } else {
    882             return index;
    883         }
    884     }
    885 
    886     /**
    887      * Write a node to memory. The node is expected to have its final position cached.
    888      *
    889      * This can be an empty map, but the more is inside the faster the lookups will be. It can
    890      * be carried on as long as nodes do not move.
    891      *
    892      * @param dict the dictionary the node is a part of (for relative offsets).
    893      * @param buffer the memory buffer to write to.
    894      * @param node the node to write.
    895      * @param formatOptions file format options.
    896      * @return the address of the END of the node.
    897      */
    898     private static int writePlacedNode(final FusionDictionary dict, byte[] buffer,
    899             final Node node, final FormatOptions formatOptions) {
    900         int index = node.mCachedAddress;
    901 
    902         final int groupCount = node.mData.size();
    903         final int countSize = getGroupCountSize(node);
    904         final int parentAddress = node.mCachedParentAddress;
    905         if (1 == countSize) {
    906             buffer[index++] = (byte)groupCount;
    907         } else if (2 == countSize) {
    908             // We need to signal 2-byte size by setting the top bit of the MSB to 1, so
    909             // we | 0x80 to do this.
    910             buffer[index++] = (byte)((groupCount >> 8) | 0x80);
    911             buffer[index++] = (byte)(groupCount & 0xFF);
    912         } else {
    913             throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
    914         }
    915         int groupAddress = index;
    916         for (int i = 0; i < groupCount; ++i) {
    917             CharGroup group = node.mData.get(i);
    918             if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not "
    919                     + "the same as the cached address of the group : "
    920                     + index + " <> " + group.mCachedAddress);
    921             groupAddress += getGroupHeaderSize(group, formatOptions);
    922             // Sanity checks.
    923             if (DBG && group.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
    924                 throw new RuntimeException("A node has a frequency > "
    925                         + FormatSpec.MAX_TERMINAL_FREQUENCY
    926                         + " : " + group.mFrequency);
    927             }
    928             if (group.mFrequency >= 0) groupAddress += FormatSpec.GROUP_FREQUENCY_SIZE;
    929             final int childrenOffset = null == group.mChildren
    930                     ? FormatSpec.NO_CHILDREN_ADDRESS
    931                             : group.mChildren.mCachedAddress - groupAddress;
    932             byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
    933             buffer[index++] = flags;
    934 
    935             if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) {
    936                 index = writeParentAddress(buffer, index, parentAddress, formatOptions);
    937             } else {
    938                 index = writeParentAddress(buffer, index,
    939                         parentAddress + (node.mCachedAddress - group.mCachedAddress),
    940                         formatOptions);
    941             }
    942 
    943             index = CharEncoding.writeCharArray(group.mChars, buffer, index);
    944             if (group.hasSeveralChars()) {
    945                 buffer[index++] = FormatSpec.GROUP_CHARACTERS_TERMINATOR;
    946             }
    947             if (group.mFrequency >= 0) {
    948                 buffer[index++] = (byte) group.mFrequency;
    949             }
    950 
    951             final int shift;
    952             if (formatOptions.mSupportsDynamicUpdate) {
    953                 shift = writeVariableSignedAddress(buffer, index, childrenOffset);
    954             } else {
    955                 shift = writeVariableAddress(buffer, index, childrenOffset);
    956             }
    957             index += shift;
    958             groupAddress += shift;
    959 
    960             // Write shortcuts
    961             if (null != group.mShortcutTargets) {
    962                 final int indexOfShortcutByteSize = index;
    963                 index += FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
    964                 groupAddress += FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
    965                 final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator();
    966                 while (shortcutIterator.hasNext()) {
    967                     final WeightedString target = shortcutIterator.next();
    968                     ++groupAddress;
    969                     int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(),
    970                             target.mFrequency);
    971                     buffer[index++] = (byte)shortcutFlags;
    972                     final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord);
    973                     index += shortcutShift;
    974                     groupAddress += shortcutShift;
    975                 }
    976                 final int shortcutByteSize = index - indexOfShortcutByteSize;
    977                 if (shortcutByteSize > 0xFFFF) {
    978                     throw new RuntimeException("Shortcut list too large");
    979                 }
    980                 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8);
    981                 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF);
    982             }
    983             // Write bigrams
    984             if (null != group.mBigrams) {
    985                 final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator();
    986                 while (bigramIterator.hasNext()) {
    987                     final WeightedString bigram = bigramIterator.next();
    988                     final CharGroup target =
    989                             FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord);
    990                     final int addressOfBigram = target.mCachedAddress;
    991                     final int unigramFrequencyForThisWord = target.mFrequency;
    992                     ++groupAddress;
    993                     final int offset = addressOfBigram - groupAddress;
    994                     int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset,
    995                             bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
    996                     buffer[index++] = (byte)bigramFlags;
    997                     final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
    998                     index += bigramShift;
    999                     groupAddress += bigramShift;
   1000                 }
   1001             }
   1002 
   1003         }
   1004         if (formatOptions.mSupportsDynamicUpdate) {
   1005             buffer[index] = buffer[index + 1] = buffer[index + 2]
   1006                     = FormatSpec.NO_FORWARD_LINK_ADDRESS;
   1007             index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
   1008         }
   1009         if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException(
   1010                 "Not the same size : written "
   1011                 + (index - node.mCachedAddress) + " bytes out of a node that should have "
   1012                 + node.mCachedSize + " bytes");
   1013         return index;
   1014     }
   1015 
   1016     /**
   1017      * Dumps a collection of useful statistics about a node array.
   1018      *
   1019      * This prints purely informative stuff, like the total estimated file size, the
   1020      * number of nodes, of character groups, the repartition of each address size, etc
   1021      *
   1022      * @param nodes the node array.
   1023      */
   1024     private static void showStatistics(ArrayList<Node> nodes) {
   1025         int firstTerminalAddress = Integer.MAX_VALUE;
   1026         int lastTerminalAddress = Integer.MIN_VALUE;
   1027         int size = 0;
   1028         int charGroups = 0;
   1029         int maxGroups = 0;
   1030         int maxRuns = 0;
   1031         for (Node n : nodes) {
   1032             if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
   1033             for (CharGroup cg : n.mData) {
   1034                 ++charGroups;
   1035                 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
   1036                 if (cg.mFrequency >= 0) {
   1037                     if (n.mCachedAddress < firstTerminalAddress)
   1038                         firstTerminalAddress = n.mCachedAddress;
   1039                     if (n.mCachedAddress > lastTerminalAddress)
   1040                         lastTerminalAddress = n.mCachedAddress;
   1041                 }
   1042             }
   1043             if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize;
   1044         }
   1045         final int[] groupCounts = new int[maxGroups + 1];
   1046         final int[] runCounts = new int[maxRuns + 1];
   1047         for (Node n : nodes) {
   1048             ++groupCounts[n.mData.size()];
   1049             for (CharGroup cg : n.mData) {
   1050                 ++runCounts[cg.mChars.length];
   1051             }
   1052         }
   1053 
   1054         MakedictLog.i("Statistics:\n"
   1055                 + "  total file size " + size + "\n"
   1056                 + "  " + nodes.size() + " nodes\n"
   1057                 + "  " + charGroups + " groups (" + ((float)charGroups / nodes.size())
   1058                         + " groups per node)\n"
   1059                 + "  first terminal at " + firstTerminalAddress + "\n"
   1060                 + "  last terminal at " + lastTerminalAddress + "\n"
   1061                 + "  Group stats : max = " + maxGroups);
   1062         for (int i = 0; i < groupCounts.length; ++i) {
   1063             MakedictLog.i("    " + i + " : " + groupCounts[i]);
   1064         }
   1065         MakedictLog.i("  Character run stats : max = " + maxRuns);
   1066         for (int i = 0; i < runCounts.length; ++i) {
   1067             MakedictLog.i("    " + i + " : " + runCounts[i]);
   1068         }
   1069     }
   1070 
   1071     /**
   1072      * Dumps a FusionDictionary to a file.
   1073      *
   1074      * This is the public entry point to write a dictionary to a file.
   1075      *
   1076      * @param destination the stream to write the binary data to.
   1077      * @param dict the dictionary to write.
   1078      * @param formatOptions file format options.
   1079      */
   1080     public static void writeDictionaryBinary(final OutputStream destination,
   1081             final FusionDictionary dict, final FormatOptions formatOptions)
   1082             throws IOException, UnsupportedFormatException {
   1083 
   1084         // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the
   1085         // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order
   1086         // of the nodes becomes a quite complicated problem, because though the dictionary itself
   1087         // does not have a size limit, each node must still be within 16MB of all its children and
   1088         // parents. As long as this is ensured, the dictionary file may grow to any size.
   1089 
   1090         final int version = formatOptions.mVersion;
   1091         if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
   1092                 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) {
   1093             throw new UnsupportedFormatException("Requested file format version " + version
   1094                     + ", but this implementation only supports versions "
   1095                     + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through "
   1096                     + FormatSpec.MAXIMUM_SUPPORTED_VERSION);
   1097         }
   1098 
   1099         ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256);
   1100 
   1101         // The magic number in big-endian order.
   1102         if (version >= FormatSpec.FIRST_VERSION_WITH_HEADER_SIZE) {
   1103             // Magic number for version 2+.
   1104             headerBuffer.write((byte) (0xFF & (FormatSpec.VERSION_2_MAGIC_NUMBER >> 24)));
   1105             headerBuffer.write((byte) (0xFF & (FormatSpec.VERSION_2_MAGIC_NUMBER >> 16)));
   1106             headerBuffer.write((byte) (0xFF & (FormatSpec.VERSION_2_MAGIC_NUMBER >> 8)));
   1107             headerBuffer.write((byte) (0xFF & FormatSpec.VERSION_2_MAGIC_NUMBER));
   1108             // Dictionary version.
   1109             headerBuffer.write((byte) (0xFF & (version >> 8)));
   1110             headerBuffer.write((byte) (0xFF & version));
   1111         } else {
   1112             // Magic number for version 1.
   1113             headerBuffer.write((byte) (0xFF & (FormatSpec.VERSION_1_MAGIC_NUMBER >> 8)));
   1114             headerBuffer.write((byte) (0xFF & FormatSpec.VERSION_1_MAGIC_NUMBER));
   1115             // Dictionary version.
   1116             headerBuffer.write((byte) (0xFF & version));
   1117         }
   1118         // Options flags
   1119         final int options = makeOptionsValue(dict, formatOptions);
   1120         headerBuffer.write((byte) (0xFF & (options >> 8)));
   1121         headerBuffer.write((byte) (0xFF & options));
   1122         if (version >= FormatSpec.FIRST_VERSION_WITH_HEADER_SIZE) {
   1123             final int headerSizeOffset = headerBuffer.size();
   1124             // Placeholder to be written later with header size.
   1125             for (int i = 0; i < 4; ++i) {
   1126                 headerBuffer.write(0);
   1127             }
   1128             // Write out the options.
   1129             for (final String key : dict.mOptions.mAttributes.keySet()) {
   1130                 final String value = dict.mOptions.mAttributes.get(key);
   1131                 CharEncoding.writeString(headerBuffer, key);
   1132                 CharEncoding.writeString(headerBuffer, value);
   1133             }
   1134             final int size = headerBuffer.size();
   1135             final byte[] bytes = headerBuffer.toByteArray();
   1136             // Write out the header size.
   1137             bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24));
   1138             bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16));
   1139             bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8));
   1140             bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0));
   1141             destination.write(bytes);
   1142         } else {
   1143             headerBuffer.writeTo(destination);
   1144         }
   1145 
   1146         headerBuffer.close();
   1147 
   1148         // Leave the choice of the optimal node order to the flattenTree function.
   1149         MakedictLog.i("Flattening the tree...");
   1150         ArrayList<Node> flatNodes = flattenTree(dict.mRoot);
   1151 
   1152         MakedictLog.i("Computing addresses...");
   1153         computeAddresses(dict, flatNodes, formatOptions);
   1154         MakedictLog.i("Checking array...");
   1155         if (DBG) checkFlatNodeArray(flatNodes);
   1156 
   1157         // Create a buffer that matches the final dictionary size.
   1158         final Node lastNode = flatNodes.get(flatNodes.size() - 1);
   1159         final int bufferSize = lastNode.mCachedAddress + lastNode.mCachedSize;
   1160         final byte[] buffer = new byte[bufferSize];
   1161         int index = 0;
   1162 
   1163         MakedictLog.i("Writing file...");
   1164         int dataEndOffset = 0;
   1165         for (Node n : flatNodes) {
   1166             dataEndOffset = writePlacedNode(dict, buffer, n, formatOptions);
   1167         }
   1168 
   1169         if (DBG) showStatistics(flatNodes);
   1170 
   1171         destination.write(buffer, 0, dataEndOffset);
   1172 
   1173         destination.close();
   1174         MakedictLog.i("Done");
   1175     }
   1176 
   1177 
   1178     // Input methods: Read a binary dictionary to memory.
   1179     // readDictionaryBinary is the public entry point for them.
   1180 
   1181     private static int getChildrenAddressSize(final int optionFlags,
   1182             final FormatOptions formatOptions) {
   1183         if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
   1184         switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
   1185             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
   1186                 return 1;
   1187             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
   1188                 return 2;
   1189             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
   1190                 return 3;
   1191             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
   1192             default:
   1193                 return 0;
   1194         }
   1195     }
   1196 
   1197     private static int readChildrenAddress(final FusionDictionaryBufferInterface buffer,
   1198             final int optionFlags, final FormatOptions options) {
   1199         if (options.mSupportsDynamicUpdate) {
   1200             final int address = buffer.readUnsignedInt24();
   1201             if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS;
   1202             if ((address & MSB24) != 0) {
   1203                 return -(address & SINT24_MAX);
   1204             } else {
   1205                 return address;
   1206             }
   1207         }
   1208         int address;
   1209         switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
   1210             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
   1211                 return buffer.readUnsignedByte();
   1212             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
   1213                 return buffer.readUnsignedShort();
   1214             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
   1215                 return buffer.readUnsignedInt24();
   1216             case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
   1217             default:
   1218                 return FormatSpec.NO_CHILDREN_ADDRESS;
   1219         }
   1220     }
   1221 
   1222     private static int readParentAddress(final FusionDictionaryBufferInterface buffer,
   1223             final FormatOptions formatOptions) {
   1224         if (supportsDynamicUpdate(formatOptions)) {
   1225             final int parentAddress = buffer.readUnsignedInt24();
   1226             final int sign = ((parentAddress & MSB24) != 0) ? -1 : 1;
   1227             return sign * (parentAddress & SINT24_MAX);
   1228         } else {
   1229             return FormatSpec.NO_PARENT_ADDRESS;
   1230         }
   1231     }
   1232 
   1233     private static final int[] CHARACTER_BUFFER = new int[FormatSpec.MAX_WORD_LENGTH];
   1234     public static CharGroupInfo readCharGroup(final FusionDictionaryBufferInterface buffer,
   1235             final int originalGroupAddress, final FormatOptions options) {
   1236         int addressPointer = originalGroupAddress;
   1237         final int flags = buffer.readUnsignedByte();
   1238         ++addressPointer;
   1239 
   1240         final int parentAddress = readParentAddress(buffer, options);
   1241         if (supportsDynamicUpdate(options)) {
   1242             addressPointer += 3;
   1243         }
   1244 
   1245         final int characters[];
   1246         if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
   1247             int index = 0;
   1248             int character = CharEncoding.readChar(buffer);
   1249             addressPointer += CharEncoding.getCharSize(character);
   1250             while (-1 != character) {
   1251                 // FusionDictionary is making sure that the length of the word is smaller than
   1252                 // MAX_WORD_LENGTH.
   1253                 // So we'll never write past the end of CHARACTER_BUFFER.
   1254                 CHARACTER_BUFFER[index++] = character;
   1255                 character = CharEncoding.readChar(buffer);
   1256                 addressPointer += CharEncoding.getCharSize(character);
   1257             }
   1258             characters = Arrays.copyOfRange(CHARACTER_BUFFER, 0, index);
   1259         } else {
   1260             final int character = CharEncoding.readChar(buffer);
   1261             addressPointer += CharEncoding.getCharSize(character);
   1262             characters = new int[] { character };
   1263         }
   1264         final int frequency;
   1265         if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
   1266             ++addressPointer;
   1267             frequency = buffer.readUnsignedByte();
   1268         } else {
   1269             frequency = CharGroup.NOT_A_TERMINAL;
   1270         }
   1271         int childrenAddress = readChildrenAddress(buffer, flags, options);
   1272         if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
   1273             childrenAddress += addressPointer;
   1274         }
   1275         addressPointer += getChildrenAddressSize(flags, options);
   1276         ArrayList<WeightedString> shortcutTargets = null;
   1277         if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
   1278             final int pointerBefore = buffer.position();
   1279             shortcutTargets = new ArrayList<WeightedString>();
   1280             buffer.readUnsignedShort(); // Skip the size
   1281             while (true) {
   1282                 final int targetFlags = buffer.readUnsignedByte();
   1283                 final String word = CharEncoding.readString(buffer);
   1284                 shortcutTargets.add(new WeightedString(word,
   1285                         targetFlags & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY));
   1286                 if (0 == (targetFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break;
   1287             }
   1288             addressPointer += buffer.position() - pointerBefore;
   1289         }
   1290         ArrayList<PendingAttribute> bigrams = null;
   1291         if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
   1292             bigrams = new ArrayList<PendingAttribute>();
   1293             while (true) {
   1294                 final int bigramFlags = buffer.readUnsignedByte();
   1295                 ++addressPointer;
   1296                 final int sign = 0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE)
   1297                         ? 1 : -1;
   1298                 int bigramAddress = addressPointer;
   1299                 switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
   1300                 case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
   1301                     bigramAddress += sign * buffer.readUnsignedByte();
   1302                     addressPointer += 1;
   1303                     break;
   1304                 case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
   1305                     bigramAddress += sign * buffer.readUnsignedShort();
   1306                     addressPointer += 2;
   1307                     break;
   1308                 case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
   1309                     final int offset = (buffer.readUnsignedByte() << 16)
   1310                             + buffer.readUnsignedShort();
   1311                     bigramAddress += sign * offset;
   1312                     addressPointer += 3;
   1313                     break;
   1314                 default:
   1315                     throw new RuntimeException("Has bigrams with no address");
   1316                 }
   1317                 bigrams.add(new PendingAttribute(bigramFlags & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY,
   1318                         bigramAddress));
   1319                 if (0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break;
   1320             }
   1321         }
   1322         return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
   1323                 parentAddress, childrenAddress, shortcutTargets, bigrams);
   1324     }
   1325 
   1326     /**
   1327      * Reads and returns the char group count out of a buffer and forwards the pointer.
   1328      */
   1329     public static int readCharGroupCount(final FusionDictionaryBufferInterface buffer) {
   1330         final int msb = buffer.readUnsignedByte();
   1331         if (FormatSpec.MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) {
   1332             return msb;
   1333         } else {
   1334             return ((FormatSpec.MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8)
   1335                     + buffer.readUnsignedByte();
   1336         }
   1337     }
   1338 
   1339     // The word cache here is a stopgap bandaid to help the catastrophic performance
   1340     // of this method. Since it performs direct, unbuffered random access to the file and
   1341     // may be called hundreds of thousands of times, the resulting performance is not
   1342     // reasonable without some kind of cache. Thus:
   1343     private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>();
   1344     /**
   1345      * Finds, as a string, the word at the address passed as an argument.
   1346      *
   1347      * @param buffer the buffer to read from.
   1348      * @param headerSize the size of the header.
   1349      * @param address the address to seek.
   1350      * @param formatOptions file format options.
   1351      * @return the word, as a string.
   1352      */
   1353     /* packages for tests */ static String getWordAtAddress(
   1354             final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
   1355             final FormatOptions formatOptions) {
   1356         final String cachedString = wordCache.get(address);
   1357         if (null != cachedString) return cachedString;
   1358 
   1359         final String result;
   1360         final int originalPointer = buffer.position();
   1361         buffer.position(address);
   1362 
   1363         if (supportsDynamicUpdate(formatOptions)) {
   1364             result = getWordAtAddressWithParentAddress(buffer, headerSize, address, formatOptions);
   1365         } else {
   1366             result = getWordAtAddressWithoutParentAddress(buffer, headerSize, address,
   1367                     formatOptions);
   1368         }
   1369 
   1370         wordCache.put(address, result);
   1371         buffer.position(originalPointer);
   1372         return result;
   1373     }
   1374 
   1375     private static int[] sGetWordBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
   1376     private static String getWordAtAddressWithParentAddress(
   1377             final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
   1378             final FormatOptions options) {
   1379         final StringBuilder builder = new StringBuilder();
   1380 
   1381         int currentAddress = address;
   1382         int index = FormatSpec.MAX_WORD_LENGTH - 1;
   1383         // the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH
   1384         for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) {
   1385             CharGroupInfo currentInfo;
   1386             int loopCounter = 0;
   1387             do {
   1388                 buffer.position(currentAddress + headerSize);
   1389                 currentInfo = readCharGroup(buffer, currentAddress, options);
   1390                 if (isMovedGroup(currentInfo.mFlags, options)) {
   1391                     currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
   1392                 }
   1393                 if (DBG && loopCounter++ > MAX_JUMPS) {
   1394                     MakedictLog.d("Too many jumps - probably a bug");
   1395                 }
   1396             } while (isMovedGroup(currentInfo.mFlags, options));
   1397             for (int i = 0; i < currentInfo.mCharacters.length; ++i) {
   1398                 sGetWordBuffer[index--] =
   1399                         currentInfo.mCharacters[currentInfo.mCharacters.length - i - 1];
   1400             }
   1401             if (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) break;
   1402             currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
   1403         }
   1404 
   1405         return new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1);
   1406     }
   1407 
   1408     private static String getWordAtAddressWithoutParentAddress(
   1409             final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
   1410             final FormatOptions options) {
   1411         buffer.position(headerSize);
   1412         final int count = readCharGroupCount(buffer);
   1413         int groupOffset = getGroupCountSize(count);
   1414         final StringBuilder builder = new StringBuilder();
   1415         String result = null;
   1416 
   1417         CharGroupInfo last = null;
   1418         for (int i = count - 1; i >= 0; --i) {
   1419             CharGroupInfo info = readCharGroup(buffer, groupOffset, options);
   1420             groupOffset = info.mEndAddress;
   1421             if (info.mOriginalAddress == address) {
   1422                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
   1423                 result = builder.toString();
   1424                 break; // and return
   1425             }
   1426             if (hasChildrenAddress(info.mChildrenAddress)) {
   1427                 if (info.mChildrenAddress > address) {
   1428                     if (null == last) continue;
   1429                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
   1430                     buffer.position(last.mChildrenAddress + headerSize);
   1431                     groupOffset = last.mChildrenAddress + 1;
   1432                     i = buffer.readUnsignedByte();
   1433                     last = null;
   1434                     continue;
   1435                 }
   1436                 last = info;
   1437             }
   1438             if (0 == i && hasChildrenAddress(last.mChildrenAddress)) {
   1439                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
   1440                 buffer.position(last.mChildrenAddress + headerSize);
   1441                 groupOffset = last.mChildrenAddress + 1;
   1442                 i = buffer.readUnsignedByte();
   1443                 last = null;
   1444                 continue;
   1445             }
   1446         }
   1447         return result;
   1448     }
   1449 
   1450     /**
   1451      * Reads a single node from a buffer.
   1452      *
   1453      * This methods reads the file at the current position. A node is fully expected to start at
   1454      * the current position.
   1455      * This will recursively read other nodes into the structure, populating the reverse
   1456      * maps on the fly and using them to keep track of already read nodes.
   1457      *
   1458      * @param buffer the buffer, correctly positioned at the start of a node.
   1459      * @param headerSize the size, in bytes, of the file header.
   1460      * @param reverseNodeMap a mapping from addresses to already read nodes.
   1461      * @param reverseGroupMap a mapping from addresses to already read character groups.
   1462      * @param options file format options.
   1463      * @return the read node with all his children already read.
   1464      */
   1465     private static Node readNode(final FusionDictionaryBufferInterface buffer, final int headerSize,
   1466             final Map<Integer, Node> reverseNodeMap, final Map<Integer, CharGroup> reverseGroupMap,
   1467             final FormatOptions options)
   1468             throws IOException {
   1469         final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>();
   1470         final int nodeOrigin = buffer.position() - headerSize;
   1471 
   1472         do { // Scan the linked-list node.
   1473             final int nodeHeadPosition = buffer.position() - headerSize;
   1474             final int count = readCharGroupCount(buffer);
   1475             int groupOffset = nodeHeadPosition + getGroupCountSize(count);
   1476             for (int i = count; i > 0; --i) { // Scan the array of CharGroup.
   1477                 CharGroupInfo info = readCharGroup(buffer, groupOffset, options);
   1478                 if (isMovedGroup(info.mFlags, options)) continue;
   1479                 ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
   1480                 ArrayList<WeightedString> bigrams = null;
   1481                 if (null != info.mBigrams) {
   1482                     bigrams = new ArrayList<WeightedString>();
   1483                     for (PendingAttribute bigram : info.mBigrams) {
   1484                         final String word = getWordAtAddress(
   1485                                 buffer, headerSize, bigram.mAddress, options);
   1486                         bigrams.add(new WeightedString(word, bigram.mFrequency));
   1487                     }
   1488                 }
   1489                 if (hasChildrenAddress(info.mChildrenAddress)) {
   1490                     Node children = reverseNodeMap.get(info.mChildrenAddress);
   1491                     if (null == children) {
   1492                         final int currentPosition = buffer.position();
   1493                         buffer.position(info.mChildrenAddress + headerSize);
   1494                         children = readNode(
   1495                                 buffer, headerSize, reverseNodeMap, reverseGroupMap, options);
   1496                         buffer.position(currentPosition);
   1497                     }
   1498                     nodeContents.add(
   1499                             new CharGroup(info.mCharacters, shortcutTargets, bigrams,
   1500                                     info.mFrequency,
   1501                                     0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
   1502                                     0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED), children));
   1503                 } else {
   1504                     nodeContents.add(
   1505                             new CharGroup(info.mCharacters, shortcutTargets, bigrams,
   1506                                     info.mFrequency,
   1507                                     0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
   1508                                     0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED)));
   1509                 }
   1510                 groupOffset = info.mEndAddress;
   1511             }
   1512 
   1513             // reach the end of the array.
   1514             if (options.mSupportsDynamicUpdate) {
   1515                 final int nextAddress = buffer.readUnsignedInt24();
   1516                 if (nextAddress >= 0 && nextAddress < buffer.limit()) {
   1517                     buffer.position(nextAddress);
   1518                 } else {
   1519                     break;
   1520                 }
   1521             }
   1522         } while (options.mSupportsDynamicUpdate &&
   1523                 buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
   1524 
   1525         final Node node = new Node(nodeContents);
   1526         node.mCachedAddress = nodeOrigin;
   1527         reverseNodeMap.put(node.mCachedAddress, node);
   1528         return node;
   1529     }
   1530 
   1531     /**
   1532      * Helper function to get the binary format version from the header.
   1533      * @throws IOException
   1534      */
   1535     private static int getFormatVersion(final FusionDictionaryBufferInterface buffer)
   1536             throws IOException {
   1537         final int magic_v1 = buffer.readUnsignedShort();
   1538         if (FormatSpec.VERSION_1_MAGIC_NUMBER == magic_v1) return buffer.readUnsignedByte();
   1539         final int magic_v2 = (magic_v1 << 16) + buffer.readUnsignedShort();
   1540         if (FormatSpec.VERSION_2_MAGIC_NUMBER == magic_v2) return buffer.readUnsignedShort();
   1541         return FormatSpec.NOT_A_VERSION_NUMBER;
   1542     }
   1543 
   1544     /**
   1545      * Helper function to get and validate the binary format version.
   1546      * @throws UnsupportedFormatException
   1547      * @throws IOException
   1548      */
   1549     private static int checkFormatVersion(final FusionDictionaryBufferInterface buffer)
   1550             throws IOException, UnsupportedFormatException {
   1551         final int version = getFormatVersion(buffer);
   1552         if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
   1553                 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) {
   1554             throw new UnsupportedFormatException("This file has version " + version
   1555                     + ", but this implementation does not support versions above "
   1556                     + FormatSpec.MAXIMUM_SUPPORTED_VERSION);
   1557         }
   1558         return version;
   1559     }
   1560 
   1561     /**
   1562      * Reads a header from a buffer.
   1563      * @param buffer the buffer to read.
   1564      * @throws IOException
   1565      * @throws UnsupportedFormatException
   1566      */
   1567     public static FileHeader readHeader(final FusionDictionaryBufferInterface buffer)
   1568             throws IOException, UnsupportedFormatException {
   1569         final int version = checkFormatVersion(buffer);
   1570         final int optionsFlags = buffer.readUnsignedShort();
   1571 
   1572         final HashMap<String, String> attributes = new HashMap<String, String>();
   1573         final int headerSize;
   1574         if (version < FormatSpec.FIRST_VERSION_WITH_HEADER_SIZE) {
   1575             headerSize = buffer.position();
   1576         } else {
   1577             headerSize = buffer.readInt();
   1578             populateOptions(buffer, headerSize, attributes);
   1579             buffer.position(headerSize);
   1580         }
   1581 
   1582         if (headerSize < 0) {
   1583             throw new UnsupportedFormatException("header size can't be negative.");
   1584         }
   1585 
   1586         final FileHeader header = new FileHeader(headerSize,
   1587                 new FusionDictionary.DictionaryOptions(attributes,
   1588                         0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
   1589                         0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
   1590                 new FormatOptions(version,
   1591                         0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE)));
   1592         return header;
   1593     }
   1594 
   1595     /**
   1596      * Reads options from a buffer and populate a map with their contents.
   1597      *
   1598      * The buffer is read at the current position, so the caller must take care the pointer
   1599      * is in the right place before calling this.
   1600      */
   1601     public static void populateOptions(final FusionDictionaryBufferInterface buffer,
   1602             final int headerSize, final HashMap<String, String> options) {
   1603         while (buffer.position() < headerSize) {
   1604             final String key = CharEncoding.readString(buffer);
   1605             final String value = CharEncoding.readString(buffer);
   1606             options.put(key, value);
   1607         }
   1608     }
   1609 
   1610     /**
   1611      * Reads a buffer and returns the memory representation of the dictionary.
   1612      *
   1613      * This high-level method takes a buffer and reads its contents, populating a
   1614      * FusionDictionary structure. The optional dict argument is an existing dictionary to
   1615      * which words from the buffer should be added. If it is null, a new dictionary is created.
   1616      *
   1617      * @param buffer the buffer to read.
   1618      * @param dict an optional dictionary to add words to, or null.
   1619      * @return the created (or merged) dictionary.
   1620      */
   1621     public static FusionDictionary readDictionaryBinary(
   1622             final FusionDictionaryBufferInterface buffer, final FusionDictionary dict)
   1623                     throws IOException, UnsupportedFormatException {
   1624         // clear cache
   1625         wordCache.clear();
   1626 
   1627         // Read header
   1628         final FileHeader header = readHeader(buffer);
   1629 
   1630         Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>();
   1631         Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>();
   1632         final Node root = readNode(buffer, header.mHeaderSize, reverseNodeMapping,
   1633                 reverseGroupMapping, header.mFormatOptions);
   1634 
   1635         FusionDictionary newDict = new FusionDictionary(root, header.mDictionaryOptions);
   1636         if (null != dict) {
   1637             for (final Word w : dict) {
   1638                 if (w.mIsBlacklistEntry) {
   1639                     newDict.addBlacklistEntry(w.mWord, w.mShortcutTargets, w.mIsNotAWord);
   1640                 } else {
   1641                     newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mIsNotAWord);
   1642                 }
   1643             }
   1644             for (final Word w : dict) {
   1645                 // By construction a binary dictionary may not have bigrams pointing to
   1646                 // words that are not also registered as unigrams so we don't have to avoid
   1647                 // them explicitly here.
   1648                 for (final WeightedString bigram : w.mBigrams) {
   1649                     newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
   1650                 }
   1651             }
   1652         }
   1653 
   1654         return newDict;
   1655     }
   1656 
   1657     /**
   1658      * Basic test to find out whether the file is a binary dictionary or not.
   1659      *
   1660      * Concretely this only tests the magic number.
   1661      *
   1662      * @param filename The name of the file to test.
   1663      * @return true if it's a binary dictionary, false otherwise
   1664      */
   1665     public static boolean isBinaryDictionary(final String filename) {
   1666         FileInputStream inStream = null;
   1667         try {
   1668             final File file = new File(filename);
   1669             inStream = new FileInputStream(file);
   1670             final ByteBuffer buffer = inStream.getChannel().map(
   1671                     FileChannel.MapMode.READ_ONLY, 0, file.length());
   1672             final int version = getFormatVersion(new ByteBufferWrapper(buffer));
   1673             return (version >= FormatSpec.MINIMUM_SUPPORTED_VERSION
   1674                     && version <= FormatSpec.MAXIMUM_SUPPORTED_VERSION);
   1675         } catch (FileNotFoundException e) {
   1676             return false;
   1677         } catch (IOException e) {
   1678             return false;
   1679         } finally {
   1680             if (inStream != null) {
   1681                 try {
   1682                     inStream.close();
   1683                 } catch (IOException e) {
   1684                     // do nothing
   1685                 }
   1686             }
   1687         }
   1688     }
   1689 
   1690     /**
   1691      * Calculate bigram frequency from compressed value
   1692      *
   1693      * @see #makeBigramFlags
   1694      *
   1695      * @param unigramFrequency
   1696      * @param bigramFrequency compressed frequency
   1697      * @return approximate bigram frequency
   1698      */
   1699     public static int reconstructBigramFrequency(final int unigramFrequency,
   1700             final int bigramFrequency) {
   1701         final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
   1702                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
   1703         final float resultFreqFloat = (float)unigramFrequency
   1704                 + stepSize * (bigramFrequency + 1.0f);
   1705         return (int)resultFreqFloat;
   1706     }
   1707 }
   1708