Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     20 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     21 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
     23 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     25 
     26 import java.io.ByteArrayOutputStream;
     27 import java.io.IOException;
     28 import java.io.OutputStream;
     29 import java.util.ArrayList;
     30 
     31 /**
     32  * Encodes binary files for a FusionDictionary.
     33  *
     34  * All the methods in this class are static.
     35  *
     36  * TODO: Rename this class to DictEncoderUtils.
     37  */
     38 public class BinaryDictEncoderUtils {
     39 
     40     private static final boolean DBG = MakedictLog.DBG;
     41 
     42     private BinaryDictEncoderUtils() {
     43         // This utility class is not publicly instantiable.
     44     }
     45 
     46     // Arbitrary limit to how much passes we consider address size compression should
     47     // terminate in. At the time of this writing, our largest dictionary completes
     48     // compression in five passes.
     49     // If the number of passes exceeds this number, makedict bails with an exception on
     50     // suspicion that a bug might be causing an infinite loop.
     51     private static final int MAX_PASSES = 24;
     52 
     53     /**
     54      * Compute the binary size of the character array.
     55      *
     56      * If only one character, this is the size of this character. If many, it's the sum of their
     57      * sizes + 1 byte for the terminator.
     58      *
     59      * @param characters the character array
     60      * @return the size of the char array, including the terminator if any
     61      */
     62     static int getPtNodeCharactersSize(final int[] characters) {
     63         int size = CharEncoding.getCharArraySize(characters);
     64         if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE;
     65         return size;
     66     }
     67 
     68     /**
     69      * Compute the binary size of the character array in a PtNode
     70      *
     71      * If only one character, this is the size of this character. If many, it's the sum of their
     72      * sizes + 1 byte for the terminator.
     73      *
     74      * @param ptNode the PtNode
     75      * @return the size of the char array, including the terminator if any
     76      */
     77     private static int getPtNodeCharactersSize(final PtNode ptNode) {
     78         return getPtNodeCharactersSize(ptNode.mChars);
     79     }
     80 
     81     /**
     82      * Compute the binary size of the PtNode count for a node array.
     83      * @param nodeArray the nodeArray
     84      * @return the size of the PtNode count, either 1 or 2 bytes.
     85      */
     86     private static int getPtNodeCountSize(final PtNodeArray nodeArray) {
     87         return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size());
     88     }
     89 
     90     /**
     91      * Compute the size of a shortcut in bytes.
     92      */
     93     private static int getShortcutSize(final WeightedString shortcut) {
     94         int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
     95         final String word = shortcut.mWord;
     96         final int length = word.length();
     97         for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
     98             final int codePoint = word.codePointAt(i);
     99             size += CharEncoding.getCharSize(codePoint);
    100         }
    101         size += FormatSpec.PTNODE_TERMINATOR_SIZE;
    102         return size;
    103     }
    104 
    105     /**
    106      * Compute the size of a shortcut list in bytes.
    107      *
    108      * This is known in advance and does not change according to position in the file
    109      * like address lists do.
    110      */
    111     static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
    112         if (null == shortcutList || shortcutList.isEmpty()) return 0;
    113         int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
    114         for (final WeightedString shortcut : shortcutList) {
    115             size += getShortcutSize(shortcut);
    116         }
    117         return size;
    118     }
    119 
    120     /**
    121      * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything.
    122      *
    123      * @param ptNode the PtNode to compute the size of.
    124      * @param options file format options.
    125      * @return the maximum size of the PtNode.
    126      */
    127     private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) {
    128         int size = getNodeHeaderSize(ptNode, options);
    129         if (ptNode.isTerminal()) {
    130             // If terminal, one byte for the frequency or four bytes for the terminal id.
    131             if (options.mHasTerminalId) {
    132                 size += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
    133             } else {
    134                 size += FormatSpec.PTNODE_FREQUENCY_SIZE;
    135             }
    136         }
    137         size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
    138         size += getShortcutListSize(ptNode.mShortcutTargets);
    139         if (null != ptNode.mBigrams) {
    140             size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE
    141                     + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE)
    142                     * ptNode.mBigrams.size();
    143         }
    144         return size;
    145     }
    146 
    147     /**
    148      * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for
    149      * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of
    150      * the containing node array, and cache it it its 'mCachedSize' member.
    151      *
    152      * @param ptNodeArray the node array to compute the maximum size of.
    153      * @param options file format options.
    154      */
    155     private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray,
    156             final FormatOptions options) {
    157         int size = getPtNodeCountSize(ptNodeArray);
    158         for (PtNode node : ptNodeArray.mData) {
    159             final int nodeSize = getPtNodeMaximumSize(node, options);
    160             node.mCachedSize = nodeSize;
    161             size += nodeSize;
    162         }
    163         if (options.mSupportsDynamicUpdate) {
    164             size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
    165         }
    166         ptNodeArray.mCachedSize = size;
    167     }
    168 
    169     /**
    170      * Compute the size of the header (flag + [parent address] + characters size) of a PtNode.
    171      *
    172      * @param ptNode the PtNode of which to compute the size of the header
    173      * @param options file format options.
    174      */
    175     private static int getNodeHeaderSize(final PtNode ptNode, final FormatOptions options) {
    176         if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
    177             return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
    178                     + getPtNodeCharactersSize(ptNode);
    179         } else {
    180             return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode);
    181         }
    182     }
    183 
    184     /**
    185      * Compute the size, in bytes, that an address will occupy.
    186      *
    187      * This can be used either for children addresses (which are always positive) or for
    188      * attribute, which may be positive or negative but
    189      * store their sign bit separately.
    190      *
    191      * @param address the address
    192      * @return the byte size.
    193      */
    194     static int getByteSize(final int address) {
    195         assert(address <= FormatSpec.UINT24_MAX);
    196         if (!BinaryDictIOUtils.hasChildrenAddress(address)) {
    197             return 0;
    198         } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) {
    199             return 1;
    200         } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) {
    201             return 2;
    202         } else {
    203             return 3;
    204         }
    205     }
    206 
    207     static int writeUIntToBuffer(final byte[] buffer, int position, final int value,
    208             final int size) {
    209         switch(size) {
    210             case 4:
    211                 buffer[position++] = (byte) ((value >> 24) & 0xFF);
    212                 /* fall through */
    213             case 3:
    214                 buffer[position++] = (byte) ((value >> 16) & 0xFF);
    215                 /* fall through */
    216             case 2:
    217                 buffer[position++] = (byte) ((value >> 8) & 0xFF);
    218                 /* fall through */
    219             case 1:
    220                 buffer[position++] = (byte) (value & 0xFF);
    221                 break;
    222             default:
    223                 /* nop */
    224         }
    225         return position;
    226     }
    227 
    228     static void writeUIntToStream(final OutputStream stream, final int value, final int size)
    229             throws IOException {
    230         switch(size) {
    231             case 4:
    232                 stream.write((value >> 24) & 0xFF);
    233                 /* fall through */
    234             case 3:
    235                 stream.write((value >> 16) & 0xFF);
    236                 /* fall through */
    237             case 2:
    238                 stream.write((value >> 8) & 0xFF);
    239                 /* fall through */
    240             case 1:
    241                 stream.write(value & 0xFF);
    242                 break;
    243             default:
    244                 /* nop */
    245         }
    246     }
    247 
    248     // End utility methods
    249 
    250     // This method is responsible for finding a nice ordering of the nodes that favors run-time
    251     // cache performance and dictionary size.
    252     /* package for tests */ static ArrayList<PtNodeArray> flattenTree(
    253             final PtNodeArray rootNodeArray) {
    254         final int treeSize = FusionDictionary.countPtNodes(rootNodeArray);
    255         MakedictLog.i("Counted nodes : " + treeSize);
    256         final ArrayList<PtNodeArray> flatTree = new ArrayList<PtNodeArray>(treeSize);
    257         return flattenTreeInner(flatTree, rootNodeArray);
    258     }
    259 
    260     private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list,
    261             final PtNodeArray ptNodeArray) {
    262         // Removing the node is necessary if the tails are merged, because we would then
    263         // add the same node several times when we only want it once. A number of places in
    264         // the code also depends on any node being only once in the list.
    265         // Merging tails can only be done if there are no attributes. Searching for attributes
    266         // in LatinIME code depends on a total breadth-first ordering, which merging tails
    267         // breaks. If there are no attributes, it should be fine (and reduce the file size)
    268         // to merge tails, and removing the node from the list would be necessary. However,
    269         // we don't merge tails because breaking the breadth-first ordering would result in
    270         // extreme overhead at bigram lookup time (it would make the search function O(n) instead
    271         // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty
    272         // high).
    273         // If no nodes are ever merged, we can't have the same node twice in the list, hence
    274         // searching for duplicates in unnecessary. It is also very performance consuming,
    275         // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making
    276         // this simple list.remove operation O(n*n) overall. On Android this overhead is very
    277         // high.
    278         // For future reference, the code to remove duplicate is a simple : list.remove(node);
    279         list.add(ptNodeArray);
    280         final ArrayList<PtNode> branches = ptNodeArray.mData;
    281         for (PtNode ptNode : branches) {
    282             if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren);
    283         }
    284         return list;
    285     }
    286 
    287     /**
    288      * Get the offset from a position inside a current node array to a target node array, during
    289      * update.
    290      *
    291      * If the current node array is before the target node array, the target node array has not
    292      * been updated yet, so we should return the offset from the old position of the current node
    293      * array to the old position of the target node array. If on the other hand the target is
    294      * before the current node array, it already has been updated, so we should return the offset
    295      * from the new position in the current node array to the new position in the target node
    296      * array.
    297      *
    298      * @param currentNodeArray node array containing the PtNode where the offset will be written
    299      * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray
    300      * @param targetNodeArray the target node array to get the offset to
    301      * @return the offset to the target node array
    302      */
    303     private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray,
    304             final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) {
    305         final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate
    306                 < currentNodeArray.mCachedAddressBeforeUpdate);
    307         if (isTargetBeforeCurrent) {
    308             return targetNodeArray.mCachedAddressAfterUpdate
    309                     - (currentNodeArray.mCachedAddressAfterUpdate
    310                             + offsetFromStartOfCurrentNodeArray);
    311         } else {
    312             return targetNodeArray.mCachedAddressBeforeUpdate
    313                     - (currentNodeArray.mCachedAddressBeforeUpdate
    314                             + offsetFromStartOfCurrentNodeArray);
    315         }
    316     }
    317 
    318     /**
    319      * Get the offset from a position inside a current node array to a target PtNode, during
    320      * update.
    321      *
    322      * @param currentNodeArray node array containing the PtNode where the offset will be written
    323      * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray
    324      * @param targetPtNode the target PtNode to get the offset to
    325      * @return the offset to the target PtNode
    326      */
    327     // TODO: is there any way to factorize this method with the one above?
    328     private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray,
    329             final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) {
    330         final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate
    331                 + offsetFromStartOfCurrentNodeArray;
    332         final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate
    333                 < oldOffsetBasePoint);
    334         // If the target is before the current node array, then its address has already been
    335         // updated. We can use the AfterUpdate member, and compare it to our own member after
    336         // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the
    337         // BeforeUpdate member, and of course we have to compare this to our own address before
    338         // update.
    339         if (isTargetBeforeCurrent) {
    340             final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate
    341                     + offsetFromStartOfCurrentNodeArray;
    342             return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint;
    343         } else {
    344             return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint;
    345         }
    346     }
    347 
    348     /**
    349      * Computes the actual node array size, based on the cached addresses of the children nodes.
    350      *
    351      * Each node array stores its tentative address. During dictionary address computing, these
    352      * are not final, but they can be used to compute the node array size (the node array size
    353      * depends on the address of the children because the number of bytes necessary to store an
    354      * address depends on its numeric value. The return value indicates whether the node array
    355      * contents (as in, any of the addresses stored in the cache fields) have changed with
    356      * respect to their previous value.
    357      *
    358      * @param ptNodeArray the node array to compute the size of.
    359      * @param dict the dictionary in which the word/attributes are to be found.
    360      * @param formatOptions file format options.
    361      * @return false if none of the cached addresses inside the node array changed, true otherwise.
    362      */
    363     private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray,
    364             final FusionDictionary dict, final FormatOptions formatOptions) {
    365         boolean changed = false;
    366         int size = getPtNodeCountSize(ptNodeArray);
    367         for (PtNode ptNode : ptNodeArray.mData) {
    368             ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size;
    369             if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) {
    370                 changed = true;
    371             }
    372             int nodeSize = getNodeHeaderSize(ptNode, formatOptions);
    373             if (ptNode.isTerminal()) {
    374                 if (formatOptions.mHasTerminalId) {
    375                     nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
    376                 } else {
    377                     nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
    378                 }
    379             }
    380             if (formatOptions.mSupportsDynamicUpdate) {
    381                 nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
    382             } else if (null != ptNode.mChildren) {
    383                 nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
    384                         nodeSize + size, ptNode.mChildren));
    385             }
    386             if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) {
    387                 nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
    388                 if (null != ptNode.mBigrams) {
    389                     for (WeightedString bigram : ptNode.mBigrams) {
    390                         final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
    391                                 nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE,
    392                                 FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord));
    393                         nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
    394                     }
    395                 }
    396             }
    397             ptNode.mCachedSize = nodeSize;
    398             size += nodeSize;
    399         }
    400         if (formatOptions.mSupportsDynamicUpdate) {
    401             size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
    402         }
    403         if (ptNodeArray.mCachedSize != size) {
    404             ptNodeArray.mCachedSize = size;
    405             changed = true;
    406         }
    407         return changed;
    408     }
    409 
    410     /**
    411      * Initializes the cached addresses of node arrays and their containing nodes from their size.
    412      *
    413      * @param flatNodes the list of node arrays.
    414      * @param formatOptions file format options.
    415      * @return the byte size of the entire stack.
    416      */
    417     private static int initializePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes,
    418             final FormatOptions formatOptions) {
    419         int nodeArrayOffset = 0;
    420         for (final PtNodeArray nodeArray : flatNodes) {
    421             nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset;
    422             int nodeCountSize = getPtNodeCountSize(nodeArray);
    423             int nodeffset = 0;
    424             for (final PtNode ptNode : nodeArray.mData) {
    425                 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate =
    426                         nodeCountSize + nodeArrayOffset + nodeffset;
    427                 nodeffset += ptNode.mCachedSize;
    428             }
    429             nodeArrayOffset += nodeArray.mCachedSize;
    430         }
    431         return nodeArrayOffset;
    432     }
    433 
    434     /**
    435      * Updates the cached addresses of node arrays after recomputing their new positions.
    436      *
    437      * @param flatNodes the list of node arrays.
    438      */
    439     private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) {
    440         for (final PtNodeArray nodeArray : flatNodes) {
    441             nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate;
    442             for (final PtNode ptNode : nodeArray.mData) {
    443                 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate;
    444             }
    445         }
    446     }
    447 
    448     /**
    449      * Compute the cached parent addresses after all has been updated.
    450      *
    451      * The parent addresses are used by some binary formats at write-to-disk time. Not all formats
    452      * need them. In particular, version 2 does not need them, and version 3 does.
    453      *
    454      * @param flatNodes the flat array of node arrays to fill in
    455      */
    456     private static void computeParentAddresses(final ArrayList<PtNodeArray> flatNodes) {
    457         for (final PtNodeArray nodeArray : flatNodes) {
    458             for (final PtNode ptNode : nodeArray.mData) {
    459                 if (null != ptNode.mChildren) {
    460                     // Assign my address to children's parent address
    461                     // Here BeforeUpdate and AfterUpdate addresses have the same value, so it
    462                     // does not matter which we use.
    463                     ptNode.mChildren.mCachedParentAddress = ptNode.mCachedAddressAfterUpdate
    464                             - ptNode.mChildren.mCachedAddressAfterUpdate;
    465                 }
    466             }
    467         }
    468     }
    469 
    470     /**
    471      * Compute the addresses and sizes of an ordered list of PtNode arrays.
    472      *
    473      * This method takes a list of PtNode arrays and will update their cached address and size
    474      * values so that they can be written into a file. It determines the smallest size each of the
    475      * PtNode arrays can be given the addresses of its children and attributes, and store that into
    476      * each PtNode.
    477      * The order of the PtNode is given by the order of the array. This method makes no effort
    478      * to find a good order; it only mechanically computes the size this order results in.
    479      *
    480      * @param dict the dictionary
    481      * @param flatNodes the ordered list of PtNode arrays
    482      * @param formatOptions file format options.
    483      * @return the same array it was passed. The nodes have been updated for address and size.
    484      */
    485     /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict,
    486             final ArrayList<PtNodeArray> flatNodes, final FormatOptions formatOptions) {
    487         // First get the worst possible sizes and offsets
    488         for (final PtNodeArray n : flatNodes) calculatePtNodeArrayMaximumSize(n, formatOptions);
    489         final int offset = initializePtNodeArraysCachedAddresses(flatNodes, formatOptions);
    490 
    491         MakedictLog.i("Compressing the array addresses. Original size : " + offset);
    492         MakedictLog.i("(Recursively seen size : " + offset + ")");
    493 
    494         int passes = 0;
    495         boolean changesDone = false;
    496         do {
    497             changesDone = false;
    498             int ptNodeArrayStartOffset = 0;
    499             for (final PtNodeArray ptNodeArray : flatNodes) {
    500                 ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset;
    501                 final int oldNodeArraySize = ptNodeArray.mCachedSize;
    502                 final boolean changed =
    503                         computeActualPtNodeArraySize(ptNodeArray, dict, formatOptions);
    504                 final int newNodeArraySize = ptNodeArray.mCachedSize;
    505                 if (oldNodeArraySize < newNodeArraySize) {
    506                     throw new RuntimeException("Increased size ?!");
    507                 }
    508                 ptNodeArrayStartOffset += newNodeArraySize;
    509                 changesDone |= changed;
    510             }
    511             updatePtNodeArraysCachedAddresses(flatNodes);
    512             ++passes;
    513             if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
    514         } while (changesDone);
    515 
    516         if (formatOptions.mSupportsDynamicUpdate) {
    517             computeParentAddresses(flatNodes);
    518         }
    519         final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1);
    520         MakedictLog.i("Compression complete in " + passes + " passes.");
    521         MakedictLog.i("After address compression : "
    522                 + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize));
    523 
    524         return flatNodes;
    525     }
    526 
    527     /**
    528      * Sanity-checking method.
    529      *
    530      * This method checks a list of PtNode arrays for juxtaposition, that is, it will do
    531      * nothing if each node array's cached address is actually the previous node array's address
    532      * plus the previous node's size.
    533      * If this is not the case, it will throw an exception.
    534      *
    535      * @param arrays the list of node arrays to check
    536      */
    537     /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) {
    538         int offset = 0;
    539         int index = 0;
    540         for (final PtNodeArray ptNodeArray : arrays) {
    541             // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter
    542             // which we use.
    543             if (ptNodeArray.mCachedAddressAfterUpdate != offset) {
    544                 throw new RuntimeException("Wrong address for node " + index
    545                         + " : expected " + offset + ", got " +
    546                         ptNodeArray.mCachedAddressAfterUpdate);
    547             }
    548             ++index;
    549             offset += ptNodeArray.mCachedSize;
    550         }
    551     }
    552 
    553     /**
    554      * Helper method to write a children position to a file.
    555      *
    556      * @param buffer the buffer to write to.
    557      * @param index the index in the buffer to write the address to.
    558      * @param position the position to write.
    559      * @return the size in bytes the address actually took.
    560      */
    561     /* package */ static int writeChildrenPosition(final byte[] buffer, int index,
    562             final int position) {
    563         switch (getByteSize(position)) {
    564         case 1:
    565             buffer[index++] = (byte)position;
    566             return 1;
    567         case 2:
    568             buffer[index++] = (byte)(0xFF & (position >> 8));
    569             buffer[index++] = (byte)(0xFF & position);
    570             return 2;
    571         case 3:
    572             buffer[index++] = (byte)(0xFF & (position >> 16));
    573             buffer[index++] = (byte)(0xFF & (position >> 8));
    574             buffer[index++] = (byte)(0xFF & position);
    575             return 3;
    576         case 0:
    577             return 0;
    578         default:
    579             throw new RuntimeException("Position " + position + " has a strange size");
    580         }
    581     }
    582 
    583     /**
    584      * Helper method to write a signed children position to a file.
    585      *
    586      * @param buffer the buffer to write to.
    587      * @param index the index in the buffer to write the address to.
    588      * @param position the position to write.
    589      * @return the size in bytes the address actually took.
    590      */
    591     /* package */ static int writeSignedChildrenPosition(final byte[] buffer, int index,
    592             final int position) {
    593         if (!BinaryDictIOUtils.hasChildrenAddress(position)) {
    594             buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
    595         } else {
    596             final int absPosition = Math.abs(position);
    597             buffer[index++] =
    598                     (byte)((position < 0 ? FormatSpec.MSB8 : 0) | (0xFF & (absPosition >> 16)));
    599             buffer[index++] = (byte)(0xFF & (absPosition >> 8));
    600             buffer[index++] = (byte)(0xFF & absPosition);
    601         }
    602         return 3;
    603     }
    604 
    605     /**
    606      * Makes the flag value for a PtNode.
    607      *
    608      * @param hasMultipleChars whether the PtNode has multiple chars.
    609      * @param isTerminal whether the PtNode is terminal.
    610      * @param childrenAddressSize the size of a children address.
    611      * @param hasShortcuts whether the PtNode has shortcuts.
    612      * @param hasBigrams whether the PtNode has bigrams.
    613      * @param isNotAWord whether the PtNode is not a word.
    614      * @param isBlackListEntry whether the PtNode is a blacklist entry.
    615      * @param formatOptions file format options.
    616      * @return the flags
    617      */
    618     static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal,
    619             final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
    620             final boolean isNotAWord, final boolean isBlackListEntry,
    621             final FormatOptions formatOptions) {
    622         byte flags = 0;
    623         if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
    624         if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
    625         if (formatOptions.mSupportsDynamicUpdate) {
    626             flags |= FormatSpec.FLAG_IS_NOT_MOVED;
    627         } else if (true) {
    628             switch (childrenAddressSize) {
    629                 case 1:
    630                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE;
    631                     break;
    632                 case 2:
    633                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES;
    634                     break;
    635                 case 3:
    636                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES;
    637                     break;
    638                 case 0:
    639                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS;
    640                     break;
    641                 default:
    642                     throw new RuntimeException("Node with a strange address");
    643             }
    644         }
    645         if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
    646         if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS;
    647         if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
    648         if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED;
    649         return flags;
    650     }
    651 
    652     /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset,
    653             final FormatOptions formatOptions) {
    654         return (byte) makePtNodeFlags(node.mChars.length > 1, node.mFrequency >= 0,
    655                 getByteSize(childrenOffset),
    656                 node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
    657                 node.mBigrams != null, node.mIsNotAWord, node.mIsBlacklistEntry, formatOptions);
    658     }
    659 
    660     /**
    661      * Makes the flag value for a bigram.
    662      *
    663      * @param more whether there are more bigrams after this one.
    664      * @param offset the offset of the bigram.
    665      * @param bigramFrequency the frequency of the bigram, 0..255.
    666      * @param unigramFrequency the unigram frequency of the same word, 0..255.
    667      * @param word the second bigram, for debugging purposes
    668      * @return the flags
    669      */
    670     /* package */ static final int makeBigramFlags(final boolean more, final int offset,
    671             int bigramFrequency, final int unigramFrequency, final String word) {
    672         int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0)
    673                 + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0);
    674         switch (getByteSize(offset)) {
    675         case 1:
    676             bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE;
    677             break;
    678         case 2:
    679             bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES;
    680             break;
    681         case 3:
    682             bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES;
    683             break;
    684         default:
    685             throw new RuntimeException("Strange offset size");
    686         }
    687         if (unigramFrequency > bigramFrequency) {
    688             MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word
    689                     + "\". Bigram freq is " + bigramFrequency + ", unigram freq for "
    690                     + word + " is " + unigramFrequency);
    691             bigramFrequency = unigramFrequency;
    692         }
    693         // We compute the difference between 255 (which means probability = 1) and the
    694         // unigram score. We split this into a number of discrete steps.
    695         // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15
    696         // represents an increase of 16 steps: a value of 15 will be interpreted as the median
    697         // value of the 16th step. In all justice, if the bigram frequency is low enough to be
    698         // rounded below the first step (which means it is less than half a step higher than the
    699         // unigram frequency) then the unigram frequency itself is the best approximation of the
    700         // bigram freq that we could possibly supply, hence we should *not* include this bigram
    701         // in the file at all.
    702         // until this is done, we'll write 0 and slightly overestimate this case.
    703         // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step
    704         // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to
    705         // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the
    706         // step size. Then we compute the start of the first step (the one where value 0 starts)
    707         // by adding half-a-step to the unigramFrequency. From there, we compute the integer
    708         // number of steps to the bigramFrequency. One last thing: we want our steps to include
    709         // their lower bound and exclude their higher bound so we need to have the first step
    710         // start at exactly 1 unit higher than floor(unigramFreq + half a step).
    711         // Note : to reconstruct the score, the dictionary reader will need to divide
    712         // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step,
    713         // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best
    714         // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the
    715         // step pointed by the discretized frequency.
    716         final float stepSize =
    717                 (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
    718                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
    719         final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f);
    720         final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize);
    721         // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1
    722         // here. The best approximation would be the unigram freq itself, so we should not
    723         // include this bigram in the dictionary. For now, register as 0, and live with the
    724         // small over-estimation that we get in this case. TODO: actually remove this bigram
    725         // if discretizedFrequency < 0.
    726         final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
    727         bigramFlags += finalBigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
    728         return bigramFlags;
    729     }
    730 
    731     /**
    732      * Makes the 2-byte value for options flags.
    733      */
    734     private static final int makeOptionsValue(final FusionDictionary dictionary,
    735             final FormatOptions formatOptions) {
    736         final DictionaryOptions options = dictionary.mOptions;
    737         final boolean hasBigrams = dictionary.hasBigrams();
    738         return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0)
    739                 + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0)
    740                 + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0)
    741                 + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0);
    742     }
    743 
    744     /**
    745      * Makes the flag value for a shortcut.
    746      *
    747      * @param more whether there are more attributes after this one.
    748      * @param frequency the frequency of the attribute, 0..15
    749      * @return the flags
    750      */
    751     static final int makeShortcutFlags(final boolean more, final int frequency) {
    752         return (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0)
    753                 + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY);
    754     }
    755 
    756     /* package */ static final int writeParentAddress(final byte[] buffer, final int index,
    757             final int address, final FormatOptions formatOptions) {
    758         if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) {
    759             if (address == FormatSpec.NO_PARENT_ADDRESS) {
    760                 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
    761             } else {
    762                 final int absAddress = Math.abs(address);
    763                 assert(absAddress <= FormatSpec.SINT24_MAX);
    764                 buffer[index] = (byte)((address < 0 ? FormatSpec.MSB8 : 0)
    765                         | ((absAddress >> 16) & 0xFF));
    766                 buffer[index + 1] = (byte)((absAddress >> 8) & 0xFF);
    767                 buffer[index + 2] = (byte)(absAddress & 0xFF);
    768             }
    769             return index + 3;
    770         } else {
    771             return index;
    772         }
    773     }
    774 
    775     /* package */ static final int getChildrenPosition(final PtNode ptNode,
    776             final FormatOptions formatOptions) {
    777         int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate
    778                 + getNodeHeaderSize(ptNode, formatOptions);
    779         if (ptNode.isTerminal()) {
    780             // A terminal node has either the terminal id or the frequency.
    781             // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children
    782             // position.
    783             if (formatOptions.mHasTerminalId) {
    784                 positionOfChildrenPosField += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
    785             } else {
    786                 positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE;
    787             }
    788         }
    789         return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS
    790                 : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField;
    791     }
    792 
    793     /**
    794      * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached.
    795      *
    796      * @param dict the dictionary the node array is a part of (for relative offsets).
    797      * @param dictEncoder the dictionary encoder.
    798      * @param ptNodeArray the node array to write.
    799      * @param formatOptions file format options.
    800      */
    801     @SuppressWarnings("unused")
    802     /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict,
    803             final DictEncoder dictEncoder, final PtNodeArray ptNodeArray,
    804             final FormatOptions formatOptions) {
    805         // TODO: Make the code in common with BinaryDictIOUtils#writePtNode
    806         dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate);
    807 
    808         final int ptNodeCount = ptNodeArray.mData.size();
    809         dictEncoder.writePtNodeCount(ptNodeCount);
    810         final int parentPosition =
    811                 (ptNodeArray.mCachedParentAddress == FormatSpec.NO_PARENT_ADDRESS)
    812                 ? FormatSpec.NO_PARENT_ADDRESS
    813                 : ptNodeArray.mCachedParentAddress + ptNodeArray.mCachedAddressAfterUpdate;
    814         for (int i = 0; i < ptNodeCount; ++i) {
    815             final PtNode ptNode = ptNodeArray.mData.get(i);
    816             if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) {
    817                 throw new RuntimeException("Bug: write index is not the same as the cached address "
    818                         + "of the node : " + dictEncoder.getPosition() + " <> "
    819                         + ptNode.mCachedAddressAfterUpdate);
    820             }
    821             // Sanity checks.
    822             if (DBG && ptNode.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
    823                 throw new RuntimeException("A node has a frequency > "
    824                         + FormatSpec.MAX_TERMINAL_FREQUENCY
    825                         + " : " + ptNode.mFrequency);
    826             }
    827             dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
    828         }
    829         if (formatOptions.mSupportsDynamicUpdate) {
    830             dictEncoder.writeForwardLinkAddress(FormatSpec.NO_FORWARD_LINK_ADDRESS);
    831         }
    832         if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate
    833                 + ptNodeArray.mCachedSize) {
    834             throw new RuntimeException("Not the same size : written "
    835                      + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate)
    836                      + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes");
    837         }
    838     }
    839 
    840     /**
    841      * Dumps a collection of useful statistics about a list of PtNode arrays.
    842      *
    843      * This prints purely informative stuff, like the total estimated file size, the
    844      * number of PtNode arrays, of PtNodes, the repartition of each address size, etc
    845      *
    846      * @param ptNodeArrays the list of PtNode arrays.
    847      */
    848     /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) {
    849         int firstTerminalAddress = Integer.MAX_VALUE;
    850         int lastTerminalAddress = Integer.MIN_VALUE;
    851         int size = 0;
    852         int ptNodes = 0;
    853         int maxNodes = 0;
    854         int maxRuns = 0;
    855         for (final PtNodeArray ptNodeArray : ptNodeArrays) {
    856             if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size();
    857             for (final PtNode ptNode : ptNodeArray.mData) {
    858                 ++ptNodes;
    859                 if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length;
    860                 if (ptNode.mFrequency >= 0) {
    861                     if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress)
    862                         firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
    863                     if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress)
    864                         lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
    865                 }
    866             }
    867             if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) {
    868                 size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize;
    869             }
    870         }
    871         final int[] ptNodeCounts = new int[maxNodes + 1];
    872         final int[] runCounts = new int[maxRuns + 1];
    873         for (final PtNodeArray ptNodeArray : ptNodeArrays) {
    874             ++ptNodeCounts[ptNodeArray.mData.size()];
    875             for (final PtNode ptNode : ptNodeArray.mData) {
    876                 ++runCounts[ptNode.mChars.length];
    877             }
    878         }
    879 
    880         MakedictLog.i("Statistics:\n"
    881                 + "  total file size " + size + "\n"
    882                 + "  " + ptNodeArrays.size() + " node arrays\n"
    883                 + "  " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size())
    884                         + " PtNodes per node)\n"
    885                 + "  first terminal at " + firstTerminalAddress + "\n"
    886                 + "  last terminal at " + lastTerminalAddress + "\n"
    887                 + "  PtNode stats : max = " + maxNodes);
    888         for (int i = 0; i < ptNodeCounts.length; ++i) {
    889             MakedictLog.i("    " + i + " : " + ptNodeCounts[i]);
    890         }
    891         MakedictLog.i("  Character run stats : max = " + maxRuns);
    892         for (int i = 0; i < runCounts.length; ++i) {
    893             MakedictLog.i("    " + i + " : " + runCounts[i]);
    894         }
    895     }
    896 
    897     /**
    898      * Writes a file header to an output stream.
    899      *
    900      * @param destination the stream to write the file header to.
    901      * @param dict the dictionary to write.
    902      * @param formatOptions file format options.
    903      * @return the size of the header.
    904      */
    905     /* package */ static int writeDictionaryHeader(final OutputStream destination,
    906             final FusionDictionary dict, final FormatOptions formatOptions)
    907                     throws IOException, UnsupportedFormatException {
    908         final int version = formatOptions.mVersion;
    909         if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
    910                 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) {
    911             throw new UnsupportedFormatException("Requested file format version " + version
    912                     + ", but this implementation only supports versions "
    913                     + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through "
    914                     + FormatSpec.MAXIMUM_SUPPORTED_VERSION);
    915         }
    916 
    917         ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256);
    918 
    919         // The magic number in big-endian order.
    920         // Magic number for all versions.
    921         headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24)));
    922         headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16)));
    923         headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8)));
    924         headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER));
    925         // Dictionary version.
    926         headerBuffer.write((byte) (0xFF & (version >> 8)));
    927         headerBuffer.write((byte) (0xFF & version));
    928 
    929         // Options flags
    930         final int options = makeOptionsValue(dict, formatOptions);
    931         headerBuffer.write((byte) (0xFF & (options >> 8)));
    932         headerBuffer.write((byte) (0xFF & options));
    933         final int headerSizeOffset = headerBuffer.size();
    934         // Placeholder to be written later with header size.
    935         for (int i = 0; i < 4; ++i) {
    936             headerBuffer.write(0);
    937         }
    938         // Write out the options.
    939         for (final String key : dict.mOptions.mAttributes.keySet()) {
    940             final String value = dict.mOptions.mAttributes.get(key);
    941             CharEncoding.writeString(headerBuffer, key);
    942             CharEncoding.writeString(headerBuffer, value);
    943         }
    944         final int size = headerBuffer.size();
    945         final byte[] bytes = headerBuffer.toByteArray();
    946         // Write out the header size.
    947         bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24));
    948         bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16));
    949         bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8));
    950         bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0));
    951         destination.write(bytes);
    952 
    953         headerBuffer.close();
    954         return size;
    955     }
    956 }
    957