Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.Constants;
     21 import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding;
     22 import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
     23 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
     24 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     25 import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
     26 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     27 
     28 import java.io.File;
     29 import java.io.FileInputStream;
     30 import java.io.FileNotFoundException;
     31 import java.io.IOException;
     32 import java.io.OutputStream;
     33 import java.nio.channels.FileChannel;
     34 import java.util.ArrayList;
     35 import java.util.Arrays;
     36 import java.util.Iterator;
     37 import java.util.Map;
     38 import java.util.Stack;
     39 
     40 public final class BinaryDictIOUtils {
     41     private static final boolean DBG = false;
     42     private static final int MSB24 = 0x800000;
     43     private static final int SINT24_MAX = 0x7FFFFF;
     44     private static final int MAX_JUMPS = 10000;
     45 
     46     private BinaryDictIOUtils() {
     47         // This utility class is not publicly instantiable.
     48     }
     49 
     50     private static final class Position {
     51         public static final int NOT_READ_GROUPCOUNT = -1;
     52 
     53         public int mAddress;
     54         public int mNumOfCharGroup;
     55         public int mPosition;
     56         public int mLength;
     57 
     58         public Position(int address, int length) {
     59             mAddress = address;
     60             mLength = length;
     61             mNumOfCharGroup = NOT_READ_GROUPCOUNT;
     62         }
     63     }
     64 
     65     /**
     66      * Tours all node without recursive call.
     67      */
     68     private static void readUnigramsAndBigramsBinaryInner(
     69             final FusionDictionaryBufferInterface buffer, final int headerSize,
     70             final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
     71             final Map<Integer, ArrayList<PendingAttribute>> bigrams,
     72             final FormatOptions formatOptions) {
     73         int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
     74 
     75         Stack<Position> stack = new Stack<Position>();
     76         int index = 0;
     77 
     78         Position initPos = new Position(headerSize, 0);
     79         stack.push(initPos);
     80 
     81         while (!stack.empty()) {
     82             Position p = stack.peek();
     83 
     84             if (DBG) {
     85                 MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" +
     86                         p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength);
     87             }
     88 
     89             if (buffer.position() != p.mAddress) buffer.position(p.mAddress);
     90             if (index != p.mLength) index = p.mLength;
     91 
     92             if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) {
     93                 p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer);
     94                 p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup);
     95                 p.mPosition = 0;
     96             }
     97             if (p.mNumOfCharGroup == 0) {
     98                 stack.pop();
     99                 continue;
    100             }
    101             CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer,
    102                     p.mAddress - headerSize, formatOptions);
    103             for (int i = 0; i < info.mCharacters.length; ++i) {
    104                 pushedChars[index++] = info.mCharacters[i];
    105             }
    106             p.mPosition++;
    107 
    108             final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags,
    109                     formatOptions);
    110             final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags,
    111                     formatOptions);
    112             if (!isMovedGroup && !isDeletedGroup
    113                     && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
    114                 words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
    115                 frequencies.put(info.mOriginalAddress, info.mFrequency);
    116                 if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
    117             }
    118 
    119             if (p.mPosition == p.mNumOfCharGroup) {
    120                 if (formatOptions.mSupportsDynamicUpdate) {
    121                     final int forwardLinkAddress = buffer.readUnsignedInt24();
    122                     if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
    123                         // the node has a forward link.
    124                         p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
    125                         p.mAddress = forwardLinkAddress;
    126                     } else {
    127                         stack.pop();
    128                     }
    129                 } else {
    130                     stack.pop();
    131                 }
    132             } else {
    133                 // the node has more groups.
    134                 p.mAddress = buffer.position();
    135             }
    136 
    137             if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
    138                 Position childrenPos = new Position(info.mChildrenAddress + headerSize, index);
    139                 stack.push(childrenPos);
    140             }
    141         }
    142     }
    143 
    144     /**
    145      * Reads unigrams and bigrams from the binary file.
    146      * Doesn't make the memory representation of the dictionary.
    147      *
    148      * @param buffer the buffer to read.
    149      * @param words the map to store the address as a key and the word as a value.
    150      * @param frequencies the map to store the address as a key and the frequency as a value.
    151      * @param bigrams the map to store the address as a key and the list of address as a value.
    152      * @throws IOException
    153      * @throws UnsupportedFormatException
    154      */
    155     public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer,
    156             final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
    157             final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
    158             UnsupportedFormatException {
    159         // Read header
    160         final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
    161         readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
    162                 header.mFormatOptions);
    163     }
    164 
    165     /**
    166      * Gets the address of the last CharGroup of the exact matching word in the dictionary.
    167      * If no match is found, returns NOT_VALID_WORD.
    168      *
    169      * @param buffer the buffer to read.
    170      * @param word the word we search for.
    171      * @return the address of the terminal node.
    172      * @throws IOException
    173      * @throws UnsupportedFormatException
    174      */
    175     @UsedForTesting
    176     public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
    177             final String word) throws IOException, UnsupportedFormatException {
    178         if (word == null) return FormatSpec.NOT_VALID_WORD;
    179         if (buffer.position() != 0) buffer.position(0);
    180 
    181         final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
    182         int wordPos = 0;
    183         final int wordLen = word.codePointCount(0, word.length());
    184         for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
    185             if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
    186 
    187             do {
    188                 final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
    189                 boolean foundNextCharGroup = false;
    190                 for (int i = 0; i < charGroupCount; ++i) {
    191                     final int charGroupPos = buffer.position();
    192                     final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
    193                             buffer.position(), header.mFormatOptions);
    194                     final boolean isMovedGroup =
    195                             BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
    196                                     header.mFormatOptions);
    197                     final boolean isDeletedGroup =
    198                             BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags,
    199                                     header.mFormatOptions);
    200                     if (isMovedGroup) continue;
    201                     boolean same = true;
    202                     for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
    203                             p < currentInfo.mCharacters.length;
    204                             ++p, j = word.offsetByCodePoints(j, 1)) {
    205                         if (wordPos + p >= wordLen
    206                                 || word.codePointAt(j) != currentInfo.mCharacters[p]) {
    207                             same = false;
    208                             break;
    209                         }
    210                     }
    211 
    212                     if (same) {
    213                         // found the group matches the word.
    214                         if (wordPos + currentInfo.mCharacters.length == wordLen) {
    215                             if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL
    216                                     || isDeletedGroup) {
    217                                 return FormatSpec.NOT_VALID_WORD;
    218                             } else {
    219                                 return charGroupPos;
    220                             }
    221                         }
    222                         wordPos += currentInfo.mCharacters.length;
    223                         if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
    224                             return FormatSpec.NOT_VALID_WORD;
    225                         }
    226                         foundNextCharGroup = true;
    227                         buffer.position(currentInfo.mChildrenAddress);
    228                         break;
    229                     }
    230                 }
    231 
    232                 // If we found the next char group, it is under the file pointer.
    233                 // But if not, we are at the end of this node so we expect to have
    234                 // a forward link address that we need to consult and possibly resume
    235                 // search on the next node in the linked list.
    236                 if (foundNextCharGroup) break;
    237                 if (!header.mFormatOptions.mSupportsDynamicUpdate) {
    238                     return FormatSpec.NOT_VALID_WORD;
    239                 }
    240 
    241                 final int forwardLinkAddress = buffer.readUnsignedInt24();
    242                 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
    243                     return FormatSpec.NOT_VALID_WORD;
    244                 }
    245                 buffer.position(forwardLinkAddress);
    246             } while(true);
    247         }
    248         return FormatSpec.NOT_VALID_WORD;
    249     }
    250 
    251     private static int markAsDeleted(final int flags) {
    252         return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED;
    253     }
    254 
    255     /**
    256      * Delete the word from the binary file.
    257      *
    258      * @param buffer the buffer to write.
    259      * @param word the word we delete
    260      * @throws IOException
    261      * @throws UnsupportedFormatException
    262      */
    263     @UsedForTesting
    264     public static void deleteWord(final FusionDictionaryBufferInterface buffer,
    265             final String word) throws IOException, UnsupportedFormatException {
    266         buffer.position(0);
    267         final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
    268         final int wordPosition = getTerminalPosition(buffer, word);
    269         if (wordPosition == FormatSpec.NOT_VALID_WORD) return;
    270 
    271         buffer.position(wordPosition);
    272         final int flags = buffer.readUnsignedByte();
    273         buffer.position(wordPosition);
    274         buffer.put((byte)markAsDeleted(flags));
    275     }
    276 
    277     /**
    278      * @return the size written, in bytes. Always 3 bytes.
    279      */
    280     private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer,
    281             final int value) {
    282         final int absValue = Math.abs(value);
    283         buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
    284         buffer.put((byte)((absValue >> 8) & 0xFF));
    285         buffer.put((byte)(absValue & 0xFF));
    286         return 3;
    287     }
    288 
    289     /**
    290      * @return the size written, in bytes. Always 3 bytes.
    291      */
    292     private static int writeSInt24ToStream(final OutputStream destination, final int value)
    293             throws IOException {
    294         final int absValue = Math.abs(value);
    295         destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
    296         destination.write((byte)((absValue >> 8) & 0xFF));
    297         destination.write((byte)(absValue & 0xFF));
    298         return 3;
    299     }
    300 
    301     /**
    302      * @return the size written, in bytes. 1, 2, or 3 bytes.
    303      */
    304     private static int writeVariableAddress(final OutputStream destination, final int value)
    305             throws IOException {
    306         switch (BinaryDictInputOutput.getByteSize(value)) {
    307         case 1:
    308             destination.write((byte)value);
    309             break;
    310         case 2:
    311             destination.write((byte)(0xFF & (value >> 8)));
    312             destination.write((byte)(0xFF & value));
    313             break;
    314         case 3:
    315             destination.write((byte)(0xFF & (value >> 16)));
    316             destination.write((byte)(0xFF & (value >> 8)));
    317             destination.write((byte)(0xFF & value));
    318             break;
    319         }
    320         return BinaryDictInputOutput.getByteSize(value);
    321     }
    322 
    323     /**
    324      * Update a parent address in a CharGroup that is referred to by groupOriginAddress.
    325      *
    326      * @param buffer the buffer to write.
    327      * @param groupOriginAddress the address of the group.
    328      * @param newParentAddress the absolute address of the parent.
    329      * @param formatOptions file format options.
    330      */
    331     public static void updateParentAddress(final FusionDictionaryBufferInterface buffer,
    332             final int groupOriginAddress, final int newParentAddress,
    333             final FormatOptions formatOptions) {
    334         final int originalPosition = buffer.position();
    335         buffer.position(groupOriginAddress);
    336         if (!formatOptions.mSupportsDynamicUpdate) {
    337             throw new RuntimeException("this file format does not support parent addresses");
    338         }
    339         final int flags = buffer.readUnsignedByte();
    340         if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) {
    341             // if the group is moved, the parent address is stored in the destination group.
    342             // We are guaranteed to process the destination group later, so there is no need to
    343             // update anything here.
    344             buffer.position(originalPosition);
    345             return;
    346         }
    347         if (DBG) {
    348             MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress);
    349         }
    350         final int parentOffset = newParentAddress - groupOriginAddress;
    351         writeSInt24ToBuffer(buffer, parentOffset);
    352         buffer.position(originalPosition);
    353     }
    354 
    355     private static void skipCharGroup(final FusionDictionaryBufferInterface buffer,
    356             final FormatOptions formatOptions) {
    357         final int flags = buffer.readUnsignedByte();
    358         BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
    359         skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
    360         BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions);
    361         if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
    362         if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
    363             final int shortcutsSize = buffer.readUnsignedShort();
    364             buffer.position(buffer.position() + shortcutsSize
    365                     - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE);
    366         }
    367         if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) {
    368             int bigramCount = 0;
    369             while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
    370                 final int bigramFlags = buffer.readUnsignedByte();
    371                 switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
    372                     case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
    373                         buffer.readUnsignedByte();
    374                         break;
    375                     case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
    376                         buffer.readUnsignedShort();
    377                         break;
    378                     case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
    379                         buffer.readUnsignedInt24();
    380                         break;
    381                 }
    382                 if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break;
    383             }
    384             if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
    385                 throw new RuntimeException("Too many bigrams in a group.");
    386             }
    387         }
    388     }
    389 
    390     /**
    391      * Update parent addresses in a Node that is referred to by nodeOriginAddress.
    392      *
    393      * @param buffer the buffer to be modified.
    394      * @param nodeOriginAddress the address of a modified Node.
    395      * @param newParentAddress the address to be written.
    396      * @param formatOptions file format options.
    397      */
    398     public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer,
    399             final int nodeOriginAddress, final int newParentAddress,
    400             final FormatOptions formatOptions) {
    401         final int originalPosition = buffer.position();
    402         buffer.position(nodeOriginAddress);
    403         do {
    404             final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
    405             for (int i = 0; i < count; ++i) {
    406                 updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions);
    407                 skipCharGroup(buffer, formatOptions);
    408             }
    409             final int forwardLinkAddress = buffer.readUnsignedInt24();
    410             buffer.position(forwardLinkAddress);
    411         } while (formatOptions.mSupportsDynamicUpdate
    412                 && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
    413         buffer.position(originalPosition);
    414     }
    415 
    416     private static void skipString(final FusionDictionaryBufferInterface buffer,
    417             final boolean hasMultipleChars) {
    418         if (hasMultipleChars) {
    419             int character = CharEncoding.readChar(buffer);
    420             while (character != FormatSpec.INVALID_CHARACTER) {
    421                 character = CharEncoding.readChar(buffer);
    422             }
    423         } else {
    424             CharEncoding.readChar(buffer);
    425         }
    426     }
    427 
    428     /**
    429      * Write a string to a stream.
    430      *
    431      * @param destination the stream to write.
    432      * @param word the string to be written.
    433      * @return the size written, in bytes.
    434      * @throws IOException
    435      */
    436     private static int writeString(final OutputStream destination, final String word)
    437             throws IOException {
    438         int size = 0;
    439         final int length = word.length();
    440         for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    441             final int codePoint = word.codePointAt(i);
    442             if (CharEncoding.getCharSize(codePoint) == 1) {
    443                 destination.write((byte)codePoint);
    444                 size++;
    445             } else {
    446                 destination.write((byte)(0xFF & (codePoint >> 16)));
    447                 destination.write((byte)(0xFF & (codePoint >> 8)));
    448                 destination.write((byte)(0xFF & codePoint));
    449                 size += 3;
    450             }
    451         }
    452         destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
    453         size += FormatSpec.GROUP_TERMINATOR_SIZE;
    454         return size;
    455     }
    456 
    457     /**
    458      * Update a children address in a CharGroup that is addressed by groupOriginAddress.
    459      *
    460      * @param buffer the buffer to write.
    461      * @param groupOriginAddress the address of the group.
    462      * @param newChildrenAddress the absolute address of the child.
    463      * @param formatOptions file format options.
    464      */
    465     public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer,
    466             final int groupOriginAddress, final int newChildrenAddress,
    467             final FormatOptions formatOptions) {
    468         final int originalPosition = buffer.position();
    469         buffer.position(groupOriginAddress);
    470         final int flags = buffer.readUnsignedByte();
    471         final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
    472         skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
    473         if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
    474         final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS
    475                 ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position();
    476         writeSInt24ToBuffer(buffer, childrenOffset);
    477         buffer.position(originalPosition);
    478     }
    479 
    480     /**
    481      * Write a char group to an output stream.
    482      * A char group is an in-memory representation of a node in trie.
    483      * A char group info is an on-disk representation of a node.
    484      *
    485      * @param destination the stream to write.
    486      * @param info the char group info to be written.
    487      * @return the size written, in bytes.
    488      */
    489     public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info)
    490             throws IOException {
    491         int size = FormatSpec.GROUP_FLAGS_SIZE;
    492         destination.write((byte)info.mFlags);
    493         final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ?
    494                 FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress;
    495         size += writeSInt24ToStream(destination, parentOffset);
    496 
    497         for (int i = 0; i < info.mCharacters.length; ++i) {
    498             if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) {
    499                 destination.write((byte)info.mCharacters[i]);
    500                 size++;
    501             } else {
    502                 size += writeSInt24ToStream(destination, info.mCharacters[i]);
    503             }
    504         }
    505         if (info.mCharacters.length > 1) {
    506             destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
    507             size++;
    508         }
    509 
    510         if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
    511             destination.write((byte)info.mFrequency);
    512             size++;
    513         }
    514 
    515         if (DBG) {
    516             MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size
    517                     + ", child=" + info.mChildrenAddress + ", characters ="
    518                     + new String(info.mCharacters, 0, info.mCharacters.length));
    519         }
    520         final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ?
    521                 0 : info.mChildrenAddress - (info.mOriginalAddress + size);
    522         writeSInt24ToStream(destination, childrenOffset);
    523         size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
    524 
    525         if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) {
    526             final int shortcutListSize =
    527                     BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
    528             destination.write((byte)(shortcutListSize >> 8));
    529             destination.write((byte)(shortcutListSize & 0xFF));
    530             size += 2;
    531             final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator();
    532             while (shortcutIterator.hasNext()) {
    533                 final WeightedString target = shortcutIterator.next();
    534                 destination.write((byte)BinaryDictInputOutput.makeShortcutFlags(
    535                         shortcutIterator.hasNext(), target.mFrequency));
    536                 size++;
    537                 size += writeString(destination, target.mWord);
    538             }
    539         }
    540 
    541         if (info.mBigrams != null) {
    542             // TODO: Consolidate this code with the code that computes the size of the bigram list
    543             //        in BinaryDictionaryInputOutput#computeActualNodeSize
    544             for (int i = 0; i < info.mBigrams.size(); ++i) {
    545 
    546                 final int bigramFrequency = info.mBigrams.get(i).mFrequency;
    547                 int bigramFlags = (i < info.mBigrams.size() - 1)
    548                         ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0;
    549                 size++;
    550                 final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress
    551                         + size);
    552                 bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0;
    553                 switch (BinaryDictInputOutput.getByteSize(bigramOffset)) {
    554                 case 1:
    555                     bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
    556                     break;
    557                 case 2:
    558                     bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
    559                     break;
    560                 case 3:
    561                     bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
    562                     break;
    563                 }
    564                 bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
    565                 destination.write((byte)bigramFlags);
    566                 size += writeVariableAddress(destination, Math.abs(bigramOffset));
    567             }
    568         }
    569         return size;
    570     }
    571 
    572     @SuppressWarnings("unused")
    573     private static void updateForwardLink(final FusionDictionaryBufferInterface buffer,
    574             final int nodeOriginAddress, final int newNodeAddress,
    575             final FormatOptions formatOptions) {
    576         buffer.position(nodeOriginAddress);
    577         int jumpCount = 0;
    578         while (jumpCount++ < MAX_JUMPS) {
    579             final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
    580             for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions);
    581             final int forwardLinkAddress = buffer.readUnsignedInt24();
    582             if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
    583                 buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    584                 writeSInt24ToBuffer(buffer, newNodeAddress);
    585                 return;
    586             }
    587             buffer.position(forwardLinkAddress);
    588         }
    589         if (DBG && jumpCount >= MAX_JUMPS) {
    590             throw new RuntimeException("too many jumps, probably a bug.");
    591         }
    592     }
    593 
    594     /**
    595      * Helper method to move a char group to the tail of the file.
    596      */
    597     private static int moveCharGroup(final OutputStream destination,
    598             final FusionDictionaryBufferInterface buffer, final CharGroupInfo info,
    599             final int nodeOriginAddress, final int oldGroupAddress,
    600             final FormatOptions formatOptions) throws IOException {
    601         updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions);
    602         buffer.position(oldGroupAddress);
    603         final int currentFlags = buffer.readUnsignedByte();
    604         buffer.position(oldGroupAddress);
    605         buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags
    606                 & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
    607         int size = FormatSpec.GROUP_FLAGS_SIZE;
    608         updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions);
    609         size += writeNode(destination, new CharGroupInfo[] { info });
    610         return size;
    611     }
    612 
    613     /**
    614      * Compute the size of the char group.
    615      */
    616     private static int computeGroupSize(final CharGroupInfo info,
    617             final FormatOptions formatOptions) {
    618         int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
    619                 + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters)
    620                 + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions);
    621         if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
    622             size += FormatSpec.GROUP_FREQUENCY_SIZE;
    623         }
    624         if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) {
    625             size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
    626         }
    627         if (info.mBigrams != null) {
    628             for (final PendingAttribute attr : info.mBigrams) {
    629                 size += FormatSpec.GROUP_FLAGS_SIZE;
    630                 size += BinaryDictInputOutput.getByteSize(attr.mAddress);
    631             }
    632         }
    633         return size;
    634     }
    635 
    636     /**
    637      * Write a node to the stream.
    638      *
    639      * @param destination the stream to write.
    640      * @param infos groups to be written.
    641      * @return the size written, in bytes.
    642      * @throws IOException
    643      */
    644     private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos)
    645             throws IOException {
    646         int size = BinaryDictInputOutput.getGroupCountSize(infos.length);
    647         switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) {
    648             case 1:
    649                 destination.write((byte)infos.length);
    650                 break;
    651             case 2:
    652                 destination.write((byte)(infos.length >> 8));
    653                 destination.write((byte)(infos.length & 0xFF));
    654                 break;
    655             default:
    656                 throw new RuntimeException("Invalid group count size.");
    657         }
    658         for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info);
    659         writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
    660         return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
    661     }
    662 
    663     /**
    664      * Move a group that is referred to by oldGroupOrigin to the tail of the file.
    665      * And set the children address to the byte after the group.
    666      *
    667      * @param nodeOrigin the address of the tail of the file.
    668      * @param characters
    669      * @param length
    670      * @param flags
    671      * @param frequency
    672      * @param parentAddress
    673      * @param shortcutTargets
    674      * @param bigrams
    675      * @param destination the stream representing the tail of the file.
    676      * @param buffer the buffer representing the (constant-size) body of the file.
    677      * @param oldNodeOrigin
    678      * @param oldGroupOrigin
    679      * @param formatOptions
    680      * @return the size written, in bytes.
    681      * @throws IOException
    682      */
    683     private static int moveGroup(final int nodeOrigin, final int[] characters, final int length,
    684             final int flags, final int frequency, final int parentAddress,
    685             final ArrayList<WeightedString> shortcutTargets,
    686             final ArrayList<PendingAttribute> bigrams, final OutputStream destination,
    687             final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin,
    688             final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException {
    689         int size = 0;
    690         final int newGroupOrigin = nodeOrigin + 1;
    691         final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length);
    692         final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */,
    693                 flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS,
    694                 shortcutTargets, bigrams);
    695         size = computeGroupSize(tmpInfo, formatOptions);
    696         final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size,
    697                 flags, writtenCharacters, frequency, parentAddress,
    698                 nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets,
    699                 bigrams);
    700         moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions);
    701         return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
    702     }
    703 
    704     /**
    705      * Insert a word into a binary dictionary.
    706      *
    707      * @param buffer
    708      * @param destination
    709      * @param word
    710      * @param frequency
    711      * @param bigramStrings
    712      * @param shortcuts
    713      * @throws IOException
    714      * @throws UnsupportedFormatException
    715      */
    716     // TODO: Support batch insertion.
    717     // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary.
    718     @UsedForTesting
    719     public static void insertWord(final FusionDictionaryBufferInterface buffer,
    720             final OutputStream destination, final String word, final int frequency,
    721             final ArrayList<WeightedString> bigramStrings,
    722             final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
    723             final boolean isBlackListEntry)
    724                     throws IOException, UnsupportedFormatException {
    725         final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>();
    726         if (bigramStrings != null) {
    727             for (final WeightedString bigram : bigramStrings) {
    728                 int position = getTerminalPosition(buffer, bigram.mWord);
    729                 if (position == FormatSpec.NOT_VALID_WORD) {
    730                     // TODO: figure out what is the correct thing to do here.
    731                 } else {
    732                     bigrams.add(new PendingAttribute(bigram.mFrequency, position));
    733                 }
    734             }
    735         }
    736 
    737         final boolean isTerminal = true;
    738         final boolean hasBigrams = !bigrams.isEmpty();
    739         final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty();
    740 
    741         // find the insert position of the word.
    742         if (buffer.position() != 0) buffer.position(0);
    743         final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
    744 
    745         int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position();
    746         final int[] codePoints = FusionDictionary.getCodePoints(word);
    747         final int wordLen = codePoints.length;
    748 
    749         for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
    750             if (wordPos >= wordLen) break;
    751             nodeOriginAddress = buffer.position();
    752             int nodeParentAddress = -1;
    753             final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
    754             boolean foundNextGroup = false;
    755 
    756             for (int i = 0; i < charGroupCount; ++i) {
    757                 address = buffer.position();
    758                 final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
    759                         buffer.position(), header.mFormatOptions);
    760                 final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
    761                         header.mFormatOptions);
    762                 if (isMovedGroup) continue;
    763                 nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS)
    764                         ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address;
    765                 boolean matched = true;
    766                 for (int p = 0; p < currentInfo.mCharacters.length; ++p) {
    767                     if (wordPos + p >= wordLen) {
    768                         /*
    769                          * splitting
    770                          * before
    771                          *  abcd - ef
    772                          *
    773                          * insert "abc"
    774                          *
    775                          * after
    776                          *  abc - d - ef
    777                          */
    778                         final int newNodeAddress = buffer.limit();
    779                         final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
    780                                 isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */,
    781                                 false /* isBlackListEntry */, header.mFormatOptions);
    782                         int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags,
    783                                 frequency, nodeParentAddress, shortcuts, bigrams, destination,
    784                                 buffer, nodeOriginAddress, address, header.mFormatOptions);
    785 
    786                         final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p,
    787                                 currentInfo.mCharacters.length);
    788                         if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
    789                             updateParentAddresses(buffer, currentInfo.mChildrenAddress,
    790                                     newNodeAddress + written + 1, header.mFormatOptions);
    791                         }
    792                         final CharGroupInfo newInfo2 = new CharGroupInfo(
    793                                 newNodeAddress + written + 1, -1 /* endAddress */,
    794                                 currentInfo.mFlags, characters2, currentInfo.mFrequency,
    795                                 newNodeAddress + 1, currentInfo.mChildrenAddress,
    796                                 currentInfo.mShortcutTargets, currentInfo.mBigrams);
    797                         writeNode(destination, new CharGroupInfo[] { newInfo2 });
    798                         return;
    799                     } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) {
    800                         if (p > 0) {
    801                             /*
    802                              * splitting
    803                              * before
    804                              *   ab - cd
    805                              *
    806                              * insert "ac"
    807                              *
    808                              * after
    809                              *   a - b - cd
    810                              *     |
    811                              *     - c
    812                              */
    813 
    814                             final int newNodeAddress = buffer.limit();
    815                             final int childrenAddress = currentInfo.mChildrenAddress;
    816 
    817                             // move prefix
    818                             final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
    819                                     false /* isTerminal */, 0 /* childrenAddressSize*/,
    820                                     false /* hasShortcut */, false /* hasBigrams */,
    821                                     false /* isNotAWord */, false /* isBlackListEntry */,
    822                                     header.mFormatOptions);
    823                             int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p,
    824                                     prefixFlags, -1 /* frequency */, nodeParentAddress, null, null,
    825                                     destination, buffer, nodeOriginAddress, address,
    826                                     header.mFormatOptions);
    827 
    828                             final int[] suffixCharacters = Arrays.copyOfRange(
    829                                     currentInfo.mCharacters, p, currentInfo.mCharacters.length);
    830                             if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
    831                                 updateParentAddresses(buffer, currentInfo.mChildrenAddress,
    832                                         newNodeAddress + written + 1, header.mFormatOptions);
    833                             }
    834                             final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags(
    835                                     suffixCharacters.length > 1,
    836                                     (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0,
    837                                     0 /* childrenAddressSize */,
    838                                     (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)
    839                                             != 0,
    840                                     (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0,
    841                                     isNotAWord, isBlackListEntry, header.mFormatOptions);
    842                             final CharGroupInfo suffixInfo = new CharGroupInfo(
    843                                     newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags,
    844                                     suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1,
    845                                     currentInfo.mChildrenAddress, currentInfo.mShortcutTargets,
    846                                     currentInfo.mBigrams);
    847                             written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1;
    848 
    849                             final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p,
    850                                     codePoints.length);
    851                             final int flags = BinaryDictInputOutput.makeCharGroupFlags(
    852                                     newCharacters.length > 1, isTerminal,
    853                                     0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
    854                                     isNotAWord, isBlackListEntry, header.mFormatOptions);
    855                             final CharGroupInfo newInfo = new CharGroupInfo(
    856                                     newNodeAddress + written, -1 /* endAddress */, flags,
    857                                     newCharacters, frequency, newNodeAddress + 1,
    858                                     FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
    859                             writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo });
    860                             return;
    861                         }
    862                         matched = false;
    863                         break;
    864                     }
    865                 }
    866 
    867                 if (matched) {
    868                     if (wordPos + currentInfo.mCharacters.length == wordLen) {
    869                         // the word exists in the dictionary.
    870                         // only update group.
    871                         final int newNodeAddress = buffer.limit();
    872                         final boolean hasMultipleChars = currentInfo.mCharacters.length > 1;
    873                         final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
    874                                 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
    875                                 isNotAWord, isBlackListEntry, header.mFormatOptions);
    876                         final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
    877                                 -1 /* endAddress */, flags, currentInfo.mCharacters, frequency,
    878                                 nodeParentAddress, currentInfo.mChildrenAddress, shortcuts,
    879                                 bigrams);
    880                         moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address,
    881                                 header.mFormatOptions);
    882                         return;
    883                     }
    884                     wordPos += currentInfo.mCharacters.length;
    885                     if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
    886                         /*
    887                          * found the prefix of the word.
    888                          * make new node and link to the node from this group.
    889                          *
    890                          * before
    891                          * ab - cd
    892                          *
    893                          * insert "abcde"
    894                          *
    895                          * after
    896                          * ab - cd - e
    897                          */
    898                         final int newNodeAddress = buffer.limit();
    899                         updateChildrenAddress(buffer, address, newNodeAddress,
    900                                 header.mFormatOptions);
    901                         final int newGroupAddress = newNodeAddress + 1;
    902                         final boolean hasMultipleChars = (wordLen - wordPos) > 1;
    903                         final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
    904                                 isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
    905                                 isNotAWord, isBlackListEntry, header.mFormatOptions);
    906                         final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
    907                         final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags,
    908                                 characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS,
    909                                 shortcuts, bigrams);
    910                         writeNode(destination, new CharGroupInfo[] { newInfo });
    911                         return;
    912                     }
    913                     buffer.position(currentInfo.mChildrenAddress);
    914                     foundNextGroup = true;
    915                     break;
    916                 }
    917             }
    918 
    919             if (foundNextGroup) continue;
    920 
    921             // reached the end of the array.
    922             final int linkAddressPosition = buffer.position();
    923             int nextLink = buffer.readUnsignedInt24();
    924             if ((nextLink & MSB24) != 0) {
    925                 nextLink = -(nextLink & SINT24_MAX);
    926             }
    927             if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
    928                 /*
    929                  * expand this node.
    930                  *
    931                  * before
    932                  * ab - cd
    933                  *
    934                  * insert "abef"
    935                  *
    936                  * after
    937                  * ab - cd
    938                  *    |
    939                  *    - ef
    940                  */
    941 
    942                 // change the forward link address.
    943                 final int newNodeAddress = buffer.limit();
    944                 buffer.position(linkAddressPosition);
    945                 writeSInt24ToBuffer(buffer, newNodeAddress);
    946 
    947                 final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
    948                 final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1,
    949                         isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
    950                         isNotAWord, isBlackListEntry, header.mFormatOptions);
    951                 final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
    952                         -1 /* endAddress */, flags, characters, frequency, nodeParentAddress,
    953                         FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
    954                 writeNode(destination, new CharGroupInfo[]{ newInfo });
    955                 return;
    956             } else {
    957                 depth--;
    958                 buffer.position(nextLink);
    959             }
    960         }
    961     }
    962 
    963     /**
    964      * Find a word from the buffer.
    965      *
    966      * @param buffer the buffer representing the body of the dictionary file.
    967      * @param word the word searched
    968      * @return the found group
    969      * @throws IOException
    970      * @throws UnsupportedFormatException
    971      */
    972     @UsedForTesting
    973     public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer,
    974             final String word) throws IOException, UnsupportedFormatException {
    975         int position = getTerminalPosition(buffer, word);
    976         if (position != FormatSpec.NOT_VALID_WORD) {
    977             buffer.position(0);
    978             final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
    979             buffer.position(position);
    980             return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions);
    981         }
    982         return null;
    983     }
    984 
    985     /**
    986      * Convenience method to read the header of a binary file.
    987      *
    988      * This is quite resource intensive - don't call when performance is critical.
    989      *
    990      * @param file The file to read.
    991      * @param offset The offset in the file where to start reading the data.
    992      * @param length The length of the data file.
    993      */
    994     private static final int HEADER_READING_BUFFER_SIZE = 16384;
    995     public static FileHeader getDictionaryFileHeader(
    996             final File file, final long offset, final long length)
    997             throws FileNotFoundException, IOException, UnsupportedFormatException {
    998         final byte[] buffer = new byte[HEADER_READING_BUFFER_SIZE];
    999         final FileInputStream inStream = new FileInputStream(file);
   1000         try {
   1001             inStream.read(buffer);
   1002             final BinaryDictInputOutput.ByteBufferWrapper wrapper =
   1003                     new BinaryDictInputOutput.ByteBufferWrapper(inStream.getChannel().map(
   1004                             FileChannel.MapMode.READ_ONLY, offset, length));
   1005             return BinaryDictInputOutput.readHeader(wrapper);
   1006         } finally {
   1007             inStream.close();
   1008         }
   1009     }
   1010 
   1011     public static FileHeader getDictionaryFileHeaderOrNull(final File file, final long offset,
   1012             final long length) {
   1013         try {
   1014             final FileHeader header = getDictionaryFileHeader(file, offset, length);
   1015             return header;
   1016         } catch (UnsupportedFormatException e) {
   1017             return null;
   1018         } catch (IOException e) {
   1019             return null;
   1020         }
   1021     }
   1022 }
   1023