Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     21 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
     23 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
     24 
     25 import java.io.File;
     26 import java.io.FileNotFoundException;
     27 import java.io.FileOutputStream;
     28 import java.io.IOException;
     29 import java.io.OutputStream;
     30 import java.util.ArrayList;
     31 import java.util.Iterator;
     32 
     33 /**
     34  * An implementation of DictEncoder for version 2 binary dictionary.
     35  */
     36 @UsedForTesting
     37 public class Ver2DictEncoder implements DictEncoder {
     38 
     39     private final File mDictFile;
     40     private OutputStream mOutStream;
     41     private byte[] mBuffer;
     42     private int mPosition;
     43 
     44     @UsedForTesting
     45     public Ver2DictEncoder(final File dictFile) {
     46         mDictFile = dictFile;
     47         mOutStream = null;
     48         mBuffer = null;
     49     }
     50 
     51     // This constructor is used only by BinaryDictOffdeviceUtilsTests.
     52     // If you want to use this in the production code, you should consider keeping consistency of
     53     // the interface of Ver3DictDecoder by using factory.
     54     @UsedForTesting
     55     public Ver2DictEncoder(final OutputStream outStream) {
     56         mDictFile = null;
     57         mOutStream = outStream;
     58     }
     59 
     60     private void openStream() throws FileNotFoundException {
     61         mOutStream = new FileOutputStream(mDictFile);
     62     }
     63 
     64     private void close() throws IOException {
     65         if (mOutStream != null) {
     66             mOutStream.close();
     67             mOutStream = null;
     68         }
     69     }
     70 
     71     @Override
     72     public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
     73             throws IOException, UnsupportedFormatException {
     74         if (formatOptions.mVersion > FormatSpec.VERSION2) {
     75             throw new UnsupportedFormatException(
     76                     "The given format options has wrong version number : "
     77                     + formatOptions.mVersion);
     78         }
     79 
     80         if (mOutStream == null) {
     81             openStream();
     82         }
     83         BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions);
     84 
     85         // Addresses are limited to 3 bytes, but since addresses can be relative to each node
     86         // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
     87         // the order of the PtNode arrays becomes a quite complicated problem, because though the
     88         // dictionary itself does not have a size limit, each node array must still be within 16MB
     89         // of all its children and parents. As long as this is ensured, the dictionary file may
     90         // grow to any size.
     91 
     92         // Leave the choice of the optimal node order to the flattenTree function.
     93         MakedictLog.i("Flattening the tree...");
     94         ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
     95 
     96         MakedictLog.i("Computing addresses...");
     97         BinaryDictEncoderUtils.computeAddresses(dict, flatNodes);
     98         MakedictLog.i("Checking PtNode array...");
     99         if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
    100 
    101         // Create a buffer that matches the final dictionary size.
    102         final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
    103         final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
    104         mBuffer = new byte[bufferSize];
    105 
    106         MakedictLog.i("Writing file...");
    107 
    108         for (PtNodeArray nodeArray : flatNodes) {
    109             BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray);
    110         }
    111         if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
    112         mOutStream.write(mBuffer, 0, mPosition);
    113 
    114         MakedictLog.i("Done");
    115         close();
    116     }
    117 
    118     @Override
    119     public void setPosition(final int position) {
    120         if (mBuffer == null || position < 0 || position >= mBuffer.length) return;
    121         mPosition = position;
    122     }
    123 
    124     @Override
    125     public int getPosition() {
    126         return mPosition;
    127     }
    128 
    129     @Override
    130     public void writePtNodeCount(final int ptNodeCount) {
    131         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
    132         if (countSize != 1 && countSize != 2) {
    133             throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
    134         }
    135         final int encodedPtNodeCount = (countSize == 2) ?
    136                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
    137         mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount,
    138                 countSize);
    139     }
    140 
    141     private void writePtNodeFlags(final PtNode ptNode) {
    142         final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode);
    143         mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition,
    144                 BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos),
    145                 FormatSpec.PTNODE_FLAGS_SIZE);
    146     }
    147 
    148     private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) {
    149         mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition);
    150         if (hasSeveralChars) {
    151             mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
    152         }
    153     }
    154 
    155     private void writeFrequency(final int frequency) {
    156         if (frequency >= 0) {
    157             mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency,
    158                     FormatSpec.PTNODE_FREQUENCY_SIZE);
    159         }
    160     }
    161 
    162     private void writeChildrenPosition(final PtNode ptNode) {
    163         final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode);
    164         mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
    165                 childrenPos);
    166     }
    167 
    168     /**
    169      * Write a shortcut attributes list to mBuffer.
    170      *
    171      * @param shortcuts the shortcut attributes list.
    172      */
    173     private void writeShortcuts(final ArrayList<WeightedString> shortcuts) {
    174         if (null == shortcuts || shortcuts.isEmpty()) return;
    175 
    176         final int indexOfShortcutByteSize = mPosition;
    177         mPosition += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
    178         final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
    179         while (shortcutIterator.hasNext()) {
    180             final WeightedString target = shortcutIterator.next();
    181             final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
    182                     shortcutIterator.hasNext(),
    183                     target.getProbability());
    184             mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags,
    185                     FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
    186             final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord);
    187             mPosition += shortcutShift;
    188         }
    189         final int shortcutByteSize = mPosition - indexOfShortcutByteSize;
    190         if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
    191             throw new RuntimeException("Shortcut list too large");
    192         }
    193         BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize,
    194                 FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
    195     }
    196 
    197     /**
    198      * Write a bigram attributes list to mBuffer.
    199      *
    200      * @param bigrams the bigram attributes list.
    201      * @param dict the dictionary the node array is a part of (for relative offsets).
    202      */
    203     private void writeBigrams(final ArrayList<WeightedString> bigrams,
    204             final FusionDictionary dict) {
    205         if (bigrams == null) return;
    206 
    207         final Iterator<WeightedString> bigramIterator = bigrams.iterator();
    208         while (bigramIterator.hasNext()) {
    209             final WeightedString bigram = bigramIterator.next();
    210             final PtNode target =
    211                     FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
    212             final int addressOfBigram = target.mCachedAddressAfterUpdate;
    213             final int unigramFrequencyForThisWord = target.getProbability();
    214             final int offset = addressOfBigram
    215                     - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
    216             final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
    217                     offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord);
    218             mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
    219                     FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
    220             mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
    221                     Math.abs(offset));
    222         }
    223     }
    224 
    225     @Override
    226     public void writeForwardLinkAddress(final int forwardLinkAddress) {
    227         mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, forwardLinkAddress,
    228                 FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    229     }
    230 
    231     @Override
    232     public void writePtNode(final PtNode ptNode, final FusionDictionary dict) {
    233         writePtNodeFlags(ptNode);
    234         writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
    235         writeFrequency(ptNode.getProbability());
    236         writeChildrenPosition(ptNode);
    237         writeShortcuts(ptNode.mShortcutTargets);
    238         writeBigrams(ptNode.mBigrams, dict);
    239     }
    240 }
    241