Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
     22 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
     23 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
     25 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     26 import com.android.inputmethod.latin.utils.CollectionUtils;
     27 
     28 import android.util.Log;
     29 
     30 import java.io.File;
     31 import java.io.FileNotFoundException;
     32 import java.io.IOException;
     33 import java.util.ArrayList;
     34 import java.util.Arrays;
     35 
     36 /**
 * An implementation of a binary dictionary decoder for version 4 binary dictionaries.
     38  */
     39 @UsedForTesting
     40 public class Ver4DictDecoder extends AbstractDictDecoder {
    // Tag passed to android.util.Log when reporting a broken dictionary.
    private static final String TAG = Ver4DictDecoder.class.getSimpleName();

    // Selectors understood by getFile(); each one maps to one file in the dictionary directory.
    private static final int FILETYPE_TRIE = 1;
    private static final int FILETYPE_FREQUENCY = 2;
    private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
    private static final int FILETYPE_BIGRAM_FREQ = 4;
    private static final int FILETYPE_SHORTCUT = 5;

    // Directory containing the dictionary files. The directory's own name is used as the
    // common prefix of every file name built by this class.
    private final File mDictDirectory;
    // Factory used by openDictBuffer() to turn each file into a DictBuffer.
    private final DictionaryBufferFactory mBufferFactory;
    // Buffer over the trie file. Stays null until openDictBuffer() has run, which is what
    // isDictBufferOpen() tests.
    protected DictBuffer mDictBuffer;
    // Buffer over the frequency file; indexed by terminal id in PtNodeReader.readFrequency().
    private DictBuffer mFrequencyBuffer;
    // Buffer over the terminal address table file; readPtNode() indexes it by terminal id to
    // resolve bigram targets to addresses.
    private DictBuffer mTerminalAddressTableBuffer;
    // Buffer over the bigram content file; positioned via mBigramAddressTable.
    private DictBuffer mBigramBuffer;
    // Buffer over the shortcut content file; positioned via mShortcutAddressTable.
    private DictBuffer mShortcutBuffer;
    // Sparse lookup tables queried by terminal id to find positions in the bigram and
    // shortcut buffers. Loaded by openDictBuffer().
    private SparseTable mBigramAddressTable;
    private SparseTable mShortcutAddressTable;
     58 
     59     @UsedForTesting
     60     /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
     61         mDictDirectory = dictDirectory;
     62         mDictBuffer = mFrequencyBuffer = null;
     63 
     64         if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
     65             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
     66         } else if ((factoryFlag  & MASK_DICTBUFFER) == USE_BYTEARRAY) {
     67             mBufferFactory = new DictionaryBufferFromByteArrayFactory();
     68         } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
     69             mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
     70         } else {
     71             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
     72         }
     73     }
     74 
     75     @UsedForTesting
     76     /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) {
     77         mDictDirectory = dictDirectory;
     78         mBufferFactory = factory;
     79         mDictBuffer = mFrequencyBuffer = null;
     80     }
     81 
     82     private File getFile(final int fileType) {
     83         if (fileType == FILETYPE_TRIE) {
     84             return new File(mDictDirectory,
     85                     mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION);
     86         } else if (fileType == FILETYPE_FREQUENCY) {
     87             return new File(mDictDirectory,
     88                     mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION);
     89         } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
     90             return new File(mDictDirectory,
     91                     mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
     92         } else if (fileType == FILETYPE_BIGRAM_FREQ) {
     93             return new File(mDictDirectory,
     94                     mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
     95                             + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
     96         } else if (fileType == FILETYPE_SHORTCUT) {
     97             return new File(mDictDirectory,
     98                     mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION
     99                             + FormatSpec.SHORTCUT_CONTENT_ID);
    100         } else {
    101             throw new RuntimeException("Unsupported kind of file : " + fileType);
    102         }
    103     }
    104 
    105     @Override
    106     public void openDictBuffer() throws FileNotFoundException, IOException {
    107         mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE));
    108         mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
    109         mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
    110                 getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
    111         mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
    112         loadBigramAddressSparseTable();
    113         mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT));
    114         loadShortcutAddressSparseTable();
    115     }
    116 
    117     @Override
    118     public boolean isDictBufferOpen() {
    119         return mDictBuffer != null;
    120     }
    121 
    122     /* package */ DictBuffer getDictBuffer() {
    123         return mDictBuffer;
    124     }
    125 
    126     @Override
    127     public FileHeader readHeader() throws IOException, UnsupportedFormatException {
    128         if (mDictBuffer == null) {
    129             openDictBuffer();
    130         }
    131         final FileHeader header = super.readHeader(mDictBuffer);
    132         final int version = header.mFormatOptions.mVersion;
    133         if (version != 4) {
    134             throw new UnsupportedFormatException("File header has a wrong version : " + version);
    135         }
    136         return header;
    137     }
    138 
    139     private void loadBigramAddressSparseTable() throws IOException {
    140         final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
    141                 + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
    142         final File freqsFile = new File(mDictDirectory, mDictDirectory.getName()
    143                 + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
    144                 + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
    145         mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile },
    146                 FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
    147     }
    148 
    149     // TODO: Let's have something like SparseTableContentsReader in this class.
    150     private void loadShortcutAddressSparseTable() throws IOException {
    151         final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
    152                 + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
    153         final File contentFile = new File(mDictDirectory, mDictDirectory.getName()
    154                 + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
    155                 + FormatSpec.SHORTCUT_CONTENT_ID);
    156         final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName()
    157                 + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
    158                 + FormatSpec.SHORTCUT_CONTENT_ID);
    159         mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile,
    160                 new File[] { contentFile, timestampsFile },
    161                 FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
    162     }
    163 
    164     protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
    165         protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
    166             frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
    167             return frequencyBuffer.readUnsignedByte();
    168         }
    169 
    170         protected static int readTerminalId(final DictBuffer dictBuffer) {
    171             return dictBuffer.readInt();
    172         }
    173     }
    174 
    175     private ArrayList<WeightedString> readShortcuts(final int terminalId) {
    176         if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null;
    177 
    178         final ArrayList<WeightedString> ret = CollectionUtils.newArrayList();
    179         final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX,
    180                 terminalId);
    181         mShortcutBuffer.position(posOfShortcuts);
    182         while (true) {
    183             final int flags = mShortcutBuffer.readUnsignedByte();
    184             final String word = CharEncoding.readString(mShortcutBuffer);
    185             ret.add(new WeightedString(word,
    186                     flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
    187             if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
    188         }
    189         return ret;
    190     }
    191 
    // TODO: Make this buffer thread safe.
    // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
    // Scratch array reused by every readPtNode() call to collect the code points of a
    // multi-character node before they are copied into a right-sized array.
    private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
    /**
     * Reads one PtNode from the trie buffer and gathers its frequency, shortcuts and bigrams
     * from the side buffers.
     *
     * This method never repositions the trie buffer itself: it reads from the buffer's current
     * position and uses {@code ptNodePos} only for address bookkeeping.
     * NOTE(review): presumably callers position the buffer at {@code ptNodePos} first - confirm.
     *
     * @param ptNodePos the address of the PtNode; stored in the result and used as the base for
     *        computing the end address.
     * @param options the format options, passed to the parent and children address readers.
     * @return the decoded node. Its bigram list is null when the node has no bigram flag.
     * @throws RuntimeException if the number of bigrams read reaches
     *         FormatSpec.MAX_BIGRAMS_IN_A_PTNODE.
     */
    @Override
    public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) {
        // addressPointer tracks the address just past the fields consumed so far.
        int addressPointer = ptNodePos;
        final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
        addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;

        // The parent address is always requested from the reader, but its size is only
        // accounted for when the format supports dynamic update.
        final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options);
        if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
            addressPointer += FormatSpec.PARENT_ADDRESS_SIZE;
        }

        final int characters[];
        if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
            // Multi-character node: read code points until INVALID_CHARACTER is met or the
            // scratch buffer is full. The size of the last character read is added to
            // addressPointer as well, even though that character is not stored.
            int index = 0;
            int character = CharEncoding.readChar(mDictBuffer);
            addressPointer += CharEncoding.getCharSize(character);
            while (FormatSpec.INVALID_CHARACTER != character
                    && index < FormatSpec.MAX_WORD_LENGTH) {
                mCharacterBuffer[index++] = character;
                character = CharEncoding.readChar(mDictBuffer);
                addressPointer += CharEncoding.getCharSize(character);
            }
            characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
        } else {
            // Single-character node.
            final int character = CharEncoding.readChar(mDictBuffer);
            addressPointer += CharEncoding.getCharSize(character);
            characters = new int[] { character };
        }
        // Only terminal nodes store a terminal id in the trie.
        final int terminalId;
        if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
            terminalId = PtNodeReader.readTerminalId(mDictBuffer);
            addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
        } else {
            terminalId = PtNode.NOT_A_TERMINAL;
        }

        // The frequency lives in the separate frequency buffer, indexed by terminal id.
        final int frequency;
        if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
            frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId);
        } else {
            frequency = PtNode.NOT_A_TERMINAL;
        }
        // The value read is offset by the address of the children address field itself
        // (addressPointer has not yet been advanced past that field) unless it is the
        // NO_CHILDREN_ADDRESS marker.
        int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
        if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
            childrenAddress += addressPointer;
        }
        addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
        // Shortcuts are looked up for every node, including non-terminals, for which
        // terminalId is PtNode.NOT_A_TERMINAL.
        final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId);

        final ArrayList<PendingAttribute> bigrams;
        if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
            bigrams = new ArrayList<PendingAttribute>();
            final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId);
            mBigramBuffer.position(posOfBigrams);
            while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
                // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
                // remaining bigram entries are ignored.
                final int bigramFlags = mBigramBuffer.readUnsignedByte();
                final int targetTerminalId = mBigramBuffer.readUnsignedInt24();
                // Resolve the target terminal id to an address through the terminal address
                // table.
                mTerminalAddressTableBuffer.position(
                        targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
                final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24();
                bigrams.add(new PendingAttribute(
                        bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
                        targetAddress));
                if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
            }
            // NOTE(review): this throws as soon as the list holds MAX_BIGRAMS_IN_A_PTNODE
            // entries, including when exactly that many were stored and the last one had no
            // HAS_NEXT flag. That contradicts the "remaining bigram entries are ignored"
            // comment in the loop above - confirm which behavior is intended.
            if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
                throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
                        + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
            }
        } else {
            bigrams = null;
        }
        return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency,
                parentAddress, childrenAddress, shortcutTargets, bigrams);
    }
    272 
    273     private void deleteDictFiles() {
    274         final File[] files = mDictDirectory.listFiles();
    275         for (int i = 0; i < files.length; ++i) {
    276             files[i].delete();
    277         }
    278     }
    279 
    280     @Override
    281     public FusionDictionary readDictionaryBinary(final FusionDictionary dict,
    282             final boolean deleteDictIfBroken)
    283             throws FileNotFoundException, IOException, UnsupportedFormatException {
    284         if (mDictBuffer == null) {
    285             openDictBuffer();
    286         }
    287         try {
    288             return BinaryDictDecoderUtils.readDictionaryBinary(this, dict);
    289         } catch (IOException e) {
    290             Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e);
    291             if (deleteDictIfBroken) {
    292                 deleteDictFiles();
    293             }
    294             throw e;
    295         } catch (UnsupportedFormatException e) {
    296             Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e);
    297             if (deleteDictIfBroken) {
    298                 deleteDictFiles();
    299             }
    300             throw e;
    301         }
    302     }
    303 
    304     @Override
    305     public void setPosition(int newPos) {
    306         mDictBuffer.position(newPos);
    307     }
    308 
    309     @Override
    310     public int getPosition() {
    311         return mDictBuffer.position();
    312     }
    313 
    314     @Override
    315     public int readPtNodeCount() {
    316         return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
    317     }
    318 
    319     @Override
    320     public boolean readAndFollowForwardLink() {
    321         final int nextAddress = mDictBuffer.readUnsignedInt24();
    322         if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) {
    323             mDictBuffer.position(nextAddress);
    324             return true;
    325         }
    326         return false;
    327     }
    328 
    329     @Override
    330     public boolean hasNextPtNodeArray() {
    331         return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS;
    332     }
    333 
    334     @Override
    335     public void skipPtNode(final FormatOptions formatOptions) {
    336         final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    337         PtNodeReader.readParentAddress(mDictBuffer, formatOptions);
    338         BinaryDictIOUtils.skipString(mDictBuffer,
    339                 (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
    340         if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer);
    341         PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions);
    342     }
    343 }
    344