Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
     22 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
     23 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
     25 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     26 import com.android.inputmethod.latin.utils.CollectionUtils;
     27 
     28 import android.util.Log;
     29 
     30 import java.io.File;
     31 import java.io.FileNotFoundException;
     32 import java.io.IOException;
     33 import java.util.ArrayList;
     34 import java.util.Arrays;
     35 
     36 /**
     37  * An implementation of binary dictionary decoder for version 4 binary dictionary.
     38  */
     39 @UsedForTesting
     40 public class Ver4DictDecoder extends AbstractDictDecoder {
     41     private static final String TAG = Ver4DictDecoder.class.getSimpleName();
     42 
     43     private static final int FILETYPE_TRIE = 1;
     44     private static final int FILETYPE_FREQUENCY = 2;
     45     private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
     46     private static final int FILETYPE_BIGRAM_FREQ = 4;
     47     private static final int FILETYPE_SHORTCUT = 5;
     48 
     49     private final File mDictDirectory;
     50     private final DictionaryBufferFactory mBufferFactory;
     51     protected DictBuffer mDictBuffer;
     52     private DictBuffer mFrequencyBuffer;
     53     private DictBuffer mTerminalAddressTableBuffer;
     54     private DictBuffer mBigramBuffer;
     55     private DictBuffer mShortcutBuffer;
     56     private SparseTable mBigramAddressTable;
     57     private SparseTable mShortcutAddressTable;
     58 
     59     @UsedForTesting
     60     /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
     61         mDictDirectory = dictDirectory;
     62         mDictBuffer = mFrequencyBuffer = null;
     63 
     64         if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
     65             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
     66         } else if ((factoryFlag  & MASK_DICTBUFFER) == USE_BYTEARRAY) {
     67             mBufferFactory = new DictionaryBufferFromByteArrayFactory();
     68         } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
     69             mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
     70         } else {
     71             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
     72         }
     73     }
     74 
     /**
      * Creates a decoder for the dictionary stored in {@code dictDirectory} that obtains its
      * buffers from the supplied factory.
      *
      * @param dictDirectory the directory holding the dictionary files.
      * @param factory the factory used to create a buffer for each dictionary file.
      */
     @UsedForTesting
     /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) {
         mBufferFactory = factory;
         mDictDirectory = dictDirectory;
         // The trie and frequency buffers start out null.
         mFrequencyBuffer = null;
         mDictBuffer = null;
     }
     81 
     82     private File getFile(final int fileType) {
     83         if (fileType == FILETYPE_TRIE) {
     84             return new File(mDictDirectory,
     85                     mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION);
     86         } else if (fileType == FILETYPE_FREQUENCY) {
     87             return new File(mDictDirectory,
     88                     mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION);
     89         } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
     90             return new File(mDictDirectory,
     91                     mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
     92         } else if (fileType == FILETYPE_BIGRAM_FREQ) {
     93             return new File(mDictDirectory,
     94                     mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
     95                             + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
     96         } else if (fileType == FILETYPE_SHORTCUT) {
     97             return new File(mDictDirectory,
     98                     mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION
     99                             + FormatSpec.SHORTCUT_CONTENT_ID);
    100         } else {
    101             throw new RuntimeException("Unsupported kind of file : " + fileType);
    102         }
    103     }
    104 
    105     @Override
    106     public void openDictBuffer() throws FileNotFoundException, IOException {
    107         mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE));
    108         mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
    109         mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
    110                 getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
    111         mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
    112         loadBigramAddressSparseTable();
    113         mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT));
    114         loadShortcutAddressSparseTable();
    115     }
    116 
    117     @Override
    118     public boolean isDictBufferOpen() {
    119         return mDictBuffer != null;
    120     }
    121 
    122     /* package */ DictBuffer getDictBuffer() {
    123         return mDictBuffer;
    124     }
    125 
    126     @Override
    127     public FileHeader readHeader() throws IOException, UnsupportedFormatException {
    128         if (mDictBuffer == null) {
    129             openDictBuffer();
    130         }
    131         final FileHeader header = super.readHeader(mDictBuffer);
    132         final int version = header.mFormatOptions.mVersion;
    133         if (version != 4) {
    134             throw new UnsupportedFormatException("File header has a wrong version : " + version);
    135         }
    136         return header;
    137     }
    138 
    139     private void loadBigramAddressSparseTable() throws IOException {
    140         final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
    141                 + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
    142         final File freqsFile = new File(mDictDirectory, mDictDirectory.getName()
    143                 + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
    144                 + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
    145         mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile },
    146                 FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
    147     }
    148 
    149     // TODO: Let's have something like SparseTableContentsReader in this class.
    150     private void loadShortcutAddressSparseTable() throws IOException {
    151         final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
    152                 + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
    153         final File contentFile = new File(mDictDirectory, mDictDirectory.getName()
    154                 + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
    155                 + FormatSpec.SHORTCUT_CONTENT_ID);
    156         mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile,
    157                 new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
    158     }
    159 
    160     protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
    161         protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
    162             frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
    163             return frequencyBuffer.readUnsignedByte();
    164         }
    165 
    166         protected static int readTerminalId(final DictBuffer dictBuffer) {
    167             return dictBuffer.readInt();
    168         }
    169     }
    170 
    171     private ArrayList<WeightedString> readShortcuts(final int terminalId) {
    172         if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null;
    173 
    174         final ArrayList<WeightedString> ret = CollectionUtils.newArrayList();
    175         final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX,
    176                 terminalId);
    177         mShortcutBuffer.position(posOfShortcuts);
    178         while (true) {
    179             final int flags = mShortcutBuffer.readUnsignedByte();
    180             final String word = CharEncoding.readString(mShortcutBuffer);
    181             ret.add(new WeightedString(word,
    182                     flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
    183             if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
    184         }
    185         return ret;
    186     }
    187 
    188     // TODO: Make this buffer thread safe.
    189     // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
    190     private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
    191     @Override
    192     public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) {
    193         int addressPointer = ptNodePos;
    194         final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    195         addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;
    196 
    197         final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options);
    198         if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
    199             addressPointer += FormatSpec.PARENT_ADDRESS_SIZE;
    200         }
    201 
    202         final int characters[];
    203         if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
    204             int index = 0;
    205             int character = CharEncoding.readChar(mDictBuffer);
    206             addressPointer += CharEncoding.getCharSize(character);
    207             while (FormatSpec.INVALID_CHARACTER != character
    208                     && index < FormatSpec.MAX_WORD_LENGTH) {
    209                 mCharacterBuffer[index++] = character;
    210                 character = CharEncoding.readChar(mDictBuffer);
    211                 addressPointer += CharEncoding.getCharSize(character);
    212             }
    213             characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
    214         } else {
    215             final int character = CharEncoding.readChar(mDictBuffer);
    216             addressPointer += CharEncoding.getCharSize(character);
    217             characters = new int[] { character };
    218         }
    219         final int terminalId;
    220         if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
    221             terminalId = PtNodeReader.readTerminalId(mDictBuffer);
    222             addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
    223         } else {
    224             terminalId = PtNode.NOT_A_TERMINAL;
    225         }
    226 
    227         final int frequency;
    228         if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
    229             frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId);
    230         } else {
    231             frequency = PtNode.NOT_A_TERMINAL;
    232         }
    233         int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
    234         if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
    235             childrenAddress += addressPointer;
    236         }
    237         addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
    238         final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId);
    239 
    240         final ArrayList<PendingAttribute> bigrams;
    241         if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
    242             bigrams = new ArrayList<PendingAttribute>();
    243             final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId);
    244             mBigramBuffer.position(posOfBigrams);
    245             while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
    246                 // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
    247                 // remaining bigram entries are ignored.
    248                 final int bigramFlags = mBigramBuffer.readUnsignedByte();
    249                 final int targetTerminalId = mBigramBuffer.readUnsignedInt24();
    250                 mTerminalAddressTableBuffer.position(
    251                         targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
    252                 final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24();
    253                 bigrams.add(new PendingAttribute(
    254                         bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
    255                         targetAddress));
    256                 if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
    257             }
    258             if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
    259                 throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
    260                         + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
    261             }
    262         } else {
    263             bigrams = null;
    264         }
    265         return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency,
    266                 parentAddress, childrenAddress, shortcutTargets, bigrams);
    267     }
    268 
    269     private void deleteDictFiles() {
    270         final File[] files = mDictDirectory.listFiles();
    271         for (int i = 0; i < files.length; ++i) {
    272             files[i].delete();
    273         }
    274     }
    275 
    276     @Override
    277     public FusionDictionary readDictionaryBinary(final FusionDictionary dict,
    278             final boolean deleteDictIfBroken)
    279             throws FileNotFoundException, IOException, UnsupportedFormatException {
    280         if (mDictBuffer == null) {
    281             openDictBuffer();
    282         }
    283         try {
    284             return BinaryDictDecoderUtils.readDictionaryBinary(this, dict);
    285         } catch (IOException e) {
    286             Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e);
    287             if (deleteDictIfBroken) {
    288                 deleteDictFiles();
    289             }
    290             throw e;
    291         } catch (UnsupportedFormatException e) {
    292             Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e);
    293             if (deleteDictIfBroken) {
    294                 deleteDictFiles();
    295             }
    296             throw e;
    297         }
    298     }
    299 
    300     @Override
    301     public void setPosition(int newPos) {
    302         mDictBuffer.position(newPos);
    303     }
    304 
    305     @Override
    306     public int getPosition() {
    307         return mDictBuffer.position();
    308     }
    309 
    310     @Override
    311     public int readPtNodeCount() {
    312         return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
    313     }
    314 
    315     @Override
    316     public boolean readAndFollowForwardLink() {
    317         final int nextAddress = mDictBuffer.readUnsignedInt24();
    318         if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) {
    319             mDictBuffer.position(nextAddress);
    320             return true;
    321         }
    322         return false;
    323     }
    324 
    325     @Override
    326     public boolean hasNextPtNodeArray() {
    327         return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS;
    328     }
    329 
    330     @Override
    331     public void skipPtNode(final FormatOptions formatOptions) {
    332         final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    333         PtNodeReader.readParentAddress(mDictBuffer, formatOptions);
    334         BinaryDictIOUtils.skipString(mDictBuffer,
    335                 (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
    336         if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer);
    337         PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions);
    338     }
    339 }
    340