Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.BinaryDictionary;
     21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     22 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
     23 
     24 import java.io.File;
     25 import java.io.FileNotFoundException;
     26 import java.io.IOException;
     27 import java.util.ArrayList;
     28 import java.util.Arrays;
     29 
     30 /**
     31  * An implementation of DictDecoder for version 2 binary dictionary.
     32  */
     33 // TODO: Separate logics that are used only for testing.
     34 @UsedForTesting
     35 public class Ver2DictDecoder extends AbstractDictDecoder {
     36     /**
     37      * A utility class for reading a PtNode.
     38      */
     39     protected static class PtNodeReader {
     40         private static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) {
     41             // Ver2 dicts don't contain historical information.
     42             return new ProbabilityInfo(dictBuffer.readUnsignedByte());
     43         }
     44 
     45         protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) {
     46             return dictBuffer.readUnsignedByte();
     47         }
     48 
     49         protected static int readChildrenAddress(final DictBuffer dictBuffer,
     50                 final int ptNodeFlags) {
     51             switch (ptNodeFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
     52                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
     53                     return dictBuffer.readUnsignedByte();
     54                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
     55                     return dictBuffer.readUnsignedShort();
     56                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
     57                     return dictBuffer.readUnsignedInt24();
     58                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
     59                 default:
     60                     return FormatSpec.NO_CHILDREN_ADDRESS;
     61             }
     62         }
     63 
     64         // Reads shortcuts and returns the read length.
     65         protected static int readShortcut(final DictBuffer dictBuffer,
     66                 final ArrayList<WeightedString> shortcutTargets) {
     67             final int pointerBefore = dictBuffer.position();
     68             dictBuffer.readUnsignedShort(); // skip the size
     69             while (true) {
     70                 final int targetFlags = dictBuffer.readUnsignedByte();
     71                 final String word = CharEncoding.readString(dictBuffer);
     72                 shortcutTargets.add(new WeightedString(word,
     73                         targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
     74                 if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
     75             }
     76             return dictBuffer.position() - pointerBefore;
     77         }
     78 
     79         protected static int readBigramAddresses(final DictBuffer dictBuffer,
     80                 final ArrayList<PendingAttribute> bigrams, final int baseAddress) {
     81             int readLength = 0;
     82             int bigramCount = 0;
     83             while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
     84                 final int bigramFlags = dictBuffer.readUnsignedByte();
     85                 ++readLength;
     86                 final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE)
     87                         ? 1 : -1;
     88                 int bigramAddress = baseAddress + readLength;
     89                 switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
     90                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
     91                         bigramAddress += sign * dictBuffer.readUnsignedByte();
     92                         readLength += 1;
     93                         break;
     94                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
     95                         bigramAddress += sign * dictBuffer.readUnsignedShort();
     96                         readLength += 2;
     97                         break;
     98                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
     99                         bigramAddress += sign * dictBuffer.readUnsignedInt24();
    100                         readLength += 3;
    101                         break;
    102                     default:
    103                         throw new RuntimeException("Has bigrams with no address");
    104                 }
    105                 bigrams.add(new PendingAttribute(
    106                         bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
    107                         bigramAddress));
    108                 if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
    109             }
    110             return readLength;
    111         }
    112     }
    113 
    114     protected final File mDictionaryBinaryFile;
    115     protected final long mOffset;
    116     protected final long mLength;
    117     // TODO: Remove mBufferFactory and mDictBuffer from this class members because they are now
    118     // used only for testing.
    119     private final DictionaryBufferFactory mBufferFactory;
    120     protected DictBuffer mDictBuffer;
    121 
    122     @UsedForTesting
    123     /* package */ Ver2DictDecoder(final File file, final long offset, final long length,
    124             final int factoryFlag) {
    125         mDictionaryBinaryFile = file;
    126         mOffset = offset;
    127         mLength = length;
    128         mDictBuffer = null;
    129         if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
    130             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
    131         } else if ((factoryFlag  & MASK_DICTBUFFER) == USE_BYTEARRAY) {
    132             mBufferFactory = new DictionaryBufferFromByteArrayFactory();
    133         } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
    134             mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
    135         } else {
    136             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
    137         }
    138     }
    139 
    140     /* package */ Ver2DictDecoder(final File file, final long offset, final long length,
    141             final DictionaryBufferFactory factory) {
    142         mDictionaryBinaryFile = file;
    143         mOffset = offset;
    144         mLength = length;
    145         mBufferFactory = factory;
    146     }
    147 
    148     @Override
    149     public void openDictBuffer() throws FileNotFoundException, IOException {
    150         mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile);
    151     }
    152 
    153     @Override
    154     public boolean isDictBufferOpen() {
    155         return mDictBuffer != null;
    156     }
    157 
    158     /* package */ DictBuffer getDictBuffer() {
    159         return mDictBuffer;
    160     }
    161 
    162     @UsedForTesting
    163     /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException {
    164         openDictBuffer();
    165         return getDictBuffer();
    166     }
    167 
    168     @Override
    169     public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException {
    170         // dictType is not being used in dicttool. Passing an empty string.
    171         final BinaryDictionary binaryDictionary = new BinaryDictionary(
    172                 mDictionaryBinaryFile.getAbsolutePath(), mOffset, mLength,
    173                 true /* useFullEditDistance */, null /* locale */, "" /* dictType */,
    174                 false /* isUpdatable */);
    175         final DictionaryHeader header = binaryDictionary.getHeader();
    176         binaryDictionary.close();
    177         if (header == null) {
    178             throw new IOException("Cannot read the dictionary header.");
    179         }
    180         if (header.mFormatOptions.mVersion != FormatSpec.VERSION2) {
    181             throw new UnsupportedFormatException("File header has a wrong version : "
    182                     + header.mFormatOptions.mVersion);
    183         }
    184         if (!isDictBufferOpen()) {
    185             openDictBuffer();
    186         }
    187         // Advance buffer reading position to the head of dictionary body.
    188         setPosition(header.mBodyOffset);
    189         return header;
    190     }
    191 
    192     // TODO: Make this buffer multi thread safe.
    193     private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
    194     @Override
    195     public PtNodeInfo readPtNode(final int ptNodePos) {
    196         int addressPointer = ptNodePos;
    197         final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    198         addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;
    199         final int characters[];
    200         if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
    201             int index = 0;
    202             int character = CharEncoding.readChar(mDictBuffer);
    203             addressPointer += CharEncoding.getCharSize(character);
    204             while (FormatSpec.INVALID_CHARACTER != character) {
    205                 // FusionDictionary is making sure that the length of the word is smaller than
    206                 // MAX_WORD_LENGTH.
    207                 // So we'll never write past the end of mCharacterBuffer.
    208                 mCharacterBuffer[index++] = character;
    209                 character = CharEncoding.readChar(mDictBuffer);
    210                 addressPointer += CharEncoding.getCharSize(character);
    211             }
    212             characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
    213         } else {
    214             final int character = CharEncoding.readChar(mDictBuffer);
    215             addressPointer += CharEncoding.getCharSize(character);
    216             characters = new int[] { character };
    217         }
    218         final ProbabilityInfo probabilityInfo;
    219         if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
    220             probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer);
    221             addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE;
    222         } else {
    223             probabilityInfo = null;
    224         }
    225         int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags);
    226         if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
    227             childrenAddress += addressPointer;
    228         }
    229         addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags);
    230         final ArrayList<WeightedString> shortcutTargets;
    231         if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
    232             // readShortcut will add shortcuts to shortcutTargets.
    233             shortcutTargets = new ArrayList<>();
    234             addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets);
    235         } else {
    236             shortcutTargets = null;
    237         }
    238 
    239         final ArrayList<PendingAttribute> bigrams;
    240         if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
    241             bigrams = new ArrayList<>();
    242             addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams,
    243                     addressPointer);
    244             if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
    245                 throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
    246                         + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
    247             }
    248         } else {
    249             bigrams = null;
    250         }
    251         return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo,
    252                 childrenAddress, shortcutTargets, bigrams);
    253     }
    254 
    255     @Override
    256     public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken)
    257             throws FileNotFoundException, IOException, UnsupportedFormatException {
    258         // dictType is not being used in dicttool. Passing an empty string.
    259         final BinaryDictionary binaryDictionary = new BinaryDictionary(
    260                 mDictionaryBinaryFile.getAbsolutePath(), 0 /* offset */,
    261                 mDictionaryBinaryFile.length() /* length */, true /* useFullEditDistance */,
    262                 null /* locale */, "" /* dictType */, false /* isUpdatable */);
    263         final DictionaryHeader header = readHeader();
    264         final FusionDictionary fusionDict =
    265                 new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions);
    266         int token = 0;
    267         final ArrayList<WordProperty> wordProperties = new ArrayList<>();
    268         do {
    269             final BinaryDictionary.GetNextWordPropertyResult result =
    270                     binaryDictionary.getNextWordProperty(token);
    271             final WordProperty wordProperty = result.mWordProperty;
    272             if (wordProperty == null) {
    273                 binaryDictionary.close();
    274                 if (deleteDictIfBroken) {
    275                     mDictionaryBinaryFile.delete();
    276                 }
    277                 return null;
    278             }
    279             wordProperties.add(wordProperty);
    280             token = result.mNextToken;
    281         } while (token != 0);
    282 
    283         // Insert unigrams into the fusion dictionary.
    284         for (final WordProperty wordProperty : wordProperties) {
    285             if (wordProperty.mIsBlacklistEntry) {
    286                 fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
    287                         wordProperty.mIsNotAWord);
    288             } else {
    289                 fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
    290                         wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
    291             }
    292         }
    293         // Insert bigrams into the fusion dictionary.
    294         for (final WordProperty wordProperty : wordProperties) {
    295             if (wordProperty.mBigrams == null) {
    296                 continue;
    297             }
    298             final String word0 = wordProperty.mWord;
    299             for (final WeightedString bigram : wordProperty.mBigrams) {
    300                 fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo);
    301             }
    302         }
    303         binaryDictionary.close();
    304         return fusionDict;
    305     }
    306 
    307     @Override
    308     public void setPosition(int newPos) {
    309         mDictBuffer.position(newPos);
    310     }
    311 
    312     @Override
    313     public int getPosition() {
    314         return mDictBuffer.position();
    315     }
    316 
    317     @Override
    318     public int readPtNodeCount() {
    319         return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
    320     }
    321 }
    322