Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
     22 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
     23 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     25 
     26 import java.io.IOException;
     27 import java.util.ArrayList;
     28 import java.util.HashMap;
     29 import java.util.TreeMap;
     30 
     31 /**
     32  * A base class of the binary dictionary decoder.
     33  */
     34 public abstract class AbstractDictDecoder implements DictDecoder {
     35     protected FileHeader readHeader(final DictBuffer dictBuffer)
     36             throws IOException, UnsupportedFormatException {
     37         if (dictBuffer == null) {
     38             openDictBuffer();
     39         }
     40 
     41         final int version = HeaderReader.readVersion(dictBuffer);
     42         if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
     43                 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) {
     44           throw new UnsupportedFormatException("Unsupported version : " + version);
     45         }
     46         // TODO: Remove this field.
     47         final int optionsFlags = HeaderReader.readOptionFlags(dictBuffer);
     48 
     49         final int headerSize = HeaderReader.readHeaderSize(dictBuffer);
     50 
     51         if (headerSize < 0) {
     52             throw new UnsupportedFormatException("header size can't be negative.");
     53         }
     54 
     55         final HashMap<String, String> attributes = HeaderReader.readAttributes(dictBuffer,
     56                 headerSize);
     57 
     58         final FileHeader header = new FileHeader(headerSize,
     59                 new FusionDictionary.DictionaryOptions(attributes,
     60                         0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
     61                         0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
     62                         new FormatOptions(version,
     63                                 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE),
     64                                 0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
     65         return header;
     66     }
     67 
     68     @Override @UsedForTesting
     69     public int getTerminalPosition(final String word)
     70             throws IOException, UnsupportedFormatException {
     71         if (!isDictBufferOpen()) {
     72             openDictBuffer();
     73         }
     74         return BinaryDictIOUtils.getTerminalPosition(this, word);
     75     }
     76 
     77     @Override @UsedForTesting
     78     public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words,
     79             final TreeMap<Integer, Integer> frequencies,
     80             final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams)
     81             throws IOException, UnsupportedFormatException {
     82         if (!isDictBufferOpen()) {
     83             openDictBuffer();
     84         }
     85         BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams);
     86     }
     87 
     88     /**
     89      * A utility class for reading a file header.
     90      */
     91     protected static class HeaderReader {
     92         protected static int readVersion(final DictBuffer dictBuffer)
     93                 throws IOException, UnsupportedFormatException {
     94             return BinaryDictDecoderUtils.checkFormatVersion(dictBuffer);
     95         }
     96 
     97         protected static int readOptionFlags(final DictBuffer dictBuffer) {
     98             return dictBuffer.readUnsignedShort();
     99         }
    100 
    101         protected static int readHeaderSize(final DictBuffer dictBuffer) {
    102             return dictBuffer.readInt();
    103         }
    104 
    105         protected static HashMap<String, String> readAttributes(final DictBuffer dictBuffer,
    106                 final int headerSize) {
    107             final HashMap<String, String> attributes = new HashMap<String, String>();
    108             while (dictBuffer.position() < headerSize) {
    109                 // We can avoid an infinite loop here since dictBuffer.position() is always
    110                 // increased by calling CharEncoding.readString.
    111                 final String key = CharEncoding.readString(dictBuffer);
    112                 final String value = CharEncoding.readString(dictBuffer);
    113                 attributes.put(key, value);
    114             }
    115             dictBuffer.position(headerSize);
    116             return attributes;
    117         }
    118     }
    119 
    120     /**
    121      * A utility class for reading a PtNode.
    122      */
    123     protected static class PtNodeReader {
    124         protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) {
    125             return dictBuffer.readUnsignedByte();
    126         }
    127 
    128         protected static int readParentAddress(final DictBuffer dictBuffer,
    129                 final FormatOptions formatOptions) {
    130             if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) {
    131                 return BinaryDictDecoderUtils.readSInt24(dictBuffer);
    132             } else {
    133                 return FormatSpec.NO_PARENT_ADDRESS;
    134             }
    135         }
    136 
    137         protected static int readChildrenAddress(final DictBuffer dictBuffer, final int optionFlags,
    138                 final FormatOptions formatOptions) {
    139             if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) {
    140                 final int address = BinaryDictDecoderUtils.readSInt24(dictBuffer);
    141                 if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS;
    142                 return address;
    143             } else {
    144                 switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
    145                     case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
    146                         return dictBuffer.readUnsignedByte();
    147                     case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
    148                         return dictBuffer.readUnsignedShort();
    149                     case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
    150                         return dictBuffer.readUnsignedInt24();
    151                     case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
    152                     default:
    153                         return FormatSpec.NO_CHILDREN_ADDRESS;
    154                 }
    155             }
    156         }
    157 
    158         // Reads shortcuts and returns the read length.
    159         protected static int readShortcut(final DictBuffer dictBuffer,
    160                 final ArrayList<WeightedString> shortcutTargets) {
    161             final int pointerBefore = dictBuffer.position();
    162             dictBuffer.readUnsignedShort(); // skip the size
    163             while (true) {
    164                 final int targetFlags = dictBuffer.readUnsignedByte();
    165                 final String word = CharEncoding.readString(dictBuffer);
    166                 shortcutTargets.add(new WeightedString(word,
    167                         targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
    168                 if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
    169             }
    170             return dictBuffer.position() - pointerBefore;
    171         }
    172 
    173         protected static int readBigramAddresses(final DictBuffer dictBuffer,
    174                 final ArrayList<PendingAttribute> bigrams, final int baseAddress) {
    175             int readLength = 0;
    176             int bigramCount = 0;
    177             while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
    178                 final int bigramFlags = dictBuffer.readUnsignedByte();
    179                 ++readLength;
    180                 final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE)
    181                         ? 1 : -1;
    182                 int bigramAddress = baseAddress + readLength;
    183                 switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
    184                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
    185                         bigramAddress += sign * dictBuffer.readUnsignedByte();
    186                         readLength += 1;
    187                         break;
    188                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
    189                         bigramAddress += sign * dictBuffer.readUnsignedShort();
    190                         readLength += 2;
    191                         break;
    192                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
    193                         bigramAddress += sign * dictBuffer.readUnsignedInt24();
    194                         readLength += 3;
    195                         break;
    196                     default:
    197                         throw new RuntimeException("Has bigrams with no address");
    198                 }
    199                 bigrams.add(new PendingAttribute(
    200                         bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
    201                         bigramAddress));
    202                 if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
    203             }
    204             return readLength;
    205         }
    206     }
    207 }
    208