Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 
     21 import java.io.File;
     22 import java.io.IOException;
     23 import java.io.OutputStream;
     24 import java.nio.ByteBuffer;
     25 
     26 /**
     27  * Decodes binary files for a FusionDictionary.
     28  *
     29  * All the methods in this class are static.
     30  *
     31  * TODO: Move this file to makedict/internal.
     32  * TODO: Rename this class to DictDecoderUtils.
     33  */
     34 public final class BinaryDictDecoderUtils {
     35     private BinaryDictDecoderUtils() {
     36         // This utility class is not publicly instantiable.
     37     }
     38 
     39     @UsedForTesting
     40     public interface DictBuffer {
     41         public int readUnsignedByte();
     42         public int readUnsignedShort();
     43         public int readUnsignedInt24();
     44         public int readInt();
     45         public int position();
     46         public void position(int newPosition);
     47         @UsedForTesting
     48         public void put(final byte b);
     49         public int limit();
     50         @UsedForTesting
     51         public int capacity();
     52     }
     53 
     54     public static final class ByteBufferDictBuffer implements DictBuffer {
     55         private ByteBuffer mBuffer;
     56 
     57         public ByteBufferDictBuffer(final ByteBuffer buffer) {
     58             mBuffer = buffer;
     59         }
     60 
     61         @Override
     62         public int readUnsignedByte() {
     63             return mBuffer.get() & 0xFF;
     64         }
     65 
     66         @Override
     67         public int readUnsignedShort() {
     68             return mBuffer.getShort() & 0xFFFF;
     69         }
     70 
     71         @Override
     72         public int readUnsignedInt24() {
     73             final int retval = readUnsignedByte();
     74             return (retval << 16) + readUnsignedShort();
     75         }
     76 
     77         @Override
     78         public int readInt() {
     79             return mBuffer.getInt();
     80         }
     81 
     82         @Override
     83         public int position() {
     84             return mBuffer.position();
     85         }
     86 
     87         @Override
     88         public void position(int newPos) {
     89             mBuffer.position(newPos);
     90         }
     91 
     92         @Override
     93         public void put(final byte b) {
     94             mBuffer.put(b);
     95         }
     96 
     97         @Override
     98         public int limit() {
     99             return mBuffer.limit();
    100         }
    101 
    102         @Override
    103         public int capacity() {
    104             return mBuffer.capacity();
    105         }
    106     }
    107 
    108     /**
    109      * A class grouping utility function for our specific character encoding.
    110      */
    111     static final class CharEncoding {
    112         private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    113         private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
    114 
    115         /**
    116          * Helper method to find out whether this code fits on one byte
    117          */
    118         private static boolean fitsOnOneByte(final int character) {
    119             return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
    120                     && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
    121         }
    122 
    123         /**
    124          * Compute the size of a character given its character code.
    125          *
    126          * Char format is:
    127          * 1 byte = bbbbbbbb match
    128          * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
    129          * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
    130          *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
    131          *       00011111 would be outside unicode.
    132          * else: iso-latin-1 code
    133          * This allows for the whole unicode range to be encoded, including chars outside of
    134          * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
    135          * characters which should never happen anyway (and still work, but take 3 bytes).
    136          *
    137          * @param character the character code.
    138          * @return the size in binary encoded-form, either 1 or 3 bytes.
    139          */
    140         static int getCharSize(final int character) {
    141             // See char encoding in FusionDictionary.java
    142             if (fitsOnOneByte(character)) return 1;
    143             if (FormatSpec.INVALID_CHARACTER == character) return 1;
    144             return 3;
    145         }
    146 
    147         /**
    148          * Compute the byte size of a character array.
    149          */
    150         static int getCharArraySize(final int[] chars) {
    151             int size = 0;
    152             for (int character : chars) size += getCharSize(character);
    153             return size;
    154         }
    155 
    156         /**
    157          * Writes a char array to a byte buffer.
    158          *
    159          * @param codePoints the code point array to write.
    160          * @param buffer the byte buffer to write to.
    161          * @param index the index in buffer to write the character array to.
    162          * @return the index after the last character.
    163          */
    164         static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
    165             for (int codePoint : codePoints) {
    166                 if (1 == getCharSize(codePoint)) {
    167                     buffer[index++] = (byte)codePoint;
    168                 } else {
    169                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    170                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    171                     buffer[index++] = (byte)(0xFF & codePoint);
    172                 }
    173             }
    174             return index;
    175         }
    176 
    177         /**
    178          * Writes a string with our character format to a byte buffer.
    179          *
    180          * This will also write the terminator byte.
    181          *
    182          * @param buffer the byte buffer to write to.
    183          * @param origin the offset to write from.
    184          * @param word the string to write.
    185          * @return the size written, in bytes.
    186          */
    187         static int writeString(final byte[] buffer, final int origin, final String word) {
    188             final int length = word.length();
    189             int index = origin;
    190             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    191                 final int codePoint = word.codePointAt(i);
    192                 if (1 == getCharSize(codePoint)) {
    193                     buffer[index++] = (byte)codePoint;
    194                 } else {
    195                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    196                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    197                     buffer[index++] = (byte)(0xFF & codePoint);
    198                 }
    199             }
    200             buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
    201             return index - origin;
    202         }
    203 
    204         /**
    205          * Writes a string with our character format to an OutputStream.
    206          *
    207          * This will also write the terminator byte.
    208          *
    209          * @param stream the OutputStream to write to.
    210          * @param word the string to write.
    211          * @return the size written, in bytes.
    212          */
    213         static int writeString(final OutputStream stream, final String word) throws IOException {
    214             final int length = word.length();
    215             int written = 0;
    216             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    217                 final int codePoint = word.codePointAt(i);
    218                 final int charSize = getCharSize(codePoint);
    219                 if (1 == charSize) {
    220                     stream.write((byte) codePoint);
    221                 } else {
    222                     stream.write((byte) (0xFF & (codePoint >> 16)));
    223                     stream.write((byte) (0xFF & (codePoint >> 8)));
    224                     stream.write((byte) (0xFF & codePoint));
    225                 }
    226                 written += charSize;
    227             }
    228             stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
    229             written += FormatSpec.PTNODE_TERMINATOR_SIZE;
    230             return written;
    231         }
    232 
    233         /**
    234          * Reads a string from a DictBuffer. This is the converse of the above method.
    235          */
    236         static String readString(final DictBuffer dictBuffer) {
    237             final StringBuilder s = new StringBuilder();
    238             int character = readChar(dictBuffer);
    239             while (character != FormatSpec.INVALID_CHARACTER) {
    240                 s.appendCodePoint(character);
    241                 character = readChar(dictBuffer);
    242             }
    243             return s.toString();
    244         }
    245 
    246         /**
    247          * Reads a character from the buffer.
    248          *
    249          * This follows the character format documented earlier in this source file.
    250          *
    251          * @param dictBuffer the buffer, positioned over an encoded character.
    252          * @return the character code.
    253          */
    254         static int readChar(final DictBuffer dictBuffer) {
    255             int character = dictBuffer.readUnsignedByte();
    256             if (!fitsOnOneByte(character)) {
    257                 if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
    258                     return FormatSpec.INVALID_CHARACTER;
    259                 }
    260                 character <<= 16;
    261                 character += dictBuffer.readUnsignedShort();
    262             }
    263             return character;
    264         }
    265     }
    266 
    267     /**
    268      * Reads and returns the PtNode count out of a buffer and forwards the pointer.
    269      */
    270     /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
    271         final int msb = dictBuffer.readUnsignedByte();
    272         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
    273             return msb;
    274         } else {
    275             return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
    276                     + dictBuffer.readUnsignedByte();
    277         }
    278     }
    279 
    280     /**
    281      * Finds, as a string, the word at the position passed as an argument.
    282      *
    283      * @param dictDecoder the dict decoder.
    284      * @param headerSize the size of the header.
    285      * @param pos the position to seek.
    286      * @return the word with its frequency, as a weighted string.
    287      */
    288     @UsedForTesting
    289     /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
    290             final int headerSize, final int pos) {
    291         final WeightedString result;
    292         final int originalPos = dictDecoder.getPosition();
    293         dictDecoder.setPosition(pos);
    294         result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
    295         dictDecoder.setPosition(originalPos);
    296         return result;
    297     }
    298 
    299     private static WeightedString getWordAtPositionWithoutParentAddress(
    300             final DictDecoder dictDecoder, final int headerSize, final int pos) {
    301         dictDecoder.setPosition(headerSize);
    302         final int count = dictDecoder.readPtNodeCount();
    303         int groupPos = dictDecoder.getPosition();
    304         final StringBuilder builder = new StringBuilder();
    305         WeightedString result = null;
    306 
    307         PtNodeInfo last = null;
    308         for (int i = count - 1; i >= 0; --i) {
    309             PtNodeInfo info = dictDecoder.readPtNode(groupPos);
    310             groupPos = info.mEndAddress;
    311             if (info.mOriginalAddress == pos) {
    312                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
    313                 result = new WeightedString(builder.toString(), info.mProbabilityInfo);
    314                 break; // and return
    315             }
    316             if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
    317                 if (info.mChildrenAddress > pos) {
    318                     if (null == last) continue;
    319                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
    320                     dictDecoder.setPosition(last.mChildrenAddress);
    321                     i = dictDecoder.readPtNodeCount();
    322                     groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
    323                     last = null;
    324                     continue;
    325                 }
    326                 last = info;
    327             }
    328             if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
    329                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
    330                 dictDecoder.setPosition(last.mChildrenAddress);
    331                 i = dictDecoder.readPtNodeCount();
    332                 groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
    333                 last = null;
    334                 continue;
    335             }
    336         }
    337         return result;
    338     }
    339 
    340     /**
    341      * Helper method to pass a file name instead of a File object to isBinaryDictionary.
    342      */
    343     public static boolean isBinaryDictionary(final String filename) {
    344         final File file = new File(filename);
    345         return isBinaryDictionary(file);
    346     }
    347 
    348     /**
    349      * Basic test to find out whether the file is a binary dictionary or not.
    350      *
    351      * @param file The file to test.
    352      * @return true if it's a binary dictionary, false otherwise
    353      */
    354     public static boolean isBinaryDictionary(final File file) {
    355         final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
    356         if (dictDecoder == null) {
    357             return false;
    358         }
    359         return dictDecoder.hasValidRawBinaryDictionary();
    360     }
    361 }
    362