Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
     21 
     22 import java.io.File;
     23 import java.io.IOException;
     24 import java.io.OutputStream;
     25 import java.nio.ByteBuffer;
     26 import java.util.HashMap;
     27 import java.util.LinkedList;
     28 
     29 import javax.annotation.Nonnull;
     30 
     31 /**
     32  * Decodes binary files for a FusionDictionary.
     33  *
     34  * All the methods in this class are static.
     35  *
     36  * TODO: Move this file to makedict/internal.
     37  * TODO: Rename this class to DictDecoderUtils.
     38  */
     39 public final class BinaryDictDecoderUtils {
     40     private BinaryDictDecoderUtils() {
     41         // This utility class is not publicly instantiable.
     42     }
     43 
     44     @UsedForTesting
     45     public interface DictBuffer {
     46         public int readUnsignedByte();
     47         public int readUnsignedShort();
     48         public int readUnsignedInt24();
     49         public int readInt();
     50         public int position();
     51         public void position(int newPosition);
     52         @UsedForTesting
     53         public void put(final byte b);
     54         public int limit();
     55         @UsedForTesting
     56         public int capacity();
     57     }
     58 
     59     public static final class ByteBufferDictBuffer implements DictBuffer {
     60         private ByteBuffer mBuffer;
     61 
     62         public ByteBufferDictBuffer(final ByteBuffer buffer) {
     63             mBuffer = buffer;
     64         }
     65 
     66         @Override
     67         public int readUnsignedByte() {
     68             return mBuffer.get() & 0xFF;
     69         }
     70 
     71         @Override
     72         public int readUnsignedShort() {
     73             return mBuffer.getShort() & 0xFFFF;
     74         }
     75 
     76         @Override
     77         public int readUnsignedInt24() {
     78             final int retval = readUnsignedByte();
     79             return (retval << 16) + readUnsignedShort();
     80         }
     81 
     82         @Override
     83         public int readInt() {
     84             return mBuffer.getInt();
     85         }
     86 
     87         @Override
     88         public int position() {
     89             return mBuffer.position();
     90         }
     91 
     92         @Override
     93         public void position(int newPos) {
     94             mBuffer.position(newPos);
     95         }
     96 
     97         @Override
     98         public void put(final byte b) {
     99             mBuffer.put(b);
    100         }
    101 
    102         @Override
    103         public int limit() {
    104             return mBuffer.limit();
    105         }
    106 
    107         @Override
    108         public int capacity() {
    109             return mBuffer.capacity();
    110         }
    111     }
    112 
    /**
     * A class grouping utility functions for our specific character encoding.
     */
    116     static final class CharEncoding {
    117 
    118         /**
    119          * Helper method to find out whether this code fits on one byte
    120          */
    121         private static boolean fitsOnOneByte(final int character,
    122                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
    123             int codePoint = character;
    124             if (codePointToOneByteCodeMap != null) {
    125                 if (codePointToOneByteCodeMap.containsKey(character)) {
    126                     codePoint = codePointToOneByteCodeMap.get(character);
    127                 }
    128             }
    129             return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
    130                     && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
    131         }
    132 
    133         /**
    134          * Compute the size of a character given its character code.
    135          *
    136          * Char format is:
    137          * 1 byte = bbbbbbbb match
    138          * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
    139          * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
    140          *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
    141          *       00011111 would be outside unicode.
    142          * else: iso-latin-1 code
    143          * This allows for the whole unicode range to be encoded, including chars outside of
    144          * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
    145          * characters which should never happen anyway (and still work, but take 3 bytes).
    146          *
    147          * @param character the character code.
    148          * @return the size in binary encoded-form, either 1 or 3 bytes.
    149          */
    150         static int getCharSize(final int character,
    151                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
    152             // See char encoding in FusionDictionary.java
    153             if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
    154             if (FormatSpec.INVALID_CHARACTER == character) return 1;
    155             return 3;
    156         }
    157 
    158         /**
    159          * Compute the byte size of a character array.
    160          */
    161         static int getCharArraySize(final int[] chars,
    162                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
    163             int size = 0;
    164             for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
    165             return size;
    166         }
    167 
    168         /**
    169          * Writes a char array to a byte buffer.
    170          *
    171          * @param codePoints the code point array to write.
    172          * @param buffer the byte buffer to write to.
    173          * @param fromIndex the index in buffer to write the character array to.
    174          * @param codePointToOneByteCodeMap the map to convert the code point.
    175          * @return the index after the last character.
    176          */
    177         static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex,
    178                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
    179             int index = fromIndex;
    180             for (int codePoint : codePoints) {
    181                 if (codePointToOneByteCodeMap != null) {
    182                     if (codePointToOneByteCodeMap.containsKey(codePoint)) {
    183                         // Convert code points
    184                         codePoint = codePointToOneByteCodeMap.get(codePoint);
    185                     }
    186                 }
    187                 if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
    188                     buffer[index++] = (byte)codePoint;
    189                 } else {
    190                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    191                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    192                     buffer[index++] = (byte)(0xFF & codePoint);
    193                 }
    194             }
    195             return index;
    196         }
    197 
    198         /**
    199          * Writes a string with our character format to a byte buffer.
    200          *
    201          * This will also write the terminator byte.
    202          *
    203          * @param buffer the byte buffer to write to.
    204          * @param origin the offset to write from.
    205          * @param word the string to write.
    206          * @return the size written, in bytes.
    207          */
    208         static int writeString(final byte[] buffer, final int origin, final String word,
    209                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
    210             final int length = word.length();
    211             int index = origin;
    212             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    213                 int codePoint = word.codePointAt(i);
    214                 if (codePointToOneByteCodeMap != null) {
    215                     if (codePointToOneByteCodeMap.containsKey(codePoint)) {
    216                         // Convert code points
    217                         codePoint = codePointToOneByteCodeMap.get(codePoint);
    218                     }
    219                 }
    220                 if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
    221                     buffer[index++] = (byte)codePoint;
    222                 } else {
    223                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
    224                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
    225                     buffer[index++] = (byte)(0xFF & codePoint);
    226                 }
    227             }
    228             buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
    229             return index - origin;
    230         }
    231 
    232         /**
    233          * Writes a string with our character format to an OutputStream.
    234          *
    235          * This will also write the terminator byte.
    236          *
    237          * @param stream the OutputStream to write to.
    238          * @param word the string to write.
    239          * @return the size written, in bytes.
    240          */
    241         static int writeString(final OutputStream stream, final String word,
    242                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
    243             final int length = word.length();
    244             int written = 0;
    245             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
    246                 final int codePoint = word.codePointAt(i);
    247                 final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
    248                 if (1 == charSize) {
    249                     stream.write((byte) codePoint);
    250                 } else {
    251                     stream.write((byte) (0xFF & (codePoint >> 16)));
    252                     stream.write((byte) (0xFF & (codePoint >> 8)));
    253                     stream.write((byte) (0xFF & codePoint));
    254                 }
    255                 written += charSize;
    256             }
    257             stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
    258             written += FormatSpec.PTNODE_TERMINATOR_SIZE;
    259             return written;
    260         }
    261 
    262         /**
    263          * Reads a string from a DictBuffer. This is the converse of the above method.
    264          */
    265         static String readString(final DictBuffer dictBuffer) {
    266             final StringBuilder s = new StringBuilder();
    267             int character = readChar(dictBuffer);
    268             while (character != FormatSpec.INVALID_CHARACTER) {
    269                 s.appendCodePoint(character);
    270                 character = readChar(dictBuffer);
    271             }
    272             return s.toString();
    273         }
    274 
    275         /**
    276          * Reads a character from the buffer.
    277          *
    278          * This follows the character format documented earlier in this source file.
    279          *
    280          * @param dictBuffer the buffer, positioned over an encoded character.
    281          * @return the character code.
    282          */
    283         static int readChar(final DictBuffer dictBuffer) {
    284             int character = dictBuffer.readUnsignedByte();
    285             if (!fitsOnOneByte(character, null)) {
    286                 if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
    287                     return FormatSpec.INVALID_CHARACTER;
    288                 }
    289                 character <<= 16;
    290                 character += dictBuffer.readUnsignedShort();
    291             }
    292             return character;
    293         }
    294     }
    295 
    296     /**
    297      * Reads and returns the PtNode count out of a buffer and forwards the pointer.
    298      */
    299     /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
    300         final int msb = dictBuffer.readUnsignedByte();
    301         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
    302             return msb;
    303         }
    304         return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
    305                 + dictBuffer.readUnsignedByte();
    306     }
    307 
    308     /**
    309      * Finds, as a string, the word at the position passed as an argument.
    310      *
    311      * @param dictDecoder the dict decoder.
    312      * @param headerSize the size of the header.
    313      * @param pos the position to seek.
    314      * @return the word with its frequency, as a weighted string.
    315      */
    316     @UsedForTesting
    317     /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
    318             final int headerSize, final int pos) {
    319         final WeightedString result;
    320         final int originalPos = dictDecoder.getPosition();
    321         dictDecoder.setPosition(pos);
    322         result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
    323         dictDecoder.setPosition(originalPos);
    324         return result;
    325     }
    326 
    327     private static WeightedString getWordAtPositionWithoutParentAddress(
    328             final DictDecoder dictDecoder, final int headerSize, final int pos) {
    329         dictDecoder.setPosition(headerSize);
    330         final int count = dictDecoder.readPtNodeCount();
    331         int groupPos = dictDecoder.getPosition();
    332         final StringBuilder builder = new StringBuilder();
    333         WeightedString result = null;
    334 
    335         PtNodeInfo last = null;
    336         for (int i = count - 1; i >= 0; --i) {
    337             PtNodeInfo info = dictDecoder.readPtNode(groupPos);
    338             groupPos = info.mEndAddress;
    339             if (info.mOriginalAddress == pos) {
    340                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
    341                 result = new WeightedString(builder.toString(), info.mProbabilityInfo);
    342                 break; // and return
    343             }
    344             if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
    345                 if (info.mChildrenAddress > pos) {
    346                     if (null == last) continue;
    347                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
    348                     dictDecoder.setPosition(last.mChildrenAddress);
    349                     i = dictDecoder.readPtNodeCount();
    350                     groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
    351                     last = null;
    352                     continue;
    353                 }
    354                 last = info;
    355             }
    356             if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
    357                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
    358                 dictDecoder.setPosition(last.mChildrenAddress);
    359                 i = dictDecoder.readPtNodeCount();
    360                 groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
    361                 last = null;
    362                 continue;
    363             }
    364         }
    365         return result;
    366     }
    367 
    368     /**
    369      * Helper method that brutally decodes a header from a byte array.
    370      *
    371      * @param headerBuffer a buffer containing the bytes of the header.
    372      * @return a hashmap of the attributes stored in the header
    373      */
    374     @Nonnull
    375     public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
    376             throws UnsupportedFormatException {
    377         final StringBuilder sb = new StringBuilder();
    378         final LinkedList<String> keyValues = new LinkedList<>();
    379         int index = 0;
    380         while (index < headerBuffer.length) {
    381             if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
    382                 keyValues.add(sb.toString());
    383                 sb.setLength(0);
    384             } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF,
    385                     null /* codePointTable */)) {
    386                 sb.appendCodePoint(headerBuffer[index] & 0xFF);
    387             } else {
    388                 sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16)
    389                         + ((headerBuffer[index + 1] & 0xFF) << 8)
    390                         + (headerBuffer[index + 2] & 0xFF));
    391                 index += 2;
    392             }
    393             index += 1;
    394         }
    395         if ((keyValues.size() & 1) != 0) {
    396             throw new UnsupportedFormatException("Odd number of attributes");
    397         }
    398         final HashMap<String, String> attributes = new HashMap<>();
    399         for (int i = 0; i < keyValues.size(); i += 2) {
    400             attributes.put(keyValues.get(i), keyValues.get(i + 1));
    401         }
    402         return attributes;
    403     }
    404 
    405     /**
    406      * Helper method to pass a file name instead of a File object to isBinaryDictionary.
    407      */
    408     public static boolean isBinaryDictionary(final String filename) {
    409         final File file = new File(filename);
    410         return isBinaryDictionary(file);
    411     }
    412 
    413     /**
    414      * Basic test to find out whether the file is a binary dictionary or not.
    415      *
    416      * @param file The file to test.
    417      * @return true if it's a binary dictionary, false otherwise
    418      */
    419     public static boolean isBinaryDictionary(final File file) {
    420         final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
    421         if (dictDecoder == null) {
    422             return false;
    423         }
    424         return dictDecoder.hasValidRawBinaryDictionary();
    425     }
    426 }
    427