Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.Constants;
     21 import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
     22 
     23 import java.io.File;
     24 import java.io.IOException;
     25 import java.io.OutputStream;
     26 import java.util.ArrayList;
     27 import java.util.Map;
     28 import java.util.Stack;
     29 
     30 public final class BinaryDictIOUtils {
     31     private static final boolean DBG = false;
     32 
     33     private BinaryDictIOUtils() {
     34         // This utility class is not publicly instantiable.
     35     }
     36 
     37     /**
     38      * Returns new dictionary decoder.
     39      *
     40      * @param dictFile the dictionary file.
     41      * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
     42      * @return new dictionary decoder if the dictionary file exists, otherwise null.
     43      */
     44     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
     45             final long length, final int bufferType) {
     46         if (dictFile.isDirectory()) {
     47             return new Ver4DictDecoder(dictFile, bufferType);
     48         } else if (dictFile.isFile()) {
     49             return new Ver2DictDecoder(dictFile, offset, length, bufferType);
     50         }
     51         return null;
     52     }
     53 
     54     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
     55             final long length, final DictionaryBufferFactory factory) {
     56         if (dictFile.isDirectory()) {
     57             return new Ver4DictDecoder(dictFile, factory);
     58         } else if (dictFile.isFile()) {
     59             return new Ver2DictDecoder(dictFile, offset, length, factory);
     60         }
     61         return null;
     62     }
     63 
     64     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
     65             final long length) {
     66         return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
     67     }
     68 
     69     private static final class Position {
     70         public static final int NOT_READ_PTNODE_COUNT = -1;
     71 
     72         public int mAddress;
     73         public int mNumOfPtNode;
     74         public int mPosition;
     75         public int mLength;
     76 
     77         public Position(int address, int length) {
     78             mAddress = address;
     79             mLength = length;
     80             mNumOfPtNode = NOT_READ_PTNODE_COUNT;
     81         }
     82     }
     83 
     84     /**
     85      * Retrieves all node arrays without recursive call.
     86      */
     87     private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
     88             final int bodyOffset, final Map<Integer, String> words,
     89             final Map<Integer, Integer> frequencies,
     90             final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
     91         int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
     92 
     93         Stack<Position> stack = new Stack<>();
     94         int index = 0;
     95 
     96         Position initPos = new Position(bodyOffset, 0);
     97         stack.push(initPos);
     98 
     99         while (!stack.empty()) {
    100             Position p = stack.peek();
    101 
    102             if (DBG) {
    103                 MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
    104                         p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
    105             }
    106 
    107             if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
    108             if (index != p.mLength) index = p.mLength;
    109 
    110             if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
    111                 p.mNumOfPtNode = dictDecoder.readPtNodeCount();
    112                 p.mAddress = dictDecoder.getPosition();
    113                 p.mPosition = 0;
    114             }
    115             if (p.mNumOfPtNode == 0) {
    116                 stack.pop();
    117                 continue;
    118             }
    119             final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
    120             for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
    121                 pushedChars[index++] = ptNodeInfo.mCharacters[i];
    122             }
    123             p.mPosition++;
    124             if (ptNodeInfo.isTerminal()) {// found word
    125                 words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
    126                 frequencies.put(
    127                         ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
    128                 if (ptNodeInfo.mBigrams != null) {
    129                     bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
    130                 }
    131             }
    132 
    133             if (p.mPosition == p.mNumOfPtNode) {
    134                 stack.pop();
    135             } else {
    136                 // The PtNode array has more PtNodes.
    137                 p.mAddress = dictDecoder.getPosition();
    138             }
    139 
    140             if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
    141                 final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
    142                 stack.push(childrenPos);
    143             }
    144         }
    145     }
    146 
    147     /**
    148      * Reads unigrams and bigrams from the binary file.
    149      * Doesn't store a full memory representation of the dictionary.
    150      *
    151      * @param dictDecoder the dict decoder.
    152      * @param words the map to store the address as a key and the word as a value.
    153      * @param frequencies the map to store the address as a key and the frequency as a value.
    154      * @param bigrams the map to store the address as a key and the list of address as a value.
    155      * @throws IOException if the file can't be read.
    156      * @throws UnsupportedFormatException if the format of the file is not recognized.
    157      */
    158     /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
    159             final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
    160             final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
    161             UnsupportedFormatException {
    162         // Read header
    163         final DictionaryHeader header = dictDecoder.readHeader();
    164         readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
    165             frequencies, bigrams);
    166     }
    167 
    168     /**
    169      * Gets the address of the last PtNode of the exact matching word in the dictionary.
    170      * If no match is found, returns NOT_VALID_WORD.
    171      *
    172      * @param dictDecoder the dict decoder.
    173      * @param word the word we search for.
    174      * @return the address of the terminal node.
    175      * @throws IOException if the file can't be read.
    176      * @throws UnsupportedFormatException if the format of the file is not recognized.
    177      */
    178     @UsedForTesting
    179     /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
    180             final String word) throws IOException, UnsupportedFormatException {
    181         if (word == null) return FormatSpec.NOT_VALID_WORD;
    182         dictDecoder.setPosition(0);
    183         dictDecoder.readHeader();
    184         int wordPos = 0;
    185         final int wordLen = word.codePointCount(0, word.length());
    186         for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
    187             if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
    188 
    189             do {
    190                 final int ptNodeCount = dictDecoder.readPtNodeCount();
    191                 boolean foundNextPtNode = false;
    192                 for (int i = 0; i < ptNodeCount; ++i) {
    193                     final int ptNodePos = dictDecoder.getPosition();
    194                     final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
    195                     boolean same = true;
    196                     for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
    197                             p < currentInfo.mCharacters.length;
    198                             ++p, j = word.offsetByCodePoints(j, 1)) {
    199                         if (wordPos + p >= wordLen
    200                                 || word.codePointAt(j) != currentInfo.mCharacters[p]) {
    201                             same = false;
    202                             break;
    203                         }
    204                     }
    205 
    206                     if (same) {
    207                         // found the PtNode matches the word.
    208                         if (wordPos + currentInfo.mCharacters.length == wordLen) {
    209                             if (!currentInfo.isTerminal()) {
    210                                 return FormatSpec.NOT_VALID_WORD;
    211                             } else {
    212                                 return ptNodePos;
    213                             }
    214                         }
    215                         wordPos += currentInfo.mCharacters.length;
    216                         if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
    217                             return FormatSpec.NOT_VALID_WORD;
    218                         }
    219                         foundNextPtNode = true;
    220                         dictDecoder.setPosition(currentInfo.mChildrenAddress);
    221                         break;
    222                     }
    223                 }
    224                 if (foundNextPtNode) break;
    225                 return FormatSpec.NOT_VALID_WORD;
    226             } while(true);
    227         }
    228         return FormatSpec.NOT_VALID_WORD;
    229     }
    230 
    231     /**
    232      * Writes a PtNodeCount to the stream.
    233      *
    234      * @param destination the stream to write.
    235      * @param ptNodeCount the count.
    236      * @return the size written in bytes.
    237      */
    238     @UsedForTesting
    239     static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
    240             throws IOException {
    241         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
    242         // the count must fit on one byte or two bytes.
    243         // Please see comments in FormatSpec.
    244         if (countSize != 1 && countSize != 2) {
    245             throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
    246         }
    247         final int encodedPtNodeCount = (countSize == 2) ?
    248                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
    249         BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
    250         return countSize;
    251     }
    252 
    253     /**
    254      * Helper method to hide the actual value of the no children address.
    255      */
    256     public static boolean hasChildrenAddress(final int address) {
    257         return FormatSpec.NO_CHILDREN_ADDRESS != address;
    258     }
    259 
    260     /**
    261      * Compute the binary size of the node count
    262      * @param count the node count
    263      * @return the size of the node count, either 1 or 2 bytes.
    264      */
    265     public static int getPtNodeCountSize(final int count) {
    266         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
    267             return 1;
    268         } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
    269             return 2;
    270         } else {
    271             throw new RuntimeException("Can't have more than "
    272                     + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
    273                     + count + ")");
    274         }
    275     }
    276 
    277     static int getChildrenAddressSize(final int optionFlags) {
    278         switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
    279             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
    280                 return 1;
    281             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
    282                 return 2;
    283             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
    284                 return 3;
    285             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
    286             default:
    287                 return 0;
    288         }
    289     }
    290 
    291     /**
    292      * Calculate bigram frequency from compressed value
    293      *
    294      * @param unigramFrequency
    295      * @param bigramFrequency compressed frequency
    296      * @return approximate bigram frequency
    297      */
    298     @UsedForTesting
    299     public static int reconstructBigramFrequency(final int unigramFrequency,
    300             final int bigramFrequency) {
    301         final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
    302                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
    303         final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
    304         return (int)resultFreqFloat;
    305     }
    306 }
    307