Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.define.DecoderSpecificConstants;
     21 import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
     22 
     23 import java.io.File;
     24 import java.io.IOException;
     25 import java.io.OutputStream;
     26 import java.util.ArrayList;
     27 import java.util.Map;
     28 import java.util.Stack;
     29 
     30 public final class BinaryDictIOUtils {
     31     private static final boolean DBG = false;
     32 
     33     private BinaryDictIOUtils() {
     34         // This utility class is not publicly instantiable.
     35     }
     36 
     37     /**
     38      * Returns new dictionary decoder.
     39      *
     40      * @param dictFile the dictionary file.
     41      * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
     42      * @return new dictionary decoder if the dictionary file exists, otherwise null.
     43      */
     44     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
     45             final long length, final int bufferType) {
     46         return new Ver4DictDecoder(dictFile);
     47     }
     48 
     49     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
     50             final long length, final DictionaryBufferFactory factory) {
     51         return new Ver4DictDecoder(dictFile);
     52     }
     53 
     54     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
     55             final long length) {
     56         return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
     57     }
     58 
     59     private static final class Position {
     60         public static final int NOT_READ_PTNODE_COUNT = -1;
     61 
     62         public int mAddress;
     63         public int mNumOfPtNode;
     64         public int mPosition;
     65         public int mLength;
     66 
     67         public Position(int address, int length) {
     68             mAddress = address;
     69             mLength = length;
     70             mNumOfPtNode = NOT_READ_PTNODE_COUNT;
     71         }
     72     }
     73 
     74     /**
     75      * Retrieves all node arrays without recursive call.
     76      */
     77     private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
     78             final int bodyOffset, final Map<Integer, String> words,
     79             final Map<Integer, Integer> frequencies,
     80             final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
     81         int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
     82 
     83         Stack<Position> stack = new Stack<>();
     84         int index = 0;
     85 
     86         Position initPos = new Position(bodyOffset, 0);
     87         stack.push(initPos);
     88 
     89         while (!stack.empty()) {
     90             Position p = stack.peek();
     91 
     92             if (DBG) {
     93                 MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
     94                         p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
     95             }
     96 
     97             if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
     98             if (index != p.mLength) index = p.mLength;
     99 
    100             if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
    101                 p.mNumOfPtNode = dictDecoder.readPtNodeCount();
    102                 p.mAddress = dictDecoder.getPosition();
    103                 p.mPosition = 0;
    104             }
    105             if (p.mNumOfPtNode == 0) {
    106                 stack.pop();
    107                 continue;
    108             }
    109             final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
    110             for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
    111                 pushedChars[index++] = ptNodeInfo.mCharacters[i];
    112             }
    113             p.mPosition++;
    114             if (ptNodeInfo.isTerminal()) {// found word
    115                 words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
    116                 frequencies.put(
    117                         ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
    118                 if (ptNodeInfo.mBigrams != null) {
    119                     bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
    120                 }
    121             }
    122 
    123             if (p.mPosition == p.mNumOfPtNode) {
    124                 stack.pop();
    125             } else {
    126                 // The PtNode array has more PtNodes.
    127                 p.mAddress = dictDecoder.getPosition();
    128             }
    129 
    130             if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
    131                 final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
    132                 stack.push(childrenPos);
    133             }
    134         }
    135     }
    136 
    137     /**
    138      * Reads unigrams and bigrams from the binary file.
    139      * Doesn't store a full memory representation of the dictionary.
    140      *
    141      * @param dictDecoder the dict decoder.
    142      * @param words the map to store the address as a key and the word as a value.
    143      * @param frequencies the map to store the address as a key and the frequency as a value.
    144      * @param bigrams the map to store the address as a key and the list of address as a value.
    145      * @throws IOException if the file can't be read.
    146      * @throws UnsupportedFormatException if the format of the file is not recognized.
    147      */
    148     /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
    149             final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
    150             final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
    151             UnsupportedFormatException {
    152         // Read header
    153         final DictionaryHeader header = dictDecoder.readHeader();
    154         readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
    155             frequencies, bigrams);
    156     }
    157 
    158     /**
    159      * Gets the address of the last PtNode of the exact matching word in the dictionary.
    160      * If no match is found, returns NOT_VALID_WORD.
    161      *
    162      * @param dictDecoder the dict decoder.
    163      * @param word the word we search for.
    164      * @return the address of the terminal node.
    165      * @throws IOException if the file can't be read.
    166      * @throws UnsupportedFormatException if the format of the file is not recognized.
    167      */
    168     @UsedForTesting
    169     /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
    170             final String word) throws IOException, UnsupportedFormatException {
    171         if (word == null) return FormatSpec.NOT_VALID_WORD;
    172         dictDecoder.setPosition(0);
    173         dictDecoder.readHeader();
    174         int wordPos = 0;
    175         final int wordLen = word.codePointCount(0, word.length());
    176         for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
    177             if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
    178 
    179             do {
    180                 final int ptNodeCount = dictDecoder.readPtNodeCount();
    181                 boolean foundNextPtNode = false;
    182                 for (int i = 0; i < ptNodeCount; ++i) {
    183                     final int ptNodePos = dictDecoder.getPosition();
    184                     final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
    185                     boolean same = true;
    186                     for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
    187                             p < currentInfo.mCharacters.length;
    188                             ++p, j = word.offsetByCodePoints(j, 1)) {
    189                         if (wordPos + p >= wordLen
    190                                 || word.codePointAt(j) != currentInfo.mCharacters[p]) {
    191                             same = false;
    192                             break;
    193                         }
    194                     }
    195 
    196                     if (same) {
    197                         // found the PtNode matches the word.
    198                         if (wordPos + currentInfo.mCharacters.length == wordLen) {
    199                             return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD;
    200                         }
    201                         wordPos += currentInfo.mCharacters.length;
    202                         if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
    203                             return FormatSpec.NOT_VALID_WORD;
    204                         }
    205                         foundNextPtNode = true;
    206                         dictDecoder.setPosition(currentInfo.mChildrenAddress);
    207                         break;
    208                     }
    209                 }
    210                 if (foundNextPtNode) break;
    211                 return FormatSpec.NOT_VALID_WORD;
    212             } while(true);
    213         }
    214         return FormatSpec.NOT_VALID_WORD;
    215     }
    216 
    217     /**
    218      * Writes a PtNodeCount to the stream.
    219      *
    220      * @param destination the stream to write.
    221      * @param ptNodeCount the count.
    222      * @return the size written in bytes.
    223      */
    224     @UsedForTesting
    225     static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
    226             throws IOException {
    227         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
    228         // the count must fit on one byte or two bytes.
    229         // Please see comments in FormatSpec.
    230         if (countSize != 1 && countSize != 2) {
    231             throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
    232         }
    233         final int encodedPtNodeCount = (countSize == 2) ?
    234                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
    235         BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
    236         return countSize;
    237     }
    238 
    239     /**
    240      * Helper method to hide the actual value of the no children address.
    241      */
    242     public static boolean hasChildrenAddress(final int address) {
    243         return FormatSpec.NO_CHILDREN_ADDRESS != address;
    244     }
    245 
    246     /**
    247      * Compute the binary size of the node count
    248      * @param count the node count
    249      * @return the size of the node count, either 1 or 2 bytes.
    250      */
    251     public static int getPtNodeCountSize(final int count) {
    252         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
    253             return 1;
    254         } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
    255             return 2;
    256         } else {
    257             throw new RuntimeException("Can't have more than "
    258                     + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
    259                     + count + ")");
    260         }
    261     }
    262 
    263     static int getChildrenAddressSize(final int optionFlags) {
    264         switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
    265             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
    266                 return 1;
    267             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
    268                 return 2;
    269             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
    270                 return 3;
    271             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
    272             default:
    273                 return 0;
    274         }
    275     }
    276 
    277     /**
    278      * Calculate bigram frequency from compressed value
    279      *
    280      * @param unigramFrequency
    281      * @param bigramFrequency compressed frequency
    282      * @return approximate bigram frequency
    283      */
    284     @UsedForTesting
    285     public static int reconstructBigramFrequency(final int unigramFrequency,
    286             final int bigramFrequency) {
    287         final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
    288                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
    289         final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
    290         return (int)resultFreqFloat;
    291     }
    292 }
    293