1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.define.DecoderSpecificConstants; 21 import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; 22 23 import java.io.File; 24 import java.io.IOException; 25 import java.io.OutputStream; 26 import java.util.ArrayList; 27 import java.util.Map; 28 import java.util.Stack; 29 30 public final class BinaryDictIOUtils { 31 private static final boolean DBG = false; 32 33 private BinaryDictIOUtils() { 34 // This utility class is not publicly instantiable. 35 } 36 37 /** 38 * Returns new dictionary decoder. 39 * 40 * @param dictFile the dictionary file. 41 * @param bufferType The type of buffer, as one of USE_* in DictDecoder. 42 * @return new dictionary decoder if the dictionary file exists, otherwise null. 43 */ 44 public static DictDecoder getDictDecoder(final File dictFile, final long offset, 45 final long length, final int bufferType) { 46 return new Ver4DictDecoder(dictFile); 47 } 48 49 public static DictDecoder getDictDecoder(final File dictFile, final long offset, 50 final long length, final DictionaryBufferFactory factory) { 51 return new Ver4DictDecoder(dictFile); 52 } 53 54 public static DictDecoder getDictDecoder(final File dictFile, final long offset, 55 final long length) { 56 return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER); 57 } 58 59 private static final class Position { 60 public static final int NOT_READ_PTNODE_COUNT = -1; 61 62 public int mAddress; 63 public int mNumOfPtNode; 64 public int mPosition; 65 public int mLength; 66 67 public Position(int address, int length) { 68 mAddress = address; 69 mLength = length; 70 mNumOfPtNode = NOT_READ_PTNODE_COUNT; 71 } 72 } 73 74 /** 75 * Retrieves all node arrays without recursive call. 76 */ 77 private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, 78 final int bodyOffset, final Map<Integer, String> words, 79 final Map<Integer, Integer> frequencies, 80 final Map<Integer, ArrayList<PendingAttribute>> bigrams) { 81 int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; 82 83 Stack<Position> stack = new Stack<>(); 84 int index = 0; 85 86 Position initPos = new Position(bodyOffset, 0); 87 stack.push(initPos); 88 89 while (!stack.empty()) { 90 Position p = stack.peek(); 91 92 if (DBG) { 93 MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" + 94 p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); 95 } 96 97 if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); 98 if (index != p.mLength) index = p.mLength; 99 100 if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { 101 p.mNumOfPtNode = dictDecoder.readPtNodeCount(); 102 p.mAddress = dictDecoder.getPosition(); 103 p.mPosition = 0; 104 } 105 if (p.mNumOfPtNode == 0) { 106 stack.pop(); 107 continue; 108 } 109 final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress); 110 for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) { 111 pushedChars[index++] = ptNodeInfo.mCharacters[i]; 112 } 113 p.mPosition++; 114 if (ptNodeInfo.isTerminal()) {// found word 115 words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index)); 116 frequencies.put( 117 ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability); 118 if (ptNodeInfo.mBigrams != null) { 119 bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams); 120 } 121 } 122 123 if (p.mPosition == p.mNumOfPtNode) { 124 stack.pop(); 125 } else { 126 // The PtNode array has more PtNodes. 127 p.mAddress = dictDecoder.getPosition(); 128 } 129 130 if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) { 131 final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index); 132 stack.push(childrenPos); 133 } 134 } 135 } 136 137 /** 138 * Reads unigrams and bigrams from the binary file. 139 * Doesn't store a full memory representation of the dictionary. 140 * 141 * @param dictDecoder the dict decoder. 142 * @param words the map to store the address as a key and the word as a value. 143 * @param frequencies the map to store the address as a key and the frequency as a value. 144 * @param bigrams the map to store the address as a key and the list of address as a value. 145 * @throws IOException if the file can't be read. 146 * @throws UnsupportedFormatException if the format of the file is not recognized. 147 */ 148 /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, 149 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 150 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 151 UnsupportedFormatException { 152 // Read header 153 final DictionaryHeader header = dictDecoder.readHeader(); 154 readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words, 155 frequencies, bigrams); 156 } 157 158 /** 159 * Gets the address of the last PtNode of the exact matching word in the dictionary. 160 * If no match is found, returns NOT_VALID_WORD. 161 * 162 * @param dictDecoder the dict decoder. 163 * @param word the word we search for. 164 * @return the address of the terminal node. 165 * @throws IOException if the file can't be read. 166 * @throws UnsupportedFormatException if the format of the file is not recognized. 167 */ 168 @UsedForTesting 169 /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, 170 final String word) throws IOException, UnsupportedFormatException { 171 if (word == null) return FormatSpec.NOT_VALID_WORD; 172 dictDecoder.setPosition(0); 173 dictDecoder.readHeader(); 174 int wordPos = 0; 175 final int wordLen = word.codePointCount(0, word.length()); 176 for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { 177 if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; 178 179 do { 180 final int ptNodeCount = dictDecoder.readPtNodeCount(); 181 boolean foundNextPtNode = false; 182 for (int i = 0; i < ptNodeCount; ++i) { 183 final int ptNodePos = dictDecoder.getPosition(); 184 final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos); 185 boolean same = true; 186 for (int p = 0, j = word.offsetByCodePoints(0, wordPos); 187 p < currentInfo.mCharacters.length; 188 ++p, j = word.offsetByCodePoints(j, 1)) { 189 if (wordPos + p >= wordLen 190 || word.codePointAt(j) != currentInfo.mCharacters[p]) { 191 same = false; 192 break; 193 } 194 } 195 196 if (same) { 197 // found the PtNode matches the word. 198 if (wordPos + currentInfo.mCharacters.length == wordLen) { 199 return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD; 200 } 201 wordPos += currentInfo.mCharacters.length; 202 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 203 return FormatSpec.NOT_VALID_WORD; 204 } 205 foundNextPtNode = true; 206 dictDecoder.setPosition(currentInfo.mChildrenAddress); 207 break; 208 } 209 } 210 if (foundNextPtNode) break; 211 return FormatSpec.NOT_VALID_WORD; 212 } while(true); 213 } 214 return FormatSpec.NOT_VALID_WORD; 215 } 216 217 /** 218 * Writes a PtNodeCount to the stream. 219 * 220 * @param destination the stream to write. 221 * @param ptNodeCount the count. 222 * @return the size written in bytes. 223 */ 224 @UsedForTesting 225 static int writePtNodeCount(final OutputStream destination, final int ptNodeCount) 226 throws IOException { 227 final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); 228 // the count must fit on one byte or two bytes. 229 // Please see comments in FormatSpec. 230 if (countSize != 1 && countSize != 2) { 231 throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); 232 } 233 final int encodedPtNodeCount = (countSize == 2) ? 234 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; 235 BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize); 236 return countSize; 237 } 238 239 /** 240 * Helper method to hide the actual value of the no children address. 241 */ 242 public static boolean hasChildrenAddress(final int address) { 243 return FormatSpec.NO_CHILDREN_ADDRESS != address; 244 } 245 246 /** 247 * Compute the binary size of the node count 248 * @param count the node count 249 * @return the size of the node count, either 1 or 2 bytes. 250 */ 251 public static int getPtNodeCountSize(final int count) { 252 if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) { 253 return 1; 254 } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) { 255 return 2; 256 } else { 257 throw new RuntimeException("Can't have more than " 258 + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found " 259 + count + ")"); 260 } 261 } 262 263 static int getChildrenAddressSize(final int optionFlags) { 264 switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { 265 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: 266 return 1; 267 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: 268 return 2; 269 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: 270 return 3; 271 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: 272 default: 273 return 0; 274 } 275 } 276 277 /** 278 * Calculate bigram frequency from compressed value 279 * 280 * @param unigramFrequency 281 * @param bigramFrequency compressed frequency 282 * @return approximate bigram frequency 283 */ 284 @UsedForTesting 285 public static int reconstructBigramFrequency(final int unigramFrequency, 286 final int bigramFrequency) { 287 final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) 288 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); 289 final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); 290 return (int)resultFreqFloat; 291 } 292 } 293