1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.BinaryDictionary; 21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; 22 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; 23 24 import java.io.File; 25 import java.io.FileNotFoundException; 26 import java.io.IOException; 27 import java.util.ArrayList; 28 import java.util.Arrays; 29 30 /** 31 * An implementation of DictDecoder for version 2 binary dictionary. 32 */ 33 // TODO: Separate logics that are used only for testing. 34 @UsedForTesting 35 public class Ver2DictDecoder extends AbstractDictDecoder { 36 /** 37 * A utility class for reading a PtNode. 38 */ 39 protected static class PtNodeReader { 40 private static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) { 41 // Ver2 dicts don't contain historical information. 42 return new ProbabilityInfo(dictBuffer.readUnsignedByte()); 43 } 44 45 protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) { 46 return dictBuffer.readUnsignedByte(); 47 } 48 49 protected static int readChildrenAddress(final DictBuffer dictBuffer, 50 final int ptNodeFlags) { 51 switch (ptNodeFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { 52 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: 53 return dictBuffer.readUnsignedByte(); 54 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: 55 return dictBuffer.readUnsignedShort(); 56 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: 57 return dictBuffer.readUnsignedInt24(); 58 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: 59 default: 60 return FormatSpec.NO_CHILDREN_ADDRESS; 61 } 62 } 63 64 // Reads shortcuts and returns the read length. 65 protected static int readShortcut(final DictBuffer dictBuffer, 66 final ArrayList<WeightedString> shortcutTargets) { 67 final int pointerBefore = dictBuffer.position(); 68 dictBuffer.readUnsignedShort(); // skip the size 69 while (true) { 70 final int targetFlags = dictBuffer.readUnsignedByte(); 71 final String word = CharEncoding.readString(dictBuffer); 72 shortcutTargets.add(new WeightedString(word, 73 targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); 74 if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; 75 } 76 return dictBuffer.position() - pointerBefore; 77 } 78 79 protected static int readBigramAddresses(final DictBuffer dictBuffer, 80 final ArrayList<PendingAttribute> bigrams, final int baseAddress) { 81 int readLength = 0; 82 int bigramCount = 0; 83 while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { 84 final int bigramFlags = dictBuffer.readUnsignedByte(); 85 ++readLength; 86 final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) 87 ? 1 : -1; 88 int bigramAddress = baseAddress + readLength; 89 switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { 90 case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: 91 bigramAddress += sign * dictBuffer.readUnsignedByte(); 92 readLength += 1; 93 break; 94 case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: 95 bigramAddress += sign * dictBuffer.readUnsignedShort(); 96 readLength += 2; 97 break; 98 case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: 99 bigramAddress += sign * dictBuffer.readUnsignedInt24(); 100 readLength += 3; 101 break; 102 default: 103 throw new RuntimeException("Has bigrams with no address"); 104 } 105 bigrams.add(new PendingAttribute( 106 bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, 107 bigramAddress)); 108 if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; 109 } 110 return readLength; 111 } 112 } 113 114 protected final File mDictionaryBinaryFile; 115 protected final long mOffset; 116 protected final long mLength; 117 // TODO: Remove mBufferFactory and mDictBuffer from this class members because they are now 118 // used only for testing. 119 private final DictionaryBufferFactory mBufferFactory; 120 protected DictBuffer mDictBuffer; 121 122 @UsedForTesting 123 /* package */ Ver2DictDecoder(final File file, final long offset, final long length, 124 final int factoryFlag) { 125 mDictionaryBinaryFile = file; 126 mOffset = offset; 127 mLength = length; 128 mDictBuffer = null; 129 if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { 130 mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); 131 } else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) { 132 mBufferFactory = new DictionaryBufferFromByteArrayFactory(); 133 } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) { 134 mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory(); 135 } else { 136 mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); 137 } 138 } 139 140 /* package */ Ver2DictDecoder(final File file, final long offset, final long length, 141 final DictionaryBufferFactory factory) { 142 mDictionaryBinaryFile = file; 143 mOffset = offset; 144 mLength = length; 145 mBufferFactory = factory; 146 } 147 148 @Override 149 public void openDictBuffer() throws FileNotFoundException, IOException { 150 mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile); 151 } 152 153 @Override 154 public boolean isDictBufferOpen() { 155 return mDictBuffer != null; 156 } 157 158 /* package */ DictBuffer getDictBuffer() { 159 return mDictBuffer; 160 } 161 162 @UsedForTesting 163 /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException { 164 openDictBuffer(); 165 return getDictBuffer(); 166 } 167 168 @Override 169 public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException { 170 // dictType is not being used in dicttool. Passing an empty string. 171 final BinaryDictionary binaryDictionary = new BinaryDictionary( 172 mDictionaryBinaryFile.getAbsolutePath(), mOffset, mLength, 173 true /* useFullEditDistance */, null /* locale */, "" /* dictType */, 174 false /* isUpdatable */); 175 final DictionaryHeader header = binaryDictionary.getHeader(); 176 binaryDictionary.close(); 177 if (header == null) { 178 throw new IOException("Cannot read the dictionary header."); 179 } 180 if (header.mFormatOptions.mVersion != FormatSpec.VERSION2) { 181 throw new UnsupportedFormatException("File header has a wrong version : " 182 + header.mFormatOptions.mVersion); 183 } 184 if (!isDictBufferOpen()) { 185 openDictBuffer(); 186 } 187 // Advance buffer reading position to the head of dictionary body. 188 setPosition(header.mBodyOffset); 189 return header; 190 } 191 192 // TODO: Make this buffer multi thread safe. 193 private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; 194 @Override 195 public PtNodeInfo readPtNode(final int ptNodePos) { 196 int addressPointer = ptNodePos; 197 final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); 198 addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; 199 final int characters[]; 200 if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { 201 int index = 0; 202 int character = CharEncoding.readChar(mDictBuffer); 203 addressPointer += CharEncoding.getCharSize(character); 204 while (FormatSpec.INVALID_CHARACTER != character) { 205 // FusionDictionary is making sure that the length of the word is smaller than 206 // MAX_WORD_LENGTH. 207 // So we'll never write past the end of mCharacterBuffer. 208 mCharacterBuffer[index++] = character; 209 character = CharEncoding.readChar(mDictBuffer); 210 addressPointer += CharEncoding.getCharSize(character); 211 } 212 characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); 213 } else { 214 final int character = CharEncoding.readChar(mDictBuffer); 215 addressPointer += CharEncoding.getCharSize(character); 216 characters = new int[] { character }; 217 } 218 final ProbabilityInfo probabilityInfo; 219 if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { 220 probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer); 221 addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE; 222 } else { 223 probabilityInfo = null; 224 } 225 int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags); 226 if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { 227 childrenAddress += addressPointer; 228 } 229 addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags); 230 final ArrayList<WeightedString> shortcutTargets; 231 if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { 232 // readShortcut will add shortcuts to shortcutTargets. 233 shortcutTargets = new ArrayList<>(); 234 addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); 235 } else { 236 shortcutTargets = null; 237 } 238 239 final ArrayList<PendingAttribute> bigrams; 240 if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { 241 bigrams = new ArrayList<>(); 242 addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, 243 addressPointer); 244 if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { 245 throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() 246 + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); 247 } 248 } else { 249 bigrams = null; 250 } 251 return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo, 252 childrenAddress, shortcutTargets, bigrams); 253 } 254 255 @Override 256 public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken) 257 throws FileNotFoundException, IOException, UnsupportedFormatException { 258 // dictType is not being used in dicttool. Passing an empty string. 259 final BinaryDictionary binaryDictionary = new BinaryDictionary( 260 mDictionaryBinaryFile.getAbsolutePath(), 0 /* offset */, 261 mDictionaryBinaryFile.length() /* length */, true /* useFullEditDistance */, 262 null /* locale */, "" /* dictType */, false /* isUpdatable */); 263 final DictionaryHeader header = readHeader(); 264 final FusionDictionary fusionDict = 265 new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions); 266 int token = 0; 267 final ArrayList<WordProperty> wordProperties = new ArrayList<>(); 268 do { 269 final BinaryDictionary.GetNextWordPropertyResult result = 270 binaryDictionary.getNextWordProperty(token); 271 final WordProperty wordProperty = result.mWordProperty; 272 if (wordProperty == null) { 273 binaryDictionary.close(); 274 if (deleteDictIfBroken) { 275 mDictionaryBinaryFile.delete(); 276 } 277 return null; 278 } 279 wordProperties.add(wordProperty); 280 token = result.mNextToken; 281 } while (token != 0); 282 283 // Insert unigrams into the fusion dictionary. 284 for (final WordProperty wordProperty : wordProperties) { 285 if (wordProperty.mIsBlacklistEntry) { 286 fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets, 287 wordProperty.mIsNotAWord); 288 } else { 289 fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo, 290 wordProperty.mShortcutTargets, wordProperty.mIsNotAWord); 291 } 292 } 293 // Insert bigrams into the fusion dictionary. 294 for (final WordProperty wordProperty : wordProperties) { 295 if (wordProperty.mBigrams == null) { 296 continue; 297 } 298 final String word0 = wordProperty.mWord; 299 for (final WeightedString bigram : wordProperty.mBigrams) { 300 fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo); 301 } 302 } 303 binaryDictionary.close(); 304 return fusionDict; 305 } 306 307 @Override 308 public void setPosition(int newPos) { 309 mDictBuffer.position(newPos); 310 } 311 312 @Override 313 public int getPosition() { 314 return mDictBuffer.position(); 315 } 316 317 @Override 318 public int readPtNodeCount() { 319 return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer); 320 } 321 } 322