/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * An implementation of DictEncoder for version 4 binary dictionary.
 *
 * <p>The dictionary is written as a set of files inside a sub-directory of the directory given
 * to the constructor: a trie file (header followed by the PtNode arrays), a frequency file, a
 * terminal address table, an optional unigram timestamp file, and sparse-table backed bigram
 * and shortcut content files.
 */
@UsedForTesting
public class Ver4DictEncoder implements DictEncoder {
    /** Directory in which the per-dictionary directory is created. */
    private final File mDictPlacedDir;
    /** In-memory image of the trie body; allocated by {@link #writeDictionary}. */
    private byte[] mTrieBuf;
    /** Current write cursor inside {@link #mTrieBuf}. */
    private int mTriePos;
    /** Value returned by {@code BinaryDictEncoderUtils.writeDictionaryHeader}. */
    private int mHeaderSize;
    private OutputStream mTrieOutStream;
    private OutputStream mFreqOutStream;
    /** Only opened when the format options request timestamps. */
    private OutputStream mUnigramTimestampOutStream;
    private OutputStream mTerminalAddressTableOutStream;
    private File mDictDir;
    private String mBaseFilename;
    private BigramContentWriter mBigramWriter;
    private ShortcutContentWriter mShortcutWriter;

    /**
     * @param dictPlacedDir an existing directory under which the dictionary directory is created
     */
    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
        mDictPlacedDir = dictPlacedDir;
    }

    /** Callback that emits the content of one sparse-table entry to the given stream. */
    private interface SparseTableContentWriterInterface {
        public void write(final OutputStream outStream) throws IOException;
    }

    /**
     * Writes one or more content files together with the sparse table (lookup table plus one
     * address table per content file) that maps an index to an offset in each content file.
     */
    private static class SparseTableContentWriter {
        private final int mContentCount;
        private final SparseTable mSparseTable;
        private final File mLookupTableFile;
        protected final File mBaseDir;
        private final File[] mAddressTableFiles;
        private final File[] mContentFiles;
        protected final OutputStream[] mContentOutStreams;

        /**
         * @param name base name used for the lookup table and address table files
         * @param initialCapacity forwarded to the {@link SparseTable} constructor
         * @param blockSize forwarded to the {@link SparseTable} constructor
         * @param baseDir directory in which all files are created
         * @param contentFilenames base file name of each content file
         * @param contentIds suffix appended to each content / address table file name
         * @throws RuntimeException if the two arrays have different lengths
         */
        public SparseTableContentWriter(final String name, final int initialCapacity,
                final int blockSize, final File baseDir, final String[] contentFilenames,
                final String[] contentIds) {
            if (contentFilenames.length != contentIds.length) {
                throw new RuntimeException("The length of contentFilenames and the length of"
                        + " contentIds are different " + contentFilenames.length + ", "
                        + contentIds.length);
            }
            mContentCount = contentFilenames.length;
            mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
            mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
            mAddressTableFiles = new File[mContentCount];
            mContentFiles = new File[mContentCount];
            mBaseDir = baseDir;
            for (int i = 0; i < mContentCount; ++i) {
                mAddressTableFiles[i] = new File(mBaseDir,
                        name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
                mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
            }
            mContentOutStreams = new OutputStream[mContentCount];
        }

        /** Opens (creating or truncating) one output stream per content file. */
        public void openStreams() throws FileNotFoundException {
            for (int i = 0; i < mContentCount; ++i) {
                mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
            }
        }

        /**
         * Records the current length of the content file as the address of {@code index} and
         * then lets {@code writer} append the content.
         */
        protected void write(final int contentIndex, final int index,
                final SparseTableContentWriterInterface writer) throws IOException {
            // The address is taken from the on-disk file length, which is why the stream is
            // flushed after every entry.
            mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
            writer.write(mContentOutStreams[contentIndex]);
            mContentOutStreams[contentIndex].flush();
        }

        /**
         * Writes the sparse table files and closes every content stream. The content streams
         * are closed even when writing the table fails.
         */
        public void closeStreams() throws IOException {
            try {
                mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
            } finally {
                closeContentStreams();
            }
        }

        /** Closes all opened content streams; rethrows the first failure after trying all. */
        private void closeContentStreams() throws IOException {
            IOException firstFailure = null;
            for (int i = 0; i < mContentCount; ++i) {
                final OutputStream stream = mContentOutStreams[i];
                if (stream == null) {
                    // openStreams() was never called or failed before reaching this slot.
                    continue;
                }
                mContentOutStreams[i] = null;
                try {
                    stream.close();
                } catch (final IOException e) {
                    if (firstFailure == null) {
                        firstFailure = e;
                    }
                }
            }
            if (firstFailure != null) {
                throw firstFailure;
            }
        }
    }

    /** Sparse-table writer for bigram content (frequencies and, optionally, timestamps). */
    private static class BigramContentWriter extends SparseTableContentWriter {
        private final boolean mWriteTimestamp;

        public BigramContentWriter(final String name, final int initialCapacity,
                final File baseDir, final boolean writeTimestamp) {
            super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity,
                    FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
                    getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp));
            mWriteTimestamp = writeTimestamp;
        }

        /** Returns one base file name per content file (two when timestamps are written). */
        private static String[] getContentFilenames(final String name,
                final boolean writeTimestamp) {
            final String[] contentFilenames;
            if (writeTimestamp) {
                contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
                        name + FormatSpec.BIGRAM_FILE_EXTENSION };
            } else {
                contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
            }
            return contentFilenames;
        }

        /** Returns the content id of each content file, in the same order as the file names. */
        private static String[] getContentIds(final boolean writeTimestamp) {
            final String[] contentIds;
            if (writeTimestamp) {
                contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID,
                        FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
            } else {
                contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
            }
            return contentIds;
        }

        /**
         * Writes the bigram list of one word, and a zero-initialized timestamp record per
         * bigram when timestamps are enabled.
         *
         * @param terminalId terminal id of the word owning the bigrams
         * @param bigramCount number of bigrams supplied by {@code bigramIterator}
         * @param bigramIterator the bigrams to write
         * @param dict dictionary used to look up each bigram target
         */
        public void writeBigramsForOneWord(final int terminalId, final int bigramCount,
                final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
                throws IOException {
            write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
                    new SparseTableContentWriterInterface() {
                        @Override
                        public void write(final OutputStream outStream) throws IOException {
                            writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
                        }
                    });
            if (mWriteTimestamp) {
                write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId,
                        new SparseTableContentWriterInterface() {
                            @Override
                            public void write(final OutputStream outStream) throws IOException {
                                initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream,
                                        bigramCount);
                            }
                        });
            }
        }

        /** Emits, for each bigram, its flags followed by the terminal id of its target word. */
        private void writeBigramsForOneWordInternal(final OutputStream outStream,
                final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
                throws IOException {
            while (bigramIterator.hasNext()) {
                final WeightedString bigram = bigramIterator.next();
                // NOTE(review): target is dereferenced unconditionally; this presumably relies
                // on every bigram target being present in the dictionary - confirm.
                final PtNode target =
                        FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
                final int unigramFrequencyForThisWord = target.mFrequency;
                final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
                        bigramIterator.hasNext(), 0, bigram.mFrequency,
                        unigramFrequencyForThisWord, bigram.mWord);
                BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags,
                        FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
                BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId,
                        FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
            }
        }

        /** Emits a zeroed timestamp, counter and level for each of {@code bigramCount} bigrams. */
        private void initBigramTimestampsCountersAndLevelsForOneWordInternal(
                final OutputStream outStream, final int bigramCount) throws IOException {
            for (int i = 0; i < bigramCount; ++i) {
                // TODO: Figure out what initial values should be.
                BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
                        FormatSpec.BIGRAM_TIMESTAMP_SIZE);
                BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
                        FormatSpec.BIGRAM_COUNTER_SIZE);
                BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
                        FormatSpec.BIGRAM_LEVEL_SIZE);
            }
        }
    }

    /** Sparse-table writer for shortcut content. */
    private static class ShortcutContentWriter extends SparseTableContentWriter {
        public ShortcutContentWriter(final String name, final int initialCapacity,
                final File baseDir) {
            super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity,
                    FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
                    new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
                    new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
        }

        /**
         * Writes the shortcut list of one word.
         *
         * @param terminalId terminal id of the word owning the shortcuts
         * @param shortcutIterator the shortcut targets to write
         */
        public void writeShortcutForOneWord(final int terminalId,
                final Iterator<WeightedString> shortcutIterator) throws IOException {
            write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
                    new SparseTableContentWriterInterface() {
                        @Override
                        public void write(final OutputStream outStream) throws IOException {
                            writeShortcutForOneWordInternal(outStream, shortcutIterator);
                        }
                    });
        }

        /** Emits, for each shortcut, its flags followed by the encoded target string. */
        private void writeShortcutForOneWordInternal(final OutputStream outStream,
                final Iterator<WeightedString> shortcutIterator) throws IOException {
            while (shortcutIterator.hasNext()) {
                final WeightedString target = shortcutIterator.next();
                final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
                        shortcutIterator.hasNext(), target.mFrequency);
                BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags,
                        FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
                CharEncoding.writeString(outStream, target.mWord);
            }
        }
    }

    /**
     * Derives the dictionary directory and base file name from the header id and version,
     * creates the directory if needed and opens the trie, frequency, terminal address table
     * and (when timestamps are enabled) unigram timestamp streams.
     *
     * @throws IOException if the dictionary directory cannot be created
     * @throws FileNotFoundException if one of the files cannot be opened for writing
     */
    private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
            throws FileNotFoundException, IOException {
        final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
        mBaseFilename = header.getId() + "." + header.getVersion();
        mDictDir = new File(mDictPlacedDir, mBaseFilename);
        final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
        final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
        final File timestampFile = new File(mDictDir,
                mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
        final File terminalAddressTableFile = new File(mDictDir,
                mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
        if (!mDictDir.isDirectory()) {
            // A non-directory entry with the same name is replaced by the directory.
            if (mDictDir.exists()) mDictDir.delete();
            // Re-check isDirectory() so that a concurrently created directory is accepted.
            if (!mDictDir.mkdirs() && !mDictDir.isDirectory()) {
                throw new IOException("Cannot create dictionary directory : " + mDictDir);
            }
        }
        mTrieOutStream = new FileOutputStream(trieFile);
        mFreqOutStream = new FileOutputStream(freqFile);
        mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
        if (formatOptions.mHasTimestamp) {
            mUnigramTimestampOutStream = new FileOutputStream(timestampFile);
        }
    }

    /**
     * Closes every stream opened by {@link #openStreams} and clears the fields. All streams
     * are attempted even if one of them fails; the first failure is rethrown afterwards.
     */
    private void close() throws IOException {
        final OutputStream[] streams = {
            mTrieOutStream, mFreqOutStream, mTerminalAddressTableOutStream,
            mUnigramTimestampOutStream
        };
        mTrieOutStream = null;
        mFreqOutStream = null;
        mTerminalAddressTableOutStream = null;
        mUnigramTimestampOutStream = null;
        IOException firstFailure = null;
        for (final OutputStream stream : streams) {
            if (stream == null) {
                continue;
            }
            try {
                stream.close();
            } catch (final IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Writes {@code dict} as a version 4 dictionary under the directory given at construction.
     *
     * <p>The streams opened for the main files are always closed before this method returns,
     * whether it succeeds or throws.
     *
     * @param dict the dictionary to write
     * @param formatOptions format options; {@code mVersion} must be {@code FormatSpec.VERSION4}
     * @throws UnsupportedFormatException if the version is not 4 or the target path given to
     *         the constructor is not a directory
     * @throws IOException if writing any of the files fails
     */
    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion != FormatSpec.VERSION4) {
            throw new UnsupportedFormatException("File header has a wrong version number : "
                    + formatOptions.mVersion);
        }
        if (!mDictPlacedDir.isDirectory()) {
            throw new UnsupportedFormatException("Given path is not a directory.");
        }

        boolean succeeded = false;
        try {
            if (mTrieOutStream == null) {
                openStreams(formatOptions, dict.mOptions);
            }

            mHeaderSize = BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict,
                    formatOptions);

            MakedictLog.i("Flattening the tree...");
            ArrayList<PtNodeArray> flatNodes =
                    BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
            // Terminal ids are assigned sequentially in flattened order.
            int terminalCount = 0;
            for (final PtNodeArray array : flatNodes) {
                for (final PtNode node : array.mData) {
                    if (node.isTerminal()) node.mTerminalId = terminalCount++;
                }
            }

            MakedictLog.i("Computing addresses...");
            BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions);
            if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

            writeTerminalData(flatNodes, terminalCount);
            if (formatOptions.mHasTimestamp) {
                initUnigramTimestamps(terminalCount);
            }
            mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
                    formatOptions.mHasTimestamp);
            writeBigrams(flatNodes, dict);
            mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
            writeShortcuts(flatNodes);

            final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
            final int bufferSize =
                    lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
            mTrieBuf = new byte[bufferSize];
            // Start from the beginning of the fresh buffer, also when this encoder is reused.
            mTriePos = 0;

            MakedictLog.i("Writing file...");
            for (PtNodeArray nodeArray : flatNodes) {
                BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions);
            }
            if (MakedictLog.DBG) {
                BinaryDictEncoderUtils.showStatistics(flatNodes);
                MakedictLog.i("has " + terminalCount + " terminals.");
            }
            mTrieOutStream.write(mTrieBuf);

            MakedictLog.i("Done");
            succeeded = true;
        } finally {
            if (succeeded) {
                close();
            } else {
                try {
                    close();
                } catch (final IOException ignored) {
                    // The exception that aborted the write is the one worth reporting.
                }
            }
        }
    }

    /**
     * Moves the write cursor inside the trie buffer. The call is ignored when no buffer has
     * been allocated yet or when {@code position} lies outside of the buffer.
     */
    @Override
    public void setPosition(int position) {
        if (mTrieBuf == null || position < 0 || position >= mTrieBuf.length) return;
        mTriePos = position;
    }

    /** Returns the current write cursor inside the trie buffer. */
    @Override
    public int getPosition() {
        return mTriePos;
    }

    /**
     * Writes the PtNode count of an array at the current position, on one or two bytes.
     *
     * @throws RuntimeException if the computed count size is neither 1 nor 2
     */
    @Override
    public void writePtNodeCount(int ptNodeCount) {
        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
        // ptNodeCount must fit on one byte or two bytes.
        // Please see comments in FormatSpec
        if (countSize != 1 && countSize != 2) {
            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
        }
        final int encodedPtNodeCount = (countSize == 2) ?
                (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, encodedPtNodeCount,
                countSize);
    }

    /** Writes the flags of {@code ptNode} at the current position. */
    private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions),
                FormatSpec.PTNODE_FLAGS_SIZE);
    }

    /**
     * Writes the parent position, made relative to the address of {@code ptNode} unless it is
     * {@code FormatSpec.NO_PARENT_ADDRESS}.
     */
    private void writeParentPosition(int parentPos, final PtNode ptNode,
            final FormatOptions formatOptions) {
        if (parentPos != FormatSpec.NO_PARENT_ADDRESS) {
            parentPos -= ptNode.mCachedAddressAfterUpdate;
        }
        mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos,
                formatOptions);
    }

    /** Writes the characters of a PtNode, followed by a terminator when there are several. */
    private void writeCharacters(final int[] characters, final boolean hasSeveralChars) {
        mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos);
        if (hasSeveralChars) {
            mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
        }
    }

    /** Writes the terminal id of a terminal PtNode at the current position. */
    private void writeTerminalId(final int terminalId) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId,
                FormatSpec.PTNODE_TERMINAL_ID_SIZE);
    }

    /** Writes the children position of {@code ptNode}, signed when dynamic update is supported. */
    private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        // NOTE(review): unlike writeUIntToBuffer, these helpers are used with "+=", so they
        // presumably return the number of bytes written rather than the new position - confirm.
        if (formatOptions.mSupportsDynamicUpdate) {
            mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        } else {
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        }
    }

    /** Writes the bigram content files for every PtNode that has at least one bigram. */
    private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
            throws IOException {
        mBigramWriter.openStreams();
        for (final PtNodeArray nodeArray : flatNodes) {
            for (final PtNode ptNode : nodeArray.mData) {
                // Skip empty lists, as writeShortcuts does: an entry without content would
                // point at whatever is written next in the content file.
                if (ptNode.mBigrams != null && !ptNode.mBigrams.isEmpty()) {
                    mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(),
                            ptNode.mBigrams.iterator(), dict);
                }
            }
        }
        mBigramWriter.closeStreams();
    }

    /** Writes the shortcut content files for every PtNode that has at least one shortcut. */
    private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException {
        mShortcutWriter.openStreams();
        for (final PtNodeArray nodeArray : flatNodes) {
            for (final PtNode ptNode : nodeArray.mData) {
                if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) {
                    mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId,
                            ptNode.mShortcutTargets.iterator());
                }
            }
        }
        mShortcutWriter.closeStreams();
    }

    /** Writes a forward link address at the current position. */
    @Override
    public void writeForwardLinkAddress(int forwardLinkAddress) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    }

    /**
     * Writes one PtNode at the current position: flags, parent position, characters, terminal
     * id (terminal nodes only) and children position, in that order.
     */
    @Override
    public void writePtNode(final PtNode ptNode, final int parentPosition,
            final FormatOptions formatOptions, final FusionDictionary dict) {
        writePtNodeFlags(ptNode, formatOptions);
        writeParentPosition(parentPosition, ptNode, formatOptions);
        writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
        if (ptNode.isTerminal()) {
            writeTerminalId(ptNode.mTerminalId);
        }
        writeChildrenPosition(ptNode, formatOptions);
    }

    /**
     * Writes the frequency file and the terminal address table. Both are indexed by terminal
     * id; the address stored for a terminal is its PtNode address plus the header size.
     */
    private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,
            final int terminalCount) throws IOException {
        final byte[] freqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE];
        final byte[] terminalAddressTableBuf =
                new byte[terminalCount * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE];
        for (final PtNodeArray nodeArray : flatNodes) {
            for (final PtNode ptNode : nodeArray.mData) {
                if (ptNode.isTerminal()) {
                    BinaryDictEncoderUtils.writeUIntToBuffer(freqBuf,
                            ptNode.mTerminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE,
                            ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
                    BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf,
                            ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
                            ptNode.mCachedAddressAfterUpdate + mHeaderSize,
                            FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
                }
            }
        }
        mFreqOutStream.write(freqBuf);
        mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
    }

    /** Writes a zero-filled unigram timestamp record for each of the terminals. */
    private void initUnigramTimestamps(final int terminalCount) throws IOException {
        // Initial value of time stamps for each word is 0.
        final byte[] unigramTimestampBuf =
                new byte[terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE];
        mUnigramTimestampOutStream.write(unigramTimestampBuf);
    }
}