Home | History | Annotate | Download | only in makedict
      1 /*
      2 /*
      3  * Copyright (C) 2013 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 
     18 package com.android.inputmethod.latin.makedict;
     19 
     20 import com.android.inputmethod.annotations.UsedForTesting;
     21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
     22 import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
     23 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
     25 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
     26 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
     27 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     28 
     29 import java.io.File;
     30 import java.io.FileNotFoundException;
     31 import java.io.FileOutputStream;
     32 import java.io.IOException;
     33 import java.io.OutputStream;
     34 import java.util.ArrayList;
     35 import java.util.Iterator;
     36 
     37 /**
     38  * An implementation of DictEncoder for version 4 binary dictionary.
     39  */
     40 @UsedForTesting
     41 public class Ver4DictEncoder implements DictEncoder {
     42     private final File mDictPlacedDir;
     43     private byte[] mTrieBuf;
     44     private int mTriePos;
     45     private int mHeaderSize;
     46     private OutputStream mTrieOutStream;
     47     private OutputStream mFreqOutStream;
     48     private OutputStream mUnigramTimestampOutStream;
     49     private OutputStream mTerminalAddressTableOutStream;
     50     private File mDictDir;
     51     private String mBaseFilename;
     52     private BigramContentWriter mBigramWriter;
     53     private ShortcutContentWriter mShortcutWriter;
     54 
     55     @UsedForTesting
     56     public Ver4DictEncoder(final File dictPlacedDir) {
     57         mDictPlacedDir = dictPlacedDir;
     58     }
     59 
     60     private interface SparseTableContentWriterInterface {
     61         public void write(final OutputStream outStream) throws IOException;
     62     }
     63 
     64     private static class SparseTableContentWriter {
     65         private final int mContentCount;
     66         private final SparseTable mSparseTable;
     67         private final File mLookupTableFile;
     68         protected final File mBaseDir;
     69         private final File[] mAddressTableFiles;
     70         private final File[] mContentFiles;
     71         protected final OutputStream[] mContentOutStreams;
     72 
     73         public SparseTableContentWriter(final String name, final int initialCapacity,
     74                 final int blockSize, final File baseDir, final String[] contentFilenames,
     75                 final String[] contentIds) {
     76             if (contentFilenames.length != contentIds.length) {
     77                 throw new RuntimeException("The length of contentFilenames and the length of"
     78                         + " contentIds are different " + contentFilenames.length + ", "
     79                         + contentIds.length);
     80             }
     81             mContentCount = contentFilenames.length;
     82             mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
     83             mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
     84             mAddressTableFiles = new File[mContentCount];
     85             mContentFiles = new File[mContentCount];
     86             mBaseDir = baseDir;
     87             for (int i = 0; i < mContentCount; ++i) {
     88                 mAddressTableFiles[i] = new File(mBaseDir,
     89                         name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
     90                 mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
     91             }
     92             mContentOutStreams = new OutputStream[mContentCount];
     93         }
     94 
     95         public void openStreams() throws FileNotFoundException {
     96             for (int i = 0; i < mContentCount; ++i) {
     97                 mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
     98             }
     99         }
    100 
    101         protected void write(final int contentIndex, final int index,
    102                 final SparseTableContentWriterInterface writer) throws IOException {
    103             mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
    104             writer.write(mContentOutStreams[contentIndex]);
    105             mContentOutStreams[contentIndex].flush();
    106         }
    107 
    108         public void closeStreams() throws IOException {
    109             mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
    110             for (int i = 0; i < mContentCount; ++i) {
    111                 mContentOutStreams[i].close();
    112             }
    113         }
    114     }
    115 
    116     private static class BigramContentWriter extends SparseTableContentWriter {
    117         private final boolean mWriteTimestamp;
    118 
    119         public BigramContentWriter(final String name, final int initialCapacity,
    120                 final File baseDir, final boolean writeTimestamp) {
    121             super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity,
    122                     FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
    123                     getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp));
    124             mWriteTimestamp = writeTimestamp;
    125         }
    126 
    127         private static String[] getContentFilenames(final String name,
    128                 final boolean writeTimestamp) {
    129             final String[] contentFilenames;
    130             if (writeTimestamp) {
    131                 contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
    132                         name + FormatSpec.BIGRAM_FILE_EXTENSION };
    133             } else {
    134                 contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
    135             }
    136             return contentFilenames;
    137         }
    138 
    139         private static String[] getContentIds(final boolean writeTimestamp) {
    140             final String[] contentIds;
    141             if (writeTimestamp) {
    142                 contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID,
    143                         FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
    144             } else {
    145                 contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
    146             }
    147             return contentIds;
    148         }
    149 
    150         public void writeBigramsForOneWord(final int terminalId, final int bigramCount,
    151                 final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
    152                         throws IOException {
    153             write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
    154                     new SparseTableContentWriterInterface() {
    155                         @Override
    156                         public void write(final OutputStream outStream) throws IOException {
    157                             writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
    158                         }});
    159             if (mWriteTimestamp) {
    160                 write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId,
    161                         new SparseTableContentWriterInterface() {
    162                             @Override
    163                             public void write(final OutputStream outStream) throws IOException {
    164                                 initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream,
    165                                         bigramCount);
    166                             }});
    167             }
    168         }
    169 
    170         private void writeBigramsForOneWordInternal(final OutputStream outStream,
    171                 final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
    172                         throws IOException {
    173             while (bigramIterator.hasNext()) {
    174                 final WeightedString bigram = bigramIterator.next();
    175                 final PtNode target =
    176                         FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
    177                 final int unigramFrequencyForThisWord = target.mFrequency;
    178                 final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
    179                         bigramIterator.hasNext(), 0, bigram.mFrequency,
    180                         unigramFrequencyForThisWord, bigram.mWord);
    181                 BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags,
    182                         FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
    183                 BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId,
    184                         FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
    185             }
    186         }
    187 
    188         private void initBigramTimestampsCountersAndLevelsForOneWordInternal(
    189                 final OutputStream outStream, final int bigramCount) throws IOException {
    190             for (int i = 0; i < bigramCount; ++i) {
    191                 // TODO: Figure out what initial values should be.
    192                 BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
    193                         FormatSpec.BIGRAM_TIMESTAMP_SIZE);
    194                 BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
    195                         FormatSpec.BIGRAM_COUNTER_SIZE);
    196                 BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
    197                         FormatSpec.BIGRAM_LEVEL_SIZE);
    198             }
    199         }
    200     }
    201 
    202     private static class ShortcutContentWriter extends SparseTableContentWriter {
    203         public ShortcutContentWriter(final String name, final int initialCapacity,
    204                 final File baseDir) {
    205             super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity,
    206                     FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
    207                     new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
    208                     new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
    209         }
    210 
    211         public void writeShortcutForOneWord(final int terminalId,
    212                 final Iterator<WeightedString> shortcutIterator) throws IOException {
    213             write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
    214                     new SparseTableContentWriterInterface() {
    215                         @Override
    216                         public void write(final OutputStream outStream) throws IOException {
    217                             writeShortcutForOneWordInternal(outStream, shortcutIterator);
    218                         }
    219                     });
    220         }
    221 
    222         private void writeShortcutForOneWordInternal(final OutputStream outStream,
    223                 final Iterator<WeightedString> shortcutIterator) throws IOException {
    224             while (shortcutIterator.hasNext()) {
    225                 final WeightedString target = shortcutIterator.next();
    226                 final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
    227                         shortcutIterator.hasNext(), target.mFrequency);
    228                 BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags,
    229                         FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
    230                 CharEncoding.writeString(outStream, target.mWord);
    231             }
    232         }
    233     }
    234 
    235     private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
    236             throws FileNotFoundException, IOException {
    237         final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
    238         mBaseFilename = header.getId() + "." + header.getVersion();
    239         mDictDir = new File(mDictPlacedDir, mBaseFilename);
    240         final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
    241         final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
    242         final File timestampFile = new File(mDictDir,
    243                 mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
    244         final File terminalAddressTableFile = new File(mDictDir,
    245                 mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
    246         if (!mDictDir.isDirectory()) {
    247             if (mDictDir.exists()) mDictDir.delete();
    248             mDictDir.mkdirs();
    249         }
    250         mTrieOutStream = new FileOutputStream(trieFile);
    251         mFreqOutStream = new FileOutputStream(freqFile);
    252         mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
    253         if (formatOptions.mHasTimestamp) {
    254             mUnigramTimestampOutStream = new FileOutputStream(timestampFile);
    255         }
    256     }
    257 
    258     private void close() throws IOException {
    259         try {
    260             if (mTrieOutStream != null) {
    261                 mTrieOutStream.close();
    262             }
    263             if (mFreqOutStream != null) {
    264                 mFreqOutStream.close();
    265             }
    266             if (mTerminalAddressTableOutStream != null) {
    267                 mTerminalAddressTableOutStream.close();
    268             }
    269             if (mUnigramTimestampOutStream != null) {
    270                 mUnigramTimestampOutStream.close();
    271             }
    272         } finally {
    273             mTrieOutStream = null;
    274             mFreqOutStream = null;
    275             mTerminalAddressTableOutStream = null;
    276         }
    277     }
    278 
    279     @Override
    280     public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
    281             throws IOException, UnsupportedFormatException {
    282         if (formatOptions.mVersion != FormatSpec.VERSION4) {
    283             throw new UnsupportedFormatException("File header has a wrong version number : "
    284                     + formatOptions.mVersion);
    285         }
    286         if (!mDictPlacedDir.isDirectory()) {
    287             throw new UnsupportedFormatException("Given path is not a directory.");
    288         }
    289 
    290         if (mTrieOutStream == null) {
    291             openStreams(formatOptions, dict.mOptions);
    292         }
    293 
    294         mHeaderSize = BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict,
    295                 formatOptions);
    296 
    297         MakedictLog.i("Flattening the tree...");
    298         ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
    299         int terminalCount = 0;
    300         for (final PtNodeArray array : flatNodes) {
    301             for (final PtNode node : array.mData) {
    302                 if (node.isTerminal()) node.mTerminalId = terminalCount++;
    303             }
    304         }
    305 
    306         MakedictLog.i("Computing addresses...");
    307         BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions);
    308         if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
    309 
    310         writeTerminalData(flatNodes, terminalCount);
    311         if (formatOptions.mHasTimestamp) {
    312             initUnigramTimestamps(terminalCount);
    313         }
    314         mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
    315                 formatOptions.mHasTimestamp);
    316         writeBigrams(flatNodes, dict);
    317         mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
    318         writeShortcuts(flatNodes);
    319 
    320         final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
    321         final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
    322         mTrieBuf = new byte[bufferSize];
    323 
    324         MakedictLog.i("Writing file...");
    325         for (PtNodeArray nodeArray : flatNodes) {
    326             BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions);
    327         }
    328         if (MakedictLog.DBG) {
    329             BinaryDictEncoderUtils.showStatistics(flatNodes);
    330             MakedictLog.i("has " + terminalCount + " terminals.");
    331         }
    332         mTrieOutStream.write(mTrieBuf);
    333 
    334         MakedictLog.i("Done");
    335         close();
    336     }
    337 
    338     @Override
    339     public void setPosition(int position) {
    340         if (mTrieBuf == null || position < 0 || position >- mTrieBuf.length) return;
    341         mTriePos = position;
    342     }
    343 
    344     @Override
    345     public int getPosition() {
    346         return mTriePos;
    347     }
    348 
    349     @Override
    350     public void writePtNodeCount(int ptNodeCount) {
    351         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
    352         // ptNodeCount must fit on one byte or two bytes.
    353         // Please see comments in FormatSpec
    354         if (countSize != 1 && countSize != 2) {
    355             throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
    356         }
    357         final int encodedPtNodeCount = (countSize == 2) ?
    358                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
    359         mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, encodedPtNodeCount,
    360                 countSize);
    361     }
    362 
    363     private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) {
    364         final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
    365         mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
    366                 BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions),
    367                 FormatSpec.PTNODE_FLAGS_SIZE);
    368     }
    369 
    370     private void writeParentPosition(int parentPos, final PtNode ptNode,
    371             final FormatOptions formatOptions) {
    372         if (parentPos != FormatSpec.NO_PARENT_ADDRESS) {
    373             parentPos -= ptNode.mCachedAddressAfterUpdate;
    374         }
    375         mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos,
    376                 formatOptions);
    377     }
    378 
    379     private void writeCharacters(final int[] characters, final boolean hasSeveralChars) {
    380         mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos);
    381         if (hasSeveralChars) {
    382             mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
    383         }
    384     }
    385 
    386     private void writeTerminalId(final int terminalId) {
    387         mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId,
    388                 FormatSpec.PTNODE_TERMINAL_ID_SIZE);
    389     }
    390 
    391     private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
    392         final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
    393         if (formatOptions.mSupportsDynamicUpdate) {
    394             mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
    395                     mTriePos, childrenPos);
    396         } else {
    397             mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
    398                     mTriePos, childrenPos);
    399         }
    400     }
    401 
    402     private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
    403             throws IOException {
    404         mBigramWriter.openStreams();
    405         for (final PtNodeArray nodeArray : flatNodes) {
    406             for (final PtNode ptNode : nodeArray.mData) {
    407                 if (ptNode.mBigrams != null) {
    408                     mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(),
    409                             ptNode.mBigrams.iterator(), dict);
    410                 }
    411             }
    412         }
    413         mBigramWriter.closeStreams();
    414     }
    415 
    416     private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException {
    417         mShortcutWriter.openStreams();
    418         for (final PtNodeArray nodeArray : flatNodes) {
    419             for (final PtNode ptNode : nodeArray.mData) {
    420                 if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) {
    421                     mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId,
    422                             ptNode.mShortcutTargets.iterator());
    423                 }
    424             }
    425         }
    426         mShortcutWriter.closeStreams();
    427     }
    428 
    429     @Override
    430     public void writeForwardLinkAddress(int forwardLinkAddress) {
    431         mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
    432                 forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    433     }
    434 
    435     @Override
    436     public void writePtNode(final PtNode ptNode, final int parentPosition,
    437             final FormatOptions formatOptions, final FusionDictionary dict) {
    438         writePtNodeFlags(ptNode, formatOptions);
    439         writeParentPosition(parentPosition, ptNode, formatOptions);
    440         writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
    441         if (ptNode.isTerminal()) {
    442             writeTerminalId(ptNode.mTerminalId);
    443         }
    444         writeChildrenPosition(ptNode, formatOptions);
    445     }
    446 
    447     private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,
    448           final int terminalCount) throws IOException {
    449         final byte[] freqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE];
    450         final byte[] terminalAddressTableBuf =
    451                 new byte[terminalCount * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE];
    452         for (final PtNodeArray nodeArray : flatNodes) {
    453             for (final PtNode ptNode : nodeArray.mData) {
    454                 if (ptNode.isTerminal()) {
    455                     BinaryDictEncoderUtils.writeUIntToBuffer(freqBuf,
    456                             ptNode.mTerminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE,
    457                             ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
    458                     BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf,
    459                             ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
    460                             ptNode.mCachedAddressAfterUpdate + mHeaderSize,
    461                             FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
    462                 }
    463             }
    464         }
    465         mFreqOutStream.write(freqBuf);
    466         mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
    467     }
    468 
    469     private void initUnigramTimestamps(final int terminalCount) throws IOException {
    470         // Initial value of time stamps for each word is 0.
    471         final byte[] unigramTimestampBuf =
    472                 new byte[terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE];
    473         mUnigramTimestampOutStream.write(unigramTimestampBuf);
    474     }
    475 }
    476