Home | History | Annotate | Download | only in dicttool
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.dicttool;
     18 
     19 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
     20 import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
     21 import com.android.inputmethod.latin.makedict.DictDecoder;
     22 import com.android.inputmethod.latin.makedict.DictionaryHeader;
     23 import com.android.inputmethod.latin.makedict.FormatSpec;
     24 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
     25 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
     26 import com.android.inputmethod.latin.makedict.FusionDictionary;
     27 import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
     28 
     29 import java.io.BufferedInputStream;
     30 import java.io.BufferedOutputStream;
     31 import java.io.BufferedReader;
     32 import java.io.File;
     33 import java.io.FileInputStream;
     34 import java.io.FileNotFoundException;
     35 import java.io.FileOutputStream;
     36 import java.io.IOException;
     37 import java.io.InputStream;
     38 import java.io.InputStreamReader;
     39 import java.io.OutputStream;
     40 import java.util.HashMap;
     41 
     42 import javax.annotation.Nonnull;
     43 import javax.annotation.Nullable;
     44 
     45 /**
     46  * Class grouping utilities for offline dictionary making.
     47  *
     48  * Those should not be used on-device, essentially because they are quite
     49  * liberal about I/O and performance.
     50  */
     51 public final class BinaryDictOffdeviceUtils {
     52     // Prefix and suffix are arbitrary, the values do not really matter
     53     private final static String PREFIX = "dicttool";
     54     private final static String SUFFIX = ".tmp";
     55     private final static int COPY_BUFFER_SIZE = 8192;
     56 
     57     public static class DecoderChainSpec<T> {
     58         public final static int COMPRESSION = 1;
     59         public final static int ENCRYPTION = 2;
     60 
     61         private final static int[][] VALID_DECODER_CHAINS = {
     62             { }, { COMPRESSION }, { ENCRYPTION, COMPRESSION }
     63         };
     64 
     65         private final int mDecoderSpecIndex;
     66         public T mResult;
     67 
     68         public DecoderChainSpec() {
     69             mDecoderSpecIndex = 0;
     70             mResult = null;
     71         }
     72 
     73         private DecoderChainSpec(final DecoderChainSpec<T> src) {
     74             mDecoderSpecIndex = src.mDecoderSpecIndex + 1;
     75             mResult = src.mResult;
     76         }
     77 
     78         private String getStepDescription(final int step) {
     79             switch (step) {
     80             case COMPRESSION:
     81                 return "compression";
     82             case ENCRYPTION:
     83                 return "encryption";
     84             default:
     85                 return "unknown";
     86             }
     87         }
     88 
     89         public String describeChain() {
     90             final StringBuilder s = new StringBuilder("raw");
     91             for (final int step : VALID_DECODER_CHAINS[mDecoderSpecIndex]) {
     92                 s.append(" > ");
     93                 s.append(getStepDescription(step));
     94             }
     95             return s.toString();
     96         }
     97 
     98         /**
     99          * Returns the next sequential spec. If exhausted, return null.
    100          */
    101         public DecoderChainSpec next() {
    102             if (mDecoderSpecIndex + 1 >= VALID_DECODER_CHAINS.length) {
    103                 return null;
    104             }
    105             return new DecoderChainSpec(this);
    106         }
    107 
    108         public InputStream getStream(final File src) throws FileNotFoundException, IOException {
    109             InputStream input = new BufferedInputStream(new FileInputStream(src));
    110             for (final int step : VALID_DECODER_CHAINS[mDecoderSpecIndex]) {
    111                 switch (step) {
    112                 case COMPRESSION:
    113                     input = Compress.getUncompressedStream(input);
    114                     break;
    115                 case ENCRYPTION:
    116                     input = Crypt.getDecryptedStream(input);
    117                     break;
    118                 }
    119             }
    120             return input;
    121         }
    122     }
    123 
    124     public interface InputProcessor<T> {
    125         @Nonnull
    126         public T process(@Nonnull final InputStream input)
    127                 throws IOException, UnsupportedFormatException;
    128     }
    129 
    130     public static class CopyProcessor implements InputProcessor<File> {
    131         @Override @Nonnull
    132         public File process(@Nonnull final InputStream input) throws IOException,
    133                 UnsupportedFormatException {
    134             final File dst = File.createTempFile(PREFIX, SUFFIX);
    135             dst.deleteOnExit();
    136             try (final OutputStream output = new BufferedOutputStream(new FileOutputStream(dst))) {
    137                 copy(input, output);
    138                 output.flush();
    139                 output.close();
    140                 if (BinaryDictDecoderUtils.isBinaryDictionary(dst)
    141                         || CombinedInputOutput.isCombinedDictionary(dst.getAbsolutePath())) {
    142                     return dst;
    143                 }
    144             }
    145             throw new UnsupportedFormatException("Input stream not at the expected format");
    146         }
    147     }
    148 
    149     public static class HeaderReaderProcessor implements InputProcessor<DictionaryHeader> {
    150         // Arbitrarily limit the header length to 32k. Sounds like it would never be larger
    151         // than this. Revisit this if needed later.
    152         private final int MAX_HEADER_LENGTH = 32 * 1024;
    153         @Override @Nonnull
    154         public DictionaryHeader process(final InputStream input) throws IOException,
    155                 UnsupportedFormatException {
    156             // Do everything as curtly and ad-hoc as possible for performance.
    157             final byte[] tmpBuffer = new byte[12];
    158             if (tmpBuffer.length != input.read(tmpBuffer)) {
    159                 throw new UnsupportedFormatException("File too short, not a dictionary");
    160             }
    161             // Ad-hoc check for the magic number. See FormatSpec.java as well as
    162             // byte_array_utils.h and BinaryDictEncoderUtils#writeDictionaryHeader().
    163             final int MAGIC_NUMBER_START_OFFSET = 0;
    164             final int VERSION_START_OFFSET = 4;
    165             final int HEADER_SIZE_OFFSET = 8;
    166             final int magicNumber = ((tmpBuffer[MAGIC_NUMBER_START_OFFSET] & 0xFF) << 24)
    167                     + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 1] & 0xFF) << 16)
    168                     + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 2] & 0xFF) << 8)
    169                     + (tmpBuffer[MAGIC_NUMBER_START_OFFSET + 3] & 0xFF);
    170             if (magicNumber != FormatSpec.MAGIC_NUMBER) {
    171                 throw new UnsupportedFormatException("Wrong magic number");
    172             }
    173             final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8)
    174                     + (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF);
    175             if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201
    176                     && version != FormatSpec.VERSION202) {
    177                 throw new UnsupportedFormatException("Only versions 2, 201, 202 are supported");
    178             }
    179             final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) << 24)
    180                     + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) << 16)
    181                     + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) << 8)
    182                     + (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF);
    183             if (totalHeaderSize > MAX_HEADER_LENGTH) {
    184                 throw new UnsupportedFormatException("Header too large");
    185             }
    186             final byte[] headerBuffer = new byte[totalHeaderSize - tmpBuffer.length];
    187             readStreamExhaustively(input, headerBuffer);
    188             final HashMap<String, String> attributes =
    189                     BinaryDictDecoderUtils.decodeHeaderAttributes(headerBuffer);
    190             return new DictionaryHeader(totalHeaderSize, new DictionaryOptions(attributes),
    191                     new FormatOptions(version, false /* hasTimestamp */));
    192         }
    193     }
    194 
    195     private static void readStreamExhaustively(final InputStream inputStream,
    196             final byte[] outBuffer) throws IOException, UnsupportedFormatException {
    197         int readBytes = 0;
    198         int readBytesLastCycle = -1;
    199         while (readBytes != outBuffer.length) {
    200             readBytesLastCycle = inputStream.read(outBuffer, readBytes,
    201                     outBuffer.length - readBytes);
    202             if (readBytesLastCycle == -1)
    203                 throw new UnsupportedFormatException("File shorter than specified in the header"
    204                         + " (expected " + outBuffer.length + ", read " + readBytes + ")");
    205             readBytes += readBytesLastCycle;
    206         }
    207     }
    208 
    209     public static void copy(final InputStream input, final OutputStream output) throws IOException {
    210         final byte[] buffer = new byte[COPY_BUFFER_SIZE];
    211         for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) {
    212             output.write(buffer, 0, readBytes);
    213         }
    214     }
    215 
    216     /**
    217      * Process a dictionary, decrypting/uncompressing it on the fly as necessary.
    218      *
    219      * This will execute the given processor repeatedly with the possible alternatives
    220      * for dictionary format until the processor does not throw an exception.
    221      * If the processor succeeds for none of the possible formats, the method returns null.
    222      */
    223     @Nullable
    224     public static <T> DecoderChainSpec<T> decodeDictionaryForProcess(@Nonnull final File src,
    225             @Nonnull final InputProcessor<T> processor) {
    226         @Nonnull DecoderChainSpec spec = new DecoderChainSpec();
    227         while (null != spec) {
    228             try {
    229                 final InputStream input = spec.getStream(src);
    230                 spec.mResult = processor.process(input);
    231                 try {
    232                     input.close();
    233                 } catch (IOException e) {
    234                     // CipherInputStream doesn't like being closed without having read the
    235                     // entire stream, for some reason. But we don't want to because it's a waste
    236                     // of resources. We really, really don't care about this.
    237                     // However on close() CipherInputStream does throw this exception, wrapped
    238                     // in an IOException so we need to catch it.
    239                     if (!(e.getCause() instanceof javax.crypto.BadPaddingException)) {
    240                         throw e;
    241                     }
    242                 }
    243                 return spec;
    244             } catch (IOException | UnsupportedFormatException | ArrayIndexOutOfBoundsException e) {
    245                 // If the format is not the right one for this file, the processor will throw one
    246                 // of these exceptions. In our case, that means we should try the next spec,
    247                 // since it may still be at another format we haven't tried yet.
    248                 // TODO: stop using exceptions for this non-exceptional case.
    249             }
    250             spec = spec.next();
    251         }
    252         return null;
    253     }
    254 
    255     /**
    256      * Get a decoder chain spec with a raw dictionary file. This makes a new file on the
    257      * disk ready for any treatment the client wants.
    258      */
    259     @Nullable
    260     public static DecoderChainSpec<File> getRawDictionaryOrNull(@Nonnull final File src) {
    261         return decodeDictionaryForProcess(src, new CopyProcessor());
    262     }
    263 
    264     static FusionDictionary getDictionary(final String filename, final boolean report) {
    265         final File file = new File(filename);
    266         if (report) {
    267             System.out.println("Dictionary : " + file.getAbsolutePath());
    268             System.out.println("Size : " + file.length() + " bytes");
    269         }
    270         try {
    271             final DecoderChainSpec<File> decodedSpec = getRawDictionaryOrNull(file);
    272             if (null == decodedSpec) {
    273                 throw new RuntimeException("Does not seem to be a dictionary file " + filename);
    274             }
    275             if (CombinedInputOutput.isCombinedDictionary(decodedSpec.mResult.getAbsolutePath())) {
    276                 if (report) {
    277                     System.out.println("Format : Combined format");
    278                     System.out.println("Packaging : " + decodedSpec.describeChain());
    279                     System.out.println("Uncompressed size : " + decodedSpec.mResult.length());
    280                 }
    281                 try (final BufferedReader reader = new BufferedReader(
    282                         new InputStreamReader(new FileInputStream(decodedSpec.mResult), "UTF-8"))) {
    283                     return CombinedInputOutput.readDictionaryCombined(reader);
    284                 }
    285             }
    286             final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(
    287                     decodedSpec.mResult, 0, decodedSpec.mResult.length(),
    288                     DictDecoder.USE_BYTEARRAY);
    289             if (report) {
    290                 System.out.println("Format : Binary dictionary format");
    291                 System.out.println("Packaging : " + decodedSpec.describeChain());
    292                 System.out.println("Uncompressed size : " + decodedSpec.mResult.length());
    293             }
    294             return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
    295         } catch (final IOException | UnsupportedFormatException e) {
    296             throw new RuntimeException("Can't read file " + filename, e);
    297         }
    298     }
    299 }
    300