Home | History | Annotate | Download | only in dicttool
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.dicttool;
     18 
     19 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
     20 import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
     21 import com.android.inputmethod.latin.makedict.DictDecoder;
     22 import com.android.inputmethod.latin.makedict.DictEncoder;
     23 import com.android.inputmethod.latin.makedict.FormatSpec;
     24 import com.android.inputmethod.latin.makedict.FusionDictionary;
     25 import com.android.inputmethod.latin.makedict.MakedictLog;
     26 import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
     27 import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
     28 import com.android.inputmethod.latin.makedict.Ver4DictEncoder;
     29 
     30 import org.xml.sax.SAXException;
     31 
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.LinkedList;
     44 
     45 import javax.xml.parsers.ParserConfigurationException;
     46 
     47 /**
     48  * Main class/method for DictionaryMaker.
     49  */
     50 public class DictionaryMaker {
     51 
     52     static class Arguments {
     53         private static final String OPTION_VERSION_2 = "-2";
     54         private static final String OPTION_VERSION_4 = "-4";
     55         private static final String OPTION_INPUT_SOURCE = "-s";
     56         private static final String OPTION_INPUT_BIGRAM_XML = "-b";
     57         private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
     58         private static final String OPTION_OUTPUT_BINARY = "-d";
     59         private static final String OPTION_OUTPUT_XML = "-x";
     60         private static final String OPTION_OUTPUT_COMBINED = "-o";
     61         private static final String OPTION_HELP = "-h";
     62         public final String mInputBinary;
     63         public final String mInputCombined;
     64         public final String mInputUnigramXml;
     65         public final String mInputShortcutXml;
     66         public final String mInputBigramXml;
     67         public final String mOutputBinary;
     68         public final String mOutputXml;
     69         public final String mOutputCombined;
     70         public final int mOutputBinaryFormatVersion;
     71 
     72         private void checkIntegrity() throws IOException {
     73             checkHasExactlyOneInput();
     74             checkHasAtLeastOneOutput();
     75             checkNotSameFile(mInputBinary, mOutputBinary);
     76             checkNotSameFile(mInputBinary, mOutputXml);
     77             checkNotSameFile(mInputCombined, mOutputBinary);
     78             checkNotSameFile(mInputCombined, mOutputXml);
     79             checkNotSameFile(mInputUnigramXml, mOutputBinary);
     80             checkNotSameFile(mInputUnigramXml, mOutputXml);
     81             checkNotSameFile(mInputUnigramXml, mOutputCombined);
     82             checkNotSameFile(mInputShortcutXml, mOutputBinary);
     83             checkNotSameFile(mInputShortcutXml, mOutputXml);
     84             checkNotSameFile(mInputShortcutXml, mOutputCombined);
     85             checkNotSameFile(mInputBigramXml, mOutputBinary);
     86             checkNotSameFile(mInputBigramXml, mOutputXml);
     87             checkNotSameFile(mInputBigramXml, mOutputCombined);
     88             checkNotSameFile(mOutputBinary, mOutputXml);
     89             checkNotSameFile(mOutputBinary, mOutputCombined);
     90             checkNotSameFile(mOutputXml, mOutputCombined);
     91         }
     92 
     93         private void checkHasExactlyOneInput() {
     94             if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
     95                 throw new RuntimeException("No input file specified");
     96             } else if ((null != mInputUnigramXml && null != mInputBinary)
     97                     || (null != mInputUnigramXml && null != mInputCombined)
     98                     || (null != mInputBinary && null != mInputCombined)) {
     99                 throw new RuntimeException("Several input files specified");
    100             } else if ((null != mInputBinary || null != mInputCombined)
    101                     && (null != mInputBigramXml || null != mInputShortcutXml)) {
    102                 throw new RuntimeException("Separate bigrams/shortcut files are only supported"
    103                         + " with XML input (other formats include bigrams and shortcuts already)");
    104             }
    105         }
    106 
    107         private void checkHasAtLeastOneOutput() {
    108             if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
    109                 throw new RuntimeException("No output specified");
    110             }
    111         }
    112 
    113         /**
    114          * Utility method that throws an exception if path1 and path2 point to the same file.
    115          */
    116         private static void checkNotSameFile(final String path1, final String path2)
    117                 throws IOException {
    118             if (null == path1 || null == path2) return;
    119             if (new File(path1).getCanonicalPath().equals(new File(path2).getCanonicalPath())) {
    120                 throw new RuntimeException(path1 + " and " + path2 + " are the same file: "
    121                         + " refusing to process.");
    122             }
    123         }
    124 
    125         private void displayHelp() {
    126             MakedictLog.i(getHelp());
    127         }
    128 
    129         public static String getHelp() {
    130             return "Usage: makedict "
    131                     + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
    132                     + "| [-s <combined format input]"
    133                     + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
    134                     + " [-o <combined output>]"
    135                     + "[-2] [-3] [-4]\n"
    136                     + "\n"
    137                     + "  Converts a source dictionary file to one or several outputs.\n"
    138                     + "  Source can be an XML file, with an optional XML bigrams file, or a\n"
    139                     + "  binary dictionary file.\n"
    140                     + "  Binary version 2 (Jelly Bean), 3, 4, XML and\n"
    141                     + "  combined format outputs are supported.";
    142         }
    143 
    144         public Arguments(String[] argsArray) throws IOException {
    145             final LinkedList<String> args = new LinkedList<>(Arrays.asList(argsArray));
    146             if (args.isEmpty()) {
    147                 displayHelp();
    148             }
    149             String inputBinary = null;
    150             String inputCombined = null;
    151             String inputUnigramXml = null;
    152             String inputShortcutXml = null;
    153             String inputBigramXml = null;
    154             String outputBinary = null;
    155             String outputXml = null;
    156             String outputCombined = null;
    157             int outputBinaryFormatVersion = 2; // the default version is 2.
    158 
    159             while (!args.isEmpty()) {
    160                 final String arg = args.get(0);
    161                 args.remove(0);
    162                 if (arg.charAt(0) == '-') {
    163                     if (OPTION_VERSION_2.equals(arg)) {
    164                         // Do nothing, this is the default
    165                     } else if (OPTION_VERSION_4.equals(arg)) {
    166                         outputBinaryFormatVersion = FormatSpec.VERSION4;
    167                     } else if (OPTION_HELP.equals(arg)) {
    168                         displayHelp();
    169                     } else {
    170                         // All these options need an argument
    171                         if (args.isEmpty()) {
    172                             throw new IllegalArgumentException("Option " + arg + " is unknown or "
    173                                     + "requires an argument");
    174                         }
    175                         String filename = args.get(0);
    176                         args.remove(0);
    177                         if (OPTION_INPUT_SOURCE.equals(arg)) {
    178                             if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) {
    179                                 inputUnigramXml = filename;
    180                             } else if (CombinedInputOutput.isCombinedDictionary(filename)) {
    181                                 inputCombined = filename;
    182                             } else if (BinaryDictDecoderUtils.isBinaryDictionary(filename)) {
    183                                 inputBinary = filename;
    184                             } else {
    185                                 throw new IllegalArgumentException(
    186                                         "Unknown format for file " + filename);
    187                             }
    188                         } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
    189                             inputShortcutXml = filename;
    190                         } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
    191                             inputBigramXml = filename;
    192                         } else if (OPTION_OUTPUT_BINARY.equals(arg)) {
    193                             outputBinary = filename;
    194                         } else if (OPTION_OUTPUT_XML.equals(arg)) {
    195                             outputXml = filename;
    196                         } else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
    197                             outputCombined = filename;
    198                         } else {
    199                             throw new IllegalArgumentException("Unknown option : " + arg);
    200                         }
    201                     }
    202                 } else {
    203                     if (null == inputBinary && null == inputUnigramXml) {
    204                         if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
    205                             inputBinary = arg;
    206                         } else if (CombinedInputOutput.isCombinedDictionary(arg)) {
    207                             inputCombined = arg;
    208                         } else {
    209                             inputUnigramXml = arg;
    210                         }
    211                     } else if (null == outputBinary) {
    212                         outputBinary = arg;
    213                     } else {
    214                         throw new IllegalArgumentException("Several output binary files specified");
    215                     }
    216                 }
    217             }
    218 
    219             mInputBinary = inputBinary;
    220             mInputCombined = inputCombined;
    221             mInputUnigramXml = inputUnigramXml;
    222             mInputShortcutXml = inputShortcutXml;
    223             mInputBigramXml = inputBigramXml;
    224             mOutputBinary = outputBinary;
    225             mOutputXml = outputXml;
    226             mOutputCombined = outputCombined;
    227             mOutputBinaryFormatVersion = outputBinaryFormatVersion;
    228             checkIntegrity();
    229         }
    230     }
    231 
    232     public static void main(String[] args)
    233             throws FileNotFoundException, ParserConfigurationException, SAXException, IOException,
    234             UnsupportedFormatException {
    235         final Arguments parsedArgs = new Arguments(args);
    236         FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs);
    237         writeOutputToParsedArgs(parsedArgs, dictionary);
    238     }
    239 
    240     /**
    241      * Invoke the right input method according to args.
    242      *
    243      * @param args the parsed command line arguments.
    244      * @return the read dictionary.
    245      */
    246     private static FusionDictionary readInputFromParsedArgs(final Arguments args)
    247             throws IOException, UnsupportedFormatException, ParserConfigurationException,
    248             SAXException, FileNotFoundException {
    249         if (null != args.mInputBinary) {
    250             return readBinaryFile(args.mInputBinary);
    251         } else if (null != args.mInputCombined) {
    252             return readCombinedFile(args.mInputCombined);
    253         } else if (null != args.mInputUnigramXml) {
    254             return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
    255         } else {
    256             throw new RuntimeException("No input file specified");
    257         }
    258     }
    259 
    260     /**
    261      * Read a dictionary from the name of a binary file.
    262      *
    263      * @param binaryFilename the name of the file in the binary dictionary format.
    264      * @return the read dictionary.
    265      * @throws FileNotFoundException if the file can't be found
    266      * @throws IOException if the input file can't be read
    267      * @throws UnsupportedFormatException if the binary file is not in the expected format
    268      */
    269     private static FusionDictionary readBinaryFile(final String binaryFilename)
    270             throws FileNotFoundException, IOException, UnsupportedFormatException {
    271         final File file = new File(binaryFilename);
    272         final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
    273         return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
    274     }
    275 
    276     /**
    277      * Read a dictionary from the name of a combined file.
    278      *
    279      * @param combinedFilename the name of the file in the combined format.
    280      * @return the read dictionary.
    281      * @throws FileNotFoundException if the file can't be found
    282      * @throws IOException if the input file can't be read
    283      */
    284     private static FusionDictionary readCombinedFile(final String combinedFilename)
    285         throws FileNotFoundException, IOException {
    286         try (final BufferedReader reader = new BufferedReader(new InputStreamReader(
    287                 new FileInputStream(combinedFilename), "UTF-8"))
    288         ) {
    289             return CombinedInputOutput.readDictionaryCombined(reader);
    290         }
    291     }
    292 
    293     private static BufferedInputStream getBufferedFileInputStream(final String filename)
    294             throws FileNotFoundException {
    295         if (filename == null) {
    296             return null;
    297         }
    298         return new BufferedInputStream(new FileInputStream(filename));
    299     }
    300 
    301     /**
    302      * Read a dictionary from a unigram XML file, and optionally a bigram XML file.
    303      *
    304      * @param unigramXmlFilename the name of the unigram XML file. May not be null.
    305      * @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
    306      * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
    307      * @return the read dictionary.
    308      * @throws FileNotFoundException if one of the files can't be found
    309      * @throws SAXException if one or more of the XML files is not well-formed
    310      * @throws IOException if one the input files can't be read
    311      * @throws ParserConfigurationException if the system can't create a SAX parser
    312      */
    313     private static FusionDictionary readXmlFile(final String unigramXmlFilename,
    314             final String shortcutXmlFilename, final String bigramXmlFilename)
    315             throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
    316         try (
    317             final BufferedInputStream unigrams = getBufferedFileInputStream(unigramXmlFilename);
    318             final BufferedInputStream shortcuts = getBufferedFileInputStream(shortcutXmlFilename);
    319             final BufferedInputStream bigrams = getBufferedFileInputStream(bigramXmlFilename);
    320         ) {
    321             return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
    322         }
    323     }
    324 
    325     /**
    326      * Invoke the right output method according to args.
    327      *
    328      * This will write the passed dictionary to the file(s) passed in the command line arguments.
    329      * @param args the parsed arguments.
    330      * @param dict the file to output.
    331      * @throws FileNotFoundException if one of the output files can't be created.
    332      * @throws IOException if one of the output files can't be written to.
    333      */
    334     private static void writeOutputToParsedArgs(final Arguments args, final FusionDictionary dict)
    335             throws FileNotFoundException, IOException, UnsupportedFormatException,
    336             IllegalArgumentException {
    337         if (null != args.mOutputBinary) {
    338             writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion);
    339         }
    340         if (null != args.mOutputXml) {
    341             writeXmlDictionary(args.mOutputXml, dict);
    342         }
    343         if (null != args.mOutputCombined) {
    344             writeCombinedDictionary(args.mOutputCombined, dict);
    345         }
    346     }
    347 
    348     /**
    349      * Write the dictionary in binary format to the specified filename.
    350      *
    351      * @param outputFilename the name of the file to write to.
    352      * @param dict the dictionary to write.
    353      * @param version the binary format version to use.
    354      * @throws FileNotFoundException if the output file can't be created.
    355      * @throws IOException if the output file can't be written to.
    356      */
    357     private static void writeBinaryDictionary(final String outputFilename,
    358             final FusionDictionary dict, final int version)
    359             throws FileNotFoundException, IOException, UnsupportedFormatException {
    360         final File outputFile = new File(outputFilename);
    361         final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions(version);
    362         final DictEncoder dictEncoder;
    363         if (version == FormatSpec.VERSION4) {
    364             dictEncoder = new Ver4DictEncoder(outputFile);
    365         } else {
    366             dictEncoder = new Ver2DictEncoder(outputFile);
    367         }
    368         dictEncoder.writeDictionary(dict, formatOptions);
    369     }
    370 
    371     /**
    372      * Write the dictionary in XML format to the specified filename.
    373      *
    374      * @param outputFilename the name of the file to write to.
    375      * @param dict the dictionary to write.
    376      * @throws FileNotFoundException if the output file can't be created.
    377      * @throws IOException if the output file can't be written to.
    378      */
    379     private static void writeXmlDictionary(final String outputFilename,
    380             final FusionDictionary dict) throws FileNotFoundException, IOException {
    381         try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) {
    382             XmlDictInputOutput.writeDictionaryXml(writer, dict);
    383         }
    384     }
    385 
    386     /**
    387      * Write the dictionary in the combined format to the specified filename.
    388      *
    389      * @param outputFilename the name of the file to write to.
    390      * @param dict the dictionary to write.
    391      * @throws FileNotFoundException if the output file can't be created.
    392      * @throws IOException if the output file can't be written to.
    393      */
    394     private static void writeCombinedDictionary(final String outputFilename,
    395             final FusionDictionary dict) throws FileNotFoundException, IOException {
    396         try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) {
    397             CombinedInputOutput.writeDictionaryCombined(writer, dict);
    398         }
    399     }
    400 }
    401