/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
import com.android.inputmethod.latin.makedict.DictDecoder;
import com.android.inputmethod.latin.makedict.DictEncoder;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.MakedictLog;
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
import com.android.inputmethod.latin.makedict.Ver4DictEncoder;

import org.xml.sax.SAXException;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.LinkedList;

import javax.xml.parsers.ParserConfigurationException;

/**
 * Main class/method for DictionaryMaker.
 *
 * Reads a dictionary from exactly one input (XML, combined or binary format) and writes it to
 * every output requested on the command line.
 */
public class DictionaryMaker {

    /**
     * Parsed and validated command line arguments.
     *
     * The constructor parses the raw argument array and then checks that exactly one input and
     * at least one output were given, and that no two of the named files are the same file.
     */
    static class Arguments {
        private static final String OPTION_VERSION_2 = "-2";
        private static final String OPTION_VERSION_4 = "-4";
        private static final String OPTION_INPUT_SOURCE = "-s";
        private static final String OPTION_INPUT_BIGRAM_XML = "-b";
        private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
        private static final String OPTION_OUTPUT_BINARY = "-d";
        private static final String OPTION_OUTPUT_XML = "-x";
        private static final String OPTION_OUTPUT_COMBINED = "-o";
        private static final String OPTION_HELP = "-h";

        // Each of the following is null when the corresponding file was not specified.
        public final String mInputBinary;
        public final String mInputCombined;
        public final String mInputUnigramXml;
        public final String mInputShortcutXml;
        public final String mInputBigramXml;
        public final String mOutputBinary;
        public final String mOutputXml;
        public final String mOutputCombined;
        // Binary format version used when writing mOutputBinary.
        public final int mOutputBinaryFormatVersion;

        /**
         * Validates the parsed arguments.
         *
         * @throws IOException if a canonical path can't be computed for one of the files.
         * @throws RuntimeException if the input/output combination is invalid or if two of the
         *     specified files are the same file.
         */
        private void checkIntegrity() throws IOException {
            checkHasExactlyOneInput();
            checkHasAtLeastOneOutput();
            // Every input is checked against every output so that an input file is never
            // overwritten by one of the outputs.
            checkNotSameFile(mInputBinary, mOutputBinary);
            checkNotSameFile(mInputBinary, mOutputXml);
            checkNotSameFile(mInputBinary, mOutputCombined);
            checkNotSameFile(mInputCombined, mOutputBinary);
            checkNotSameFile(mInputCombined, mOutputXml);
            checkNotSameFile(mInputCombined, mOutputCombined);
            checkNotSameFile(mInputUnigramXml, mOutputBinary);
            checkNotSameFile(mInputUnigramXml, mOutputXml);
            checkNotSameFile(mInputUnigramXml, mOutputCombined);
            checkNotSameFile(mInputShortcutXml, mOutputBinary);
            checkNotSameFile(mInputShortcutXml, mOutputXml);
            checkNotSameFile(mInputShortcutXml, mOutputCombined);
            checkNotSameFile(mInputBigramXml, mOutputBinary);
            checkNotSameFile(mInputBigramXml, mOutputXml);
            checkNotSameFile(mInputBigramXml, mOutputCombined);
            // Outputs must also be pairwise distinct.
            checkNotSameFile(mOutputBinary, mOutputXml);
            checkNotSameFile(mOutputBinary, mOutputCombined);
            checkNotSameFile(mOutputXml, mOutputCombined);
        }

        /**
         * Throws if zero or several main inputs were given, or if separate bigram/shortcut
         * files were given together with a non-XML input.
         */
        private void checkHasExactlyOneInput() {
            if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
                throw new RuntimeException("No input file specified");
            } else if ((null != mInputUnigramXml && null != mInputBinary)
                    || (null != mInputUnigramXml && null != mInputCombined)
                    || (null != mInputBinary && null != mInputCombined)) {
                throw new RuntimeException("Several input files specified");
            } else if ((null != mInputBinary || null != mInputCombined)
                    && (null != mInputBigramXml || null != mInputShortcutXml)) {
                throw new RuntimeException("Separate bigrams/shortcut files are only supported"
                        + " with XML input (other formats include bigrams and shortcuts already)");
            }
        }

        /**
         * Throws if no output file of any kind was given.
         */
        private void checkHasAtLeastOneOutput() {
            if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
                throw new RuntimeException("No output specified");
            }
        }

        /**
         * Utility method that throws an exception if path1 and path2 point to the same file.
         *
         * Paths are compared by canonical path. If either path is null, nothing is checked.
         */
        private static void checkNotSameFile(final String path1, final String path2)
                throws IOException {
            if (null == path1 || null == path2) return;
            if (new File(path1).getCanonicalPath().equals(new File(path2).getCanonicalPath())) {
                throw new RuntimeException(path1 + " and " + path2 + " are the same file: "
                        + " refusing to process.");
            }
        }

        /**
         * Logs the usage text.
         */
        private void displayHelp() {
            MakedictLog.i(getHelp());
        }

        /**
         * Returns the usage text for the command line.
         *
         * Only the options this parser actually accepts are listed.
         */
        public static String getHelp() {
            return "Usage: makedict "
                    + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
                    + "| [-s <combined format input>]"
                    + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
                    + " [-o <combined output>]"
                    + "[-2] [-4]\n"
                    + "\n"
                    + " Converts a source dictionary file to one or several outputs.\n"
                    + " Source can be an XML file, with an optional XML bigrams file, or a\n"
                    + " binary dictionary file.\n"
                    + " Binary version 2 (Jelly Bean), 4, XML and\n"
                    + " combined format outputs are supported.";
        }

        /**
         * Parses and validates the command line.
         *
         * Arguments starting with '-' are options; all options other than -2, -4 and -h consume
         * the following argument as a file name. Arguments not starting with '-' are
         * positional: the first one is the input and the second one is the binary output.
         *
         * @param argsArray the raw command line arguments.
         * @throws IOException if a canonical path can't be computed while validating.
         * @throws IllegalArgumentException if an argument is empty, an option is unknown or
         *     lacks its file name, the input format is not recognized, or too many positional
         *     arguments are given.
         * @throws RuntimeException if the resulting input/output combination is invalid.
         */
        public Arguments(String[] argsArray) throws IOException {
            final LinkedList<String> args = new LinkedList<>(Arrays.asList(argsArray));
            if (args.isEmpty()) {
                displayHelp();
            }
            String inputBinary = null;
            String inputCombined = null;
            String inputUnigramXml = null;
            String inputShortcutXml = null;
            String inputBigramXml = null;
            String outputBinary = null;
            String outputXml = null;
            String outputCombined = null;
            int outputBinaryFormatVersion = 2; // the default version is 2.

            while (!args.isEmpty()) {
                final String arg = args.removeFirst();
                if (arg.isEmpty()) {
                    // Fail with a clear message instead of an index error from charAt(0).
                    throw new IllegalArgumentException("Empty argument");
                }
                if (arg.charAt(0) == '-') {
                    if (OPTION_VERSION_2.equals(arg)) {
                        // Do nothing, this is the default
                    } else if (OPTION_VERSION_4.equals(arg)) {
                        outputBinaryFormatVersion = FormatSpec.VERSION4;
                    } else if (OPTION_HELP.equals(arg)) {
                        displayHelp();
                    } else {
                        // All these options need an argument
                        if (args.isEmpty()) {
                            throw new IllegalArgumentException("Option " + arg + " is unknown or "
                                    + "requires an argument");
                        }
                        final String filename = args.removeFirst();
                        if (OPTION_INPUT_SOURCE.equals(arg)) {
                            if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) {
                                inputUnigramXml = filename;
                            } else if (CombinedInputOutput.isCombinedDictionary(filename)) {
                                inputCombined = filename;
                            } else if (BinaryDictDecoderUtils.isBinaryDictionary(filename)) {
                                inputBinary = filename;
                            } else {
                                throw new IllegalArgumentException(
                                        "Unknown format for file " + filename);
                            }
                        } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
                            inputShortcutXml = filename;
                        } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
                            inputBigramXml = filename;
                        } else if (OPTION_OUTPUT_BINARY.equals(arg)) {
                            outputBinary = filename;
                        } else if (OPTION_OUTPUT_XML.equals(arg)) {
                            outputXml = filename;
                        } else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
                            outputCombined = filename;
                        } else {
                            throw new IllegalArgumentException("Unknown option : " + arg);
                        }
                    }
                } else {
                    // Positional argument. It is the input only if no input of any format has
                    // been recorded yet; a combined-format input counts too, so that the next
                    // positional argument is taken as the binary output.
                    if (null == inputBinary && null == inputCombined
                            && null == inputUnigramXml) {
                        if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
                            inputBinary = arg;
                        } else if (CombinedInputOutput.isCombinedDictionary(arg)) {
                            inputCombined = arg;
                        } else {
                            inputUnigramXml = arg;
                        }
                    } else if (null == outputBinary) {
                        outputBinary = arg;
                    } else {
                        throw new IllegalArgumentException("Several output binary files specified");
                    }
                }
            }

            mInputBinary = inputBinary;
            mInputCombined = inputCombined;
            mInputUnigramXml = inputUnigramXml;
            mInputShortcutXml = inputShortcutXml;
            mInputBigramXml = inputBigramXml;
            mOutputBinary = outputBinary;
            mOutputXml = outputXml;
            mOutputCombined = outputCombined;
            mOutputBinaryFormatVersion = outputBinaryFormatVersion;
            checkIntegrity();
        }
    }

    /**
     * Entry point: parses the arguments, reads the input dictionary and writes the outputs.
     *
     * @param args the raw command line arguments.
     */
    public static void main(String[] args)
            throws FileNotFoundException, ParserConfigurationException, SAXException, IOException,
            UnsupportedFormatException {
        final Arguments parsedArgs = new Arguments(args);
        FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs);
        writeOutputToParsedArgs(parsedArgs, dictionary);
    }

    /**
     * Invoke the right input method according to args.
     *
     * @param args the parsed command line arguments.
     * @return the read dictionary.
     */
    private static FusionDictionary readInputFromParsedArgs(final Arguments args)
            throws IOException, UnsupportedFormatException, ParserConfigurationException,
            SAXException, FileNotFoundException {
        if (null != args.mInputBinary) {
            return readBinaryFile(args.mInputBinary);
        } else if (null != args.mInputCombined) {
            return readCombinedFile(args.mInputCombined);
        } else if (null != args.mInputUnigramXml) {
            return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
        } else {
            throw new RuntimeException("No input file specified");
        }
    }

    /**
     * Read a dictionary from the name of a binary file.
     *
     * @param binaryFilename the name of the file in the binary dictionary format.
     * @return the read dictionary.
     * @throws FileNotFoundException if the file can't be found
     * @throws IOException if the input file can't be read
     * @throws UnsupportedFormatException if the binary file is not in the expected format
     */
    private static FusionDictionary readBinaryFile(final String binaryFilename)
            throws FileNotFoundException, IOException, UnsupportedFormatException {
        final File file = new File(binaryFilename);
        final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
        return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
    }

    /**
     * Read a dictionary from the name of a combined file.
     *
     * The file is decoded as UTF-8.
     *
     * @param combinedFilename the name of the file in the combined format.
     * @return the read dictionary.
     * @throws FileNotFoundException if the file can't be found
     * @throws IOException if the input file can't be read
     */
    private static FusionDictionary readCombinedFile(final String combinedFilename)
            throws FileNotFoundException, IOException {
        try (final BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(combinedFilename), "UTF-8"))
        ) {
            return CombinedInputOutput.readDictionaryCombined(reader);
        }
    }

    /**
     * Opens a buffered input stream on the named file.
     *
     * @param filename the name of the file to open, or null.
     * @return the stream, or null if filename is null.
     * @throws FileNotFoundException if the file can't be opened.
     */
    private static BufferedInputStream getBufferedFileInputStream(final String filename)
            throws FileNotFoundException {
        if (filename == null) {
            return null;
        }
        return new BufferedInputStream(new FileInputStream(filename));
    }

    /**
     * Read a dictionary from a unigram XML file, and optionally a bigram XML file.
     *
     * @param unigramXmlFilename the name of the unigram XML file. May not be null.
     * @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
     * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
     * @return the read dictionary.
     * @throws FileNotFoundException if one of the files can't be found
     * @throws SAXException if one or more of the XML files is not well-formed
     * @throws IOException if one the input files can't be read
     * @throws ParserConfigurationException if the system can't create a SAX parser
     */
    private static FusionDictionary readXmlFile(final String unigramXmlFilename,
            final String shortcutXmlFilename, final String bigramXmlFilename)
            throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
        // A stream is null when its file name is null; try-with-resources skips closing
        // resources that are null.
        try (
            final BufferedInputStream unigrams = getBufferedFileInputStream(unigramXmlFilename);
            final BufferedInputStream shortcuts = getBufferedFileInputStream(shortcutXmlFilename);
            final BufferedInputStream bigrams = getBufferedFileInputStream(bigramXmlFilename);
        ) {
            return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
        }
    }

    /**
     * Invoke the right output method according to args.
     *
     * This will write the passed dictionary to the file(s) passed in the command line arguments.
     * @param args the parsed arguments.
     * @param dict the file to output.
     * @throws FileNotFoundException if one of the output files can't be created.
     * @throws IOException if one of the output files can't be written to.
     */
    private static void writeOutputToParsedArgs(final Arguments args, final FusionDictionary dict)
            throws FileNotFoundException, IOException, UnsupportedFormatException,
            IllegalArgumentException {
        if (null != args.mOutputBinary) {
            writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion);
        }
        if (null != args.mOutputXml) {
            writeXmlDictionary(args.mOutputXml, dict);
        }
        if (null != args.mOutputCombined) {
            writeCombinedDictionary(args.mOutputCombined, dict);
        }
    }

    /**
     * Write the dictionary in binary format to the specified filename.
     *
     * A version 4 encoder is used when version is FormatSpec.VERSION4; any other value uses
     * the version 2 encoder.
     *
     * @param outputFilename the name of the file to write to.
     * @param dict the dictionary to write.
     * @param version the binary format version to use.
     * @throws FileNotFoundException if the output file can't be created.
     * @throws IOException if the output file can't be written to.
     */
    private static void writeBinaryDictionary(final String outputFilename,
            final FusionDictionary dict, final int version)
            throws FileNotFoundException, IOException, UnsupportedFormatException {
        final File outputFile = new File(outputFilename);
        final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions(version);
        final DictEncoder dictEncoder;
        if (version == FormatSpec.VERSION4) {
            dictEncoder = new Ver4DictEncoder(outputFile);
        } else {
            dictEncoder = new Ver2DictEncoder(outputFile);
        }
        dictEncoder.writeDictionary(dict, formatOptions);
    }

    /**
     * Write the dictionary in XML format to the specified filename.
     *
     * The file is encoded as UTF-8 regardless of the platform default charset.
     *
     * @param outputFilename the name of the file to write to.
     * @param dict the dictionary to write.
     * @throws FileNotFoundException if the output file can't be created.
     * @throws IOException if the output file can't be written to.
     */
    private static void writeXmlDictionary(final String outputFilename,
            final FusionDictionary dict) throws FileNotFoundException, IOException {
        try (final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outputFilename), "UTF-8"))) {
            XmlDictInputOutput.writeDictionaryXml(writer, dict);
        }
    }

    /**
     * Write the dictionary in the combined format to the specified filename.
     *
     * The file is encoded as UTF-8, which is the encoding used when reading combined files.
     *
     * @param outputFilename the name of the file to write to.
     * @param dict the dictionary to write.
     * @throws FileNotFoundException if the output file can't be created.
     * @throws IOException if the output file can't be written to.
     */
    private static void writeCombinedDictionary(final String outputFilename,
            final FusionDictionary dict) throws FileNotFoundException, IOException {
        try (final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outputFilename), "UTF-8"))) {
            CombinedInputOutput.writeDictionaryCombined(writer, dict);
        }
    }
}