Home | History | Annotate | Download | only in dicttool
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.dicttool;
     18 
     19 import com.android.inputmethod.latin.makedict.FormatSpec;
     20 import com.android.inputmethod.latin.makedict.FusionDictionary;
     21 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
     23 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
     24 import com.android.inputmethod.latin.makedict.Word;
     25 
     26 import java.io.BufferedReader;
     27 import java.io.File;
     28 import java.io.FileNotFoundException;
     29 import java.io.FileReader;
     30 import java.io.IOException;
     31 import java.io.InputStream;
     32 import java.io.InputStreamReader;
     33 import java.io.Writer;
     34 import java.util.ArrayList;
     35 import java.util.HashMap;
     36 import java.util.TreeSet;
     37 
     38 /**
     39  * Reads and writes combined format for a FusionDictionary.
     40  *
     41  * All functions in this class are static.
     42  */
     43 public class CombinedInputOutput {
     44 
     45     private static final String DICTIONARY_TAG = "dictionary";
     46     private static final String BIGRAM_TAG = "bigram";
     47     private static final String SHORTCUT_TAG = "shortcut";
     48     private static final String FREQUENCY_TAG = "f";
     49     private static final String WORD_TAG = "word";
     50     private static final String NOT_A_WORD_TAG = "not_a_word";
     51     private static final String WHITELIST_TAG = "whitelist";
     52     private static final String OPTIONS_TAG = "options";
     53     private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
     54     private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";
     55     private static final String COMMENT_LINE_STARTER = "#";
     56 
     57     /**
     58      * Basic test to find out whether the file is in the combined format or not.
     59      *
     60      * Concretely this only tests the header line.
     61      *
     62      * @param filename The name of the file to test.
     63      * @return true if the file is in the combined format, false otherwise
     64      */
     65     public static boolean isCombinedDictionary(final String filename) {
     66         BufferedReader reader = null;
     67         try {
     68             reader = new BufferedReader(new FileReader(new File(filename)));
     69             String firstLine = reader.readLine();
     70             while (firstLine.startsWith(COMMENT_LINE_STARTER)) {
     71                 firstLine = reader.readLine();
     72             }
     73             return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
     74         } catch (FileNotFoundException e) {
     75             return false;
     76         } catch (IOException e) {
     77             return false;
     78         } finally {
     79             if (reader != null) {
     80                 try {
     81                     reader.close();
     82                 } catch (IOException e) {
     83                     // do nothing
     84                 }
     85             }
     86         }
     87     }
     88 
     89     /**
     90      * Reads a dictionary from a combined format file.
     91      *
     92      * This is the public method that will read a combined file and return the corresponding memory
     93      * representation.
     94      *
     95      * @param source the file to read the data from.
     96      * @return the in-memory representation of the dictionary.
     97      */
     98     public static FusionDictionary readDictionaryCombined(final InputStream source)
     99             throws IOException {
    100         final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8"));
    101         String headerLine = reader.readLine();
    102         while (headerLine.startsWith(COMMENT_LINE_STARTER)) {
    103             headerLine = reader.readLine();
    104         }
    105         final String header[] = headerLine.split(",");
    106         final HashMap<String, String> attributes = new HashMap<String, String>();
    107         for (String item : header) {
    108             final String keyValue[] = item.split("=");
    109             if (2 != keyValue.length) {
    110                 throw new RuntimeException("Wrong header format : " + headerLine);
    111             }
    112             attributes.put(keyValue[0], keyValue[1]);
    113         }
    114 
    115         final boolean processUmlauts =
    116                 GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
    117         final boolean processLigatures =
    118                 FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
    119         attributes.remove(OPTIONS_TAG);
    120         final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), new DictionaryOptions(
    121                 attributes, processUmlauts, processLigatures));
    122 
    123         String line;
    124         String word = null;
    125         int freq = 0;
    126         boolean isNotAWord = false;
    127         ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
    128         ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
    129         while (null != (line = reader.readLine())) {
    130             if (line.startsWith(COMMENT_LINE_STARTER)) continue;
    131             final String args[] = line.trim().split(",");
    132             if (args[0].matches(WORD_TAG + "=.*")) {
    133                 if (null != word) {
    134                     dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
    135                     for (WeightedString s : bigrams) {
    136                         dict.setBigram(word, s.mWord, s.mFrequency);
    137                     }
    138                 }
    139                 if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
    140                 if (!bigrams.isEmpty()) bigrams = new ArrayList<WeightedString>();
    141                 isNotAWord = false;
    142                 for (String param : args) {
    143                     final String params[] = param.split("=", 2);
    144                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    145                     if (WORD_TAG.equals(params[0])) {
    146                         word = params[1];
    147                     } else if (FREQUENCY_TAG.equals(params[0])) {
    148                         freq = Integer.parseInt(params[1]);
    149                     } else if (NOT_A_WORD_TAG.equals(params[0])) {
    150                         isNotAWord = "true".equals(params[1]);
    151                     }
    152                 }
    153             } else if (args[0].matches(SHORTCUT_TAG + "=.*")) {
    154                 String shortcut = null;
    155                 int shortcutFreq = 0;
    156                 for (String param : args) {
    157                     final String params[] = param.split("=", 2);
    158                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    159                     if (SHORTCUT_TAG.equals(params[0])) {
    160                         shortcut = params[1];
    161                     } else if (FREQUENCY_TAG.equals(params[0])) {
    162                         shortcutFreq = WHITELIST_TAG.equals(params[1])
    163                                 ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
    164                                 : Integer.parseInt(params[1]);
    165                     }
    166                 }
    167                 if (null != shortcut) {
    168                     shortcuts.add(new WeightedString(shortcut, shortcutFreq));
    169                 } else {
    170                     throw new RuntimeException("Wrong format : " + line);
    171                 }
    172             } else if (args[0].matches(BIGRAM_TAG + "=.*")) {
    173                 String secondWordOfBigram = null;
    174                 int bigramFreq = 0;
    175                 for (String param : args) {
    176                     final String params[] = param.split("=", 2);
    177                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    178                     if (BIGRAM_TAG.equals(params[0])) {
    179                         secondWordOfBigram = params[1];
    180                     } else if (FREQUENCY_TAG.equals(params[0])) {
    181                         bigramFreq = Integer.parseInt(params[1]);
    182                     }
    183                 }
    184                 if (null != secondWordOfBigram) {
    185                     bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq));
    186                 } else {
    187                     throw new RuntimeException("Wrong format : " + line);
    188                 }
    189             }
    190         }
    191         if (null != word) {
    192             dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
    193             for (WeightedString s : bigrams) {
    194                 dict.setBigram(word, s.mWord, s.mFrequency);
    195             }
    196         }
    197 
    198         return dict;
    199     }
    200 
    201     /**
    202      * Writes a dictionary to a combined file.
    203      *
    204      * @param destination a destination stream to write to.
    205      * @param dict the dictionary to write.
    206      */
    207     public static void writeDictionaryCombined(Writer destination, FusionDictionary dict)
    208             throws IOException {
    209         final TreeSet<Word> set = new TreeSet<Word>();
    210         for (Word word : dict) {
    211             set.add(word); // This for ordering by frequency, then by asciibetic order
    212         }
    213         final HashMap<String, String> options = dict.mOptions.mAttributes;
    214         destination.write(DICTIONARY_TAG + "=");
    215         if (options.containsKey(DICTIONARY_TAG)) {
    216             destination.write(options.get(DICTIONARY_TAG));
    217             options.remove(DICTIONARY_TAG);
    218         }
    219         if (dict.mOptions.mGermanUmlautProcessing) {
    220             destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION);
    221         } else if (dict.mOptions.mFrenchLigatureProcessing) {
    222             destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION);
    223         }
    224         for (final String key : dict.mOptions.mAttributes.keySet()) {
    225             final String value = dict.mOptions.mAttributes.get(key);
    226             destination.write("," + key + "=" + value);
    227         }
    228         destination.write("\n");
    229         for (Word word : set) {
    230             destination.write(" " + WORD_TAG + "=" + word.mWord + ","
    231                     + FREQUENCY_TAG + "=" + word.mFrequency
    232                     + (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n"));
    233             if (null != word.mShortcutTargets) {
    234                 for (WeightedString target : word.mShortcutTargets) {
    235                     destination.write("  " + SHORTCUT_TAG + "=" + target.mWord + ","
    236                             + FREQUENCY_TAG + "=" + target.mFrequency + "\n");
    237                 }
    238             }
    239             if (null != word.mBigrams) {
    240                 for (WeightedString bigram : word.mBigrams) {
    241                     destination.write("  " + BIGRAM_TAG + "=" + bigram.mWord + ","
    242                             + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n");
    243                 }
    244             }
    245         }
    246         destination.close();
    247     }
    248 }
    249