/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.makedict.Word;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

/**
 * Reads and writes combined format for a FusionDictionary.
 *
 * The combined format is line-oriented text: an optional run of comment lines (starting with
 * '#'), one header line of comma-separated key=value attributes, then one line per word, each
 * optionally followed by shortcut and bigram lines that apply to that word.
 *
 * All functions in this class are static.
 */
public class CombinedInputOutput {

    private static final String DICTIONARY_TAG = "dictionary";
    private static final String BIGRAM_TAG = "bigram";
    private static final String SHORTCUT_TAG = "shortcut";
    private static final String FREQUENCY_TAG = "f";
    private static final String WORD_TAG = "word";
    private static final String NOT_A_WORD_TAG = "not_a_word";
    private static final String WHITELIST_TAG = "whitelist";
    private static final String OPTIONS_TAG = "options";
    private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
    private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";
    private static final String COMMENT_LINE_STARTER = "#";

    /**
     * Basic test to find out whether the file is in the combined format or not.
     *
     * Concretely this only tests the header line, i.e. the first line that is not a comment.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the combined format, false otherwise. A file that cannot
     *         be opened or read, or that contains no header line at all (empty or made only of
     *         comment lines), is reported as not being in the combined format.
     */
    public static boolean isCombinedDictionary(final String filename) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(filename)));
            String firstLine = reader.readLine();
            // readLine() returns null at end of file, so the null test has to come first.
            while (null != firstLine && firstLine.startsWith(COMMENT_LINE_STARTER)) {
                firstLine = reader.readLine();
            }
            if (null == firstLine) {
                return false;
            }
            return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
        } catch (FileNotFoundException e) {
            return false;
        } catch (IOException e) {
            return false;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // Nothing useful can be done about a failed close of a file that was only
                    // read; the result computed above is still valid.
                }
            }
        }
    }

    /**
     * Reads a dictionary from a combined format stream.
     *
     * This is the public method that will read combined format data and return the
     * corresponding memory representation. The stream is decoded as UTF-8. This method does not
     * close the stream.
     *
     * @param source the stream to read the data from.
     * @return the in-memory representation of the dictionary.
     * @throws IOException if reading from the stream fails.
     * @throws RuntimeException if the header is missing or malformed, or if a word, shortcut
     *         or bigram line is malformed.
     */
    public static FusionDictionary readDictionaryCombined(final InputStream source)
            throws IOException {
        final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8"));
        String headerLine = reader.readLine();
        while (null != headerLine && headerLine.startsWith(COMMENT_LINE_STARTER)) {
            headerLine = reader.readLine();
        }
        if (null == headerLine) {
            // Empty input, or nothing but comments: fail with a format error, not a bare NPE.
            throw new RuntimeException("Wrong header format : no header line found");
        }
        final String header[] = headerLine.split(",");
        final HashMap<String, String> attributes = new HashMap<String, String>();
        for (String item : header) {
            final String keyValue[] = item.split("=");
            if (2 != keyValue.length) {
                throw new RuntimeException("Wrong header format : " + headerLine);
            }
            attributes.put(keyValue[0], keyValue[1]);
        }

        // The "options" attribute is turned into the two boolean flags and is not kept in the
        // attribute map handed to the dictionary.
        final boolean processUmlauts =
                GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
        final boolean processLigatures =
                FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
        attributes.remove(OPTIONS_TAG);
        final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), new DictionaryOptions(
                attributes, processUmlauts, processLigatures));

        String line;
        // State of the word currently being assembled. It is only added to the dictionary when
        // the next word line (or the end of the input) is reached, because its shortcut and
        // bigram lines come after it.
        String word = null;
        int freq = 0;
        boolean isNotAWord = false;
        ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
        ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
        while (null != (line = reader.readLine())) {
            if (line.startsWith(COMMENT_LINE_STARTER)) continue;
            final String args[] = line.trim().split(",");
            // split() drops trailing empty strings, so a line made only of commas yields an
            // empty array. Skip it like any other line that carries no recognized tag.
            if (0 == args.length) continue;
            if (args[0].startsWith(WORD_TAG + "=")) {
                if (null != word) {
                    dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
                    for (WeightedString s : bigrams) {
                        dict.setBigram(word, s.mWord, s.mFrequency);
                    }
                }
                // Fresh lists rather than clear(): the previous ones may have been handed to
                // the dictionary just above.
                if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
                if (!bigrams.isEmpty()) bigrams = new ArrayList<WeightedString>();
                isNotAWord = false;
                // NOTE(review): freq is not reset here, so a word line without an "f=" field
                // inherits the previous word's frequency - confirm this is intended.
                for (String param : args) {
                    final String params[] = param.split("=", 2);
                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
                    if (WORD_TAG.equals(params[0])) {
                        word = params[1];
                    } else if (FREQUENCY_TAG.equals(params[0])) {
                        freq = Integer.parseInt(params[1]);
                    } else if (NOT_A_WORD_TAG.equals(params[0])) {
                        isNotAWord = "true".equals(params[1]);
                    }
                }
            } else if (args[0].startsWith(SHORTCUT_TAG + "=")) {
                String shortcut = null;
                int shortcutFreq = 0;
                for (String param : args) {
                    final String params[] = param.split("=", 2);
                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
                    if (SHORTCUT_TAG.equals(params[0])) {
                        shortcut = params[1];
                    } else if (FREQUENCY_TAG.equals(params[0])) {
                        // A shortcut frequency is either the literal "whitelist" or a number.
                        shortcutFreq = WHITELIST_TAG.equals(params[1])
                                ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
                                : Integer.parseInt(params[1]);
                    }
                }
                if (null != shortcut) {
                    shortcuts.add(new WeightedString(shortcut, shortcutFreq));
                } else {
                    throw new RuntimeException("Wrong format : " + line);
                }
            } else if (args[0].startsWith(BIGRAM_TAG + "=")) {
                String secondWordOfBigram = null;
                int bigramFreq = 0;
                for (String param : args) {
                    final String params[] = param.split("=", 2);
                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
                    if (BIGRAM_TAG.equals(params[0])) {
                        secondWordOfBigram = params[1];
                    } else if (FREQUENCY_TAG.equals(params[0])) {
                        bigramFreq = Integer.parseInt(params[1]);
                    }
                }
                if (null != secondWordOfBigram) {
                    bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq));
                } else {
                    throw new RuntimeException("Wrong format : " + line);
                }
            }
        }
        // Flush the last word of the input, which no following word line triggered.
        if (null != word) {
            dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
            for (WeightedString s : bigrams) {
                dict.setBigram(word, s.mWord, s.mFrequency);
            }
        }

        return dict;
    }

    /**
     * Writes a dictionary to a combined file.
     *
     * The destination is always closed when this method returns, whether writing succeeded or
     * not. The dictionary, including its attribute map, is left unmodified.
     *
     * @param destination a destination stream to write to.
     * @param dict the dictionary to write.
     * @throws IOException if writing to or closing the destination fails.
     */
    public static void writeDictionaryCombined(Writer destination, FusionDictionary dict)
            throws IOException {
        try {
            final TreeSet<Word> set = new TreeSet<Word>();
            for (Word word : dict) {
                set.add(word); // This for ordering by frequency, then by asciibetic order
            }
            final HashMap<String, String> options = dict.mOptions.mAttributes;
            // The header always starts with the dictionary tag, even when it has no value.
            destination.write(DICTIONARY_TAG + "=");
            if (options.containsKey(DICTIONARY_TAG)) {
                destination.write(options.get(DICTIONARY_TAG));
            }
            if (dict.mOptions.mGermanUmlautProcessing) {
                destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION);
            } else if (dict.mOptions.mFrenchLigatureProcessing) {
                destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION);
            }
            for (final String key : options.keySet()) {
                // Already written at the head of the line. Skipping it here, instead of
                // removing it from the map, keeps the caller's dictionary intact.
                if (DICTIONARY_TAG.equals(key)) continue;
                final String value = options.get(key);
                destination.write("," + key + "=" + value);
            }
            destination.write("\n");
            for (Word word : set) {
                destination.write(" " + WORD_TAG + "=" + word.mWord + ","
                        + FREQUENCY_TAG + "=" + word.mFrequency
                        + (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n"));
                if (null != word.mShortcutTargets) {
                    for (WeightedString target : word.mShortcutTargets) {
                        destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + ","
                                + FREQUENCY_TAG + "=" + target.mFrequency + "\n");
                    }
                }
                if (null != word.mBigrams) {
                    for (WeightedString bigram : word.mBigrams) {
                        destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + ","
                                + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n");
                    }
                }
            }
        } finally {
            // Close on failure too, so a write error does not leak the destination.
            destination.close();
        }
    }
}