/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.utils.CombinedFormatUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

/**
 * Reads and writes combined format for a FusionDictionary.
 *
 * All functions in this class are static.
 */
public class CombinedInputOutput {
    private static final String WHITELIST_TAG = "whitelist";
    private static final String OPTIONS_TAG = "options";
    private static final String COMMENT_LINE_STARTER = "#";
    private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;

    /**
     * Basic test to find out whether the file is in the combined format or not.
     *
     * Concretely this only tests the header line, i.e. the first line that does not start
     * with the comment marker. A file that cannot be read, is empty, or contains only comment
     * lines is reported as not being in the combined format.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the combined format, false otherwise
     */
    public static boolean isCombinedDictionary(final String filename) {
        try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            final String firstLine = readFirstNonCommentLine(reader);
            if (null == firstLine) {
                // End of stream reached before any header line: not a combined dictionary.
                return false;
            }
            return firstLine.matches(
                    "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
        } catch (final IOException e) {
            return false;
        }
    }

    /**
     * Reads a dictionary from a combined format file.
     *
     * This is the public method that will read a combined file and return the corresponding memory
     * representation.
     *
     * The first non-comment line is parsed as a comma-separated list of key=value header
     * attributes. Every following non-comment line is dispatched on its first field: a word
     * line starts a new entry, while shortcut and bigram lines are attached to the entry
     * started by the most recent word line. Lines whose first field matches none of these are
     * ignored.
     *
     * @param reader the buffered reader to read the data from.
     * @return the in-memory representation of the dictionary.
     * @throws IOException if reading from the reader fails.
     * @throws RuntimeException if the header is missing or malformed, or if a word, shortcut
     *         or bigram line is malformed (this includes NumberFormatException for numeric
     *         fields that cannot be parsed).
     */
    public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
            throws IOException {
        final String headerLine = readFirstNonCommentLine(reader);
        if (null == headerLine) {
            // Previously this case surfaced as a NullPointerException with no context.
            throw new RuntimeException("Wrong header format : no header line found");
        }
        final String header[] = headerLine.split(",");
        final HashMap<String, String> attributes = new HashMap<>();
        for (String item : header) {
            final String keyValue[] = item.split("=");
            if (2 != keyValue.length) {
                throw new RuntimeException("Wrong header format : " + headerLine);
            }
            attributes.put(keyValue[0], keyValue[1]);
        }

        attributes.remove(OPTIONS_TAG);
        final FusionDictionary dict =
                new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));

        String line;
        String word = null;
        // NOTE(review): probabilityInfo is intentionally left as in the original, i.e. it is
        // not reset between word lines, so fields absent from a word line keep the values of
        // the previous entry. Confirm whether that carry-over is desired.
        ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
        boolean isNotAWord = false;
        ArrayList<WeightedString> bigrams = new ArrayList<>();
        ArrayList<WeightedString> shortcuts = new ArrayList<>();
        while (null != (line = reader.readLine())) {
            if (line.startsWith(COMMENT_LINE_STARTER)) {
                continue;
            }
            final String args[] = line.trim().split(",");
            if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
                // A new word line closes the previous entry: flush it before parsing.
                if (null != word) {
                    addWordToDictionary(dict, word, probabilityInfo, shortcuts, bigrams,
                            isNotAWord);
                }
                // Fresh lists are allocated rather than cleared; presumably the dictionary
                // may keep a reference to the list it was handed -- TODO confirm.
                if (!shortcuts.isEmpty()) {
                    shortcuts = new ArrayList<>();
                }
                if (!bigrams.isEmpty()) {
                    bigrams = new ArrayList<>();
                }
                isNotAWord = false;
                for (String param : args) {
                    final String params[] = splitKeyValue(param, line);
                    if (CombinedFormatUtils.WORD_TAG.equals(params[0])) {
                        word = params[1];
                    } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                        probabilityInfo = withProbability(probabilityInfo, params[1]);
                    } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
                        probabilityInfo = withHistoricalInfo(probabilityInfo, params[1], line);
                    } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) {
                        isNotAWord = "true".equals(params[1]);
                    }
                }
            } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) {
                String shortcut = null;
                int shortcutFreq = 0;
                for (String param : args) {
                    final String params[] = splitKeyValue(param, line);
                    if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) {
                        shortcut = params[1];
                    } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                        // The literal "whitelist" stands in for a dedicated frequency value.
                        shortcutFreq = WHITELIST_TAG.equals(params[1])
                                ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
                                : Integer.parseInt(params[1]);
                    }
                }
                if (null != shortcut) {
                    shortcuts.add(new WeightedString(shortcut, shortcutFreq));
                } else {
                    throw new RuntimeException("Wrong format : " + line);
                }
            } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
                String secondWordOfBigram = null;
                ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
                for (String param : args) {
                    final String params[] = splitKeyValue(param, line);
                    if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
                        secondWordOfBigram = params[1];
                    } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                        bigramProbabilityInfo = withProbability(bigramProbabilityInfo, params[1]);
                    } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
                        bigramProbabilityInfo =
                                withHistoricalInfo(bigramProbabilityInfo, params[1], line);
                    }
                }
                if (null != secondWordOfBigram) {
                    bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
                } else {
                    throw new RuntimeException("Wrong format : " + line);
                }
            }
        }
        // Flush the last entry, which no following word line has closed.
        if (null != word) {
            addWordToDictionary(dict, word, probabilityInfo, shortcuts, bigrams, isNotAWord);
        }

        return dict;
    }

    /**
     * Writes a dictionary to a combined file.
     *
     * The attribute map of the dictionary options is written first, followed by every word
     * property of the dictionary in the iteration order of a TreeSet of WordProperty.
     *
     * @param destination a destination writer.
     * @param dict the dictionary to write.
     * @throws IOException if writing to the destination fails.
     */
    public static void writeDictionaryCombined(final BufferedWriter destination,
            final FusionDictionary dict) throws IOException {
        final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
        for (final WordProperty wordProperty : dict) {
            // This for ordering by frequency, then by asciibetic order
            wordPropertiesInDict.add(wordProperty);
        }
        destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
        for (final WordProperty wordProperty : wordPropertiesInDict) {
            destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
        }
    }

    /**
     * Returns the first line that does not start with the comment marker, or null if the end
     * of the stream is reached first.
     */
    private static String readFirstNonCommentLine(final BufferedReader reader)
            throws IOException {
        String line = reader.readLine();
        while (null != line && line.startsWith(COMMENT_LINE_STARTER)) {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Splits a "key=value" field at its first '=' and rejects fields that contain none.
     *
     * @param param the field to split.
     * @param line the full line the field comes from, used in the error message only.
     * @return a two-element array holding the key and the value.
     */
    private static String[] splitKeyValue(final String param, final String line) {
        final String params[] = param.split("=", 2);
        if (2 != params.length) {
            throw new RuntimeException("Wrong format : " + line);
        }
        return params;
    }

    /** Returns a copy of base whose probability is parsed from value; other fields are kept. */
    private static ProbabilityInfo withProbability(final ProbabilityInfo base,
            final String value) {
        return new ProbabilityInfo(Integer.parseInt(value), base.mTimestamp, base.mLevel,
                base.mCount);
    }

    /**
     * Returns a copy of base that keeps its probability and takes its three remaining fields
     * from value, split on the historical info separator.
     *
     * @param base the probability info supplying the probability.
     * @param value the raw historical info field value.
     * @param line the full line the value comes from, used in the error message only.
     */
    private static ProbabilityInfo withHistoricalInfo(final ProbabilityInfo base,
            final String value, final String line) {
        final String[] historicalInfoParams =
                value.split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
        if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
            throw new RuntimeException("Wrong format (historical info) : " + line);
        }
        return new ProbabilityInfo(base.mProbability,
                Integer.parseInt(historicalInfoParams[0]),
                Integer.parseInt(historicalInfoParams[1]),
                Integer.parseInt(historicalInfoParams[2]));
    }

    /**
     * Adds one completed entry to the dictionary: the word itself (with null passed in place
     * of an empty shortcut list), then each of its bigrams.
     */
    private static void addWordToDictionary(final FusionDictionary dict, final String word,
            final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcuts,
            final ArrayList<WeightedString> bigrams, final boolean isNotAWord) {
        dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
        for (WeightedString s : bigrams) {
            dict.setBigram(word, s.mWord, s.mProbabilityInfo);
        }
    }
}