/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.utils.CombinedFormatUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

/**
 * Reads and writes combined format for a FusionDictionary.
 *
 * All functions in this class are static.
 */
public class CombinedInputOutput {
    private static final String WHITELIST_TAG = "whitelist";
    private static final String OPTIONS_TAG = "options";
    private static final String COMMENT_LINE_STARTER = "#";
    private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;

    /**
     * Basic test to find out whether the file is in the combined format or not.
     *
     * Concretely this only tests the header line, i.e. the first line that does not start
     * with the comment marker.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the combined format, false otherwise. A file that cannot
     *         be read, is empty, or contains only comment lines yields false.
     */
    public static boolean isCombinedDictionary(final String filename) {
        try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            final String firstLine = readFirstNonCommentLine(reader);
            if (null == firstLine) {
                // Empty or comment-only file: there is no header, so this is not combined format.
                return false;
            }
            return firstLine.matches(
                    "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
        } catch (final IOException e) {
            return false;
        }
    }

    /**
     * Reads a dictionary from a combined format file.
     *
     * This is the public method that will read a combined file and return the corresponding memory
     * representation.
     *
     * The first non-comment line is parsed as a comma-separated list of {@code key=value}
     * header attributes. Every following non-comment line is dispatched on its first field:
     * a word line starts a new entry, while shortcut and bigram lines are attached to the
     * entry started by the most recent word line. Lines whose first field matches none of
     * these tags are ignored.
     *
     * @param reader the buffered reader to read the data from.
     * @return the in-memory representation of the dictionary.
     * @throws IOException if reading from {@code reader} fails.
     * @throws RuntimeException if the header is missing or malformed, or if a word, shortcut
     *         or bigram line is malformed.
     */
    public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
            throws IOException {
        final String headerLine = readFirstNonCommentLine(reader);
        if (null == headerLine) {
            // readLine() returns null at end of stream; report it as a format error rather
            // than letting a NullPointerException escape.
            throw new RuntimeException("Wrong header format : no header line found");
        }
        final String header[] = headerLine.split(",");
        final HashMap<String, String> attributes = new HashMap<>();
        for (final String item : header) {
            final String keyValue[] = item.split("=");
            if (2 != keyValue.length) {
                throw new RuntimeException("Wrong header format : " + headerLine);
            }
            attributes.put(keyValue[0], keyValue[1]);
        }

        attributes.remove(OPTIONS_TAG);
        final FusionDictionary dict =
                new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));

        String line;
        String word = null;
        // NOTE(review): probabilityInfo is not reset between word lines, so a word line that
        // omits the probability or historical info inherits the values of the previous word.
        // This is kept as-is to preserve existing behavior.
        ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
        boolean isNotAWord = false;
        boolean isPossiblyOffensive = false;
        ArrayList<WeightedString> bigrams = new ArrayList<>();
        ArrayList<WeightedString> shortcuts = new ArrayList<>();
        while (null != (line = reader.readLine())) {
            if (line.startsWith(COMMENT_LINE_STARTER)) {
                continue;
            }
            final String args[] = line.trim().split(",");
            if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
                // A new word line closes the previous entry: flush it before parsing this one.
                if (null != word) {
                    addWordWithBigrams(dict, word, probabilityInfo, shortcuts, bigrams,
                            isNotAWord, isPossiblyOffensive);
                }
                // Fresh lists are allocated instead of clearing the old ones, since the old
                // shortcut list has been handed to the dictionary (presumably retained there).
                if (!shortcuts.isEmpty()) {
                    shortcuts = new ArrayList<>();
                }
                if (!bigrams.isEmpty()) {
                    bigrams = new ArrayList<>();
                }
                isNotAWord = false;
                isPossiblyOffensive = false;
                for (final String param : args) {
                    final String params[] = splitKeyValue(param, line);
                    switch (params[0]) {
                        case CombinedFormatUtils.WORD_TAG:
                            word = params[1];
                            break;
                        case CombinedFormatUtils.PROBABILITY_TAG:
                            probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
                                    probabilityInfo.mTimestamp, probabilityInfo.mLevel,
                                    probabilityInfo.mCount);
                            break;
                        case CombinedFormatUtils.HISTORICAL_INFO_TAG:
                            probabilityInfo = withHistoricalInfo(probabilityInfo, params[1], line);
                            break;
                        case CombinedFormatUtils.NOT_A_WORD_TAG:
                            isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]);
                            break;
                        case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG:
                            isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]);
                            break;
                    }
                }
            } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) {
                String shortcut = null;
                int shortcutFreq = 0;
                for (final String param : args) {
                    final String params[] = splitKeyValue(param, line);
                    if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) {
                        shortcut = params[1];
                    } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                        // The literal "whitelist" stands for the dedicated whitelist frequency.
                        shortcutFreq = WHITELIST_TAG.equals(params[1])
                                ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
                                : Integer.parseInt(params[1]);
                    }
                }
                if (null == shortcut) {
                    throw new RuntimeException("Wrong format : " + line);
                }
                shortcuts.add(new WeightedString(shortcut, shortcutFreq));
            } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
                String secondWordOfBigram = null;
                ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
                for (final String param : args) {
                    final String params[] = splitKeyValue(param, line);
                    if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
                        secondWordOfBigram = params[1];
                    } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                        bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
                                bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel,
                                bigramProbabilityInfo.mCount);
                    } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
                        bigramProbabilityInfo =
                                withHistoricalInfo(bigramProbabilityInfo, params[1], line);
                    }
                }
                if (null == secondWordOfBigram) {
                    throw new RuntimeException("Wrong format : " + line);
                }
                bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
            }
        }
        // Flush the last entry, which has no following word line to trigger it.
        if (null != word) {
            addWordWithBigrams(dict, word, probabilityInfo, shortcuts, bigrams, isNotAWord,
                    isPossiblyOffensive);
        }

        return dict;
    }

    /**
     * Writes a dictionary to a combined file.
     *
     * The header attributes are written first, followed by one formatted entry per word.
     *
     * @param destination a destination writer.
     * @param dict the dictionary to write.
     * @throws IOException if writing to {@code destination} fails.
     */
    public static void writeDictionaryCombined(final BufferedWriter destination,
            final FusionDictionary dict) throws IOException {
        final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
        for (final WordProperty wordProperty : dict) {
            // The sorted set is what orders the output: by frequency, then by asciibetic order.
            wordPropertiesInDict.add(wordProperty);
        }
        destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
        for (final WordProperty wordProperty : wordPropertiesInDict) {
            destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
        }
    }

    /**
     * Returns the first line that does not start with the comment marker, or null if the
     * end of the stream is reached before such a line is found.
     */
    private static String readFirstNonCommentLine(final BufferedReader reader)
            throws IOException {
        String line = reader.readLine();
        while (null != line && line.startsWith(COMMENT_LINE_STARTER)) {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Splits a {@code key=value} field at the first '=' and rejects fields without one.
     *
     * @param param the field to split.
     * @param line the full line the field comes from, used in the error message only.
     */
    private static String[] splitKeyValue(final String param, final String line) {
        final String params[] = param.split("=", 2);
        if (2 != params.length) {
            throw new RuntimeException("Wrong format : " + line);
        }
        return params;
    }

    /**
     * Returns a copy of {@code base} that keeps its probability but takes its three
     * historical info values from {@code value}.
     *
     * @param base the probability info whose probability is kept.
     * @param value the serialized historical info, three integers joined by the separator.
     * @param line the full line the value comes from, used in the error message only.
     */
    private static ProbabilityInfo withHistoricalInfo(final ProbabilityInfo base,
            final String value, final String line) {
        final String[] historicalInfoParams =
                value.split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
        if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
            throw new RuntimeException("Wrong format (historical info) : " + line);
        }
        return new ProbabilityInfo(base.mProbability,
                Integer.parseInt(historicalInfoParams[0]),
                Integer.parseInt(historicalInfoParams[1]),
                Integer.parseInt(historicalInfoParams[2]));
    }

    /**
     * Adds a word to the dictionary, then registers each of its bigrams.
     *
     * An empty shortcut list is passed to the dictionary as null.
     */
    private static void addWordWithBigrams(final FusionDictionary dict, final String word,
            final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcuts,
            final ArrayList<WeightedString> bigrams, final boolean isNotAWord,
            final boolean isPossiblyOffensive) {
        dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord,
                isPossiblyOffensive);
        for (final WeightedString s : bigrams) {
            dict.setBigram(word, s.mWord, s.mProbabilityInfo);
        }
    }
}