/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * Reads and writes XML files for a FusionDictionary.
 *
 * All functions in this class are static.
 */
public class XmlDictInputOutput {

    private static final String ROOT_TAG = "wordlist";
    private static final String WORD_TAG = "w";
    private static final String BIGRAM_TAG = "bigram";
    private static final String SHORTCUT_TAG = "shortcut";
    private static final String PROBABILITY_ATTR = "f";
    private static final String WORD_ATTR = "word";
    private static final String NOT_A_WORD_ATTR = "not_a_word";

    /**
     * SAX handler for a unigram XML file.
     *
     * The dictionary is created when the root tag is encountered; every word element is then
     * added to it, together with the shortcuts registered for that word, if any.
     */
    static private class UnigramHandler extends DefaultHandler {
        // Parser states
        private static final int START = 1;
        private static final int WORD = 2;
        private static final int UNKNOWN = 3;
        private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1;

        FusionDictionary mDictionary;
        int mState; // the state of the parser
        int mFreq; // the currently read freq
        String mWord; // the current word
        final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;

        /**
         * Create the handler.
         *
         * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
         */
        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
            mDictionary = null;
            mShortcutsMap = shortcuts;
            mWord = "";
            mState = START;
            mFreq = 0;
        }

        /**
         * Finishes the dictionary and resets this handler.
         *
         * Every key of the shortcut map that is not already a word of the dictionary is added
         * as a "not a word" entry with probability SHORTCUT_ONLY_WORD_PROBABILITY, so that its
         * shortcut targets are not lost. The shortcut map passed to the constructor is cleared
         * as a side effect.
         *
         * @return the dictionary that was built while parsing.
         */
        public FusionDictionary getFinalDictionary() {
            final FusionDictionary dict = mDictionary;
            for (final String shortcutOnly : mShortcutsMap.keySet()) {
                if (dict.hasWord(shortcutOnly)) continue;
                dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
                        mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
            }
            mDictionary = null;
            mShortcutsMap.clear();
            mWord = "";
            mState = START;
            mFreq = 0;
            return dict;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs) {
            if (WORD_TAG.equals(localName)) {
                mState = WORD;
                mWord = "";
                // mFreq is only overwritten when the probability attribute is present.
                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
                    final String attrName = attrs.getLocalName(attrIndex);
                    if (PROBABILITY_ATTR.equals(attrName)) {
                        mFreq = Integer.parseInt(attrs.getValue(attrIndex));
                    }
                }
            } else if (ROOT_TAG.equals(localName)) {
                // All attributes of the root tag become the options of the new dictionary.
                final HashMap<String, String> attributes = new HashMap<>();
                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
                    final String attrName = attrs.getLocalName(attrIndex);
                    attributes.put(attrName, attrs.getValue(attrIndex));
                }
                mDictionary = new FusionDictionary(new PtNodeArray(),
                        new DictionaryOptions(attributes));
            } else {
                mState = UNKNOWN;
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            if (WORD == mState) {
                // The XML parser is free to return text in arbitrary chunks one after the
                // other. In particular, this happens in some implementations when it finds
                // an escape code like "&amp;".
                mWord += String.copyValueOf(ch, start, length);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) {
            if (WORD == mState) {
                mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
                        false /* isNotAWord */);
                mState = START;
            }
        }
    }

    /**
     * SAX handler for files that associate a source string with a list of weighted targets.
     *
     * A source element sets the current source; each subsequent destination element appends
     * a WeightedString to the list associated with that source.
     */
    static private class AssociativeListHandler extends DefaultHandler {
        private final String SRC_TAG;
        private final String SRC_ATTRIBUTE;
        private final String DST_TAG;
        private final String DST_ATTRIBUTE;
        private final String DST_FREQ;

        // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
        private final static int XML_MAX = 256;
        // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
        private final static int MEMORY_MAX = 256;
        // With the current values this ratio is 1, so frequencies are stored unchanged.
        private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;

        private String mSrc;
        private final HashMap<String, ArrayList<WeightedString>> mAssocMap;

        /**
         * Create the handler.
         *
         * @param srcTag the name of the element that introduces a source.
         * @param srcAttribute the attribute of the source element holding the source string.
         * @param dstTag the name of the element that describes a target.
         * @param dstAttribute the attribute of the target element holding the target string.
         * @param dstFreq the attribute of the target element holding its frequency.
         */
        public AssociativeListHandler(final String srcTag, final String srcAttribute,
                final String dstTag, final String dstAttribute, final String dstFreq) {
            SRC_TAG = srcTag;
            SRC_ATTRIBUTE = srcAttribute;
            DST_TAG = dstTag;
            DST_ATTRIBUTE = dstAttribute;
            DST_FREQ = dstFreq;
            mSrc = null;
            mAssocMap = new HashMap<>();
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs) {
            if (SRC_TAG.equals(localName)) {
                mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
            } else if (DST_TAG.equals(localName)) {
                String dst = attrs.getValue(uri, DST_ATTRIBUTE);
                int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
                WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
                ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
                if (null == bigramList) bigramList = new ArrayList<>();
                bigramList.add(bigram);
                mAssocMap.put(mSrc, bigramList);
            }
        }

        /**
         * Converts the frequency attribute to an int. Subclasses may override this to accept
         * special values.
         *
         * @param freqString the raw value of the frequency attribute.
         * @return the frequency as an int.
         */
        protected int getValueFromFreqString(final String freqString) {
            return Integer.parseInt(freqString);
        }

        // This may return an empty map, but will never return null.
        public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
            return mAssocMap;
        }
    }

    /**
     * SAX handler for a bigram XML file.
     */
    static private class BigramHandler extends AssociativeListHandler {
        private final static String BIGRAM_W1_TAG = "bi";
        private final static String BIGRAM_W2_TAG = "w";
        private final static String BIGRAM_W1_ATTRIBUTE = "w1";
        private final static String BIGRAM_W2_ATTRIBUTE = "w2";
        private final static String BIGRAM_FREQ_ATTRIBUTE = "p";

        public BigramHandler() {
            super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
                    BIGRAM_FREQ_ATTRIBUTE);
        }

        // As per getAssocMap(), this never returns null.
        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
            return getAssocMap();
        }
    }

    /**
     * SAX handler for a shortcut & whitelist XML file.
     */
    static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
        private final static String ENTRY_TAG = "entry";
        private final static String ENTRY_ATTRIBUTE = "shortcut";
        private final static String TARGET_TAG = "target";
        private final static String REPLACEMENT_ATTRIBUTE = "replacement";
        private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
        private final static String WHITELIST_MARKER = "whitelist";
        private final static int WHITELIST_FREQ_VALUE = 15;
        private final static int MIN_FREQ = 0;
        private final static int MAX_FREQ = 14;

        public ShortcutAndWhitelistHandler() {
            super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
                    TARGET_PRIORITY_ATTRIBUTE);
        }

        /**
         * Converts the priority attribute to an int.
         *
         * The special value WHITELIST_MARKER maps to WHITELIST_FREQ_VALUE; any other value
         * must be an integer in MIN_FREQ..MAX_FREQ.
         *
         * @param freqString the raw value of the priority attribute.
         * @return the priority as an int.
         * @throws RuntimeException if the value is outside the accepted range.
         */
        @Override
        protected int getValueFromFreqString(final String freqString) {
            if (WHITELIST_MARKER.equals(freqString)) {
                return WHITELIST_FREQ_VALUE;
            }
            final int intValue = super.getValueFromFreqString(freqString);
            if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
                throw new RuntimeException("Shortcut freq out of range. Accepted range is "
                        + MIN_FREQ + ".." + MAX_FREQ);
            }
            return intValue;
        }

        // As per getAssocMap(), this never returns null.
        public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
            return getAssocMap();
        }
    }

    /**
     * Basic test to find out whether the file is in the unigram XML format or not.
     *
     * Concretely this only tests the header line.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the unigram XML format, false otherwise. An empty or
     * unreadable file is reported as not being in the unigram XML format.
     */
    public static boolean isXmlUnigramDictionary(final String filename) {
        try (final BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
            final String firstLine = reader.readLine();
            // readLine() returns null when the file is empty.
            if (null == firstLine) return false;
            return firstLine.matches("^\\s*<wordlist .*>\\s*$");
        } catch (final IOException e) {
            return false;
        }
    }

    /**
     * Reads a dictionary from an XML file.
     *
     * This is the public method that will parse an XML file and return the corresponding memory
     * representation. Bigrams are only set when both of their words are present in the
     * dictionary built from the unigram file.
     *
     * @param unigrams the file to read the data from.
     * @param shortcuts the file to read the shortcuts & whitelist from, or null.
     * @param bigrams the file to read the bigrams from, or null.
     * @return the in-memory representation of the dictionary.
     */
    public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams,
            final BufferedInputStream shortcuts, final BufferedInputStream bigrams)
            throws SAXException, IOException, ParserConfigurationException {
        final SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        final SAXParser parser = factory.newSAXParser();
        final BigramHandler bigramHandler = new BigramHandler();
        if (null != bigrams) parser.parse(bigrams, bigramHandler);

        final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
                new ShortcutAndWhitelistHandler();
        if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);

        // The shortcuts must be known before the unigrams are parsed, because the unigram
        // handler attaches them to each word as it adds it.
        final UnigramHandler unigramHandler =
                new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
        parser.parse(unigrams, unigramHandler);
        final FusionDictionary dict = unigramHandler.getFinalDictionary();
        final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
        for (final String firstWord : bigramMap.keySet()) {
            if (!dict.hasWord(firstWord)) continue;
            final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
            for (final WeightedString bigram : bigramList) {
                if (!dict.hasWord(bigram.mWord)) continue;
                dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo);
            }
        }
        return dict;
    }

    // This class has no reader for the first, legacy XML format (the format used before
    // Ice Cream Sandwich, which has no support for bigrams or shortcuts/whitelist).

    /**
     * Writes a dictionary to an XML file.
     *
     * The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
     * Words are written in the natural ordering of WordProperty. The characters that are
     * special in XML are escaped in words, shortcut targets, bigram targets and option values.
     * The destination is always closed by this method, whether writing succeeds or fails.
     *
     * @param destination a destination stream to write to.
     * @param dict the dictionary to write.
     * @throws IOException if writing to or closing the destination fails.
     */
    public static void writeDictionaryXml(final BufferedWriter destination,
            final FusionDictionary dict) throws IOException {
        // try-with-resources so the writer is closed even when a write fails midway.
        try (final BufferedWriter out = destination) {
            final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
            for (WordProperty wordProperty : dict) {
                wordPropertiesInDict.add(wordProperty);
            }
            // TODO: use an XMLSerializer if this gets big
            out.write("<wordlist format=\"2\"");
            for (final String key : dict.mOptions.mAttributes.keySet()) {
                final String value = dict.mOptions.mAttributes.get(key);
                out.write(" " + key + "=\"" + escapeXml(value) + "\"");
            }
            out.write(">\n");
            out.write("<!-- Warning: there is no code to read this format yet. -->\n");
            for (WordProperty wordProperty : wordPropertiesInDict) {
                out.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\""
                        + escapeXml(wordProperty.mWord)
                        + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability()
                        + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "")
                        + "\">");
                if (null != wordProperty.mShortcutTargets) {
                    out.write("\n");
                    for (WeightedString target : wordProperty.mShortcutTargets) {
                        out.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\""
                                + target.getProbability() + "\">" + escapeXml(target.mWord)
                                + "</" + SHORTCUT_TAG + ">\n");
                    }
                    out.write(" ");
                }
                if (null != wordProperty.mBigrams) {
                    out.write("\n");
                    for (WeightedString bigram : wordProperty.mBigrams) {
                        out.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\""
                                + bigram.getProbability() + "\">" + escapeXml(bigram.mWord)
                                + "</" + BIGRAM_TAG + ">\n");
                    }
                    out.write(" ");
                }
                out.write("</" + WORD_TAG + ">\n");
            }
            out.write("</wordlist>\n");
        }
    }

    /**
     * Escapes the characters that would break XML markup when written inside element text
     * or inside a double-quoted attribute value.
     *
     * @param text the raw text; null is rendered as "null", as string concatenation would.
     * @return the text with &amp;, &lt;, &gt; and double quotes replaced by entities.
     */
    private static String escapeXml(final String text) {
        final String raw = String.valueOf(text);
        final StringBuilder escaped = new StringBuilder(raw.length());
        for (int i = 0; i < raw.length(); ++i) {
            final char c = raw.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }
}