1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import com.android.inputmethod.latin.FusionDictionary.WeightedString; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.io.Writer; 24 import java.util.ArrayList; 25 import java.util.HashMap; 26 import java.util.TreeSet; 27 28 import javax.xml.parsers.ParserConfigurationException; 29 import javax.xml.parsers.SAXParser; 30 import javax.xml.parsers.SAXParserFactory; 31 32 import org.xml.sax.Attributes; 33 import org.xml.sax.SAXException; 34 import org.xml.sax.helpers.DefaultHandler; 35 36 /** 37 * Reads and writes XML files for a FusionDictionary. 38 * 39 * All functions in this class are static. 40 */ 41 public class XmlDictInputOutput { 42 43 private static final String WORD_TAG = "w"; 44 private static final String BIGRAM_TAG = "bigram"; 45 private static final String FREQUENCY_ATTR = "f"; 46 private static final String WORD_ATTR = "word"; 47 48 /** 49 * SAX handler for a unigram XML file. 50 */ 51 static private class UnigramHandler extends DefaultHandler { 52 // Parser states 53 private static final int NONE = 0; 54 private static final int START = 1; 55 private static final int WORD = 2; 56 private static final int BIGRAM = 4; 57 private static final int END = 5; 58 private static final int UNKNOWN = 6; 59 60 final FusionDictionary mDictionary; 61 int mState; // the state of the parser 62 int mFreq; // the currently read freq 63 String mWord; // the current word 64 final HashMap<String, ArrayList<WeightedString>> mBigramsMap; 65 66 /** 67 * Create the handler. 68 * 69 * @param dict the dictionary to construct. 70 * @param bigrams the bigrams as a map. This may be empty, but may not be null. 71 */ 72 public UnigramHandler(FusionDictionary dict, 73 HashMap<String, ArrayList<WeightedString>> bigrams) { 74 mDictionary = dict; 75 mBigramsMap = bigrams; 76 mWord = ""; 77 mState = START; 78 mFreq = 0; 79 } 80 81 @Override 82 public void startElement(String uri, String localName, String qName, Attributes attrs) { 83 if (WORD_TAG.equals(localName)) { 84 mState = WORD; 85 mWord = ""; 86 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { 87 final String attrName = attrs.getLocalName(attrIndex); 88 if (FREQUENCY_ATTR.equals(attrName)) { 89 mFreq = Integer.parseInt(attrs.getValue(attrIndex)); 90 } 91 } 92 } else { 93 mState = UNKNOWN; 94 } 95 } 96 97 @Override 98 public void characters(char[] ch, int start, int length) { 99 if (WORD == mState) { 100 // The XML parser is free to return text in arbitrary chunks one after the 101 // other. In particular, this happens in some implementations when it finds 102 // an escape code like "&". 103 mWord += String.copyValueOf(ch, start, length); 104 } 105 } 106 107 @Override 108 public void endElement(String uri, String localName, String qName) { 109 if (WORD == mState) { 110 mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord)); 111 mState = START; 112 } 113 } 114 } 115 116 /** 117 * SAX handler for a bigram XML file. 118 */ 119 static private class BigramHandler extends DefaultHandler { 120 private final static String BIGRAM_W1_TAG = "bi"; 121 private final static String BIGRAM_W2_TAG = "w"; 122 private final static String BIGRAM_W1_ATTRIBUTE = "w1"; 123 private final static String BIGRAM_W2_ATTRIBUTE = "w2"; 124 private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; 125 126 String mW1; 127 final HashMap<String, ArrayList<WeightedString>> mBigramsMap; 128 129 public BigramHandler() { 130 mW1 = null; 131 mBigramsMap = new HashMap<String, ArrayList<WeightedString>>(); 132 } 133 134 @Override 135 public void startElement(String uri, String localName, String qName, Attributes attrs) { 136 if (BIGRAM_W1_TAG.equals(localName)) { 137 mW1 = attrs.getValue(uri, BIGRAM_W1_ATTRIBUTE); 138 } else if (BIGRAM_W2_TAG.equals(localName)) { 139 String w2 = attrs.getValue(uri, BIGRAM_W2_ATTRIBUTE); 140 int freq = Integer.parseInt(attrs.getValue(uri, BIGRAM_FREQ_ATTRIBUTE)); 141 WeightedString bigram = new WeightedString(w2, freq / 8); 142 ArrayList<WeightedString> bigramList = mBigramsMap.get(mW1); 143 if (null == bigramList) bigramList = new ArrayList<WeightedString>(); 144 bigramList.add(bigram); 145 mBigramsMap.put(mW1, bigramList); 146 } 147 } 148 149 public HashMap<String, ArrayList<WeightedString>> getBigramMap() { 150 return mBigramsMap; 151 } 152 } 153 154 /** 155 * Reads a dictionary from an XML file. 156 * 157 * This is the public method that will parse an XML file and return the corresponding memory 158 * representation. 159 * 160 * @param unigrams the file to read the data from. 161 * @return the in-memory representation of the dictionary. 162 */ 163 public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams) 164 throws SAXException, IOException, ParserConfigurationException { 165 final SAXParserFactory factory = SAXParserFactory.newInstance(); 166 factory.setNamespaceAware(true); 167 final SAXParser parser = factory.newSAXParser(); 168 final BigramHandler bigramHandler = new BigramHandler(); 169 if (null != bigrams) parser.parse(bigrams, bigramHandler); 170 171 final FusionDictionary dict = new FusionDictionary(); 172 final UnigramHandler unigramHandler = 173 new UnigramHandler(dict, bigramHandler.getBigramMap()); 174 parser.parse(unigrams, unigramHandler); 175 return dict; 176 } 177 178 /** 179 * Reads a dictionary in the first, legacy XML format 180 * 181 * This method reads data from the parser and creates a new FusionDictionary with it. 182 * The format parsed by this method is the format used before Ice Cream Sandwich, 183 * which has no support for bigrams or shortcuts. 184 * It is important to note that this method expects the parser to have already eaten 185 * the first, all-encompassing tag. 186 * 187 * @param xpp the parser to read the data from. 188 * @return the parsed dictionary. 189 */ 190 191 /** 192 * Writes a dictionary to an XML file. 193 * 194 * The output format is the "second" format, which supports bigrams and shortcuts. 195 * 196 * @param destination a destination stream to write to. 197 * @param dict the dictionary to write. 198 */ 199 public static void writeDictionaryXml(Writer destination, FusionDictionary dict) 200 throws IOException { 201 final TreeSet<Word> set = new TreeSet<Word>(); 202 for (Word word : dict) { 203 set.add(word); 204 } 205 // TODO: use an XMLSerializer if this gets big 206 destination.write("<wordlist format=\"2\">\n"); 207 for (Word word : set) { 208 destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " 209 + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">"); 210 if (null != word.mBigrams) { 211 destination.write("\n"); 212 for (WeightedString bigram : word.mBigrams) { 213 destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\"" 214 + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n"); 215 } 216 destination.write(" "); 217 } 218 destination.write("</" + WORD_TAG + ">\n"); 219 } 220 destination.write("</wordlist>\n"); 221 destination.close(); 222 } 223 } 224