Home | History | Annotate | Download | only in latin
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin;
     18 
     19 import com.android.inputmethod.latin.FusionDictionary.WeightedString;
     20 
     21 import java.io.IOException;
     22 import java.io.InputStream;
     23 import java.io.Writer;
     24 import java.util.ArrayList;
     25 import java.util.HashMap;
     26 import java.util.TreeSet;
     27 
     28 import javax.xml.parsers.ParserConfigurationException;
     29 import javax.xml.parsers.SAXParser;
     30 import javax.xml.parsers.SAXParserFactory;
     31 
     32 import org.xml.sax.Attributes;
     33 import org.xml.sax.SAXException;
     34 import org.xml.sax.helpers.DefaultHandler;
     35 
     36 /**
     37  * Reads and writes XML files for a FusionDictionary.
     38  *
     39  * All functions in this class are static.
     40  */
     41 public class XmlDictInputOutput {
     42 
     43     private static final String WORD_TAG = "w";
     44     private static final String BIGRAM_TAG = "bigram";
     45     private static final String FREQUENCY_ATTR = "f";
     46     private static final String WORD_ATTR = "word";
     47 
     48     /**
     49      * SAX handler for a unigram XML file.
     50      */
     51     static private class UnigramHandler extends DefaultHandler {
     52         // Parser states
     53         private static final int NONE = 0;
     54         private static final int START = 1;
     55         private static final int WORD = 2;
     56         private static final int BIGRAM = 4;
     57         private static final int END = 5;
     58         private static final int UNKNOWN = 6;
     59 
     60         final FusionDictionary mDictionary;
     61         int mState; // the state of the parser
     62         int mFreq; // the currently read freq
     63         String mWord; // the current word
     64         final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
     65 
     66         /**
     67          * Create the handler.
     68          *
     69          * @param dict the dictionary to construct.
     70          * @param bigrams the bigrams as a map. This may be empty, but may not be null.
     71          */
     72         public UnigramHandler(FusionDictionary dict,
     73                 HashMap<String, ArrayList<WeightedString>> bigrams) {
     74             mDictionary = dict;
     75             mBigramsMap = bigrams;
     76             mWord = "";
     77             mState = START;
     78             mFreq = 0;
     79         }
     80 
     81         @Override
     82         public void startElement(String uri, String localName, String qName, Attributes attrs) {
     83             if (WORD_TAG.equals(localName)) {
     84                 mState = WORD;
     85                 mWord = "";
     86                 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
     87                     final String attrName = attrs.getLocalName(attrIndex);
     88                     if (FREQUENCY_ATTR.equals(attrName)) {
     89                         mFreq = Integer.parseInt(attrs.getValue(attrIndex));
     90                     }
     91                 }
     92             } else {
     93                 mState = UNKNOWN;
     94             }
     95         }
     96 
     97         @Override
     98         public void characters(char[] ch, int start, int length) {
     99             if (WORD == mState) {
    100                 // The XML parser is free to return text in arbitrary chunks one after the
    101                 // other. In particular, this happens in some implementations when it finds
    102                 // an escape code like "&amp;".
    103                 mWord += String.copyValueOf(ch, start, length);
    104             }
    105         }
    106 
    107         @Override
    108         public void endElement(String uri, String localName, String qName) {
    109             if (WORD == mState) {
    110                 mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord));
    111                 mState = START;
    112             }
    113         }
    114     }
    115 
    116     /**
    117      * SAX handler for a bigram XML file.
    118      */
    119     static private class BigramHandler extends DefaultHandler {
    120         private final static String BIGRAM_W1_TAG = "bi";
    121         private final static String BIGRAM_W2_TAG = "w";
    122         private final static String BIGRAM_W1_ATTRIBUTE = "w1";
    123         private final static String BIGRAM_W2_ATTRIBUTE = "w2";
    124         private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
    125 
    126         String mW1;
    127         final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
    128 
    129         public BigramHandler() {
    130             mW1 = null;
    131             mBigramsMap = new HashMap<String, ArrayList<WeightedString>>();
    132         }
    133 
    134         @Override
    135         public void startElement(String uri, String localName, String qName, Attributes attrs) {
    136             if (BIGRAM_W1_TAG.equals(localName)) {
    137                 mW1 = attrs.getValue(uri, BIGRAM_W1_ATTRIBUTE);
    138             } else if (BIGRAM_W2_TAG.equals(localName)) {
    139                 String w2 = attrs.getValue(uri, BIGRAM_W2_ATTRIBUTE);
    140                 int freq = Integer.parseInt(attrs.getValue(uri, BIGRAM_FREQ_ATTRIBUTE));
    141                 WeightedString bigram = new WeightedString(w2, freq / 8);
    142                 ArrayList<WeightedString> bigramList = mBigramsMap.get(mW1);
    143                 if (null == bigramList) bigramList = new ArrayList<WeightedString>();
    144                 bigramList.add(bigram);
    145                 mBigramsMap.put(mW1, bigramList);
    146             }
    147         }
    148 
    149         public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
    150             return mBigramsMap;
    151         }
    152     }
    153 
    154     /**
    155      * Reads a dictionary from an XML file.
    156      *
    157      * This is the public method that will parse an XML file and return the corresponding memory
    158      * representation.
    159      *
    160      * @param unigrams the file to read the data from.
    161      * @return the in-memory representation of the dictionary.
    162      */
    163     public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams)
    164             throws SAXException, IOException, ParserConfigurationException {
    165         final SAXParserFactory factory = SAXParserFactory.newInstance();
    166         factory.setNamespaceAware(true);
    167         final SAXParser parser = factory.newSAXParser();
    168         final BigramHandler bigramHandler = new BigramHandler();
    169         if (null != bigrams) parser.parse(bigrams, bigramHandler);
    170 
    171         final FusionDictionary dict = new FusionDictionary();
    172         final UnigramHandler unigramHandler =
    173                 new UnigramHandler(dict, bigramHandler.getBigramMap());
    174         parser.parse(unigrams, unigramHandler);
    175         return dict;
    176     }
    177 
    178     /**
    179      * Reads a dictionary in the first, legacy XML format
    180      *
    181      * This method reads data from the parser and creates a new FusionDictionary with it.
    182      * The format parsed by this method is the format used before Ice Cream Sandwich,
    183      * which has no support for bigrams or shortcuts.
    184      * It is important to note that this method expects the parser to have already eaten
    185      * the first, all-encompassing tag.
    186      *
    187      * @param xpp the parser to read the data from.
    188      * @return the parsed dictionary.
    189      */
    190 
    191     /**
    192      * Writes a dictionary to an XML file.
    193      *
    194      * The output format is the "second" format, which supports bigrams and shortcuts.
    195      *
    196      * @param destination a destination stream to write to.
    197      * @param dict the dictionary to write.
    198      */
    199     public static void writeDictionaryXml(Writer destination, FusionDictionary dict)
    200             throws IOException {
    201         final TreeSet<Word> set = new TreeSet<Word>();
    202         for (Word word : dict) {
    203             set.add(word);
    204         }
    205         // TODO: use an XMLSerializer if this gets big
    206         destination.write("<wordlist format=\"2\">\n");
    207         for (Word word : set) {
    208             destination.write("  <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
    209                     + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">");
    210             if (null != word.mBigrams) {
    211                 destination.write("\n");
    212                 for (WeightedString bigram : word.mBigrams) {
    213                     destination.write("    <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
    214                             + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n");
    215                 }
    216                 destination.write("  ");
    217             }
    218             destination.write("</" + WORD_TAG + ">\n");
    219         }
    220         destination.write("</wordlist>\n");
    221         destination.close();
    222     }
    223 }
    224