/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
     16 
     17 package com.android.inputmethod.latin.dicttool;
     18 
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
     43 
     44 /**
     45  * Reads and writes XML files for a FusionDictionary.
     46  *
     47  * All functions in this class are static.
     48  */
     49 public class XmlDictInputOutput {
     50 
     51     private static final String ROOT_TAG = "wordlist";
     52     private static final String WORD_TAG = "w";
     53     private static final String BIGRAM_TAG = "bigram";
     54     private static final String SHORTCUT_TAG = "shortcut";
     55     private static final String PROBABILITY_ATTR = "f";
     56     private static final String WORD_ATTR = "word";
     57     private static final String NOT_A_WORD_ATTR = "not_a_word";
     58 
     59     /**
     60      * SAX handler for a unigram XML file.
     61      */
     62     static private class UnigramHandler extends DefaultHandler {
     63         // Parser states
     64         private static final int START = 1;
     65         private static final int WORD = 2;
     66         private static final int UNKNOWN = 3;
     67         private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1;
     68 
     69         FusionDictionary mDictionary;
     70         int mState; // the state of the parser
     71         int mFreq; // the currently read freq
     72         String mWord; // the current word
     73         final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
     74 
     75         /**
     76          * Create the handler.
     77          *
     78          * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
     79          */
     80         public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
     81             mDictionary = null;
     82             mShortcutsMap = shortcuts;
     83             mWord = "";
     84             mState = START;
     85             mFreq = 0;
     86         }
     87 
     88         public FusionDictionary getFinalDictionary() {
     89             final FusionDictionary dict = mDictionary;
     90             for (final String shortcutOnly : mShortcutsMap.keySet()) {
     91                 if (dict.hasWord(shortcutOnly)) continue;
     92                 dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
     93                         mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
     94             }
     95             mDictionary = null;
     96             mShortcutsMap.clear();
     97             mWord = "";
     98             mState = START;
     99             mFreq = 0;
    100             return dict;
    101         }
    102 
    103         @Override
    104         public void startElement(String uri, String localName, String qName, Attributes attrs) {
    105             if (WORD_TAG.equals(localName)) {
    106                 mState = WORD;
    107                 mWord = "";
    108                 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
    109                     final String attrName = attrs.getLocalName(attrIndex);
    110                     if (PROBABILITY_ATTR.equals(attrName)) {
    111                         mFreq = Integer.parseInt(attrs.getValue(attrIndex));
    112                     }
    113                 }
    114             } else if (ROOT_TAG.equals(localName)) {
    115                 final HashMap<String, String> attributes = new HashMap<>();
    116                 for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
    117                     final String attrName = attrs.getLocalName(attrIndex);
    118                     attributes.put(attrName, attrs.getValue(attrIndex));
    119                 }
    120                 mDictionary = new FusionDictionary(new PtNodeArray(),
    121                         new DictionaryOptions(attributes));
    122             } else {
    123                 mState = UNKNOWN;
    124             }
    125         }
    126 
    127         @Override
    128         public void characters(char[] ch, int start, int length) {
    129             if (WORD == mState) {
    130                 // The XML parser is free to return text in arbitrary chunks one after the
    131                 // other. In particular, this happens in some implementations when it finds
    132                 // an escape code like "&amp;".
    133                 mWord += String.copyValueOf(ch, start, length);
    134             }
    135         }
    136 
    137         @Override
    138         public void endElement(String uri, String localName, String qName) {
    139             if (WORD == mState) {
    140                 mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
    141                         false /* isNotAWord */);
    142                 mState = START;
    143             }
    144         }
    145     }
    146 
    147     static private class AssociativeListHandler extends DefaultHandler {
    148         private final String SRC_TAG;
    149         private final String SRC_ATTRIBUTE;
    150         private final String DST_TAG;
    151         private final String DST_ATTRIBUTE;
    152         private final String DST_FREQ;
    153 
    154         // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
    155         private final static int XML_MAX = 256;
    156         // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
    157         private final static int MEMORY_MAX = 256;
    158         private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;
    159 
    160         private String mSrc;
    161         private final HashMap<String, ArrayList<WeightedString>> mAssocMap;
    162 
    163         public AssociativeListHandler(final String srcTag, final String srcAttribute,
    164                 final String dstTag, final String dstAttribute, final String dstFreq) {
    165             SRC_TAG = srcTag;
    166             SRC_ATTRIBUTE = srcAttribute;
    167             DST_TAG = dstTag;
    168             DST_ATTRIBUTE = dstAttribute;
    169             DST_FREQ = dstFreq;
    170             mSrc = null;
    171             mAssocMap = new HashMap<>();
    172         }
    173 
    174         @Override
    175         public void startElement(String uri, String localName, String qName, Attributes attrs) {
    176             if (SRC_TAG.equals(localName)) {
    177                 mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
    178             } else if (DST_TAG.equals(localName)) {
    179                 String dst = attrs.getValue(uri, DST_ATTRIBUTE);
    180                 int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
    181                 WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
    182                 ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
    183                 if (null == bigramList) bigramList = new ArrayList<>();
    184                 bigramList.add(bigram);
    185                 mAssocMap.put(mSrc, bigramList);
    186             }
    187         }
    188 
    189         protected int getValueFromFreqString(final String freqString) {
    190             return Integer.parseInt(freqString);
    191         }
    192 
    193         // This may return an empty map, but will never return null.
    194         public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
    195             return mAssocMap;
    196         }
    197     }
    198 
    199     /**
    200      * SAX handler for a bigram XML file.
    201      */
    202     static private class BigramHandler extends AssociativeListHandler {
    203         private final static String BIGRAM_W1_TAG = "bi";
    204         private final static String BIGRAM_W2_TAG = "w";
    205         private final static String BIGRAM_W1_ATTRIBUTE = "w1";
    206         private final static String BIGRAM_W2_ATTRIBUTE = "w2";
    207         private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
    208 
    209         public BigramHandler() {
    210             super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
    211                     BIGRAM_FREQ_ATTRIBUTE);
    212         }
    213 
    214         // As per getAssocMap(), this never returns null.
    215         public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
    216             return getAssocMap();
    217         }
    218     }
    219 
    220     /**
    221      * SAX handler for a shortcut & whitelist XML file.
    222      */
    223     static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
    224         private final static String ENTRY_TAG = "entry";
    225         private final static String ENTRY_ATTRIBUTE = "shortcut";
    226         private final static String TARGET_TAG = "target";
    227         private final static String REPLACEMENT_ATTRIBUTE = "replacement";
    228         private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
    229         private final static String WHITELIST_MARKER = "whitelist";
    230         private final static int WHITELIST_FREQ_VALUE = 15;
    231         private final static int MIN_FREQ = 0;
    232         private final static int MAX_FREQ = 14;
    233 
    234         public ShortcutAndWhitelistHandler() {
    235             super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
    236                     TARGET_PRIORITY_ATTRIBUTE);
    237         }
    238 
    239         @Override
    240         protected int getValueFromFreqString(final String freqString) {
    241             if (WHITELIST_MARKER.equals(freqString)) {
    242                 return WHITELIST_FREQ_VALUE;
    243             }
    244             final int intValue = super.getValueFromFreqString(freqString);
    245             if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
    246                 throw new RuntimeException("Shortcut freq out of range. Accepted range is "
    247                         + MIN_FREQ + ".." + MAX_FREQ);
    248             }
    249             return intValue;
    250         }
    251 
    252         // As per getAssocMap(), this never returns null.
    253         public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
    254             return getAssocMap();
    255         }
    256     }
    257 
    258     /**
    259      * Basic test to find out whether the file is in the unigram XML format or not.
    260      *
    261      * Concretely this only tests the header line.
    262      *
    263      * @param filename The name of the file to test.
    264      * @return true if the file is in the unigram XML format, false otherwise
    265      */
    266     public static boolean isXmlUnigramDictionary(final String filename) {
    267         try (final BufferedReader reader = new BufferedReader(
    268                 new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
    269             final String firstLine = reader.readLine();
    270             return firstLine.matches("^\\s*<wordlist .*>\\s*$");
    271         } catch (final IOException e) {
    272             return false;
    273         }
    274     }
    275 
    276     /**
    277      * Reads a dictionary from an XML file.
    278      *
    279      * This is the public method that will parse an XML file and return the corresponding memory
    280      * representation.
    281      *
    282      * @param unigrams the file to read the data from.
    283      * @param shortcuts the file to read the shortcuts & whitelist from, or null.
    284      * @param bigrams the file to read the bigrams from, or null.
    285      * @return the in-memory representation of the dictionary.
    286      */
    287     public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams,
    288             final BufferedInputStream shortcuts, final BufferedInputStream bigrams)
    289             throws SAXException, IOException, ParserConfigurationException {
    290         final SAXParserFactory factory = SAXParserFactory.newInstance();
    291         factory.setNamespaceAware(true);
    292         final SAXParser parser = factory.newSAXParser();
    293         final BigramHandler bigramHandler = new BigramHandler();
    294         if (null != bigrams) parser.parse(bigrams, bigramHandler);
    295 
    296         final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
    297                 new ShortcutAndWhitelistHandler();
    298         if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);
    299 
    300         final UnigramHandler unigramHandler =
    301                 new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
    302         parser.parse(unigrams, unigramHandler);
    303         final FusionDictionary dict = unigramHandler.getFinalDictionary();
    304         final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
    305         for (final String firstWord : bigramMap.keySet()) {
    306             if (!dict.hasWord(firstWord)) continue;
    307             final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
    308             for (final WeightedString bigram : bigramList) {
    309                 if (!dict.hasWord(bigram.mWord)) continue;
    310                 dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo);
    311             }
    312         }
    313         return dict;
    314     }
    315 
    /*
     * NOTE: the description below documents a reader for the first, legacy XML format. No such
     * method is defined in this class; the text is kept for reference only and is written as
     * a plain comment so that it is not mistaken for the documentation of another method.
     *
     * Reads a dictionary in the first, legacy XML format.
     *
     * This method reads data from the parser and creates a new FusionDictionary with it.
     * The format parsed by this method is the format used before Ice Cream Sandwich,
     * which has no support for bigrams or shortcuts/whitelist.
     * It is important to note that this method expects the parser to have already eaten
     * the first, all-encompassing tag.
     *
     * @param xpp the parser to read the data from.
     * @return the parsed dictionary.
     */
    328 
    329     /**
    330      * Writes a dictionary to an XML file.
    331      *
    332      * The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
    333      *
    334      * @param destination a destination stream to write to.
    335      * @param dict the dictionary to write.
    336      */
    337     public static void writeDictionaryXml(final BufferedWriter destination,
    338             final FusionDictionary dict) throws IOException {
    339         final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
    340         for (WordProperty wordProperty : dict) {
    341             wordPropertiesInDict.add(wordProperty);
    342         }
    343         // TODO: use an XMLSerializer if this gets big
    344         destination.write("<wordlist format=\"2\"");
    345         for (final String key : dict.mOptions.mAttributes.keySet()) {
    346             final String value = dict.mOptions.mAttributes.get(key);
    347             destination.write(" " + key + "=\"" + value + "\"");
    348         }
    349         destination.write(">\n");
    350         destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
    351         for (WordProperty wordProperty : wordPropertiesInDict) {
    352             destination.write("  <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord
    353                     + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability()
    354                     + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "")
    355                     + "\">");
    356             if (null != wordProperty.mShortcutTargets) {
    357                 destination.write("\n");
    358                 for (WeightedString target : wordProperty.mShortcutTargets) {
    359                     destination.write("    <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\""
    360                             + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG
    361                             + ">\n");
    362                 }
    363                 destination.write("  ");
    364             }
    365             if (null != wordProperty.mBigrams) {
    366                 destination.write("\n");
    367                 for (WeightedString bigram : wordProperty.mBigrams) {
    368                     destination.write("    <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\""
    369                             + bigram.getProbability() + "\">" + bigram.mWord
    370                             + "</" + BIGRAM_TAG + ">\n");
    371                 }
    372                 destination.write("  ");
    373             }
    374             destination.write("</" + WORD_TAG + ">\n");
    375         }
    376         destination.write("</wordlist>\n");
    377         destination.close();
    378     }
    379 }
    380