Home | History | Annotate | Download | only in dicttool
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.dicttool;
     18 
     19 import com.android.inputmethod.latin.makedict.FormatSpec;
     20 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
     21 import com.android.inputmethod.latin.makedict.FusionDictionary;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
     23 import com.android.inputmethod.latin.makedict.ProbabilityInfo;
     24 import com.android.inputmethod.latin.makedict.WeightedString;
     25 import com.android.inputmethod.latin.makedict.WordProperty;
     26 import com.android.inputmethod.latin.utils.CombinedFormatUtils;
     27 
     28 import java.io.BufferedReader;
     29 import java.io.BufferedWriter;
     30 import java.io.FileReader;
     31 import java.io.IOException;
     32 import java.util.ArrayList;
     33 import java.util.HashMap;
     34 import java.util.TreeSet;
     35 
     36 /**
     37  * Reads and writes combined format for a FusionDictionary.
     38  *
     39  * All functions in this class are static.
     40  */
     41 public class CombinedInputOutput {
     42     private static final String WHITELIST_TAG = "whitelist";
     43     private static final String OPTIONS_TAG = "options";
     44     private static final String COMMENT_LINE_STARTER = "#";
     45     private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;
     46 
     47     /**
     48      * Basic test to find out whether the file is in the combined format or not.
     49      *
     50      * Concretely this only tests the header line.
     51      *
     52      * @param filename The name of the file to test.
     53      * @return true if the file is in the combined format, false otherwise
     54      */
     55     public static boolean isCombinedDictionary(final String filename) {
     56         try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
     57             String firstLine = reader.readLine();
     58             while (firstLine.startsWith(COMMENT_LINE_STARTER)) {
     59                 firstLine = reader.readLine();
     60             }
     61             return firstLine.matches(
     62                     "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
     63         } catch (final IOException e) {
     64             return false;
     65         }
     66     }
     67 
     68     /**
     69      * Reads a dictionary from a combined format file.
     70      *
     71      * This is the public method that will read a combined file and return the corresponding memory
     72      * representation.
     73      *
     74      * @param reader the buffered reader to read the data from.
     75      * @return the in-memory representation of the dictionary.
     76      */
     77     public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
     78             throws IOException {
     79         String headerLine = reader.readLine();
     80         while (headerLine.startsWith(COMMENT_LINE_STARTER)) {
     81             headerLine = reader.readLine();
     82         }
     83         final String header[] = headerLine.split(",");
     84         final HashMap<String, String> attributes = new HashMap<>();
     85         for (String item : header) {
     86             final String keyValue[] = item.split("=");
     87             if (2 != keyValue.length) {
     88                 throw new RuntimeException("Wrong header format : " + headerLine);
     89             }
     90             attributes.put(keyValue[0], keyValue[1]);
     91         }
     92 
     93         attributes.remove(OPTIONS_TAG);
     94         final FusionDictionary dict =
     95                 new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));
     96 
     97         String line;
     98         String word = null;
     99         ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
    100         boolean isNotAWord = false;
    101         ArrayList<WeightedString> bigrams = new ArrayList<>();
    102         ArrayList<WeightedString> shortcuts = new ArrayList<>();
    103         while (null != (line = reader.readLine())) {
    104             if (line.startsWith(COMMENT_LINE_STARTER)) continue;
    105             final String args[] = line.trim().split(",");
    106             if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
    107                 if (null != word) {
    108                     dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
    109                             isNotAWord);
    110                     for (WeightedString s : bigrams) {
    111                         dict.setBigram(word, s.mWord, s.mProbabilityInfo);
    112                     }
    113                 }
    114                 if (!shortcuts.isEmpty()) shortcuts = new ArrayList<>();
    115                 if (!bigrams.isEmpty()) bigrams = new ArrayList<>();
    116                 isNotAWord = false;
    117                 for (String param : args) {
    118                     final String params[] = param.split("=", 2);
    119                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    120                     if (CombinedFormatUtils.WORD_TAG.equals(params[0])) {
    121                         word = params[1];
    122                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
    123                         probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
    124                                 probabilityInfo.mTimestamp, probabilityInfo.mLevel,
    125                                 probabilityInfo.mCount);
    126                     } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
    127                         final String[] historicalInfoParams =
    128                                 params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
    129                         if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
    130                             throw new RuntimeException("Wrong format (historical info) : " + line);
    131                         }
    132                         probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability,
    133                                 Integer.parseInt(historicalInfoParams[0]),
    134                                 Integer.parseInt(historicalInfoParams[1]),
    135                                 Integer.parseInt(historicalInfoParams[2]));
    136                     } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) {
    137                         isNotAWord = "true".equals(params[1]);
    138                     }
    139                 }
    140             } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) {
    141                 String shortcut = null;
    142                 int shortcutFreq = 0;
    143                 for (String param : args) {
    144                     final String params[] = param.split("=", 2);
    145                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    146                     if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) {
    147                         shortcut = params[1];
    148                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
    149                         shortcutFreq = WHITELIST_TAG.equals(params[1])
    150                                 ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
    151                                 : Integer.parseInt(params[1]);
    152                     }
    153                 }
    154                 if (null != shortcut) {
    155                     shortcuts.add(new WeightedString(shortcut, shortcutFreq));
    156                 } else {
    157                     throw new RuntimeException("Wrong format : " + line);
    158                 }
    159             } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
    160                 String secondWordOfBigram = null;
    161                 ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
    162                 for (String param : args) {
    163                     final String params[] = param.split("=", 2);
    164                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    165                     if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
    166                         secondWordOfBigram = params[1];
    167                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
    168                         bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
    169                                 bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel,
    170                                 bigramProbabilityInfo.mCount);
    171                     }  else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
    172                         final String[] historicalInfoParams =
    173                                 params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
    174                         if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
    175                             throw new RuntimeException("Wrong format (historical info) : " + line);
    176                         }
    177                         bigramProbabilityInfo = new ProbabilityInfo(
    178                                 bigramProbabilityInfo.mProbability,
    179                                 Integer.parseInt(historicalInfoParams[0]),
    180                                 Integer.parseInt(historicalInfoParams[1]),
    181                                 Integer.parseInt(historicalInfoParams[2]));
    182                     }
    183                 }
    184                 if (null != secondWordOfBigram) {
    185                     bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
    186                 } else {
    187                     throw new RuntimeException("Wrong format : " + line);
    188                 }
    189             }
    190         }
    191         if (null != word) {
    192             dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
    193             for (WeightedString s : bigrams) {
    194                 dict.setBigram(word, s.mWord, s.mProbabilityInfo);
    195             }
    196         }
    197 
    198         return dict;
    199     }
    200 
    201     /**
    202      * Writes a dictionary to a combined file.
    203      *
    204      * @param destination a destination writer.
    205      * @param dict the dictionary to write.
    206      */
    207     public static void writeDictionaryCombined(final BufferedWriter destination,
    208             final FusionDictionary dict) throws IOException {
    209         final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
    210         for (final WordProperty wordProperty : dict) {
    211             // This for ordering by frequency, then by asciibetic order
    212             wordPropertiesInDict.add(wordProperty);
    213         }
    214         destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
    215         for (final WordProperty wordProperty : wordPropertiesInDict) {
    216             destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
    217         }
    218     }
    219 }
    220