Home | History | Annotate | Download | only in dicttool
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.dicttool;
     18 
     19 import com.android.inputmethod.latin.makedict.FormatSpec;
     20 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
     21 import com.android.inputmethod.latin.makedict.FusionDictionary;
     22 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
     23 import com.android.inputmethod.latin.makedict.ProbabilityInfo;
     24 import com.android.inputmethod.latin.makedict.WeightedString;
     25 import com.android.inputmethod.latin.makedict.WordProperty;
     26 import com.android.inputmethod.latin.utils.CombinedFormatUtils;
     27 
     28 import java.io.BufferedReader;
     29 import java.io.BufferedWriter;
     30 import java.io.FileReader;
     31 import java.io.IOException;
     32 import java.util.ArrayList;
     33 import java.util.HashMap;
     34 import java.util.TreeSet;
     35 
     36 /**
     37  * Reads and writes combined format for a FusionDictionary.
     38  *
     39  * All functions in this class are static.
     40  */
     41 public class CombinedInputOutput {
     42     private static final String WHITELIST_TAG = "whitelist";
     43     private static final String OPTIONS_TAG = "options";
     44     private static final String COMMENT_LINE_STARTER = "#";
     45     private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;
     46 
     47     /**
     48      * Basic test to find out whether the file is in the combined format or not.
     49      *
     50      * Concretely this only tests the header line.
     51      *
     52      * @param filename The name of the file to test.
     53      * @return true if the file is in the combined format, false otherwise
     54      */
     55     public static boolean isCombinedDictionary(final String filename) {
     56         try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
     57             String firstLine = reader.readLine();
     58             while (firstLine.startsWith(COMMENT_LINE_STARTER)) {
     59                 firstLine = reader.readLine();
     60             }
     61             return firstLine.matches(
     62                     "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
     63         } catch (final IOException e) {
     64             return false;
     65         }
     66     }
     67 
     68     /**
     69      * Reads a dictionary from a combined format file.
     70      *
     71      * This is the public method that will read a combined file and return the corresponding memory
     72      * representation.
     73      *
     74      * @param reader the buffered reader to read the data from.
     75      * @return the in-memory representation of the dictionary.
     76      */
     77     public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
     78             throws IOException {
     79         String headerLine = reader.readLine();
     80         while (headerLine.startsWith(COMMENT_LINE_STARTER)) {
     81             headerLine = reader.readLine();
     82         }
     83         final String header[] = headerLine.split(",");
     84         final HashMap<String, String> attributes = new HashMap<>();
     85         for (String item : header) {
     86             final String keyValue[] = item.split("=");
     87             if (2 != keyValue.length) {
     88                 throw new RuntimeException("Wrong header format : " + headerLine);
     89             }
     90             attributes.put(keyValue[0], keyValue[1]);
     91         }
     92 
     93         attributes.remove(OPTIONS_TAG);
     94         final FusionDictionary dict =
     95                 new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));
     96 
     97         String line;
     98         String word = null;
     99         ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
    100         boolean isNotAWord = false;
    101         boolean isPossiblyOffensive = false;
    102         ArrayList<WeightedString> bigrams = new ArrayList<>();
    103         ArrayList<WeightedString> shortcuts = new ArrayList<>();
    104         while (null != (line = reader.readLine())) {
    105             if (line.startsWith(COMMENT_LINE_STARTER)) continue;
    106             final String args[] = line.trim().split(",");
    107             if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
    108                 if (null != word) {
    109                     dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
    110                             isNotAWord, isPossiblyOffensive);
    111                     for (WeightedString s : bigrams) {
    112                         dict.setBigram(word, s.mWord, s.mProbabilityInfo);
    113                     }
    114                 }
    115                 if (!shortcuts.isEmpty()) shortcuts = new ArrayList<>();
    116                 if (!bigrams.isEmpty()) bigrams = new ArrayList<>();
    117                 isNotAWord = false;
    118                 isPossiblyOffensive = false;
    119                 for (String param : args) {
    120                     final String params[] = param.split("=", 2);
    121                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    122                     switch (params[0]) {
    123                         case CombinedFormatUtils.WORD_TAG:
    124                             word = params[1];
    125                             break;
    126                         case CombinedFormatUtils.PROBABILITY_TAG:
    127                             probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
    128                                     probabilityInfo.mTimestamp, probabilityInfo.mLevel,
    129                                     probabilityInfo.mCount);
    130                             break;
    131                         case CombinedFormatUtils.HISTORICAL_INFO_TAG:
    132                             final String[] historicalInfoParams = params[1].split(
    133                                     CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
    134                             if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
    135                                 throw new RuntimeException("Wrong format (historical info) : "
    136                                         + line);
    137                             }
    138                             probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability,
    139                                     Integer.parseInt(historicalInfoParams[0]),
    140                                     Integer.parseInt(historicalInfoParams[1]),
    141                                     Integer.parseInt(historicalInfoParams[2]));
    142                             break;
    143                         case CombinedFormatUtils.NOT_A_WORD_TAG:
    144                             isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]);
    145                             break;
    146                         case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG:
    147                             isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]);
    148                             break;
    149                     }
    150                 }
    151             } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) {
    152                 String shortcut = null;
    153                 int shortcutFreq = 0;
    154                 for (String param : args) {
    155                     final String params[] = param.split("=", 2);
    156                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    157                     if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) {
    158                         shortcut = params[1];
    159                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
    160                         shortcutFreq = WHITELIST_TAG.equals(params[1])
    161                                 ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
    162                                 : Integer.parseInt(params[1]);
    163                     }
    164                 }
    165                 if (null != shortcut) {
    166                     shortcuts.add(new WeightedString(shortcut, shortcutFreq));
    167                 } else {
    168                     throw new RuntimeException("Wrong format : " + line);
    169                 }
    170             } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
    171                 String secondWordOfBigram = null;
    172                 ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
    173                 for (String param : args) {
    174                     final String params[] = param.split("=", 2);
    175                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
    176                     if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
    177                         secondWordOfBigram = params[1];
    178                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
    179                         bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
    180                                 bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel,
    181                                 bigramProbabilityInfo.mCount);
    182                     }  else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
    183                         final String[] historicalInfoParams =
    184                                 params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
    185                         if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
    186                             throw new RuntimeException("Wrong format (historical info) : " + line);
    187                         }
    188                         bigramProbabilityInfo = new ProbabilityInfo(
    189                                 bigramProbabilityInfo.mProbability,
    190                                 Integer.parseInt(historicalInfoParams[0]),
    191                                 Integer.parseInt(historicalInfoParams[1]),
    192                                 Integer.parseInt(historicalInfoParams[2]));
    193                     }
    194                 }
    195                 if (null != secondWordOfBigram) {
    196                     bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
    197                 } else {
    198                     throw new RuntimeException("Wrong format : " + line);
    199                 }
    200             }
    201         }
    202         if (null != word) {
    203             dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord,
    204                     isPossiblyOffensive);
    205             for (WeightedString s : bigrams) {
    206                 dict.setBigram(word, s.mWord, s.mProbabilityInfo);
    207             }
    208         }
    209 
    210         return dict;
    211     }
    212 
    213     /**
    214      * Writes a dictionary to a combined file.
    215      *
    216      * @param destination a destination writer.
    217      * @param dict the dictionary to write.
    218      */
    219     public static void writeDictionaryCombined(final BufferedWriter destination,
    220             final FusionDictionary dict) throws IOException {
    221         final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
    222         for (final WordProperty wordProperty : dict) {
    223             // This for ordering by frequency, then by asciibetic order
    224             wordPropertiesInDict.add(wordProperty);
    225         }
    226         destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
    227         for (final WordProperty wordProperty : wordPropertiesInDict) {
    228             destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
    229         }
    230     }
    231 }
    232