Home | History | Annotate | Download | only in JAJP
      1 /*
      2  * Copyright (C) 2008,2009  OMRON SOFTWARE Co., Ltd.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package jp.co.omronsoft.openwnn.JAJP;
     18 
     19 import jp.co.omronsoft.openwnn.*;
     20 import java.util.*;
     21 
     22 /**
     23  * The penWnn Clause Converter class for Japanese IME.
     24  *
     25  * @author Copyright (C) 2009 OMRON SOFTWARE CO., LTD.  All Rights Reserved.
     26  */
     27 public class OpenWnnClauseConverterJAJP {
     28     /** Score(frequency value) of word in the learning dictionary */
     29     private static final int FREQ_LEARN = 600;
     30     /** Score(frequency value) of word in the user dictionary */
     31     private static final int FREQ_USER  = 500;
     32 
     33     /** Maximum limit length of input */
     34     public static final int MAX_INPUT_LENGTH = 50;
     35 
     36     /** search cache for unique independent words (jiritsugo) */
     37     private HashMap<String, ArrayList<WnnWord>> mIndepWordBag;
     38     /** search cache for all independent words (jiritsugo) */
     39     private HashMap<String, ArrayList<WnnWord>> mAllIndepWordBag;
     40     /** search cache for ancillary words (fuzokugo) */
     41     private HashMap<String, ArrayList<WnnWord>> mFzkPatterns;
     42 
     43     /** connect matrix for generating a clause */
     44     private byte[][] mConnectMatrix;
     45 
     46     /** dictionaries */
     47     private WnnDictionary mDictionary;
     48 
     49     /** candidates of conversion */
     50     private LinkedList mConvertResult;
     51 
     52     /** work area for consecutive clause conversion */
     53     private WnnSentence[] mSentenceBuffer;
     54 
     55     /** part of speech (default) */
     56     private WnnPOS mPosDefault;
     57     /** part of speech (end of clause/not end of sentence) */
     58     private WnnPOS mPosEndOfClause1;
     59     /** part of speech (end of clause/any place) */
     60     private WnnPOS mPosEndOfClause2;
     61     /** part of speech (end of sentence) */
     62     private WnnPOS mPosEndOfClause3;
     63 
     64     /** cost value of a clause */
     65     private static final int CLAUSE_COST = -1000;
     66 
     67     /** The candidate filter */
     68     private CandidateFilter mFilter = null;
     69 
     70     /**
     71      * Constructor
     72      */
     73     public OpenWnnClauseConverterJAJP() {
     74         mIndepWordBag  = new HashMap<String, ArrayList<WnnWord>>();
     75         mAllIndepWordBag  = new HashMap<String, ArrayList<WnnWord>>();
     76         mFzkPatterns   = new HashMap();
     77         mConvertResult = new LinkedList();
     78 
     79         mSentenceBuffer = new WnnSentence[MAX_INPUT_LENGTH];
     80     }
     81 
     82     /**
     83      * Set the dictionary
     84      *
     85      * @param dict  The dictionary for phrase conversion
     86      */
     87     public void setDictionary(WnnDictionary dict) {
     88         /* get connect matrix */
     89         mConnectMatrix = dict.getConnectMatrix();
     90 
     91         /* clear dictionary settings */
     92         mDictionary = dict;
     93         dict.clearDictionary();
     94         dict.clearApproxPattern();
     95 
     96         /* clear work areas */
     97         mIndepWordBag.clear();
     98         mAllIndepWordBag.clear();
     99         mFzkPatterns.clear();
    100 
    101         /* get part of speech tags */
    102         mPosDefault      = dict.getPOS(WnnDictionary.POS_TYPE_MEISI);
    103         mPosEndOfClause1 = dict.getPOS(WnnDictionary.POS_TYPE_V1);
    104         mPosEndOfClause2 = dict.getPOS(WnnDictionary.POS_TYPE_V2);
    105         mPosEndOfClause3 = dict.getPOS(WnnDictionary.POS_TYPE_V3);
    106     }
    107 
    108     /**
    109      * Set the candidate filter
    110      *
    111      * @param filter	The candidate filter
    112      */
    113     public void setFilter(CandidateFilter filter) {
    114     	mFilter = filter;
    115     }
    116 
    117     /**
    118      * Kana-to-Kanji conversion (single clause).
    119      * <br>
    120      * This method execute single clause conversion.
    121       *
    122      * @param input		The input string
    123      * @return			The candidates of conversion; {@code null} if an error occurs.
    124      */
    125      public Iterator convert(String input) {
    126         /* do nothing if no dictionary is specified */
    127         if (mConnectMatrix == null || mDictionary == null) {
    128             return null;
    129         }
    130         /* do nothing if the length of input exceeds the limit */
    131         if (input.length() > MAX_INPUT_LENGTH) {
    132             return null;
    133         }
    134 
    135         /* clear the candidates list */
    136         mConvertResult.clear();
    137 
    138         /* try single clause conversion */
    139         if (!singleClauseConvert(mConvertResult, input, mPosEndOfClause2, true)) {
    140             return null;
    141         }
    142         return mConvertResult.iterator();
    143     }
    144 
    145     /**
    146      * Consecutive clause conversion.
    147      *
    148      * @param input		The input string
    149      * @return			The result of consecutive clause conversion; {@code null} if fail.
    150      */
    151     public WnnSentence consecutiveClauseConvert(String input) {
    152         LinkedList clauses = new LinkedList();
    153 
    154         /* clear the cache which is not matched */
    155         for (int i = 0; i < input.length(); i++) {
    156             mSentenceBuffer[i] = null;
    157         }
    158         WnnSentence[] sentence = mSentenceBuffer;
    159 
    160         /* consecutive clause conversion */
    161         for (int start = 0; start < input.length(); start++) {
    162             if (start != 0 && sentence[start-1] == null) {
    163                 continue;
    164             }
    165 
    166             /* limit the length of a clause */
    167             int end = input.length();
    168             if (end > start + 20) {
    169                 end = start + 20;
    170             }
    171             /* make clauses */
    172             for ( ; end > start; end--) {
    173                 int idx = end - 1;
    174 
    175                 /* cutting a branch */
    176                 if (sentence[idx] != null) {
    177                     if (start != 0) {
    178                         if (sentence[idx].frequency > sentence[start-1].frequency + CLAUSE_COST + FREQ_LEARN) {
    179                             /* there may be no way to be the best sequence from the 'start' */
    180                             break;
    181                         }
    182                     } else {
    183                         if (sentence[idx].frequency > CLAUSE_COST + FREQ_LEARN) {
    184                             /* there may be no way to be the best sequence from the 'start' */
    185                             break;
    186                         }
    187                     }
    188                 }
    189 
    190                 String key = input.substring(start, end);
    191                 clauses.clear();
    192                 WnnClause bestClause = null;
    193                 if (end == input.length()) {
    194                     /* get the clause which can be the end of the sentence */
    195                     singleClauseConvert(clauses, key, mPosEndOfClause1, false);
    196                 } else {
    197                     /* get the clause which is not the end of the sentence */
    198                     singleClauseConvert(clauses, key, mPosEndOfClause3, false);
    199                 }
    200                 if (clauses.isEmpty()) {
    201                     bestClause = defaultClause(key);
    202                 } else {
    203                     bestClause = (WnnClause)clauses.get(0);
    204                 }
    205 
    206                 /* make a sub-sentence */
    207                 WnnSentence ws;
    208                 if (start == 0) {
    209                     ws = new WnnSentence(key, bestClause);
    210                 } else {
    211                     ws = new WnnSentence(sentence[start-1], bestClause);
    212                 }
    213                 ws.frequency += CLAUSE_COST;
    214 
    215                 /* update the best sub-sentence on the cache buffer */
    216                 if (sentence[idx] == null || (sentence[idx].frequency < ws.frequency)) {
    217                     sentence[idx] = ws;
    218                 }
    219             }
    220         }
    221 
    222         /* return the result of the consecutive clause conversion */
    223         if (sentence[input.length() - 1] != null) {
    224             return sentence[input.length() - 1];
    225         }
    226         return null;
    227     }
    228 
    229     /**
    230      * Consecutive clause conversion.
    231      *
    232      * @param resultList	Where to store the result
    233      * @param input			Input string
    234      * @return				{@code true} if success; {@code false} if fail.
    235      */
    236     private boolean consecutiveClauseConvert(LinkedList resultList, String input) {
    237         WnnSentence sentence = consecutiveClauseConvert(input);
    238 
    239         /* set the result of the consecutive clause conversion on the top of the list */
    240         if (sentence != null) {
    241             resultList.add(0, sentence);
    242             return true;
    243         }
    244         return false;
    245     }
    246 
    247     /**
    248      * Single clause conversion.
    249      *
    250      * @param clauseList	Where to store the results
    251      * @param input			Input string
    252      * @param terminal		Part of speech tag at the terminal
    253      * @param all			Get all candidates or not
    254      * @return				{@code true} if success; {@code false} if fail.
    255      */
    256     private boolean singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all) {
    257         boolean ret = false;
    258 
    259         /* get clauses without ancillary word */
    260         ArrayList<WnnWord> stems = getIndependentWords(input, all);
    261         if (stems != null && (!stems.isEmpty())) {
    262             Iterator<WnnWord> stemsi = stems.iterator();
    263             while (stemsi.hasNext()) {
    264                 WnnWord stem = stemsi.next();
    265                 if (addClause(clauseList, input, stem, null, terminal, all)) {
    266                     ret = true;
    267                 }
    268             }
    269         }
    270 
    271         /* get clauses with ancillary word */
    272         int max = CLAUSE_COST * 2;
    273         for (int split = 1; split < input.length(); split++) {
    274             /* get ancillary patterns */
    275             String str = input.substring(split);
    276             ArrayList<WnnWord> fzks = getAncillaryPattern(str);
    277             if (fzks == null || fzks.isEmpty()) {
    278                 continue;
    279             }
    280 
    281             /* get candidates of stem in a clause */
    282             str = input.substring(0, split);
    283             stems = getIndependentWords(str, all);
    284             if (stems == null || stems.isEmpty()) {
    285                 if (mDictionary.searchWord(WnnDictionary.SEARCH_PREFIX, WnnDictionary.ORDER_BY_FREQUENCY, str) <= 0) {
    286                     break;
    287                 } else {
    288                     continue;
    289                 }
    290             }
    291             /* make clauses */
    292             Iterator<WnnWord> stemsi = stems.iterator();
    293             while (stemsi.hasNext()) {
    294                 WnnWord stem = stemsi.next();
    295                 if (all || stem.frequency > max) {
    296                     Iterator<WnnWord> fzksi  = fzks.iterator();
    297                     while (fzksi.hasNext()) {
    298                         WnnWord fzk = fzksi.next();
    299                         if (addClause(clauseList, input, stem, fzk, terminal, all)) {
    300                             ret = true;
    301                             max = stem.frequency;
    302                         }
    303                     }
    304                 }
    305             }
    306         }
    307         return ret;
    308     }
    309 
    310     /**
    311      * Add valid clause to the candidates list.
    312      *
    313      * @param clauseList	Where to store the results
    314      * @param input			Input string
    315      * @param stem			Stem of the clause (a independent word)
    316      * @param fzk			Ancillary pattern
    317      * @param terminal		Part of speech tag at the terminal
    318      * @param all			Get all candidates or not
    319      * @return				{@code true} if add the clause to the list; {@code false} if not.
    320      */
    321     private boolean addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk,
    322                               WnnPOS terminal, boolean all) {
    323         WnnClause clause = null;
    324         /* check if the part of speech is valid */
    325         if (fzk == null) {
    326             if (connectible(stem.partOfSpeech.right, terminal.left)) {
    327                 clause = new WnnClause(input, stem);
    328             }
    329         } else {
    330             if (connectible(stem.partOfSpeech.right, fzk.partOfSpeech.left)
    331                 && connectible(fzk.partOfSpeech.right, terminal.left)) {
    332                 clause = new WnnClause(input, stem, fzk);
    333             }
    334         }
    335         if (clause == null) {
    336             return false;
    337         }
    338         if (mFilter != null && !mFilter.isAllowed(clause)) {
    339         	return false;
    340         }
    341 
    342         /* store to the list */
    343         if (clauseList.isEmpty()) {
    344             /* add if the list is empty */
    345             clauseList.add(0, clause);
    346             return true;
    347         } else {
    348             if (!all) {
    349                 /* reserve only the best clause */
    350                 WnnClause best = (WnnClause)clauseList.get(0);
    351                 if (best.frequency < clause.frequency) {
    352                     clauseList.set(0, clause);
    353                     return true;
    354                 }
    355             } else {
    356                 /* reserve all clauses */
    357                 Iterator clauseListi = clauseList.iterator();
    358                 int index = 0;
    359                 while (clauseListi.hasNext()) {
    360                     WnnClause clausei = (WnnClause)clauseListi.next();
    361                     if (clausei.frequency < clause.frequency) {
    362                         break;
    363                     }
    364                     index++;
    365                 }
    366                 clauseList.add(index, clause);
    367                 return true;
    368             }
    369         }
    370 
    371         return false;
    372     }
    373 
    374     /**
    375      * Check the part-of-speeches are connectable.
    376      *
    377      * @param right		Right attribute of the preceding word/clause
    378      * @param left		Left attribute of the following word/clause
    379      * @return			{@code true} if there are connectable; {@code false} if otherwise
    380      */
    381     private boolean connectible(int right, int left) {
    382         try {
    383             if (mConnectMatrix[left][right] != 0) {
    384                 return true;
    385             }
    386         } catch (Exception ex) {
    387         }
    388         return false;
    389     }
    390 
    391     /**
    392      * Get all exact matched ancillary words(Fuzokugo) list.
    393      *
    394      * @param input		Search key
    395      * @return			List of ancillary words
    396      */
    397     private ArrayList<WnnWord> getAncillaryPattern(String input) {
    398         if (input.length() == 0) {
    399             return null;
    400         }
    401 
    402         HashMap<String,ArrayList<WnnWord>> fzkPat = mFzkPatterns;
    403         ArrayList<WnnWord> fzks = fzkPat.get(input);
    404         if (fzks != null) {
    405             return fzks;
    406         }
    407 
    408         /* set dictionaries */
    409         WnnDictionary dict = mDictionary;
    410         dict.clearDictionary();
    411         dict.clearApproxPattern();
    412         dict.setDictionary(6, 400, 500);
    413 
    414         for (int start = input.length() - 1; start >= 0; start--) {
    415             String key = input.substring(start);
    416 
    417             fzks = fzkPat.get(key);
    418             if (fzks != null) {
    419                 continue;
    420             }
    421 
    422             fzks = new ArrayList<WnnWord>();
    423             mFzkPatterns.put(key, fzks);
    424 
    425             /* search ancillary words */
    426             dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, key);
    427             WnnWord word;
    428             while ((word = dict.getNextWord()) != null) {
    429                 fzks.add(word);
    430             }
    431 
    432             /* concatenate sequence of ancillary words */
    433             for (int end = input.length() - 1; end > start; end--) {
    434                 ArrayList<WnnWord> followFzks = fzkPat.get(input.substring(end));
    435                 if (followFzks == null ||  followFzks.isEmpty()) {
    436                     continue;
    437                 }
    438                 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input.substring(start, end));
    439                 while ((word = dict.getNextWord()) != null) {
    440                     Iterator<WnnWord> followFzksi = followFzks.iterator();
    441                     while (followFzksi.hasNext()) {
    442                         WnnWord follow = followFzksi.next();
    443                         if (connectible(word.partOfSpeech.right, follow.partOfSpeech.left)) {
    444                             fzks.add(new WnnWord(key, key, new WnnPOS(word.partOfSpeech.left, follow.partOfSpeech.right)));
    445                         }
    446                     }
    447                 }
    448             }
    449         }
    450         return fzks;
    451     }
    452 
    453     /**
    454      * Get all exact matched independent words(Jiritsugo) list.
    455      *
    456      * @param input    Search key
    457      * @param all      {@code true} if list all words; {@code false} if list words which has an unique part of speech tag.
    458      * @return			List of words; {@code null} if {@code input.length() == 0}.
    459      */
    460     private ArrayList<WnnWord> getIndependentWords(String input, boolean all) {
    461         if (input.length() == 0) {
    462             return null;
    463         }
    464 
    465         ArrayList<WnnWord> words = (all)? mAllIndepWordBag.get(input) : mIndepWordBag.get(input);
    466 
    467         if (words == null) {
    468             /* set dictionaries */
    469             WnnDictionary dict = mDictionary;
    470             dict.clearDictionary();
    471             dict.clearApproxPattern();
    472             dict.setDictionary(4, 0, 10);
    473             dict.setDictionary(5, 400, 500);
    474             dict.setDictionary(WnnDictionary.INDEX_USER_DICTIONARY, FREQ_USER, FREQ_USER);
    475             dict.setDictionary(WnnDictionary.INDEX_LEARN_DICTIONARY, FREQ_LEARN, FREQ_LEARN);
    476 
    477             words = new ArrayList<WnnWord>();
    478             WnnWord word;
    479             if (all) {
    480             	mAllIndepWordBag.put(input, words);
    481                 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
    482                 /* store all words */
    483                 while ((word = dict.getNextWord()) != null) {
    484                     if (input.equals(word.stroke)) {
    485                         words.add(word);
    486                     }
    487                 }
    488             } else {
    489             	mIndepWordBag.put(input, words);
    490                 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
    491                 /* store a word which has an unique part of speech tag */
    492                 while ((word = dict.getNextWord()) != null) {
    493                     if (input.equals(word.stroke)) {
    494                         Iterator<WnnWord> list = words.iterator();
    495                         boolean found = false;
    496                         while (list.hasNext()) {
    497                             WnnWord w = (WnnWord)list.next();
    498                                 if (w.partOfSpeech.right == word.partOfSpeech.right) {
    499                                     found = true;
    500                                     break;
    501                                 }
    502                         }
    503                         if (!found) {
    504                             words.add(word);
    505                         }
    506                         if (word.frequency < 400) {
    507                             break;
    508                         }
    509                     }
    510                 }
    511             }
    512             addAutoGeneratedCandidates(input, words, all);
    513         }
    514         return words;
    515     }
    516 
    517     /**
    518      * Add some words not including in the dictionary.
    519      * <br>
    520      * This method adds some words which are not in the dictionary.
    521      *
    522      * @param input     Input string
    523      * @param wordList  List to store words
    524      * @param all       Get all candidates or not
    525      */
    526     private void addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all) {
    527         wordList.add(new WnnWord(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
    528     }
    529 
    530     /**
    531      * Get a default clause.
    532      * <br>
    533      * This method generates a clause which has a string same as input
    534      * and the default part-of-speech tag.
    535      *
    536      * @param input    Input string
    537      * @return			Default clause
    538      */
    539     private WnnClause defaultClause(String input) {
    540         return (new WnnClause(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
    541     }
    542 }
    543