1 /* 2 * Copyright (C) 2008,2009 OMRON SOFTWARE Co., Ltd. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package jp.co.omronsoft.openwnn.JAJP; 18 19 import jp.co.omronsoft.openwnn.*; 20 import java.util.*; 21 22 /** 23 * The penWnn Clause Converter class for Japanese IME. 24 * 25 * @author Copyright (C) 2009 OMRON SOFTWARE CO., LTD. All Rights Reserved. 26 */ 27 public class OpenWnnClauseConverterJAJP { 28 /** Score(frequency value) of word in the learning dictionary */ 29 private static final int FREQ_LEARN = 600; 30 /** Score(frequency value) of word in the user dictionary */ 31 private static final int FREQ_USER = 500; 32 33 /** Maximum limit length of input */ 34 public static final int MAX_INPUT_LENGTH = 50; 35 36 /** search cache for unique independent words (jiritsugo) */ 37 private HashMap<String, ArrayList<WnnWord>> mIndepWordBag; 38 /** search cache for all independent words (jiritsugo) */ 39 private HashMap<String, ArrayList<WnnWord>> mAllIndepWordBag; 40 /** search cache for ancillary words (fuzokugo) */ 41 private HashMap<String, ArrayList<WnnWord>> mFzkPatterns; 42 43 /** connect matrix for generating a clause */ 44 private byte[][] mConnectMatrix; 45 46 /** dictionaries */ 47 private WnnDictionary mDictionary; 48 49 /** candidates of conversion */ 50 private LinkedList mConvertResult; 51 52 /** work area for consecutive clause conversion */ 53 private WnnSentence[] mSentenceBuffer; 54 55 /** part of speech (default) */ 56 private WnnPOS mPosDefault; 57 /** part of speech (end of clause/not end of sentence) */ 58 private WnnPOS mPosEndOfClause1; 59 /** part of speech (end of clause/any place) */ 60 private WnnPOS mPosEndOfClause2; 61 /** part of speech (end of sentence) */ 62 private WnnPOS mPosEndOfClause3; 63 64 /** cost value of a clause */ 65 private static final int CLAUSE_COST = -1000; 66 67 /** The candidate filter */ 68 private CandidateFilter mFilter = null; 69 70 /** 71 * Constructor 72 */ 73 public OpenWnnClauseConverterJAJP() { 74 mIndepWordBag = new HashMap<String, ArrayList<WnnWord>>(); 75 mAllIndepWordBag = new HashMap<String, ArrayList<WnnWord>>(); 76 mFzkPatterns = new HashMap(); 77 mConvertResult = new LinkedList(); 78 79 mSentenceBuffer = new WnnSentence[MAX_INPUT_LENGTH]; 80 } 81 82 /** 83 * Set the dictionary 84 * 85 * @param dict The dictionary for phrase conversion 86 */ 87 public void setDictionary(WnnDictionary dict) { 88 /* get connect matrix */ 89 mConnectMatrix = dict.getConnectMatrix(); 90 91 /* clear dictionary settings */ 92 mDictionary = dict; 93 dict.clearDictionary(); 94 dict.clearApproxPattern(); 95 96 /* clear work areas */ 97 mIndepWordBag.clear(); 98 mAllIndepWordBag.clear(); 99 mFzkPatterns.clear(); 100 101 /* get part of speech tags */ 102 mPosDefault = dict.getPOS(WnnDictionary.POS_TYPE_MEISI); 103 mPosEndOfClause1 = dict.getPOS(WnnDictionary.POS_TYPE_V1); 104 mPosEndOfClause2 = dict.getPOS(WnnDictionary.POS_TYPE_V2); 105 mPosEndOfClause3 = dict.getPOS(WnnDictionary.POS_TYPE_V3); 106 } 107 108 /** 109 * Set the candidate filter 110 * 111 * @param filter The candidate filter 112 */ 113 public void setFilter(CandidateFilter filter) { 114 mFilter = filter; 115 } 116 117 /** 118 * Kana-to-Kanji conversion (single clause). 119 * <br> 120 * This method execute single clause conversion. 121 * 122 * @param input The input string 123 * @return The candidates of conversion; {@code null} if an error occurs. 124 */ 125 public Iterator convert(String input) { 126 /* do nothing if no dictionary is specified */ 127 if (mConnectMatrix == null || mDictionary == null) { 128 return null; 129 } 130 /* do nothing if the length of input exceeds the limit */ 131 if (input.length() > MAX_INPUT_LENGTH) { 132 return null; 133 } 134 135 /* clear the candidates list */ 136 mConvertResult.clear(); 137 138 /* try single clause conversion */ 139 if (!singleClauseConvert(mConvertResult, input, mPosEndOfClause2, true)) { 140 return null; 141 } 142 return mConvertResult.iterator(); 143 } 144 145 /** 146 * Consecutive clause conversion. 147 * 148 * @param input The input string 149 * @return The result of consecutive clause conversion; {@code null} if fail. 150 */ 151 public WnnSentence consecutiveClauseConvert(String input) { 152 LinkedList clauses = new LinkedList(); 153 154 /* clear the cache which is not matched */ 155 for (int i = 0; i < input.length(); i++) { 156 mSentenceBuffer[i] = null; 157 } 158 WnnSentence[] sentence = mSentenceBuffer; 159 160 /* consecutive clause conversion */ 161 for (int start = 0; start < input.length(); start++) { 162 if (start != 0 && sentence[start-1] == null) { 163 continue; 164 } 165 166 /* limit the length of a clause */ 167 int end = input.length(); 168 if (end > start + 20) { 169 end = start + 20; 170 } 171 /* make clauses */ 172 for ( ; end > start; end--) { 173 int idx = end - 1; 174 175 /* cutting a branch */ 176 if (sentence[idx] != null) { 177 if (start != 0) { 178 if (sentence[idx].frequency > sentence[start-1].frequency + CLAUSE_COST + FREQ_LEARN) { 179 /* there may be no way to be the best sequence from the 'start' */ 180 break; 181 } 182 } else { 183 if (sentence[idx].frequency > CLAUSE_COST + FREQ_LEARN) { 184 /* there may be no way to be the best sequence from the 'start' */ 185 break; 186 } 187 } 188 } 189 190 String key = input.substring(start, end); 191 clauses.clear(); 192 WnnClause bestClause = null; 193 if (end == input.length()) { 194 /* get the clause which can be the end of the sentence */ 195 singleClauseConvert(clauses, key, mPosEndOfClause1, false); 196 } else { 197 /* get the clause which is not the end of the sentence */ 198 singleClauseConvert(clauses, key, mPosEndOfClause3, false); 199 } 200 if (clauses.isEmpty()) { 201 bestClause = defaultClause(key); 202 } else { 203 bestClause = (WnnClause)clauses.get(0); 204 } 205 206 /* make a sub-sentence */ 207 WnnSentence ws; 208 if (start == 0) { 209 ws = new WnnSentence(key, bestClause); 210 } else { 211 ws = new WnnSentence(sentence[start-1], bestClause); 212 } 213 ws.frequency += CLAUSE_COST; 214 215 /* update the best sub-sentence on the cache buffer */ 216 if (sentence[idx] == null || (sentence[idx].frequency < ws.frequency)) { 217 sentence[idx] = ws; 218 } 219 } 220 } 221 222 /* return the result of the consecutive clause conversion */ 223 if (sentence[input.length() - 1] != null) { 224 return sentence[input.length() - 1]; 225 } 226 return null; 227 } 228 229 /** 230 * Consecutive clause conversion. 231 * 232 * @param resultList Where to store the result 233 * @param input Input string 234 * @return {@code true} if success; {@code false} if fail. 235 */ 236 private boolean consecutiveClauseConvert(LinkedList resultList, String input) { 237 WnnSentence sentence = consecutiveClauseConvert(input); 238 239 /* set the result of the consecutive clause conversion on the top of the list */ 240 if (sentence != null) { 241 resultList.add(0, sentence); 242 return true; 243 } 244 return false; 245 } 246 247 /** 248 * Single clause conversion. 249 * 250 * @param clauseList Where to store the results 251 * @param input Input string 252 * @param terminal Part of speech tag at the terminal 253 * @param all Get all candidates or not 254 * @return {@code true} if success; {@code false} if fail. 255 */ 256 private boolean singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all) { 257 boolean ret = false; 258 259 /* get clauses without ancillary word */ 260 ArrayList<WnnWord> stems = getIndependentWords(input, all); 261 if (stems != null && (!stems.isEmpty())) { 262 Iterator<WnnWord> stemsi = stems.iterator(); 263 while (stemsi.hasNext()) { 264 WnnWord stem = stemsi.next(); 265 if (addClause(clauseList, input, stem, null, terminal, all)) { 266 ret = true; 267 } 268 } 269 } 270 271 /* get clauses with ancillary word */ 272 int max = CLAUSE_COST * 2; 273 for (int split = 1; split < input.length(); split++) { 274 /* get ancillary patterns */ 275 String str = input.substring(split); 276 ArrayList<WnnWord> fzks = getAncillaryPattern(str); 277 if (fzks == null || fzks.isEmpty()) { 278 continue; 279 } 280 281 /* get candidates of stem in a clause */ 282 str = input.substring(0, split); 283 stems = getIndependentWords(str, all); 284 if (stems == null || stems.isEmpty()) { 285 if (mDictionary.searchWord(WnnDictionary.SEARCH_PREFIX, WnnDictionary.ORDER_BY_FREQUENCY, str) <= 0) { 286 break; 287 } else { 288 continue; 289 } 290 } 291 /* make clauses */ 292 Iterator<WnnWord> stemsi = stems.iterator(); 293 while (stemsi.hasNext()) { 294 WnnWord stem = stemsi.next(); 295 if (all || stem.frequency > max) { 296 Iterator<WnnWord> fzksi = fzks.iterator(); 297 while (fzksi.hasNext()) { 298 WnnWord fzk = fzksi.next(); 299 if (addClause(clauseList, input, stem, fzk, terminal, all)) { 300 ret = true; 301 max = stem.frequency; 302 } 303 } 304 } 305 } 306 } 307 return ret; 308 } 309 310 /** 311 * Add valid clause to the candidates list. 312 * 313 * @param clauseList Where to store the results 314 * @param input Input string 315 * @param stem Stem of the clause (a independent word) 316 * @param fzk Ancillary pattern 317 * @param terminal Part of speech tag at the terminal 318 * @param all Get all candidates or not 319 * @return {@code true} if add the clause to the list; {@code false} if not. 320 */ 321 private boolean addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk, 322 WnnPOS terminal, boolean all) { 323 WnnClause clause = null; 324 /* check if the part of speech is valid */ 325 if (fzk == null) { 326 if (connectible(stem.partOfSpeech.right, terminal.left)) { 327 clause = new WnnClause(input, stem); 328 } 329 } else { 330 if (connectible(stem.partOfSpeech.right, fzk.partOfSpeech.left) 331 && connectible(fzk.partOfSpeech.right, terminal.left)) { 332 clause = new WnnClause(input, stem, fzk); 333 } 334 } 335 if (clause == null) { 336 return false; 337 } 338 if (mFilter != null && !mFilter.isAllowed(clause)) { 339 return false; 340 } 341 342 /* store to the list */ 343 if (clauseList.isEmpty()) { 344 /* add if the list is empty */ 345 clauseList.add(0, clause); 346 return true; 347 } else { 348 if (!all) { 349 /* reserve only the best clause */ 350 WnnClause best = (WnnClause)clauseList.get(0); 351 if (best.frequency < clause.frequency) { 352 clauseList.set(0, clause); 353 return true; 354 } 355 } else { 356 /* reserve all clauses */ 357 Iterator clauseListi = clauseList.iterator(); 358 int index = 0; 359 while (clauseListi.hasNext()) { 360 WnnClause clausei = (WnnClause)clauseListi.next(); 361 if (clausei.frequency < clause.frequency) { 362 break; 363 } 364 index++; 365 } 366 clauseList.add(index, clause); 367 return true; 368 } 369 } 370 371 return false; 372 } 373 374 /** 375 * Check the part-of-speeches are connectable. 376 * 377 * @param right Right attribute of the preceding word/clause 378 * @param left Left attribute of the following word/clause 379 * @return {@code true} if there are connectable; {@code false} if otherwise 380 */ 381 private boolean connectible(int right, int left) { 382 try { 383 if (mConnectMatrix[left][right] != 0) { 384 return true; 385 } 386 } catch (Exception ex) { 387 } 388 return false; 389 } 390 391 /** 392 * Get all exact matched ancillary words(Fuzokugo) list. 393 * 394 * @param input Search key 395 * @return List of ancillary words 396 */ 397 private ArrayList<WnnWord> getAncillaryPattern(String input) { 398 if (input.length() == 0) { 399 return null; 400 } 401 402 HashMap<String,ArrayList<WnnWord>> fzkPat = mFzkPatterns; 403 ArrayList<WnnWord> fzks = fzkPat.get(input); 404 if (fzks != null) { 405 return fzks; 406 } 407 408 /* set dictionaries */ 409 WnnDictionary dict = mDictionary; 410 dict.clearDictionary(); 411 dict.clearApproxPattern(); 412 dict.setDictionary(6, 400, 500); 413 414 for (int start = input.length() - 1; start >= 0; start--) { 415 String key = input.substring(start); 416 417 fzks = fzkPat.get(key); 418 if (fzks != null) { 419 continue; 420 } 421 422 fzks = new ArrayList<WnnWord>(); 423 mFzkPatterns.put(key, fzks); 424 425 /* search ancillary words */ 426 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, key); 427 WnnWord word; 428 while ((word = dict.getNextWord()) != null) { 429 fzks.add(word); 430 } 431 432 /* concatenate sequence of ancillary words */ 433 for (int end = input.length() - 1; end > start; end--) { 434 ArrayList<WnnWord> followFzks = fzkPat.get(input.substring(end)); 435 if (followFzks == null || followFzks.isEmpty()) { 436 continue; 437 } 438 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input.substring(start, end)); 439 while ((word = dict.getNextWord()) != null) { 440 Iterator<WnnWord> followFzksi = followFzks.iterator(); 441 while (followFzksi.hasNext()) { 442 WnnWord follow = followFzksi.next(); 443 if (connectible(word.partOfSpeech.right, follow.partOfSpeech.left)) { 444 fzks.add(new WnnWord(key, key, new WnnPOS(word.partOfSpeech.left, follow.partOfSpeech.right))); 445 } 446 } 447 } 448 } 449 } 450 return fzks; 451 } 452 453 /** 454 * Get all exact matched independent words(Jiritsugo) list. 455 * 456 * @param input Search key 457 * @param all {@code true} if list all words; {@code false} if list words which has an unique part of speech tag. 458 * @return List of words; {@code null} if {@code input.length() == 0}. 459 */ 460 private ArrayList<WnnWord> getIndependentWords(String input, boolean all) { 461 if (input.length() == 0) { 462 return null; 463 } 464 465 ArrayList<WnnWord> words = (all)? mAllIndepWordBag.get(input) : mIndepWordBag.get(input); 466 467 if (words == null) { 468 /* set dictionaries */ 469 WnnDictionary dict = mDictionary; 470 dict.clearDictionary(); 471 dict.clearApproxPattern(); 472 dict.setDictionary(4, 0, 10); 473 dict.setDictionary(5, 400, 500); 474 dict.setDictionary(WnnDictionary.INDEX_USER_DICTIONARY, FREQ_USER, FREQ_USER); 475 dict.setDictionary(WnnDictionary.INDEX_LEARN_DICTIONARY, FREQ_LEARN, FREQ_LEARN); 476 477 words = new ArrayList<WnnWord>(); 478 WnnWord word; 479 if (all) { 480 mAllIndepWordBag.put(input, words); 481 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input); 482 /* store all words */ 483 while ((word = dict.getNextWord()) != null) { 484 if (input.equals(word.stroke)) { 485 words.add(word); 486 } 487 } 488 } else { 489 mIndepWordBag.put(input, words); 490 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input); 491 /* store a word which has an unique part of speech tag */ 492 while ((word = dict.getNextWord()) != null) { 493 if (input.equals(word.stroke)) { 494 Iterator<WnnWord> list = words.iterator(); 495 boolean found = false; 496 while (list.hasNext()) { 497 WnnWord w = (WnnWord)list.next(); 498 if (w.partOfSpeech.right == word.partOfSpeech.right) { 499 found = true; 500 break; 501 } 502 } 503 if (!found) { 504 words.add(word); 505 } 506 if (word.frequency < 400) { 507 break; 508 } 509 } 510 } 511 } 512 addAutoGeneratedCandidates(input, words, all); 513 } 514 return words; 515 } 516 517 /** 518 * Add some words not including in the dictionary. 519 * <br> 520 * This method adds some words which are not in the dictionary. 521 * 522 * @param input Input string 523 * @param wordList List to store words 524 * @param all Get all candidates or not 525 */ 526 private void addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all) { 527 wordList.add(new WnnWord(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length())); 528 } 529 530 /** 531 * Get a default clause. 532 * <br> 533 * This method generates a clause which has a string same as input 534 * and the default part-of-speech tag. 535 * 536 * @param input Input string 537 * @return Default clause 538 */ 539 private WnnClause defaultClause(String input) { 540 return (new WnnClause(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length())); 541 } 542 } 543