1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 22 import java.util.ArrayList; 23 import java.util.Arrays; 24 import java.util.Collections; 25 import java.util.Date; 26 import java.util.HashMap; 27 import java.util.Iterator; 28 import java.util.LinkedList; 29 30 /** 31 * A dictionary that can fusion heads and tails of words for more compression. 32 */ 33 @UsedForTesting 34 public final class FusionDictionary implements Iterable<Word> { 35 private static final boolean DBG = MakedictLog.DBG; 36 37 private static int CHARACTER_NOT_FOUND_INDEX = -1; 38 39 /** 40 * A node array of the dictionary, containing several PtNodes. 41 * 42 * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the 43 * real information. 44 * This class also contains fields to cache size and address, to help with binary 45 * generation. 46 */ 47 public static final class PtNodeArray { 48 ArrayList<PtNode> mData; 49 // To help with binary generation 50 int mCachedSize = Integer.MIN_VALUE; 51 // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 52 // always hold the same value except between dictionary address compression, during which 53 // the update process needs to know about both values at the same time. Updating will 54 // update the AfterUpdate value, and the code will move them to BeforeUpdate before 55 // the next update pass. 56 int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 57 int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 58 int mCachedParentAddress = 0; 59 60 public PtNodeArray() { 61 mData = new ArrayList<PtNode>(); 62 } 63 public PtNodeArray(ArrayList<PtNode> data) { 64 mData = data; 65 } 66 } 67 68 /** 69 * A string with a frequency. 70 * 71 * This represents an "attribute", that is either a bigram or a shortcut. 72 */ 73 public static final class WeightedString { 74 public final String mWord; 75 public int mFrequency; 76 public WeightedString(String word, int frequency) { 77 mWord = word; 78 mFrequency = frequency; 79 } 80 81 @Override 82 public int hashCode() { 83 return Arrays.hashCode(new Object[] { mWord, mFrequency }); 84 } 85 86 @Override 87 public boolean equals(Object o) { 88 if (o == this) return true; 89 if (!(o instanceof WeightedString)) return false; 90 WeightedString w = (WeightedString)o; 91 return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 92 } 93 } 94 95 /** 96 * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children 97 * (Pt means Patricia Trie). 98 * 99 * This is the central class of the in-memory representation. A PtNode is what can 100 * be seen as a traditional "trie node", except it can hold several characters at the 101 * same time. A PtNode essentially represents one or several characters in the middle 102 * of the trie tree; as such, it can be a terminal, and it can have children. 103 * In this in-memory representation, whether the PtNode is a terminal or not is represented 104 * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 105 * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 106 * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 107 */ 108 public static final class PtNode { 109 public static final int NOT_A_TERMINAL = -1; 110 final int mChars[]; 111 ArrayList<WeightedString> mShortcutTargets; 112 ArrayList<WeightedString> mBigrams; 113 int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 114 int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 115 PtNodeArray mChildren; 116 boolean mIsNotAWord; // Only a shortcut 117 boolean mIsBlacklistEntry; 118 // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 119 // generation. Before and After always hold the same value except during dictionary 120 // address compression, where the update process needs to know about both values at the 121 // same time. Updating will update the AfterUpdate value, and the code will move them 122 // to BeforeUpdate before the next update pass. 123 // The update process does not need two versions of mCachedSize. 124 int mCachedSize; // The size, in bytes, of this PtNode. 125 int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) 126 int mCachedAddressAfterUpdate; // The address of this PtNode (after update) 127 128 public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 129 final ArrayList<WeightedString> bigrams, final int frequency, 130 final boolean isNotAWord, final boolean isBlacklistEntry) { 131 mChars = chars; 132 mFrequency = frequency; 133 mTerminalId = frequency; 134 mShortcutTargets = shortcutTargets; 135 mBigrams = bigrams; 136 mChildren = null; 137 mIsNotAWord = isNotAWord; 138 mIsBlacklistEntry = isBlacklistEntry; 139 } 140 141 public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 142 final ArrayList<WeightedString> bigrams, final int frequency, 143 final boolean isNotAWord, final boolean isBlacklistEntry, 144 final PtNodeArray children) { 145 mChars = chars; 146 mFrequency = frequency; 147 mShortcutTargets = shortcutTargets; 148 mBigrams = bigrams; 149 mChildren = children; 150 mIsNotAWord = isNotAWord; 151 mIsBlacklistEntry = isBlacklistEntry; 152 } 153 154 public void addChild(PtNode n) { 155 if (null == mChildren) { 156 mChildren = new PtNodeArray(); 157 } 158 mChildren.mData.add(n); 159 } 160 161 public int getTerminalId() { 162 return mTerminalId; 163 } 164 165 public boolean isTerminal() { 166 return NOT_A_TERMINAL != mFrequency; 167 } 168 169 public int getFrequency() { 170 return mFrequency; 171 } 172 173 public boolean getIsNotAWord() { 174 return mIsNotAWord; 175 } 176 177 public boolean getIsBlacklistEntry() { 178 return mIsBlacklistEntry; 179 } 180 181 public ArrayList<WeightedString> getShortcutTargets() { 182 // We don't want write permission to escape outside the package, so we return a copy 183 if (null == mShortcutTargets) return null; 184 final ArrayList<WeightedString> copyOfShortcutTargets = 185 new ArrayList<WeightedString>(mShortcutTargets); 186 return copyOfShortcutTargets; 187 } 188 189 public ArrayList<WeightedString> getBigrams() { 190 // We don't want write permission to escape outside the package, so we return a copy 191 if (null == mBigrams) return null; 192 final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); 193 return copyOfBigrams; 194 } 195 196 public boolean hasSeveralChars() { 197 assert(mChars.length > 0); 198 return 1 < mChars.length; 199 } 200 201 /** 202 * Adds a word to the bigram list. Updates the frequency if the word already 203 * exists. 204 */ 205 public void addBigram(final String word, final int frequency) { 206 if (mBigrams == null) { 207 mBigrams = new ArrayList<WeightedString>(); 208 } 209 WeightedString bigram = getBigram(word); 210 if (bigram != null) { 211 bigram.mFrequency = frequency; 212 } else { 213 bigram = new WeightedString(word, frequency); 214 mBigrams.add(bigram); 215 } 216 } 217 218 /** 219 * Gets the shortcut target for the given word. Returns null if the word is not in the 220 * shortcut list. 221 */ 222 public WeightedString getShortcut(final String word) { 223 // TODO: Don't do a linear search 224 if (mShortcutTargets != null) { 225 final int size = mShortcutTargets.size(); 226 for (int i = 0; i < size; ++i) { 227 WeightedString shortcut = mShortcutTargets.get(i); 228 if (shortcut.mWord.equals(word)) { 229 return shortcut; 230 } 231 } 232 } 233 return null; 234 } 235 236 /** 237 * Gets the bigram for the given word. 238 * Returns null if the word is not in the bigrams list. 239 */ 240 public WeightedString getBigram(final String word) { 241 // TODO: Don't do a linear search 242 if (mBigrams != null) { 243 final int size = mBigrams.size(); 244 for (int i = 0; i < size; ++i) { 245 WeightedString bigram = mBigrams.get(i); 246 if (bigram.mWord.equals(word)) { 247 return bigram; 248 } 249 } 250 } 251 return null; 252 } 253 254 /** 255 * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to 256 * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 257 * updated if they are higher than the existing ones. 258 */ 259 public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 260 final ArrayList<WeightedString> bigrams, 261 final boolean isNotAWord, final boolean isBlacklistEntry) { 262 if (frequency > mFrequency) { 263 mFrequency = frequency; 264 } 265 if (shortcutTargets != null) { 266 if (mShortcutTargets == null) { 267 mShortcutTargets = shortcutTargets; 268 } else { 269 final int size = shortcutTargets.size(); 270 for (int i = 0; i < size; ++i) { 271 final WeightedString shortcut = shortcutTargets.get(i); 272 final WeightedString existingShortcut = getShortcut(shortcut.mWord); 273 if (existingShortcut == null) { 274 mShortcutTargets.add(shortcut); 275 } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 276 existingShortcut.mFrequency = shortcut.mFrequency; 277 } 278 } 279 } 280 } 281 if (bigrams != null) { 282 if (mBigrams == null) { 283 mBigrams = bigrams; 284 } else { 285 final int size = bigrams.size(); 286 for (int i = 0; i < size; ++i) { 287 final WeightedString bigram = bigrams.get(i); 288 final WeightedString existingBigram = getBigram(bigram.mWord); 289 if (existingBigram == null) { 290 mBigrams.add(bigram); 291 } else if (existingBigram.mFrequency < bigram.mFrequency) { 292 existingBigram.mFrequency = bigram.mFrequency; 293 } 294 } 295 } 296 } 297 mIsNotAWord = isNotAWord; 298 mIsBlacklistEntry = isBlacklistEntry; 299 } 300 } 301 302 /** 303 * Options global to the dictionary. 304 */ 305 public static final class DictionaryOptions { 306 public final boolean mGermanUmlautProcessing; 307 public final boolean mFrenchLigatureProcessing; 308 public final HashMap<String, String> mAttributes; 309 public DictionaryOptions(final HashMap<String, String> attributes, 310 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 311 mAttributes = attributes; 312 mGermanUmlautProcessing = germanUmlautProcessing; 313 mFrenchLigatureProcessing = frenchLigatureProcessing; 314 } 315 @Override 316 public String toString() { // Convenience method 317 return toString(0, false); 318 } 319 public String toString(final int indentCount, final boolean plumbing) { 320 final StringBuilder indent = new StringBuilder(); 321 if (plumbing) { 322 indent.append("H:"); 323 } else { 324 for (int i = 0; i < indentCount; ++i) { 325 indent.append(" "); 326 } 327 } 328 final StringBuilder s = new StringBuilder(); 329 for (final String optionKey : mAttributes.keySet()) { 330 s.append(indent); 331 s.append(optionKey); 332 s.append(" = "); 333 if ("date".equals(optionKey) && !plumbing) { 334 // Date needs a number of milliseconds, but the dictionary contains seconds 335 s.append(new Date( 336 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); 337 } else { 338 s.append(mAttributes.get(optionKey)); 339 } 340 s.append("\n"); 341 } 342 if (mGermanUmlautProcessing) { 343 s.append(indent); 344 s.append("Needs German umlaut processing\n"); 345 } 346 if (mFrenchLigatureProcessing) { 347 s.append(indent); 348 s.append("Needs French ligature processing\n"); 349 } 350 return s.toString(); 351 } 352 } 353 354 public final DictionaryOptions mOptions; 355 public final PtNodeArray mRootNodeArray; 356 357 public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { 358 mRootNodeArray = rootNodeArray; 359 mOptions = options; 360 } 361 362 public void addOptionAttribute(final String key, final String value) { 363 mOptions.mAttributes.put(key, value); 364 } 365 366 /** 367 * Helper method to convert a String to an int array. 368 */ 369 static int[] getCodePoints(final String word) { 370 // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, 371 // which is not visible from the makedict package. Factor this code. 372 final int length = word.length(); 373 if (length <= 0) return new int[] {}; 374 final char[] characters = word.toCharArray(); 375 final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 376 int codePoint = Character.codePointAt(characters, 0); 377 int dsti = 0; 378 for (int srci = Character.charCount(codePoint); 379 srci < length; srci += Character.charCount(codePoint), ++dsti) { 380 codePoints[dsti] = codePoint; 381 codePoint = Character.codePointAt(characters, srci); 382 } 383 codePoints[dsti] = codePoint; 384 return codePoints; 385 } 386 387 /** 388 * Helper method to add a word as a string. 389 * 390 * This method adds a word to the dictionary with the given frequency. Optional 391 * lists of bigrams and shortcuts can be passed here. For each word inside, 392 * they will be added to the dictionary as necessary. 393 * 394 * @param word the word to add. 395 * @param frequency the frequency of the word, in the range [0..255]. 396 * @param shortcutTargets a list of shortcut targets for this word, or null. 397 * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 398 */ 399 public void add(final String word, final int frequency, 400 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 401 add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 402 false /* isBlacklistEntry */); 403 } 404 405 /** 406 * Helper method to add a blacklist entry as a string. 407 * 408 * @param word the word to add as a blacklist entry. 409 * @param shortcutTargets a list of shortcut targets for this word, or null. 410 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 411 */ 412 public void addBlacklistEntry(final String word, 413 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 414 add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 415 } 416 417 /** 418 * Sanity check for a PtNode array. 419 * 420 * This method checks that all PtNodes in a node array are ordered as expected. 421 * If they are, nothing happens. If they aren't, an exception is thrown. 422 */ 423 private void checkStack(PtNodeArray ptNodeArray) { 424 ArrayList<PtNode> stack = ptNodeArray.mData; 425 int lastValue = -1; 426 for (int i = 0; i < stack.size(); ++i) { 427 int currentValue = stack.get(i).mChars[0]; 428 if (currentValue <= lastValue) 429 throw new RuntimeException("Invalid stack"); 430 else 431 lastValue = currentValue; 432 } 433 } 434 435 /** 436 * Helper method to add a new bigram to the dictionary. 437 * 438 * @param word1 the previous word of the context 439 * @param word2 the next word of the context 440 * @param frequency the bigram frequency 441 */ 442 public void setBigram(final String word1, final String word2, final int frequency) { 443 PtNode ptNode = findWordInTree(mRootNodeArray, word1); 444 if (ptNode != null) { 445 final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2); 446 if (ptNode2 == null) { 447 add(getCodePoints(word2), 0, null, false /* isNotAWord */, 448 false /* isBlacklistEntry */); 449 // The PtNode for the first word may have moved by the above insertion, 450 // if word1 and word2 share a common stem that happens not to have been 451 // a cutting point until now. In this case, we need to refresh ptNode. 452 ptNode = findWordInTree(mRootNodeArray, word1); 453 } 454 ptNode.addBigram(word2, frequency); 455 } else { 456 throw new RuntimeException("First word of bigram not found"); 457 } 458 } 459 460 /** 461 * Add a word to this dictionary. 462 * 463 * The shortcuts, if any, have to be in the dictionary already. If they aren't, 464 * an exception is thrown. 465 * 466 * @param word the word, as an int array. 467 * @param frequency the frequency of the word, in the range [0..255]. 468 * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 469 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 470 * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 471 */ 472 private void add(final int[] word, final int frequency, 473 final ArrayList<WeightedString> shortcutTargets, 474 final boolean isNotAWord, final boolean isBlacklistEntry) { 475 assert(frequency >= 0 && frequency <= 255); 476 if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { 477 MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 478 return; 479 } 480 481 PtNodeArray currentNodeArray = mRootNodeArray; 482 int charIndex = 0; 483 484 PtNode currentPtNode = null; 485 int differentCharIndex = 0; // Set by the loop to the index of the char that differs 486 int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); 487 while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 488 currentPtNode = currentNodeArray.mData.get(nodeIndex); 489 differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); 490 if (ARRAYS_ARE_EQUAL != differentCharIndex 491 && differentCharIndex < currentPtNode.mChars.length) break; 492 if (null == currentPtNode.mChildren) break; 493 charIndex += currentPtNode.mChars.length; 494 if (charIndex >= word.length) break; 495 currentNodeArray = currentPtNode.mChildren; 496 nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); 497 } 498 499 if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 500 // No node at this point to accept the word. Create one. 501 final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); 502 final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), 503 shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 504 currentNodeArray.mData.add(insertionIndex, newPtNode); 505 if (DBG) checkStack(currentNodeArray); 506 } else { 507 // There is a word with a common prefix. 508 if (differentCharIndex == currentPtNode.mChars.length) { 509 if (charIndex + differentCharIndex >= word.length) { 510 // The new word is a prefix of an existing word, but the node on which it 511 // should end already exists as is. Since the old PtNode was not a terminal, 512 // make it one by filling in its frequency and other attributes 513 currentPtNode.update(frequency, shortcutTargets, null, isNotAWord, 514 isBlacklistEntry); 515 } else { 516 // The new word matches the full old word and extends past it. 517 // We only have to create a new node and add it to the end of this. 518 final PtNode newNode = new PtNode( 519 Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 520 shortcutTargets, null /* bigrams */, frequency, isNotAWord, 521 isBlacklistEntry); 522 currentPtNode.mChildren = new PtNodeArray(); 523 currentPtNode.mChildren.mData.add(newNode); 524 } 525 } else { 526 if (0 == differentCharIndex) { 527 // Exact same word. Update the frequency if higher. This will also add the 528 // new shortcuts to the existing shortcut list if it already exists. 529 currentPtNode.update(frequency, shortcutTargets, null, 530 currentPtNode.mIsNotAWord && isNotAWord, 531 currentPtNode.mIsBlacklistEntry || isBlacklistEntry); 532 } else { 533 // Partial prefix match only. We have to replace the current node with a node 534 // containing the current prefix and create two new ones for the tails. 535 PtNodeArray newChildren = new PtNodeArray(); 536 final PtNode newOldWord = new PtNode( 537 Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, 538 currentPtNode.mChars.length), currentPtNode.mShortcutTargets, 539 currentPtNode.mBigrams, currentPtNode.mFrequency, 540 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, 541 currentPtNode.mChildren); 542 newChildren.mData.add(newOldWord); 543 544 final PtNode newParent; 545 if (charIndex + differentCharIndex >= word.length) { 546 newParent = new PtNode( 547 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 548 shortcutTargets, null /* bigrams */, frequency, 549 isNotAWord, isBlacklistEntry, newChildren); 550 } else { 551 newParent = new PtNode( 552 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 553 null /* shortcutTargets */, null /* bigrams */, -1, 554 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 555 final PtNode newWord = new PtNode(Arrays.copyOfRange(word, 556 charIndex + differentCharIndex, word.length), 557 shortcutTargets, null /* bigrams */, frequency, 558 isNotAWord, isBlacklistEntry); 559 final int addIndex = word[charIndex + differentCharIndex] 560 > currentPtNode.mChars[differentCharIndex] ? 1 : 0; 561 newChildren.mData.add(addIndex, newWord); 562 } 563 currentNodeArray.mData.set(nodeIndex, newParent); 564 } 565 if (DBG) checkStack(currentNodeArray); 566 } 567 } 568 } 569 570 private static int ARRAYS_ARE_EQUAL = 0; 571 572 /** 573 * Custom comparison of two int arrays taken to contain character codes. 574 * 575 * This method compares the two arrays passed as an argument in a lexicographic way, 576 * with an offset in the dst string. 577 * This method does NOT test for the first character. It is taken to be equal. 578 * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 579 * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 580 * strings are equal. This works BECAUSE we don't look at the first character. 581 * 582 * @param src the left-hand side string of the comparison. 583 * @param dst the right-hand side string of the comparison. 584 * @param dstOffset the offset in the right-hand side string. 585 * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 586 */ 587 private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { 588 // We do NOT test the first char, because we come from a method that already 589 // tested it. 590 for (int i = 1; i < src.length; ++i) { 591 if (dstOffset + i >= dst.length) return i; 592 if (src[i] != dst[dstOffset + i]) return i; 593 } 594 if (dst.length > src.length) return src.length; 595 return ARRAYS_ARE_EQUAL; 596 } 597 598 /** 599 * Helper class that compares and sorts two PtNodes according to their 600 * first element only. I repeat: ONLY the first element is considered, the rest 601 * is ignored. 602 * This comparator imposes orderings that are inconsistent with equals. 603 */ 604 static private final class PtNodeComparator implements java.util.Comparator<PtNode> { 605 @Override 606 public int compare(PtNode p1, PtNode p2) { 607 if (p1.mChars[0] == p2.mChars[0]) return 0; 608 return p1.mChars[0] < p2.mChars[0] ? -1 : 1; 609 } 610 } 611 final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); 612 613 /** 614 * Finds the insertion index of a character within a node array. 615 */ 616 private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { 617 final ArrayList<PtNode> data = nodeArray.mData; 618 final PtNode reference = new PtNode(new int[] { character }, 619 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 620 false /* isBlacklistEntry */); 621 int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); 622 return result >= 0 ? result : -result - 1; 623 } 624 625 /** 626 * Find the index of a char in a node array, if it exists. 627 * 628 * @param nodeArray the node array to search in. 629 * @param character the character to search for. 630 * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 631 */ 632 private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { 633 final int insertionIndex = findInsertionIndex(nodeArray, character); 634 if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 635 return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex 636 : CHARACTER_NOT_FOUND_INDEX; 637 } 638 639 /** 640 * Helper method to find a word in a given branch. 641 */ 642 @SuppressWarnings("unused") 643 public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { 644 int index = 0; 645 final StringBuilder checker = DBG ? new StringBuilder() : null; 646 final int[] codePoints = getCodePoints(string); 647 648 PtNode currentPtNode; 649 do { 650 int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); 651 if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 652 currentPtNode = nodeArray.mData.get(indexOfGroup); 653 654 if (codePoints.length - index < currentPtNode.mChars.length) return null; 655 int newIndex = index; 656 while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { 657 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; 658 newIndex++; 659 } 660 index = newIndex; 661 662 if (DBG) { 663 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); 664 } 665 if (index < codePoints.length) { 666 nodeArray = currentPtNode.mChildren; 667 } 668 } while (null != nodeArray && index < codePoints.length); 669 670 if (index < codePoints.length) return null; 671 if (!currentPtNode.isTerminal()) return null; 672 if (DBG && !string.equals(checker.toString())) return null; 673 return currentPtNode; 674 } 675 676 /** 677 * Helper method to find out whether a word is in the dict or not. 678 */ 679 public boolean hasWord(final String s) { 680 if (null == s || "".equals(s)) { 681 throw new RuntimeException("Can't search for a null or empty string"); 682 } 683 return null != findWordInTree(mRootNodeArray, s); 684 } 685 686 /** 687 * Recursively count the number of PtNodes in a given branch of the trie. 688 * 689 * @param nodeArray the parent node. 690 * @return the number of PtNodes in all the branch under this node. 691 */ 692 public static int countPtNodes(final PtNodeArray nodeArray) { 693 final int nodeSize = nodeArray.mData.size(); 694 int size = nodeSize; 695 for (int i = nodeSize - 1; i >= 0; --i) { 696 PtNode ptNode = nodeArray.mData.get(i); 697 if (null != ptNode.mChildren) 698 size += countPtNodes(ptNode.mChildren); 699 } 700 return size; 701 } 702 703 /** 704 * Recursively count the number of nodes in a given branch of the trie. 705 * 706 * @param nodeArray the node array to count. 707 * @return the number of nodes in this branch. 708 */ 709 public static int countNodeArrays(final PtNodeArray nodeArray) { 710 int size = 1; 711 for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { 712 PtNode ptNode = nodeArray.mData.get(i); 713 if (null != ptNode.mChildren) 714 size += countNodeArrays(ptNode.mChildren); 715 } 716 return size; 717 } 718 719 // Recursively find out whether there are any bigrams. 720 // This can be pretty expensive especially if there aren't any (we return as soon 721 // as we find one, so it's much cheaper if there are bigrams) 722 private static boolean hasBigramsInternal(final PtNodeArray nodeArray) { 723 if (null == nodeArray) return false; 724 for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { 725 PtNode ptNode = nodeArray.mData.get(i); 726 if (null != ptNode.mBigrams) return true; 727 if (hasBigramsInternal(ptNode.mChildren)) return true; 728 } 729 return false; 730 } 731 732 /** 733 * Finds out whether there are any bigrams in this dictionary. 734 * 735 * @return true if there is any bigram, false otherwise. 736 */ 737 // TODO: this is expensive especially for large dictionaries without any bigram. 738 // The up side is, this is always accurate and correct and uses no memory. We should 739 // find a more efficient way of doing this, without compromising too much on memory 740 // and ease of use. 741 public boolean hasBigrams() { 742 return hasBigramsInternal(mRootNodeArray); 743 } 744 745 // Historically, the tails of the words were going to be merged to save space. 746 // However, that would prevent the code to search for a specific address in log(n) 747 // time so this was abandoned. 748 // The code is still of interest as it does add some compression to any dictionary 749 // that has no need for attributes. Implementations that does not read attributes should be 750 // able to read a dictionary with merged tails. 751 // Also, the following code does support frequencies, as in, it will only merges 752 // tails that share the same frequency. Though it would result in the above loss of 753 // performance while searching by address, it is still technically possible to merge 754 // tails that contain attributes, but this code does not take that into account - it does 755 // not compare attributes and will merge terminals with different attributes regardless. 756 public void mergeTails() { 757 MakedictLog.i("Do not merge tails"); 758 return; 759 760 // MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root)); 761 // MakedictLog.i("Number of PtNodes : " + countPtNodes(root)); 762 // 763 // final HashMap<String, ArrayList<PtNodeArray>> repository = 764 // new HashMap<String, ArrayList<PtNodeArray>>(); 765 // mergeTailsInner(repository, root); 766 // 767 // MakedictLog.i("Number of different pseudohashes : " + repository.size()); 768 // int size = 0; 769 // for (ArrayList<PtNodeArray> a : repository.values()) { 770 // size += a.size(); 771 // } 772 // MakedictLog.i("Number of nodes after merge : " + (1 + size)); 773 // MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 774 } 775 776 // The following methods are used by the deactivated mergeTails() 777 // private static boolean isEqual(PtNodeArray a, PtNodeArray b) { 778 // if (null == a && null == b) return true; 779 // if (null == a || null == b) return false; 780 // if (a.data.size() != b.data.size()) return false; 781 // final int size = a.data.size(); 782 // for (int i = size - 1; i >= 0; --i) { 783 // PtNode aPtNode = a.data.get(i); 784 // PtNode bPtNode = b.data.get(i); 785 // if (aPtNode.frequency != bPtNode.frequency) return false; 786 // if (aPtNode.alternates == null && bPtNode.alternates != null) return false; 787 // if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false; 788 // if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false; 789 // if (!isEqual(aPtNode.children, bPtNode.children)) return false; 790 // } 791 // return true; 792 // } 793 794 // static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner( 795 // final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) { 796 // final ArrayList<PtNode> branches = nodeArray.data; 797 // final int nodeSize = branches.size(); 798 // for (int i = 0; i < nodeSize; ++i) { 799 // PtNode ptNode = branches.get(i); 800 // if (null != ptNode.children) { 801 // String pseudoHash = getPseudoHash(ptNode.children); 802 // ArrayList<PtNodeArray> similarList = map.get(pseudoHash); 803 // if (null == similarList) { 804 // similarList = new ArrayList<PtNodeArray>(); 805 // map.put(pseudoHash, similarList); 806 // } 807 // boolean merged = false; 808 // for (PtNodeArray similar : similarList) { 809 // if (isEqual(ptNode.children, similar)) { 810 // ptNode.children = similar; 811 // merged = true; 812 // break; 813 // } 814 // } 815 // if (!merged) { 816 // similarList.add(ptNode.children); 817 // } 818 // mergeTailsInner(map, ptNode.children); 819 // } 820 // } 821 // return map; 822 // } 823 824 // private static String getPseudoHash(final PtNodeArray nodeArray) { 825 // StringBuilder s = new StringBuilder(); 826 // for (PtNode ptNode : nodeArray.data) { 827 // s.append(ptNode.frequency); 828 // for (int ch : ptNode.chars) { 829 // s.append(Character.toChars(ch)); 830 // } 831 // } 832 // return s.toString(); 833 // } 834 835 /** 836 * Iterator to walk through a dictionary. 837 * 838 * This is purely for convenience. 839 */ 840 public static final class DictionaryIterator implements Iterator<Word> { 841 private static final class Position { 842 public Iterator<PtNode> pos; 843 public int length; 844 public Position(ArrayList<PtNode> ptNodes) { 845 pos = ptNodes.iterator(); 846 length = 0; 847 } 848 } 849 final StringBuilder mCurrentString; 850 final LinkedList<Position> mPositions; 851 852 public DictionaryIterator(ArrayList<PtNode> ptRoot) { 853 mCurrentString = new StringBuilder(); 854 mPositions = new LinkedList<Position>(); 855 final Position rootPos = new Position(ptRoot); 856 mPositions.add(rootPos); 857 } 858 859 @Override 860 public boolean hasNext() { 861 for (Position p : mPositions) { 862 if (p.pos.hasNext()) { 863 return true; 864 } 865 } 866 return false; 867 } 868 869 @Override 870 public Word next() { 871 Position currentPos = mPositions.getLast(); 872 mCurrentString.setLength(currentPos.length); 873 874 do { 875 if (currentPos.pos.hasNext()) { 876 final PtNode currentPtNode = currentPos.pos.next(); 877 currentPos.length = mCurrentString.length(); 878 for (int i : currentPtNode.mChars) { 879 mCurrentString.append(Character.toChars(i)); 880 } 881 if (null != currentPtNode.mChildren) { 882 currentPos = new Position(currentPtNode.mChildren.mData); 883 currentPos.length = mCurrentString.length(); 884 mPositions.addLast(currentPos); 885 } 886 if (currentPtNode.mFrequency >= 0) { 887 return new Word(mCurrentString.toString(), currentPtNode.mFrequency, 888 currentPtNode.mShortcutTargets, currentPtNode.mBigrams, 889 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); 890 } 891 } else { 892 mPositions.removeLast(); 893 currentPos = mPositions.getLast(); 894 mCurrentString.setLength(mPositions.getLast().length); 895 } 896 } while (true); 897 } 898 899 @Override 900 public void remove() { 901 throw new UnsupportedOperationException("Unsupported yet"); 902 } 903 904 } 905 906 /** 907 * Method to return an iterator. 908 * 909 * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 910 * and say : for (Word w : x) {} 911 */ 912 @Override 913 public Iterator<Word> iterator() { 914 return new DictionaryIterator(mRootNodeArray.mData); 915 } 916 } 917