1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 22 import java.util.ArrayList; 23 import java.util.Arrays; 24 import java.util.Collections; 25 import java.util.Date; 26 import java.util.HashMap; 27 import java.util.Iterator; 28 import java.util.LinkedList; 29 30 /** 31 * A dictionary that can fusion heads and tails of words for more compression. 32 */ 33 @UsedForTesting 34 public final class FusionDictionary implements Iterable<Word> { 35 private static final boolean DBG = MakedictLog.DBG; 36 37 /** 38 * A node of the dictionary, containing several CharGroups. 39 * 40 * A node is but an ordered array of CharGroups, which essentially contain all the 41 * real information. 42 * This class also contains fields to cache size and address, to help with binary 43 * generation. 44 */ 45 public static final class Node { 46 ArrayList<CharGroup> mData; 47 // To help with binary generation 48 int mCachedSize = Integer.MIN_VALUE; 49 int mCachedAddress = Integer.MIN_VALUE; 50 int mCachedParentAddress = 0; 51 52 public Node() { 53 mData = new ArrayList<CharGroup>(); 54 } 55 public Node(ArrayList<CharGroup> data) { 56 mData = data; 57 } 58 } 59 60 /** 61 * A string with a frequency. 62 * 63 * This represents an "attribute", that is either a bigram or a shortcut. 64 */ 65 public static final class WeightedString { 66 public final String mWord; 67 public int mFrequency; 68 public WeightedString(String word, int frequency) { 69 mWord = word; 70 mFrequency = frequency; 71 } 72 73 @Override 74 public int hashCode() { 75 return Arrays.hashCode(new Object[] { mWord, mFrequency }); 76 } 77 78 @Override 79 public boolean equals(Object o) { 80 if (o == this) return true; 81 if (!(o instanceof WeightedString)) return false; 82 WeightedString w = (WeightedString)o; 83 return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 84 } 85 } 86 87 /** 88 * A group of characters, with a frequency, shortcut targets, bigrams, and children. 89 * 90 * This is the central class of the in-memory representation. A CharGroup is what can 91 * be seen as a traditional "trie node", except it can hold several characters at the 92 * same time. A CharGroup essentially represents one or several characters in the middle 93 * of the trie trie; as such, it can be a terminal, and it can have children. 94 * In this in-memory representation, whether the CharGroup is a terminal or not is represented 95 * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 96 * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 97 * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 98 */ 99 public static final class CharGroup { 100 public static final int NOT_A_TERMINAL = -1; 101 final int mChars[]; 102 ArrayList<WeightedString> mShortcutTargets; 103 ArrayList<WeightedString> mBigrams; 104 int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 105 Node mChildren; 106 boolean mIsNotAWord; // Only a shortcut 107 boolean mIsBlacklistEntry; 108 // The two following members to help with binary generation 109 int mCachedSize; 110 int mCachedAddress; 111 112 public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 113 final ArrayList<WeightedString> bigrams, final int frequency, 114 final boolean isNotAWord, final boolean isBlacklistEntry) { 115 mChars = chars; 116 mFrequency = frequency; 117 mShortcutTargets = shortcutTargets; 118 mBigrams = bigrams; 119 mChildren = null; 120 mIsNotAWord = isNotAWord; 121 mIsBlacklistEntry = isBlacklistEntry; 122 } 123 124 public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 125 final ArrayList<WeightedString> bigrams, final int frequency, 126 final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) { 127 mChars = chars; 128 mFrequency = frequency; 129 mShortcutTargets = shortcutTargets; 130 mBigrams = bigrams; 131 mChildren = children; 132 mIsNotAWord = isNotAWord; 133 mIsBlacklistEntry = isBlacklistEntry; 134 } 135 136 public void addChild(CharGroup n) { 137 if (null == mChildren) { 138 mChildren = new Node(); 139 } 140 mChildren.mData.add(n); 141 } 142 143 public boolean isTerminal() { 144 return NOT_A_TERMINAL != mFrequency; 145 } 146 147 public int getFrequency() { 148 return mFrequency; 149 } 150 151 public boolean getIsNotAWord() { 152 return mIsNotAWord; 153 } 154 155 public boolean getIsBlacklistEntry() { 156 return mIsBlacklistEntry; 157 } 158 159 public ArrayList<WeightedString> getShortcutTargets() { 160 // We don't want write permission to escape outside the package, so we return a copy 161 if (null == mShortcutTargets) return null; 162 final ArrayList<WeightedString> copyOfShortcutTargets = 163 new ArrayList<WeightedString>(mShortcutTargets); 164 return copyOfShortcutTargets; 165 } 166 167 public ArrayList<WeightedString> getBigrams() { 168 // We don't want write permission to escape outside the package, so we return a copy 169 if (null == mBigrams) return null; 170 final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); 171 return copyOfBigrams; 172 } 173 174 public boolean hasSeveralChars() { 175 assert(mChars.length > 0); 176 return 1 < mChars.length; 177 } 178 179 /** 180 * Adds a word to the bigram list. Updates the frequency if the word already 181 * exists. 182 */ 183 public void addBigram(final String word, final int frequency) { 184 if (mBigrams == null) { 185 mBigrams = new ArrayList<WeightedString>(); 186 } 187 WeightedString bigram = getBigram(word); 188 if (bigram != null) { 189 bigram.mFrequency = frequency; 190 } else { 191 bigram = new WeightedString(word, frequency); 192 mBigrams.add(bigram); 193 } 194 } 195 196 /** 197 * Gets the shortcut target for the given word. Returns null if the word is not in the 198 * shortcut list. 199 */ 200 public WeightedString getShortcut(final String word) { 201 // TODO: Don't do a linear search 202 if (mShortcutTargets != null) { 203 final int size = mShortcutTargets.size(); 204 for (int i = 0; i < size; ++i) { 205 WeightedString shortcut = mShortcutTargets.get(i); 206 if (shortcut.mWord.equals(word)) { 207 return shortcut; 208 } 209 } 210 } 211 return null; 212 } 213 214 /** 215 * Gets the bigram for the given word. 216 * Returns null if the word is not in the bigrams list. 217 */ 218 public WeightedString getBigram(final String word) { 219 // TODO: Don't do a linear search 220 if (mBigrams != null) { 221 final int size = mBigrams.size(); 222 for (int i = 0; i < size; ++i) { 223 WeightedString bigram = mBigrams.get(i); 224 if (bigram.mWord.equals(word)) { 225 return bigram; 226 } 227 } 228 } 229 return null; 230 } 231 232 /** 233 * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to 234 * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 235 * updated if they are higher than the existing ones. 236 */ 237 public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 238 final ArrayList<WeightedString> bigrams, 239 final boolean isNotAWord, final boolean isBlacklistEntry) { 240 if (frequency > mFrequency) { 241 mFrequency = frequency; 242 } 243 if (shortcutTargets != null) { 244 if (mShortcutTargets == null) { 245 mShortcutTargets = shortcutTargets; 246 } else { 247 final int size = shortcutTargets.size(); 248 for (int i = 0; i < size; ++i) { 249 final WeightedString shortcut = shortcutTargets.get(i); 250 final WeightedString existingShortcut = getShortcut(shortcut.mWord); 251 if (existingShortcut == null) { 252 mShortcutTargets.add(shortcut); 253 } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 254 existingShortcut.mFrequency = shortcut.mFrequency; 255 } 256 } 257 } 258 } 259 if (bigrams != null) { 260 if (mBigrams == null) { 261 mBigrams = bigrams; 262 } else { 263 final int size = bigrams.size(); 264 for (int i = 0; i < size; ++i) { 265 final WeightedString bigram = bigrams.get(i); 266 final WeightedString existingBigram = getBigram(bigram.mWord); 267 if (existingBigram == null) { 268 mBigrams.add(bigram); 269 } else if (existingBigram.mFrequency < bigram.mFrequency) { 270 existingBigram.mFrequency = bigram.mFrequency; 271 } 272 } 273 } 274 } 275 mIsNotAWord = isNotAWord; 276 mIsBlacklistEntry = isBlacklistEntry; 277 } 278 } 279 280 /** 281 * Options global to the dictionary. 282 */ 283 public static final class DictionaryOptions { 284 public final boolean mGermanUmlautProcessing; 285 public final boolean mFrenchLigatureProcessing; 286 public final HashMap<String, String> mAttributes; 287 public DictionaryOptions(final HashMap<String, String> attributes, 288 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 289 mAttributes = attributes; 290 mGermanUmlautProcessing = germanUmlautProcessing; 291 mFrenchLigatureProcessing = frenchLigatureProcessing; 292 } 293 @Override 294 public String toString() { // Convenience method 295 return toString(0, false); 296 } 297 public String toString(final int indentCount, final boolean plumbing) { 298 final StringBuilder indent = new StringBuilder(); 299 if (plumbing) { 300 indent.append("H:"); 301 } else { 302 for (int i = 0; i < indentCount; ++i) { 303 indent.append(" "); 304 } 305 } 306 final StringBuilder s = new StringBuilder(); 307 for (final String optionKey : mAttributes.keySet()) { 308 s.append(indent); 309 s.append(optionKey); 310 s.append(" = "); 311 if ("date".equals(optionKey) && !plumbing) { 312 // Date needs a number of milliseconds, but the dictionary contains seconds 313 s.append(new Date( 314 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); 315 } else { 316 s.append(mAttributes.get(optionKey)); 317 } 318 s.append("\n"); 319 } 320 if (mGermanUmlautProcessing) { 321 s.append(indent); 322 s.append("Needs German umlaut processing\n"); 323 } 324 if (mFrenchLigatureProcessing) { 325 s.append(indent); 326 s.append("Needs French ligature processing\n"); 327 } 328 return s.toString(); 329 } 330 } 331 332 public final DictionaryOptions mOptions; 333 public final Node mRoot; 334 335 public FusionDictionary(final Node root, final DictionaryOptions options) { 336 mRoot = root; 337 mOptions = options; 338 } 339 340 public void addOptionAttribute(final String key, final String value) { 341 mOptions.mAttributes.put(key, value); 342 } 343 344 /** 345 * Helper method to convert a String to an int array. 346 */ 347 static int[] getCodePoints(final String word) { 348 // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray, 349 // which is not visible from the makedict package. Factor this code. 350 final char[] characters = word.toCharArray(); 351 final int length = characters.length; 352 final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 353 int codePoint = Character.codePointAt(characters, 0); 354 int dsti = 0; 355 for (int srci = Character.charCount(codePoint); 356 srci < length; srci += Character.charCount(codePoint), ++dsti) { 357 codePoints[dsti] = codePoint; 358 codePoint = Character.codePointAt(characters, srci); 359 } 360 codePoints[dsti] = codePoint; 361 return codePoints; 362 } 363 364 /** 365 * Helper method to add a word as a string. 366 * 367 * This method adds a word to the dictionary with the given frequency. Optional 368 * lists of bigrams and shortcuts can be passed here. For each word inside, 369 * they will be added to the dictionary as necessary. 370 * 371 * @param word the word to add. 372 * @param frequency the frequency of the word, in the range [0..255]. 373 * @param shortcutTargets a list of shortcut targets for this word, or null. 374 * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 375 */ 376 public void add(final String word, final int frequency, 377 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 378 add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 379 false /* isBlacklistEntry */); 380 } 381 382 /** 383 * Helper method to add a blacklist entry as a string. 384 * 385 * @param word the word to add as a blacklist entry. 386 * @param shortcutTargets a list of shortcut targets for this word, or null. 387 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 388 */ 389 public void addBlacklistEntry(final String word, 390 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 391 add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 392 } 393 394 /** 395 * Sanity check for a node. 396 * 397 * This method checks that all CharGroups in a node are ordered as expected. 398 * If they are, nothing happens. If they aren't, an exception is thrown. 399 */ 400 private void checkStack(Node node) { 401 ArrayList<CharGroup> stack = node.mData; 402 int lastValue = -1; 403 for (int i = 0; i < stack.size(); ++i) { 404 int currentValue = stack.get(i).mChars[0]; 405 if (currentValue <= lastValue) 406 throw new RuntimeException("Invalid stack"); 407 else 408 lastValue = currentValue; 409 } 410 } 411 412 /** 413 * Helper method to add a new bigram to the dictionary. 414 * 415 * @param word1 the previous word of the context 416 * @param word2 the next word of the context 417 * @param frequency the bigram frequency 418 */ 419 public void setBigram(final String word1, final String word2, final int frequency) { 420 CharGroup charGroup = findWordInTree(mRoot, word1); 421 if (charGroup != null) { 422 final CharGroup charGroup2 = findWordInTree(mRoot, word2); 423 if (charGroup2 == null) { 424 add(getCodePoints(word2), 0, null, false /* isNotAWord */, 425 false /* isBlacklistEntry */); 426 // The chargroup for the first word may have moved by the above insertion, 427 // if word1 and word2 share a common stem that happens not to have been 428 // a cutting point until now. In this case, we need to refresh charGroup. 429 charGroup = findWordInTree(mRoot, word1); 430 } 431 charGroup.addBigram(word2, frequency); 432 } else { 433 throw new RuntimeException("First word of bigram not found"); 434 } 435 } 436 437 /** 438 * Add a word to this dictionary. 439 * 440 * The shortcuts, if any, have to be in the dictionary already. If they aren't, 441 * an exception is thrown. 442 * 443 * @param word the word, as an int array. 444 * @param frequency the frequency of the word, in the range [0..255]. 445 * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 446 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 447 * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 448 */ 449 private void add(final int[] word, final int frequency, 450 final ArrayList<WeightedString> shortcutTargets, 451 final boolean isNotAWord, final boolean isBlacklistEntry) { 452 assert(frequency >= 0 && frequency <= 255); 453 if (word.length >= Constants.Dictionary.MAX_WORD_LENGTH) { 454 MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 455 return; 456 } 457 458 Node currentNode = mRoot; 459 int charIndex = 0; 460 461 CharGroup currentGroup = null; 462 int differentCharIndex = 0; // Set by the loop to the index of the char that differs 463 int nodeIndex = findIndexOfChar(mRoot, word[charIndex]); 464 while (CHARACTER_NOT_FOUND != nodeIndex) { 465 currentGroup = currentNode.mData.get(nodeIndex); 466 differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex); 467 if (ARRAYS_ARE_EQUAL != differentCharIndex 468 && differentCharIndex < currentGroup.mChars.length) break; 469 if (null == currentGroup.mChildren) break; 470 charIndex += currentGroup.mChars.length; 471 if (charIndex >= word.length) break; 472 currentNode = currentGroup.mChildren; 473 nodeIndex = findIndexOfChar(currentNode, word[charIndex]); 474 } 475 476 if (-1 == nodeIndex) { 477 // No node at this point to accept the word. Create one. 478 final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); 479 final CharGroup newGroup = new CharGroup( 480 Arrays.copyOfRange(word, charIndex, word.length), 481 shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 482 currentNode.mData.add(insertionIndex, newGroup); 483 if (DBG) checkStack(currentNode); 484 } else { 485 // There is a word with a common prefix. 486 if (differentCharIndex == currentGroup.mChars.length) { 487 if (charIndex + differentCharIndex >= word.length) { 488 // The new word is a prefix of an existing word, but the node on which it 489 // should end already exists as is. Since the old CharNode was not a terminal, 490 // make it one by filling in its frequency and other attributes 491 currentGroup.update(frequency, shortcutTargets, null, isNotAWord, 492 isBlacklistEntry); 493 } else { 494 // The new word matches the full old word and extends past it. 495 // We only have to create a new node and add it to the end of this. 496 final CharGroup newNode = new CharGroup( 497 Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 498 shortcutTargets, null /* bigrams */, frequency, isNotAWord, 499 isBlacklistEntry); 500 currentGroup.mChildren = new Node(); 501 currentGroup.mChildren.mData.add(newNode); 502 } 503 } else { 504 if (0 == differentCharIndex) { 505 // Exact same word. Update the frequency if higher. This will also add the 506 // new shortcuts to the existing shortcut list if it already exists. 507 currentGroup.update(frequency, shortcutTargets, null, 508 currentGroup.mIsNotAWord && isNotAWord, 509 currentGroup.mIsBlacklistEntry || isBlacklistEntry); 510 } else { 511 // Partial prefix match only. We have to replace the current node with a node 512 // containing the current prefix and create two new ones for the tails. 513 Node newChildren = new Node(); 514 final CharGroup newOldWord = new CharGroup( 515 Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, 516 currentGroup.mChars.length), currentGroup.mShortcutTargets, 517 currentGroup.mBigrams, currentGroup.mFrequency, 518 currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry, 519 currentGroup.mChildren); 520 newChildren.mData.add(newOldWord); 521 522 final CharGroup newParent; 523 if (charIndex + differentCharIndex >= word.length) { 524 newParent = new CharGroup( 525 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 526 shortcutTargets, null /* bigrams */, frequency, 527 isNotAWord, isBlacklistEntry, newChildren); 528 } else { 529 newParent = new CharGroup( 530 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 531 null /* shortcutTargets */, null /* bigrams */, -1, 532 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 533 final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, 534 charIndex + differentCharIndex, word.length), 535 shortcutTargets, null /* bigrams */, frequency, 536 isNotAWord, isBlacklistEntry); 537 final int addIndex = word[charIndex + differentCharIndex] 538 > currentGroup.mChars[differentCharIndex] ? 1 : 0; 539 newChildren.mData.add(addIndex, newWord); 540 } 541 currentNode.mData.set(nodeIndex, newParent); 542 } 543 if (DBG) checkStack(currentNode); 544 } 545 } 546 } 547 548 private static int ARRAYS_ARE_EQUAL = 0; 549 550 /** 551 * Custom comparison of two int arrays taken to contain character codes. 552 * 553 * This method compares the two arrays passed as an argument in a lexicographic way, 554 * with an offset in the dst string. 555 * This method does NOT test for the first character. It is taken to be equal. 556 * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 557 * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 558 * strings are equal. This works BECAUSE we don't look at the first character. 559 * 560 * @param src the left-hand side string of the comparison. 561 * @param dst the right-hand side string of the comparison. 562 * @param dstOffset the offset in the right-hand side string. 563 * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 564 */ 565 private static int compareArrays(final int[] src, final int[] dst, int dstOffset) { 566 // We do NOT test the first char, because we come from a method that already 567 // tested it. 568 for (int i = 1; i < src.length; ++i) { 569 if (dstOffset + i >= dst.length) return i; 570 if (src[i] != dst[dstOffset + i]) return i; 571 } 572 if (dst.length > src.length) return src.length; 573 return ARRAYS_ARE_EQUAL; 574 } 575 576 /** 577 * Helper class that compares and sorts two chargroups according to their 578 * first element only. I repeat: ONLY the first element is considered, the rest 579 * is ignored. 580 * This comparator imposes orderings that are inconsistent with equals. 581 */ 582 static private final class CharGroupComparator implements java.util.Comparator<CharGroup> { 583 @Override 584 public int compare(CharGroup c1, CharGroup c2) { 585 if (c1.mChars[0] == c2.mChars[0]) return 0; 586 return c1.mChars[0] < c2.mChars[0] ? -1 : 1; 587 } 588 } 589 final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator(); 590 591 /** 592 * Finds the insertion index of a character within a node. 593 */ 594 private static int findInsertionIndex(final Node node, int character) { 595 final ArrayList<CharGroup> data = node.mData; 596 final CharGroup reference = new CharGroup(new int[] { character }, 597 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 598 false /* isBlacklistEntry */); 599 int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); 600 return result >= 0 ? result : -result - 1; 601 } 602 603 private static int CHARACTER_NOT_FOUND = -1; 604 605 /** 606 * Find the index of a char in a node, if it exists. 607 * 608 * @param node the node to search in. 609 * @param character the character to search for. 610 * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else. 611 */ 612 private static int findIndexOfChar(final Node node, int character) { 613 final int insertionIndex = findInsertionIndex(node, character); 614 if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND; 615 return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex 616 : CHARACTER_NOT_FOUND; 617 } 618 619 /** 620 * Helper method to find a word in a given branch. 621 */ 622 @SuppressWarnings("unused") 623 public static CharGroup findWordInTree(Node node, final String string) { 624 int index = 0; 625 final StringBuilder checker = DBG ? new StringBuilder() : null; 626 final int[] codePoints = getCodePoints(string); 627 628 CharGroup currentGroup; 629 do { 630 int indexOfGroup = findIndexOfChar(node, codePoints[index]); 631 if (CHARACTER_NOT_FOUND == indexOfGroup) return null; 632 currentGroup = node.mData.get(indexOfGroup); 633 634 if (codePoints.length - index < currentGroup.mChars.length) return null; 635 int newIndex = index; 636 while (newIndex < codePoints.length && newIndex - index < currentGroup.mChars.length) { 637 if (currentGroup.mChars[newIndex - index] != codePoints[newIndex]) return null; 638 newIndex++; 639 } 640 index = newIndex; 641 642 if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length)); 643 if (index < codePoints.length) { 644 node = currentGroup.mChildren; 645 } 646 } while (null != node && index < codePoints.length); 647 648 if (index < codePoints.length) return null; 649 if (!currentGroup.isTerminal()) return null; 650 if (DBG && !string.equals(checker.toString())) return null; 651 return currentGroup; 652 } 653 654 /** 655 * Helper method to find out whether a word is in the dict or not. 656 */ 657 public boolean hasWord(final String s) { 658 if (null == s || "".equals(s)) { 659 throw new RuntimeException("Can't search for a null or empty string"); 660 } 661 return null != findWordInTree(mRoot, s); 662 } 663 664 /** 665 * Recursively count the number of character groups in a given branch of the trie. 666 * 667 * @param node the parent node. 668 * @return the number of char groups in all the branch under this node. 669 */ 670 public static int countCharGroups(final Node node) { 671 final int nodeSize = node.mData.size(); 672 int size = nodeSize; 673 for (int i = nodeSize - 1; i >= 0; --i) { 674 CharGroup group = node.mData.get(i); 675 if (null != group.mChildren) 676 size += countCharGroups(group.mChildren); 677 } 678 return size; 679 } 680 681 /** 682 * Recursively count the number of nodes in a given branch of the trie. 683 * 684 * @param node the node to count. 685 * @return the number of nodes in this branch. 686 */ 687 public static int countNodes(final Node node) { 688 int size = 1; 689 for (int i = node.mData.size() - 1; i >= 0; --i) { 690 CharGroup group = node.mData.get(i); 691 if (null != group.mChildren) 692 size += countNodes(group.mChildren); 693 } 694 return size; 695 } 696 697 // Recursively find out whether there are any bigrams. 698 // This can be pretty expensive especially if there aren't any (we return as soon 699 // as we find one, so it's much cheaper if there are bigrams) 700 private static boolean hasBigramsInternal(final Node node) { 701 if (null == node) return false; 702 for (int i = node.mData.size() - 1; i >= 0; --i) { 703 CharGroup group = node.mData.get(i); 704 if (null != group.mBigrams) return true; 705 if (hasBigramsInternal(group.mChildren)) return true; 706 } 707 return false; 708 } 709 710 /** 711 * Finds out whether there are any bigrams in this dictionary. 712 * 713 * @return true if there is any bigram, false otherwise. 714 */ 715 // TODO: this is expensive especially for large dictionaries without any bigram. 716 // The up side is, this is always accurate and correct and uses no memory. We should 717 // find a more efficient way of doing this, without compromising too much on memory 718 // and ease of use. 719 public boolean hasBigrams() { 720 return hasBigramsInternal(mRoot); 721 } 722 723 // Historically, the tails of the words were going to be merged to save space. 724 // However, that would prevent the code to search for a specific address in log(n) 725 // time so this was abandoned. 726 // The code is still of interest as it does add some compression to any dictionary 727 // that has no need for attributes. Implementations that does not read attributes should be 728 // able to read a dictionary with merged tails. 729 // Also, the following code does support frequencies, as in, it will only merges 730 // tails that share the same frequency. Though it would result in the above loss of 731 // performance while searching by address, it is still technically possible to merge 732 // tails that contain attributes, but this code does not take that into account - it does 733 // not compare attributes and will merge terminals with different attributes regardless. 734 public void mergeTails() { 735 MakedictLog.i("Do not merge tails"); 736 return; 737 738 // MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root)); 739 // MakedictLog.i("Number of groups : " + countCharGroups(root)); 740 // 741 // final HashMap<String, ArrayList<Node>> repository = 742 // new HashMap<String, ArrayList<Node>>(); 743 // mergeTailsInner(repository, root); 744 // 745 // MakedictLog.i("Number of different pseudohashes : " + repository.size()); 746 // int size = 0; 747 // for (ArrayList<Node> a : repository.values()) { 748 // size += a.size(); 749 // } 750 // MakedictLog.i("Number of nodes after merge : " + (1 + size)); 751 // MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 752 } 753 754 // The following methods are used by the deactivated mergeTails() 755 // private static boolean isEqual(Node a, Node b) { 756 // if (null == a && null == b) return true; 757 // if (null == a || null == b) return false; 758 // if (a.data.size() != b.data.size()) return false; 759 // final int size = a.data.size(); 760 // for (int i = size - 1; i >= 0; --i) { 761 // CharGroup aGroup = a.data.get(i); 762 // CharGroup bGroup = b.data.get(i); 763 // if (aGroup.frequency != bGroup.frequency) return false; 764 // if (aGroup.alternates == null && bGroup.alternates != null) return false; 765 // if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false; 766 // if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false; 767 // if (!isEqual(aGroup.children, bGroup.children)) return false; 768 // } 769 // return true; 770 // } 771 772 // static private HashMap<String, ArrayList<Node>> mergeTailsInner( 773 // final HashMap<String, ArrayList<Node>> map, final Node node) { 774 // final ArrayList<CharGroup> branches = node.data; 775 // final int nodeSize = branches.size(); 776 // for (int i = 0; i < nodeSize; ++i) { 777 // CharGroup group = branches.get(i); 778 // if (null != group.children) { 779 // String pseudoHash = getPseudoHash(group.children); 780 // ArrayList<Node> similarList = map.get(pseudoHash); 781 // if (null == similarList) { 782 // similarList = new ArrayList<Node>(); 783 // map.put(pseudoHash, similarList); 784 // } 785 // boolean merged = false; 786 // for (Node similar : similarList) { 787 // if (isEqual(group.children, similar)) { 788 // group.children = similar; 789 // merged = true; 790 // break; 791 // } 792 // } 793 // if (!merged) { 794 // similarList.add(group.children); 795 // } 796 // mergeTailsInner(map, group.children); 797 // } 798 // } 799 // return map; 800 // } 801 802 // private static String getPseudoHash(final Node node) { 803 // StringBuilder s = new StringBuilder(); 804 // for (CharGroup g : node.data) { 805 // s.append(g.frequency); 806 // for (int ch : g.chars) { 807 // s.append(Character.toChars(ch)); 808 // } 809 // } 810 // return s.toString(); 811 // } 812 813 /** 814 * Iterator to walk through a dictionary. 815 * 816 * This is purely for convenience. 817 */ 818 public static final class DictionaryIterator implements Iterator<Word> { 819 private static final class Position { 820 public Iterator<CharGroup> pos; 821 public int length; 822 public Position(ArrayList<CharGroup> groups) { 823 pos = groups.iterator(); 824 length = 0; 825 } 826 } 827 final StringBuilder mCurrentString; 828 final LinkedList<Position> mPositions; 829 830 public DictionaryIterator(ArrayList<CharGroup> root) { 831 mCurrentString = new StringBuilder(); 832 mPositions = new LinkedList<Position>(); 833 final Position rootPos = new Position(root); 834 mPositions.add(rootPos); 835 } 836 837 @Override 838 public boolean hasNext() { 839 for (Position p : mPositions) { 840 if (p.pos.hasNext()) { 841 return true; 842 } 843 } 844 return false; 845 } 846 847 @Override 848 public Word next() { 849 Position currentPos = mPositions.getLast(); 850 mCurrentString.setLength(currentPos.length); 851 852 do { 853 if (currentPos.pos.hasNext()) { 854 final CharGroup currentGroup = currentPos.pos.next(); 855 currentPos.length = mCurrentString.length(); 856 for (int i : currentGroup.mChars) { 857 mCurrentString.append(Character.toChars(i)); 858 } 859 if (null != currentGroup.mChildren) { 860 currentPos = new Position(currentGroup.mChildren.mData); 861 currentPos.length = mCurrentString.length(); 862 mPositions.addLast(currentPos); 863 } 864 if (currentGroup.mFrequency >= 0) { 865 return new Word(mCurrentString.toString(), currentGroup.mFrequency, 866 currentGroup.mShortcutTargets, currentGroup.mBigrams, 867 currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry); 868 } 869 } else { 870 mPositions.removeLast(); 871 currentPos = mPositions.getLast(); 872 mCurrentString.setLength(mPositions.getLast().length); 873 } 874 } while (true); 875 } 876 877 @Override 878 public void remove() { 879 throw new UnsupportedOperationException("Unsupported yet"); 880 } 881 882 } 883 884 /** 885 * Method to return an iterator. 886 * 887 * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 888 * and say : for (Word w : x) {} 889 */ 890 @Override 891 public Iterator<Word> iterator() { 892 return new DictionaryIterator(mRoot.mData); 893 } 894 } 895