1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import java.util.ArrayList; 20 import java.util.Arrays; 21 import java.util.Collections; 22 import java.util.Iterator; 23 import java.util.LinkedList; 24 import java.util.List; 25 26 /** 27 * A dictionary that can fusion heads and tails of words for more compression. 28 */ 29 public class FusionDictionary implements Iterable<Word> { 30 31 /** 32 * A node of the dictionary, containing several CharGroups. 33 * 34 * A node is but an ordered array of CharGroups, which essentially contain all the 35 * real information. 36 * This class also contains fields to cache size and address, to help with binary 37 * generation. 38 */ 39 public static class Node { 40 ArrayList<CharGroup> mData; 41 // To help with binary generation 42 int mCachedSize; 43 int mCachedAddress; 44 public Node() { 45 mData = new ArrayList<CharGroup>(); 46 mCachedSize = Integer.MIN_VALUE; 47 mCachedAddress = Integer.MIN_VALUE; 48 } 49 public Node(ArrayList<CharGroup> data) { 50 mData = data; 51 mCachedSize = Integer.MIN_VALUE; 52 mCachedAddress = Integer.MIN_VALUE; 53 } 54 } 55 56 /** 57 * A string with a frequency. 58 * 59 * This represents an "attribute", that is either a bigram or a shortcut. 60 */ 61 public static class WeightedString { 62 final String mWord; 63 final int mFrequency; 64 public WeightedString(String word, int frequency) { 65 mWord = word; 66 mFrequency = frequency; 67 } 68 } 69 70 /** 71 * A group of characters, with a frequency, shortcuts, bigrams, and children. 72 * 73 * This is the central class of the in-memory representation. A CharGroup is what can 74 * be seen as a traditional "trie node", except it can hold several characters at the 75 * same time. A CharGroup essentially represents one or several characters in the middle 76 * of the trie trie; as such, it can be a terminal, and it can have children. 77 * In this in-memory representation, whether the CharGroup is a terminal or not is represented 78 * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 79 * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 80 * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 81 */ 82 public static class CharGroup { 83 public static final int NOT_A_TERMINAL = -1; 84 final int mChars[]; 85 final ArrayList<WeightedString> mBigrams; 86 final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 87 Node mChildren; 88 // The two following members to help with binary generation 89 int mCachedSize; 90 int mCachedAddress; 91 92 public CharGroup(final int[] chars, 93 final ArrayList<WeightedString> bigrams, final int frequency) { 94 mChars = chars; 95 mFrequency = frequency; 96 mBigrams = bigrams; 97 mChildren = null; 98 } 99 100 public CharGroup(final int[] chars, 101 final ArrayList<WeightedString> bigrams, final int frequency, final Node children) { 102 mChars = chars; 103 mFrequency = frequency; 104 mBigrams = bigrams; 105 mChildren = children; 106 } 107 108 public void addChild(CharGroup n) { 109 if (null == mChildren) { 110 mChildren = new Node(); 111 } 112 mChildren.mData.add(n); 113 } 114 115 public boolean isTerminal() { 116 return NOT_A_TERMINAL != mFrequency; 117 } 118 119 public boolean hasSeveralChars() { 120 assert(mChars.length > 0); 121 return 1 < mChars.length; 122 } 123 } 124 125 /** 126 * Options global to the dictionary. 127 * 128 * There are no options at the moment, so this class is empty. 129 */ 130 public static class DictionaryOptions { 131 } 132 133 134 public final DictionaryOptions mOptions; 135 public final Node mRoot; 136 137 public FusionDictionary() { 138 mOptions = new DictionaryOptions(); 139 mRoot = new Node(); 140 } 141 142 public FusionDictionary(final Node root, final DictionaryOptions options) { 143 mRoot = root; 144 mOptions = options; 145 } 146 147 /** 148 * Helper method to convert a String to an int array. 149 */ 150 static private int[] getCodePoints(String word) { 151 final int wordLength = word.length(); 152 int[] array = new int[word.codePointCount(0, wordLength)]; 153 for (int i = 0; i < wordLength; ++i) { 154 array[i] = word.codePointAt(i); 155 } 156 return array; 157 } 158 159 /** 160 * Helper method to add a word as a string. 161 * 162 * This method adds a word to the dictionary with the given frequency. Optional 163 * lists of bigrams and shortcuts can be passed here. For each word inside, 164 * they will be added to the dictionary as necessary. 165 * 166 * @param word the word to add. 167 * @param frequency the frequency of the word, in the range [0..255]. 168 * @param bigrams a list of bigrams, or null. 169 */ 170 public void add(String word, int frequency, ArrayList<WeightedString> bigrams) { 171 if (null != bigrams) { 172 for (WeightedString bigram : bigrams) { 173 final CharGroup t = findWordInTree(mRoot, bigram.mWord); 174 if (null == t) { 175 add(getCodePoints(bigram.mWord), 0, null); 176 } 177 } 178 } 179 add(getCodePoints(word), frequency, bigrams); 180 } 181 182 /** 183 * Sanity check for a node. 184 * 185 * This method checks that all CharGroups in a node are ordered as expected. 186 * If they are, nothing happens. If they aren't, an exception is thrown. 187 */ 188 private void checkStack(Node node) { 189 ArrayList<CharGroup> stack = node.mData; 190 int lastValue = -1; 191 for (int i = 0; i < stack.size(); ++i) { 192 int currentValue = stack.get(i).mChars[0]; 193 if (currentValue <= lastValue) 194 throw new RuntimeException("Invalid stack"); 195 else 196 lastValue = currentValue; 197 } 198 } 199 200 /** 201 * Add a word to this dictionary. 202 * 203 * The bigrams, if any, have to be in the dictionary already. If they aren't, 204 * an exception is thrown. 205 * 206 * @param word the word, as an int array. 207 * @param frequency the frequency of the word, in the range [0..255]. 208 * @param bigrams an optional list of bigrams for this word (null if none). 209 */ 210 private void add(int[] word, int frequency, ArrayList<WeightedString> bigrams) { 211 assert(frequency >= 0 && frequency <= 255); 212 Node currentNode = mRoot; 213 int charIndex = 0; 214 215 CharGroup currentGroup = null; 216 int differentCharIndex = 0; // Set by the loop to the index of the char that differs 217 int nodeIndex = findIndexOfChar(mRoot, word[charIndex]); 218 while (CHARACTER_NOT_FOUND != nodeIndex) { 219 currentGroup = currentNode.mData.get(nodeIndex); 220 differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex); 221 if (ARRAYS_ARE_EQUAL != differentCharIndex 222 && differentCharIndex < currentGroup.mChars.length) break; 223 if (null == currentGroup.mChildren) break; 224 charIndex += currentGroup.mChars.length; 225 if (charIndex >= word.length) break; 226 currentNode = currentGroup.mChildren; 227 nodeIndex = findIndexOfChar(currentNode, word[charIndex]); 228 } 229 230 if (-1 == nodeIndex) { 231 // No node at this point to accept the word. Create one. 232 final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); 233 final CharGroup newGroup = new CharGroup( 234 Arrays.copyOfRange(word, charIndex, word.length), bigrams, frequency); 235 currentNode.mData.add(insertionIndex, newGroup); 236 checkStack(currentNode); 237 } else { 238 // There is a word with a common prefix. 239 if (differentCharIndex == currentGroup.mChars.length) { 240 if (charIndex + differentCharIndex >= word.length) { 241 // The new word is a prefix of an existing word, but the node on which it 242 // should end already exists as is. 243 if (currentGroup.mFrequency > 0) { 244 throw new RuntimeException("Such a word already exists in the dictionary : " 245 + new String(word, 0, word.length)); 246 } else { 247 final CharGroup newNode = new CharGroup(currentGroup.mChars, 248 bigrams, frequency, currentGroup.mChildren); 249 currentNode.mData.set(nodeIndex, newNode); 250 checkStack(currentNode); 251 } 252 } else { 253 // The new word matches the full old word and extends past it. 254 // We only have to create a new node and add it to the end of this. 255 final CharGroup newNode = new CharGroup( 256 Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 257 bigrams, frequency); 258 currentGroup.mChildren = new Node(); 259 currentGroup.mChildren.mData.add(newNode); 260 } 261 } else { 262 if (0 == differentCharIndex) { 263 // Exact same word. Check the frequency is 0 or -1, and update. 264 if (0 != frequency) { 265 if (0 < currentGroup.mFrequency) { 266 throw new RuntimeException("This word already exists with frequency " 267 + currentGroup.mFrequency + " : " 268 + new String(word, 0, word.length)); 269 } 270 final CharGroup newGroup = new CharGroup(word, 271 currentGroup.mBigrams, frequency, currentGroup.mChildren); 272 currentNode.mData.set(nodeIndex, newGroup); 273 } 274 } else { 275 // Partial prefix match only. We have to replace the current node with a node 276 // containing the current prefix and create two new ones for the tails. 277 Node newChildren = new Node(); 278 final CharGroup newOldWord = new CharGroup( 279 Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, 280 currentGroup.mChars.length), 281 currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren); 282 newChildren.mData.add(newOldWord); 283 284 final CharGroup newParent; 285 if (charIndex + differentCharIndex >= word.length) { 286 newParent = new CharGroup( 287 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 288 bigrams, frequency, newChildren); 289 } else { 290 newParent = new CharGroup( 291 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 292 null, -1, newChildren); 293 final CharGroup newWord = new CharGroup( 294 Arrays.copyOfRange(word, charIndex + differentCharIndex, 295 word.length), bigrams, frequency); 296 final int addIndex = word[charIndex + differentCharIndex] 297 > currentGroup.mChars[differentCharIndex] ? 1 : 0; 298 newChildren.mData.add(addIndex, newWord); 299 } 300 currentNode.mData.set(nodeIndex, newParent); 301 } 302 checkStack(currentNode); 303 } 304 } 305 } 306 307 /** 308 * Custom comparison of two int arrays taken to contain character codes. 309 * 310 * This method compares the two arrays passed as an argument in a lexicographic way, 311 * with an offset in the dst string. 312 * This method does NOT test for the first character. It is taken to be equal. 313 * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 314 * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 315 * strings are equal. This works BECAUSE we don't look at the first character. 316 * 317 * @param src the left-hand side string of the comparison. 318 * @param dst the right-hand side string of the comparison. 319 * @param dstOffset the offset in the right-hand side string. 320 * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 321 */ 322 private static int ARRAYS_ARE_EQUAL = 0; 323 private static int compareArrays(final int[] src, final int[] dst, int dstOffset) { 324 // We do NOT test the first char, because we come from a method that already 325 // tested it. 326 for (int i = 1; i < src.length; ++i) { 327 if (dstOffset + i >= dst.length) return i; 328 if (src[i] != dst[dstOffset + i]) return i; 329 } 330 if (dst.length > src.length) return src.length; 331 return ARRAYS_ARE_EQUAL; 332 } 333 334 /** 335 * Helper class that compares and sorts two chargroups according to their 336 * first element only. I repeat: ONLY the first element is considered, the rest 337 * is ignored. 338 * This comparator imposes orderings that are inconsistent with equals. 339 */ 340 static private class CharGroupComparator implements java.util.Comparator { 341 public int compare(Object o1, Object o2) { 342 final CharGroup c1 = (CharGroup)o1; 343 final CharGroup c2 = (CharGroup)o2; 344 if (c1.mChars[0] == c2.mChars[0]) return 0; 345 return c1.mChars[0] < c2.mChars[0] ? -1 : 1; 346 } 347 public boolean equals(Object o) { 348 return o instanceof CharGroupComparator; 349 } 350 } 351 final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator(); 352 353 /** 354 * Finds the insertion index of a character within a node. 355 */ 356 private static int findInsertionIndex(final Node node, int character) { 357 final List data = node.mData; 358 final CharGroup reference = new CharGroup(new int[] { character }, null, 0); 359 int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); 360 return result >= 0 ? result : -result - 1; 361 } 362 363 /** 364 * Find the index of a char in a node, if it exists. 365 * 366 * @param node the node to search in. 367 * @param character the character to search for. 368 * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else. 369 */ 370 private static int CHARACTER_NOT_FOUND = -1; 371 private static int findIndexOfChar(final Node node, int character) { 372 final int insertionIndex = findInsertionIndex(node, character); 373 if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND; 374 return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex 375 : CHARACTER_NOT_FOUND; 376 } 377 378 /** 379 * Helper method to find a word in a given branch. 380 */ 381 public static CharGroup findWordInTree(Node node, final String s) { 382 int index = 0; 383 final StringBuilder checker = new StringBuilder(); 384 385 CharGroup currentGroup; 386 do { 387 int indexOfGroup = findIndexOfChar(node, s.codePointAt(index)); 388 if (CHARACTER_NOT_FOUND == indexOfGroup) return null; 389 currentGroup = node.mData.get(indexOfGroup); 390 checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length)); 391 index += currentGroup.mChars.length; 392 if (index < s.length()) { 393 node = currentGroup.mChildren; 394 } 395 } while (null != node && index < s.length()); 396 397 if (!s.equals(checker.toString())) return null; 398 return currentGroup; 399 } 400 401 /** 402 * Recursively count the number of character groups in a given branch of the trie. 403 * 404 * @param node the parent node. 405 * @return the number of char groups in all the branch under this node. 406 */ 407 public static int countCharGroups(final Node node) { 408 final int nodeSize = node.mData.size(); 409 int size = nodeSize; 410 for (int i = nodeSize - 1; i >= 0; --i) { 411 CharGroup group = node.mData.get(i); 412 if (null != group.mChildren) 413 size += countCharGroups(group.mChildren); 414 } 415 return size; 416 } 417 418 /** 419 * Recursively count the number of nodes in a given branch of the trie. 420 * 421 * @param node the node to count. 422 * @result the number of nodes in this branch. 423 */ 424 public static int countNodes(final Node node) { 425 int size = 1; 426 for (int i = node.mData.size() - 1; i >= 0; --i) { 427 CharGroup group = node.mData.get(i); 428 if (null != group.mChildren) 429 size += countNodes(group.mChildren); 430 } 431 return size; 432 } 433 434 // Historically, the tails of the words were going to be merged to save space. 435 // However, that would prevent the code to search for a specific address in log(n) 436 // time so this was abandoned. 437 // The code is still of interest as it does add some compression to any dictionary 438 // that has no need for attributes. Implementations that does not read attributes should be 439 // able to read a dictionary with merged tails. 440 // Also, the following code does support frequencies, as in, it will only merges 441 // tails that share the same frequency. Though it would result in the above loss of 442 // performance while searching by address, it is still technically possible to merge 443 // tails that contain attributes, but this code does not take that into account - it does 444 // not compare attributes and will merge terminals with different attributes regardless. 445 public void mergeTails() { 446 MakedictLog.i("Do not merge tails"); 447 return; 448 449 // MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root)); 450 // MakedictLog.i("Number of groups : " + countCharGroups(root)); 451 // 452 // final HashMap<String, ArrayList<Node>> repository = 453 // new HashMap<String, ArrayList<Node>>(); 454 // mergeTailsInner(repository, root); 455 // 456 // MakedictLog.i("Number of different pseudohashes : " + repository.size()); 457 // int size = 0; 458 // for (ArrayList<Node> a : repository.values()) { 459 // size += a.size(); 460 // } 461 // MakedictLog.i("Number of nodes after merge : " + (1 + size)); 462 // MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 463 } 464 465 // The following methods are used by the deactivated mergeTails() 466 // private static boolean isEqual(Node a, Node b) { 467 // if (null == a && null == b) return true; 468 // if (null == a || null == b) return false; 469 // if (a.data.size() != b.data.size()) return false; 470 // final int size = a.data.size(); 471 // for (int i = size - 1; i >= 0; --i) { 472 // CharGroup aGroup = a.data.get(i); 473 // CharGroup bGroup = b.data.get(i); 474 // if (aGroup.frequency != bGroup.frequency) return false; 475 // if (aGroup.alternates == null && bGroup.alternates != null) return false; 476 // if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false; 477 // if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false; 478 // if (!isEqual(aGroup.children, bGroup.children)) return false; 479 // } 480 // return true; 481 // } 482 483 // static private HashMap<String, ArrayList<Node>> mergeTailsInner( 484 // final HashMap<String, ArrayList<Node>> map, final Node node) { 485 // final ArrayList<CharGroup> branches = node.data; 486 // final int nodeSize = branches.size(); 487 // for (int i = 0; i < nodeSize; ++i) { 488 // CharGroup group = branches.get(i); 489 // if (null != group.children) { 490 // String pseudoHash = getPseudoHash(group.children); 491 // ArrayList<Node> similarList = map.get(pseudoHash); 492 // if (null == similarList) { 493 // similarList = new ArrayList<Node>(); 494 // map.put(pseudoHash, similarList); 495 // } 496 // boolean merged = false; 497 // for (Node similar : similarList) { 498 // if (isEqual(group.children, similar)) { 499 // group.children = similar; 500 // merged = true; 501 // break; 502 // } 503 // } 504 // if (!merged) { 505 // similarList.add(group.children); 506 // } 507 // mergeTailsInner(map, group.children); 508 // } 509 // } 510 // return map; 511 // } 512 513 // private static String getPseudoHash(final Node node) { 514 // StringBuilder s = new StringBuilder(); 515 // for (CharGroup g : node.data) { 516 // s.append(g.frequency); 517 // for (int ch : g.chars){ 518 // s.append(Character.toChars(ch)); 519 // } 520 // } 521 // return s.toString(); 522 // } 523 524 /** 525 * Iterator to walk through a dictionary. 526 * 527 * This is purely for convenience. 528 */ 529 public static class DictionaryIterator implements Iterator<Word> { 530 531 private static class Position { 532 public Iterator<CharGroup> pos; 533 public int length; 534 public Position(ArrayList<CharGroup> groups) { 535 pos = groups.iterator(); 536 length = 0; 537 } 538 } 539 final StringBuilder mCurrentString; 540 final LinkedList<Position> mPositions; 541 542 public DictionaryIterator(ArrayList<CharGroup> root) { 543 mCurrentString = new StringBuilder(); 544 mPositions = new LinkedList<Position>(); 545 final Position rootPos = new Position(root); 546 mPositions.add(rootPos); 547 } 548 549 @Override 550 public boolean hasNext() { 551 for (Position p : mPositions) { 552 if (p.pos.hasNext()) { 553 return true; 554 } 555 } 556 return false; 557 } 558 559 @Override 560 public Word next() { 561 Position currentPos = mPositions.getLast(); 562 mCurrentString.setLength(mCurrentString.length() - currentPos.length); 563 564 do { 565 if (currentPos.pos.hasNext()) { 566 final CharGroup currentGroup = currentPos.pos.next(); 567 currentPos.length = currentGroup.mChars.length; 568 for (int i : currentGroup.mChars) 569 mCurrentString.append(Character.toChars(i)); 570 if (null != currentGroup.mChildren) { 571 currentPos = new Position(currentGroup.mChildren.mData); 572 mPositions.addLast(currentPos); 573 } 574 if (currentGroup.mFrequency >= 0) 575 return new Word(mCurrentString.toString(), currentGroup.mFrequency, 576 currentGroup.mBigrams); 577 } else { 578 mPositions.removeLast(); 579 currentPos = mPositions.getLast(); 580 mCurrentString.setLength(mCurrentString.length() - mPositions.getLast().length); 581 } 582 } while(true); 583 } 584 585 @Override 586 public void remove() { 587 throw new UnsupportedOperationException("Unsupported yet"); 588 } 589 590 } 591 592 /** 593 * Method to return an iterator. 594 * 595 * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 596 * and say : for (Word w : x) {} 597 */ 598 @Override 599 public Iterator<Word> iterator() { 600 return new DictionaryIterator(mRoot.mData); 601 } 602 } 603