Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.Constants;
     21 
     22 import java.util.ArrayList;
     23 import java.util.Arrays;
     24 import java.util.Collections;
     25 import java.util.Date;
     26 import java.util.HashMap;
     27 import java.util.Iterator;
     28 import java.util.LinkedList;
     29 
     30 /**
     31  * A dictionary that can fusion heads and tails of words for more compression.
     32  */
     33 @UsedForTesting
     34 public final class FusionDictionary implements Iterable<Word> {
     35     private static final boolean DBG = MakedictLog.DBG;
     36 
     37     /**
     38      * A node of the dictionary, containing several CharGroups.
     39      *
     40      * A node is but an ordered array of CharGroups, which essentially contain all the
     41      * real information.
     42      * This class also contains fields to cache size and address, to help with binary
     43      * generation.
     44      */
     45     public static final class Node {
     46         ArrayList<CharGroup> mData;
     47         // To help with binary generation
     48         int mCachedSize = Integer.MIN_VALUE;
     49         int mCachedAddress = Integer.MIN_VALUE;
     50         int mCachedParentAddress = 0;
     51 
     52         public Node() {
     53             mData = new ArrayList<CharGroup>();
     54         }
     55         public Node(ArrayList<CharGroup> data) {
     56             mData = data;
     57         }
     58     }
     59 
     60     /**
     61      * A string with a frequency.
     62      *
     63      * This represents an "attribute", that is either a bigram or a shortcut.
     64      */
     65     public static final class WeightedString {
     66         public final String mWord;
     67         public int mFrequency;
     68         public WeightedString(String word, int frequency) {
     69             mWord = word;
     70             mFrequency = frequency;
     71         }
     72 
     73         @Override
     74         public int hashCode() {
     75             return Arrays.hashCode(new Object[] { mWord, mFrequency });
     76         }
     77 
     78         @Override
     79         public boolean equals(Object o) {
     80             if (o == this) return true;
     81             if (!(o instanceof WeightedString)) return false;
     82             WeightedString w = (WeightedString)o;
     83             return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
     84         }
     85     }
     86 
     87     /**
     88      * A group of characters, with a frequency, shortcut targets, bigrams, and children.
     89      *
     90      * This is the central class of the in-memory representation. A CharGroup is what can
     91      * be seen as a traditional "trie node", except it can hold several characters at the
     92      * same time. A CharGroup essentially represents one or several characters in the middle
     93      * of the trie trie; as such, it can be a terminal, and it can have children.
     94      * In this in-memory representation, whether the CharGroup is a terminal or not is represented
     95      * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
     96      * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
     97      * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
     98      */
     99     public static final class CharGroup {
    100         public static final int NOT_A_TERMINAL = -1;
    101         final int mChars[];
    102         ArrayList<WeightedString> mShortcutTargets;
    103         ArrayList<WeightedString> mBigrams;
    104         int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
    105         Node mChildren;
    106         boolean mIsNotAWord; // Only a shortcut
    107         boolean mIsBlacklistEntry;
    108         // The two following members to help with binary generation
    109         int mCachedSize;
    110         int mCachedAddress;
    111 
    112         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    113                 final ArrayList<WeightedString> bigrams, final int frequency,
    114                 final boolean isNotAWord, final boolean isBlacklistEntry) {
    115             mChars = chars;
    116             mFrequency = frequency;
    117             mShortcutTargets = shortcutTargets;
    118             mBigrams = bigrams;
    119             mChildren = null;
    120             mIsNotAWord = isNotAWord;
    121             mIsBlacklistEntry = isBlacklistEntry;
    122         }
    123 
    124         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    125                 final ArrayList<WeightedString> bigrams, final int frequency,
    126                 final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) {
    127             mChars = chars;
    128             mFrequency = frequency;
    129             mShortcutTargets = shortcutTargets;
    130             mBigrams = bigrams;
    131             mChildren = children;
    132             mIsNotAWord = isNotAWord;
    133             mIsBlacklistEntry = isBlacklistEntry;
    134         }
    135 
    136         public void addChild(CharGroup n) {
    137             if (null == mChildren) {
    138                 mChildren = new Node();
    139             }
    140             mChildren.mData.add(n);
    141         }
    142 
    143         public boolean isTerminal() {
    144             return NOT_A_TERMINAL != mFrequency;
    145         }
    146 
    147         public int getFrequency() {
    148             return mFrequency;
    149         }
    150 
    151         public boolean getIsNotAWord() {
    152             return mIsNotAWord;
    153         }
    154 
    155         public boolean getIsBlacklistEntry() {
    156             return mIsBlacklistEntry;
    157         }
    158 
    159         public ArrayList<WeightedString> getShortcutTargets() {
    160             // We don't want write permission to escape outside the package, so we return a copy
    161             if (null == mShortcutTargets) return null;
    162             final ArrayList<WeightedString> copyOfShortcutTargets =
    163                     new ArrayList<WeightedString>(mShortcutTargets);
    164             return copyOfShortcutTargets;
    165         }
    166 
    167         public ArrayList<WeightedString> getBigrams() {
    168             // We don't want write permission to escape outside the package, so we return a copy
    169             if (null == mBigrams) return null;
    170             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams);
    171             return copyOfBigrams;
    172         }
    173 
    174         public boolean hasSeveralChars() {
    175             assert(mChars.length > 0);
    176             return 1 < mChars.length;
    177         }
    178 
    179         /**
    180          * Adds a word to the bigram list. Updates the frequency if the word already
    181          * exists.
    182          */
    183         public void addBigram(final String word, final int frequency) {
    184             if (mBigrams == null) {
    185                 mBigrams = new ArrayList<WeightedString>();
    186             }
    187             WeightedString bigram = getBigram(word);
    188             if (bigram != null) {
    189                 bigram.mFrequency = frequency;
    190             } else {
    191                 bigram = new WeightedString(word, frequency);
    192                 mBigrams.add(bigram);
    193             }
    194         }
    195 
    196         /**
    197          * Gets the shortcut target for the given word. Returns null if the word is not in the
    198          * shortcut list.
    199          */
    200         public WeightedString getShortcut(final String word) {
    201             // TODO: Don't do a linear search
    202             if (mShortcutTargets != null) {
    203                 final int size = mShortcutTargets.size();
    204                 for (int i = 0; i < size; ++i) {
    205                     WeightedString shortcut = mShortcutTargets.get(i);
    206                     if (shortcut.mWord.equals(word)) {
    207                         return shortcut;
    208                     }
    209                 }
    210             }
    211             return null;
    212         }
    213 
    214         /**
    215          * Gets the bigram for the given word.
    216          * Returns null if the word is not in the bigrams list.
    217          */
    218         public WeightedString getBigram(final String word) {
    219             // TODO: Don't do a linear search
    220             if (mBigrams != null) {
    221                 final int size = mBigrams.size();
    222                 for (int i = 0; i < size; ++i) {
    223                     WeightedString bigram = mBigrams.get(i);
    224                     if (bigram.mWord.equals(word)) {
    225                         return bigram;
    226                     }
    227                 }
    228             }
    229             return null;
    230         }
    231 
    232         /**
    233          * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to
    234          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
    235          * updated if they are higher than the existing ones.
    236          */
    237         public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
    238                 final ArrayList<WeightedString> bigrams,
    239                 final boolean isNotAWord, final boolean isBlacklistEntry) {
    240             if (frequency > mFrequency) {
    241                 mFrequency = frequency;
    242             }
    243             if (shortcutTargets != null) {
    244                 if (mShortcutTargets == null) {
    245                     mShortcutTargets = shortcutTargets;
    246                 } else {
    247                     final int size = shortcutTargets.size();
    248                     for (int i = 0; i < size; ++i) {
    249                         final WeightedString shortcut = shortcutTargets.get(i);
    250                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
    251                         if (existingShortcut == null) {
    252                             mShortcutTargets.add(shortcut);
    253                         } else if (existingShortcut.mFrequency < shortcut.mFrequency) {
    254                             existingShortcut.mFrequency = shortcut.mFrequency;
    255                         }
    256                     }
    257                 }
    258             }
    259             if (bigrams != null) {
    260                 if (mBigrams == null) {
    261                     mBigrams = bigrams;
    262                 } else {
    263                     final int size = bigrams.size();
    264                     for (int i = 0; i < size; ++i) {
    265                         final WeightedString bigram = bigrams.get(i);
    266                         final WeightedString existingBigram = getBigram(bigram.mWord);
    267                         if (existingBigram == null) {
    268                             mBigrams.add(bigram);
    269                         } else if (existingBigram.mFrequency < bigram.mFrequency) {
    270                             existingBigram.mFrequency = bigram.mFrequency;
    271                         }
    272                     }
    273                 }
    274             }
    275             mIsNotAWord = isNotAWord;
    276             mIsBlacklistEntry = isBlacklistEntry;
    277         }
    278     }
    279 
    280     /**
    281      * Options global to the dictionary.
    282      */
    283     public static final class DictionaryOptions {
    284         public final boolean mGermanUmlautProcessing;
    285         public final boolean mFrenchLigatureProcessing;
    286         public final HashMap<String, String> mAttributes;
    287         public DictionaryOptions(final HashMap<String, String> attributes,
    288                 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
    289             mAttributes = attributes;
    290             mGermanUmlautProcessing = germanUmlautProcessing;
    291             mFrenchLigatureProcessing = frenchLigatureProcessing;
    292         }
    293         @Override
    294         public String toString() { // Convenience method
    295             return toString(0, false);
    296         }
    297         public String toString(final int indentCount, final boolean plumbing) {
    298             final StringBuilder indent = new StringBuilder();
    299             if (plumbing) {
    300                 indent.append("H:");
    301             } else {
    302                 for (int i = 0; i < indentCount; ++i) {
    303                     indent.append(" ");
    304                 }
    305             }
    306             final StringBuilder s = new StringBuilder();
    307             for (final String optionKey : mAttributes.keySet()) {
    308                 s.append(indent);
    309                 s.append(optionKey);
    310                 s.append(" = ");
    311                 if ("date".equals(optionKey) && !plumbing) {
    312                     // Date needs a number of milliseconds, but the dictionary contains seconds
    313                     s.append(new Date(
    314                             1000 * Long.parseLong(mAttributes.get(optionKey))).toString());
    315                 } else {
    316                     s.append(mAttributes.get(optionKey));
    317                 }
    318                 s.append("\n");
    319             }
    320             if (mGermanUmlautProcessing) {
    321                 s.append(indent);
    322                 s.append("Needs German umlaut processing\n");
    323             }
    324             if (mFrenchLigatureProcessing) {
    325                 s.append(indent);
    326                 s.append("Needs French ligature processing\n");
    327             }
    328             return s.toString();
    329         }
    330     }
    331 
    332     public final DictionaryOptions mOptions;
    333     public final Node mRoot;
    334 
    335     public FusionDictionary(final Node root, final DictionaryOptions options) {
    336         mRoot = root;
    337         mOptions = options;
    338     }
    339 
    340     public void addOptionAttribute(final String key, final String value) {
    341         mOptions.mAttributes.put(key, value);
    342     }
    343 
    344     /**
    345      * Helper method to convert a String to an int array.
    346      */
    347     static int[] getCodePoints(final String word) {
    348         // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray,
    349         // which is not visible from the makedict package. Factor this code.
    350         final char[] characters = word.toCharArray();
    351         final int length = characters.length;
    352         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
    353         int codePoint = Character.codePointAt(characters, 0);
    354         int dsti = 0;
    355         for (int srci = Character.charCount(codePoint);
    356                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
    357             codePoints[dsti] = codePoint;
    358             codePoint = Character.codePointAt(characters, srci);
    359         }
    360         codePoints[dsti] = codePoint;
    361         return codePoints;
    362     }
    363 
    364     /**
    365      * Helper method to add a word as a string.
    366      *
    367      * This method adds a word to the dictionary with the given frequency. Optional
    368      * lists of bigrams and shortcuts can be passed here. For each word inside,
    369      * they will be added to the dictionary as necessary.
    370      *
    371      * @param word the word to add.
    372      * @param frequency the frequency of the word, in the range [0..255].
    373      * @param shortcutTargets a list of shortcut targets for this word, or null.
    374      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
    375      */
    376     public void add(final String word, final int frequency,
    377             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
    378         add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
    379                 false /* isBlacklistEntry */);
    380     }
    381 
    382     /**
    383      * Helper method to add a blacklist entry as a string.
    384      *
    385      * @param word the word to add as a blacklist entry.
    386      * @param shortcutTargets a list of shortcut targets for this word, or null.
    387      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
    388      */
    389     public void addBlacklistEntry(final String word,
    390             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
    391         add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
    392     }
    393 
    394     /**
    395      * Sanity check for a node.
    396      *
    397      * This method checks that all CharGroups in a node are ordered as expected.
    398      * If they are, nothing happens. If they aren't, an exception is thrown.
    399      */
    400     private void checkStack(Node node) {
    401         ArrayList<CharGroup> stack = node.mData;
    402         int lastValue = -1;
    403         for (int i = 0; i < stack.size(); ++i) {
    404             int currentValue = stack.get(i).mChars[0];
    405             if (currentValue <= lastValue)
    406                 throw new RuntimeException("Invalid stack");
    407             else
    408                 lastValue = currentValue;
    409         }
    410     }
    411 
    412     /**
    413      * Helper method to add a new bigram to the dictionary.
    414      *
    415      * @param word1 the previous word of the context
    416      * @param word2 the next word of the context
    417      * @param frequency the bigram frequency
    418      */
    419     public void setBigram(final String word1, final String word2, final int frequency) {
    420         CharGroup charGroup = findWordInTree(mRoot, word1);
    421         if (charGroup != null) {
    422             final CharGroup charGroup2 = findWordInTree(mRoot, word2);
    423             if (charGroup2 == null) {
    424                 add(getCodePoints(word2), 0, null, false /* isNotAWord */,
    425                         false /* isBlacklistEntry */);
    426                 // The chargroup for the first word may have moved by the above insertion,
    427                 // if word1 and word2 share a common stem that happens not to have been
    428                 // a cutting point until now. In this case, we need to refresh charGroup.
    429                 charGroup = findWordInTree(mRoot, word1);
    430             }
    431             charGroup.addBigram(word2, frequency);
    432         } else {
    433             throw new RuntimeException("First word of bigram not found");
    434         }
    435     }
    436 
    437     /**
    438      * Add a word to this dictionary.
    439      *
    440      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
    441      * an exception is thrown.
    442      *
    443      * @param word the word, as an int array.
    444      * @param frequency the frequency of the word, in the range [0..255].
    445      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
    446      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
    447      * @param isBlacklistEntry true if this is a blacklisted word, false otherwise
    448      */
    449     private void add(final int[] word, final int frequency,
    450             final ArrayList<WeightedString> shortcutTargets,
    451             final boolean isNotAWord, final boolean isBlacklistEntry) {
    452         assert(frequency >= 0 && frequency <= 255);
    453         if (word.length >= Constants.Dictionary.MAX_WORD_LENGTH) {
    454             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
    455             return;
    456         }
    457 
    458         Node currentNode = mRoot;
    459         int charIndex = 0;
    460 
    461         CharGroup currentGroup = null;
    462         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
    463         int nodeIndex = findIndexOfChar(mRoot, word[charIndex]);
    464         while (CHARACTER_NOT_FOUND != nodeIndex) {
    465             currentGroup = currentNode.mData.get(nodeIndex);
    466             differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex);
    467             if (ARRAYS_ARE_EQUAL != differentCharIndex
    468                     && differentCharIndex < currentGroup.mChars.length) break;
    469             if (null == currentGroup.mChildren) break;
    470             charIndex += currentGroup.mChars.length;
    471             if (charIndex >= word.length) break;
    472             currentNode = currentGroup.mChildren;
    473             nodeIndex = findIndexOfChar(currentNode, word[charIndex]);
    474         }
    475 
    476         if (-1 == nodeIndex) {
    477             // No node at this point to accept the word. Create one.
    478             final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
    479             final CharGroup newGroup = new CharGroup(
    480                     Arrays.copyOfRange(word, charIndex, word.length),
    481                     shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
    482             currentNode.mData.add(insertionIndex, newGroup);
    483             if (DBG) checkStack(currentNode);
    484         } else {
    485             // There is a word with a common prefix.
    486             if (differentCharIndex == currentGroup.mChars.length) {
    487                 if (charIndex + differentCharIndex >= word.length) {
    488                     // The new word is a prefix of an existing word, but the node on which it
    489                     // should end already exists as is. Since the old CharNode was not a terminal,
    490                     // make it one by filling in its frequency and other attributes
    491                     currentGroup.update(frequency, shortcutTargets, null, isNotAWord,
    492                             isBlacklistEntry);
    493                 } else {
    494                     // The new word matches the full old word and extends past it.
    495                     // We only have to create a new node and add it to the end of this.
    496                     final CharGroup newNode = new CharGroup(
    497                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
    498                                     shortcutTargets, null /* bigrams */, frequency, isNotAWord,
    499                                     isBlacklistEntry);
    500                     currentGroup.mChildren = new Node();
    501                     currentGroup.mChildren.mData.add(newNode);
    502                 }
    503             } else {
    504                 if (0 == differentCharIndex) {
    505                     // Exact same word. Update the frequency if higher. This will also add the
    506                     // new shortcuts to the existing shortcut list if it already exists.
    507                     currentGroup.update(frequency, shortcutTargets, null,
    508                             currentGroup.mIsNotAWord && isNotAWord,
    509                             currentGroup.mIsBlacklistEntry || isBlacklistEntry);
    510                 } else {
    511                     // Partial prefix match only. We have to replace the current node with a node
    512                     // containing the current prefix and create two new ones for the tails.
    513                     Node newChildren = new Node();
    514                     final CharGroup newOldWord = new CharGroup(
    515                             Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
    516                                     currentGroup.mChars.length), currentGroup.mShortcutTargets,
    517                             currentGroup.mBigrams, currentGroup.mFrequency,
    518                             currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry,
    519                             currentGroup.mChildren);
    520                     newChildren.mData.add(newOldWord);
    521 
    522                     final CharGroup newParent;
    523                     if (charIndex + differentCharIndex >= word.length) {
    524                         newParent = new CharGroup(
    525                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
    526                                 shortcutTargets, null /* bigrams */, frequency,
    527                                 isNotAWord, isBlacklistEntry, newChildren);
    528                     } else {
    529                         newParent = new CharGroup(
    530                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
    531                                 null /* shortcutTargets */, null /* bigrams */, -1,
    532                                 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
    533                         final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
    534                                 charIndex + differentCharIndex, word.length),
    535                                 shortcutTargets, null /* bigrams */, frequency,
    536                                 isNotAWord, isBlacklistEntry);
    537                         final int addIndex = word[charIndex + differentCharIndex]
    538                                 > currentGroup.mChars[differentCharIndex] ? 1 : 0;
    539                         newChildren.mData.add(addIndex, newWord);
    540                     }
    541                     currentNode.mData.set(nodeIndex, newParent);
    542                 }
    543                 if (DBG) checkStack(currentNode);
    544             }
    545         }
    546     }
    547 
    548     private static int ARRAYS_ARE_EQUAL = 0;
    549 
    550     /**
    551      * Custom comparison of two int arrays taken to contain character codes.
    552      *
    553      * This method compares the two arrays passed as an argument in a lexicographic way,
    554      * with an offset in the dst string.
    555      * This method does NOT test for the first character. It is taken to be equal.
    556      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
    557      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
    558      * strings are equal. This works BECAUSE we don't look at the first character.
    559      *
    560      * @param src the left-hand side string of the comparison.
    561      * @param dst the right-hand side string of the comparison.
    562      * @param dstOffset the offset in the right-hand side string.
    563      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
    564      */
    565     private static int compareArrays(final int[] src, final int[] dst, int dstOffset) {
    566         // We do NOT test the first char, because we come from a method that already
    567         // tested it.
    568         for (int i = 1; i < src.length; ++i) {
    569             if (dstOffset + i >= dst.length) return i;
    570             if (src[i] != dst[dstOffset + i]) return i;
    571         }
    572         if (dst.length > src.length) return src.length;
    573         return ARRAYS_ARE_EQUAL;
    574     }
    575 
    576     /**
    577      * Helper class that compares and sorts two chargroups according to their
    578      * first element only. I repeat: ONLY the first element is considered, the rest
    579      * is ignored.
    580      * This comparator imposes orderings that are inconsistent with equals.
    581      */
    582     static private final class CharGroupComparator implements java.util.Comparator<CharGroup> {
    583         @Override
    584         public int compare(CharGroup c1, CharGroup c2) {
    585             if (c1.mChars[0] == c2.mChars[0]) return 0;
    586             return c1.mChars[0] < c2.mChars[0] ? -1 : 1;
    587         }
    588     }
    589     final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator();
    590 
    591     /**
    592      * Finds the insertion index of a character within a node.
    593      */
    594     private static int findInsertionIndex(final Node node, int character) {
    595         final ArrayList<CharGroup> data = node.mData;
    596         final CharGroup reference = new CharGroup(new int[] { character },
    597                 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
    598                 false /* isBlacklistEntry */);
    599         int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
    600         return result >= 0 ? result : -result - 1;
    601     }
    602 
    603     private static int CHARACTER_NOT_FOUND = -1;
    604 
    605     /**
    606      * Find the index of a char in a node, if it exists.
    607      *
    608      * @param node the node to search in.
    609      * @param character the character to search for.
    610      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else.
    611      */
    612     private static int findIndexOfChar(final Node node, int character) {
    613         final int insertionIndex = findInsertionIndex(node, character);
    614         if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND;
    615         return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex
    616                 : CHARACTER_NOT_FOUND;
    617     }
    618 
    619     /**
    620      * Helper method to find a word in a given branch.
    621      */
    622     @SuppressWarnings("unused")
    623     public static CharGroup findWordInTree(Node node, final String string) {
    624         int index = 0;
    625         final StringBuilder checker = DBG ? new StringBuilder() : null;
    626         final int[] codePoints = getCodePoints(string);
    627 
    628         CharGroup currentGroup;
    629         do {
    630             int indexOfGroup = findIndexOfChar(node, codePoints[index]);
    631             if (CHARACTER_NOT_FOUND == indexOfGroup) return null;
    632             currentGroup = node.mData.get(indexOfGroup);
    633 
    634             if (codePoints.length - index < currentGroup.mChars.length) return null;
    635             int newIndex = index;
    636             while (newIndex < codePoints.length && newIndex - index < currentGroup.mChars.length) {
    637                 if (currentGroup.mChars[newIndex - index] != codePoints[newIndex]) return null;
    638                 newIndex++;
    639             }
    640             index = newIndex;
    641 
    642             if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length));
    643             if (index < codePoints.length) {
    644                 node = currentGroup.mChildren;
    645             }
    646         } while (null != node && index < codePoints.length);
    647 
    648         if (index < codePoints.length) return null;
    649         if (!currentGroup.isTerminal()) return null;
    650         if (DBG && !string.equals(checker.toString())) return null;
    651         return currentGroup;
    652     }
    653 
    654     /**
    655      * Helper method to find out whether a word is in the dict or not.
    656      */
    657     public boolean hasWord(final String s) {
    658         if (null == s || "".equals(s)) {
    659             throw new RuntimeException("Can't search for a null or empty string");
    660         }
    661         return null != findWordInTree(mRoot, s);
    662     }
    663 
    664     /**
    665      * Recursively count the number of character groups in a given branch of the trie.
    666      *
    667      * @param node the parent node.
    668      * @return the number of char groups in all the branch under this node.
    669      */
    670     public static int countCharGroups(final Node node) {
    671         final int nodeSize = node.mData.size();
    672         int size = nodeSize;
    673         for (int i = nodeSize - 1; i >= 0; --i) {
    674             CharGroup group = node.mData.get(i);
    675             if (null != group.mChildren)
    676                 size += countCharGroups(group.mChildren);
    677         }
    678         return size;
    679     }
    680 
    681     /**
    682      * Recursively count the number of nodes in a given branch of the trie.
    683      *
    684      * @param node the node to count.
    685      * @return the number of nodes in this branch.
    686      */
    687     public static int countNodes(final Node node) {
    688         int size = 1;
    689         for (int i = node.mData.size() - 1; i >= 0; --i) {
    690             CharGroup group = node.mData.get(i);
    691             if (null != group.mChildren)
    692                 size += countNodes(group.mChildren);
    693         }
    694         return size;
    695     }
    696 
    697     // Recursively find out whether there are any bigrams.
    698     // This can be pretty expensive especially if there aren't any (we return as soon
    699     // as we find one, so it's much cheaper if there are bigrams)
    700     private static boolean hasBigramsInternal(final Node node) {
    701         if (null == node) return false;
    702         for (int i = node.mData.size() - 1; i >= 0; --i) {
    703             CharGroup group = node.mData.get(i);
    704             if (null != group.mBigrams) return true;
    705             if (hasBigramsInternal(group.mChildren)) return true;
    706         }
    707         return false;
    708     }
    709 
    710     /**
    711      * Finds out whether there are any bigrams in this dictionary.
    712      *
    713      * @return true if there is any bigram, false otherwise.
    714      */
    715     // TODO: this is expensive especially for large dictionaries without any bigram.
    716     // The up side is, this is always accurate and correct and uses no memory. We should
    717     // find a more efficient way of doing this, without compromising too much on memory
    718     // and ease of use.
    719     public boolean hasBigrams() {
    720         return hasBigramsInternal(mRoot);
    721     }
    722 
    723     // Historically, the tails of the words were going to be merged to save space.
    724     // However, that would prevent the code to search for a specific address in log(n)
    725     // time so this was abandoned.
    726     // The code is still of interest as it does add some compression to any dictionary
    727     // that has no need for attributes. Implementations that does not read attributes should be
    728     // able to read a dictionary with merged tails.
    729     // Also, the following code does support frequencies, as in, it will only merges
    730     // tails that share the same frequency. Though it would result in the above loss of
    731     // performance while searching by address, it is still technically possible to merge
    732     // tails that contain attributes, but this code does not take that into account - it does
    733     // not compare attributes and will merge terminals with different attributes regardless.
    734     public void mergeTails() {
    735         MakedictLog.i("Do not merge tails");
    736         return;
    737 
    738 //        MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root));
    739 //        MakedictLog.i("Number of groups : " + countCharGroups(root));
    740 //
    741 //        final HashMap<String, ArrayList<Node>> repository =
    742 //                  new HashMap<String, ArrayList<Node>>();
    743 //        mergeTailsInner(repository, root);
    744 //
    745 //        MakedictLog.i("Number of different pseudohashes : " + repository.size());
    746 //        int size = 0;
    747 //        for (ArrayList<Node> a : repository.values()) {
    748 //            size += a.size();
    749 //        }
    750 //        MakedictLog.i("Number of nodes after merge : " + (1 + size));
    751 //        MakedictLog.i("Recursively seen nodes : " + countNodes(root));
    752     }
    753 
    754     // The following methods are used by the deactivated mergeTails()
    755 //   private static boolean isEqual(Node a, Node b) {
    756 //       if (null == a && null == b) return true;
    757 //       if (null == a || null == b) return false;
    758 //       if (a.data.size() != b.data.size()) return false;
    759 //       final int size = a.data.size();
    760 //       for (int i = size - 1; i >= 0; --i) {
    761 //           CharGroup aGroup = a.data.get(i);
    762 //           CharGroup bGroup = b.data.get(i);
    763 //           if (aGroup.frequency != bGroup.frequency) return false;
    764 //           if (aGroup.alternates == null && bGroup.alternates != null) return false;
    765 //           if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false;
    766 //           if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false;
    767 //           if (!isEqual(aGroup.children, bGroup.children)) return false;
    768 //       }
    769 //       return true;
    770 //   }
    771 
    772 //   static private HashMap<String, ArrayList<Node>> mergeTailsInner(
    773 //           final HashMap<String, ArrayList<Node>> map, final Node node) {
    774 //       final ArrayList<CharGroup> branches = node.data;
    775 //       final int nodeSize = branches.size();
    776 //       for (int i = 0; i < nodeSize; ++i) {
    777 //           CharGroup group = branches.get(i);
    778 //           if (null != group.children) {
    779 //               String pseudoHash = getPseudoHash(group.children);
    780 //               ArrayList<Node> similarList = map.get(pseudoHash);
    781 //               if (null == similarList) {
    782 //                   similarList = new ArrayList<Node>();
    783 //                   map.put(pseudoHash, similarList);
    784 //               }
    785 //               boolean merged = false;
    786 //               for (Node similar : similarList) {
    787 //                   if (isEqual(group.children, similar)) {
    788 //                       group.children = similar;
    789 //                       merged = true;
    790 //                       break;
    791 //                   }
    792 //               }
    793 //               if (!merged) {
    794 //                   similarList.add(group.children);
    795 //               }
    796 //               mergeTailsInner(map, group.children);
    797 //           }
    798 //       }
    799 //       return map;
    800 //   }
    801 
    802 //  private static String getPseudoHash(final Node node) {
    803 //      StringBuilder s = new StringBuilder();
    804 //      for (CharGroup g : node.data) {
    805 //          s.append(g.frequency);
    806 //          for (int ch : g.chars) {
    807 //              s.append(Character.toChars(ch));
    808 //          }
    809 //      }
    810 //      return s.toString();
    811 //  }
    812 
    813     /**
    814      * Iterator to walk through a dictionary.
    815      *
    816      * This is purely for convenience.
    817      */
    818     public static final class DictionaryIterator implements Iterator<Word> {
    819         private static final class Position {
    820             public Iterator<CharGroup> pos;
    821             public int length;
    822             public Position(ArrayList<CharGroup> groups) {
    823                 pos = groups.iterator();
    824                 length = 0;
    825             }
    826         }
    827         final StringBuilder mCurrentString;
    828         final LinkedList<Position> mPositions;
    829 
    830         public DictionaryIterator(ArrayList<CharGroup> root) {
    831             mCurrentString = new StringBuilder();
    832             mPositions = new LinkedList<Position>();
    833             final Position rootPos = new Position(root);
    834             mPositions.add(rootPos);
    835         }
    836 
    837         @Override
    838         public boolean hasNext() {
    839             for (Position p : mPositions) {
    840                 if (p.pos.hasNext()) {
    841                     return true;
    842                 }
    843             }
    844             return false;
    845         }
    846 
    847         @Override
    848         public Word next() {
    849             Position currentPos = mPositions.getLast();
    850             mCurrentString.setLength(currentPos.length);
    851 
    852             do {
    853                 if (currentPos.pos.hasNext()) {
    854                     final CharGroup currentGroup = currentPos.pos.next();
    855                     currentPos.length = mCurrentString.length();
    856                     for (int i : currentGroup.mChars) {
    857                         mCurrentString.append(Character.toChars(i));
    858                     }
    859                     if (null != currentGroup.mChildren) {
    860                         currentPos = new Position(currentGroup.mChildren.mData);
    861                         currentPos.length = mCurrentString.length();
    862                         mPositions.addLast(currentPos);
    863                     }
    864                     if (currentGroup.mFrequency >= 0) {
    865                         return new Word(mCurrentString.toString(), currentGroup.mFrequency,
    866                                 currentGroup.mShortcutTargets, currentGroup.mBigrams,
    867                                 currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry);
    868                     }
    869                 } else {
    870                     mPositions.removeLast();
    871                     currentPos = mPositions.getLast();
    872                     mCurrentString.setLength(mPositions.getLast().length);
    873                 }
    874             } while (true);
    875         }
    876 
    877         @Override
    878         public void remove() {
    879             throw new UnsupportedOperationException("Unsupported yet");
    880         }
    881 
    882     }
    883 
    884     /**
    885      * Method to return an iterator.
    886      *
    887      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
    888      * and say : for (Word w : x) {}
    889      */
    890     @Override
    891     public Iterator<Word> iterator() {
    892         return new DictionaryIterator(mRoot.mData);
    893     }
    894 }
    895