Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.Constants;
     21 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
     22 
     23 import java.util.ArrayList;
     24 import java.util.Arrays;
     25 import java.util.Collections;
     26 import java.util.Iterator;
     27 import java.util.LinkedList;
     28 
     29 /**
     30  * A dictionary that can fusion heads and tails of words for more compression.
     31  */
     32 @UsedForTesting
     33 public final class FusionDictionary implements Iterable<WordProperty> {
     34     private static final boolean DBG = MakedictLog.DBG;
     35 
     36     private static int CHARACTER_NOT_FOUND_INDEX = -1;
     37 
     38     /**
     39      * A node array of the dictionary, containing several PtNodes.
     40      *
     41      * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
     42      * real information.
     43      * This class also contains fields to cache size and address, to help with binary
     44      * generation.
     45      */
     46     public static final class PtNodeArray {
     47         ArrayList<PtNode> mData;
     48         // To help with binary generation
     49         int mCachedSize = Integer.MIN_VALUE;
     50         // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
     51         // always hold the same value except between dictionary address compression, during which
     52         // the update process needs to know about both values at the same time. Updating will
     53         // update the AfterUpdate value, and the code will move them to BeforeUpdate before
     54         // the next update pass.
     55         int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
     56         int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
     57         int mCachedParentAddress = 0;
     58 
     59         public PtNodeArray() {
     60             mData = new ArrayList<>();
     61         }
     62         public PtNodeArray(ArrayList<PtNode> data) {
     63             Collections.sort(data, PTNODE_COMPARATOR);
     64             mData = data;
     65         }
     66     }
     67 
     68     /**
     69      * PtNode is a group of characters, with probability information, shortcut targets, bigrams,
     70      * and children (Pt means Patricia Trie).
     71      *
     72      * This is the central class of the in-memory representation. A PtNode is what can
     73      * be seen as a traditional "trie node", except it can hold several characters at the
     74      * same time. A PtNode essentially represents one or several characters in the middle
     75      * of the trie tree; as such, it can be a terminal, and it can have children.
     76      * In this in-memory representation, whether the PtNode is a terminal or not is represented
     77      * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the
     78      * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null
     79      * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present,
     80      * are non-null.
     81      */
     82     public static final class PtNode {
     83         private static final int NOT_A_TERMINAL = -1;
     84         final int mChars[];
     85         ArrayList<WeightedString> mShortcutTargets;
     86         ArrayList<WeightedString> mBigrams;
     87         // null == mProbabilityInfo indicates this is not a terminal.
     88         ProbabilityInfo mProbabilityInfo;
     89         int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
     90         PtNodeArray mChildren;
     91         boolean mIsNotAWord; // Only a shortcut
     92         boolean mIsBlacklistEntry;
     93         // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
     94         // generation. Before and After always hold the same value except during dictionary
     95         // address compression, where the update process needs to know about both values at the
     96         // same time. Updating will update the AfterUpdate value, and the code will move them
     97         // to BeforeUpdate before the next update pass.
     98         // The update process does not need two versions of mCachedSize.
     99         int mCachedSize; // The size, in bytes, of this PtNode.
    100         int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
    101         int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
    102 
    103         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    104                 final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
    105                 final boolean isNotAWord, final boolean isBlacklistEntry) {
    106             mChars = chars;
    107             mProbabilityInfo = probabilityInfo;
    108             mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
    109             mShortcutTargets = shortcutTargets;
    110             mBigrams = bigrams;
    111             mChildren = null;
    112             mIsNotAWord = isNotAWord;
    113             mIsBlacklistEntry = isBlacklistEntry;
    114         }
    115 
    116         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    117                 final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
    118                 final boolean isNotAWord, final boolean isBlacklistEntry,
    119                 final PtNodeArray children) {
    120             mChars = chars;
    121             mProbabilityInfo = probabilityInfo;
    122             mShortcutTargets = shortcutTargets;
    123             mBigrams = bigrams;
    124             mChildren = children;
    125             mIsNotAWord = isNotAWord;
    126             mIsBlacklistEntry = isBlacklistEntry;
    127         }
    128 
    129         public void addChild(PtNode n) {
    130             if (null == mChildren) {
    131                 mChildren = new PtNodeArray();
    132             }
    133             mChildren.mData.add(n);
    134         }
    135 
    136         public int getTerminalId() {
    137             return mTerminalId;
    138         }
    139 
    140         public boolean isTerminal() {
    141             return mProbabilityInfo != null;
    142         }
    143 
    144         public int getProbability() {
    145             if (isTerminal()) {
    146                 return mProbabilityInfo.mProbability;
    147             } else {
    148                 return NOT_A_TERMINAL;
    149             }
    150         }
    151 
    152         public boolean getIsNotAWord() {
    153             return mIsNotAWord;
    154         }
    155 
    156         public boolean getIsBlacklistEntry() {
    157             return mIsBlacklistEntry;
    158         }
    159 
    160         public ArrayList<WeightedString> getShortcutTargets() {
    161             // We don't want write permission to escape outside the package, so we return a copy
    162             if (null == mShortcutTargets) return null;
    163             final ArrayList<WeightedString> copyOfShortcutTargets =
    164                     new ArrayList<>(mShortcutTargets);
    165             return copyOfShortcutTargets;
    166         }
    167 
    168         public ArrayList<WeightedString> getBigrams() {
    169             // We don't want write permission to escape outside the package, so we return a copy
    170             if (null == mBigrams) return null;
    171             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams);
    172             return copyOfBigrams;
    173         }
    174 
    175         public boolean hasSeveralChars() {
    176             assert(mChars.length > 0);
    177             return 1 < mChars.length;
    178         }
    179 
    180         /**
    181          * Adds a word to the bigram list. Updates the probability information if the word already
    182          * exists.
    183          */
    184         public void addBigram(final String word, final ProbabilityInfo probabilityInfo) {
    185             if (mBigrams == null) {
    186                 mBigrams = new ArrayList<>();
    187             }
    188             WeightedString bigram = getBigram(word);
    189             if (bigram != null) {
    190                 bigram.mProbabilityInfo = probabilityInfo;
    191             } else {
    192                 bigram = new WeightedString(word, probabilityInfo);
    193                 mBigrams.add(bigram);
    194             }
    195         }
    196 
    197         /**
    198          * Gets the shortcut target for the given word. Returns null if the word is not in the
    199          * shortcut list.
    200          */
    201         public WeightedString getShortcut(final String word) {
    202             // TODO: Don't do a linear search
    203             if (mShortcutTargets != null) {
    204                 final int size = mShortcutTargets.size();
    205                 for (int i = 0; i < size; ++i) {
    206                     WeightedString shortcut = mShortcutTargets.get(i);
    207                     if (shortcut.mWord.equals(word)) {
    208                         return shortcut;
    209                     }
    210                 }
    211             }
    212             return null;
    213         }
    214 
    215         /**
    216          * Gets the bigram for the given word.
    217          * Returns null if the word is not in the bigrams list.
    218          */
    219         public WeightedString getBigram(final String word) {
    220             // TODO: Don't do a linear search
    221             if (mBigrams != null) {
    222                 final int size = mBigrams.size();
    223                 for (int i = 0; i < size; ++i) {
    224                     WeightedString bigram = mBigrams.get(i);
    225                     if (bigram.mWord.equals(word)) {
    226                         return bigram;
    227                     }
    228                 }
    229             }
    230             return null;
    231         }
    232 
    233         /**
    234          * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
    235          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
    236          * updated if they are higher than the existing ones.
    237          */
    238         private void update(final ProbabilityInfo probabilityInfo,
    239                 final ArrayList<WeightedString> shortcutTargets,
    240                 final ArrayList<WeightedString> bigrams,
    241                 final boolean isNotAWord, final boolean isBlacklistEntry) {
    242             mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
    243             if (shortcutTargets != null) {
    244                 if (mShortcutTargets == null) {
    245                     mShortcutTargets = shortcutTargets;
    246                 } else {
    247                     final int size = shortcutTargets.size();
    248                     for (int i = 0; i < size; ++i) {
    249                         final WeightedString shortcut = shortcutTargets.get(i);
    250                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
    251                         if (existingShortcut == null) {
    252                             mShortcutTargets.add(shortcut);
    253                         } else {
    254                             existingShortcut.mProbabilityInfo = ProbabilityInfo.max(
    255                                     existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo);
    256                         }
    257                     }
    258                 }
    259             }
    260             if (bigrams != null) {
    261                 if (mBigrams == null) {
    262                     mBigrams = bigrams;
    263                 } else {
    264                     final int size = bigrams.size();
    265                     for (int i = 0; i < size; ++i) {
    266                         final WeightedString bigram = bigrams.get(i);
    267                         final WeightedString existingBigram = getBigram(bigram.mWord);
    268                         if (existingBigram == null) {
    269                             mBigrams.add(bigram);
    270                         } else {
    271                             existingBigram.mProbabilityInfo = ProbabilityInfo.max(
    272                                     existingBigram.mProbabilityInfo, bigram.mProbabilityInfo);
    273                         }
    274                     }
    275                 }
    276             }
    277             mIsNotAWord = isNotAWord;
    278             mIsBlacklistEntry = isBlacklistEntry;
    279         }
    280     }
    281 
    282     public final DictionaryOptions mOptions;
    283     public final PtNodeArray mRootNodeArray;
    284 
    285     public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) {
    286         mRootNodeArray = rootNodeArray;
    287         mOptions = options;
    288     }
    289 
    290     public void addOptionAttribute(final String key, final String value) {
    291         mOptions.mAttributes.put(key, value);
    292     }
    293 
    294     /**
    295      * Helper method to convert a String to an int array.
    296      */
    297     static int[] getCodePoints(final String word) {
    298         // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray,
    299         // which is not visible from the makedict package. Factor this code.
    300         final int length = word.length();
    301         if (length <= 0) return new int[] {};
    302         final char[] characters = word.toCharArray();
    303         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
    304         int codePoint = Character.codePointAt(characters, 0);
    305         int dsti = 0;
    306         for (int srci = Character.charCount(codePoint);
    307                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
    308             codePoints[dsti] = codePoint;
    309             codePoint = Character.codePointAt(characters, srci);
    310         }
    311         codePoints[dsti] = codePoint;
    312         return codePoints;
    313     }
    314 
    315     /**
    316      * Helper method to add a word as a string.
    317      *
    318      * This method adds a word to the dictionary with the given frequency. Optional
    319      * lists of bigrams and shortcuts can be passed here. For each word inside,
    320      * they will be added to the dictionary as necessary.
    321      *
    322      * @param word the word to add.
    323      * @param probabilityInfo probability information of the word.
    324      * @param shortcutTargets a list of shortcut targets for this word, or null.
    325      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
    326      */
    327     public void add(final String word, final ProbabilityInfo probabilityInfo,
    328             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
    329         add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord,
    330                 false /* isBlacklistEntry */);
    331     }
    332 
    333     /**
    334      * Helper method to add a blacklist entry as a string.
    335      *
    336      * @param word the word to add as a blacklist entry.
    337      * @param shortcutTargets a list of shortcut targets for this word, or null.
    338      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
    339      */
    340     public void addBlacklistEntry(final String word,
    341             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
    342         add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord,
    343                 true /* isBlacklistEntry */);
    344     }
    345 
    346     /**
    347      * Sanity check for a PtNode array.
    348      *
    349      * This method checks that all PtNodes in a node array are ordered as expected.
    350      * If they are, nothing happens. If they aren't, an exception is thrown.
    351      */
    352     private void checkStack(PtNodeArray ptNodeArray) {
    353         ArrayList<PtNode> stack = ptNodeArray.mData;
    354         int lastValue = -1;
    355         for (int i = 0; i < stack.size(); ++i) {
    356             int currentValue = stack.get(i).mChars[0];
    357             if (currentValue <= lastValue)
    358                 throw new RuntimeException("Invalid stack");
    359             else
    360                 lastValue = currentValue;
    361         }
    362     }
    363 
    364     /**
    365      * Helper method to add a new bigram to the dictionary.
    366      *
    367      * @param word0 the previous word of the context
    368      * @param word1 the next word of the context
    369      * @param probabilityInfo the bigram probability info
    370      */
    371     public void setBigram(final String word0, final String word1,
    372             final ProbabilityInfo probabilityInfo) {
    373         PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
    374         if (ptNode0 != null) {
    375             final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
    376             if (ptNode1 == null) {
    377                 add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
    378                         false /* isBlacklistEntry */);
    379                 // The PtNode for the first word may have moved by the above insertion,
    380                 // if word1 and word2 share a common stem that happens not to have been
    381                 // a cutting point until now. In this case, we need to refresh ptNode.
    382                 ptNode0 = findWordInTree(mRootNodeArray, word0);
    383             }
    384             ptNode0.addBigram(word1, probabilityInfo);
    385         } else {
    386             throw new RuntimeException("First word of bigram not found " + word0);
    387         }
    388     }
    389 
    390     /**
    391      * Add a word to this dictionary.
    392      *
    393      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
    394      * an exception is thrown.
    395      *
    396      * @param word the word, as an int array.
    397      * @param probabilityInfo the probability information of the word.
    398      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
    399      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
    400      * @param isBlacklistEntry true if this is a blacklisted word, false otherwise
    401      */
    402     private void add(final int[] word, final ProbabilityInfo probabilityInfo,
    403             final ArrayList<WeightedString> shortcutTargets,
    404             final boolean isNotAWord, final boolean isBlacklistEntry) {
    405         assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
    406         if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
    407             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
    408             return;
    409         }
    410 
    411         PtNodeArray currentNodeArray = mRootNodeArray;
    412         int charIndex = 0;
    413 
    414         PtNode currentPtNode = null;
    415         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
    416         int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
    417         while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
    418             currentPtNode = currentNodeArray.mData.get(nodeIndex);
    419             differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
    420             if (ARRAYS_ARE_EQUAL != differentCharIndex
    421                     && differentCharIndex < currentPtNode.mChars.length) break;
    422             if (null == currentPtNode.mChildren) break;
    423             charIndex += currentPtNode.mChars.length;
    424             if (charIndex >= word.length) break;
    425             currentNodeArray = currentPtNode.mChildren;
    426             nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
    427         }
    428 
    429         if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
    430             // No node at this point to accept the word. Create one.
    431             final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
    432             final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
    433                     shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
    434                     isBlacklistEntry);
    435             currentNodeArray.mData.add(insertionIndex, newPtNode);
    436             if (DBG) checkStack(currentNodeArray);
    437         } else {
    438             // There is a word with a common prefix.
    439             if (differentCharIndex == currentPtNode.mChars.length) {
    440                 if (charIndex + differentCharIndex >= word.length) {
    441                     // The new word is a prefix of an existing word, but the node on which it
    442                     // should end already exists as is. Since the old PtNode was not a terminal,
    443                     // make it one by filling in its frequency and other attributes
    444                     currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
    445                             isBlacklistEntry);
    446                 } else {
    447                     // The new word matches the full old word and extends past it.
    448                     // We only have to create a new node and add it to the end of this.
    449                     final PtNode newNode = new PtNode(
    450                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
    451                                     shortcutTargets, null /* bigrams */, probabilityInfo,
    452                                     isNotAWord, isBlacklistEntry);
    453                     currentPtNode.mChildren = new PtNodeArray();
    454                     currentPtNode.mChildren.mData.add(newNode);
    455                 }
    456             } else {
    457                 if (0 == differentCharIndex) {
    458                     // Exact same word. Update the frequency if higher. This will also add the
    459                     // new shortcuts to the existing shortcut list if it already exists.
    460                     currentPtNode.update(probabilityInfo, shortcutTargets, null,
    461                             currentPtNode.mIsNotAWord && isNotAWord,
    462                             currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
    463                 } else {
    464                     // Partial prefix match only. We have to replace the current node with a node
    465                     // containing the current prefix and create two new ones for the tails.
    466                     PtNodeArray newChildren = new PtNodeArray();
    467                     final PtNode newOldWord = new PtNode(
    468                             Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
    469                                     currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
    470                             currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
    471                             currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
    472                             currentPtNode.mChildren);
    473                     newChildren.mData.add(newOldWord);
    474 
    475                     final PtNode newParent;
    476                     if (charIndex + differentCharIndex >= word.length) {
    477                         newParent = new PtNode(
    478                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
    479                                 shortcutTargets, null /* bigrams */, probabilityInfo,
    480                                 isNotAWord, isBlacklistEntry, newChildren);
    481                     } else {
    482                         newParent = new PtNode(
    483                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
    484                                 null /* shortcutTargets */, null /* bigrams */,
    485                                 null /* probabilityInfo */, false /* isNotAWord */,
    486                                 false /* isBlacklistEntry */, newChildren);
    487                         final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
    488                                 charIndex + differentCharIndex, word.length),
    489                                 shortcutTargets, null /* bigrams */, probabilityInfo,
    490                                 isNotAWord, isBlacklistEntry);
    491                         final int addIndex = word[charIndex + differentCharIndex]
    492                                 > currentPtNode.mChars[differentCharIndex] ? 1 : 0;
    493                         newChildren.mData.add(addIndex, newWord);
    494                     }
    495                     currentNodeArray.mData.set(nodeIndex, newParent);
    496                 }
    497                 if (DBG) checkStack(currentNodeArray);
    498             }
    499         }
    500     }
    501 
    502     private static int ARRAYS_ARE_EQUAL = 0;
    503 
    504     /**
    505      * Custom comparison of two int arrays taken to contain character codes.
    506      *
    507      * This method compares the two arrays passed as an argument in a lexicographic way,
    508      * with an offset in the dst string.
    509      * This method does NOT test for the first character. It is taken to be equal.
    510      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
    511      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
    512      * strings are equal. This works BECAUSE we don't look at the first character.
    513      *
    514      * @param src the left-hand side string of the comparison.
    515      * @param dst the right-hand side string of the comparison.
    516      * @param dstOffset the offset in the right-hand side string.
    517      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
    518      */
    519     private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) {
    520         // We do NOT test the first char, because we come from a method that already
    521         // tested it.
    522         for (int i = 1; i < src.length; ++i) {
    523             if (dstOffset + i >= dst.length) return i;
    524             if (src[i] != dst[dstOffset + i]) return i;
    525         }
    526         if (dst.length > src.length) return src.length;
    527         return ARRAYS_ARE_EQUAL;
    528     }
    529 
    530     /**
    531      * Helper class that compares and sorts two PtNodes according to their
    532      * first element only. I repeat: ONLY the first element is considered, the rest
    533      * is ignored.
    534      * This comparator imposes orderings that are inconsistent with equals.
    535      */
    536     static private final class PtNodeComparator implements java.util.Comparator<PtNode> {
    537         @Override
    538         public int compare(PtNode p1, PtNode p2) {
    539             if (p1.mChars[0] == p2.mChars[0]) return 0;
    540             return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
    541         }
    542     }
    543     final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
    544 
    545     /**
    546      * Finds the insertion index of a character within a node array.
    547      */
    548     private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
    549         final ArrayList<PtNode> data = nodeArray.mData;
    550         final PtNode reference = new PtNode(new int[] { character },
    551                 null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
    552                 false /* isNotAWord */, false /* isBlacklistEntry */);
    553         int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
    554         return result >= 0 ? result : -result - 1;
    555     }
    556 
    557     /**
    558      * Find the index of a char in a node array, if it exists.
    559      *
    560      * @param nodeArray the node array to search in.
    561      * @param character the character to search for.
    562      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else.
    563      */
    564     private static int findIndexOfChar(final PtNodeArray nodeArray, int character) {
    565         final int insertionIndex = findInsertionIndex(nodeArray, character);
    566         if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX;
    567         return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex
    568                 : CHARACTER_NOT_FOUND_INDEX;
    569     }
    570 
    571     /**
    572      * Helper method to find a word in a given branch.
    573      */
    574     public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) {
    575         int index = 0;
    576         final StringBuilder checker = DBG ? new StringBuilder() : null;
    577         final int[] codePoints = getCodePoints(string);
    578 
    579         PtNode currentPtNode;
    580         do {
    581             int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
    582             if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
    583             currentPtNode = nodeArray.mData.get(indexOfGroup);
    584 
    585             if (codePoints.length - index < currentPtNode.mChars.length) return null;
    586             int newIndex = index;
    587             while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
    588                 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
    589                 newIndex++;
    590             }
    591             index = newIndex;
    592 
    593             if (DBG) {
    594                 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
    595             }
    596             if (index < codePoints.length) {
    597                 nodeArray = currentPtNode.mChildren;
    598             }
    599         } while (null != nodeArray && index < codePoints.length);
    600 
    601         if (index < codePoints.length) return null;
    602         if (!currentPtNode.isTerminal()) return null;
    603         if (DBG && !string.equals(checker.toString())) return null;
    604         return currentPtNode;
    605     }
    606 
    607     /**
    608      * Helper method to find out whether a word is in the dict or not.
    609      */
    610     public boolean hasWord(final String s) {
    611         if (null == s || "".equals(s)) {
    612             throw new RuntimeException("Can't search for a null or empty string");
    613         }
    614         return null != findWordInTree(mRootNodeArray, s);
    615     }
    616 
    617     /**
    618      * Recursively count the number of PtNodes in a given branch of the trie.
    619      *
    620      * @param nodeArray the parent node.
    621      * @return the number of PtNodes in all the branch under this node.
    622      */
    623     public static int countPtNodes(final PtNodeArray nodeArray) {
    624         final int nodeSize = nodeArray.mData.size();
    625         int size = nodeSize;
    626         for (int i = nodeSize - 1; i >= 0; --i) {
    627             PtNode ptNode = nodeArray.mData.get(i);
    628             if (null != ptNode.mChildren)
    629                 size += countPtNodes(ptNode.mChildren);
    630         }
    631         return size;
    632     }
    633 
    634     /**
    635      * Iterator to walk through a dictionary.
    636      *
    637      * This is purely for convenience.
    638      */
    639     public static final class DictionaryIterator implements Iterator<WordProperty> {
    640         private static final class Position {
    641             public Iterator<PtNode> pos;
    642             public int length;
    643             public Position(ArrayList<PtNode> ptNodes) {
    644                 pos = ptNodes.iterator();
    645                 length = 0;
    646             }
    647         }
    648         final StringBuilder mCurrentString;
    649         final LinkedList<Position> mPositions;
    650 
    651         public DictionaryIterator(ArrayList<PtNode> ptRoot) {
    652             mCurrentString = new StringBuilder();
    653             mPositions = new LinkedList<>();
    654             final Position rootPos = new Position(ptRoot);
    655             mPositions.add(rootPos);
    656         }
    657 
    658         @Override
    659         public boolean hasNext() {
    660             for (Position p : mPositions) {
    661                 if (p.pos.hasNext()) {
    662                     return true;
    663                 }
    664             }
    665             return false;
    666         }
    667 
    668         @Override
    669         public WordProperty next() {
    670             Position currentPos = mPositions.getLast();
    671             mCurrentString.setLength(currentPos.length);
    672 
    673             do {
    674                 if (currentPos.pos.hasNext()) {
    675                     final PtNode currentPtNode = currentPos.pos.next();
    676                     currentPos.length = mCurrentString.length();
    677                     for (int i : currentPtNode.mChars) {
    678                         mCurrentString.append(Character.toChars(i));
    679                     }
    680                     if (null != currentPtNode.mChildren) {
    681                         currentPos = new Position(currentPtNode.mChildren.mData);
    682                         currentPos.length = mCurrentString.length();
    683                         mPositions.addLast(currentPos);
    684                     }
    685                     if (currentPtNode.isTerminal()) {
    686                         return new WordProperty(mCurrentString.toString(),
    687                                 currentPtNode.mProbabilityInfo,
    688                                 currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
    689                                 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
    690                     }
    691                 } else {
    692                     mPositions.removeLast();
    693                     currentPos = mPositions.getLast();
    694                     mCurrentString.setLength(mPositions.getLast().length);
    695                 }
    696             } while (true);
    697         }
    698 
    699         @Override
    700         public void remove() {
    701             throw new UnsupportedOperationException("Unsupported yet");
    702         }
    703 
    704     }
    705 
    706     /**
    707      * Method to return an iterator.
    708      *
    709      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
    710      * and say : for (Word w : x) {}
    711      */
    712     @Override
    713     public Iterator<WordProperty> iterator() {
    714         return new DictionaryIterator(mRootNodeArray.mData);
    715     }
    716 }
    717