Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.define.DecoderSpecificConstants;
     21 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
     22 
     23 import java.util.ArrayList;
     24 import java.util.Arrays;
     25 import java.util.Collections;
     26 import java.util.Iterator;
     27 import java.util.LinkedList;
     28 
     29 /**
     30  * A dictionary that can fusion heads and tails of words for more compression.
     31  */
     32 @UsedForTesting
     33 public final class FusionDictionary implements Iterable<WordProperty> {
     34     private static final boolean DBG = MakedictLog.DBG;
     35 
     36     private static int CHARACTER_NOT_FOUND_INDEX = -1;
     37 
     38     /**
     39      * A node array of the dictionary, containing several PtNodes.
     40      *
     41      * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
     42      * real information.
     43      * This class also contains fields to cache size and address, to help with binary
     44      * generation.
     45      */
     46     public static final class PtNodeArray {
     47         ArrayList<PtNode> mData;
     48         // To help with binary generation
     49         int mCachedSize = Integer.MIN_VALUE;
     50         // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
     51         // always hold the same value except between dictionary address compression, during which
     52         // the update process needs to know about both values at the same time. Updating will
     53         // update the AfterUpdate value, and the code will move them to BeforeUpdate before
     54         // the next update pass.
     55         int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
     56         int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
     57         int mCachedParentAddress = 0;
     58 
     59         public PtNodeArray() {
     60             mData = new ArrayList<>();
     61         }
     62         public PtNodeArray(ArrayList<PtNode> data) {
     63             Collections.sort(data, PTNODE_COMPARATOR);
     64             mData = data;
     65         }
     66     }
     67 
     68     /**
     69      * PtNode is a group of characters, with probability information, shortcut targets, bigrams,
     70      * and children (Pt means Patricia Trie).
     71      *
     72      * This is the central class of the in-memory representation. A PtNode is what can
     73      * be seen as a traditional "trie node", except it can hold several characters at the
     74      * same time. A PtNode essentially represents one or several characters in the middle
     75      * of the trie tree; as such, it can be a terminal, and it can have children.
     76      * In this in-memory representation, whether the PtNode is a terminal or not is represented
     77      * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the
     78      * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null
     79      * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present,
     80      * are non-null.
     81      */
     82     public static final class PtNode {
     83         private static final int NOT_A_TERMINAL = -1;
     84         final int mChars[];
     85         ArrayList<WeightedString> mBigrams;
     86         // null == mProbabilityInfo indicates this is not a terminal.
     87         ProbabilityInfo mProbabilityInfo;
     88         int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
     89         PtNodeArray mChildren;
     90         boolean mIsNotAWord; // Only a shortcut
     91         boolean mIsPossiblyOffensive;
     92         // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
     93         // generation. Before and After always hold the same value except during dictionary
     94         // address compression, where the update process needs to know about both values at the
     95         // same time. Updating will update the AfterUpdate value, and the code will move them
     96         // to BeforeUpdate before the next update pass.
     97         // The update process does not need two versions of mCachedSize.
     98         int mCachedSize; // The size, in bytes, of this PtNode.
     99         int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
    100         int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
    101 
    102         public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams,
    103                 final ProbabilityInfo probabilityInfo, final boolean isNotAWord,
    104                 final boolean isPossiblyOffensive) {
    105             mChars = chars;
    106             mProbabilityInfo = probabilityInfo;
    107             mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
    108             mBigrams = bigrams;
    109             mChildren = null;
    110             mIsNotAWord = isNotAWord;
    111             mIsPossiblyOffensive = isPossiblyOffensive;
    112         }
    113 
    114         public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams,
    115                 final ProbabilityInfo probabilityInfo, final boolean isNotAWord,
    116                 final boolean isPossiblyOffensive, final PtNodeArray children) {
    117             mChars = chars;
    118             mProbabilityInfo = probabilityInfo;
    119             mBigrams = bigrams;
    120             mChildren = children;
    121             mIsNotAWord = isNotAWord;
    122             mIsPossiblyOffensive = isPossiblyOffensive;
    123         }
    124 
    125         public void addChild(PtNode n) {
    126             if (null == mChildren) {
    127                 mChildren = new PtNodeArray();
    128             }
    129             mChildren.mData.add(n);
    130         }
    131 
    132         public int getTerminalId() {
    133             return mTerminalId;
    134         }
    135 
    136         public boolean isTerminal() {
    137             return mProbabilityInfo != null;
    138         }
    139 
    140         public int getProbability() {
    141             return isTerminal() ? mProbabilityInfo.mProbability : NOT_A_TERMINAL;
    142         }
    143 
    144         public boolean getIsNotAWord() {
    145             return mIsNotAWord;
    146         }
    147 
    148         public boolean getIsPossiblyOffensive() {
    149             return mIsPossiblyOffensive;
    150         }
    151 
    152         public ArrayList<WeightedString> getBigrams() {
    153             // We don't want write permission to escape outside the package, so we return a copy
    154             if (null == mBigrams) return null;
    155             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams);
    156             return copyOfBigrams;
    157         }
    158 
    159         public boolean hasSeveralChars() {
    160             assert(mChars.length > 0);
    161             return 1 < mChars.length;
    162         }
    163 
    164         /**
    165          * Adds a word to the bigram list. Updates the probability information if the word already
    166          * exists.
    167          */
    168         public void addBigram(final String word, final ProbabilityInfo probabilityInfo) {
    169             if (mBigrams == null) {
    170                 mBigrams = new ArrayList<>();
    171             }
    172             WeightedString bigram = getBigram(word);
    173             if (bigram != null) {
    174                 bigram.mProbabilityInfo = probabilityInfo;
    175             } else {
    176                 bigram = new WeightedString(word, probabilityInfo);
    177                 mBigrams.add(bigram);
    178             }
    179         }
    180 
    181         /**
    182          * Gets the bigram for the given word.
    183          * Returns null if the word is not in the bigrams list.
    184          */
    185         public WeightedString getBigram(final String word) {
    186             // TODO: Don't do a linear search
    187             if (mBigrams != null) {
    188                 final int size = mBigrams.size();
    189                 for (int i = 0; i < size; ++i) {
    190                     WeightedString bigram = mBigrams.get(i);
    191                     if (bigram.mWord.equals(word)) {
    192                         return bigram;
    193                     }
    194                 }
    195             }
    196             return null;
    197         }
    198 
    199         /**
    200          * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
    201          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
    202          * updated if they are higher than the existing ones.
    203          */
    204         void update(final ProbabilityInfo probabilityInfo,
    205                 final ArrayList<WeightedString> bigrams,
    206                 final boolean isNotAWord, final boolean isPossiblyOffensive) {
    207             mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
    208             if (bigrams != null) {
    209                 if (mBigrams == null) {
    210                     mBigrams = bigrams;
    211                 } else {
    212                     final int size = bigrams.size();
    213                     for (int i = 0; i < size; ++i) {
    214                         final WeightedString bigram = bigrams.get(i);
    215                         final WeightedString existingBigram = getBigram(bigram.mWord);
    216                         if (existingBigram == null) {
    217                             mBigrams.add(bigram);
    218                         } else {
    219                             existingBigram.mProbabilityInfo = ProbabilityInfo.max(
    220                                     existingBigram.mProbabilityInfo, bigram.mProbabilityInfo);
    221                         }
    222                     }
    223                 }
    224             }
    225             mIsNotAWord = isNotAWord;
    226             mIsPossiblyOffensive = isPossiblyOffensive;
    227         }
    228     }
    229 
    230     public final DictionaryOptions mOptions;
    231     public final PtNodeArray mRootNodeArray;
    232 
    233     public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) {
    234         mRootNodeArray = rootNodeArray;
    235         mOptions = options;
    236     }
    237 
    238     public void addOptionAttribute(final String key, final String value) {
    239         mOptions.mAttributes.put(key, value);
    240     }
    241 
    242     /**
    243      * Helper method to convert a String to an int array.
    244      */
    245     static int[] getCodePoints(final String word) {
    246         // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray,
    247         // which is not visible from the makedict package. Factor this code.
    248         final int length = word.length();
    249         if (length <= 0) return new int[] {};
    250         final char[] characters = word.toCharArray();
    251         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
    252         int codePoint = Character.codePointAt(characters, 0);
    253         int dsti = 0;
    254         for (int srci = Character.charCount(codePoint);
    255                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
    256             codePoints[dsti] = codePoint;
    257             codePoint = Character.codePointAt(characters, srci);
    258         }
    259         codePoints[dsti] = codePoint;
    260         return codePoints;
    261     }
    262 
    263     /**
    264      * Helper method to add a word as a string.
    265      *
    266      * This method adds a word to the dictionary with the given frequency. Optional
    267      * lists of bigrams can be passed here. For each word inside,
    268      * they will be added to the dictionary as necessary.
    269      *  @param word the word to add.
    270      * @param probabilityInfo probability information of the word.
    271      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
    272      * @param isPossiblyOffensive true if this word is possibly offensive
    273      */
    274     public void add(final String word, final ProbabilityInfo probabilityInfo,
    275             final boolean isNotAWord, final boolean isPossiblyOffensive) {
    276         add(getCodePoints(word), probabilityInfo, isNotAWord, isPossiblyOffensive);
    277     }
    278 
    279     /**
    280      * Sanity check for a PtNode array.
    281      *
    282      * This method checks that all PtNodes in a node array are ordered as expected.
    283      * If they are, nothing happens. If they aren't, an exception is thrown.
    284      */
    285     private static void checkStack(PtNodeArray ptNodeArray) {
    286         ArrayList<PtNode> stack = ptNodeArray.mData;
    287         int lastValue = -1;
    288         for (int i = 0; i < stack.size(); ++i) {
    289             int currentValue = stack.get(i).mChars[0];
    290             if (currentValue <= lastValue) {
    291                 throw new RuntimeException("Invalid stack");
    292             }
    293             lastValue = currentValue;
    294         }
    295     }
    296 
    297     /**
    298      * Helper method to add a new bigram to the dictionary.
    299      *
    300      * @param word0 the previous word of the context
    301      * @param word1 the next word of the context
    302      * @param probabilityInfo the bigram probability info
    303      */
    304     public void setBigram(final String word0, final String word1,
    305             final ProbabilityInfo probabilityInfo) {
    306         PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
    307         if (ptNode0 != null) {
    308             final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
    309             if (ptNode1 == null) {
    310                 add(getCodePoints(word1), new ProbabilityInfo(0), false /* isNotAWord */,
    311                         false /* isPossiblyOffensive */);
    312                 // The PtNode for the first word may have moved by the above insertion,
    313                 // if word1 and word2 share a common stem that happens not to have been
    314                 // a cutting point until now. In this case, we need to refresh ptNode.
    315                 ptNode0 = findWordInTree(mRootNodeArray, word0);
    316             }
    317             ptNode0.addBigram(word1, probabilityInfo);
    318         } else {
    319             throw new RuntimeException("First word of bigram not found " + word0);
    320         }
    321     }
    322 
    323     /**
    324      * Add a word to this dictionary.
    325      *
    326      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
    327      * an exception is thrown.
    328      *  @param word the word, as an int array.
    329      * @param probabilityInfo the probability information of the word.
    330      * @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so)
    331      * @param isPossiblyOffensive true if this word is possibly offensive
    332      */
    333     private void add(final int[] word, final ProbabilityInfo probabilityInfo,
    334             final boolean isNotAWord, final boolean isPossiblyOffensive) {
    335         assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
    336         if (word.length >= DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH) {
    337             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
    338             return;
    339         }
    340 
    341         PtNodeArray currentNodeArray = mRootNodeArray;
    342         int charIndex = 0;
    343 
    344         PtNode currentPtNode = null;
    345         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
    346         int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
    347         while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
    348             currentPtNode = currentNodeArray.mData.get(nodeIndex);
    349             differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
    350             if (ARRAYS_ARE_EQUAL != differentCharIndex
    351                     && differentCharIndex < currentPtNode.mChars.length) break;
    352             if (null == currentPtNode.mChildren) break;
    353             charIndex += currentPtNode.mChars.length;
    354             if (charIndex >= word.length) break;
    355             currentNodeArray = currentPtNode.mChildren;
    356             nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
    357         }
    358 
    359         if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
    360             // No node at this point to accept the word. Create one.
    361             final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
    362             final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
    363                     null /* bigrams */, probabilityInfo, isNotAWord,
    364                     isPossiblyOffensive);
    365             currentNodeArray.mData.add(insertionIndex, newPtNode);
    366             if (DBG) checkStack(currentNodeArray);
    367         } else {
    368             // There is a word with a common prefix.
    369             if (differentCharIndex == currentPtNode.mChars.length) {
    370                 if (charIndex + differentCharIndex >= word.length) {
    371                     // The new word is a prefix of an existing word, but the node on which it
    372                     // should end already exists as is. Since the old PtNode was not a terminal,
    373                     // make it one by filling in its frequency and other attributes
    374                     currentPtNode.update(probabilityInfo, null, isNotAWord,
    375                             isPossiblyOffensive);
    376                 } else {
    377                     // The new word matches the full old word and extends past it.
    378                     // We only have to create a new node and add it to the end of this.
    379                     final PtNode newNode = new PtNode(
    380                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
    381                                     null /* bigrams */, probabilityInfo,
    382                                     isNotAWord, isPossiblyOffensive);
    383                     currentPtNode.mChildren = new PtNodeArray();
    384                     currentPtNode.mChildren.mData.add(newNode);
    385                 }
    386             } else {
    387                 if (0 == differentCharIndex) {
    388                     // Exact same word. Update the frequency if higher. This will also add the
    389                     // new shortcuts to the existing shortcut list if it already exists.
    390                     currentPtNode.update(probabilityInfo, null,
    391                             currentPtNode.mIsNotAWord && isNotAWord,
    392                             currentPtNode.mIsPossiblyOffensive || isPossiblyOffensive);
    393                 } else {
    394                     // Partial prefix match only. We have to replace the current node with a node
    395                     // containing the current prefix and create two new ones for the tails.
    396                     PtNodeArray newChildren = new PtNodeArray();
    397                     final PtNode newOldWord = new PtNode(
    398                             Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
    399                                     currentPtNode.mChars.length),
    400                             currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
    401                             currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive,
    402                             currentPtNode.mChildren);
    403                     newChildren.mData.add(newOldWord);
    404 
    405                     final PtNode newParent;
    406                     if (charIndex + differentCharIndex >= word.length) {
    407                         newParent = new PtNode(
    408                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
    409                                 null /* bigrams */, probabilityInfo,
    410                                 isNotAWord, isPossiblyOffensive, newChildren);
    411                     } else {
    412                         newParent = new PtNode(
    413                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
    414                                 null /* bigrams */, null /* probabilityInfo */,
    415                                 false /* isNotAWord */, false /* isPossiblyOffensive */,
    416                                 newChildren);
    417                         final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
    418                                 charIndex + differentCharIndex, word.length),
    419                                 null /* bigrams */, probabilityInfo,
    420                                 isNotAWord, isPossiblyOffensive);
    421                         final int addIndex = word[charIndex + differentCharIndex]
    422                                 > currentPtNode.mChars[differentCharIndex] ? 1 : 0;
    423                         newChildren.mData.add(addIndex, newWord);
    424                     }
    425                     currentNodeArray.mData.set(nodeIndex, newParent);
    426                 }
    427                 if (DBG) checkStack(currentNodeArray);
    428             }
    429         }
    430     }
    431 
    432     private static int ARRAYS_ARE_EQUAL = 0;
    433 
    434     /**
    435      * Custom comparison of two int arrays taken to contain character codes.
    436      *
    437      * This method compares the two arrays passed as an argument in a lexicographic way,
    438      * with an offset in the dst string.
    439      * This method does NOT test for the first character. It is taken to be equal.
    440      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
    441      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
    442      * strings are equal. This works BECAUSE we don't look at the first character.
    443      *
    444      * @param src the left-hand side string of the comparison.
    445      * @param dst the right-hand side string of the comparison.
    446      * @param dstOffset the offset in the right-hand side string.
    447      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
    448      */
    449     private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) {
    450         // We do NOT test the first char, because we come from a method that already
    451         // tested it.
    452         for (int i = 1; i < src.length; ++i) {
    453             if (dstOffset + i >= dst.length) return i;
    454             if (src[i] != dst[dstOffset + i]) return i;
    455         }
    456         if (dst.length > src.length) return src.length;
    457         return ARRAYS_ARE_EQUAL;
    458     }
    459 
    460     /**
    461      * Helper class that compares and sorts two PtNodes according to their
    462      * first element only. I repeat: ONLY the first element is considered, the rest
    463      * is ignored.
    464      * This comparator imposes orderings that are inconsistent with equals.
    465      */
    466     static final class PtNodeComparator implements java.util.Comparator<PtNode> {
    467         @Override
    468         public int compare(PtNode p1, PtNode p2) {
    469             if (p1.mChars[0] == p2.mChars[0]) return 0;
    470             return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
    471         }
    472     }
    473     final static PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
    474 
    475     /**
    476      * Finds the insertion index of a character within a node array.
    477      */
    478     private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
    479         final ArrayList<PtNode> data = nodeArray.mData;
    480         final PtNode reference = new PtNode(new int[] { character },
    481                 null /* bigrams */, null /* probabilityInfo */,
    482                 false /* isNotAWord */, false /* isPossiblyOffensive */);
    483         int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
    484         return result >= 0 ? result : -result - 1;
    485     }
    486 
    487     /**
    488      * Find the index of a char in a node array, if it exists.
    489      *
    490      * @param nodeArray the node array to search in.
    491      * @param character the character to search for.
    492      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else.
    493      */
    494     private static int findIndexOfChar(final PtNodeArray nodeArray, int character) {
    495         final int insertionIndex = findInsertionIndex(nodeArray, character);
    496         if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX;
    497         return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex
    498                 : CHARACTER_NOT_FOUND_INDEX;
    499     }
    500 
    501     /**
    502      * Helper method to find a word in a given branch.
    503      */
    504     public static PtNode findWordInTree(final PtNodeArray rootNodeArray, final String string) {
    505         PtNodeArray nodeArray = rootNodeArray;
    506         int index = 0;
    507         final StringBuilder checker = DBG ? new StringBuilder() : null;
    508         final int[] codePoints = getCodePoints(string);
    509 
    510         PtNode currentPtNode;
    511         do {
    512             int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
    513             if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
    514             currentPtNode = nodeArray.mData.get(indexOfGroup);
    515 
    516             if (codePoints.length - index < currentPtNode.mChars.length) return null;
    517             int newIndex = index;
    518             while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
    519                 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
    520                 newIndex++;
    521             }
    522             index = newIndex;
    523 
    524             if (DBG) {
    525                 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
    526             }
    527             if (index < codePoints.length) {
    528                 nodeArray = currentPtNode.mChildren;
    529             }
    530         } while (null != nodeArray && index < codePoints.length);
    531 
    532         if (index < codePoints.length) return null;
    533         if (!currentPtNode.isTerminal()) return null;
    534         if (DBG && !string.equals(checker.toString())) return null;
    535         return currentPtNode;
    536     }
    537 
    538     /**
    539      * Helper method to find out whether a word is in the dict or not.
    540      */
    541     public boolean hasWord(final String s) {
    542         if (null == s || "".equals(s)) {
    543             throw new RuntimeException("Can't search for a null or empty string");
    544         }
    545         return null != findWordInTree(mRootNodeArray, s);
    546     }
    547 
    548     /**
    549      * Recursively count the number of PtNodes in a given branch of the trie.
    550      *
    551      * @param nodeArray the parent node.
    552      * @return the number of PtNodes in all the branch under this node.
    553      */
    554     public static int countPtNodes(final PtNodeArray nodeArray) {
    555         final int nodeSize = nodeArray.mData.size();
    556         int size = nodeSize;
    557         for (int i = nodeSize - 1; i >= 0; --i) {
    558             PtNode ptNode = nodeArray.mData.get(i);
    559             if (null != ptNode.mChildren)
    560                 size += countPtNodes(ptNode.mChildren);
    561         }
    562         return size;
    563     }
    564 
    565     /**
    566      * Iterator to walk through a dictionary.
    567      *
    568      * This is purely for convenience.
    569      */
    570     public static final class DictionaryIterator implements Iterator<WordProperty> {
    571         private static final class Position {
    572             public Iterator<PtNode> pos;
    573             public int length;
    574             public Position(ArrayList<PtNode> ptNodes) {
    575                 pos = ptNodes.iterator();
    576                 length = 0;
    577             }
    578         }
    579         final StringBuilder mCurrentString;
    580         final LinkedList<Position> mPositions;
    581 
    582         public DictionaryIterator(ArrayList<PtNode> ptRoot) {
    583             mCurrentString = new StringBuilder();
    584             mPositions = new LinkedList<>();
    585             final Position rootPos = new Position(ptRoot);
    586             mPositions.add(rootPos);
    587         }
    588 
    589         @Override
    590         public boolean hasNext() {
    591             for (Position p : mPositions) {
    592                 if (p.pos.hasNext()) {
    593                     return true;
    594                 }
    595             }
    596             return false;
    597         }
    598 
    599         @Override
    600         public WordProperty next() {
    601             Position currentPos = mPositions.getLast();
    602             mCurrentString.setLength(currentPos.length);
    603 
    604             do {
    605                 if (currentPos.pos.hasNext()) {
    606                     final PtNode currentPtNode = currentPos.pos.next();
    607                     currentPos.length = mCurrentString.length();
    608                     for (int i : currentPtNode.mChars) {
    609                         mCurrentString.append(Character.toChars(i));
    610                     }
    611                     if (null != currentPtNode.mChildren) {
    612                         currentPos = new Position(currentPtNode.mChildren.mData);
    613                         currentPos.length = mCurrentString.length();
    614                         mPositions.addLast(currentPos);
    615                     }
    616                     if (currentPtNode.isTerminal()) {
    617                         return new WordProperty(mCurrentString.toString(),
    618                                 currentPtNode.mProbabilityInfo, currentPtNode.mBigrams,
    619                                 currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive);
    620                     }
    621                 } else {
    622                     mPositions.removeLast();
    623                     currentPos = mPositions.getLast();
    624                     mCurrentString.setLength(mPositions.getLast().length);
    625                 }
    626             } while (true);
    627         }
    628 
    629         @Override
    630         public void remove() {
    631             throw new UnsupportedOperationException("Unsupported yet");
    632         }
    633 
    634     }
    635 
    636     /**
    637      * Method to return an iterator.
    638      *
    639      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
    640      * and say : for (Word w : x) {}
    641      */
    642     @Override
    643     public Iterator<WordProperty> iterator() {
    644         return new DictionaryIterator(mRootNodeArray.mData);
    645     }
    646 }
    647