Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import java.util.ArrayList;
     20 import java.util.Arrays;
     21 import java.util.Collections;
     22 import java.util.HashMap;
     23 import java.util.Iterator;
     24 import java.util.LinkedList;
     25 
     26 /**
     27  * A dictionary that can fusion heads and tails of words for more compression.
     28  */
     29 public class FusionDictionary implements Iterable<Word> {
     30 
     31     private static final boolean DBG = MakedictLog.DBG;
     32 
     33     /**
     34      * A node of the dictionary, containing several CharGroups.
     35      *
     36      * A node is but an ordered array of CharGroups, which essentially contain all the
     37      * real information.
     38      * This class also contains fields to cache size and address, to help with binary
     39      * generation.
     40      */
     41     public static class Node {
     42         ArrayList<CharGroup> mData;
     43         // To help with binary generation
     44         int mCachedSize;
     45         int mCachedAddress;
     46         public Node() {
     47             mData = new ArrayList<CharGroup>();
     48             mCachedSize = Integer.MIN_VALUE;
     49             mCachedAddress = Integer.MIN_VALUE;
     50         }
     51         public Node(ArrayList<CharGroup> data) {
     52             mData = data;
     53             mCachedSize = Integer.MIN_VALUE;
     54             mCachedAddress = Integer.MIN_VALUE;
     55         }
     56     }
     57 
     58     /**
     59      * A string with a frequency.
     60      *
     61      * This represents an "attribute", that is either a bigram or a shortcut.
     62      */
     63     public static class WeightedString {
     64         final String mWord;
     65         int mFrequency;
     66         public WeightedString(String word, int frequency) {
     67             mWord = word;
     68             mFrequency = frequency;
     69         }
     70 
     71         @Override
     72         public int hashCode() {
     73             return Arrays.hashCode(new Object[] { mWord, mFrequency });
     74         }
     75 
     76         @Override
     77         public boolean equals(Object o) {
     78             if (o == this) return true;
     79             if (!(o instanceof WeightedString)) return false;
     80             WeightedString w = (WeightedString)o;
     81             return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
     82         }
     83     }
     84 
     85     /**
     86      * A group of characters, with a frequency, shortcut targets, bigrams, and children.
     87      *
     88      * This is the central class of the in-memory representation. A CharGroup is what can
     89      * be seen as a traditional "trie node", except it can hold several characters at the
     90      * same time. A CharGroup essentially represents one or several characters in the middle
     91      * of the trie trie; as such, it can be a terminal, and it can have children.
     92      * In this in-memory representation, whether the CharGroup is a terminal or not is represented
     93      * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
     94      * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
     95      * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
     96      */
     97     public static class CharGroup {
     98         public static final int NOT_A_TERMINAL = -1;
     99         final int mChars[];
    100         ArrayList<WeightedString> mShortcutTargets;
    101         ArrayList<WeightedString> mBigrams;
    102         int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
    103         Node mChildren;
    104         // The two following members to help with binary generation
    105         int mCachedSize;
    106         int mCachedAddress;
    107 
    108         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    109                 final ArrayList<WeightedString> bigrams, final int frequency) {
    110             mChars = chars;
    111             mFrequency = frequency;
    112             mShortcutTargets = shortcutTargets;
    113             mBigrams = bigrams;
    114             mChildren = null;
    115         }
    116 
    117         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    118                 final ArrayList<WeightedString> bigrams, final int frequency, final Node children) {
    119             mChars = chars;
    120             mFrequency = frequency;
    121             mShortcutTargets = shortcutTargets;
    122             mBigrams = bigrams;
    123             mChildren = children;
    124         }
    125 
    126         public void addChild(CharGroup n) {
    127             if (null == mChildren) {
    128                 mChildren = new Node();
    129             }
    130             mChildren.mData.add(n);
    131         }
    132 
    133         public boolean isTerminal() {
    134             return NOT_A_TERMINAL != mFrequency;
    135         }
    136 
    137         public boolean hasSeveralChars() {
    138             assert(mChars.length > 0);
    139             return 1 < mChars.length;
    140         }
    141 
    142         /**
    143          * Adds a word to the bigram list. Updates the frequency if the word already
    144          * exists.
    145          */
    146         public void addBigram(final String word, final int frequency) {
    147             if (mBigrams == null) {
    148                 mBigrams = new ArrayList<WeightedString>();
    149             }
    150             WeightedString bigram = getBigram(word);
    151             if (bigram != null) {
    152                 bigram.mFrequency = frequency;
    153             } else {
    154                 bigram = new WeightedString(word, frequency);
    155                 mBigrams.add(bigram);
    156             }
    157         }
    158 
    159         /**
    160          * Gets the shortcut target for the given word. Returns null if the word is not in the
    161          * shortcut list.
    162          */
    163         public WeightedString getShortcut(final String word) {
    164             // TODO: Don't do a linear search
    165             if (mShortcutTargets != null) {
    166                 final int size = mShortcutTargets.size();
    167                 for (int i = 0; i < size; ++i) {
    168                     WeightedString shortcut = mShortcutTargets.get(i);
    169                     if (shortcut.mWord.equals(word)) {
    170                         return shortcut;
    171                     }
    172                 }
    173             }
    174             return null;
    175         }
    176 
    177         /**
    178          * Gets the bigram for the given word.
    179          * Returns null if the word is not in the bigrams list.
    180          */
    181         public WeightedString getBigram(final String word) {
    182             // TODO: Don't do a linear search
    183             if (mBigrams != null) {
    184                 final int size = mBigrams.size();
    185                 for (int i = 0; i < size; ++i) {
    186                     WeightedString bigram = mBigrams.get(i);
    187                     if (bigram.mWord.equals(word)) {
    188                         return bigram;
    189                     }
    190                 }
    191             }
    192             return null;
    193         }
    194 
    195         /**
    196          * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to
    197          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
    198          * updated if they are higher than the existing ones.
    199          */
    200         public void update(int frequency, ArrayList<WeightedString> shortcutTargets,
    201                 ArrayList<WeightedString> bigrams) {
    202             if (frequency > mFrequency) {
    203                 mFrequency = frequency;
    204             }
    205             if (shortcutTargets != null) {
    206                 if (mShortcutTargets == null) {
    207                     mShortcutTargets = shortcutTargets;
    208                 } else {
    209                     final int size = shortcutTargets.size();
    210                     for (int i = 0; i < size; ++i) {
    211                         final WeightedString shortcut = shortcutTargets.get(i);
    212                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
    213                         if (existingShortcut == null) {
    214                             mShortcutTargets.add(shortcut);
    215                         } else if (existingShortcut.mFrequency < shortcut.mFrequency) {
    216                             existingShortcut.mFrequency = shortcut.mFrequency;
    217                         }
    218                     }
    219                 }
    220             }
    221             if (bigrams != null) {
    222                 if (mBigrams == null) {
    223                     mBigrams = bigrams;
    224                 } else {
    225                     final int size = bigrams.size();
    226                     for (int i = 0; i < size; ++i) {
    227                         final WeightedString bigram = bigrams.get(i);
    228                         final WeightedString existingBigram = getBigram(bigram.mWord);
    229                         if (existingBigram == null) {
    230                             mBigrams.add(bigram);
    231                         } else if (existingBigram.mFrequency < bigram.mFrequency) {
    232                             existingBigram.mFrequency = bigram.mFrequency;
    233                         }
    234                     }
    235                 }
    236             }
    237         }
    238     }
    239 
    240     /**
    241      * Options global to the dictionary.
    242      *
    243      * There are no options at the moment, so this class is empty.
    244      */
    245     public static class DictionaryOptions {
    246         public final boolean mGermanUmlautProcessing;
    247         public final boolean mFrenchLigatureProcessing;
    248         public final HashMap<String, String> mAttributes;
    249         public DictionaryOptions(final HashMap<String, String> attributes,
    250                 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
    251             mAttributes = attributes;
    252             mGermanUmlautProcessing = germanUmlautProcessing;
    253             mFrenchLigatureProcessing = frenchLigatureProcessing;
    254         }
    255     }
    256 
    257     public final DictionaryOptions mOptions;
    258     public final Node mRoot;
    259 
    260     public FusionDictionary(final Node root, final DictionaryOptions options) {
    261         mRoot = root;
    262         mOptions = options;
    263     }
    264 
    265     public void addOptionAttribute(final String key, final String value) {
    266         mOptions.mAttributes.put(key, value);
    267     }
    268 
    269     /**
    270      * Helper method to convert a String to an int array.
    271      */
    272     static private int[] getCodePoints(final String word) {
    273         // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray,
    274         // which is not visible from the makedict package. Factor this code.
    275         final char[] characters = word.toCharArray();
    276         final int length = characters.length;
    277         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
    278         int codePoint = Character.codePointAt(characters, 0);
    279         int dsti = 0;
    280         for (int srci = Character.charCount(codePoint);
    281                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
    282             codePoints[dsti] = codePoint;
    283             codePoint = Character.codePointAt(characters, srci);
    284         }
    285         codePoints[dsti] = codePoint;
    286         return codePoints;
    287     }
    288 
    289     /**
    290      * Helper method to add a word as a string.
    291      *
    292      * This method adds a word to the dictionary with the given frequency. Optional
    293      * lists of bigrams and shortcuts can be passed here. For each word inside,
    294      * they will be added to the dictionary as necessary.
    295      *
    296      * @param word the word to add.
    297      * @param frequency the frequency of the word, in the range [0..255].
    298      * @param shortcutTargets a list of shortcut targets for this word, or null.
    299      */
    300     public void add(final String word, final int frequency,
    301             final ArrayList<WeightedString> shortcutTargets) {
    302         add(getCodePoints(word), frequency, shortcutTargets);
    303     }
    304 
    305     /**
    306      * Sanity check for a node.
    307      *
    308      * This method checks that all CharGroups in a node are ordered as expected.
    309      * If they are, nothing happens. If they aren't, an exception is thrown.
    310      */
    311     private void checkStack(Node node) {
    312         ArrayList<CharGroup> stack = node.mData;
    313         int lastValue = -1;
    314         for (int i = 0; i < stack.size(); ++i) {
    315             int currentValue = stack.get(i).mChars[0];
    316             if (currentValue <= lastValue)
    317                 throw new RuntimeException("Invalid stack");
    318             else
    319                 lastValue = currentValue;
    320         }
    321     }
    322 
    323     /**
    324      * Helper method to add a new bigram to the dictionary.
    325      *
    326      * @param word1 the previous word of the context
    327      * @param word2 the next word of the context
    328      * @param frequency the bigram frequency
    329      */
    330     public void setBigram(final String word1, final String word2, final int frequency) {
    331         CharGroup charGroup = findWordInTree(mRoot, word1);
    332         if (charGroup != null) {
    333             final CharGroup charGroup2 = findWordInTree(mRoot, word2);
    334             if (charGroup2 == null) {
    335                 add(getCodePoints(word2), 0, null);
    336             }
    337             charGroup.addBigram(word2, frequency);
    338         } else {
    339             throw new RuntimeException("First word of bigram not found");
    340         }
    341     }
    342 
    343     /**
    344      * Add a word to this dictionary.
    345      *
    346      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
    347      * an exception is thrown.
    348      *
    349      * @param word the word, as an int array.
    350      * @param frequency the frequency of the word, in the range [0..255].
    351      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
    352      */
    353     private void add(final int[] word, final int frequency,
    354             final ArrayList<WeightedString> shortcutTargets) {
    355         assert(frequency >= 0 && frequency <= 255);
    356         Node currentNode = mRoot;
    357         int charIndex = 0;
    358 
    359         CharGroup currentGroup = null;
    360         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
    361         int nodeIndex = findIndexOfChar(mRoot, word[charIndex]);
    362         while (CHARACTER_NOT_FOUND != nodeIndex) {
    363             currentGroup = currentNode.mData.get(nodeIndex);
    364             differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex);
    365             if (ARRAYS_ARE_EQUAL != differentCharIndex
    366                     && differentCharIndex < currentGroup.mChars.length) break;
    367             if (null == currentGroup.mChildren) break;
    368             charIndex += currentGroup.mChars.length;
    369             if (charIndex >= word.length) break;
    370             currentNode = currentGroup.mChildren;
    371             nodeIndex = findIndexOfChar(currentNode, word[charIndex]);
    372         }
    373 
    374         if (-1 == nodeIndex) {
    375             // No node at this point to accept the word. Create one.
    376             final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
    377             final CharGroup newGroup = new CharGroup(
    378                     Arrays.copyOfRange(word, charIndex, word.length),
    379                     shortcutTargets, null /* bigrams */, frequency);
    380             currentNode.mData.add(insertionIndex, newGroup);
    381             if (DBG) checkStack(currentNode);
    382         } else {
    383             // There is a word with a common prefix.
    384             if (differentCharIndex == currentGroup.mChars.length) {
    385                 if (charIndex + differentCharIndex >= word.length) {
    386                     // The new word is a prefix of an existing word, but the node on which it
    387                     // should end already exists as is. Since the old CharNode was not a terminal,
    388                     // make it one by filling in its frequency and other attributes
    389                     currentGroup.update(frequency, shortcutTargets, null);
    390                 } else {
    391                     // The new word matches the full old word and extends past it.
    392                     // We only have to create a new node and add it to the end of this.
    393                     final CharGroup newNode = new CharGroup(
    394                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
    395                                     shortcutTargets, null /* bigrams */, frequency);
    396                     currentGroup.mChildren = new Node();
    397                     currentGroup.mChildren.mData.add(newNode);
    398                 }
    399             } else {
    400                 if (0 == differentCharIndex) {
    401                     // Exact same word. Update the frequency if higher. This will also add the
    402                     // new shortcuts to the existing shortcut list if it already exists.
    403                     currentGroup.update(frequency, shortcutTargets, null);
    404                 } else {
    405                     // Partial prefix match only. We have to replace the current node with a node
    406                     // containing the current prefix and create two new ones for the tails.
    407                     Node newChildren = new Node();
    408                     final CharGroup newOldWord = new CharGroup(
    409                             Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
    410                                     currentGroup.mChars.length), currentGroup.mShortcutTargets,
    411                             currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren);
    412                     newChildren.mData.add(newOldWord);
    413 
    414                     final CharGroup newParent;
    415                     if (charIndex + differentCharIndex >= word.length) {
    416                         newParent = new CharGroup(
    417                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
    418                                 shortcutTargets, null /* bigrams */, frequency, newChildren);
    419                     } else {
    420                         newParent = new CharGroup(
    421                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
    422                                 null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
    423                         final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
    424                                 charIndex + differentCharIndex, word.length),
    425                                 shortcutTargets, null /* bigrams */, frequency);
    426                         final int addIndex = word[charIndex + differentCharIndex]
    427                                 > currentGroup.mChars[differentCharIndex] ? 1 : 0;
    428                         newChildren.mData.add(addIndex, newWord);
    429                     }
    430                     currentNode.mData.set(nodeIndex, newParent);
    431                 }
    432                 if (DBG) checkStack(currentNode);
    433             }
    434         }
    435     }
    436 
    437     private static int ARRAYS_ARE_EQUAL = 0;
    438 
    439     /**
    440      * Custom comparison of two int arrays taken to contain character codes.
    441      *
    442      * This method compares the two arrays passed as an argument in a lexicographic way,
    443      * with an offset in the dst string.
    444      * This method does NOT test for the first character. It is taken to be equal.
    445      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
    446      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
    447      * strings are equal. This works BECAUSE we don't look at the first character.
    448      *
    449      * @param src the left-hand side string of the comparison.
    450      * @param dst the right-hand side string of the comparison.
    451      * @param dstOffset the offset in the right-hand side string.
    452      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
    453      */
    454     private static int compareArrays(final int[] src, final int[] dst, int dstOffset) {
    455         // We do NOT test the first char, because we come from a method that already
    456         // tested it.
    457         for (int i = 1; i < src.length; ++i) {
    458             if (dstOffset + i >= dst.length) return i;
    459             if (src[i] != dst[dstOffset + i]) return i;
    460         }
    461         if (dst.length > src.length) return src.length;
    462         return ARRAYS_ARE_EQUAL;
    463     }
    464 
    465     /**
    466      * Helper class that compares and sorts two chargroups according to their
    467      * first element only. I repeat: ONLY the first element is considered, the rest
    468      * is ignored.
    469      * This comparator imposes orderings that are inconsistent with equals.
    470      */
    471     static private class CharGroupComparator implements java.util.Comparator<CharGroup> {
    472         @Override
    473         public int compare(CharGroup c1, CharGroup c2) {
    474             if (c1.mChars[0] == c2.mChars[0]) return 0;
    475             return c1.mChars[0] < c2.mChars[0] ? -1 : 1;
    476         }
    477     }
    478     final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator();
    479 
    480     /**
    481      * Finds the insertion index of a character within a node.
    482      */
    483     private static int findInsertionIndex(final Node node, int character) {
    484         final ArrayList<CharGroup> data = node.mData;
    485         final CharGroup reference = new CharGroup(new int[] { character },
    486                 null /* shortcutTargets */, null /* bigrams */, 0);
    487         int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
    488         return result >= 0 ? result : -result - 1;
    489     }
    490 
    491     private static int CHARACTER_NOT_FOUND = -1;
    492 
    493     /**
    494      * Find the index of a char in a node, if it exists.
    495      *
    496      * @param node the node to search in.
    497      * @param character the character to search for.
    498      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else.
    499      */
    500     private static int findIndexOfChar(final Node node, int character) {
    501         final int insertionIndex = findInsertionIndex(node, character);
    502         if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND;
    503         return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex
    504                 : CHARACTER_NOT_FOUND;
    505     }
    506 
    507     /**
    508      * Helper method to find a word in a given branch.
    509      */
    510     public static CharGroup findWordInTree(Node node, final String s) {
    511         int index = 0;
    512         final StringBuilder checker = DBG ? new StringBuilder() : null;
    513 
    514         CharGroup currentGroup;
    515         do {
    516             int indexOfGroup = findIndexOfChar(node, s.codePointAt(index));
    517             if (CHARACTER_NOT_FOUND == indexOfGroup) return null;
    518             currentGroup = node.mData.get(indexOfGroup);
    519             if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length));
    520             index += currentGroup.mChars.length;
    521             if (index < s.length()) {
    522                 node = currentGroup.mChildren;
    523             }
    524         } while (null != node && index < s.length());
    525 
    526         if (DBG && !s.equals(checker.toString())) return null;
    527         return currentGroup;
    528     }
    529 
    530     /**
    531      * Helper method to find out whether a word is in the dict or not.
    532      */
    533     public boolean hasWord(final String s) {
    534         if (null == s || "".equals(s)) {
    535             throw new RuntimeException("Can't search for a null or empty string");
    536         }
    537         return null != findWordInTree(mRoot, s);
    538     }
    539 
    540     /**
    541      * Recursively count the number of character groups in a given branch of the trie.
    542      *
    543      * @param node the parent node.
    544      * @return the number of char groups in all the branch under this node.
    545      */
    546     public static int countCharGroups(final Node node) {
    547         final int nodeSize = node.mData.size();
    548         int size = nodeSize;
    549         for (int i = nodeSize - 1; i >= 0; --i) {
    550             CharGroup group = node.mData.get(i);
    551             if (null != group.mChildren)
    552                 size += countCharGroups(group.mChildren);
    553         }
    554         return size;
    555     }
    556 
    557     /**
    558      * Recursively count the number of nodes in a given branch of the trie.
    559      *
    560      * @param node the node to count.
    561      * @return the number of nodes in this branch.
    562      */
    563     public static int countNodes(final Node node) {
    564         int size = 1;
    565         for (int i = node.mData.size() - 1; i >= 0; --i) {
    566             CharGroup group = node.mData.get(i);
    567             if (null != group.mChildren)
    568                 size += countNodes(group.mChildren);
    569         }
    570         return size;
    571     }
    572 
    573     // Recursively find out whether there are any bigrams.
    574     // This can be pretty expensive especially if there aren't any (we return as soon
    575     // as we find one, so it's much cheaper if there are bigrams)
    576     private static boolean hasBigramsInternal(final Node node) {
    577         if (null == node) return false;
    578         for (int i = node.mData.size() - 1; i >= 0; --i) {
    579             CharGroup group = node.mData.get(i);
    580             if (null != group.mBigrams) return true;
    581             if (hasBigramsInternal(group.mChildren)) return true;
    582         }
    583         return false;
    584     }
    585 
    586     /**
    587      * Finds out whether there are any bigrams in this dictionary.
    588      *
    589      * @return true if there is any bigram, false otherwise.
    590      */
    591     // TODO: this is expensive especially for large dictionaries without any bigram.
    592     // The up side is, this is always accurate and correct and uses no memory. We should
    593     // find a more efficient way of doing this, without compromising too much on memory
    594     // and ease of use.
    595     public boolean hasBigrams() {
    596         return hasBigramsInternal(mRoot);
    597     }
    598 
    599     // Historically, the tails of the words were going to be merged to save space.
    600     // However, that would prevent the code to search for a specific address in log(n)
    601     // time so this was abandoned.
    602     // The code is still of interest as it does add some compression to any dictionary
    603     // that has no need for attributes. Implementations that does not read attributes should be
    604     // able to read a dictionary with merged tails.
    605     // Also, the following code does support frequencies, as in, it will only merges
    606     // tails that share the same frequency. Though it would result in the above loss of
    607     // performance while searching by address, it is still technically possible to merge
    608     // tails that contain attributes, but this code does not take that into account - it does
    609     // not compare attributes and will merge terminals with different attributes regardless.
    610     public void mergeTails() {
    611         MakedictLog.i("Do not merge tails");
    612         return;
    613 
    614 //        MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root));
    615 //        MakedictLog.i("Number of groups : " + countCharGroups(root));
    616 //
    617 //        final HashMap<String, ArrayList<Node>> repository =
    618 //                  new HashMap<String, ArrayList<Node>>();
    619 //        mergeTailsInner(repository, root);
    620 //
    621 //        MakedictLog.i("Number of different pseudohashes : " + repository.size());
    622 //        int size = 0;
    623 //        for (ArrayList<Node> a : repository.values()) {
    624 //            size += a.size();
    625 //        }
    626 //        MakedictLog.i("Number of nodes after merge : " + (1 + size));
    627 //        MakedictLog.i("Recursively seen nodes : " + countNodes(root));
    628     }
    629 
    630     // The following methods are used by the deactivated mergeTails()
    631 //   private static boolean isEqual(Node a, Node b) {
    632 //       if (null == a && null == b) return true;
    633 //       if (null == a || null == b) return false;
    634 //       if (a.data.size() != b.data.size()) return false;
    635 //       final int size = a.data.size();
    636 //       for (int i = size - 1; i >= 0; --i) {
    637 //           CharGroup aGroup = a.data.get(i);
    638 //           CharGroup bGroup = b.data.get(i);
    639 //           if (aGroup.frequency != bGroup.frequency) return false;
    640 //           if (aGroup.alternates == null && bGroup.alternates != null) return false;
    641 //           if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false;
    642 //           if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false;
    643 //           if (!isEqual(aGroup.children, bGroup.children)) return false;
    644 //       }
    645 //       return true;
    646 //   }
    647 
    648 //   static private HashMap<String, ArrayList<Node>> mergeTailsInner(
    649 //           final HashMap<String, ArrayList<Node>> map, final Node node) {
    650 //       final ArrayList<CharGroup> branches = node.data;
    651 //       final int nodeSize = branches.size();
    652 //       for (int i = 0; i < nodeSize; ++i) {
    653 //           CharGroup group = branches.get(i);
    654 //           if (null != group.children) {
    655 //               String pseudoHash = getPseudoHash(group.children);
    656 //               ArrayList<Node> similarList = map.get(pseudoHash);
    657 //               if (null == similarList) {
    658 //                   similarList = new ArrayList<Node>();
    659 //                   map.put(pseudoHash, similarList);
    660 //               }
    661 //               boolean merged = false;
    662 //               for (Node similar : similarList) {
    663 //                   if (isEqual(group.children, similar)) {
    664 //                       group.children = similar;
    665 //                       merged = true;
    666 //                       break;
    667 //                   }
    668 //               }
    669 //               if (!merged) {
    670 //                   similarList.add(group.children);
    671 //               }
    672 //               mergeTailsInner(map, group.children);
    673 //           }
    674 //       }
    675 //       return map;
    676 //   }
    677 
    678 //  private static String getPseudoHash(final Node node) {
    679 //      StringBuilder s = new StringBuilder();
    680 //      for (CharGroup g : node.data) {
    681 //          s.append(g.frequency);
    682 //          for (int ch : g.chars){
    683 //              s.append(Character.toChars(ch));
    684 //          }
    685 //      }
    686 //      return s.toString();
    687 //  }
    688 
    689     /**
    690      * Iterator to walk through a dictionary.
    691      *
    692      * This is purely for convenience.
    693      */
    694     public static class DictionaryIterator implements Iterator<Word> {
    695 
    696         private static class Position {
    697             public Iterator<CharGroup> pos;
    698             public int length;
    699             public Position(ArrayList<CharGroup> groups) {
    700                 pos = groups.iterator();
    701                 length = 0;
    702             }
    703         }
    704         final StringBuilder mCurrentString;
    705         final LinkedList<Position> mPositions;
    706 
    707         public DictionaryIterator(ArrayList<CharGroup> root) {
    708             mCurrentString = new StringBuilder();
    709             mPositions = new LinkedList<Position>();
    710             final Position rootPos = new Position(root);
    711             mPositions.add(rootPos);
    712         }
    713 
    714         @Override
    715         public boolean hasNext() {
    716             for (Position p : mPositions) {
    717                 if (p.pos.hasNext()) {
    718                     return true;
    719                 }
    720             }
    721             return false;
    722         }
    723 
    724         @Override
    725         public Word next() {
    726             Position currentPos = mPositions.getLast();
    727             mCurrentString.setLength(mCurrentString.length() - currentPos.length);
    728 
    729             do {
    730                 if (currentPos.pos.hasNext()) {
    731                     final CharGroup currentGroup = currentPos.pos.next();
    732                     currentPos.length = currentGroup.mChars.length;
    733                     for (int i : currentGroup.mChars)
    734                         mCurrentString.append(Character.toChars(i));
    735                     if (null != currentGroup.mChildren) {
    736                         currentPos = new Position(currentGroup.mChildren.mData);
    737                         mPositions.addLast(currentPos);
    738                     }
    739                     if (currentGroup.mFrequency >= 0)
    740                         return new Word(mCurrentString.toString(), currentGroup.mFrequency,
    741                                 currentGroup.mShortcutTargets, currentGroup.mBigrams);
    742                 } else {
    743                     mPositions.removeLast();
    744                     currentPos = mPositions.getLast();
    745                     mCurrentString.setLength(mCurrentString.length() - mPositions.getLast().length);
    746                 }
    747             } while(true);
    748         }
    749 
    750         @Override
    751         public void remove() {
    752             throw new UnsupportedOperationException("Unsupported yet");
    753         }
    754 
    755     }
    756 
    757     /**
    758      * Method to return an iterator.
    759      *
    760      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
    761      * and say : for (Word w : x) {}
    762      */
    763     @Override
    764     public Iterator<Word> iterator() {
    765         return new DictionaryIterator(mRoot.mData);
    766     }
    767 }
    768