Home | History | Annotate | Download | only in makedict
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.makedict;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 import com.android.inputmethod.latin.Constants;
     21 
     22 import java.util.ArrayList;
     23 import java.util.Arrays;
     24 import java.util.Collections;
     25 import java.util.Date;
     26 import java.util.HashMap;
     27 import java.util.Iterator;
     28 import java.util.LinkedList;
     29 
     30 /**
     31  * A dictionary that can fusion heads and tails of words for more compression.
     32  */
     33 @UsedForTesting
     34 public final class FusionDictionary implements Iterable<Word> {
     35     private static final boolean DBG = MakedictLog.DBG;
     36 
     37     private static int CHARACTER_NOT_FOUND_INDEX = -1;
     38 
     39     /**
     40      * A node array of the dictionary, containing several PtNodes.
     41      *
     42      * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
     43      * real information.
     44      * This class also contains fields to cache size and address, to help with binary
     45      * generation.
     46      */
     47     public static final class PtNodeArray {
     48         ArrayList<PtNode> mData;
     49         // To help with binary generation
     50         int mCachedSize = Integer.MIN_VALUE;
     51         // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
     52         // always hold the same value except between dictionary address compression, during which
     53         // the update process needs to know about both values at the same time. Updating will
     54         // update the AfterUpdate value, and the code will move them to BeforeUpdate before
     55         // the next update pass.
     56         int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
     57         int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
     58         int mCachedParentAddress = 0;
     59 
     60         public PtNodeArray() {
     61             mData = new ArrayList<PtNode>();
     62         }
     63         public PtNodeArray(ArrayList<PtNode> data) {
     64             mData = data;
     65         }
     66     }
     67 
     68     /**
     69      * A string with a frequency.
     70      *
     71      * This represents an "attribute", that is either a bigram or a shortcut.
     72      */
     73     public static final class WeightedString {
     74         public final String mWord;
     75         public int mFrequency;
     76         public WeightedString(String word, int frequency) {
     77             mWord = word;
     78             mFrequency = frequency;
     79         }
     80 
     81         @Override
     82         public int hashCode() {
     83             return Arrays.hashCode(new Object[] { mWord, mFrequency });
     84         }
     85 
     86         @Override
     87         public boolean equals(Object o) {
     88             if (o == this) return true;
     89             if (!(o instanceof WeightedString)) return false;
     90             WeightedString w = (WeightedString)o;
     91             return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
     92         }
     93     }
     94 
     95     /**
     96      * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children
     97      * (Pt means Patricia Trie).
     98      *
     99      * This is the central class of the in-memory representation. A PtNode is what can
    100      * be seen as a traditional "trie node", except it can hold several characters at the
    101      * same time. A PtNode essentially represents one or several characters in the middle
    102      * of the trie tree; as such, it can be a terminal, and it can have children.
    103      * In this in-memory representation, whether the PtNode is a terminal or not is represented
    104      * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
    105      * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
    106      * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
    107      */
    108     public static final class PtNode {
    109         public static final int NOT_A_TERMINAL = -1;
    110         final int mChars[];
    111         ArrayList<WeightedString> mShortcutTargets;
    112         ArrayList<WeightedString> mBigrams;
    113         int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
    114         int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
    115         PtNodeArray mChildren;
    116         boolean mIsNotAWord; // Only a shortcut
    117         boolean mIsBlacklistEntry;
    118         // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
    119         // generation. Before and After always hold the same value except during dictionary
    120         // address compression, where the update process needs to know about both values at the
    121         // same time. Updating will update the AfterUpdate value, and the code will move them
    122         // to BeforeUpdate before the next update pass.
    123         // The update process does not need two versions of mCachedSize.
    124         int mCachedSize; // The size, in bytes, of this PtNode.
    125         int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
    126         int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
    127 
    128         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    129                 final ArrayList<WeightedString> bigrams, final int frequency,
    130                 final boolean isNotAWord, final boolean isBlacklistEntry) {
    131             mChars = chars;
    132             mFrequency = frequency;
    133             mTerminalId = frequency;
    134             mShortcutTargets = shortcutTargets;
    135             mBigrams = bigrams;
    136             mChildren = null;
    137             mIsNotAWord = isNotAWord;
    138             mIsBlacklistEntry = isBlacklistEntry;
    139         }
    140 
    141         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
    142                 final ArrayList<WeightedString> bigrams, final int frequency,
    143                 final boolean isNotAWord, final boolean isBlacklistEntry,
    144                 final PtNodeArray children) {
    145             mChars = chars;
    146             mFrequency = frequency;
    147             mShortcutTargets = shortcutTargets;
    148             mBigrams = bigrams;
    149             mChildren = children;
    150             mIsNotAWord = isNotAWord;
    151             mIsBlacklistEntry = isBlacklistEntry;
    152         }
    153 
    154         public void addChild(PtNode n) {
    155             if (null == mChildren) {
    156                 mChildren = new PtNodeArray();
    157             }
    158             mChildren.mData.add(n);
    159         }
    160 
    161         public int getTerminalId() {
    162             return mTerminalId;
    163         }
    164 
    165         public boolean isTerminal() {
    166             return NOT_A_TERMINAL != mFrequency;
    167         }
    168 
    169         public int getFrequency() {
    170             return mFrequency;
    171         }
    172 
    173         public boolean getIsNotAWord() {
    174             return mIsNotAWord;
    175         }
    176 
    177         public boolean getIsBlacklistEntry() {
    178             return mIsBlacklistEntry;
    179         }
    180 
    181         public ArrayList<WeightedString> getShortcutTargets() {
    182             // We don't want write permission to escape outside the package, so we return a copy
    183             if (null == mShortcutTargets) return null;
    184             final ArrayList<WeightedString> copyOfShortcutTargets =
    185                     new ArrayList<WeightedString>(mShortcutTargets);
    186             return copyOfShortcutTargets;
    187         }
    188 
    189         public ArrayList<WeightedString> getBigrams() {
    190             // We don't want write permission to escape outside the package, so we return a copy
    191             if (null == mBigrams) return null;
    192             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams);
    193             return copyOfBigrams;
    194         }
    195 
    196         public boolean hasSeveralChars() {
    197             assert(mChars.length > 0);
    198             return 1 < mChars.length;
    199         }
    200 
    201         /**
    202          * Adds a word to the bigram list. Updates the frequency if the word already
    203          * exists.
    204          */
    205         public void addBigram(final String word, final int frequency) {
    206             if (mBigrams == null) {
    207                 mBigrams = new ArrayList<WeightedString>();
    208             }
    209             WeightedString bigram = getBigram(word);
    210             if (bigram != null) {
    211                 bigram.mFrequency = frequency;
    212             } else {
    213                 bigram = new WeightedString(word, frequency);
    214                 mBigrams.add(bigram);
    215             }
    216         }
    217 
    218         /**
    219          * Gets the shortcut target for the given word. Returns null if the word is not in the
    220          * shortcut list.
    221          */
    222         public WeightedString getShortcut(final String word) {
    223             // TODO: Don't do a linear search
    224             if (mShortcutTargets != null) {
    225                 final int size = mShortcutTargets.size();
    226                 for (int i = 0; i < size; ++i) {
    227                     WeightedString shortcut = mShortcutTargets.get(i);
    228                     if (shortcut.mWord.equals(word)) {
    229                         return shortcut;
    230                     }
    231                 }
    232             }
    233             return null;
    234         }
    235 
    236         /**
    237          * Gets the bigram for the given word.
    238          * Returns null if the word is not in the bigrams list.
    239          */
    240         public WeightedString getBigram(final String word) {
    241             // TODO: Don't do a linear search
    242             if (mBigrams != null) {
    243                 final int size = mBigrams.size();
    244                 for (int i = 0; i < size; ++i) {
    245                     WeightedString bigram = mBigrams.get(i);
    246                     if (bigram.mWord.equals(word)) {
    247                         return bigram;
    248                     }
    249                 }
    250             }
    251             return null;
    252         }
    253 
    254         /**
    255          * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
    256          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
    257          * updated if they are higher than the existing ones.
    258          */
    259         public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
    260                 final ArrayList<WeightedString> bigrams,
    261                 final boolean isNotAWord, final boolean isBlacklistEntry) {
    262             if (frequency > mFrequency) {
    263                 mFrequency = frequency;
    264             }
    265             if (shortcutTargets != null) {
    266                 if (mShortcutTargets == null) {
    267                     mShortcutTargets = shortcutTargets;
    268                 } else {
    269                     final int size = shortcutTargets.size();
    270                     for (int i = 0; i < size; ++i) {
    271                         final WeightedString shortcut = shortcutTargets.get(i);
    272                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
    273                         if (existingShortcut == null) {
    274                             mShortcutTargets.add(shortcut);
    275                         } else if (existingShortcut.mFrequency < shortcut.mFrequency) {
    276                             existingShortcut.mFrequency = shortcut.mFrequency;
    277                         }
    278                     }
    279                 }
    280             }
    281             if (bigrams != null) {
    282                 if (mBigrams == null) {
    283                     mBigrams = bigrams;
    284                 } else {
    285                     final int size = bigrams.size();
    286                     for (int i = 0; i < size; ++i) {
    287                         final WeightedString bigram = bigrams.get(i);
    288                         final WeightedString existingBigram = getBigram(bigram.mWord);
    289                         if (existingBigram == null) {
    290                             mBigrams.add(bigram);
    291                         } else if (existingBigram.mFrequency < bigram.mFrequency) {
    292                             existingBigram.mFrequency = bigram.mFrequency;
    293                         }
    294                     }
    295                 }
    296             }
    297             mIsNotAWord = isNotAWord;
    298             mIsBlacklistEntry = isBlacklistEntry;
    299         }
    300     }
    301 
    302     /**
    303      * Options global to the dictionary.
    304      */
    305     public static final class DictionaryOptions {
    306         public final boolean mGermanUmlautProcessing;
    307         public final boolean mFrenchLigatureProcessing;
    308         public final HashMap<String, String> mAttributes;
    309         public DictionaryOptions(final HashMap<String, String> attributes,
    310                 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
    311             mAttributes = attributes;
    312             mGermanUmlautProcessing = germanUmlautProcessing;
    313             mFrenchLigatureProcessing = frenchLigatureProcessing;
    314         }
    315         @Override
    316         public String toString() { // Convenience method
    317             return toString(0, false);
    318         }
    319         public String toString(final int indentCount, final boolean plumbing) {
    320             final StringBuilder indent = new StringBuilder();
    321             if (plumbing) {
    322                 indent.append("H:");
    323             } else {
    324                 for (int i = 0; i < indentCount; ++i) {
    325                     indent.append(" ");
    326                 }
    327             }
    328             final StringBuilder s = new StringBuilder();
    329             for (final String optionKey : mAttributes.keySet()) {
    330                 s.append(indent);
    331                 s.append(optionKey);
    332                 s.append(" = ");
    333                 if ("date".equals(optionKey) && !plumbing) {
    334                     // Date needs a number of milliseconds, but the dictionary contains seconds
    335                     s.append(new Date(
    336                             1000 * Long.parseLong(mAttributes.get(optionKey))).toString());
    337                 } else {
    338                     s.append(mAttributes.get(optionKey));
    339                 }
    340                 s.append("\n");
    341             }
    342             if (mGermanUmlautProcessing) {
    343                 s.append(indent);
    344                 s.append("Needs German umlaut processing\n");
    345             }
    346             if (mFrenchLigatureProcessing) {
    347                 s.append(indent);
    348                 s.append("Needs French ligature processing\n");
    349             }
    350             return s.toString();
    351         }
    352     }
    353 
    354     public final DictionaryOptions mOptions;
    355     public final PtNodeArray mRootNodeArray;
    356 
    357     public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) {
    358         mRootNodeArray = rootNodeArray;
    359         mOptions = options;
    360     }
    361 
    362     public void addOptionAttribute(final String key, final String value) {
    363         mOptions.mAttributes.put(key, value);
    364     }
    365 
    366     /**
    367      * Helper method to convert a String to an int array.
    368      */
    369     static int[] getCodePoints(final String word) {
    370         // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray,
    371         // which is not visible from the makedict package. Factor this code.
    372         final int length = word.length();
    373         if (length <= 0) return new int[] {};
    374         final char[] characters = word.toCharArray();
    375         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
    376         int codePoint = Character.codePointAt(characters, 0);
    377         int dsti = 0;
    378         for (int srci = Character.charCount(codePoint);
    379                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
    380             codePoints[dsti] = codePoint;
    381             codePoint = Character.codePointAt(characters, srci);
    382         }
    383         codePoints[dsti] = codePoint;
    384         return codePoints;
    385     }
    386 
    387     /**
    388      * Helper method to add a word as a string.
    389      *
    390      * This method adds a word to the dictionary with the given frequency. Optional
    391      * lists of bigrams and shortcuts can be passed here. For each word inside,
    392      * they will be added to the dictionary as necessary.
    393      *
    394      * @param word the word to add.
    395      * @param frequency the frequency of the word, in the range [0..255].
    396      * @param shortcutTargets a list of shortcut targets for this word, or null.
    397      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
    398      */
    399     public void add(final String word, final int frequency,
    400             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
    401         add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
    402                 false /* isBlacklistEntry */);
    403     }
    404 
    405     /**
    406      * Helper method to add a blacklist entry as a string.
    407      *
    408      * @param word the word to add as a blacklist entry.
    409      * @param shortcutTargets a list of shortcut targets for this word, or null.
    410      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
    411      */
    412     public void addBlacklistEntry(final String word,
    413             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
    414         add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
    415     }
    416 
    417     /**
    418      * Sanity check for a PtNode array.
    419      *
    420      * This method checks that all PtNodes in a node array are ordered as expected.
    421      * If they are, nothing happens. If they aren't, an exception is thrown.
    422      */
    423     private void checkStack(PtNodeArray ptNodeArray) {
    424         ArrayList<PtNode> stack = ptNodeArray.mData;
    425         int lastValue = -1;
    426         for (int i = 0; i < stack.size(); ++i) {
    427             int currentValue = stack.get(i).mChars[0];
    428             if (currentValue <= lastValue)
    429                 throw new RuntimeException("Invalid stack");
    430             else
    431                 lastValue = currentValue;
    432         }
    433     }
    434 
    435     /**
    436      * Helper method to add a new bigram to the dictionary.
    437      *
    438      * @param word1 the previous word of the context
    439      * @param word2 the next word of the context
    440      * @param frequency the bigram frequency
    441      */
    442     public void setBigram(final String word1, final String word2, final int frequency) {
    443         PtNode ptNode = findWordInTree(mRootNodeArray, word1);
    444         if (ptNode != null) {
    445             final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2);
    446             if (ptNode2 == null) {
    447                 add(getCodePoints(word2), 0, null, false /* isNotAWord */,
    448                         false /* isBlacklistEntry */);
    449                 // The PtNode for the first word may have moved by the above insertion,
    450                 // if word1 and word2 share a common stem that happens not to have been
    451                 // a cutting point until now. In this case, we need to refresh ptNode.
    452                 ptNode = findWordInTree(mRootNodeArray, word1);
    453             }
    454             ptNode.addBigram(word2, frequency);
    455         } else {
    456             throw new RuntimeException("First word of bigram not found");
    457         }
    458     }
    459 
    460     /**
    461      * Add a word to this dictionary.
    462      *
    463      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
    464      * an exception is thrown.
    465      *
    466      * @param word the word, as an int array.
    467      * @param frequency the frequency of the word, in the range [0..255].
    468      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
    469      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
    470      * @param isBlacklistEntry true if this is a blacklisted word, false otherwise
    471      */
    472     private void add(final int[] word, final int frequency,
    473             final ArrayList<WeightedString> shortcutTargets,
    474             final boolean isNotAWord, final boolean isBlacklistEntry) {
    475         assert(frequency >= 0 && frequency <= 255);
    476         if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
    477             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
    478             return;
    479         }
    480 
    481         PtNodeArray currentNodeArray = mRootNodeArray;
    482         int charIndex = 0;
    483 
    484         PtNode currentPtNode = null;
    485         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
    486         int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
    487         while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
    488             currentPtNode = currentNodeArray.mData.get(nodeIndex);
    489             differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
    490             if (ARRAYS_ARE_EQUAL != differentCharIndex
    491                     && differentCharIndex < currentPtNode.mChars.length) break;
    492             if (null == currentPtNode.mChildren) break;
    493             charIndex += currentPtNode.mChars.length;
    494             if (charIndex >= word.length) break;
    495             currentNodeArray = currentPtNode.mChildren;
    496             nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
    497         }
    498 
    499         if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
    500             // No node at this point to accept the word. Create one.
    501             final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
    502             final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
    503                     shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
    504             currentNodeArray.mData.add(insertionIndex, newPtNode);
    505             if (DBG) checkStack(currentNodeArray);
    506         } else {
    507             // There is a word with a common prefix.
    508             if (differentCharIndex == currentPtNode.mChars.length) {
    509                 if (charIndex + differentCharIndex >= word.length) {
    510                     // The new word is a prefix of an existing word, but the node on which it
    511                     // should end already exists as is. Since the old PtNode was not a terminal,
    512                     // make it one by filling in its frequency and other attributes
    513                     currentPtNode.update(frequency, shortcutTargets, null, isNotAWord,
    514                             isBlacklistEntry);
    515                 } else {
    516                     // The new word matches the full old word and extends past it.
    517                     // We only have to create a new node and add it to the end of this.
    518                     final PtNode newNode = new PtNode(
    519                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
    520                                     shortcutTargets, null /* bigrams */, frequency, isNotAWord,
    521                                     isBlacklistEntry);
    522                     currentPtNode.mChildren = new PtNodeArray();
    523                     currentPtNode.mChildren.mData.add(newNode);
    524                 }
    525             } else {
    526                 if (0 == differentCharIndex) {
    527                     // Exact same word. Update the frequency if higher. This will also add the
    528                     // new shortcuts to the existing shortcut list if it already exists.
    529                     currentPtNode.update(frequency, shortcutTargets, null,
    530                             currentPtNode.mIsNotAWord && isNotAWord,
    531                             currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
    532                 } else {
    533                     // Partial prefix match only. We have to replace the current node with a node
    534                     // containing the current prefix and create two new ones for the tails.
    535                     PtNodeArray newChildren = new PtNodeArray();
    536                     final PtNode newOldWord = new PtNode(
    537                             Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
    538                                     currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
    539                             currentPtNode.mBigrams, currentPtNode.mFrequency,
    540                             currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
    541                             currentPtNode.mChildren);
    542                     newChildren.mData.add(newOldWord);
    543 
    544                     final PtNode newParent;
    545                     if (charIndex + differentCharIndex >= word.length) {
    546                         newParent = new PtNode(
    547                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
    548                                 shortcutTargets, null /* bigrams */, frequency,
    549                                 isNotAWord, isBlacklistEntry, newChildren);
    550                     } else {
    551                         newParent = new PtNode(
    552                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
    553                                 null /* shortcutTargets */, null /* bigrams */, -1,
    554                                 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
    555                         final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
    556                                 charIndex + differentCharIndex, word.length),
    557                                 shortcutTargets, null /* bigrams */, frequency,
    558                                 isNotAWord, isBlacklistEntry);
    559                         final int addIndex = word[charIndex + differentCharIndex]
    560                                 > currentPtNode.mChars[differentCharIndex] ? 1 : 0;
    561                         newChildren.mData.add(addIndex, newWord);
    562                     }
    563                     currentNodeArray.mData.set(nodeIndex, newParent);
    564                 }
    565                 if (DBG) checkStack(currentNodeArray);
    566             }
    567         }
    568     }
    569 
    570     private static int ARRAYS_ARE_EQUAL = 0;
    571 
    572     /**
    573      * Custom comparison of two int arrays taken to contain character codes.
    574      *
    575      * This method compares the two arrays passed as an argument in a lexicographic way,
    576      * with an offset in the dst string.
    577      * This method does NOT test for the first character. It is taken to be equal.
    578      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
    579      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
    580      * strings are equal. This works BECAUSE we don't look at the first character.
    581      *
    582      * @param src the left-hand side string of the comparison.
    583      * @param dst the right-hand side string of the comparison.
    584      * @param dstOffset the offset in the right-hand side string.
    585      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
    586      */
    587     private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) {
    588         // We do NOT test the first char, because we come from a method that already
    589         // tested it.
    590         for (int i = 1; i < src.length; ++i) {
    591             if (dstOffset + i >= dst.length) return i;
    592             if (src[i] != dst[dstOffset + i]) return i;
    593         }
    594         if (dst.length > src.length) return src.length;
    595         return ARRAYS_ARE_EQUAL;
    596     }
    597 
    598     /**
    599      * Helper class that compares and sorts two PtNodes according to their
    600      * first element only. I repeat: ONLY the first element is considered, the rest
    601      * is ignored.
    602      * This comparator imposes orderings that are inconsistent with equals.
    603      */
    604     static private final class PtNodeComparator implements java.util.Comparator<PtNode> {
    605         @Override
    606         public int compare(PtNode p1, PtNode p2) {
    607             if (p1.mChars[0] == p2.mChars[0]) return 0;
    608             return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
    609         }
    610     }
    611     final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
    612 
    613     /**
    614      * Finds the insertion index of a character within a node array.
    615      */
    616     private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
    617         final ArrayList<PtNode> data = nodeArray.mData;
    618         final PtNode reference = new PtNode(new int[] { character },
    619                 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
    620                 false /* isBlacklistEntry */);
    621         int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
    622         return result >= 0 ? result : -result - 1;
    623     }
    624 
    625     /**
    626      * Find the index of a char in a node array, if it exists.
    627      *
    628      * @param nodeArray the node array to search in.
    629      * @param character the character to search for.
    630      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else.
    631      */
    632     private static int findIndexOfChar(final PtNodeArray nodeArray, int character) {
    633         final int insertionIndex = findInsertionIndex(nodeArray, character);
    634         if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX;
    635         return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex
    636                 : CHARACTER_NOT_FOUND_INDEX;
    637     }
    638 
    639     /**
    640      * Helper method to find a word in a given branch.
    641      */
    642     @SuppressWarnings("unused")
    643     public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) {
    644         int index = 0;
    645         final StringBuilder checker = DBG ? new StringBuilder() : null;
    646         final int[] codePoints = getCodePoints(string);
    647 
    648         PtNode currentPtNode;
    649         do {
    650             int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
    651             if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
    652             currentPtNode = nodeArray.mData.get(indexOfGroup);
    653 
    654             if (codePoints.length - index < currentPtNode.mChars.length) return null;
    655             int newIndex = index;
    656             while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
    657                 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
    658                 newIndex++;
    659             }
    660             index = newIndex;
    661 
    662             if (DBG) {
    663                 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
    664             }
    665             if (index < codePoints.length) {
    666                 nodeArray = currentPtNode.mChildren;
    667             }
    668         } while (null != nodeArray && index < codePoints.length);
    669 
    670         if (index < codePoints.length) return null;
    671         if (!currentPtNode.isTerminal()) return null;
    672         if (DBG && !string.equals(checker.toString())) return null;
    673         return currentPtNode;
    674     }
    675 
    676     /**
    677      * Helper method to find out whether a word is in the dict or not.
    678      */
    679     public boolean hasWord(final String s) {
    680         if (null == s || "".equals(s)) {
    681             throw new RuntimeException("Can't search for a null or empty string");
    682         }
    683         return null != findWordInTree(mRootNodeArray, s);
    684     }
    685 
    686     /**
    687      * Recursively count the number of PtNodes in a given branch of the trie.
    688      *
    689      * @param nodeArray the parent node.
    690      * @return the number of PtNodes in all the branch under this node.
    691      */
    692     public static int countPtNodes(final PtNodeArray nodeArray) {
    693         final int nodeSize = nodeArray.mData.size();
    694         int size = nodeSize;
    695         for (int i = nodeSize - 1; i >= 0; --i) {
    696             PtNode ptNode = nodeArray.mData.get(i);
    697             if (null != ptNode.mChildren)
    698                 size += countPtNodes(ptNode.mChildren);
    699         }
    700         return size;
    701     }
    702 
    703     /**
    704      * Recursively count the number of nodes in a given branch of the trie.
    705      *
    706      * @param nodeArray the node array to count.
    707      * @return the number of nodes in this branch.
    708      */
    709     public static int countNodeArrays(final PtNodeArray nodeArray) {
    710         int size = 1;
    711         for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
    712             PtNode ptNode = nodeArray.mData.get(i);
    713             if (null != ptNode.mChildren)
    714                 size += countNodeArrays(ptNode.mChildren);
    715         }
    716         return size;
    717     }
    718 
    719     // Recursively find out whether there are any bigrams.
    720     // This can be pretty expensive especially if there aren't any (we return as soon
    721     // as we find one, so it's much cheaper if there are bigrams)
    722     private static boolean hasBigramsInternal(final PtNodeArray nodeArray) {
    723         if (null == nodeArray) return false;
    724         for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
    725             PtNode ptNode = nodeArray.mData.get(i);
    726             if (null != ptNode.mBigrams) return true;
    727             if (hasBigramsInternal(ptNode.mChildren)) return true;
    728         }
    729         return false;
    730     }
    731 
    732     /**
    733      * Finds out whether there are any bigrams in this dictionary.
    734      *
    735      * @return true if there is any bigram, false otherwise.
    736      */
    737     // TODO: this is expensive especially for large dictionaries without any bigram.
    738     // The up side is, this is always accurate and correct and uses no memory. We should
    739     // find a more efficient way of doing this, without compromising too much on memory
    740     // and ease of use.
    741     public boolean hasBigrams() {
    742         return hasBigramsInternal(mRootNodeArray);
    743     }
    744 
    745     // Historically, the tails of the words were going to be merged to save space.
    746     // However, that would prevent the code to search for a specific address in log(n)
    747     // time so this was abandoned.
    748     // The code is still of interest as it does add some compression to any dictionary
    749     // that has no need for attributes. Implementations that does not read attributes should be
    750     // able to read a dictionary with merged tails.
    751     // Also, the following code does support frequencies, as in, it will only merges
    752     // tails that share the same frequency. Though it would result in the above loss of
    753     // performance while searching by address, it is still technically possible to merge
    754     // tails that contain attributes, but this code does not take that into account - it does
    755     // not compare attributes and will merge terminals with different attributes regardless.
    756     public void mergeTails() {
    757         MakedictLog.i("Do not merge tails");
    758         return;
    759 
    760 //        MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root));
    761 //        MakedictLog.i("Number of PtNodes : " + countPtNodes(root));
    762 //
    763 //        final HashMap<String, ArrayList<PtNodeArray>> repository =
    764 //                  new HashMap<String, ArrayList<PtNodeArray>>();
    765 //        mergeTailsInner(repository, root);
    766 //
    767 //        MakedictLog.i("Number of different pseudohashes : " + repository.size());
    768 //        int size = 0;
    769 //        for (ArrayList<PtNodeArray> a : repository.values()) {
    770 //            size += a.size();
    771 //        }
    772 //        MakedictLog.i("Number of nodes after merge : " + (1 + size));
    773 //        MakedictLog.i("Recursively seen nodes : " + countNodes(root));
    774     }
    775 
    776     // The following methods are used by the deactivated mergeTails()
    777 //   private static boolean isEqual(PtNodeArray a, PtNodeArray b) {
    778 //       if (null == a && null == b) return true;
    779 //       if (null == a || null == b) return false;
    780 //       if (a.data.size() != b.data.size()) return false;
    781 //       final int size = a.data.size();
    782 //       for (int i = size - 1; i >= 0; --i) {
    783 //           PtNode aPtNode = a.data.get(i);
    784 //           PtNode bPtNode = b.data.get(i);
    785 //           if (aPtNode.frequency != bPtNode.frequency) return false;
    786 //           if (aPtNode.alternates == null && bPtNode.alternates != null) return false;
    787 //           if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false;
    788 //           if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false;
    789 //           if (!isEqual(aPtNode.children, bPtNode.children)) return false;
    790 //       }
    791 //       return true;
    792 //   }
    793 
    794 //   static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner(
    795 //           final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) {
    796 //       final ArrayList<PtNode> branches = nodeArray.data;
    797 //       final int nodeSize = branches.size();
    798 //       for (int i = 0; i < nodeSize; ++i) {
    799 //           PtNode ptNode = branches.get(i);
    800 //           if (null != ptNode.children) {
    801 //               String pseudoHash = getPseudoHash(ptNode.children);
    802 //               ArrayList<PtNodeArray> similarList = map.get(pseudoHash);
    803 //               if (null == similarList) {
    804 //                   similarList = new ArrayList<PtNodeArray>();
    805 //                   map.put(pseudoHash, similarList);
    806 //               }
    807 //               boolean merged = false;
    808 //               for (PtNodeArray similar : similarList) {
    809 //                   if (isEqual(ptNode.children, similar)) {
    810 //                       ptNode.children = similar;
    811 //                       merged = true;
    812 //                       break;
    813 //                   }
    814 //               }
    815 //               if (!merged) {
    816 //                   similarList.add(ptNode.children);
    817 //               }
    818 //               mergeTailsInner(map, ptNode.children);
    819 //           }
    820 //       }
    821 //       return map;
    822 //   }
    823 
    824 //  private static String getPseudoHash(final PtNodeArray nodeArray) {
    825 //      StringBuilder s = new StringBuilder();
    826 //      for (PtNode ptNode : nodeArray.data) {
    827 //          s.append(ptNode.frequency);
    828 //          for (int ch : ptNode.chars) {
    829 //              s.append(Character.toChars(ch));
    830 //          }
    831 //      }
    832 //      return s.toString();
    833 //  }
    834 
    835     /**
    836      * Iterator to walk through a dictionary.
    837      *
    838      * This is purely for convenience.
    839      */
    840     public static final class DictionaryIterator implements Iterator<Word> {
    841         private static final class Position {
    842             public Iterator<PtNode> pos;
    843             public int length;
    844             public Position(ArrayList<PtNode> ptNodes) {
    845                 pos = ptNodes.iterator();
    846                 length = 0;
    847             }
    848         }
    849         final StringBuilder mCurrentString;
    850         final LinkedList<Position> mPositions;
    851 
    852         public DictionaryIterator(ArrayList<PtNode> ptRoot) {
    853             mCurrentString = new StringBuilder();
    854             mPositions = new LinkedList<Position>();
    855             final Position rootPos = new Position(ptRoot);
    856             mPositions.add(rootPos);
    857         }
    858 
    859         @Override
    860         public boolean hasNext() {
    861             for (Position p : mPositions) {
    862                 if (p.pos.hasNext()) {
    863                     return true;
    864                 }
    865             }
    866             return false;
    867         }
    868 
    869         @Override
    870         public Word next() {
    871             Position currentPos = mPositions.getLast();
    872             mCurrentString.setLength(currentPos.length);
    873 
    874             do {
    875                 if (currentPos.pos.hasNext()) {
    876                     final PtNode currentPtNode = currentPos.pos.next();
    877                     currentPos.length = mCurrentString.length();
    878                     for (int i : currentPtNode.mChars) {
    879                         mCurrentString.append(Character.toChars(i));
    880                     }
    881                     if (null != currentPtNode.mChildren) {
    882                         currentPos = new Position(currentPtNode.mChildren.mData);
    883                         currentPos.length = mCurrentString.length();
    884                         mPositions.addLast(currentPos);
    885                     }
    886                     if (currentPtNode.mFrequency >= 0) {
    887                         return new Word(mCurrentString.toString(), currentPtNode.mFrequency,
    888                                 currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
    889                                 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
    890                     }
    891                 } else {
    892                     mPositions.removeLast();
    893                     currentPos = mPositions.getLast();
    894                     mCurrentString.setLength(mPositions.getLast().length);
    895                 }
    896             } while (true);
    897         }
    898 
    899         @Override
    900         public void remove() {
    901             throw new UnsupportedOperationException("Unsupported yet");
    902         }
    903 
    904     }
    905 
    906     /**
    907      * Method to return an iterator.
    908      *
    909      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
    910      * and say : for (Word w : x) {}
    911      */
    912     @Override
    913     public Iterator<Word> iterator() {
    914         return new DictionaryIterator(mRootNodeArray.mData);
    915     }
    916 }
    917