Home | History | Annotate | Download | only in latin
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 package com.android.inputmethod.latin;
     18 
     19 import android.content.Context;
     20 import android.text.AutoText;
     21 import android.text.TextUtils;
     22 import android.util.Log;
     23 import android.view.View;
     24 
     25 import java.nio.ByteBuffer;
     26 import java.util.ArrayList;
     27 import java.util.Arrays;
     28 import java.util.List;
     29 
     30 /**
     31  * This class loads a dictionary and provides a list of suggestions for a given sequence of
     32  * characters. This includes corrections and completions.
     33  * @hide pending API Council Approval
     34  */
     35 public class Suggest implements Dictionary.WordCallback {
     36 
     37     public static final int APPROX_MAX_WORD_LENGTH = 32;
     38 
     39     public static final int CORRECTION_NONE = 0;
     40     public static final int CORRECTION_BASIC = 1;
     41     public static final int CORRECTION_FULL = 2;
     42     public static final int CORRECTION_FULL_BIGRAM = 3;
     43 
     44     /**
     45      * Words that appear in both bigram and unigram data gets multiplier ranging from
     46      * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the frequency score from
     47      * bigram data.
     48      */
     49     public static final double BIGRAM_MULTIPLIER_MIN = 1.2;
     50     public static final double BIGRAM_MULTIPLIER_MAX = 1.5;
     51 
     52     /**
     53      * Maximum possible bigram frequency. Will depend on how many bits are being used in data
     54      * structure. Maximum bigram freqeuncy will get the BIGRAM_MULTIPLIER_MAX as the multiplier.
     55      */
     56     public static final int MAXIMUM_BIGRAM_FREQUENCY = 127;
     57 
     58     public static final int DIC_USER_TYPED = 0;
     59     public static final int DIC_MAIN = 1;
     60     public static final int DIC_USER = 2;
     61     public static final int DIC_AUTO = 3;
     62     public static final int DIC_CONTACTS = 4;
     63     // If you add a type of dictionary, increment DIC_TYPE_LAST_ID
     64     public static final int DIC_TYPE_LAST_ID = 4;
     65 
     66     static final int LARGE_DICTIONARY_THRESHOLD = 200 * 1000;
     67 
     68     private BinaryDictionary mMainDict;
     69 
     70     private Dictionary mUserDictionary;
     71 
     72     private Dictionary mAutoDictionary;
     73 
     74     private Dictionary mContactsDictionary;
     75 
     76     private Dictionary mUserBigramDictionary;
     77 
     78     private int mPrefMaxSuggestions = 12;
     79 
     80     private static final int PREF_MAX_BIGRAMS = 60;
     81 
     82     private boolean mAutoTextEnabled;
     83 
     84     private int[] mPriorities = new int[mPrefMaxSuggestions];
     85     private int[] mBigramPriorities = new int[PREF_MAX_BIGRAMS];
     86 
     87     // Handle predictive correction for only the first 1280 characters for performance reasons
     88     // If we support scripts that need latin characters beyond that, we should probably use some
     89     // kind of a sparse array or language specific list with a mapping lookup table.
     90     // 1280 is the size of the BASE_CHARS array in ExpandableDictionary, which is a basic set of
     91     // latin characters.
     92     private int[] mNextLettersFrequencies = new int[1280];
     93     private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>();
     94     ArrayList<CharSequence> mBigramSuggestions  = new ArrayList<CharSequence>();
     95     private ArrayList<CharSequence> mStringPool = new ArrayList<CharSequence>();
     96     private boolean mHaveCorrection;
     97     private CharSequence mOriginalWord;
     98     private String mLowerOriginalWord;
     99 
    100     // TODO: Remove these member variables by passing more context to addWord() callback method
    101     private boolean mIsFirstCharCapitalized;
    102     private boolean mIsAllUpperCase;
    103 
    104     private int mCorrectionMode = CORRECTION_BASIC;
    105 
    106     public Suggest(Context context, int[] dictionaryResId) {
    107         mMainDict = new BinaryDictionary(context, dictionaryResId, DIC_MAIN);
    108         initPool();
    109     }
    110 
    111     public Suggest(Context context, ByteBuffer byteBuffer) {
    112         mMainDict = new BinaryDictionary(context, byteBuffer, DIC_MAIN);
    113         initPool();
    114     }
    115 
    116     private void initPool() {
    117         for (int i = 0; i < mPrefMaxSuggestions; i++) {
    118             StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
    119             mStringPool.add(sb);
    120         }
    121     }
    122 
    123     public void setAutoTextEnabled(boolean enabled) {
    124         mAutoTextEnabled = enabled;
    125     }
    126 
    127     public int getCorrectionMode() {
    128         return mCorrectionMode;
    129     }
    130 
    131     public void setCorrectionMode(int mode) {
    132         mCorrectionMode = mode;
    133     }
    134 
    135     public boolean hasMainDictionary() {
    136         return mMainDict.getSize() > LARGE_DICTIONARY_THRESHOLD;
    137     }
    138 
    139     public int getApproxMaxWordLength() {
    140         return APPROX_MAX_WORD_LENGTH;
    141     }
    142 
    143     /**
    144      * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted
    145      * before the main dictionary, if set.
    146      */
    147     public void setUserDictionary(Dictionary userDictionary) {
    148         mUserDictionary = userDictionary;
    149     }
    150 
    151     /**
    152      * Sets an optional contacts dictionary resource to be loaded.
    153      */
    154     public void setContactsDictionary(Dictionary userDictionary) {
    155         mContactsDictionary = userDictionary;
    156     }
    157 
    158     public void setAutoDictionary(Dictionary autoDictionary) {
    159         mAutoDictionary = autoDictionary;
    160     }
    161 
    162     public void setUserBigramDictionary(Dictionary userBigramDictionary) {
    163         mUserBigramDictionary = userBigramDictionary;
    164     }
    165 
    166     /**
    167      * Number of suggestions to generate from the input key sequence. This has
    168      * to be a number between 1 and 100 (inclusive).
    169      * @param maxSuggestions
    170      * @throws IllegalArgumentException if the number is out of range
    171      */
    172     public void setMaxSuggestions(int maxSuggestions) {
    173         if (maxSuggestions < 1 || maxSuggestions > 100) {
    174             throw new IllegalArgumentException("maxSuggestions must be between 1 and 100");
    175         }
    176         mPrefMaxSuggestions = maxSuggestions;
    177         mPriorities = new int[mPrefMaxSuggestions];
    178         mBigramPriorities = new int[PREF_MAX_BIGRAMS];
    179         collectGarbage(mSuggestions, mPrefMaxSuggestions);
    180         while (mStringPool.size() < mPrefMaxSuggestions) {
    181             StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
    182             mStringPool.add(sb);
    183         }
    184     }
    185 
    186     private boolean haveSufficientCommonality(String original, CharSequence suggestion) {
    187         final int originalLength = original.length();
    188         final int suggestionLength = suggestion.length();
    189         final int minLength = Math.min(originalLength, suggestionLength);
    190         if (minLength <= 2) return true;
    191         int matching = 0;
    192         int lessMatching = 0; // Count matches if we skip one character
    193         int i;
    194         for (i = 0; i < minLength; i++) {
    195             final char origChar = ExpandableDictionary.toLowerCase(original.charAt(i));
    196             if (origChar == ExpandableDictionary.toLowerCase(suggestion.charAt(i))) {
    197                 matching++;
    198                 lessMatching++;
    199             } else if (i + 1 < suggestionLength
    200                     && origChar == ExpandableDictionary.toLowerCase(suggestion.charAt(i + 1))) {
    201                 lessMatching++;
    202             }
    203         }
    204         matching = Math.max(matching, lessMatching);
    205 
    206         if (minLength <= 4) {
    207             return matching >= 2;
    208         } else {
    209             return matching > minLength / 2;
    210         }
    211     }
    212 
    213     /**
    214      * Returns a list of words that match the list of character codes passed in.
    215      * This list will be overwritten the next time this function is called.
    216      * @param view a view for retrieving the context for AutoText
    217      * @param wordComposer contains what is currently being typed
    218      * @param prevWordForBigram previous word (used only for bigram)
    219      * @return list of suggestions.
    220      */
    221     public List<CharSequence> getSuggestions(View view, WordComposer wordComposer,
    222             boolean includeTypedWordIfValid, CharSequence prevWordForBigram) {
    223         LatinImeLogger.onStartSuggestion(prevWordForBigram);
    224         mHaveCorrection = false;
    225         mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized();
    226         mIsAllUpperCase = wordComposer.isAllUpperCase();
    227         collectGarbage(mSuggestions, mPrefMaxSuggestions);
    228         Arrays.fill(mPriorities, 0);
    229         Arrays.fill(mNextLettersFrequencies, 0);
    230 
    231         // Save a lowercase version of the original word
    232         mOriginalWord = wordComposer.getTypedWord();
    233         if (mOriginalWord != null) {
    234             final String mOriginalWordString = mOriginalWord.toString();
    235             mOriginalWord = mOriginalWordString;
    236             mLowerOriginalWord = mOriginalWordString.toLowerCase();
    237             // Treating USER_TYPED as UNIGRAM suggestion for logging now.
    238             LatinImeLogger.onAddSuggestedWord(mOriginalWordString, Suggest.DIC_USER_TYPED,
    239                     Dictionary.DataType.UNIGRAM);
    240         } else {
    241             mLowerOriginalWord = "";
    242         }
    243 
    244         if (wordComposer.size() == 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM
    245                 || mCorrectionMode == CORRECTION_BASIC)) {
    246             // At first character typed, search only the bigrams
    247             Arrays.fill(mBigramPriorities, 0);
    248             collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS);
    249 
    250             if (!TextUtils.isEmpty(prevWordForBigram)) {
    251                 CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase();
    252                 if (mMainDict.isValidWord(lowerPrevWord)) {
    253                     prevWordForBigram = lowerPrevWord;
    254                 }
    255                 if (mUserBigramDictionary != null) {
    256                     mUserBigramDictionary.getBigrams(wordComposer, prevWordForBigram, this,
    257                             mNextLettersFrequencies);
    258                 }
    259                 if (mContactsDictionary != null) {
    260                     mContactsDictionary.getBigrams(wordComposer, prevWordForBigram, this,
    261                             mNextLettersFrequencies);
    262                 }
    263                 if (mMainDict != null) {
    264                     mMainDict.getBigrams(wordComposer, prevWordForBigram, this,
    265                             mNextLettersFrequencies);
    266                 }
    267                 char currentChar = wordComposer.getTypedWord().charAt(0);
    268                 // TODO: Must pay attention to locale when changing case.
    269                 char currentCharUpper = Character.toUpperCase(currentChar);
    270                 int count = 0;
    271                 int bigramSuggestionSize = mBigramSuggestions.size();
    272                 for (int i = 0; i < bigramSuggestionSize; i++) {
    273                     if (mBigramSuggestions.get(i).charAt(0) == currentChar
    274                             || mBigramSuggestions.get(i).charAt(0) == currentCharUpper) {
    275                         int poolSize = mStringPool.size();
    276                         StringBuilder sb = poolSize > 0 ?
    277                                 (StringBuilder) mStringPool.remove(poolSize - 1)
    278                                 : new StringBuilder(getApproxMaxWordLength());
    279                         sb.setLength(0);
    280                         sb.append(mBigramSuggestions.get(i));
    281                         mSuggestions.add(count++, sb);
    282                         if (count > mPrefMaxSuggestions) break;
    283                     }
    284                 }
    285             }
    286 
    287         } else if (wordComposer.size() > 1) {
    288             // At second character typed, search the unigrams (scores being affected by bigrams)
    289             if (mUserDictionary != null || mContactsDictionary != null) {
    290                 if (mUserDictionary != null) {
    291                     mUserDictionary.getWords(wordComposer, this, mNextLettersFrequencies);
    292                 }
    293                 if (mContactsDictionary != null) {
    294                     mContactsDictionary.getWords(wordComposer, this, mNextLettersFrequencies);
    295                 }
    296 
    297                 if (mSuggestions.size() > 0 && isValidWord(mOriginalWord)
    298                         && (mCorrectionMode == CORRECTION_FULL
    299                         || mCorrectionMode == CORRECTION_FULL_BIGRAM)) {
    300                     mHaveCorrection = true;
    301                 }
    302             }
    303             mMainDict.getWords(wordComposer, this, mNextLettersFrequencies);
    304             if ((mCorrectionMode == CORRECTION_FULL || mCorrectionMode == CORRECTION_FULL_BIGRAM)
    305                     && mSuggestions.size() > 0) {
    306                 mHaveCorrection = true;
    307             }
    308         }
    309         if (mOriginalWord != null) {
    310             mSuggestions.add(0, mOriginalWord.toString());
    311         }
    312 
    313         // Check if the first suggestion has a minimum number of characters in common
    314         if (wordComposer.size() > 1 && mSuggestions.size() > 1
    315                 && (mCorrectionMode == CORRECTION_FULL
    316                 || mCorrectionMode == CORRECTION_FULL_BIGRAM)) {
    317             if (!haveSufficientCommonality(mLowerOriginalWord, mSuggestions.get(1))) {
    318                 mHaveCorrection = false;
    319             }
    320         }
    321         if (mAutoTextEnabled) {
    322             int i = 0;
    323             int max = 6;
    324             // Don't autotext the suggestions from the dictionaries
    325             if (mCorrectionMode == CORRECTION_BASIC) max = 1;
    326             while (i < mSuggestions.size() && i < max) {
    327                 String suggestedWord = mSuggestions.get(i).toString().toLowerCase();
    328                 CharSequence autoText =
    329                         AutoText.get(suggestedWord, 0, suggestedWord.length(), view);
    330                 // Is there an AutoText correction?
    331                 boolean canAdd = autoText != null;
    332                 // Is that correction already the current prediction (or original word)?
    333                 canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i));
    334                 // Is that correction already the next predicted word?
    335                 if (canAdd && i + 1 < mSuggestions.size() && mCorrectionMode != CORRECTION_BASIC) {
    336                     canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i + 1));
    337                 }
    338                 if (canAdd) {
    339                     mHaveCorrection = true;
    340                     mSuggestions.add(i + 1, autoText);
    341                     i++;
    342                 }
    343                 i++;
    344             }
    345         }
    346         removeDupes();
    347         return mSuggestions;
    348     }
    349 
    350     public int[] getNextLettersFrequencies() {
    351         return mNextLettersFrequencies;
    352     }
    353 
    354     private void removeDupes() {
    355         final ArrayList<CharSequence> suggestions = mSuggestions;
    356         if (suggestions.size() < 2) return;
    357         int i = 1;
    358         // Don't cache suggestions.size(), since we may be removing items
    359         while (i < suggestions.size()) {
    360             final CharSequence cur = suggestions.get(i);
    361             // Compare each candidate with each previous candidate
    362             for (int j = 0; j < i; j++) {
    363                 CharSequence previous = suggestions.get(j);
    364                 if (TextUtils.equals(cur, previous)) {
    365                     removeFromSuggestions(i);
    366                     i--;
    367                     break;
    368                 }
    369             }
    370             i++;
    371         }
    372     }
    373 
    374     private void removeFromSuggestions(int index) {
    375         CharSequence garbage = mSuggestions.remove(index);
    376         if (garbage != null && garbage instanceof StringBuilder) {
    377             mStringPool.add(garbage);
    378         }
    379     }
    380 
    381     public boolean hasMinimalCorrection() {
    382         return mHaveCorrection;
    383     }
    384 
    385     private boolean compareCaseInsensitive(final String mLowerOriginalWord,
    386             final char[] word, final int offset, final int length) {
    387         final int originalLength = mLowerOriginalWord.length();
    388         if (originalLength == length && Character.isUpperCase(word[offset])) {
    389             for (int i = 0; i < originalLength; i++) {
    390                 if (mLowerOriginalWord.charAt(i) != Character.toLowerCase(word[offset+i])) {
    391                     return false;
    392                 }
    393             }
    394             return true;
    395         }
    396         return false;
    397     }
    398 
    399     public boolean addWord(final char[] word, final int offset, final int length, int freq,
    400             final int dicTypeId, final Dictionary.DataType dataType) {
    401         Dictionary.DataType dataTypeForLog = dataType;
    402         ArrayList<CharSequence> suggestions;
    403         int[] priorities;
    404         int prefMaxSuggestions;
    405         if(dataType == Dictionary.DataType.BIGRAM) {
    406             suggestions = mBigramSuggestions;
    407             priorities = mBigramPriorities;
    408             prefMaxSuggestions = PREF_MAX_BIGRAMS;
    409         } else {
    410             suggestions = mSuggestions;
    411             priorities = mPriorities;
    412             prefMaxSuggestions = mPrefMaxSuggestions;
    413         }
    414 
    415         int pos = 0;
    416 
    417         // Check if it's the same word, only caps are different
    418         if (compareCaseInsensitive(mLowerOriginalWord, word, offset, length)) {
    419             pos = 0;
    420         } else {
    421             if (dataType == Dictionary.DataType.UNIGRAM) {
    422                 // Check if the word was already added before (by bigram data)
    423                 int bigramSuggestion = searchBigramSuggestion(word,offset,length);
    424                 if(bigramSuggestion >= 0) {
    425                     dataTypeForLog = Dictionary.DataType.BIGRAM;
    426                     // turn freq from bigram into multiplier specified above
    427                     double multiplier = (((double) mBigramPriorities[bigramSuggestion])
    428                             / MAXIMUM_BIGRAM_FREQUENCY)
    429                             * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN)
    430                             + BIGRAM_MULTIPLIER_MIN;
    431                     /* Log.d(TAG,"bigram num: " + bigramSuggestion
    432                             + "  wordB: " + mBigramSuggestions.get(bigramSuggestion).toString()
    433                             + "  currentPriority: " + freq + "  bigramPriority: "
    434                             + mBigramPriorities[bigramSuggestion]
    435                             + "  multiplier: " + multiplier); */
    436                     freq = (int)Math.round((freq * multiplier));
    437                 }
    438             }
    439 
    440             // Check the last one's priority and bail
    441             if (priorities[prefMaxSuggestions - 1] >= freq) return true;
    442             while (pos < prefMaxSuggestions) {
    443                 if (priorities[pos] < freq
    444                         || (priorities[pos] == freq && length < suggestions.get(pos).length())) {
    445                     break;
    446                 }
    447                 pos++;
    448             }
    449         }
    450         if (pos >= prefMaxSuggestions) {
    451             return true;
    452         }
    453 
    454         System.arraycopy(priorities, pos, priorities, pos + 1,
    455                 prefMaxSuggestions - pos - 1);
    456         priorities[pos] = freq;
    457         int poolSize = mStringPool.size();
    458         StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1)
    459                 : new StringBuilder(getApproxMaxWordLength());
    460         sb.setLength(0);
    461         // TODO: Must pay attention to locale when changing case.
    462         if (mIsAllUpperCase) {
    463             sb.append(new String(word, offset, length).toUpperCase());
    464         } else if (mIsFirstCharCapitalized) {
    465             sb.append(Character.toUpperCase(word[offset]));
    466             if (length > 1) {
    467                 sb.append(word, offset + 1, length - 1);
    468             }
    469         } else {
    470             sb.append(word, offset, length);
    471         }
    472         suggestions.add(pos, sb);
    473         if (suggestions.size() > prefMaxSuggestions) {
    474             CharSequence garbage = suggestions.remove(prefMaxSuggestions);
    475             if (garbage instanceof StringBuilder) {
    476                 mStringPool.add(garbage);
    477             }
    478         } else {
    479             LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog);
    480         }
    481         return true;
    482     }
    483 
    484     private int searchBigramSuggestion(final char[] word, final int offset, final int length) {
    485         // TODO This is almost O(n^2). Might need fix.
    486         // search whether the word appeared in bigram data
    487         int bigramSuggestSize = mBigramSuggestions.size();
    488         for(int i = 0; i < bigramSuggestSize; i++) {
    489             if(mBigramSuggestions.get(i).length() == length) {
    490                 boolean chk = true;
    491                 for(int j = 0; j < length; j++) {
    492                     if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) {
    493                         chk = false;
    494                         break;
    495                     }
    496                 }
    497                 if(chk) return i;
    498             }
    499         }
    500 
    501         return -1;
    502     }
    503 
    504     public boolean isValidWord(final CharSequence word) {
    505         if (word == null || word.length() == 0) {
    506             return false;
    507         }
    508         return mMainDict.isValidWord(word)
    509                 || (mUserDictionary != null && mUserDictionary.isValidWord(word))
    510                 || (mAutoDictionary != null && mAutoDictionary.isValidWord(word))
    511                 || (mContactsDictionary != null && mContactsDictionary.isValidWord(word));
    512     }
    513 
    514     private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) {
    515         int poolSize = mStringPool.size();
    516         int garbageSize = suggestions.size();
    517         while (poolSize < prefMaxSuggestions && garbageSize > 0) {
    518             CharSequence garbage = suggestions.get(garbageSize - 1);
    519             if (garbage != null && garbage instanceof StringBuilder) {
    520                 mStringPool.add(garbage);
    521                 poolSize++;
    522             }
    523             garbageSize--;
    524         }
    525         if (poolSize == prefMaxSuggestions + 1) {
    526             Log.w("Suggest", "String pool got too big: " + poolSize);
    527         }
    528         suggestions.clear();
    529     }
    530 
    531     public void close() {
    532         if (mMainDict != null) {
    533             mMainDict.close();
    534         }
    535     }
    536 }
    537