Home | History | Annotate | Download | only in spellcheck
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
     16 
     17 package com.android.inputmethod.latin.spellcheck;
     18 
     19 import android.content.ContentResolver;
     20 import android.database.ContentObserver;
     21 import android.os.Binder;
     22 import android.provider.UserDictionary.Words;
     23 import android.service.textservice.SpellCheckerService.Session;
     24 import android.text.TextUtils;
     25 import android.util.Log;
     26 import android.util.LruCache;
     27 import android.view.textservice.SuggestionsInfo;
     28 import android.view.textservice.TextInfo;
     29 
     30 import com.android.inputmethod.compat.SuggestionsInfoCompatUtils;
     31 import com.android.inputmethod.latin.Constants;
     32 import com.android.inputmethod.latin.Dictionary;
     33 import com.android.inputmethod.latin.LocaleUtils;
     34 import com.android.inputmethod.latin.StringUtils;
     35 import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
     36 import com.android.inputmethod.latin.WordComposer;
     37 import com.android.inputmethod.latin.spellcheck.AndroidSpellCheckerService.SuggestionsGatherer;
     38 
     39 import java.util.ArrayList;
     40 import java.util.Locale;
     41 
     42 public abstract class AndroidWordLevelSpellCheckerSession extends Session {
     43     private static final String TAG = AndroidWordLevelSpellCheckerSession.class.getSimpleName();
     44     private static final boolean DBG = false;
     45 
     46     // Immutable, but need the locale which is not available in the constructor yet
     47     private DictionaryPool mDictionaryPool;
     48     // Likewise
     49     private Locale mLocale;
     50     // Cache this for performance
     51     private int mScript; // One of SCRIPT_LATIN or SCRIPT_CYRILLIC for now.
     52     private final AndroidSpellCheckerService mService;
     53     protected final SuggestionsCache mSuggestionsCache = new SuggestionsCache();
     54     private final ContentObserver mObserver;
     55 
     56     private static final class SuggestionsParams {
     57         public final String[] mSuggestions;
     58         public final int mFlags;
     59         public SuggestionsParams(String[] suggestions, int flags) {
     60             mSuggestions = suggestions;
     61             mFlags = flags;
     62         }
     63     }
     64 
     65     protected static final class SuggestionsCache {
     66         private static final char CHAR_DELIMITER = '\uFFFC';
     67         private static final int MAX_CACHE_SIZE = 50;
     68         private final LruCache<String, SuggestionsParams> mUnigramSuggestionsInfoCache =
     69                 new LruCache<String, SuggestionsParams>(MAX_CACHE_SIZE);
     70 
     71         // TODO: Support n-gram input
     72         private static String generateKey(String query, String prevWord) {
     73             if (TextUtils.isEmpty(query) || TextUtils.isEmpty(prevWord)) {
     74                 return query;
     75             }
     76             return query + CHAR_DELIMITER + prevWord;
     77         }
     78 
     79         // TODO: Support n-gram input
     80         public SuggestionsParams getSuggestionsFromCache(String query, String prevWord) {
     81             return mUnigramSuggestionsInfoCache.get(generateKey(query, prevWord));
     82         }
     83 
     84         // TODO: Support n-gram input
     85         public void putSuggestionsToCache(
     86                 String query, String prevWord, String[] suggestions, int flags) {
     87             if (suggestions == null || TextUtils.isEmpty(query)) {
     88                 return;
     89             }
     90             mUnigramSuggestionsInfoCache.put(
     91                     generateKey(query, prevWord), new SuggestionsParams(suggestions, flags));
     92         }
     93 
     94         public void clearCache() {
     95             mUnigramSuggestionsInfoCache.evictAll();
     96         }
     97     }
     98 
     99     AndroidWordLevelSpellCheckerSession(final AndroidSpellCheckerService service) {
    100         mService = service;
    101         final ContentResolver cres = service.getContentResolver();
    102 
    103         mObserver = new ContentObserver(null) {
    104             @Override
    105             public void onChange(boolean self) {
    106                 mSuggestionsCache.clearCache();
    107             }
    108         };
    109         cres.registerContentObserver(Words.CONTENT_URI, true, mObserver);
    110     }
    111 
    112     @Override
    113     public void onCreate() {
    114         final String localeString = getLocale();
    115         mDictionaryPool = mService.getDictionaryPool(localeString);
    116         mLocale = LocaleUtils.constructLocaleFromString(localeString);
    117         mScript = AndroidSpellCheckerService.getScriptFromLocale(mLocale);
    118     }
    119 
    120     @Override
    121     public void onClose() {
    122         final ContentResolver cres = mService.getContentResolver();
    123         cres.unregisterContentObserver(mObserver);
    124     }
    125 
    126     /*
    127      * Returns whether the code point is a letter that makes sense for the specified
    128      * locale for this spell checker.
    129      * The dictionaries supported by Latin IME are described in res/xml/spellchecker.xml
    130      * and is limited to EFIGS languages and Russian.
    131      * Hence at the moment this explicitly tests for Cyrillic characters or Latin characters
    132      * as appropriate, and explicitly excludes CJK, Arabic and Hebrew characters.
    133      */
    134     private static boolean isLetterCheckableByLanguage(final int codePoint,
    135             final int script) {
    136         switch (script) {
    137         case AndroidSpellCheckerService.SCRIPT_LATIN:
    138             // Our supported latin script dictionaries (EFIGS) at the moment only include
    139             // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode
    140             // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,
    141             // so the below is a very efficient way to test for it. As for the 0-0x3F, it's
    142             // excluded from isLetter anyway.
    143             return codePoint <= 0x2AF && Character.isLetter(codePoint);
    144         case AndroidSpellCheckerService.SCRIPT_CYRILLIC:
    145             // All Cyrillic characters are in the 400~52F block. There are some in the upper
    146             // Unicode range, but they are archaic characters that are not used in modern
    147             // Russian and are not used by our dictionary.
    148             return codePoint >= 0x400 && codePoint <= 0x52F && Character.isLetter(codePoint);
    149         case AndroidSpellCheckerService.SCRIPT_GREEK:
    150             // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the
    151             // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.
    152             // Our dictionary also contains a few words with 0xF2; it would be best to check
    153             // if that's correct, but a web search does return results for these words so
    154             // they are probably okay.
    155             return (codePoint >= 0x370 && codePoint <= 0x3FF)
    156                     || (codePoint >= 0x1F00 && codePoint <= 0x1FFF)
    157                     || codePoint == 0xF2;
    158         default:
    159             // Should never come here
    160             throw new RuntimeException("Impossible value of script: " + script);
    161         }
    162     }
    163 
    164     /**
    165      * Finds out whether a particular string should be filtered out of spell checking.
    166      *
    167      * This will loosely match URLs, numbers, symbols. To avoid always underlining words that
    168      * we know we will never recognize, this accepts a script identifier that should be one
    169      * of the SCRIPT_* constants defined above, to rule out quickly characters from very
    170      * different languages.
    171      *
    172      * @param text the string to evaluate.
    173      * @param script the identifier for the script this spell checker recognizes
    174      * @return true if we should filter this text out, false otherwise
    175      */
    176     private static boolean shouldFilterOut(final String text, final int script) {
    177         if (TextUtils.isEmpty(text) || text.length() <= 1) return true;
    178 
    179         // TODO: check if an equivalent processing can't be done more quickly with a
    180         // compiled regexp.
    181         // Filter by first letter
    182         final int firstCodePoint = text.codePointAt(0);
    183         // Filter out words that don't start with a letter or an apostrophe
    184         if (!isLetterCheckableByLanguage(firstCodePoint, script)
    185                 && '\'' != firstCodePoint) return true;
    186 
    187         // Filter contents
    188         final int length = text.length();
    189         int letterCount = 0;
    190         for (int i = 0; i < length; i = text.offsetByCodePoints(i, 1)) {
    191             final int codePoint = text.codePointAt(i);
    192             // Any word containing a COMMERCIAL_AT is probably an e-mail address
    193             // Any word containing a SLASH is probably either an ad-hoc combination of two
    194             // words or a URI - in either case we don't want to spell check that
    195             if (Constants.CODE_COMMERCIAL_AT == codePoint || Constants.CODE_SLASH == codePoint) {
    196                 return true;
    197             }
    198             if (isLetterCheckableByLanguage(codePoint, script)) ++letterCount;
    199         }
    200         // Guestimate heuristic: perform spell checking if at least 3/4 of the characters
    201         // in this word are letters
    202         return (letterCount * 4 < length * 3);
    203     }
    204 
    205     /**
    206      * Helper method to test valid capitalizations of a word.
    207      *
    208      * If the "text" is lower-case, we test only the exact string.
    209      * If the "Text" is capitalized, we test the exact string "Text" and the lower-cased
    210      *  version of it "text".
    211      * If the "TEXT" is fully upper case, we test the exact string "TEXT", the lower-cased
    212      *  version of it "text" and the capitalized version of it "Text".
    213      */
    214     private boolean isInDictForAnyCapitalization(final Dictionary dict, final String text,
    215             final int capitalizeType) {
    216         // If the word is in there as is, then it's in the dictionary. If not, we'll test lower
    217         // case versions, but only if the word is not already all-lower case or mixed case.
    218         if (dict.isValidWord(text)) return true;
    219         if (StringUtils.CAPITALIZE_NONE == capitalizeType) return false;
    220 
    221         // If we come here, we have a capitalized word (either First- or All-).
    222         // Downcase the word and look it up again. If the word is only capitalized, we
    223         // tested all possibilities, so if it's still negative we can return false.
    224         final String lowerCaseText = text.toLowerCase(mLocale);
    225         if (dict.isValidWord(lowerCaseText)) return true;
    226         if (StringUtils.CAPITALIZE_FIRST == capitalizeType) return false;
    227 
    228         // If the lower case version is not in the dictionary, it's still possible
    229         // that we have an all-caps version of a word that needs to be capitalized
    230         // according to the dictionary. E.g. "GERMANS" only exists in the dictionary as "Germans".
    231         return dict.isValidWord(StringUtils.capitalizeFirstAndDowncaseRest(lowerCaseText, mLocale));
    232     }
    233 
    234     // Note : this must be reentrant
    235     /**
    236      * Gets a list of suggestions for a specific string. This returns a list of possible
    237      * corrections for the text passed as an argument. It may split or group words, and
    238      * even perform grammatical analysis.
    239      */
    240     private SuggestionsInfo onGetSuggestionsInternal(final TextInfo textInfo,
    241             final int suggestionsLimit) {
    242         return onGetSuggestionsInternal(textInfo, null, suggestionsLimit);
    243     }
    244 
    245     protected SuggestionsInfo onGetSuggestionsInternal(
    246             final TextInfo textInfo, final String prevWord, final int suggestionsLimit) {
    247         try {
    248             final String inText = textInfo.getText();
    249             final SuggestionsParams cachedSuggestionsParams =
    250                     mSuggestionsCache.getSuggestionsFromCache(inText, prevWord);
    251             if (cachedSuggestionsParams != null) {
    252                 if (DBG) {
    253                     Log.d(TAG, "Cache hit: " + inText + ", " + cachedSuggestionsParams.mFlags);
    254                 }
    255                 return new SuggestionsInfo(
    256                         cachedSuggestionsParams.mFlags, cachedSuggestionsParams.mSuggestions);
    257             }
    258 
    259             if (shouldFilterOut(inText, mScript)) {
    260                 DictAndKeyboard dictInfo = null;
    261                 try {
    262                     dictInfo = mDictionaryPool.pollWithDefaultTimeout();
    263                     if (!DictionaryPool.isAValidDictionary(dictInfo)) {
    264                         return AndroidSpellCheckerService.getNotInDictEmptySuggestions();
    265                     }
    266                     return dictInfo.mDictionary.isValidWord(inText)
    267                             ? AndroidSpellCheckerService.getInDictEmptySuggestions()
    268                             : AndroidSpellCheckerService.getNotInDictEmptySuggestions();
    269                 } finally {
    270                     if (null != dictInfo) {
    271                         if (!mDictionaryPool.offer(dictInfo)) {
    272                             Log.e(TAG, "Can't re-insert a dictionary into its pool");
    273                         }
    274                     }
    275                 }
    276             }
    277             final String text = inText.replaceAll(
    278                     AndroidSpellCheckerService.APOSTROPHE, AndroidSpellCheckerService.SINGLE_QUOTE);
    279 
    280             // TODO: Don't gather suggestions if the limit is <= 0 unless necessary
    281             //final SuggestionsGatherer suggestionsGatherer = new SuggestionsGatherer(text,
    282             //mService.mSuggestionThreshold, mService.mRecommendedThreshold,
    283             //suggestionsLimit);
    284             final SuggestionsGatherer suggestionsGatherer = mService.newSuggestionsGatherer(
    285                     text, suggestionsLimit);
    286 
    287             final int capitalizeType = StringUtils.getCapitalizationType(text);
    288             boolean isInDict = true;
    289             DictAndKeyboard dictInfo = null;
    290             try {
    291                 dictInfo = mDictionaryPool.pollWithDefaultTimeout();
    292                 if (!DictionaryPool.isAValidDictionary(dictInfo)) {
    293                     return AndroidSpellCheckerService.getNotInDictEmptySuggestions();
    294                 }
    295                 final WordComposer composer = new WordComposer();
    296                 final int length = text.length();
    297                 for (int i = 0; i < length; i = text.offsetByCodePoints(i, 1)) {
    298                     final int codePoint = text.codePointAt(i);
    299                     composer.addKeyInfo(codePoint, dictInfo.getKeyboard(codePoint));
    300                 }
    301                 // TODO: make a spell checker option to block offensive words or not
    302                 final ArrayList<SuggestedWordInfo> suggestions =
    303                         dictInfo.mDictionary.getSuggestions(composer, prevWord,
    304                                 dictInfo.getProximityInfo(),
    305                                 true /* blockOffensiveWords */);
    306                 for (final SuggestedWordInfo suggestion : suggestions) {
    307                     final String suggestionStr = suggestion.mWord;
    308                     suggestionsGatherer.addWord(suggestionStr.toCharArray(), null, 0,
    309                             suggestionStr.length(), suggestion.mScore);
    310                 }
    311                 isInDict = isInDictForAnyCapitalization(dictInfo.mDictionary, text, capitalizeType);
    312             } finally {
    313                 if (null != dictInfo) {
    314                     if (!mDictionaryPool.offer(dictInfo)) {
    315                         Log.e(TAG, "Can't re-insert a dictionary into its pool");
    316                     }
    317                 }
    318             }
    319 
    320             final SuggestionsGatherer.Result result = suggestionsGatherer.getResults(
    321                     capitalizeType, mLocale);
    322 
    323             if (DBG) {
    324                 Log.i(TAG, "Spell checking results for " + text + " with suggestion limit "
    325                         + suggestionsLimit);
    326                 Log.i(TAG, "IsInDict = " + isInDict);
    327                 Log.i(TAG, "LooksLikeTypo = " + (!isInDict));
    328                 Log.i(TAG, "HasRecommendedSuggestions = " + result.mHasRecommendedSuggestions);
    329                 if (null != result.mSuggestions) {
    330                     for (String suggestion : result.mSuggestions) {
    331                         Log.i(TAG, suggestion);
    332                     }
    333                 }
    334             }
    335 
    336             final int flags =
    337                     (isInDict ? SuggestionsInfo.RESULT_ATTR_IN_THE_DICTIONARY
    338                             : SuggestionsInfo.RESULT_ATTR_LOOKS_LIKE_TYPO)
    339                     | (result.mHasRecommendedSuggestions
    340                             ? SuggestionsInfoCompatUtils
    341                                     .getValueOf_RESULT_ATTR_HAS_RECOMMENDED_SUGGESTIONS()
    342                             : 0);
    343             final SuggestionsInfo retval = new SuggestionsInfo(flags, result.mSuggestions);
    344             mSuggestionsCache.putSuggestionsToCache(text, prevWord, result.mSuggestions, flags);
    345             return retval;
    346         } catch (RuntimeException e) {
    347             // Don't kill the keyboard if there is a bug in the spell checker
    348             if (DBG) {
    349                 throw e;
    350             } else {
    351                 Log.e(TAG, "Exception while spellcheking", e);
    352                 return AndroidSpellCheckerService.getNotInDictEmptySuggestions();
    353             }
    354         }
    355     }
    356 
    357     /*
    358      * The spell checker acts on its own behalf. That is needed, in particular, to be able to
    359      * access the dictionary files, which the provider restricts to the identity of Latin IME.
    360      * Since it's called externally by the application, the spell checker is using the identity
    361      * of the application by default unless we clearCallingIdentity.
    362      * That's what the following method does.
    363      */
    364     @Override
    365     public SuggestionsInfo onGetSuggestions(final TextInfo textInfo,
    366             final int suggestionsLimit) {
    367         long ident = Binder.clearCallingIdentity();
    368         try {
    369             return onGetSuggestionsInternal(textInfo, suggestionsLimit);
    370         } finally {
    371             Binder.restoreCallingIdentity(ident);
    372         }
    373     }
    374 }
    375