Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.utils;
     18 
     19 import android.content.ContentValues;
     20 import android.content.Context;
     21 import android.content.res.AssetManager;
     22 import android.content.res.Resources;
     23 import android.text.TextUtils;
     24 import android.util.Log;
     25 
     26 import com.android.inputmethod.latin.AssetFileAddress;
     27 import com.android.inputmethod.latin.BinaryDictionaryGetter;
     28 import com.android.inputmethod.latin.Constants;
     29 import com.android.inputmethod.latin.R;
     30 import com.android.inputmethod.latin.makedict.DictionaryHeader;
     31 import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
     32 import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
     33 
     34 import java.io.File;
     35 import java.io.IOException;
     36 import java.util.ArrayList;
     37 import java.util.Iterator;
     38 import java.util.Locale;
     39 import java.util.concurrent.TimeUnit;
     40 
     41 /**
     42  * This class encapsulates the logic for the Latin-IME side of dictionary information management.
     43  */
     44 public class DictionaryInfoUtils {
     45     private static final String TAG = DictionaryInfoUtils.class.getSimpleName();
     46     private static final String RESOURCE_PACKAGE_NAME = R.class.getPackage().getName();
     47     private static final String DEFAULT_MAIN_DICT = "main";
     48     private static final String MAIN_DICT_PREFIX = "main_";
     49     // 6 digits - unicode is limited to 21 bits
     50     private static final int MAX_HEX_DIGITS_FOR_CODEPOINT = 6;
     51 
     52     public static class DictionaryInfo {
     53         private static final String LOCALE_COLUMN = "locale";
     54         private static final String WORDLISTID_COLUMN = "id";
     55         private static final String LOCAL_FILENAME_COLUMN = "filename";
     56         private static final String DESCRIPTION_COLUMN = "description";
     57         private static final String DATE_COLUMN = "date";
     58         private static final String FILESIZE_COLUMN = "filesize";
     59         private static final String VERSION_COLUMN = "version";
     60         public final String mId;
     61         public final Locale mLocale;
     62         public final String mDescription;
     63         public final AssetFileAddress mFileAddress;
     64         public final int mVersion;
     65         public DictionaryInfo(final String id, final Locale locale, final String description,
     66                 final AssetFileAddress fileAddress, final int version) {
     67             mId = id;
     68             mLocale = locale;
     69             mDescription = description;
     70             mFileAddress = fileAddress;
     71             mVersion = version;
     72         }
     73         public ContentValues toContentValues() {
     74             final ContentValues values = new ContentValues();
     75             values.put(WORDLISTID_COLUMN, mId);
     76             values.put(LOCALE_COLUMN, mLocale.toString());
     77             values.put(DESCRIPTION_COLUMN, mDescription);
     78             values.put(LOCAL_FILENAME_COLUMN, mFileAddress.mFilename);
     79             values.put(DATE_COLUMN, TimeUnit.MILLISECONDS.toSeconds(
     80                     new File(mFileAddress.mFilename).lastModified()));
     81             values.put(FILESIZE_COLUMN, mFileAddress.mLength);
     82             values.put(VERSION_COLUMN, mVersion);
     83             return values;
     84         }
     85     }
     86 
    private DictionaryInfoUtils() {
        // Private constructor to forbid instantiation of this helper class.
    }
     90 
     91     /**
     92      * Returns whether we may want to use this character as part of a file name.
     93      *
     94      * This basically only accepts ascii letters and numbers, and rejects everything else.
     95      */
     96     private static boolean isFileNameCharacter(int codePoint) {
     97         if (codePoint >= 0x30 && codePoint <= 0x39) return true; // Digit
     98         if (codePoint >= 0x41 && codePoint <= 0x5A) return true; // Uppercase
     99         if (codePoint >= 0x61 && codePoint <= 0x7A) return true; // Lowercase
    100         return codePoint == '_'; // Underscore
    101     }
    102 
    103     /**
    104      * Escapes a string for any characters that may be suspicious for a file or directory name.
    105      *
    106      * Concretely this does a sort of URL-encoding except it will encode everything that's not
    107      * alphanumeric or underscore. (true URL-encoding leaves alone characters like '*', which
    108      * we cannot allow here)
    109      */
    110     // TODO: create a unit test for this method
    111     public static String replaceFileNameDangerousCharacters(final String name) {
    112         // This assumes '%' is fully available as a non-separator, normal
    113         // character in a file name. This is probably true for all file systems.
    114         final StringBuilder sb = new StringBuilder();
    115         final int nameLength = name.length();
    116         for (int i = 0; i < nameLength; i = name.offsetByCodePoints(i, 1)) {
    117             final int codePoint = name.codePointAt(i);
    118             if (DictionaryInfoUtils.isFileNameCharacter(codePoint)) {
    119                 sb.appendCodePoint(codePoint);
    120             } else {
    121                 sb.append(String.format((Locale)null, "%%%1$0" + MAX_HEX_DIGITS_FOR_CODEPOINT + "x",
    122                         codePoint));
    123             }
    124         }
    125         return sb.toString();
    126     }
    127 
    128     /**
    129      * Helper method to get the top level cache directory.
    130      */
    131     private static String getWordListCacheDirectory(final Context context) {
    132         return context.getFilesDir() + File.separator + "dicts";
    133     }
    134 
    135     /**
    136      * Helper method to get the top level temp directory.
    137      */
    138     public static String getWordListTempDirectory(final Context context) {
    139         return context.getFilesDir() + File.separator + "tmp";
    140     }
    141 
    142     /**
    143      * Reverse escaping done by replaceFileNameDangerousCharacters.
    144      */
    145     public static String getWordListIdFromFileName(final String fname) {
    146         final StringBuilder sb = new StringBuilder();
    147         final int fnameLength = fname.length();
    148         for (int i = 0; i < fnameLength; i = fname.offsetByCodePoints(i, 1)) {
    149             final int codePoint = fname.codePointAt(i);
    150             if ('%' != codePoint) {
    151                 sb.appendCodePoint(codePoint);
    152             } else {
    153                 // + 1 to pass the % sign
    154                 final int encodedCodePoint = Integer.parseInt(
    155                         fname.substring(i + 1, i + 1 + MAX_HEX_DIGITS_FOR_CODEPOINT), 16);
    156                 i += MAX_HEX_DIGITS_FOR_CODEPOINT;
    157                 sb.appendCodePoint(encodedCodePoint);
    158             }
    159         }
    160         return sb.toString();
    161     }
    162 
    163     /**
    164      * Helper method to the list of cache directories, one for each distinct locale.
    165      */
    166     public static File[] getCachedDirectoryList(final Context context) {
    167         return new File(DictionaryInfoUtils.getWordListCacheDirectory(context)).listFiles();
    168     }
    169 
    170     /**
    171      * Returns the category for a given file name.
    172      *
    173      * This parses the file name, extracts the category, and returns it. See
    174      * {@link #getMainDictId(Locale)} and {@link #isMainWordListId(String)}.
    175      * @return The category as a string or null if it can't be found in the file name.
    176      */
    177     public static String getCategoryFromFileName(final String fileName) {
    178         final String id = getWordListIdFromFileName(fileName);
    179         final String[] idArray = id.split(BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR);
    180         // An id is supposed to be in format category:locale, so splitting on the separator
    181         // should yield a 2-elements array
    182         if (2 != idArray.length) return null;
    183         return idArray[0];
    184     }
    185 
    186     /**
    187      * Find out the cache directory associated with a specific locale.
    188      */
    189     private static String getCacheDirectoryForLocale(final String locale, final Context context) {
    190         final String relativeDirectoryName = replaceFileNameDangerousCharacters(locale);
    191         final String absoluteDirectoryName = getWordListCacheDirectory(context) + File.separator
    192                 + relativeDirectoryName;
    193         final File directory = new File(absoluteDirectoryName);
    194         if (!directory.exists()) {
    195             if (!directory.mkdirs()) {
    196                 Log.e(TAG, "Could not create the directory for locale" + locale);
    197             }
    198         }
    199         return absoluteDirectoryName;
    200     }
    201 
    202     /**
    203      * Generates a file name for the id and locale passed as an argument.
    204      *
    205      * In the current implementation the file name returned will always be unique for
    206      * any id/locale pair, but please do not expect that the id can be the same for
    207      * different dictionaries with different locales. An id should be unique for any
    208      * dictionary.
    209      * The file name is pretty much an URL-encoded version of the id inside a directory
    210      * named like the locale, except it will also escape characters that look dangerous
    211      * to some file systems.
    212      * @param id the id of the dictionary for which to get a file name
    213      * @param locale the locale for which to get the file name as a string
    214      * @param context the context to use for getting the directory
    215      * @return the name of the file to be created
    216      */
    217     public static String getCacheFileName(String id, String locale, Context context) {
    218         final String fileName = replaceFileNameDangerousCharacters(id);
    219         return getCacheDirectoryForLocale(locale, context) + File.separator + fileName;
    220     }
    221 
    222     public static boolean isMainWordListId(final String id) {
    223         final String[] idArray = id.split(BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR);
    224         // An id is supposed to be in format category:locale, so splitting on the separator
    225         // should yield a 2-elements array
    226         if (2 != idArray.length) return false;
    227         return BinaryDictionaryGetter.MAIN_DICTIONARY_CATEGORY.equals(idArray[0]);
    228     }
    229 
    230     /**
    231      * Helper method to return a dictionary res id for a locale, or 0 if none.
    232      * @param locale dictionary locale
    233      * @return main dictionary resource id
    234      */
    235     public static int getMainDictionaryResourceIdIfAvailableForLocale(final Resources res,
    236             final Locale locale) {
    237         int resId;
    238         // Try to find main_language_country dictionary.
    239         if (!locale.getCountry().isEmpty()) {
    240             final String dictLanguageCountry =
    241                     MAIN_DICT_PREFIX + locale.toString().toLowerCase(Locale.ROOT);
    242             if ((resId = res.getIdentifier(
    243                     dictLanguageCountry, "raw", RESOURCE_PACKAGE_NAME)) != 0) {
    244                 return resId;
    245             }
    246         }
    247 
    248         // Try to find main_language dictionary.
    249         final String dictLanguage = MAIN_DICT_PREFIX + locale.getLanguage();
    250         if ((resId = res.getIdentifier(dictLanguage, "raw", RESOURCE_PACKAGE_NAME)) != 0) {
    251             return resId;
    252         }
    253 
    254         // Not found, return 0
    255         return 0;
    256     }
    257 
    258     /**
    259      * Returns a main dictionary resource id
    260      * @param locale dictionary locale
    261      * @return main dictionary resource id
    262      */
    263     public static int getMainDictionaryResourceId(final Resources res, final Locale locale) {
    264         int resourceId = getMainDictionaryResourceIdIfAvailableForLocale(res, locale);
    265         if (0 != resourceId) return resourceId;
    266         return res.getIdentifier(DEFAULT_MAIN_DICT, "raw", RESOURCE_PACKAGE_NAME);
    267     }
    268 
    269     /**
    270      * Returns the id associated with the main word list for a specified locale.
    271      *
    272      * Word lists stored in Android Keyboard's resources are referred to as the "main"
    273      * word lists. Since they can be updated like any other list, we need to assign a
    274      * unique ID to them. This ID is just the name of the language (locale-wise) they
    275      * are for, and this method returns this ID.
    276      */
    277     public static String getMainDictId(final Locale locale) {
    278         // This works because we don't include by default different dictionaries for
    279         // different countries. This actually needs to return the id that we would
    280         // like to use for word lists included in resources, and the following is okay.
    281         return BinaryDictionaryGetter.MAIN_DICTIONARY_CATEGORY +
    282                 BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR + locale.getLanguage().toString();
    283     }
    284 
    285     public static DictionaryHeader getDictionaryFileHeaderOrNull(final File file) {
    286         return getDictionaryFileHeaderOrNull(file, 0, file.length());
    287     }
    288 
    289     private static DictionaryHeader getDictionaryFileHeaderOrNull(final File file,
    290             final long offset, final long length) {
    291         try {
    292             final DictionaryHeader header =
    293                     BinaryDictionaryUtils.getHeaderWithOffsetAndLength(file, offset, length);
    294             return header;
    295         } catch (UnsupportedFormatException e) {
    296             return null;
    297         } catch (IOException e) {
    298             return null;
    299         }
    300     }
    301 
    302     /**
    303      * Returns information of the dictionary.
    304      *
    305      * @param fileAddress the asset dictionary file address.
    306      * @return information of the specified dictionary.
    307      */
    308     private static DictionaryInfo createDictionaryInfoFromFileAddress(
    309             final AssetFileAddress fileAddress) {
    310         final DictionaryHeader header = getDictionaryFileHeaderOrNull(
    311                 new File(fileAddress.mFilename), fileAddress.mOffset, fileAddress.mLength);
    312         if (header == null) {
    313             return null;
    314         }
    315         final String id = header.getId();
    316         final Locale locale = LocaleUtils.constructLocaleFromString(header.getLocaleString());
    317         final String description = header.getDescription();
    318         final String version = header.getVersion();
    319         return new DictionaryInfo(id, locale, description, fileAddress, Integer.parseInt(version));
    320     }
    321 
    322     private static void addOrUpdateDictInfo(final ArrayList<DictionaryInfo> dictList,
    323             final DictionaryInfo newElement) {
    324         final Iterator<DictionaryInfo> iter = dictList.iterator();
    325         while (iter.hasNext()) {
    326             final DictionaryInfo thisDictInfo = iter.next();
    327             if (thisDictInfo.mLocale.equals(newElement.mLocale)) {
    328                 if (newElement.mVersion <= thisDictInfo.mVersion) {
    329                     return;
    330                 }
    331                 iter.remove();
    332             }
    333         }
    334         dictList.add(newElement);
    335     }
    336 
    337     public static ArrayList<DictionaryInfo> getCurrentDictionaryFileNameAndVersionInfo(
    338             final Context context) {
    339         final ArrayList<DictionaryInfo> dictList = new ArrayList<>();
    340 
    341         // Retrieve downloaded dictionaries
    342         final File[] directoryList = getCachedDirectoryList(context);
    343         if (null != directoryList) {
    344             for (final File directory : directoryList) {
    345                 final String localeString = getWordListIdFromFileName(directory.getName());
    346                 File[] dicts = BinaryDictionaryGetter.getCachedWordLists(localeString, context);
    347                 for (final File dict : dicts) {
    348                     final String wordListId = getWordListIdFromFileName(dict.getName());
    349                     if (!DictionaryInfoUtils.isMainWordListId(wordListId)) continue;
    350                     final Locale locale = LocaleUtils.constructLocaleFromString(localeString);
    351                     final AssetFileAddress fileAddress = AssetFileAddress.makeFromFile(dict);
    352                     final DictionaryInfo dictionaryInfo =
    353                             createDictionaryInfoFromFileAddress(fileAddress);
    354                     // Protect against cases of a less-specific dictionary being found, like an
    355                     // en dictionary being used for an en_US locale. In this case, the en dictionary
    356                     // should be used for en_US but discounted for listing purposes.
    357                     if (dictionaryInfo == null || !dictionaryInfo.mLocale.equals(locale)) continue;
    358                     addOrUpdateDictInfo(dictList, dictionaryInfo);
    359                 }
    360             }
    361         }
    362 
    363         // Retrieve files from assets
    364         final Resources resources = context.getResources();
    365         final AssetManager assets = resources.getAssets();
    366         for (final String localeString : assets.getLocales()) {
    367             final Locale locale = LocaleUtils.constructLocaleFromString(localeString);
    368             final int resourceId =
    369                     DictionaryInfoUtils.getMainDictionaryResourceIdIfAvailableForLocale(
    370                             context.getResources(), locale);
    371             if (0 == resourceId) continue;
    372             final AssetFileAddress fileAddress =
    373                     BinaryDictionaryGetter.loadFallbackResource(context, resourceId);
    374             final DictionaryInfo dictionaryInfo = createDictionaryInfoFromFileAddress(fileAddress);
    375             // Protect against cases of a less-specific dictionary being found, like an
    376             // en dictionary being used for an en_US locale. In this case, the en dictionary
    377             // should be used for en_US but discounted for listing purposes.
    378             if (!dictionaryInfo.mLocale.equals(locale)) continue;
    379             addOrUpdateDictInfo(dictList, dictionaryInfo);
    380         }
    381 
    382         return dictList;
    383     }
    384 
    385     public static boolean looksValidForDictionaryInsertion(final CharSequence text,
    386             final SpacingAndPunctuations spacingAndPunctuations) {
    387         if (TextUtils.isEmpty(text)) return false;
    388         final int length = text.length();
    389         if (length > Constants.DICTIONARY_MAX_WORD_LENGTH) {
    390             return false;
    391         }
    392         int i = 0;
    393         int digitCount = 0;
    394         while (i < length) {
    395             final int codePoint = Character.codePointAt(text, i);
    396             final int charCount = Character.charCount(codePoint);
    397             i += charCount;
    398             if (Character.isDigit(codePoint)) {
    399                 // Count digits: see below
    400                 digitCount += charCount;
    401                 continue;
    402             }
    403             if (!spacingAndPunctuations.isWordCodePoint(codePoint)) return false;
    404         }
    405         // We reject strings entirely comprised of digits to avoid using PIN codes or credit
    406         // card numbers. It would come in handy for word prediction though; a good example is
    407         // when writing one's address where the street number is usually quite discriminative,
    408         // as well as the postal code.
    409         return digitCount < length;
    410     }
    411 }
    412