Home | History | Annotate | Download | only in latin
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin;
     18 
     19 import android.text.TextUtils;
     20 
     21 import com.android.inputmethod.keyboard.Keyboard; // For character constants
     22 
     23 import java.util.ArrayList;
     24 import java.util.Locale;
     25 
     26 public final class StringUtils {
     27     private StringUtils() {
     28         // This utility class is not publicly instantiable.
     29     }
     30 
     31     public static int codePointCount(String text) {
     32         if (TextUtils.isEmpty(text)) return 0;
     33         return text.codePointCount(0, text.length());
     34     }
     35 
     36     public static boolean containsInArray(String key, String[] array) {
     37         for (final String element : array) {
     38             if (key.equals(element)) return true;
     39         }
     40         return false;
     41     }
     42 
     43     public static boolean containsInCsv(String key, String csv) {
     44         if (TextUtils.isEmpty(csv)) return false;
     45         return containsInArray(key, csv.split(","));
     46     }
     47 
     48     public static String appendToCsvIfNotExists(String key, String csv) {
     49         if (TextUtils.isEmpty(csv)) return key;
     50         if (containsInCsv(key, csv)) return csv;
     51         return csv + "," + key;
     52     }
     53 
     54     public static String removeFromCsvIfExists(String key, String csv) {
     55         if (TextUtils.isEmpty(csv)) return "";
     56         final String[] elements = csv.split(",");
     57         if (!containsInArray(key, elements)) return csv;
     58         final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1);
     59         for (final String element : elements) {
     60             if (!key.equals(element)) result.add(element);
     61         }
     62         return TextUtils.join(",", result);
     63     }
     64 
     65     /**
     66      * Returns true if a and b are equal ignoring the case of the character.
     67      * @param a first character to check
     68      * @param b second character to check
     69      * @return {@code true} if a and b are equal, {@code false} otherwise.
     70      */
     71     public static boolean equalsIgnoreCase(char a, char b) {
     72         // Some language, such as Turkish, need testing both cases.
     73         return a == b
     74                 || Character.toLowerCase(a) == Character.toLowerCase(b)
     75                 || Character.toUpperCase(a) == Character.toUpperCase(b);
     76     }
     77 
     78     /**
     79      * Returns true if a and b are equal ignoring the case of the characters, including if they are
     80      * both null.
     81      * @param a first CharSequence to check
     82      * @param b second CharSequence to check
     83      * @return {@code true} if a and b are equal, {@code false} otherwise.
     84      */
     85     public static boolean equalsIgnoreCase(CharSequence a, CharSequence b) {
     86         if (a == b)
     87             return true;  // including both a and b are null.
     88         if (a == null || b == null)
     89             return false;
     90         final int length = a.length();
     91         if (length != b.length())
     92             return false;
     93         for (int i = 0; i < length; i++) {
     94             if (!equalsIgnoreCase(a.charAt(i), b.charAt(i)))
     95                 return false;
     96         }
     97         return true;
     98     }
     99 
    100     /**
    101      * Returns true if a and b are equal ignoring the case of the characters, including if a is null
    102      * and b is zero length.
    103      * @param a CharSequence to check
    104      * @param b character array to check
    105      * @param offset start offset of array b
    106      * @param length length of characters in array b
    107      * @return {@code true} if a and b are equal, {@code false} otherwise.
    108      * @throws IndexOutOfBoundsException
    109      *   if {@code offset < 0 || length < 0 || offset + length > data.length}.
    110      * @throws NullPointerException if {@code b == null}.
    111      */
    112     public static boolean equalsIgnoreCase(CharSequence a, char[] b, int offset, int length) {
    113         if (offset < 0 || length < 0 || length > b.length - offset)
    114             throw new IndexOutOfBoundsException("array.length=" + b.length + " offset=" + offset
    115                     + " length=" + length);
    116         if (a == null)
    117             return length == 0;  // including a is null and b is zero length.
    118         if (a.length() != length)
    119             return false;
    120         for (int i = 0; i < length; i++) {
    121             if (!equalsIgnoreCase(a.charAt(i), b[offset + i]))
    122                 return false;
    123         }
    124         return true;
    125     }
    126 
    127     /**
    128      * Remove duplicates from an array of strings.
    129      *
    130      * This method will always keep the first occurrence of all strings at their position
    131      * in the array, removing the subsequent ones.
    132      */
    133     public static void removeDupes(final ArrayList<CharSequence> suggestions) {
    134         if (suggestions.size() < 2) return;
    135         int i = 1;
    136         // Don't cache suggestions.size(), since we may be removing items
    137         while (i < suggestions.size()) {
    138             final CharSequence cur = suggestions.get(i);
    139             // Compare each suggestion with each previous suggestion
    140             for (int j = 0; j < i; j++) {
    141                 CharSequence previous = suggestions.get(j);
    142                 if (TextUtils.equals(cur, previous)) {
    143                     suggestions.remove(i);
    144                     i--;
    145                     break;
    146                 }
    147             }
    148             i++;
    149         }
    150     }
    151 
    152     public static String toTitleCase(String s, Locale locale) {
    153         if (s.length() <= 1) {
    154             // TODO: is this really correct? Shouldn't this be s.toUpperCase()?
    155             return s;
    156         }
    157         // TODO: fix the bugs below
    158         // - This does not work for Greek, because it returns upper case instead of title case.
    159         // - It does not work for Serbian, because it fails to account for the "lj" character,
    160         // which should be "Lj" in title case and "LJ" in upper case.
    161         // - It does not work for Dutch, because it fails to account for the "ij" digraph, which
    162         // are two different characters but both should be capitalized as "IJ" as if they were
    163         // a single letter.
    164         // - It also does not work with unicode surrogate code points.
    165         return s.toUpperCase(locale).charAt(0) + s.substring(1);
    166     }
    167 
    168     public static int[] toCodePointArray(final String string) {
    169         final char[] characters = string.toCharArray();
    170         final int length = characters.length;
    171         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
    172         if (length <= 0) {
    173             return new int[0];
    174         }
    175         int codePoint = Character.codePointAt(characters, 0);
    176         int dsti = 0;
    177         for (int srci = Character.charCount(codePoint);
    178                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
    179             codePoints[dsti] = codePoint;
    180             codePoint = Character.codePointAt(characters, srci);
    181         }
    182         codePoints[dsti] = codePoint;
    183         return codePoints;
    184     }
    185 
    186     /**
    187      * Determine what caps mode should be in effect at the current offset in
    188      * the text. Only the mode bits set in <var>reqModes</var> will be
    189      * checked. Note that the caps mode flags here are explicitly defined
    190      * to match those in {@link InputType}.
    191      *
    192      * This code is a straight copy of TextUtils.getCapsMode (modulo namespace and formatting
    193      * issues). This will change in the future as we simplify the code for our use and fix bugs.
    194      *
    195      * @param cs The text that should be checked for caps modes.
    196      * @param reqModes The modes to be checked: may be any combination of
    197      * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
    198      * {@link TextUtils#CAP_MODE_SENTENCES}.
    199      * @param locale The locale to consider for capitalization rules
    200      * @param hasSpaceBefore Whether we should consider there is a space inserted at the end of cs
    201      *
    202      * @return Returns the actual capitalization modes that can be in effect
    203      * at the current position, which is any combination of
    204      * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
    205      * {@link TextUtils#CAP_MODE_SENTENCES}.
    206      */
    207     public static int getCapsMode(final CharSequence cs, final int reqModes, final Locale locale,
    208             final boolean hasSpaceBefore) {
    209         // Quick description of what we want to do:
    210         // CAP_MODE_CHARACTERS is always on.
    211         // CAP_MODE_WORDS is on if there is some whitespace before the cursor.
    212         // CAP_MODE_SENTENCES is on if there is some whitespace before the cursor, and the end
    213         //   of a sentence just before that.
    214         // We ignore opening parentheses and the like just before the cursor for purposes of
    215         // finding whitespace for WORDS and SENTENCES modes.
    216         // The end of a sentence ends with a period, question mark or exclamation mark. If it's
    217         // a period, it also needs not to be an abbreviation, which means it also needs to either
    218         // be immediately preceded by punctuation, or by a string of only letters with single
    219         // periods interleaved.
    220 
    221         // Step 1 : check for cap MODE_CHARACTERS. If it's looked for, it's always on.
    222         if ((reqModes & (TextUtils.CAP_MODE_WORDS | TextUtils.CAP_MODE_SENTENCES)) == 0) {
    223             // Here we are not looking for MODE_WORDS or MODE_SENTENCES, so since we already
    224             // evaluated MODE_CHARACTERS, we can return.
    225             return TextUtils.CAP_MODE_CHARACTERS & reqModes;
    226         }
    227 
    228         // Step 2 : Skip (ignore at the end of input) any opening punctuation. This includes
    229         // opening parentheses, brackets, opening quotes, everything that *opens* a span of
    230         // text in the linguistic sense. In RTL languages, this is still an opening sign, although
    231         // it may look like a right parenthesis for example. We also include double quote and
    232         // single quote since they aren't start punctuation in the unicode sense, but should still
    233         // be skipped for English. TODO: does this depend on the language?
    234         int i;
    235         if (hasSpaceBefore) {
    236             i = cs.length() + 1;
    237         } else {
    238             for (i = cs.length(); i > 0; i--) {
    239                 final char c = cs.charAt(i - 1);
    240                 if (c != Keyboard.CODE_DOUBLE_QUOTE && c != Keyboard.CODE_SINGLE_QUOTE
    241                         && Character.getType(c) != Character.START_PUNCTUATION) {
    242                     break;
    243                 }
    244             }
    245         }
    246 
    247         // We are now on the character that precedes any starting punctuation, so in the most
    248         // frequent case this will be whitespace or a letter, although it may occasionally be a
    249         // start of line, or some symbol.
    250 
    251         // Step 3 : Search for the start of a paragraph. From the starting point computed in step 2,
    252         // we go back over any space or tab char sitting there. We find the start of a paragraph
    253         // if the first char that's not a space or tab is a start of line (as in \n, start of text,
    254         // or some other similar characters).
    255         int j = i;
    256         char prevChar = Keyboard.CODE_SPACE;
    257         if (hasSpaceBefore) --j;
    258         while (j > 0) {
    259             prevChar = cs.charAt(j - 1);
    260             if (!Character.isSpaceChar(prevChar) && prevChar != Keyboard.CODE_TAB) break;
    261             j--;
    262         }
    263         if (j <= 0 || Character.isWhitespace(prevChar)) {
    264             // There are only spacing chars between the start of the paragraph and the cursor,
    265             // defined as a isWhitespace() char that is neither a isSpaceChar() nor a tab. Both
    266             // MODE_WORDS and MODE_SENTENCES should be active.
    267             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
    268                     | TextUtils.CAP_MODE_SENTENCES) & reqModes;
    269         }
    270         if (i == j) {
    271             // If we don't have whitespace before index i, it means neither MODE_WORDS
    272             // nor mode sentences should be on so we can return right away.
    273             return TextUtils.CAP_MODE_CHARACTERS & reqModes;
    274         }
    275         if ((reqModes & TextUtils.CAP_MODE_SENTENCES) == 0) {
    276             // Here we know we have whitespace before the cursor (if not, we returned in the above
    277             // if i == j clause), so we need MODE_WORDS to be on. And we don't need to evaluate
    278             // MODE_SENTENCES so we can return right away.
    279             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
    280         }
    281         // Please note that because of the reqModes & CAP_MODE_SENTENCES test a few lines above,
    282         // we know that MODE_SENTENCES is being requested.
    283 
    284         // Step 4 : Search for MODE_SENTENCES.
    285         // English is a special case in that "American typography" rules, which are the most common
    286         // in English, state that a sentence terminator immediately following a quotation mark
    287         // should be swapped with it and de-duplicated (included in the quotation mark),
    288         // e.g. <<Did he say, "let's go home?">>
    289         // No other language has such a rule as far as I know, instead putting inside the quotation
    290         // mark as the exact thing quoted and handling the surrounding punctuation independently,
    291         // e.g. <<Did he say, "let's go home"?>>
    292         // Hence, specifically for English, we treat this special case here.
    293         if (Locale.ENGLISH.getLanguage().equals(locale.getLanguage())) {
    294             for (; j > 0; j--) {
    295                 // Here we look to go over any closing punctuation. This is because in dominant
    296                 // variants of English, the final period is placed within double quotes and maybe
    297                 // other closing punctuation signs. This is generally not true in other languages.
    298                 final char c = cs.charAt(j - 1);
    299                 if (c != Keyboard.CODE_DOUBLE_QUOTE && c != Keyboard.CODE_SINGLE_QUOTE
    300                         && Character.getType(c) != Character.END_PUNCTUATION) {
    301                     break;
    302                 }
    303             }
    304         }
    305 
    306         if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
    307         char c = cs.charAt(--j);
    308 
    309         // We found the next interesting chunk of text ; next we need to determine if it's the
    310         // end of a sentence. If we have a question mark or an exclamation mark, it's the end of
    311         // a sentence. If it's neither, the only remaining case is the period so we get the opposite
    312         // case out of the way.
    313         if (c == Keyboard.CODE_QUESTION_MARK || c == Keyboard.CODE_EXCLAMATION_MARK) {
    314             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
    315         }
    316         if (c != Keyboard.CODE_PERIOD || j <= 0) {
    317             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
    318         }
    319 
    320         // We found out that we have a period. We need to determine if this is a full stop or
    321         // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
    322         // looks like (\w\.){2,}
    323         // To find out, we will have a simple state machine with the following states :
    324         // START, WORD, PERIOD, ABBREVIATION
    325         // On START : (just before the first period)
    326         //           letter => WORD
    327         //           whitespace => end with no caps (it was a stand-alone period)
    328         //           otherwise => end with caps (several periods/symbols in a row)
    329         // On WORD : (within the word just before the first period)
    330         //           letter => WORD
    331         //           period => PERIOD
    332         //           otherwise => end with caps (it was a word with a full stop at the end)
    333         // On PERIOD : (period within a potential abbreviation)
    334         //           letter => LETTER
    335         //           otherwise => end with caps (it was not an abbreviation)
    336         // On LETTER : (letter within a potential abbreviation)
    337         //           letter => LETTER
    338         //           period => PERIOD
    339         //           otherwise => end with no caps (it was an abbreviation)
    340         // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
    341         // should capitalize.
    342 
    343         final int START = 0;
    344         final int WORD = 1;
    345         final int PERIOD = 2;
    346         final int LETTER = 3;
    347         final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
    348                 | TextUtils.CAP_MODE_SENTENCES) & reqModes;
    349         final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
    350         int state = START;
    351         while (j > 0) {
    352             c = cs.charAt(--j);
    353             switch (state) {
    354             case START:
    355                 if (Character.isLetter(c)) {
    356                     state = WORD;
    357                 } else if (Character.isWhitespace(c)) {
    358                     return noCaps;
    359                 } else {
    360                     return caps;
    361                 }
    362                 break;
    363             case WORD:
    364                 if (Character.isLetter(c)) {
    365                     state = WORD;
    366                 } else if (c == Keyboard.CODE_PERIOD) {
    367                     state = PERIOD;
    368                 } else {
    369                     return caps;
    370                 }
    371                 break;
    372             case PERIOD:
    373                 if (Character.isLetter(c)) {
    374                     state = LETTER;
    375                 } else {
    376                     return caps;
    377                 }
    378                 break;
    379             case LETTER:
    380                 if (Character.isLetter(c)) {
    381                     state = LETTER;
    382                 } else if (c == Keyboard.CODE_PERIOD) {
    383                     state = PERIOD;
    384                 } else {
    385                     return noCaps;
    386                 }
    387             }
    388         }
    389         // Here we arrived at the start of the line. This should behave exactly like whitespace.
    390         return (START == state || LETTER == state) ? noCaps : caps;
    391     }
    392 }
    393