1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import android.text.TextUtils; 20 21 import java.util.ArrayList; 22 import java.util.Locale; 23 24 public final class StringUtils { 25 public static final int CAPITALIZE_NONE = 0; // No caps, or mixed case 26 public static final int CAPITALIZE_FIRST = 1; // First only 27 public static final int CAPITALIZE_ALL = 2; // All caps 28 29 private StringUtils() { 30 // This utility class is not publicly instantiable. 31 } 32 33 public static int codePointCount(final String text) { 34 if (TextUtils.isEmpty(text)) return 0; 35 return text.codePointCount(0, text.length()); 36 } 37 38 public static boolean containsInArray(final String key, final String[] array) { 39 for (final String element : array) { 40 if (key.equals(element)) return true; 41 } 42 return false; 43 } 44 45 public static boolean containsInCsv(final String key, final String csv) { 46 if (TextUtils.isEmpty(csv)) return false; 47 return containsInArray(key, csv.split(",")); 48 } 49 50 public static String appendToCsvIfNotExists(final String key, final String csv) { 51 if (TextUtils.isEmpty(csv)) return key; 52 if (containsInCsv(key, csv)) return csv; 53 return csv + "," + key; 54 } 55 56 public static String removeFromCsvIfExists(final String key, final String csv) { 57 if (TextUtils.isEmpty(csv)) return ""; 58 final String[] elements = csv.split(","); 59 if (!containsInArray(key, elements)) return csv; 60 final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1); 61 for (final String element : elements) { 62 if (!key.equals(element)) result.add(element); 63 } 64 return TextUtils.join(",", result); 65 } 66 67 /** 68 * Remove duplicates from an array of strings. 69 * 70 * This method will always keep the first occurrence of all strings at their position 71 * in the array, removing the subsequent ones. 72 */ 73 public static void removeDupes(final ArrayList<String> suggestions) { 74 if (suggestions.size() < 2) return; 75 int i = 1; 76 // Don't cache suggestions.size(), since we may be removing items 77 while (i < suggestions.size()) { 78 final String cur = suggestions.get(i); 79 // Compare each suggestion with each previous suggestion 80 for (int j = 0; j < i; j++) { 81 final String previous = suggestions.get(j); 82 if (TextUtils.equals(cur, previous)) { 83 suggestions.remove(i); 84 i--; 85 break; 86 } 87 } 88 i++; 89 } 90 } 91 92 public static String capitalizeFirstCodePoint(final String s, final Locale locale) { 93 if (s.length() <= 1) { 94 return s.toUpperCase(locale); 95 } 96 // Please refer to the comment below in 97 // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings 98 final int cutoff = s.offsetByCodePoints(0, 1); 99 return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff); 100 } 101 102 public static String capitalizeFirstAndDowncaseRest(final String s, final Locale locale) { 103 if (s.length() <= 1) { 104 return s.toUpperCase(locale); 105 } 106 // TODO: fix the bugs below 107 // - This does not work for Greek, because it returns upper case instead of title case. 108 // - It does not work for Serbian, because it fails to account for the "lj" character, 109 // which should be "Lj" in title case and "LJ" in upper case. 110 // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's 111 // written as two separate code points. They are two different characters but both should 112 // be capitalized as "IJ" as if they were a single letter in most words (not all). If the 113 // unicode char for the ligature is used however, it works. 114 final int cutoff = s.offsetByCodePoints(0, 1); 115 return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff).toLowerCase(locale); 116 } 117 118 private static final int[] EMPTY_CODEPOINTS = {}; 119 120 public static int[] toCodePointArray(final String string) { 121 final int length = string.length(); 122 if (length <= 0) { 123 return EMPTY_CODEPOINTS; 124 } 125 final int[] codePoints = new int[string.codePointCount(0, length)]; 126 int destIndex = 0; 127 for (int index = 0; index < length; index = string.offsetByCodePoints(index, 1)) { 128 codePoints[destIndex] = string.codePointAt(index); 129 destIndex++; 130 } 131 return codePoints; 132 } 133 134 public static String[] parseCsvString(final String text) { 135 final int size = text.length(); 136 if (size == 0) { 137 return null; 138 } 139 if (codePointCount(text) == 1) { 140 return text.codePointAt(0) == Constants.CSV_SEPARATOR ? null : new String[] { text }; 141 } 142 143 ArrayList<String> list = null; 144 int start = 0; 145 for (int pos = 0; pos < size; pos++) { 146 final char c = text.charAt(pos); 147 if (c == Constants.CSV_SEPARATOR) { 148 // Skip empty entry. 149 if (pos - start > 0) { 150 if (list == null) { 151 list = CollectionUtils.newArrayList(); 152 } 153 list.add(text.substring(start, pos)); 154 } 155 // Skip comma 156 start = pos + 1; 157 } else if (c == Constants.CSV_ESCAPE) { 158 // Skip escape character and escaped character. 159 pos++; 160 } 161 } 162 final String remain = (size - start > 0) ? text.substring(start) : null; 163 if (list == null) { 164 return remain != null ? new String[] { remain } : null; 165 } 166 if (remain != null) { 167 list.add(remain); 168 } 169 return list.toArray(new String[list.size()]); 170 } 171 172 // This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE. 173 public static int getCapitalizationType(final String text) { 174 // If the first char is not uppercase, then the word is either all lower case or 175 // camel case, and in either case we return CAPITALIZE_NONE. 176 final int len = text.length(); 177 int index = 0; 178 for (; index < len; index = text.offsetByCodePoints(index, 1)) { 179 if (Character.isLetter(text.codePointAt(index))) { 180 break; 181 } 182 } 183 if (index == len) return CAPITALIZE_NONE; 184 if (!Character.isUpperCase(text.codePointAt(index))) { 185 return CAPITALIZE_NONE; 186 } 187 int capsCount = 1; 188 int letterCount = 1; 189 for (index = text.offsetByCodePoints(index, 1); index < len; 190 index = text.offsetByCodePoints(index, 1)) { 191 if (1 != capsCount && letterCount != capsCount) break; 192 final int codePoint = text.codePointAt(index); 193 if (Character.isUpperCase(codePoint)) { 194 ++capsCount; 195 ++letterCount; 196 } else if (Character.isLetter(codePoint)) { 197 // We need to discount non-letters since they may not be upper-case, but may 198 // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME") 199 ++letterCount; 200 } 201 } 202 // We know the first char is upper case. So we want to test if either every letter other 203 // than the first is lower case, or if they are all upper case. If the string is exactly 204 // one char long, then we will arrive here with letterCount 1, and this is correct, too. 205 if (1 == capsCount) return CAPITALIZE_FIRST; 206 return (letterCount == capsCount ? CAPITALIZE_ALL : CAPITALIZE_NONE); 207 } 208 209 public static boolean isIdenticalAfterUpcase(final String text) { 210 final int len = text.length(); 211 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 212 final int codePoint = text.codePointAt(i); 213 if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) { 214 return false; 215 } 216 } 217 return true; 218 } 219 220 public static boolean isIdenticalAfterDowncase(final String text) { 221 final int len = text.length(); 222 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 223 final int codePoint = text.codePointAt(i); 224 if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) { 225 return false; 226 } 227 } 228 return true; 229 } 230 231 public static boolean isIdenticalAfterCapitalizeEachWord(final String text, 232 final String separators) { 233 boolean needCapsNext = true; 234 final int len = text.length(); 235 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 236 final int codePoint = text.codePointAt(i); 237 if (Character.isLetter(codePoint)) { 238 if ((needCapsNext && !Character.isUpperCase(codePoint)) 239 || (!needCapsNext && !Character.isLowerCase(codePoint))) { 240 return false; 241 } 242 } 243 // We need a capital letter next if this is a separator. 244 needCapsNext = (-1 != separators.indexOf(codePoint)); 245 } 246 return true; 247 } 248 249 // TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ digraph 250 // which should be capitalized together in *some* cases. 251 public static String capitalizeEachWord(final String text, final String separators, 252 final Locale locale) { 253 final StringBuilder builder = new StringBuilder(); 254 boolean needCapsNext = true; 255 final int len = text.length(); 256 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 257 final String nextChar = text.substring(i, text.offsetByCodePoints(i, 1)); 258 if (needCapsNext) { 259 builder.append(nextChar.toUpperCase(locale)); 260 } else { 261 builder.append(nextChar.toLowerCase(locale)); 262 } 263 // We need a capital letter next if this is a separator. 264 needCapsNext = (-1 != separators.indexOf(nextChar.codePointAt(0))); 265 } 266 return builder.toString(); 267 } 268 269 /** 270 * Approximates whether the text before the cursor looks like a URL. 271 * 272 * This is not foolproof, but it should work well in the practice. 273 * Essentially it walks backward from the cursor until it finds something that's not a letter, 274 * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it 275 * does not look like a URL. 276 * If the text: 277 * - starts with www and contains a period 278 * - starts with a slash preceded by either a slash, whitespace, or start-of-string 279 * Then it looks like a URL and we return true. Otherwise, we return false. 280 * 281 * Note: this method is called quite often, and should be fast. 282 * 283 * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the 284 * code complexity, but ideally it should not. It's acceptable for now. 285 */ 286 public static boolean lastPartLooksLikeURL(final CharSequence text) { 287 int i = text.length(); 288 if (0 == i) return false; 289 int wCount = 0; 290 int slashCount = 0; 291 boolean hasSlash = false; 292 boolean hasPeriod = false; 293 int codePoint = 0; 294 while (i > 0) { 295 codePoint = Character.codePointBefore(text, i); 296 if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') { 297 // Handwavy heuristic to see if that's a URL character. Anything between period 298 // and z. This includes all lower- and upper-case ascii letters, period, 299 // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation 300 // marks, double quotes... 301 // Anything that's not a URL-like character causes us to break from here and 302 // evaluate normally. 303 break; 304 } 305 if (Constants.CODE_PERIOD == codePoint) { 306 hasPeriod = true; 307 } 308 if (Constants.CODE_SLASH == codePoint) { 309 hasSlash = true; 310 if (2 == ++slashCount) { 311 return true; 312 } 313 } else { 314 slashCount = 0; 315 } 316 if ('w' == codePoint) { 317 ++wCount; 318 } else { 319 wCount = 0; 320 } 321 i = Character.offsetByCodePoints(text, i, -1); 322 } 323 // End of the text run. 324 // If it starts with www and includes a period, then it looks like a URL. 325 if (wCount >= 3 && hasPeriod) return true; 326 // If it starts with a slash, and the code point before is whitespace, it looks like an URL. 327 if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true; 328 // If it has both a period and a slash, it looks like an URL. 329 if (hasPeriod && hasSlash) return true; 330 // Otherwise, it doesn't look like an URL. 331 return false; 332 } 333 } 334