1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.utils; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 import com.android.inputmethod.latin.settings.SettingsValues; 22 23 import android.text.TextUtils; 24 import android.util.JsonReader; 25 import android.util.JsonWriter; 26 import android.util.Log; 27 28 import java.io.IOException; 29 import java.io.StringReader; 30 import java.io.StringWriter; 31 import java.util.ArrayList; 32 import java.util.Collections; 33 import java.util.List; 34 import java.util.Locale; 35 36 public final class StringUtils { 37 private static final String TAG = StringUtils.class.getSimpleName(); 38 public static final int CAPITALIZE_NONE = 0; // No caps, or mixed case 39 public static final int CAPITALIZE_FIRST = 1; // First only 40 public static final int CAPITALIZE_ALL = 2; // All caps 41 42 private StringUtils() { 43 // This utility class is not publicly instantiable. 44 } 45 46 public static int codePointCount(final String text) { 47 if (TextUtils.isEmpty(text)) return 0; 48 return text.codePointCount(0, text.length()); 49 } 50 51 public static String newSingleCodePointString(int codePoint) { 52 if (Character.charCount(codePoint) == 1) { 53 // Optimization: avoid creating an temporary array for characters that are 54 // represented by a single char value 55 return String.valueOf((char) codePoint); 56 } 57 // For surrogate pair 58 return new String(Character.toChars(codePoint)); 59 } 60 61 public static boolean containsInArray(final String text, final String[] array) { 62 for (final String element : array) { 63 if (text.equals(element)) return true; 64 } 65 return false; 66 } 67 68 /** 69 * Comma-Splittable Text is similar to Comma-Separated Values (CSV) but has much simpler syntax. 70 * Unlike CSV, Comma-Splittable Text has no escaping mechanism, so that the text can't contain 71 * a comma character in it. 72 */ 73 private static final String SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT = ","; 74 75 public static boolean containsInCommaSplittableText(final String text, 76 final String extraValues) { 77 if (TextUtils.isEmpty(extraValues)) { 78 return false; 79 } 80 return containsInArray(text, extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT)); 81 } 82 83 public static String appendToCommaSplittableTextIfNotExists(final String text, 84 final String extraValues) { 85 if (TextUtils.isEmpty(extraValues)) { 86 return text; 87 } 88 if (containsInCommaSplittableText(text, extraValues)) { 89 return extraValues; 90 } 91 return extraValues + SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT + text; 92 } 93 94 public static String removeFromCommaSplittableTextIfExists(final String text, 95 final String extraValues) { 96 if (TextUtils.isEmpty(extraValues)) { 97 return ""; 98 } 99 final String[] elements = extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT); 100 if (!containsInArray(text, elements)) { 101 return extraValues; 102 } 103 final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1); 104 for (final String element : elements) { 105 if (!text.equals(element)) { 106 result.add(element); 107 } 108 } 109 return TextUtils.join(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT, result); 110 } 111 112 /** 113 * Remove duplicates from an array of strings. 114 * 115 * This method will always keep the first occurrence of all strings at their position 116 * in the array, removing the subsequent ones. 117 */ 118 public static void removeDupes(final ArrayList<String> suggestions) { 119 if (suggestions.size() < 2) return; 120 int i = 1; 121 // Don't cache suggestions.size(), since we may be removing items 122 while (i < suggestions.size()) { 123 final String cur = suggestions.get(i); 124 // Compare each suggestion with each previous suggestion 125 for (int j = 0; j < i; j++) { 126 final String previous = suggestions.get(j); 127 if (TextUtils.equals(cur, previous)) { 128 suggestions.remove(i); 129 i--; 130 break; 131 } 132 } 133 i++; 134 } 135 } 136 137 public static String capitalizeFirstCodePoint(final String s, final Locale locale) { 138 if (s.length() <= 1) { 139 return s.toUpperCase(locale); 140 } 141 // Please refer to the comment below in 142 // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings 143 final int cutoff = s.offsetByCodePoints(0, 1); 144 return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff); 145 } 146 147 public static String capitalizeFirstAndDowncaseRest(final String s, final Locale locale) { 148 if (s.length() <= 1) { 149 return s.toUpperCase(locale); 150 } 151 // TODO: fix the bugs below 152 // - This does not work for Greek, because it returns upper case instead of title case. 153 // - It does not work for Serbian, because it fails to account for the "lj" character, 154 // which should be "Lj" in title case and "LJ" in upper case. 155 // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's 156 // written as two separate code points. They are two different characters but both should 157 // be capitalized as "IJ" as if they were a single letter in most words (not all). If the 158 // unicode char for the ligature is used however, it works. 159 final int cutoff = s.offsetByCodePoints(0, 1); 160 return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff).toLowerCase(locale); 161 } 162 163 private static final int[] EMPTY_CODEPOINTS = {}; 164 165 public static int[] toCodePointArray(final String string) { 166 final int length = string.length(); 167 if (length <= 0) { 168 return EMPTY_CODEPOINTS; 169 } 170 final int[] codePoints = new int[string.codePointCount(0, length)]; 171 int destIndex = 0; 172 for (int index = 0; index < length; index = string.offsetByCodePoints(index, 1)) { 173 codePoints[destIndex] = string.codePointAt(index); 174 destIndex++; 175 } 176 return codePoints; 177 } 178 179 // This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE. 180 public static int getCapitalizationType(final String text) { 181 // If the first char is not uppercase, then the word is either all lower case or 182 // camel case, and in either case we return CAPITALIZE_NONE. 183 final int len = text.length(); 184 int index = 0; 185 for (; index < len; index = text.offsetByCodePoints(index, 1)) { 186 if (Character.isLetter(text.codePointAt(index))) { 187 break; 188 } 189 } 190 if (index == len) return CAPITALIZE_NONE; 191 if (!Character.isUpperCase(text.codePointAt(index))) { 192 return CAPITALIZE_NONE; 193 } 194 int capsCount = 1; 195 int letterCount = 1; 196 for (index = text.offsetByCodePoints(index, 1); index < len; 197 index = text.offsetByCodePoints(index, 1)) { 198 if (1 != capsCount && letterCount != capsCount) break; 199 final int codePoint = text.codePointAt(index); 200 if (Character.isUpperCase(codePoint)) { 201 ++capsCount; 202 ++letterCount; 203 } else if (Character.isLetter(codePoint)) { 204 // We need to discount non-letters since they may not be upper-case, but may 205 // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME") 206 ++letterCount; 207 } 208 } 209 // We know the first char is upper case. So we want to test if either every letter other 210 // than the first is lower case, or if they are all upper case. If the string is exactly 211 // one char long, then we will arrive here with letterCount 1, and this is correct, too. 212 if (1 == capsCount) return CAPITALIZE_FIRST; 213 return (letterCount == capsCount ? CAPITALIZE_ALL : CAPITALIZE_NONE); 214 } 215 216 public static boolean isIdenticalAfterUpcase(final String text) { 217 final int length = text.length(); 218 int i = 0; 219 while (i < length) { 220 final int codePoint = text.codePointAt(i); 221 if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) { 222 return false; 223 } 224 i += Character.charCount(codePoint); 225 } 226 return true; 227 } 228 229 public static boolean isIdenticalAfterDowncase(final String text) { 230 final int length = text.length(); 231 int i = 0; 232 while (i < length) { 233 final int codePoint = text.codePointAt(i); 234 if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) { 235 return false; 236 } 237 i += Character.charCount(codePoint); 238 } 239 return true; 240 } 241 242 @UsedForTesting 243 public static boolean looksValidForDictionaryInsertion(final CharSequence text, 244 final SettingsValues settings) { 245 if (TextUtils.isEmpty(text)) return false; 246 final int length = text.length(); 247 int i = 0; 248 int digitCount = 0; 249 while (i < length) { 250 final int codePoint = Character.codePointAt(text, i); 251 final int charCount = Character.charCount(codePoint); 252 i += charCount; 253 if (Character.isDigit(codePoint)) { 254 // Count digits: see below 255 digitCount += charCount; 256 continue; 257 } 258 if (!settings.isWordCodePoint(codePoint)) return false; 259 } 260 // We reject strings entirely comprised of digits to avoid using PIN codes or credit 261 // card numbers. It would come in handy for word prediction though; a good example is 262 // when writing one's address where the street number is usually quite discriminative, 263 // as well as the postal code. 264 return digitCount < length; 265 } 266 267 public static boolean isIdenticalAfterCapitalizeEachWord(final String text, 268 final String separators) { 269 boolean needCapsNext = true; 270 final int len = text.length(); 271 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 272 final int codePoint = text.codePointAt(i); 273 if (Character.isLetter(codePoint)) { 274 if ((needCapsNext && !Character.isUpperCase(codePoint)) 275 || (!needCapsNext && !Character.isLowerCase(codePoint))) { 276 return false; 277 } 278 } 279 // We need a capital letter next if this is a separator. 280 needCapsNext = (-1 != separators.indexOf(codePoint)); 281 } 282 return true; 283 } 284 285 // TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ digraph 286 // which should be capitalized together in *some* cases. 287 public static String capitalizeEachWord(final String text, final String separators, 288 final Locale locale) { 289 final StringBuilder builder = new StringBuilder(); 290 boolean needCapsNext = true; 291 final int len = text.length(); 292 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 293 final String nextChar = text.substring(i, text.offsetByCodePoints(i, 1)); 294 if (needCapsNext) { 295 builder.append(nextChar.toUpperCase(locale)); 296 } else { 297 builder.append(nextChar.toLowerCase(locale)); 298 } 299 // We need a capital letter next if this is a separator. 300 needCapsNext = (-1 != separators.indexOf(nextChar.codePointAt(0))); 301 } 302 return builder.toString(); 303 } 304 305 /** 306 * Approximates whether the text before the cursor looks like a URL. 307 * 308 * This is not foolproof, but it should work well in the practice. 309 * Essentially it walks backward from the cursor until it finds something that's not a letter, 310 * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it 311 * does not look like a URL. 312 * If the text: 313 * - starts with www and contains a period 314 * - starts with a slash preceded by either a slash, whitespace, or start-of-string 315 * Then it looks like a URL and we return true. Otherwise, we return false. 316 * 317 * Note: this method is called quite often, and should be fast. 318 * 319 * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the 320 * code complexity, but ideally it should not. It's acceptable for now. 321 */ 322 public static boolean lastPartLooksLikeURL(final CharSequence text) { 323 int i = text.length(); 324 if (0 == i) return false; 325 int wCount = 0; 326 int slashCount = 0; 327 boolean hasSlash = false; 328 boolean hasPeriod = false; 329 int codePoint = 0; 330 while (i > 0) { 331 codePoint = Character.codePointBefore(text, i); 332 if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') { 333 // Handwavy heuristic to see if that's a URL character. Anything between period 334 // and z. This includes all lower- and upper-case ascii letters, period, 335 // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation 336 // marks, double quotes... 337 // Anything that's not a URL-like character causes us to break from here and 338 // evaluate normally. 339 break; 340 } 341 if (Constants.CODE_PERIOD == codePoint) { 342 hasPeriod = true; 343 } 344 if (Constants.CODE_SLASH == codePoint) { 345 hasSlash = true; 346 if (2 == ++slashCount) { 347 return true; 348 } 349 } else { 350 slashCount = 0; 351 } 352 if ('w' == codePoint) { 353 ++wCount; 354 } else { 355 wCount = 0; 356 } 357 i = Character.offsetByCodePoints(text, i, -1); 358 } 359 // End of the text run. 360 // If it starts with www and includes a period, then it looks like a URL. 361 if (wCount >= 3 && hasPeriod) return true; 362 // If it starts with a slash, and the code point before is whitespace, it looks like an URL. 363 if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true; 364 // If it has both a period and a slash, it looks like an URL. 365 if (hasPeriod && hasSlash) return true; 366 // Otherwise, it doesn't look like an URL. 367 return false; 368 } 369 370 public static boolean isEmptyStringOrWhiteSpaces(String s) { 371 final int N = codePointCount(s); 372 for (int i = 0; i < N; ++i) { 373 if (!Character.isWhitespace(s.codePointAt(i))) { 374 return false; 375 } 376 } 377 return true; 378 } 379 380 @UsedForTesting 381 public static String byteArrayToHexString(byte[] bytes) { 382 if (bytes == null || bytes.length == 0) { 383 return ""; 384 } 385 final StringBuilder sb = new StringBuilder(); 386 for (byte b : bytes) { 387 sb.append(String.format("%02x", b & 0xff)); 388 } 389 return sb.toString(); 390 } 391 392 /** 393 * Convert hex string to byte array. The string length must be an even number. 394 */ 395 @UsedForTesting 396 public static byte[] hexStringToByteArray(String hexString) { 397 if (TextUtils.isEmpty(hexString)) { 398 return null; 399 } 400 final int N = hexString.length(); 401 if (N % 2 != 0) { 402 throw new NumberFormatException("Input hex string length must be an even number." 403 + " Length = " + N); 404 } 405 final byte[] bytes = new byte[N / 2]; 406 for (int i = 0; i < N; i += 2) { 407 bytes[i / 2] = (byte) ((Character.digit(hexString.charAt(i), 16) << 4) 408 + Character.digit(hexString.charAt(i + 1), 16)); 409 } 410 return bytes; 411 } 412 413 public static List<Object> jsonStrToList(String s) { 414 final ArrayList<Object> retval = CollectionUtils.newArrayList(); 415 final JsonReader reader = new JsonReader(new StringReader(s)); 416 try { 417 reader.beginArray(); 418 while(reader.hasNext()) { 419 reader.beginObject(); 420 while (reader.hasNext()) { 421 final String name = reader.nextName(); 422 if (name.equals(Integer.class.getSimpleName())) { 423 retval.add(reader.nextInt()); 424 } else if (name.equals(String.class.getSimpleName())) { 425 retval.add(reader.nextString()); 426 } else { 427 Log.w(TAG, "Invalid name: " + name); 428 reader.skipValue(); 429 } 430 } 431 reader.endObject(); 432 } 433 reader.endArray(); 434 return retval; 435 } catch (IOException e) { 436 } finally { 437 try { 438 reader.close(); 439 } catch (IOException e) { 440 } 441 } 442 return Collections.<Object>emptyList(); 443 } 444 445 public static String listToJsonStr(List<Object> list) { 446 if (list == null || list.isEmpty()) { 447 return ""; 448 } 449 final StringWriter sw = new StringWriter(); 450 final JsonWriter writer = new JsonWriter(sw); 451 try { 452 writer.beginArray(); 453 for (final Object o : list) { 454 writer.beginObject(); 455 if (o instanceof Integer) { 456 writer.name(Integer.class.getSimpleName()).value((Integer)o); 457 } else if (o instanceof String) { 458 writer.name(String.class.getSimpleName()).value((String)o); 459 } 460 writer.endObject(); 461 } 462 writer.endArray(); 463 return sw.toString(); 464 } catch (IOException e) { 465 } finally { 466 try { 467 if (writer != null) { 468 writer.close(); 469 } 470 } catch (IOException e) { 471 } 472 } 473 return ""; 474 } 475 } 476