/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.emailcommon.utility;

import com.google.common.annotations.VisibleForTesting;

import android.graphics.Color;
import android.text.Spannable;
import android.text.SpannableString;
import android.text.SpannableStringBuilder;
import android.text.TextUtils;
import android.text.style.BackgroundColorSpan;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * Static text helpers: building short snippets from plain or HTML text, decoding HTML
 * character entities, and highlighting search terms in plain or HTML text.
 */
public class TextUtilities {
    // Highlight color is yellow, as in other apps.
    // TODO Push for this to be a global (style-related?) constant
    public static final int HIGHLIGHT_COLOR_INT = Color.YELLOW;
    // We AND off the "alpha" from the color (i.e. 0xFFFFFF00 -> 0x00FFFF00)
    /*package*/ static final String HIGHLIGHT_COLOR_STRING =
        '#' + Integer.toHexString(HIGHLIGHT_COLOR_INT & 0x00FFFFFF);

    // This is how many chars we'll allow in a snippet
    private static final int MAX_SNIPPET_LENGTH = 200;
    // For some reason, isWhitespace() returns false with the following...
    /*package*/ static final char NON_BREAKING_SPACE_CHARACTER = (char)160;

    // Tags whose content must be stripped as well
    static final String[] STRIP_TAGS =
        new String[] {"title", "script", "style", "applet", "head"};
    // The number of characters we peel off for testing against STRIP_TAGS; this should be the
    // maximum size of the strings in STRIP_TAGS
    static final int MAX_STRIP_TAG_LENGTH = 6;

    /**
     * Maps an entity reference, written with its leading ampersand but without the trailing
     * semicolon, to the character it stands for.
     */
    static final Map<String, Character> ESCAPE_STRINGS;
    static {
        // HTML character entity references as defined in HTML 4
        // see http://www.w3.org/TR/REC-html40/sgml/entities.html
        ESCAPE_STRINGS = new HashMap<String, Character>(252);

        // Entity names are listed without their leading ampersand (addEntity prepends it), so
        // that a tool which decodes HTML entities in source text cannot corrupt the keys.
        addEntity("nbsp", '\u00A0');
        addEntity("iexcl", '\u00A1');
        addEntity("cent", '\u00A2');
        addEntity("pound", '\u00A3');
        addEntity("curren", '\u00A4');
        addEntity("yen", '\u00A5');
        addEntity("brvbar", '\u00A6');
        addEntity("sect", '\u00A7');
        addEntity("uml", '\u00A8');
        addEntity("copy", '\u00A9');
        addEntity("ordf", '\u00AA');
        addEntity("laquo", '\u00AB');
        addEntity("not", '\u00AC');
        addEntity("shy", '\u00AD');
        addEntity("reg", '\u00AE');
        addEntity("macr", '\u00AF');
        addEntity("deg", '\u00B0');
        addEntity("plusmn", '\u00B1');
        addEntity("sup2", '\u00B2');
        addEntity("sup3", '\u00B3');
        addEntity("acute", '\u00B4');
        addEntity("micro", '\u00B5');
        addEntity("para", '\u00B6');
        addEntity("middot", '\u00B7');
        addEntity("cedil", '\u00B8');
        addEntity("sup1", '\u00B9');
        addEntity("ordm", '\u00BA');
        addEntity("raquo", '\u00BB');
        addEntity("frac14", '\u00BC');
        addEntity("frac12", '\u00BD');
        addEntity("frac34", '\u00BE');
        addEntity("iquest", '\u00BF');
        addEntity("Agrave", '\u00C0');
        addEntity("Aacute", '\u00C1');
        addEntity("Acirc", '\u00C2');
        addEntity("Atilde", '\u00C3');
        addEntity("Auml", '\u00C4');
        addEntity("Aring", '\u00C5');
        addEntity("AElig", '\u00C6');
        addEntity("Ccedil", '\u00C7');
        addEntity("Egrave", '\u00C8');
        addEntity("Eacute", '\u00C9');
        addEntity("Ecirc", '\u00CA');
        addEntity("Euml", '\u00CB');
        addEntity("Igrave", '\u00CC');
        addEntity("Iacute", '\u00CD');
        addEntity("Icirc", '\u00CE');
        addEntity("Iuml", '\u00CF');
        addEntity("ETH", '\u00D0');
        addEntity("Ntilde", '\u00D1');
        addEntity("Ograve", '\u00D2');
        addEntity("Oacute", '\u00D3');
        addEntity("Ocirc", '\u00D4');
        addEntity("Otilde", '\u00D5');
        addEntity("Ouml", '\u00D6');
        addEntity("times", '\u00D7');
        addEntity("Oslash", '\u00D8');
        addEntity("Ugrave", '\u00D9');
        addEntity("Uacute", '\u00DA');
        addEntity("Ucirc", '\u00DB');
        addEntity("Uuml", '\u00DC');
        addEntity("Yacute", '\u00DD');
        addEntity("THORN", '\u00DE');
        addEntity("szlig", '\u00DF');
        addEntity("agrave", '\u00E0');
        addEntity("aacute", '\u00E1');
        addEntity("acirc", '\u00E2');
        addEntity("atilde", '\u00E3');
        addEntity("auml", '\u00E4');
        addEntity("aring", '\u00E5');
        addEntity("aelig", '\u00E6');
        addEntity("ccedil", '\u00E7');
        addEntity("egrave", '\u00E8');
        addEntity("eacute", '\u00E9');
        addEntity("ecirc", '\u00EA');
        addEntity("euml", '\u00EB');
        addEntity("igrave", '\u00EC');
        addEntity("iacute", '\u00ED');
        addEntity("icirc", '\u00EE');
        addEntity("iuml", '\u00EF');
        addEntity("eth", '\u00F0');
        addEntity("ntilde", '\u00F1');
        addEntity("ograve", '\u00F2');
        addEntity("oacute", '\u00F3');
        addEntity("ocirc", '\u00F4');
        addEntity("otilde", '\u00F5');
        addEntity("ouml", '\u00F6');
        addEntity("divide", '\u00F7');
        addEntity("oslash", '\u00F8');
        addEntity("ugrave", '\u00F9');
        addEntity("uacute", '\u00FA');
        addEntity("ucirc", '\u00FB');
        addEntity("uuml", '\u00FC');
        addEntity("yacute", '\u00FD');
        addEntity("thorn", '\u00FE');
        addEntity("yuml", '\u00FF');
        addEntity("fnof", '\u0192');
        addEntity("Alpha", '\u0391');
        addEntity("Beta", '\u0392');
        addEntity("Gamma", '\u0393');
        addEntity("Delta", '\u0394');
        addEntity("Epsilon", '\u0395');
        addEntity("Zeta", '\u0396');
        addEntity("Eta", '\u0397');
        addEntity("Theta", '\u0398');
        addEntity("Iota", '\u0399');
        addEntity("Kappa", '\u039A');
        addEntity("Lambda", '\u039B');
        addEntity("Mu", '\u039C');
        addEntity("Nu", '\u039D');
        addEntity("Xi", '\u039E');
        addEntity("Omicron", '\u039F');
        addEntity("Pi", '\u03A0');
        addEntity("Rho", '\u03A1');
        addEntity("Sigma", '\u03A3');
        addEntity("Tau", '\u03A4');
        addEntity("Upsilon", '\u03A5');
        addEntity("Phi", '\u03A6');
        addEntity("Chi", '\u03A7');
        addEntity("Psi", '\u03A8');
        addEntity("Omega", '\u03A9');
        addEntity("alpha", '\u03B1');
        addEntity("beta", '\u03B2');
        addEntity("gamma", '\u03B3');
        addEntity("delta", '\u03B4');
        addEntity("epsilon", '\u03B5');
        addEntity("zeta", '\u03B6');
        addEntity("eta", '\u03B7');
        addEntity("theta", '\u03B8');
        addEntity("iota", '\u03B9');
        addEntity("kappa", '\u03BA');
        addEntity("lambda", '\u03BB');
        addEntity("mu", '\u03BC');
        addEntity("nu", '\u03BD');
        addEntity("xi", '\u03BE');
        addEntity("omicron", '\u03BF');
        addEntity("pi", '\u03C0');
        addEntity("rho", '\u03C1');
        addEntity("sigmaf", '\u03C2');
        addEntity("sigma", '\u03C3');
        addEntity("tau", '\u03C4');
        addEntity("upsilon", '\u03C5');
        addEntity("phi", '\u03C6');
        addEntity("chi", '\u03C7');
        addEntity("psi", '\u03C8');
        addEntity("omega", '\u03C9');
        addEntity("thetasym", '\u03D1');
        addEntity("upsih", '\u03D2');
        addEntity("piv", '\u03D6');
        addEntity("bull", '\u2022');
        addEntity("hellip", '\u2026');
        addEntity("prime", '\u2032');
        addEntity("Prime", '\u2033');
        addEntity("oline", '\u203E');
        addEntity("frasl", '\u2044');
        addEntity("weierp", '\u2118');
        addEntity("image", '\u2111');
        addEntity("real", '\u211C');
        addEntity("trade", '\u2122');
        addEntity("alefsym", '\u2135');
        addEntity("larr", '\u2190');
        addEntity("uarr", '\u2191');
        addEntity("rarr", '\u2192');
        addEntity("darr", '\u2193');
        addEntity("harr", '\u2194');
        addEntity("crarr", '\u21B5');
        addEntity("lArr", '\u21D0');
        addEntity("uArr", '\u21D1');
        addEntity("rArr", '\u21D2');
        addEntity("dArr", '\u21D3');
        addEntity("hArr", '\u21D4');
        addEntity("forall", '\u2200');
        addEntity("part", '\u2202');
        addEntity("exist", '\u2203');
        addEntity("empty", '\u2205');
        addEntity("nabla", '\u2207');
        addEntity("isin", '\u2208');
        addEntity("notin", '\u2209');
        addEntity("ni", '\u220B');
        addEntity("prod", '\u220F');
        addEntity("sum", '\u2211');
        addEntity("minus", '\u2212');
        addEntity("lowast", '\u2217');
        addEntity("radic", '\u221A');
        addEntity("prop", '\u221D');
        addEntity("infin", '\u221E');
        addEntity("ang", '\u2220');
        addEntity("and", '\u2227');
        addEntity("or", '\u2228');
        addEntity("cap", '\u2229');
        addEntity("cup", '\u222A');
        addEntity("int", '\u222B');
        addEntity("there4", '\u2234');
        addEntity("sim", '\u223C');
        addEntity("cong", '\u2245');
        addEntity("asymp", '\u2248');
        addEntity("ne", '\u2260');
        addEntity("equiv", '\u2261');
        addEntity("le", '\u2264');
        addEntity("ge", '\u2265');
        addEntity("sub", '\u2282');
        addEntity("sup", '\u2283');
        addEntity("nsub", '\u2284');
        addEntity("sube", '\u2286');
        addEntity("supe", '\u2287');
        addEntity("oplus", '\u2295');
        addEntity("otimes", '\u2297');
        addEntity("perp", '\u22A5');
        addEntity("sdot", '\u22C5');
        addEntity("lceil", '\u2308');
        addEntity("rceil", '\u2309');
        addEntity("lfloor", '\u230A');
        addEntity("rfloor", '\u230B');
        addEntity("lang", '\u2329');
        addEntity("rang", '\u232A');
        addEntity("loz", '\u25CA');
        addEntity("spades", '\u2660');
        addEntity("clubs", '\u2663');
        addEntity("hearts", '\u2665');
        addEntity("diams", '\u2666');
        addEntity("quot", '\u0022');
        addEntity("amp", '\u0026');
        addEntity("lt", '\u003C');
        addEntity("gt", '\u003E');
        addEntity("OElig", '\u0152');
        addEntity("oelig", '\u0153');
        addEntity("Scaron", '\u0160');
        addEntity("scaron", '\u0161');
        addEntity("Yuml", '\u0178');
        addEntity("circ", '\u02C6');
        addEntity("tilde", '\u02DC');
        addEntity("ensp", '\u2002');
        addEntity("emsp", '\u2003');
        addEntity("thinsp", '\u2009');
        addEntity("zwnj", '\u200C');
        addEntity("zwj", '\u200D');
        addEntity("lrm", '\u200E');
        addEntity("rlm", '\u200F');
        addEntity("ndash", '\u2013');
        addEntity("mdash", '\u2014');
        addEntity("lsquo", '\u2018');
        addEntity("rsquo", '\u2019');
        addEntity("sbquo", '\u201A');
        addEntity("ldquo", '\u201C');
        addEntity("rdquo", '\u201D');
        addEntity("bdquo", '\u201E');
        addEntity("dagger", '\u2020');
        addEntity("Dagger", '\u2021');
        addEntity("permil", '\u2030');
        addEntity("lsaquo", '\u2039');
        addEntity("rsaquo", '\u203A');
        addEntity("euro", '\u20AC');
    }

    /**
     * Registers one character entity in {@link #ESCAPE_STRINGS}.
     *
     * @param name the entity name without the leading ampersand or trailing semicolon
     * @param value the character the entity stands for
     */
    private static void addEntity(String name, char value) {
        ESCAPE_STRINGS.put("&" + name, value);
    }

    /**
     * Code to generate a short 'snippet' from either plain text or html text
     *
     * If the sync protocol can get plain text, that's great, but we'll still strip out extraneous
     * whitespace.  If it's HTML, we'll 1) strip out tags, 2) turn entities into the appropriate
     * characters, and 3) strip out extraneous whitespace, all in one pass
     *
     * Why not use an existing class?  The best answer is performance; yet another answer is
     * correctness (e.g. Html.textFromHtml simply doesn't generate well-stripped text).  But
     * performance is key; we frequently sync text that is 10K or (much) longer, yet we really only
     * care about a small amount of text for the snippet.  So it's critically important that we just
     * stop when we've gotten enough; existing methods will go through the entire incoming string,
     * at great (and useless, in this case) expense.
     */

    /**
     * Builds a snippet from HTML text; tags are stripped and entities are decoded.
     *
     * @param text the HTML text, possibly null or empty
     * @return the snippet, never null
     */
    public static String makeSnippetFromHtmlText(String text) {
        return makeSnippetFromText(text, true);
    }

    /**
     * Builds a snippet from plain text; only whitespace and repeated '-' / '=' are collapsed.
     *
     * @param text the plain text, possibly null or empty
     * @return the snippet, never null
     */
    public static String makeSnippetFromPlainText(String text) {
        return makeSnippetFromText(text, false);
    }

    /**
     * Find the end of this tag; there are two alternatives: <tag .../> or <tag ...> ... </tag>
     * @param htmlText some HTML text
     * @param tag the HTML tag
     * @param startPos the start position in the HTML text where the tag starts
     * @return the position just before the end of the tag or -1 if not found
     */
    /*package*/ static int findTagEnd(String htmlText, String tag, int startPos) {
        if (tag.endsWith(" ")) {
            tag = tag.substring(0, tag.length() - 1);
        }
        int length = htmlText.length();
        char prevChar = 0;
        for (int i = startPos; i < length; i++) {
            char c = htmlText.charAt(i);
            if (c == '>') {
                if (prevChar == '/') {
                    return i - 1;
                }
                break;
            }
            prevChar = c;
        }
        // We didn't find /> at the end of the tag so find </tag>
        return htmlText.indexOf("/" + tag, startPos);
    }

    /**
     * Whether the character following a '<' makes it the start of a tag; tags begin with
     * <! or <- or </ or <letter.
     */
    private static boolean isTagStart(char peek) {
        return peek == '!' || peek == '-' || peek == '/' || Character.isLetter(peek);
    }

    /**
     * Checks whether the tag opening at {@code pos} is one of {@link #STRIP_TAGS}, i.e. a tag
     * whose content is skipped as well.
     *
     * @param text the HTML text
     * @param pos the position of the '<' that opens the tag
     * @return the tag name exactly as written in {@code text}, or null if the tag is not a
     *         strip tag or there is too little text left to hold one
     */
    private static String findStripTag(String text, int pos) {
        if (pos >= (text.length() - (MAX_STRIP_TAG_LENGTH + 2))) {
            return null;
        }
        String tag = text.substring(pos + 1, pos + MAX_STRIP_TAG_LENGTH + 1);
        // Tag names are ASCII; a fixed locale keeps e.g. "TITLE" matching under a Turkish
        // default locale
        String tagLowerCase = tag.toLowerCase(Locale.US);
        for (String stripTag: STRIP_TAGS) {
            if (tagLowerCase.startsWith(stripTag)) {
                return tag.substring(0, stripTag.length());
            }
        }
        return null;
    }

    /**
     * Builds a snippet of at most MAX_SNIPPET_LENGTH characters from the start of the text.
     * Runs of whitespace (including non-breaking spaces) become a single space, runs of '-' or
     * '=' become a single character, and leading/trailing space is dropped.
     *
     * @param text the text to process, possibly null or empty
     * @param stripHtml whether to strip HTML tags (and the content of STRIP_TAGS) and decode
     *        character entities
     * @return the snippet, never null
     */
    public static String makeSnippetFromText(String text, boolean stripHtml) {
        // Handle null and empty string
        if (TextUtils.isEmpty(text)) return "";

        final int length = text.length();
        // Use char[] instead of StringBuilder purely for performance; fewer method calls, etc.
        char[] buffer = new char[MAX_SNIPPET_LENGTH];
        // skipCount is an array of a single int; that int is set inside stripHtmlEntity and is
        // used to determine how many characters can be "skipped" due to the transformation of the
        // entity to a single character.  When Java allows multiple return values, we can make this
        // much cleaner :-)
        int[] skipCount = new int[1];
        int bufferCount = 0;
        // Start with space as last character to avoid leading whitespace
        char last = ' ';
        // Indicates whether we're in the middle of an HTML tag
        boolean inTag = false;

        // Walk through the text until we're done with the input OR we've got a large enough snippet
        for (int i = 0; i < length && bufferCount < MAX_SNIPPET_LENGTH; i++) {
            char c = text.charAt(i);
            if (stripHtml && !inTag && (c == '<')) {
                // Find tags to strip
                if ((i < (length - 1)) && isTagStart(text.charAt(i + 1))) {
                    inTag = true;
                    // Strip content of title, script, style, applet and head tags
                    String tag = findStripTag(text, i);
                    if (tag != null) {
                        // Look for the end of this tag
                        int endTagPosition = findTagEnd(text, tag, i);
                        if (endTagPosition < 0) {
                            // Unterminated; nothing usable is left in the text
                            break;
                        }
                        i = endTagPosition;
                    }
                }
            } else if (stripHtml && inTag && (c == '>')) {
                // Terminate stripping here
                inTag = false;
                continue;
            }

            if (inTag) {
                // We just skip by everything while we're in a tag
                continue;
            } else if (stripHtml && (c == '&')) {
                // Handle a possible HTML entity here
                // We always get back a character to use; we also get back a "skip count",
                // indicating how many characters were eaten from the entity
                c = stripHtmlEntity(text, i, skipCount);
                i += skipCount[0];
            }

            if (Character.isWhitespace(c) || (c == NON_BREAKING_SPACE_CHARACTER)) {
                // The idea is to find the content in the message, not the whitespace, so we'll
                // turn any combination of contiguous whitespace into a single space
                if (last == ' ') {
                    continue;
                } else {
                    // Make every whitespace character a simple space
                    c = ' ';
                }
            } else if ((c == '-' || c == '=') && (last == c)) {
                // Lots of messages (especially digests) have whole lines of --- or ===
                // We'll get rid of those duplicates here
                continue;
            }

            // After all that, maybe we've got a character for our snippet
            buffer[bufferCount++] = c;
            last = c;
        }

        // Lose trailing space and return our snippet
        if ((bufferCount > 0) && (last == ' ')) {
            bufferCount--;
        }
        return new String(buffer, 0, bufferCount);
    }

    /**
     * Decodes the HTML entity (named or numeric) that may start at {@code pos}.
     *
     * @param text the text being processed
     * @param pos the position of the ampersand
     * @param skipCount a one-element array; element 0 receives the number of characters the
     *        caller must skip so that it lands on the entity's terminating semicolon, or 0 if
     *        no entity was recognized
     * @return the decoded character; '?' for a numeric entity that cannot be parsed or does not
     *         fit in a char; the ampersand itself if no entity was recognized
     */
    static /*package*/ char stripHtmlEntity(String text, int pos, int[] skipCount) {
        int length = text.length();
        // Ugly, but we store our skip count in this array; we can't use a static here, because
        // multiple threads might be calling in
        skipCount[0] = 0;
        // All entities are <= 8 characters long, so that's how far we'll look for one (+ & and ;)
        int end = pos + 10;
        String entity = null;
        // Isolate the entity
        for (int i = pos; (i < length) && (i < end); i++) {
            if (text.charAt(i) == ';') {
                entity = text.substring(pos, i);
                break;
            }
        }
        if (entity == null) {
            // This wasn't really an HTML entity
            return '&';
        } else {
            // Skip count is the length of the entity
            Character mapping = ESCAPE_STRINGS.get(entity);
            int entityLength = entity.length();
            if (mapping != null) {
                skipCount[0] = entityLength;
                return mapping;
            } else if ((entityLength > 2) && (entity.charAt(1) == '#')) {
                // &#nn; means ascii nn (decimal) and &#xnn means ascii nn (hex)
                char c = '?';
                try {
                    int i;
                    if ((entity.charAt(2) == 'x') && (entityLength > 3)) {
                        i = Integer.parseInt(entity.substring(3), 16);
                    } else {
                        i = Integer.parseInt(entity.substring(2));
                    }
                    // Only accept values that fit in a char; a blind cast would silently
                    // truncate larger (or negative) values into an unrelated character
                    if ((i >= Character.MIN_VALUE) && (i <= Character.MAX_VALUE)) {
                        c = (char)i;
                    }
                } catch (NumberFormatException e) {
                    // We'll just return the ? in this case
                }
                skipCount[0] = entityLength;
                return c;
            }
        }
        // Worst case, we return the original start character, ampersand
        return '&';
    }

    /**
     * Given a string of HTML text and a query containing any number of search terms, returns
     * an HTML string in which those search terms are highlighted (intended for use in a WebView)
     *
     * @param text the HTML text to process
     * @param query the search terms
     * @return HTML text with the search terms highlighted
     */
    @VisibleForTesting
    public static String highlightTermsInHtml(String text, String query) {
        try {
            return highlightTerms(text, query, true).toString();
        } catch (IOException e) {
            // Can't happen, but we must catch this
            return text;
        }
    }

    /**
     * Given a string of plain text and a query containing any number of search terms, returns
     * a CharSequence in which those search terms are highlighted (intended for use in a TextView)
     *
     * @param text the text to process
     * @param query the search terms
     * @return a CharSequence with the search terms highlighted
     */
    public static CharSequence highlightTermsInText(String text, String query) {
        try {
            return highlightTerms(text, query, false);
        } catch (IOException e) {
            // Can't happen, but we must catch this
            return text;
        }
    }

    /**
     * One search term of a query, together with the state of the match currently in progress
     * against the text being scanned.
     */
    static class SearchTerm {
        final String mTerm;
        final String mTermLowerCase;
        final int mLength;
        // Number of characters of the term matched so far
        int mMatchLength = 0;
        // Position in the text where the match in progress started, or -1 if there is none
        int mMatchStart = -1;

        SearchTerm(String term) {
            mTerm = term;
            // Text characters are lowercased with Character.toLowerCase(), which ignores the
            // default locale; use a fixed locale here so both sides agree (e.g. for 'I' under a
            // Turkish default locale)
            mTermLowerCase = term.toLowerCase(Locale.US);
            mLength = term.length();
        }
    }

    /**
     * Writes out the text held back by matches that are still in progress, then resets all
     * terms. Without this, a partial match that is pending when scanning stops would be dropped
     * from the output.
     *
     * @param sb the output being built
     * @param text the text being scanned
     * @param terms the search terms
     * @param lastOut the position of the last input character already copied to the output
     * @throws IOException as Appendable requires this
     */
    private static void flushPendingMatch(Appendable sb, String text,
            ArrayList<SearchTerm> terms, int lastOut) throws IOException {
        // The pending match that started first covers all the others
        SearchTerm earliest = null;
        for (SearchTerm t: terms) {
            if ((t.mMatchStart >= 0) &&
                    ((earliest == null) || (t.mMatchStart < earliest.mMatchStart))) {
                earliest = t;
            }
        }
        if (earliest != null) {
            int from = Math.max(earliest.mMatchStart, lastOut + 1);
            int to = Math.min(earliest.mMatchStart + earliest.mMatchLength, text.length());
            if (from < to) {
                sb.append(text.substring(from, to));
            }
        }
        for (SearchTerm t: terms) {
            t.mMatchLength = 0;
            t.mMatchStart = -1;
        }
    }

    /**
     * Generate a version of the incoming text in which all search terms in a query are highlighted.
     * If the input is HTML, we return a StringBuilder with additional markup as required
     * If the input is text, we return a SpannableStringBuilder with additional spans as required
     *
     * NOTE(review): terms that overlap each other, and (for HTML) matches that span a tag, go
     * through the bookkeeping below in ways that look fragile; verify before relying on them.
     *
     * @param text the text to be processed
     * @param query the query, which can contain multiple terms separated by whitespace
     * @param html whether or not the text to be processed is HTML
     * @return highlighted text
     *
     * @throws IOException as Appendable requires this
     */
    public static CharSequence highlightTerms(String text, String query, boolean html)
            throws IOException {
        // Handle null and empty string
        if (TextUtils.isEmpty(text)) return "";
        final int length = text.length();

        // Break up the query into search terms
        ArrayList<SearchTerm> terms = new ArrayList<SearchTerm>();
        if (query != null) {
            StringTokenizer st = new StringTokenizer(query);
            while (st.hasMoreTokens()) {
                terms.add(new SearchTerm(st.nextToken()));
            }
        }

        // Our appendable depends on whether we're building HTML text (for webview) or spannable
        // text (for UI)
        final Appendable sb = html ? new StringBuilder() : new SpannableStringBuilder();
        // Indicates whether we're in the middle of an HTML tag
        boolean inTag = false;
        // The position of the last input character copied to output
        int lastOut = -1;

        // Walk through the text until we're done with the input
        // Just copy any HTML tags directly into the output; search for terms in the remaining text
        for (int i = 0; i < length; i++) {
            char chr = text.charAt(i);
            if (html) {
                if (!inTag && (chr == '<')) {
                    // Find tags
                    if ((i < (length - 1)) && isTagStart(text.charAt(i + 1))) {
                        inTag = true;
                        // Skip content of title, script, style, applet and head tags
                        String tag = findStripTag(text, i);
                        if (tag != null) {
                            // Look for the end of this tag
                            int endTagPosition = findTagEnd(text, tag, i);
                            if (endTagPosition < 0) {
                                // Unterminated; copy the rest verbatim, but first write out any
                                // text still held back by a partial match
                                flushPendingMatch(sb, text, terms, lastOut);
                                sb.append(text.substring(i));
                                break;
                            } else {
                                sb.append(text.substring(i, endTagPosition - 1));
                                i = endTagPosition - 1;
                                chr = text.charAt(i);
                            }
                        }
                    }
                } else if (inTag && (chr == '>')) {
                    inTag = false;
                }

                if (inTag) {
                    sb.append(chr);
                    continue;
                }
            }

            // After all that, we've got some "body" text
            char chrLowerCase = Character.toLowerCase(chr);
            // Whether or not the current character should be appended to the output; we inhibit
            // this while any search terms match
            boolean appendNow = true;
            // Look through search terms for matches
            for (SearchTerm t: terms) {
                if (chrLowerCase == t.mTermLowerCase.charAt(t.mMatchLength)) {
                    if (t.mMatchLength++ == 0) {
                        // New match start
                        t.mMatchStart = i;
                    }
                    if (t.mMatchLength == t.mLength) {
                        String matchText = text.substring(t.mMatchStart, t.mMatchStart + t.mLength);
                        // Completed match; add highlight and reset term
                        if (t.mMatchStart <= lastOut) {
                            // Part of the match is already in the output; highlight the rest
                            matchText = text.substring(lastOut + 1, i + 1);
                        }
                        if (matchText.length() > 0) {
                            if (html) {
                                sb.append("<span style=\"background-color: " + HIGHLIGHT_COLOR_STRING +
                                        "\">");
                                sb.append(matchText);
                                sb.append("</span>");
                            } else {
                                SpannableString highlightSpan = new SpannableString(matchText);
                                highlightSpan.setSpan(new BackgroundColorSpan(HIGHLIGHT_COLOR_INT), 0,
                                        highlightSpan.length(), Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
                                sb.append(highlightSpan);
                            }
                        }
                        lastOut = t.mMatchStart + t.mLength - 1;
                        t.mMatchLength = 0;
                        t.mMatchStart = -1;
                    }
                    appendNow = false;
                } else {
                    if (t.mMatchStart >= 0) {
                        // We're no longer matching; check for other matches in progress
                        int leastOtherStart = -1;
                        for (SearchTerm ot: terms) {
                            // Save away the lowest match start for other search terms
                            if ((ot != t) && (ot.mMatchStart >= 0) && ((leastOtherStart < 0) ||
                                    (ot.mMatchStart <= leastOtherStart))) {
                                leastOtherStart = ot.mMatchStart;
                            }
                        }
                        int matchEnd = t.mMatchStart + t.mMatchLength;
                        if (leastOtherStart < 0 || leastOtherStart > matchEnd) {
                            // Append the whole thing
                            if (t.mMatchStart > lastOut) {
                                sb.append(text.substring(t.mMatchStart, matchEnd));
                                // substring() excludes matchEnd, so the last character copied is
                                // the one before it
                                lastOut = matchEnd - 1;
                            }
                        } else if (leastOtherStart == t.mMatchStart) {
                            // Ok to append the current char
                        } else if (leastOtherStart < t.mMatchStart) {
                            // We're already covered by another search term, so don't append
                            appendNow = false;
                        } else if (t.mMatchStart > lastOut) {
                            // Append the piece of our term that's not already covered
                            sb.append(text.substring(t.mMatchStart, leastOtherStart));
                            // The character at leastOtherStart still belongs to the other term
                            lastOut = leastOtherStart - 1;
                        }
                    }
                    // Reset this term
                    t.mMatchLength = 0;
                    t.mMatchStart = -1;
                }
            }

            if (appendNow) {
                sb.append(chr);
                lastOut = i;
            }
        }

        // The input may end in the middle of a match; don't lose the text that was held back
        flushPendingMatch(sb, text, terms, lastOut);

        return (CharSequence)sb;
    }

    /**
     * Determine whether two Strings (either of which might be null) are the same; this is true
     * when both are null or both are Strings that are equal.
     *
     * @param a the first string, possibly null
     * @param b the second string, possibly null
     * @return whether both are null or both are equal strings
     */
    public static boolean stringOrNullEquals(String a, String b) {
        return (a == null) ? (b == null) : a.equals(b);
    }

}