Home | History | Annotate | Download | only in utility
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.emailcommon.utility;
     18 
     19 import com.google.common.annotations.VisibleForTesting;
     20 
     21 import android.graphics.Color;
     22 import android.text.Spannable;
     23 import android.text.SpannableString;
     24 import android.text.SpannableStringBuilder;
     25 import android.text.TextUtils;
     26 import android.text.style.BackgroundColorSpan;
     27 
     28 import java.io.IOException;
     29 import java.util.ArrayList;
     30 import java.util.HashMap;
     31 import java.util.Map;
     32 import java.util.StringTokenizer;
     33 
     34 public class TextUtilities {
     35     // Highlight color is yellow, as in other apps.
     36     // TODO Push for this to be a global (style-related?) constant
     37     public static final int HIGHLIGHT_COLOR_INT = Color.YELLOW;
     38     // We AND off the "alpha" from the color (i.e. 0xFFFFFF00 -> 0x00FFFF00)
     39     /*package*/ static final String HIGHLIGHT_COLOR_STRING =
     40         '#' + Integer.toHexString(HIGHLIGHT_COLOR_INT & 0x00FFFFFF);
     41 
     42     // This is how many chars we'll allow in a snippet
     43     private static final int MAX_SNIPPET_LENGTH = 200;
     44     // For some reason, isWhitespace() returns false with the following...
     45     /*package*/ static final char NON_BREAKING_SPACE_CHARACTER = (char)160;
     46 
     47     // Tags whose content must be stripped as well
     48     static final String[] STRIP_TAGS =
     49         new String[] {"title", "script", "style", "applet", "head"};
     50     // The number of characters we peel off for testing against STRIP_TAGS; this should be the
     51     // maximum size of the strings in STRIP_TAGS
     52     static final int MAX_STRIP_TAG_LENGTH = 6;
     53 
     54     static final Map<String, Character> ESCAPE_STRINGS;
     55     static {
     56         // HTML character entity references as defined in HTML 4
     57         // see http://www.w3.org/TR/REC-html40/sgml/entities.html
     58         ESCAPE_STRINGS = new HashMap<String, Character>(252);
     59 
     60         ESCAPE_STRINGS.put("&nbsp", '\u00A0');
     61         ESCAPE_STRINGS.put("&iexcl", '\u00A1');
     62         ESCAPE_STRINGS.put("&cent", '\u00A2');
     63         ESCAPE_STRINGS.put("&pound", '\u00A3');
     64         ESCAPE_STRINGS.put("&curren", '\u00A4');
     65         ESCAPE_STRINGS.put("&yen", '\u00A5');
     66         ESCAPE_STRINGS.put("&brvbar", '\u00A6');
     67         ESCAPE_STRINGS.put("&sect", '\u00A7');
     68         ESCAPE_STRINGS.put("&uml", '\u00A8');
     69         ESCAPE_STRINGS.put("&copy", '\u00A9');
     70         ESCAPE_STRINGS.put("&ordf", '\u00AA');
     71         ESCAPE_STRINGS.put("&laquo", '\u00AB');
     72         ESCAPE_STRINGS.put("&not", '\u00AC');
     73         ESCAPE_STRINGS.put("&shy", '\u00AD');
     74         ESCAPE_STRINGS.put("&reg", '\u00AE');
     75         ESCAPE_STRINGS.put("&macr", '\u00AF');
     76         ESCAPE_STRINGS.put("&deg", '\u00B0');
     77         ESCAPE_STRINGS.put("&plusmn", '\u00B1');
     78         ESCAPE_STRINGS.put("&sup2", '\u00B2');
     79         ESCAPE_STRINGS.put("&sup3", '\u00B3');
     80         ESCAPE_STRINGS.put("&acute", '\u00B4');
     81         ESCAPE_STRINGS.put("&micro", '\u00B5');
     82         ESCAPE_STRINGS.put("&para", '\u00B6');
     83         ESCAPE_STRINGS.put("&middot", '\u00B7');
     84         ESCAPE_STRINGS.put("&cedil", '\u00B8');
     85         ESCAPE_STRINGS.put("&sup1", '\u00B9');
     86         ESCAPE_STRINGS.put("&ordm", '\u00BA');
     87         ESCAPE_STRINGS.put("&raquo", '\u00BB');
     88         ESCAPE_STRINGS.put("&frac14", '\u00BC');
     89         ESCAPE_STRINGS.put("&frac12", '\u00BD');
     90         ESCAPE_STRINGS.put("&frac34", '\u00BE');
     91         ESCAPE_STRINGS.put("&iquest", '\u00BF');
     92         ESCAPE_STRINGS.put("&Agrave", '\u00C0');
     93         ESCAPE_STRINGS.put("&Aacute", '\u00C1');
     94         ESCAPE_STRINGS.put("&Acirc", '\u00C2');
     95         ESCAPE_STRINGS.put("&Atilde", '\u00C3');
     96         ESCAPE_STRINGS.put("&Auml", '\u00C4');
     97         ESCAPE_STRINGS.put("&Aring", '\u00C5');
     98         ESCAPE_STRINGS.put("&AElig", '\u00C6');
     99         ESCAPE_STRINGS.put("&Ccedil", '\u00C7');
    100         ESCAPE_STRINGS.put("&Egrave", '\u00C8');
    101         ESCAPE_STRINGS.put("&Eacute", '\u00C9');
    102         ESCAPE_STRINGS.put("&Ecirc", '\u00CA');
    103         ESCAPE_STRINGS.put("&Euml", '\u00CB');
    104         ESCAPE_STRINGS.put("&Igrave", '\u00CC');
    105         ESCAPE_STRINGS.put("&Iacute", '\u00CD');
    106         ESCAPE_STRINGS.put("&Icirc", '\u00CE');
    107         ESCAPE_STRINGS.put("&Iuml", '\u00CF');
    108         ESCAPE_STRINGS.put("&ETH", '\u00D0');
    109         ESCAPE_STRINGS.put("&Ntilde", '\u00D1');
    110         ESCAPE_STRINGS.put("&Ograve", '\u00D2');
    111         ESCAPE_STRINGS.put("&Oacute", '\u00D3');
    112         ESCAPE_STRINGS.put("&Ocirc", '\u00D4');
    113         ESCAPE_STRINGS.put("&Otilde", '\u00D5');
    114         ESCAPE_STRINGS.put("&Ouml", '\u00D6');
    115         ESCAPE_STRINGS.put("&times", '\u00D7');
    116         ESCAPE_STRINGS.put("&Oslash", '\u00D8');
    117         ESCAPE_STRINGS.put("&Ugrave", '\u00D9');
    118         ESCAPE_STRINGS.put("&Uacute", '\u00DA');
    119         ESCAPE_STRINGS.put("&Ucirc", '\u00DB');
    120         ESCAPE_STRINGS.put("&Uuml", '\u00DC');
    121         ESCAPE_STRINGS.put("&Yacute", '\u00DD');
    122         ESCAPE_STRINGS.put("&THORN", '\u00DE');
    123         ESCAPE_STRINGS.put("&szlig", '\u00DF');
    124         ESCAPE_STRINGS.put("&agrave", '\u00E0');
    125         ESCAPE_STRINGS.put("&aacute", '\u00E1');
    126         ESCAPE_STRINGS.put("&acirc", '\u00E2');
    127         ESCAPE_STRINGS.put("&atilde", '\u00E3');
    128         ESCAPE_STRINGS.put("&auml", '\u00E4');
    129         ESCAPE_STRINGS.put("&aring", '\u00E5');
    130         ESCAPE_STRINGS.put("&aelig", '\u00E6');
    131         ESCAPE_STRINGS.put("&ccedil", '\u00E7');
    132         ESCAPE_STRINGS.put("&egrave", '\u00E8');
    133         ESCAPE_STRINGS.put("&eacute", '\u00E9');
    134         ESCAPE_STRINGS.put("&ecirc", '\u00EA');
    135         ESCAPE_STRINGS.put("&euml", '\u00EB');
    136         ESCAPE_STRINGS.put("&igrave", '\u00EC');
    137         ESCAPE_STRINGS.put("&iacute", '\u00ED');
    138         ESCAPE_STRINGS.put("&icirc", '\u00EE');
    139         ESCAPE_STRINGS.put("&iuml", '\u00EF');
    140         ESCAPE_STRINGS.put("&eth", '\u00F0');
    141         ESCAPE_STRINGS.put("&ntilde", '\u00F1');
    142         ESCAPE_STRINGS.put("&ograve", '\u00F2');
    143         ESCAPE_STRINGS.put("&oacute", '\u00F3');
    144         ESCAPE_STRINGS.put("&ocirc", '\u00F4');
    145         ESCAPE_STRINGS.put("&otilde", '\u00F5');
    146         ESCAPE_STRINGS.put("&ouml", '\u00F6');
    147         ESCAPE_STRINGS.put("&divide", '\u00F7');
    148         ESCAPE_STRINGS.put("&oslash", '\u00F8');
    149         ESCAPE_STRINGS.put("&ugrave", '\u00F9');
    150         ESCAPE_STRINGS.put("&uacute", '\u00FA');
    151         ESCAPE_STRINGS.put("&ucirc", '\u00FB');
    152         ESCAPE_STRINGS.put("&uuml", '\u00FC');
    153         ESCAPE_STRINGS.put("&yacute", '\u00FD');
    154         ESCAPE_STRINGS.put("&thorn", '\u00FE');
    155         ESCAPE_STRINGS.put("&yuml", '\u00FF');
    156         ESCAPE_STRINGS.put("&fnof", '\u0192');
    157         ESCAPE_STRINGS.put("&Alpha", '\u0391');
    158         ESCAPE_STRINGS.put("&Beta", '\u0392');
    159         ESCAPE_STRINGS.put("&Gamma", '\u0393');
    160         ESCAPE_STRINGS.put("&Delta", '\u0394');
    161         ESCAPE_STRINGS.put("&Epsilon", '\u0395');
    162         ESCAPE_STRINGS.put("&Zeta", '\u0396');
    163         ESCAPE_STRINGS.put("&Eta", '\u0397');
    164         ESCAPE_STRINGS.put("&Theta", '\u0398');
    165         ESCAPE_STRINGS.put("&Iota", '\u0399');
    166         ESCAPE_STRINGS.put("&Kappa", '\u039A');
    167         ESCAPE_STRINGS.put("&Lambda", '\u039B');
    168         ESCAPE_STRINGS.put("&Mu", '\u039C');
    169         ESCAPE_STRINGS.put("&Nu", '\u039D');
    170         ESCAPE_STRINGS.put("&Xi", '\u039E');
    171         ESCAPE_STRINGS.put("&Omicron", '\u039F');
    172         ESCAPE_STRINGS.put("&Pi", '\u03A0');
    173         ESCAPE_STRINGS.put("&Rho", '\u03A1');
    174         ESCAPE_STRINGS.put("&Sigma", '\u03A3');
    175         ESCAPE_STRINGS.put("&Tau", '\u03A4');
    176         ESCAPE_STRINGS.put("&Upsilon", '\u03A5');
    177         ESCAPE_STRINGS.put("&Phi", '\u03A6');
    178         ESCAPE_STRINGS.put("&Chi", '\u03A7');
    179         ESCAPE_STRINGS.put("&Psi", '\u03A8');
    180         ESCAPE_STRINGS.put("&Omega", '\u03A9');
    181         ESCAPE_STRINGS.put("&alpha", '\u03B1');
    182         ESCAPE_STRINGS.put("&beta", '\u03B2');
    183         ESCAPE_STRINGS.put("&gamma", '\u03B3');
    184         ESCAPE_STRINGS.put("&delta", '\u03B4');
    185         ESCAPE_STRINGS.put("&epsilon", '\u03B5');
    186         ESCAPE_STRINGS.put("&zeta", '\u03B6');
    187         ESCAPE_STRINGS.put("&eta", '\u03B7');
    188         ESCAPE_STRINGS.put("&theta", '\u03B8');
    189         ESCAPE_STRINGS.put("&iota", '\u03B9');
    190         ESCAPE_STRINGS.put("&kappa", '\u03BA');
    191         ESCAPE_STRINGS.put("&lambda", '\u03BB');
    192         ESCAPE_STRINGS.put("&mu", '\u03BC');
    193         ESCAPE_STRINGS.put("&nu", '\u03BD');
    194         ESCAPE_STRINGS.put("&xi", '\u03BE');
    195         ESCAPE_STRINGS.put("&omicron", '\u03BF');
    196         ESCAPE_STRINGS.put("&pi", '\u03C0');
    197         ESCAPE_STRINGS.put("&rho", '\u03C1');
    198         ESCAPE_STRINGS.put("&sigmaf", '\u03C2');
    199         ESCAPE_STRINGS.put("&sigma", '\u03C3');
    200         ESCAPE_STRINGS.put("&tau", '\u03C4');
    201         ESCAPE_STRINGS.put("&upsilon", '\u03C5');
    202         ESCAPE_STRINGS.put("&phi", '\u03C6');
    203         ESCAPE_STRINGS.put("&chi", '\u03C7');
    204         ESCAPE_STRINGS.put("&psi", '\u03C8');
    205         ESCAPE_STRINGS.put("&omega", '\u03C9');
    206         ESCAPE_STRINGS.put("&thetasym", '\u03D1');
    207         ESCAPE_STRINGS.put("&upsih", '\u03D2');
    208         ESCAPE_STRINGS.put("&piv", '\u03D6');
    209         ESCAPE_STRINGS.put("&bull", '\u2022');
    210         ESCAPE_STRINGS.put("&hellip", '\u2026');
    211         ESCAPE_STRINGS.put("&prime", '\u2032');
    212         ESCAPE_STRINGS.put("&Prime", '\u2033');
    213         ESCAPE_STRINGS.put("&oline", '\u203E');
    214         ESCAPE_STRINGS.put("&frasl", '\u2044');
    215         ESCAPE_STRINGS.put("&weierp", '\u2118');
    216         ESCAPE_STRINGS.put("&image", '\u2111');
    217         ESCAPE_STRINGS.put("&real", '\u211C');
    218         ESCAPE_STRINGS.put("&trade", '\u2122');
    219         ESCAPE_STRINGS.put("&alefsym", '\u2135');
    220         ESCAPE_STRINGS.put("&larr", '\u2190');
    221         ESCAPE_STRINGS.put("&uarr", '\u2191');
    222         ESCAPE_STRINGS.put("&rarr", '\u2192');
    223         ESCAPE_STRINGS.put("&darr", '\u2193');
    224         ESCAPE_STRINGS.put("&harr", '\u2194');
    225         ESCAPE_STRINGS.put("&crarr", '\u21B5');
    226         ESCAPE_STRINGS.put("&lArr", '\u21D0');
    227         ESCAPE_STRINGS.put("&uArr", '\u21D1');
    228         ESCAPE_STRINGS.put("&rArr", '\u21D2');
    229         ESCAPE_STRINGS.put("&dArr", '\u21D3');
    230         ESCAPE_STRINGS.put("&hArr", '\u21D4');
    231         ESCAPE_STRINGS.put("&forall", '\u2200');
    232         ESCAPE_STRINGS.put("&part", '\u2202');
    233         ESCAPE_STRINGS.put("&exist", '\u2203');
    234         ESCAPE_STRINGS.put("&empty", '\u2205');
    235         ESCAPE_STRINGS.put("&nabla", '\u2207');
    236         ESCAPE_STRINGS.put("&isin", '\u2208');
    237         ESCAPE_STRINGS.put("&notin", '\u2209');
    238         ESCAPE_STRINGS.put("&ni", '\u220B');
    239         ESCAPE_STRINGS.put("&prod", '\u220F');
    240         ESCAPE_STRINGS.put("&sum", '\u2211');
    241         ESCAPE_STRINGS.put("&minus", '\u2212');
    242         ESCAPE_STRINGS.put("&lowast", '\u2217');
    243         ESCAPE_STRINGS.put("&radic", '\u221A');
    244         ESCAPE_STRINGS.put("&prop", '\u221D');
    245         ESCAPE_STRINGS.put("&infin", '\u221E');
    246         ESCAPE_STRINGS.put("&ang", '\u2220');
    247         ESCAPE_STRINGS.put("&and", '\u2227');
    248         ESCAPE_STRINGS.put("&or", '\u2228');
    249         ESCAPE_STRINGS.put("&cap", '\u2229');
    250         ESCAPE_STRINGS.put("&cup", '\u222A');
    251         ESCAPE_STRINGS.put("&int", '\u222B');
    252         ESCAPE_STRINGS.put("&there4", '\u2234');
    253         ESCAPE_STRINGS.put("&sim", '\u223C');
    254         ESCAPE_STRINGS.put("&cong", '\u2245');
    255         ESCAPE_STRINGS.put("&asymp", '\u2248');
    256         ESCAPE_STRINGS.put("&ne", '\u2260');
    257         ESCAPE_STRINGS.put("&equiv", '\u2261');
    258         ESCAPE_STRINGS.put("&le", '\u2264');
    259         ESCAPE_STRINGS.put("&ge", '\u2265');
    260         ESCAPE_STRINGS.put("&sub", '\u2282');
    261         ESCAPE_STRINGS.put("&sup", '\u2283');
    262         ESCAPE_STRINGS.put("&nsub", '\u2284');
    263         ESCAPE_STRINGS.put("&sube", '\u2286');
    264         ESCAPE_STRINGS.put("&supe", '\u2287');
    265         ESCAPE_STRINGS.put("&oplus", '\u2295');
    266         ESCAPE_STRINGS.put("&otimes", '\u2297');
    267         ESCAPE_STRINGS.put("&perp", '\u22A5');
    268         ESCAPE_STRINGS.put("&sdot", '\u22C5');
    269         ESCAPE_STRINGS.put("&lceil", '\u2308');
    270         ESCAPE_STRINGS.put("&rceil", '\u2309');
    271         ESCAPE_STRINGS.put("&lfloor", '\u230A');
    272         ESCAPE_STRINGS.put("&rfloor", '\u230B');
    273         ESCAPE_STRINGS.put("&lang", '\u2329');
    274         ESCAPE_STRINGS.put("&rang", '\u232A');
    275         ESCAPE_STRINGS.put("&loz", '\u25CA');
    276         ESCAPE_STRINGS.put("&spades", '\u2660');
    277         ESCAPE_STRINGS.put("&clubs", '\u2663');
    278         ESCAPE_STRINGS.put("&hearts", '\u2665');
    279         ESCAPE_STRINGS.put("&diams", '\u2666');
    280         ESCAPE_STRINGS.put("&quot", '\u0022');
    281         ESCAPE_STRINGS.put("&amp", '\u0026');
    282         ESCAPE_STRINGS.put("&lt", '\u003C');
    283         ESCAPE_STRINGS.put("&gt", '\u003E');
    284         ESCAPE_STRINGS.put("&OElig", '\u0152');
    285         ESCAPE_STRINGS.put("&oelig", '\u0153');
    286         ESCAPE_STRINGS.put("&Scaron", '\u0160');
    287         ESCAPE_STRINGS.put("&scaron", '\u0161');
    288         ESCAPE_STRINGS.put("&Yuml", '\u0178');
    289         ESCAPE_STRINGS.put("&circ", '\u02C6');
    290         ESCAPE_STRINGS.put("&tilde", '\u02DC');
    291         ESCAPE_STRINGS.put("&ensp", '\u2002');
    292         ESCAPE_STRINGS.put("&emsp", '\u2003');
    293         ESCAPE_STRINGS.put("&thinsp", '\u2009');
    294         ESCAPE_STRINGS.put("&zwnj", '\u200C');
    295         ESCAPE_STRINGS.put("&zwj", '\u200D');
    296         ESCAPE_STRINGS.put("&lrm", '\u200E');
    297         ESCAPE_STRINGS.put("&rlm", '\u200F');
    298         ESCAPE_STRINGS.put("&ndash", '\u2013');
    299         ESCAPE_STRINGS.put("&mdash", '\u2014');
    300         ESCAPE_STRINGS.put("&lsquo", '\u2018');
    301         ESCAPE_STRINGS.put("&rsquo", '\u2019');
    302         ESCAPE_STRINGS.put("&sbquo", '\u201A');
    303         ESCAPE_STRINGS.put("&ldquo", '\u201C');
    304         ESCAPE_STRINGS.put("&rdquo", '\u201D');
    305         ESCAPE_STRINGS.put("&bdquo", '\u201E');
    306         ESCAPE_STRINGS.put("&dagger", '\u2020');
    307         ESCAPE_STRINGS.put("&Dagger", '\u2021');
    308         ESCAPE_STRINGS.put("&permil", '\u2030');
    309         ESCAPE_STRINGS.put("&lsaquo", '\u2039');
    310         ESCAPE_STRINGS.put("&rsaquo", '\u203A');
    311         ESCAPE_STRINGS.put("&euro", '\u20AC');
    312     }
    313 
    314     /**
    315      * Code to generate a short 'snippet' from either plain text or html text
    316      *
    317      * If the sync protocol can get plain text, that's great, but we'll still strip out extraneous
    318      * whitespace.  If it's HTML, we'll 1) strip out tags, 2) turn entities into the appropriate
    319      * characters, and 3) strip out extraneous whitespace, all in one pass
    320      *
    321      * Why not use an existing class?  The best answer is performance; yet another answer is
    322      * correctness (e.g. Html.textFromHtml simply doesn't generate well-stripped text).  But
    323      * performance is key; we frequently sync text that is 10K or (much) longer, yet we really only
    324      * care about a small amount of text for the snippet.  So it's critically important that we just
    325      * stop when we've gotten enough; existing methods that exist will go through the entire
    326      * incoming string, at great (and useless, in this case) expense.
    327      */
    328 
    329     public static String makeSnippetFromHtmlText(String text) {
    330         return makeSnippetFromText(text, true);
    331     }
    332 
    333     public static String makeSnippetFromPlainText(String text) {
    334         return makeSnippetFromText(text, false);
    335     }
    336 
    337     /**
    338      * Find the end of this tag; there are two alternatives: <tag .../> or <tag ...> ... </tag>
    339      * @param htmlText some HTML text
    340      * @param tag the HTML tag
    341      * @param startPos the start position in the HTML text where the tag starts
    342      * @return the position just before the end of the tag or -1 if not found
    343      */
    344     /*package*/ static int findTagEnd(String htmlText, String tag, int startPos) {
    345         if (tag.endsWith(" ")) {
    346             tag = tag.substring(0, tag.length() - 1);
    347         }
    348         int length = htmlText.length();
    349         char prevChar = 0;
    350         for (int i = startPos; i < length; i++) {
    351             char c = htmlText.charAt(i);
    352             if (c == '>') {
    353                if (prevChar == '/') {
    354                    return i - 1;
    355                }
    356                break;
    357             }
    358             prevChar = c;
    359         }
    360         // We didn't find /> at the end of the tag so find </tag>
    361         return htmlText.indexOf("/" + tag, startPos);
    362     }
    363 
    364     public static String makeSnippetFromText(String text, boolean stripHtml) {
    365         // Handle null and empty string
    366         if (TextUtils.isEmpty(text)) return "";
    367 
    368         final int length = text.length();
    369         // Use char[] instead of StringBuilder purely for performance; fewer method calls, etc.
    370         char[] buffer = new char[MAX_SNIPPET_LENGTH];
    371         // skipCount is an array of a single int; that int is set inside stripHtmlEntity and is
    372         // used to determine how many characters can be "skipped" due to the transformation of the
    373         // entity to a single character.  When Java allows multiple return values, we can make this
    374         // much cleaner :-)
    375         int[] skipCount = new int[1];
    376         int bufferCount = 0;
    377         // Start with space as last character to avoid leading whitespace
    378         char last = ' ';
    379         // Indicates whether we're in the middle of an HTML tag
    380         boolean inTag = false;
    381 
    382         // Walk through the text until we're done with the input OR we've got a large enough snippet
    383         for (int i = 0; i < length && bufferCount < MAX_SNIPPET_LENGTH; i++) {
    384             char c = text.charAt(i);
    385             if (stripHtml && !inTag && (c == '<')) {
    386                 // Find tags to strip; they will begin with <! or !- or </ or <letter
    387                 if (i < (length - 1)) {
    388                     char peek = text.charAt(i + 1);
    389                     if (peek == '!' || peek == '-' || peek == '/' || Character.isLetter(peek)) {
    390                         inTag = true;
    391                         // Strip content of title, script, style and applet tags
    392                         if (i < (length - (MAX_STRIP_TAG_LENGTH + 2))) {
    393                             String tag = text.substring(i + 1, i + MAX_STRIP_TAG_LENGTH + 1);
    394                             String tagLowerCase = tag.toLowerCase();
    395                             boolean stripContent = false;
    396                             for (String stripTag: STRIP_TAGS) {
    397                                 if (tagLowerCase.startsWith(stripTag)) {
    398                                     stripContent = true;
    399                                     tag = tag.substring(0, stripTag.length());
    400                                     break;
    401                                 }
    402                             }
    403                             if (stripContent) {
    404                                 // Look for the end of this tag
    405                                 int endTagPosition = findTagEnd(text, tag, i);
    406                                 if (endTagPosition < 0) {
    407                                     break;
    408                                 } else {
    409                                     i = endTagPosition;
    410                                 }
    411                             }
    412                         }
    413                     }
    414                 }
    415             } else if (stripHtml && inTag && (c == '>')) {
    416                 // Terminate stripping here
    417                 inTag = false;
    418                 continue;
    419             }
    420 
    421             if (inTag) {
    422                 // We just skip by everything while we're in a tag
    423                 continue;
    424             } else if (stripHtml && (c == '&')) {
    425                 // Handle a possible HTML entity here
    426                 // We always get back a character to use; we also get back a "skip count",
    427                 // indicating how many characters were eaten from the entity
    428                 c = stripHtmlEntity(text, i, skipCount);
    429                 i += skipCount[0];
    430             }
    431 
    432             if (Character.isWhitespace(c) || (c == NON_BREAKING_SPACE_CHARACTER)) {
    433                 // The idea is to find the content in the message, not the whitespace, so we'll
    434                 // turn any combination of contiguous whitespace into a single space
    435                 if (last == ' ') {
    436                     continue;
    437                 } else {
    438                     // Make every whitespace character a simple space
    439                     c = ' ';
    440                 }
    441             } else if ((c == '-' || c == '=') && (last == c)) {
    442                 // Lots of messages (especially digests) have whole lines of --- or ===
    443                 // We'll get rid of those duplicates here
    444                 continue;
    445             }
    446 
    447             // After all that, maybe we've got a character for our snippet
    448             buffer[bufferCount++] = c;
    449             last = c;
    450         }
    451 
    452         // Lose trailing space and return our snippet
    453         if ((bufferCount > 0) && (last == ' ')) {
    454             bufferCount--;
    455         }
    456         return new String(buffer, 0, bufferCount);
    457     }
    458 
    459     static /*package*/ char stripHtmlEntity(String text, int pos, int[] skipCount) {
    460         int length = text.length();
    461         // Ugly, but we store our skip count in this array; we can't use a static here, because
    462         // multiple threads might be calling in
    463         skipCount[0] = 0;
    464         // All entities are <= 8 characters long, so that's how far we'll look for one (+ & and ;)
    465         int end = pos + 10;
    466         String entity = null;
    467         // Isolate the entity
    468         for (int i = pos; (i < length) && (i < end); i++) {
    469             if (text.charAt(i) == ';') {
    470                 entity = text.substring(pos, i);
    471                 break;
    472             }
    473         }
    474         if (entity == null) {
    475             // This wasn't really an HTML entity
    476             return '&';
    477         } else {
    478             // Skip count is the length of the entity
    479             Character mapping = ESCAPE_STRINGS.get(entity);
    480             int entityLength = entity.length();
    481             if (mapping != null) {
    482                 skipCount[0] = entityLength;
    483                 return mapping;
    484             } else if ((entityLength > 2) && (entity.charAt(1) == '#')) {
    485                 // &#nn; means ascii nn (decimal) and &#xnn means ascii nn (hex)
    486                 char c = '?';
    487                 try {
    488                     int i;
    489                     if ((entity.charAt(2) == 'x') && (entityLength > 3)) {
    490                         i = Integer.parseInt(entity.substring(3), 16);
    491                     } else {
    492                         i = Integer.parseInt(entity.substring(2));
    493                     }
    494                     c = (char)i;
    495                 } catch (NumberFormatException e) {
    496                     // We'll just return the ? in this case
    497                 }
    498                 skipCount[0] = entityLength;
    499                 return c;
    500             }
    501         }
    502         // Worst case, we return the original start character, ampersand
    503         return '&';
    504     }
    505 
    506     /**
    507      * Given a string of HTML text and a query containing any number of search terms, returns
    508      * an HTML string in which those search terms are highlighted (intended for use in a WebView)
    509      *
    510      * @param text the HTML text to process
    511      * @param query the search terms
    512      * @return HTML text with the search terms highlighted
    513      */
    514     @VisibleForTesting
    515     public static String highlightTermsInHtml(String text, String query) {
    516         try {
    517             return highlightTerms(text, query, true).toString();
    518         } catch (IOException e) {
    519             // Can't happen, but we must catch this
    520             return text;
    521         }
    522     }
    523 
    524     /**
    525      * Given a string of plain text and a query containing any number of search terms, returns
    526      * a CharSequence in which those search terms are highlighted (intended for use in a TextView)
    527      *
    528      * @param text the text to process
    529      * @param query the search terms
    530      * @return a CharSequence with the search terms highlighted
    531      */
    532     public static CharSequence highlightTermsInText(String text, String query) {
    533         try {
    534             return highlightTerms(text, query, false);
    535         } catch (IOException e) {
    536             // Can't happen, but we must catch this
    537             return text;
    538         }
    539     }
    540 
    541     static class SearchTerm {
    542         final String mTerm;
    543         final String mTermLowerCase;
    544         final int mLength;
    545         int mMatchLength = 0;
    546         int mMatchStart = -1;
    547 
    548         SearchTerm(String term, boolean html) {
    549             mTerm = term;
    550             mTermLowerCase = term.toLowerCase();
    551             mLength = term.length();
    552         }
    553     }
    554 
    555     /**
    556      * Generate a version of the incoming text in which all search terms in a query are highlighted.
    557      * If the input is HTML, we return a StringBuilder with additional markup as required
    558      * If the input is text, we return a SpannableStringBuilder with additional spans as required
    559      *
    560      * @param text the text to be processed
    561      * @param query the query, which can contain multiple terms separated by whitespace
    562      * @param html whether or not the text to be processed is HTML
    563      * @return highlighted text
    564      *
    565      * @throws IOException as Appendable requires this
    566      */
    567     public static CharSequence highlightTerms(String text, String query, boolean html)
    568             throws IOException {
    569         // Handle null and empty string
    570         if (TextUtils.isEmpty(text)) return "";
    571         final int length = text.length();
    572 
    573         // Break up the query into search terms
    574         ArrayList<SearchTerm> terms = new ArrayList<SearchTerm>();
    575         if (query != null) {
    576             StringTokenizer st = new StringTokenizer(query);
    577             while (st.hasMoreTokens()) {
    578                 terms.add(new SearchTerm(st.nextToken(), html));
    579             }
    580         }
    581 
    582         // Our appendable depends on whether we're building HTML text (for webview) or spannable
    583         // text (for UI)
    584         final Appendable sb = html ? new StringBuilder() : new SpannableStringBuilder();
    585         // Indicates whether we're in the middle of an HTML tag
    586         boolean inTag = false;
    587         // The position of the last input character copied to output
    588         int lastOut = -1;
    589 
    590         // Walk through the text until we're done with the input
    591         // Just copy any HTML tags directly into the output; search for terms in the remaining text
    592         for (int i = 0; i < length; i++) {
    593             char chr = text.charAt(i);
    594             if (html) {
    595                 if (!inTag && (chr == '<')) {
    596                     // Find tags; they will begin with <! or !- or </ or <letter
    597                     if (i < (length - 1)) {
    598                         char peek = text.charAt(i + 1);
    599                         if (peek == '!' || peek == '-' || peek == '/' || Character.isLetter(peek)) {
    600                             inTag = true;
    601                             // Skip content of title, script, style and applet tags
    602                             if (i < (length - (MAX_STRIP_TAG_LENGTH + 2))) {
    603                                 String tag = text.substring(i + 1, i + MAX_STRIP_TAG_LENGTH + 1);
    604                                 String tagLowerCase = tag.toLowerCase();
    605                                 boolean stripContent = false;
    606                                 for (String stripTag: STRIP_TAGS) {
    607                                     if (tagLowerCase.startsWith(stripTag)) {
    608                                         stripContent = true;
    609                                         tag = tag.substring(0, stripTag.length());
    610                                         break;
    611                                     }
    612                                 }
    613                                 if (stripContent) {
    614                                     // Look for the end of this tag
    615                                     int endTagPosition = findTagEnd(text, tag, i);
    616                                     if (endTagPosition < 0) {
    617                                         sb.append(text.substring(i));
    618                                         break;
    619                                     } else {
    620                                         sb.append(text.substring(i, endTagPosition - 1));
    621                                         i = endTagPosition - 1;
    622                                         chr = text.charAt(i);
    623                                     }
    624                                 }
    625                             }
    626                         }
    627                     }
    628                 } else if (inTag && (chr == '>')) {
    629                     inTag = false;
    630                 }
    631 
    632                 if (inTag) {
    633                     sb.append(chr);
    634                     continue;
    635                 }
    636             }
    637 
    638             // After all that, we've got some "body" text
    639             char chrLowerCase = Character.toLowerCase(chr);
    640             // Whether or not the current character should be appended to the output; we inhibit
    641             // this while any search terms match
    642             boolean appendNow = true;
    643             // Look through search terms for matches
    644             for (SearchTerm t: terms) {
    645                 if (chrLowerCase == t.mTermLowerCase.charAt(t.mMatchLength)) {
    646                     if (t.mMatchLength++ == 0) {
    647                         // New match start
    648                         t.mMatchStart = i;
    649                     }
    650                     if (t.mMatchLength == t.mLength) {
    651                         String matchText = text.substring(t.mMatchStart, t.mMatchStart + t.mLength);
    652                         // Completed match; add highlight and reset term
    653                         if (t.mMatchStart <= lastOut) {
    654                             matchText = text.substring(lastOut + 1, i + 1);
    655                         }
    656                         /*else*/
    657                         if (matchText.length() == 0) {} else
    658                         if (html) {
    659                             sb.append("<span style=\"background-color: " + HIGHLIGHT_COLOR_STRING +
    660                                     "\">");
    661                             sb.append(matchText);
    662                             sb.append("</span>");
    663                         } else {
    664                             SpannableString highlightSpan = new SpannableString(matchText);
    665                             highlightSpan.setSpan(new BackgroundColorSpan(HIGHLIGHT_COLOR_INT), 0,
    666                                     highlightSpan.length(), Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
    667                             sb.append(highlightSpan);
    668                         }
    669                         lastOut = t.mMatchStart + t.mLength - 1;
    670                         t.mMatchLength = 0;
    671                         t.mMatchStart = -1;
    672                     }
    673                     appendNow = false;
    674                 } else {
    675                     if (t.mMatchStart >= 0) {
    676                         // We're no longer matching; check for other matches in progress
    677                         int leastOtherStart = -1;
    678                         for (SearchTerm ot: terms) {
    679                             // Save away the lowest match start for other search terms
    680                             if ((ot != t) && (ot.mMatchStart >= 0) && ((leastOtherStart < 0) ||
    681                                     (ot.mMatchStart <= leastOtherStart))) {
    682                                 leastOtherStart = ot.mMatchStart;
    683                             }
    684                         }
    685                         int matchEnd = t.mMatchStart + t.mMatchLength;
    686                         if (leastOtherStart < 0 || leastOtherStart > matchEnd) {
    687                             // Append the whole thing
    688                             if (t.mMatchStart > lastOut) {
    689                                 sb.append(text.substring(t.mMatchStart, matchEnd));
    690                                 lastOut = matchEnd;
    691                             }
    692                         } else if (leastOtherStart == t.mMatchStart) {
    693                             // Ok to append the current char
    694                         } else if (leastOtherStart < t.mMatchStart) {
    695                             // We're already covered by another search term, so don't append
    696                             appendNow = false;
    697                         } else if (t.mMatchStart > lastOut) {
    698                             // Append the piece of our term that's not already covered
    699                             sb.append(text.substring(t.mMatchStart, leastOtherStart));
    700                             lastOut = leastOtherStart;
    701                         }
    702                     }
    703                     // Reset this term
    704                     t.mMatchLength = 0;
    705                     t.mMatchStart = -1;
    706                 }
    707             }
    708 
    709             if (appendNow) {
    710                 sb.append(chr);
    711                 lastOut = i;
    712             }
    713         }
    714 
    715         return (CharSequence)sb;
    716    }
    717 }
    718