Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2000 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.android.mail.lib.base;
     17 
     18 import static com.android.mail.lib.base.Preconditions.checkArgument;
     19 
     20 import com.google.common.base.Joiner;
     21 import com.google.common.base.Joiner.MapJoiner;
     22 
     23 import java.io.IOException;
     24 import java.io.InputStream;
     25 import java.io.StringWriter;
     26 import java.util.ArrayList;
     27 import java.util.Collection;
     28 import java.util.Collections;
     29 import java.util.HashMap;
     30 import java.util.HashSet;
     31 import java.util.Iterator;
     32 import java.util.LinkedHashMap;
     33 import java.util.LinkedList;
     34 import java.util.List;
     35 import java.util.Map;
     36 import java.util.Set;
     37 import java.util.StringTokenizer;
     38 import java.util.regex.Matcher;
     39 import java.util.regex.Pattern;
     40 
     41 /**
     42  * Static utility methods and constants pertaining to {@code String} or {@code
     43  * CharSequence} instances.
     44  */
     45 public final class StringUtil {
  // Private constructor: keeps this final class from being instantiated by
  // outside code.
  private StringUtil() {} // COV_NF_LINE
     47 
  /**
   * A completely arbitrary selection of eight whitespace characters: space,
   * carriage return, linefeed, tab, ideographic space (U+3000), no-break
   * space (U+00A0), figure space (U+2007) and narrow no-break space (U+202F).
   * See <a href="http://go/white+space">this spreadsheet</a> for more details
   * about whitespace characters.
   *
   * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or
   *     consider the precise set of characters you want to match and construct
   *     the right explicit {@link CharMatcher} or {@link String} for your own
   *     purposes.
   */
  @Deprecated
  public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F";
     60 
  /**
   * A string containing the carriage return ({@code '\r'}) and linefeed
   * ({@code '\n'}) characters, in that order.
   */
  public static final String LINE_BREAKS = "\r\n";
     63 
     64   /**
     65    * Old location of {@link Strings#isNullOrEmpty}; this method will be
     66    * deprecated soon.
     67    */
     68   public static boolean isEmpty(String string) {
     69     return Strings.isNullOrEmpty(string);
     70   }
     71 
     72   /**
     73    * Returns {@code true} if the given string is null, empty, or comprises only
     74    * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}.
     75    *
     76    * <p><b>Warning:</b> there are many competing definitions of "whitespace";
     77    * please see <a href="http://go/white+space">this spreadsheet</a> for
     78    * details.
     79    *
     80    * @param string the string reference to check
     81    * @return {@code true} if {@code string} is null, empty, or consists of
     82    *     whitespace characters only
     83    */
     84   public static boolean isEmptyOrWhitespace(String string) {
     85     return string == null || CharMatcher.WHITESPACE.matchesAllOf(string);
     86   }
     87 
     88   /**
     89    * Old location of {@link Strings#nullToEmpty}; this method will be
     90    * deprecated soon.
     91    */
     92   public static String makeSafe(String string) {
     93     return Strings.nullToEmpty(string);
     94   }
     95 
     96   /**
     97    * Old location of {@link Strings#emptyToNull}; this method will be
     98    * deprecated soon.
     99    */
    100   public static String toNullIfEmpty(String string) {
    101     return Strings.emptyToNull(string);
    102   }
    103 
    104   /**
    105    * Returns the given string if it is nonempty and contains at least one
    106    * non-whitespace character; {@code null} otherwise. See comment in {@link
    107    * #isEmptyOrWhitespace} on the definition of whitespace.
    108    *
    109    * @param string the string to test and possibly return
    110    * @return {@code null} if {@code string} is null, empty, or contains only
    111    *     whitespace characters; {@code string} itself otherwise
    112    */
    113   public static String toNullIfEmptyOrWhitespace(
    114       String string) {
    115     return isEmptyOrWhitespace(string) ? null : string;
    116   }
    117 
    118   /**
    119    * Old location of {@link Strings#repeat}; this method will be deprecated
    120    * soon.
    121    */
    122   public static String repeat(String string, int count) {
    123     return Strings.repeat(string, count);
    124   }
    125 
    126   /**
    127    * Return the first index in the string of any of the specified characters,
    128    * starting at a given index, or {@code -1} if none of the characters is
    129    * present.
    130    *
    131    * @param string the non-null character sequence to look in
    132    * @param chars a non-null character sequence containing the set of characters
    133    *     to look for. If empty, this method will find no matches and return
    134    *     {@code -1}
    135    * @param fromIndex the index of the first character to examine in the input
    136    *     string. If negative, the entire string will be searched. If greater
    137    *     than or equal to the string length, no characters will be searched and
    138    *     {@code -1} will be returned.
    139    * @return the index of the first match, or {@code -1} if no match was found.
    140    *     Guaranteed to be either {@code -1} or a number greater than or equal to
    141    *     {@code fromIndex}
    142    * @throws NullPointerException if any argument is null
    143    */
    144   // author: pault
    145   public static int indexOfChars(
    146       CharSequence string, CharSequence chars, int fromIndex) {
    147     if (fromIndex >= string.length()) {
    148       return -1;
    149     }
    150 
    151     /*
    152      * Prepare lookup structures for the characters. TODO(pault): This loop
    153      * could be factored into another method to allow caching of the resulting
    154      * struct if a use-case of very large character sets exists.
    155      */
    156     Set<Character> charSet = Collections.emptySet();
    157     boolean[] charArray = new boolean[128];
    158     for (int i = 0; i < chars.length(); i++) {
    159       char c = chars.charAt(i);
    160       if (c < 128) {
    161         charArray[c] = true;
    162       } else {
    163         if (charSet.isEmpty()) {
    164           charSet = new HashSet<Character>();
    165         }
    166         charSet.add(c);
    167       }
    168     }
    169 
    170     // Scan the string for matches
    171     for (int i = Math.max(fromIndex, 0); i < string.length(); i++) {
    172       char c = string.charAt(i);
    173       if (c < 128) {
    174         if (charArray[c]) {
    175           return i;
    176         }
    177       } else if (charSet.contains(c)) {
    178         return i;
    179       }
    180     }
    181     return -1;
    182   }
    183 
    184 /*
    185  * -------------------------------------------------------------------
    186  * This marks the end of the code that has been written or rewritten
    187  * in 2008 to the quality standards of the Java core libraries group.
    188  * Code below this point is still awaiting cleanup (you can help!).
    189  * See http://wiki/Nonconf/JavaCoreLibrariesStandards.
    190  * -------------------------------------------------------------------
    191  */
    192 
    193 
    194   /**
    195    * @param str the string to split.  Must not be null.
    196    * @param delims the delimiter characters. Each character in the
    197    *        string is individually treated as a delimiter.
    198    * @return an array of tokens. Will not return null. Individual tokens
    199    *        do not have leading/trailing whitespace removed.
    200    * @deprecated see the detailed instructions under
    201    *     {@link #split(String, String, boolean)}
    202    */
    203   @Deprecated
    204   public static String[] split(String str, String delims) {
    205     return split(str, delims, false);
    206   }
    207 
    208   /**
    209    * This method is deprecated because it is too inflexible, providing
    210    * only a very specific set of behaviors that almost never matches exactly
    211    * what you intend. Prefer using a {@link Splitter}, which is more flexible
    212    * and consistent in the way it handles trimming and empty tokens.
    213    *
    214    * <ul>
    215    * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such
    216    *     as {@code Splitter.on(CharMatcher.anyOf(delims))}.
    217    * <li><i>If</i> you need whitespace trimmed from the ends of each segment,
    218    *     adding {@code .trimResults()} to your splitter definition should work
    219    *     in most cases. To match the exact behavior of this method, use
    220    *     {@code .trimResults(CharMatcher.inRange('\0', ' '))}.
    221    * <li>This method silently ignores empty tokens in the input, but allows
    222    *     empty tokens to appear in the output if {@code trimTokens} is
    223    *     {@code true}. Adding {@code .omitEmptyStrings()} to your splitter
    224    *     definition will filter empty tokens out but will do so <i>after</i>
    225    *     having performed trimming. If you absolutely require this method's
    226    *     behavior in this respect, Splitter is not able to match it.
    227    * <li>If you need the result as an array, use {@link
    228    *     com.google.common.collect.Iterables#toArray(Iterable, Class)} on the
    229    *     {@code Iterable<String>} returned by {@link Splitter#split}.
    230    * </ul>
    231    *
    232    * @param str the string to split.  Must not be null.
    233    * @param delims the delimiter characters. Each character in the string
    234    *        is individually treated as a delimiter.
    235    * @param trimTokens if true, leading/trailing whitespace is removed
    236    *        from the tokens
    237    * @return an array of tokens. Will not return null.
    238    * @deprecated
    239    */
    240   @Deprecated
    241   public static String[] split(
    242       String str, String delims, boolean trimTokens) {
    243     StringTokenizer tokenizer = new StringTokenizer(str, delims);
    244     int n = tokenizer.countTokens();
    245     String[] list = new String[n];
    246     for (int i = 0; i < n; i++) {
    247       if (trimTokens) {
    248         list[i] = tokenizer.nextToken().trim();
    249       } else {
    250         list[i] = tokenizer.nextToken();
    251       }
    252     }
    253     return list;
    254   }
    255 
    256   /**
    257    * Trim characters from only the beginning of a string.
    258    * This is a convenience method, it simply calls trimStart(s, null).
    259    *
    260    * @param s String to be trimmed
    261    * @return String with whitespace characters removed from the beginning
    262    */
    263   public static String trimStart(String s) {
    264     return trimStart(s, null);
    265   }
    266 
    267   /**
    268    * Trim characters from only the beginning of a string.
    269    * This method will remove all whitespace characters
    270    * (defined by Character.isWhitespace(char), in addition to the characters
    271    * provided, from the end of the provided string.
    272    *
    273    * @param s String to be trimmed
    274    * @param extraChars Characters in addition to whitespace characters that
    275    *                   should be trimmed.  May be null.
    276    * @return String with whitespace and characters in extraChars removed
    277    *                   from the beginning
    278    */
    279   public static String trimStart(String s, String extraChars) {
    280     int trimCount = 0;
    281     while (trimCount < s.length()) {
    282       char ch = s.charAt(trimCount);
    283       if (Character.isWhitespace(ch)
    284         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
    285         trimCount++;
    286       } else {
    287         break;
    288       }
    289     }
    290 
    291     if (trimCount == 0) {
    292       return s;
    293     }
    294     return s.substring(trimCount);
    295   }
    296 
    297   /**
    298    * Trim characters from only the end of a string.
    299    * This is a convenience method, it simply calls trimEnd(s, null).
    300    *
    301    * @param s String to be trimmed
    302    * @return String with whitespace characters removed from the end
    303    */
    304   public static String trimEnd(String s) {
    305     return trimEnd(s, null);
    306   }
    307 
    308   /**
    309    * Trim characters from only the end of a string.
    310    * This method will remove all whitespace characters
    311    * (defined by Character.isWhitespace(char), in addition to the characters
    312    * provided, from the end of the provided string.
    313    *
    314    * @param s String to be trimmed
    315    * @param extraChars Characters in addition to whitespace characters that
    316    *                   should be trimmed.  May be null.
    317    * @return String with whitespace and characters in extraChars removed
    318    *                   from the end
    319    */
    320   public static String trimEnd(String s, String extraChars) {
    321     int trimCount = 0;
    322     while (trimCount < s.length()) {
    323       char ch = s.charAt(s.length() - trimCount - 1);
    324       if (Character.isWhitespace(ch)
    325         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
    326         trimCount++;
    327       } else {
    328         break;
    329       }
    330     }
    331 
    332     if (trimCount == 0) {
    333       return s;
    334     }
    335     return s.substring(0, s.length() - trimCount);
    336   }
    337 
    338   /**
    339    * @param str the string to split.  Must not be null.
    340    * @param delims the delimiter characters. Each character in the
    341    *        string is individually treated as a delimiter.
    342    * @return an array of tokens. Will not return null. Leading/trailing
    343    *        whitespace is removed from the tokens.
    344    * @deprecated see the detailed instructions under
    345    *     {@link #split(String, String, boolean)}
    346    */
    347   @Deprecated
    348   public static String[] splitAndTrim(String str, String delims) {
    349     return split(str, delims, true);
    350   }
    351 
    352   /** Parse comma-separated list of ints and return as array. */
    353   public static int[] splitInts(String str) throws IllegalArgumentException {
    354     StringTokenizer tokenizer = new StringTokenizer(str, ",");
    355     int n = tokenizer.countTokens();
    356     int[] list = new int[n];
    357     for (int i = 0; i < n; i++) {
    358       String token = tokenizer.nextToken();
    359       list[i] = Integer.parseInt(token);
    360     }
    361     return list;
    362   }
    363 
    364   /** Parse comma-separated list of longs and return as array. */
    365   public static long[] splitLongs(String str) throws IllegalArgumentException {
    366     StringTokenizer tokenizer = new StringTokenizer(str, ",");
    367     int n = tokenizer.countTokens();
    368     long[] list = new long[n];
    369     for (int i = 0; i < n; i++) {
    370       String token = tokenizer.nextToken();
    371       list[i] = Long.parseLong(token);
    372     }
    373     return list;
    374   }
    375 
    376   /** This replaces the occurrences of 'what' in 'str' with 'with'
    377    *
    378    * @param str the string to process
    379    * @param what to replace
    380    * @param with replace with this
    381    * @return String str where 'what' was replaced with 'with'
    382    *
    383    * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}.
    384    */
    385   @Deprecated
    386   public static String replace(
    387       String str, CharSequence what, CharSequence with) {
    388     // Have to check this argument, for compatibility with the old impl.
    389     // For the record, String.replace() is capable of handling an empty target
    390     // string... but it does something kind of weird in that case.
    391     checkArgument(what.length() > 0);
    392     return str.replace(what, with);
    393   }
    394 
  /**
   * Splits on the linefeed character ({@code '\n'}) only and omits empty
   * segments; used by {@link #fixedWidth(String, int)} to break its input
   * into lines.
   */
  private static final Splitter NEWLINE_SPLITTER =
      Splitter.on('\n').omitEmptyStrings();
    397 
    398   /**
    399    * Reformats the given string to a fixed width by inserting carriage returns
    400    * and trimming unnecessary whitespace. See
    401    * {@link #fixedWidth(String[], int)} for details. The {@code str} argument
    402    * to this method will be split on newline characters ({@code '\n'}) only
    403    * (regardless of platform).  An array of resulting non-empty strings is
    404    * then passed to {@link #fixedWidth(String[], int)} as the {@code lines}
    405    * parameter.
    406    *
    407    * @param str the string to format
    408    * @param width the fixed width (in characters)
    409    */
    410   public static String fixedWidth(String str, int width) {
    411     List<String> lines = new ArrayList<String>();
    412 
    413     for (String line : NEWLINE_SPLITTER.split(str)) {
    414       lines.add(line);
    415     }
    416 
    417     String[] lineArray = lines.toArray(new String[0]);
    418     return fixedWidth(lineArray, width);
    419   }
    420 
    421   /**
    422    * Reformats the given array of lines to a fixed width by inserting
    423    * newlines and trimming unnecessary whitespace.  This uses simple
    424    * whitespace-based splitting, not sophisticated internationalized
    425    * line breaking.  Newlines within a line are treated like any other
    426    * whitespace.  Lines which are already short enough will be passed
    427    * through unmodified.
    428    *
    429    * <p>Only breaking whitespace characters (those which match
    430    * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by
    431    * this method. Non-breaking whitespace characters will be considered as
    432    * ordinary characters which are connected to any other adjacent
    433    * non-whitespace characters, and will therefore appear in the returned
    434    * string in their original context.
    435    *
    436    * @param lines array of lines to format
    437    * @param width the fixed width (in characters)
    438    */
    439   public static String fixedWidth(String[] lines, int width) {
    440     List<String> formattedLines = new ArrayList<String>();
    441 
    442     for (String line : lines) {
    443       formattedLines.add(formatLineToFixedWidth(line, width));
    444     }
    445 
    446     return Joiner.on('\n').join(formattedLines);
    447   }
    448 
  /**
   * Splits a line into words on {@link CharMatcher#BREAKING_WHITESPACE},
   * omitting empty strings; used by {@code formatLineToFixedWidth}.
   */
  private static final Splitter TO_WORDS =
      Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings();
    451 
    452   /**
    453    * Helper method for {@link #fixedWidth(String[], int)}
    454    */
    455   private static String formatLineToFixedWidth(String line, int width) {
    456     if (line.length() <= width) {
    457       return line;
    458     }
    459 
    460     StringBuilder builder = new StringBuilder();
    461     int col = 0;
    462 
    463     for (String word : TO_WORDS.split(line)) {
    464       if (col == 0) {
    465         col = word.length();
    466       } else {
    467         int newCol = col + word.length() + 1;  // +1 for the space
    468 
    469         if (newCol <= width) {
    470           builder.append(' ');
    471           col = newCol;
    472         } else {
    473           builder.append('\n');
    474           col = word.length();
    475         }
    476       }
    477 
    478       builder.append(word);
    479     }
    480 
    481     return builder.toString();
    482   }
    483 
    484   /**
    485    * Splits the argument original into a list of substrings.  All the
    486    * substrings in the returned list (except possibly the last) will
    487    * have length lineLen.
    488    *
    489    * @param lineLen  the length of the substrings to put in the list
    490    * @param original the original string
    491    *
    492    * @return a list of strings of length lineLen that together make up the
    493    *     original string
    494    * @deprecated use {@code Splitter.fixedLength(lineLen).split(original))}
    495    *     (note that it returns an {@code Iterable}, not a {@code List})
    496    */
    497   @Deprecated
    498   public static List<String> fixedSplit(String original, int lineLen) {
    499     List<String> output = new ArrayList<String>();
    500     for (String elem : Splitter.fixedLength(lineLen).split(original)) {
    501       output.add(elem);
    502     }
    503     return output;
    504   }
    505 
    506   /**
    507    * Indents the given String per line.
    508    * @param iString the string to indent
    509    * @param iIndentDepth the depth of the indentation
    510    * @return the indented string
    511    */
    512   public static String indent(String iString, int iIndentDepth) {
    513     StringBuilder spacer = new StringBuilder();
    514     spacer.append("\n");
    515     for (int i = 0; i < iIndentDepth; i++) {
    516       spacer.append("  ");
    517     }
    518     return iString.replace("\n", spacer.toString());
    519   }
    520 
    521   /**
    522    * This is a both way strip.
    523    *
    524    * @param str the string to strip
    525    * @param left strip from left
    526    * @param right strip from right
    527    * @param what character(s) to strip
    528    * @return the stripped string
    529    * @deprecated ensure the string is not null and use
    530    *  <ul>
    531    *    <li> {@code CharMatcher.anyOf(what).trimFrom(str)}
    532    *        if {@code left == true} and {@code right == true}
    533    *    <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)}
    534    *        if {@code left == true} and {@code right == false}
    535    *    <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)}
    536    *        if {@code left == false} and {@code right == true}
    537    *  </ul>
    538    */
    539   @Deprecated
    540   public static String megastrip(String str,
    541                                  boolean left, boolean right,
    542                                  String what) {
    543     if (str == null) {
    544       return null;
    545     }
    546 
    547     CharMatcher matcher = CharMatcher.anyOf(what);
    548     if (left) {
    549       if (right) {
    550         return matcher.trimFrom(str);
    551       }
    552       return matcher.trimLeadingFrom(str);
    553     }
    554     if (right) {
    555       return matcher.trimTrailingFrom(str);
    556     }
    557     return str;
    558   }
    559 
    560   /** strip - strips both ways
    561    *
    562    * @param str what to strip
    563    * @return String the striped string
    564    * @deprecated ensure the string is not null and use {@code
    565    *     CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you
    566    *     really want the legacy whitespace definition, or something more
    567    *     standard like {@link CharMatcher#WHITESPACE}.
    568    */
    569   @SuppressWarnings("deprecation") // this is deprecated itself
    570   @Deprecated public static String strip(String str) {
    571     return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str);
    572   }
    573 
    574   /** Strip white spaces from both end, and collapse white spaces
    575    * in the middle.
    576    *
    577    * @param str what to strip
    578    * @return String the striped and collapsed string
    579    * @deprecated ensure the string is not null and use {@code
    580    *     CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also
    581    *     consider whether you really want the legacy whitespace definition, or
    582    *     something more standard like {@link CharMatcher#WHITESPACE}.
    583    */
    584   @SuppressWarnings("deprecation") // this is deprecated itself
    585   @Deprecated public static String stripAndCollapse(String str) {
    586     return (str == null) ? null
    587         : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ');
    588   }
    589 
    590   /**
    591    * Give me a string and a potential prefix, and I return the string
    592    * following the prefix if the prefix matches, else null.
    593    * Analogous to the c++ functions strprefix and var_strprefix.
    594    *
    595    * @param str the string to strip
    596    * @param prefix the expected prefix
    597    * @return the stripped string or <code>null</code> if the string
    598    * does not start with the prefix
    599    */
    600   public static String stripPrefix(String str, String prefix) {
    601     return str.startsWith(prefix)
    602         ? str.substring(prefix.length())
    603         : null;
    604   }
    605 
    606   /**
    607    * Case insensitive version of stripPrefix. Strings are compared in
    608    * the same way as in {@link String#equalsIgnoreCase}.
    609    * Analogous to the c++ functions strcaseprefix and var_strcaseprefix.
    610    *
    611    * @param str the string to strip
    612    * @param prefix the expected prefix
    613    * @return the stripped string or <code>null</code> if the string
    614    * does not start with the prefix
    615    */
    616   public static String stripPrefixIgnoreCase(String str, String prefix) {
    617     return startsWithIgnoreCase(str, prefix)
    618         ? str.substring(prefix.length())
    619         : null;
    620   }
    621 
    622   /**
    623    * Give me a string and a potential suffix, and I return the string
    624    * before the suffix if the suffix matches, else null.
    625    * Analogous to the c++ function strsuffix.
    626    *
    627    * @param str the string to strip
    628    * @param suffix the expected suffix
    629    * @return the stripped string or <code>null</code> if the string
    630    * does not end with the suffix
    631    */
    632   public static String stripSuffix(String str, String suffix) {
    633     return str.endsWith(suffix)
    634         ? str.substring(0, str.length() - suffix.length())
    635         : null;
    636   }
    637 
    638   /**
    639    * Case insensitive version of stripSuffix. Strings are compared in
    640    * the same way as in {@link String#equalsIgnoreCase}.
    641    * Analogous to the c++ function strcasesuffix.
    642    *
    643    * @param str the string to strip
    644    * @param suffix the expected suffix
    645    * @return the stripped string or <code>null</code> if the string
    646    * does not end with the suffix
    647    */
    648   public static String stripSuffixIgnoreCase(
    649       String str, String suffix) {
    650     return endsWithIgnoreCase(str, suffix)
    651         ? str.substring(0, str.length() - suffix.length())
    652         : null;
    653   }
    654 
    655   /**
    656    * Strips all non-digit characters from a string.
    657    *
    658    * The resulting string will only contain characters for which isDigit()
    659    * returns true.
    660    *
    661    * @param str the string to strip
    662    * @return a string consisting of digits only, or an empty string
    663    * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also
    664    *     consider whether this is really the definition of "digit" you wish to
    665    *     use)
    666    */
    667   @Deprecated public static String stripNonDigits(String str) {
    668     return CharMatcher.JAVA_DIGIT.retainFrom(str);
    669   }
    670 
    671   /**
    672    * Finds the last index in str of a character not in the characters
    673    * in 'chars' (similar to ANSI string.find_last_not_of).
    674    *
    675    * Returns -1 if no such character can be found.
    676    *
    677    * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher}
    678    * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}.
    679    */
    680   // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to
    681   // CharMatcher, deprecate this
    682   public static int lastIndexNotOf(String str, String chars, int fromIndex) {
    683     fromIndex = Math.min(fromIndex, str.length() - 1);
    684 
    685     for (int pos = fromIndex; pos >= 0; pos--) {
    686       if (chars.indexOf(str.charAt(pos)) < 0) {
    687         return pos;
    688       }
    689     }
    690 
    691     return -1;
    692   }
    693 
    694   /**
    695    * Like String.replace() except that it accepts any number of old chars.
    696    * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'.
    697    * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello  world "
    698    *
    699    * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example
    700    *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
    701    */
    702   @Deprecated public static String replaceChars(
    703       String str, CharSequence oldchars, char newchar) {
    704     return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar);
    705   }
    706 
    707   /**
    708    * Remove any occurrances of 'oldchars' in 'str'.
    709    * Example: removeChars("Hello, world!", ",!") returns "Hello world"
    710    *
    711    * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example
    712    *     {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
    713    */
    714   @Deprecated public static String removeChars(
    715       String str, CharSequence oldchars) {
    716     return CharMatcher.anyOf(oldchars).removeFrom(str);
    717   }
    718 
  // See http://www.microsoft.com/typography/unicode/1252.htm
  // Characters treated as "smart" single quotes: U+0091 and U+0092
  // (presumably the Windows-1252 quote bytes 0x91/0x92 carried over
  // unconverted -- confirm against the page above) plus the Unicode curly
  // single quotes U+2018 and U+2019.
  private static final CharMatcher FANCY_SINGLE_QUOTE
      = CharMatcher.anyOf("\u0091\u0092\u2018\u2019");
  // Characters treated as "smart" double quotes: U+0093 and U+0094
  // (presumably the Windows-1252 bytes 0x93/0x94, same caveat as above) plus
  // the Unicode curly double quotes U+201C and U+201D.
  private static final CharMatcher FANCY_DOUBLE_QUOTE
      = CharMatcher.anyOf("\u0093\u0094\u201c\u201d");
    724 
    725   /**
    726    * Replaces microsoft "smart quotes" (curly " and ') with their
    727    * ascii counterparts.
    728    */
    729   public static String replaceSmartQuotes(String str) {
    730     String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\'');
    731     return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"');
    732   }
    733 
    734   /**
    735    * Convert a string of hex digits to a byte array, with the first
    736    * byte in the array being the MSB. The string passed in should be
    737    * just the raw digits (upper or lower case), with no leading
    738    * or trailing characters (like '0x' or 'h').
    739    * An odd number of characters is supported.
    740    * If the string is empty, an empty array will be returned.
    741    *
    742    * This is significantly faster than using
    743    *   new BigInteger(str, 16).toByteArray();
    744    * especially with larger strings. Here are the results of some
    745    * microbenchmarks done on a P4 2.8GHz 2GB RAM running
    746    * linux 2.4.22-gg11 and JDK 1.5 with an optimized build:
    747    *
    748    * String length        hexToBytes (usec)   BigInteger
    749    * -----------------------------------------------------
    750    * 16                       0.570                 1.43
    751    * 256                      8.21                 44.4
    752    * 1024                    32.8                 526
    753    * 16384                  546                121000
    754    */
    755   public static byte[] hexToBytes(CharSequence str) {
    756     byte[] bytes = new byte[(str.length() + 1) / 2];
    757     if (str.length() == 0) {
    758       return bytes;
    759     }
    760     bytes[0] = 0;
    761     int nibbleIdx = (str.length() % 2);
    762     for (int i = 0; i < str.length(); i++) {
    763       char c = str.charAt(i);
    764       if (!isHex(c)) {
    765         throw new IllegalArgumentException("string contains non-hex chars");
    766       }
    767       if ((nibbleIdx % 2) == 0) {
    768         bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4);
    769       } else {
    770         bytes[nibbleIdx >> 1] += (byte) hexValue(c);
    771       }
    772       nibbleIdx++;
    773     }
    774     return bytes;
    775   }
    776 
    777   /**
    778    * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed).
    779    */
    780   public static String convertEOLToLF(String input) {
    781     StringBuilder res = new StringBuilder(input.length());
    782     char[] s = input.toCharArray();
    783     int from = 0;
    784     final int end = s.length;
    785     for (int i = 0; i < end; i++) {
    786       if (s[i] == '\r') {
    787         res.append(s, from, i - from);
    788         res.append('\n');
    789         if (i + 1 < end && s[i + 1] == '\n') {
    790           i++;
    791         }
    792 
    793         from = i + 1;
    794       }
    795     }
    796 
    797     if (from == 0) {   // no \r!
    798       return input;
    799     }
    800 
    801     res.append(s, from, end - from);
    802     return res.toString();
    803   }
    804 
    805   /**
    806    * Old location of {@link Strings#padStart}; this method will be deprecated
    807    * soon.
    808    */
    809   public static String padLeft(String s, int len, char padChar) {
    810     return Strings.padStart(s, len, padChar);
    811   }
    812 
    813   /**
    814    * Old location of {@link Strings#padEnd}; this method will be deprecated
    815    * soon.
    816    */
    817   public static String padRight(String s, int len, char padChar) {
    818     return Strings.padEnd(s, len, padChar);
    819   }
    820 
    821   /**
    822    * Returns a string consisting of "s", with each of the first "len" characters
    823    * replaced by "maskChar" character.
    824    */
    825   public static String maskLeft(String s, int len, char maskChar) {
    826     if (len <= 0) {
    827       return s;
    828     }
    829     len = Math.min(len, s.length());
    830     StringBuilder sb = new StringBuilder();
    831     for (int i = 0; i < len; i++) {
    832       sb.append(maskChar);
    833     }
    834     sb.append(s.substring(len));
    835     return sb.toString();
    836   }
    837 
    838   private static boolean isOctal(char c) {
    839     return (c >= '0') && (c <= '7');
    840   }
    841 
    842   private static boolean isHex(char c) {
    843     return ((c >= '0') && (c <= '9')) ||
    844            ((c >= 'a') && (c <= 'f')) ||
    845            ((c >= 'A') && (c <= 'F'));
    846   }
    847 
    848   private static int hexValue(char c) {
    849     if ((c >= '0') && (c <= '9')) {
    850       return (c - '0');
    851     } else if ((c >= 'a') && (c <= 'f')) {
    852       return (c - 'a') + 10;
    853     } else {
    854       return (c - 'A') + 10;
    855     }
    856   }
    857 
    858   /**
    859    * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the
    860    * resulting string.
    861    */
    862   public static String unescapeCString(String s) {
    863     if (s.indexOf('\\') < 0) {
    864       // Fast path: nothing to unescape
    865       return s;
    866     }
    867 
    868     StringBuilder sb = new StringBuilder();
    869     int len = s.length();
    870     for (int i = 0; i < len;) {
    871       char c = s.charAt(i++);
    872       if (c == '\\' && (i < len)) {
    873         c = s.charAt(i++);
    874         switch (c) {
    875           case 'a':  c = '\007';  break;
    876           case 'b':  c = '\b';    break;
    877           case 'f':  c = '\f';    break;
    878           case 'n':  c = '\n';    break;
    879           case 'r':  c = '\r';    break;
    880           case 't':  c = '\t';    break;
    881           case 'v':  c = '\013';  break;
    882           case '\\': c = '\\';    break;
    883           case '?':  c = '?';     break;
    884           case '\'': c = '\'';    break;
    885           case '"':  c = '\"';    break;
    886 
    887           default: {
    888             if ((c == 'x') && (i < len) && isHex(s.charAt(i))) {
    889               // "\xXX"
    890               int v = hexValue(s.charAt(i++));
    891               if ((i < len) && isHex(s.charAt(i))) {
    892                 v = v * 16 + hexValue(s.charAt(i++));
    893               }
    894               c = (char) v;
    895             } else if (isOctal(c)) {
    896               // "\OOO"
    897               int v = (c - '0');
    898               if ((i < len) && isOctal(s.charAt(i))) {
    899                 v = v * 8 + (s.charAt(i++) - '0');
    900               }
    901               if ((i < len) && isOctal(s.charAt(i))) {
    902                 v = v * 8 + (s.charAt(i++) - '0');
    903               }
    904               c = (char) v;
    905             } else {
    906               // Propagate unknown escape sequences.
    907               sb.append('\\');
    908             }
    909             break;
    910           }
    911         }
    912       }
    913       sb.append(c);
    914     }
    915     return sb.toString();
    916   }
    917 
    918   /**
    919    * Unescape any MySQL escape sequences.
    920    * See MySQL language reference Chapter 6 at
    921    * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>.
    922    * This function will <strong>not</strong> work for other SQL-like
    923    * dialects.
    924    * @param s string to unescape, with the surrounding quotes.
    925    * @return unescaped string, without the surrounding quotes.
    926    * @exception IllegalArgumentException if s is not a valid MySQL string.
    927    */
    928   public static String unescapeMySQLString(String s)
    929       throws IllegalArgumentException {
    930     // note: the same buffer is used for both reading and writing
    931     // it works because the writer can never outrun the reader
    932     char chars[] = s.toCharArray();
    933 
    934     // the string must be quoted 'like this' or "like this"
    935     if (chars.length < 2 || chars[0] != chars[chars.length - 1] ||
    936         (chars[0] != '\'' && chars[0] != '"')) {
    937       throw new IllegalArgumentException("not a valid MySQL string: " + s);
    938     }
    939 
    940     // parse the string and decode the backslash sequences; in addition,
    941     // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "'
    942     int j = 1;  // write position in the string (never exceeds read position)
    943     int f = 0;  // state: 0 (normal), 1 (backslash), 2 (quote)
    944     for (int i = 1; i < chars.length - 1; i++) {
    945       if (f == 0) {             // previous character was normal
    946         if (chars[i] == '\\') {
    947           f = 1;  // backslash
    948         } else if (chars[i] == chars[0]) {
    949           f = 2;  // quoting character
    950         } else {
    951           chars[j++] = chars[i];
    952         }
    953       } else if (f == 1) {      // previous character was a backslash
    954         switch (chars[i]) {
    955           case '0':   chars[j++] = '\0';   break;
    956           case '\'':  chars[j++] = '\'';   break;
    957           case '"':   chars[j++] = '"';    break;
    958           case 'b':   chars[j++] = '\b';   break;
    959           case 'n':   chars[j++] = '\n';   break;
    960           case 'r':   chars[j++] = '\r';   break;
    961           case 't':   chars[j++] = '\t';   break;
    962           case 'z':   chars[j++] = '\032'; break;
    963           case '\\':  chars[j++] = '\\';   break;
    964           default:
    965             // if the character is not special, backslash disappears
    966             chars[j++] = chars[i];
    967             break;
    968         }
    969         f = 0;
    970       } else {                  // previous character was a quote
    971         // quoting characters must be doubled inside a string
    972         if (chars[i] != chars[0]) {
    973           throw new IllegalArgumentException("not a valid MySQL string: " + s);
    974         }
    975         chars[j++] = chars[0];
    976         f = 0;
    977       }
    978     }
    979     // string contents cannot end with a special character
    980     if (f != 0) {
    981       throw new IllegalArgumentException("not a valid MySQL string: " + s);
    982     }
    983 
    984     // done
    985     return new String(chars, 1, j - 1);
    986   }
    987 
    988   // TODO(pbarry): move all HTML methods to common.html package
    989 
    /**
     * Read-only lookup from an HTML 4 entity name to its character. Keys
     * include the leading '&amp;' but not the trailing ';' (for example the
     * key for the less-than sign is the three characters '&amp;', 'l', 't').
     */
    static final Map<String, Character> ESCAPE_STRINGS;

    /** Read-only set of the letters a-f and A-F used in hex digits. */
    static final Set<Character> HEX_LETTERS;

    static {
      // HTML character entity references as defined in HTML 4
      // see http://www.w3.org/TR/REC-html40/sgml/entities.html
      //
      // Capacity 512 keeps all 252 entries below the default 0.75 load
      // factor, so the table is never rehashed while it is being filled.
      final Map<String, Character> entities =
          new HashMap<String, Character>(512);

      entities.put("&nbsp", '\u00A0');
      entities.put("&iexcl", '\u00A1');
      entities.put("&cent", '\u00A2');
      entities.put("&pound", '\u00A3');
      entities.put("&curren", '\u00A4');
      entities.put("&yen", '\u00A5');
      entities.put("&brvbar", '\u00A6');
      entities.put("&sect", '\u00A7');
      entities.put("&uml", '\u00A8');
      entities.put("&copy", '\u00A9');
      entities.put("&ordf", '\u00AA');
      entities.put("&laquo", '\u00AB');
      entities.put("&not", '\u00AC');
      entities.put("&shy", '\u00AD');
      entities.put("&reg", '\u00AE');
      entities.put("&macr", '\u00AF');
      entities.put("&deg", '\u00B0');
      entities.put("&plusmn", '\u00B1');
      entities.put("&sup2", '\u00B2');
      entities.put("&sup3", '\u00B3');
      entities.put("&acute", '\u00B4');
      entities.put("&micro", '\u00B5');
      entities.put("&para", '\u00B6');
      entities.put("&middot", '\u00B7');
      entities.put("&cedil", '\u00B8');
      entities.put("&sup1", '\u00B9');
      entities.put("&ordm", '\u00BA');
      entities.put("&raquo", '\u00BB');
      entities.put("&frac14", '\u00BC');
      entities.put("&frac12", '\u00BD');
      entities.put("&frac34", '\u00BE');
      entities.put("&iquest", '\u00BF');
      entities.put("&Agrave", '\u00C0');
      entities.put("&Aacute", '\u00C1');
      entities.put("&Acirc", '\u00C2');
      entities.put("&Atilde", '\u00C3');
      entities.put("&Auml", '\u00C4');
      entities.put("&Aring", '\u00C5');
      entities.put("&AElig", '\u00C6');
      entities.put("&Ccedil", '\u00C7');
      entities.put("&Egrave", '\u00C8');
      entities.put("&Eacute", '\u00C9');
      entities.put("&Ecirc", '\u00CA');
      entities.put("&Euml", '\u00CB');
      entities.put("&Igrave", '\u00CC');
      entities.put("&Iacute", '\u00CD');
      entities.put("&Icirc", '\u00CE');
      entities.put("&Iuml", '\u00CF');
      entities.put("&ETH", '\u00D0');
      entities.put("&Ntilde", '\u00D1');
      entities.put("&Ograve", '\u00D2');
      entities.put("&Oacute", '\u00D3');
      entities.put("&Ocirc", '\u00D4');
      entities.put("&Otilde", '\u00D5');
      entities.put("&Ouml", '\u00D6');
      entities.put("&times", '\u00D7');
      entities.put("&Oslash", '\u00D8');
      entities.put("&Ugrave", '\u00D9');
      entities.put("&Uacute", '\u00DA');
      entities.put("&Ucirc", '\u00DB');
      entities.put("&Uuml", '\u00DC');
      entities.put("&Yacute", '\u00DD');
      entities.put("&THORN", '\u00DE');
      entities.put("&szlig", '\u00DF');
      entities.put("&agrave", '\u00E0');
      entities.put("&aacute", '\u00E1');
      entities.put("&acirc", '\u00E2');
      entities.put("&atilde", '\u00E3');
      entities.put("&auml", '\u00E4');
      entities.put("&aring", '\u00E5');
      entities.put("&aelig", '\u00E6');
      entities.put("&ccedil", '\u00E7');
      entities.put("&egrave", '\u00E8');
      entities.put("&eacute", '\u00E9');
      entities.put("&ecirc", '\u00EA');
      entities.put("&euml", '\u00EB');
      entities.put("&igrave", '\u00EC');
      entities.put("&iacute", '\u00ED');
      entities.put("&icirc", '\u00EE');
      entities.put("&iuml", '\u00EF');
      entities.put("&eth", '\u00F0');
      entities.put("&ntilde", '\u00F1');
      entities.put("&ograve", '\u00F2');
      entities.put("&oacute", '\u00F3');
      entities.put("&ocirc", '\u00F4');
      entities.put("&otilde", '\u00F5');
      entities.put("&ouml", '\u00F6');
      entities.put("&divide", '\u00F7');
      entities.put("&oslash", '\u00F8');
      entities.put("&ugrave", '\u00F9');
      entities.put("&uacute", '\u00FA');
      entities.put("&ucirc", '\u00FB');
      entities.put("&uuml", '\u00FC');
      entities.put("&yacute", '\u00FD');
      entities.put("&thorn", '\u00FE');
      entities.put("&yuml", '\u00FF');
      entities.put("&fnof", '\u0192');
      entities.put("&Alpha", '\u0391');
      entities.put("&Beta", '\u0392');
      entities.put("&Gamma", '\u0393');
      entities.put("&Delta", '\u0394');
      entities.put("&Epsilon", '\u0395');
      entities.put("&Zeta", '\u0396');
      entities.put("&Eta", '\u0397');
      entities.put("&Theta", '\u0398');
      entities.put("&Iota", '\u0399');
      entities.put("&Kappa", '\u039A');
      entities.put("&Lambda", '\u039B');
      entities.put("&Mu", '\u039C');
      entities.put("&Nu", '\u039D');
      entities.put("&Xi", '\u039E');
      entities.put("&Omicron", '\u039F');
      entities.put("&Pi", '\u03A0');
      entities.put("&Rho", '\u03A1');
      entities.put("&Sigma", '\u03A3');
      entities.put("&Tau", '\u03A4');
      entities.put("&Upsilon", '\u03A5');
      entities.put("&Phi", '\u03A6');
      entities.put("&Chi", '\u03A7');
      entities.put("&Psi", '\u03A8');
      entities.put("&Omega", '\u03A9');
      entities.put("&alpha", '\u03B1');
      entities.put("&beta", '\u03B2');
      entities.put("&gamma", '\u03B3');
      entities.put("&delta", '\u03B4');
      entities.put("&epsilon", '\u03B5');
      entities.put("&zeta", '\u03B6');
      entities.put("&eta", '\u03B7');
      entities.put("&theta", '\u03B8');
      entities.put("&iota", '\u03B9');
      entities.put("&kappa", '\u03BA');
      entities.put("&lambda", '\u03BB');
      entities.put("&mu", '\u03BC');
      entities.put("&nu", '\u03BD');
      entities.put("&xi", '\u03BE');
      entities.put("&omicron", '\u03BF');
      entities.put("&pi", '\u03C0');
      entities.put("&rho", '\u03C1');
      entities.put("&sigmaf", '\u03C2');
      entities.put("&sigma", '\u03C3');
      entities.put("&tau", '\u03C4');
      entities.put("&upsilon", '\u03C5');
      entities.put("&phi", '\u03C6');
      entities.put("&chi", '\u03C7');
      entities.put("&psi", '\u03C8');
      entities.put("&omega", '\u03C9');
      entities.put("&thetasym", '\u03D1');
      entities.put("&upsih", '\u03D2');
      entities.put("&piv", '\u03D6');
      entities.put("&bull", '\u2022');
      entities.put("&hellip", '\u2026');
      entities.put("&prime", '\u2032');
      entities.put("&Prime", '\u2033');
      entities.put("&oline", '\u203E');
      entities.put("&frasl", '\u2044');
      entities.put("&weierp", '\u2118');
      entities.put("&image", '\u2111');
      entities.put("&real", '\u211C');
      entities.put("&trade", '\u2122');
      entities.put("&alefsym", '\u2135');
      entities.put("&larr", '\u2190');
      entities.put("&uarr", '\u2191');
      entities.put("&rarr", '\u2192');
      entities.put("&darr", '\u2193');
      entities.put("&harr", '\u2194');
      entities.put("&crarr", '\u21B5');
      entities.put("&lArr", '\u21D0');
      entities.put("&uArr", '\u21D1');
      entities.put("&rArr", '\u21D2');
      entities.put("&dArr", '\u21D3');
      entities.put("&hArr", '\u21D4');
      entities.put("&forall", '\u2200');
      entities.put("&part", '\u2202');
      entities.put("&exist", '\u2203');
      entities.put("&empty", '\u2205');
      entities.put("&nabla", '\u2207');
      entities.put("&isin", '\u2208');
      entities.put("&notin", '\u2209');
      entities.put("&ni", '\u220B');
      entities.put("&prod", '\u220F');
      entities.put("&sum", '\u2211');
      entities.put("&minus", '\u2212');
      entities.put("&lowast", '\u2217');
      entities.put("&radic", '\u221A');
      entities.put("&prop", '\u221D');
      entities.put("&infin", '\u221E');
      entities.put("&ang", '\u2220');
      entities.put("&and", '\u2227');
      entities.put("&or", '\u2228');
      entities.put("&cap", '\u2229');
      entities.put("&cup", '\u222A');
      entities.put("&int", '\u222B');
      entities.put("&there4", '\u2234');
      entities.put("&sim", '\u223C');
      entities.put("&cong", '\u2245');
      entities.put("&asymp", '\u2248');
      entities.put("&ne", '\u2260');
      entities.put("&equiv", '\u2261');
      entities.put("&le", '\u2264');
      entities.put("&ge", '\u2265');
      entities.put("&sub", '\u2282');
      entities.put("&sup", '\u2283');
      entities.put("&nsub", '\u2284');
      entities.put("&sube", '\u2286');
      entities.put("&supe", '\u2287');
      entities.put("&oplus", '\u2295');
      entities.put("&otimes", '\u2297');
      entities.put("&perp", '\u22A5');
      entities.put("&sdot", '\u22C5');
      entities.put("&lceil", '\u2308');
      entities.put("&rceil", '\u2309');
      entities.put("&lfloor", '\u230A');
      entities.put("&rfloor", '\u230B');
      entities.put("&lang", '\u2329');
      entities.put("&rang", '\u232A');
      entities.put("&loz", '\u25CA');
      entities.put("&spades", '\u2660');
      entities.put("&clubs", '\u2663');
      entities.put("&hearts", '\u2665');
      entities.put("&diams", '\u2666');
      entities.put("&quot", '\u0022');
      entities.put("&amp", '\u0026');
      entities.put("&lt", '\u003C');
      entities.put("&gt", '\u003E');
      entities.put("&OElig", '\u0152');
      entities.put("&oelig", '\u0153');
      entities.put("&Scaron", '\u0160');
      entities.put("&scaron", '\u0161');
      entities.put("&Yuml", '\u0178');
      entities.put("&circ", '\u02C6');
      entities.put("&tilde", '\u02DC');
      entities.put("&ensp", '\u2002');
      entities.put("&emsp", '\u2003');
      entities.put("&thinsp", '\u2009');
      entities.put("&zwnj", '\u200C');
      entities.put("&zwj", '\u200D');
      entities.put("&lrm", '\u200E');
      entities.put("&rlm", '\u200F');
      entities.put("&ndash", '\u2013');
      entities.put("&mdash", '\u2014');
      entities.put("&lsquo", '\u2018');
      entities.put("&rsquo", '\u2019');
      entities.put("&sbquo", '\u201A');
      entities.put("&ldquo", '\u201C');
      entities.put("&rdquo", '\u201D');
      entities.put("&bdquo", '\u201E');
      entities.put("&dagger", '\u2020');
      entities.put("&Dagger", '\u2021');
      entities.put("&permil", '\u2030');
      entities.put("&lsaquo", '\u2039');
      entities.put("&rsaquo", '\u203A');
      entities.put("&euro", '\u20AC');

      // Publish a read-only view so no other code can alter the shared table.
      ESCAPE_STRINGS = Collections.unmodifiableMap(entities);

      // Capacity 16 holds the 12 letters without exceeding the load factor.
      final Set<Character> hexLetters = new HashSet<Character>(16);
      for (char letter : "aAbBcCdDeEfF".toCharArray()) {
        hexLetters.add(letter);
      }
      HEX_LETTERS = Collections.unmodifiableSet(hexLetters);
    }
   1266 
   1267   /**
   1268    * <p>
   1269    * Replace all the occurences of HTML escape strings with the
   1270    * respective characters.
   1271    * </p>
   1272    * <p>
   1273    * The default mode is strict (requiring semicolons).
   1274    * </p>
   1275    *
   1276    * @param s a <code>String</code> value
   1277    * @return a <code>String</code> value
   1278    * @throws NullPointerException if the input string is null.
   1279    */
   1280   public static final String unescapeHTML(String s) {
   1281     return unescapeHTML(s, false);
   1282   }
   1283 
   1284   /**
   1285    * Replace all the occurences of HTML escape strings with the
   1286    * respective characters.
   1287    *
   1288    * @param s a <code>String</code> value
   1289    * @param emulateBrowsers a <code>Boolean</code> value that tells the method
   1290    *     to allow entity refs not terminated with a semicolon to be unescaped.
   1291    *     (a quirk of this feature, and some browsers, is that an explicit
   1292    *     terminating character is needed - e.g., &lt$ would be unescaped, but
   1293    *     not &ltab - see the tests for a more in-depth description of browsers)
   1294    * @return a <code>String</code> value
   1295    * @throws NullPointerException if the input string is null.
   1296    */
   1297   public static final String unescapeHTML(String s, boolean emulateBrowsers) {
   1298 
   1299     // See if there are any '&' in the string since that is what we look
   1300     // for to escape. If there isn't, then we don't need to escape this string
   1301     // Based on similar technique used in the escape function.
   1302     int index = s.indexOf('&');
   1303     if (index == -1) {
   1304       // Nothing to escape. Return the original string.
   1305       return s;
   1306     }
   1307 
   1308     // We found an escaped character. Start slow escaping from there.
   1309     char[] chars = s.toCharArray();
   1310     char[] escaped = new char[chars.length];
   1311     System.arraycopy(chars, 0, escaped, 0, index);
   1312 
   1313     // Note: escaped[pos] = end of the escaped char array.
   1314     int pos = index;
   1315 
   1316     for (int i = index; i < chars.length;) {
   1317       if (chars[i] != '&') {
   1318         escaped[pos++] = chars[i++];
   1319         continue;
   1320       }
   1321 
   1322       // Allow e.g. &#123;
   1323       int j = i + 1;
   1324       boolean isNumericEntity = false;
   1325       if (j < chars.length && chars[j] == '#') {
   1326         j++;
   1327         isNumericEntity = true;
   1328       }
   1329 
   1330       // if it's numeric, also check for hex
   1331       boolean isHexEntity = false;
   1332       if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) {
   1333         j++;
   1334         isHexEntity = true;
   1335       }
   1336 
   1337       // Scan until we find a char that is not valid for this sequence.
   1338       for (; j < chars.length; j++) {
   1339         char ch = chars[j];
   1340         boolean isDigit = Character.isDigit(ch);
   1341         if (isNumericEntity) {
   1342           // non-hex numeric sequence end condition
   1343           if (!isHexEntity && !isDigit) {
   1344             break;
   1345           }
   1346           // hex sequence end contition
   1347           if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) {
   1348             break;
   1349           }
   1350         }
   1351         // anything other than a digit or letter is always an end condition
   1352         if (!isDigit && !Character.isLetter(ch)) {
   1353           break;
   1354         }
   1355       }
   1356 
   1357       boolean replaced = false;
   1358       if ((j <= chars.length && emulateBrowsers) ||
   1359           (j < chars.length && chars[j] == ';')) {
   1360         // Check for &#D; and &#xD; pattern
   1361         if (i + 2 < chars.length && s.charAt(i + 1) == '#') {
   1362           try {
   1363             long charcode = 0;
   1364             char ch = s.charAt(i + 2);
   1365             if (isHexEntity) {
   1366               charcode = Long.parseLong(
   1367                   new String(chars, i + 3, j - i - 3), 16);
   1368             } else if (Character.isDigit(ch)) {
   1369               charcode = Long.parseLong(
   1370                   new String(chars, i + 2, j - i - 2));
   1371             }
   1372             if (charcode > 0 && charcode < 65536) {
   1373               escaped[pos++] = (char) charcode;
   1374               replaced = true;
   1375             }
   1376           } catch (NumberFormatException ex) {
   1377             // Failed, not replaced.
   1378           }
   1379         } else {
   1380           String key = new String(chars, i, j - i);
   1381           Character repl = ESCAPE_STRINGS.get(key);
   1382           if (repl != null) {
   1383             escaped[pos++] = repl;
   1384             replaced = true;
   1385           }
   1386         }
   1387         // Skip over ';'
   1388         if (j < chars.length && chars[j] == ';') {
   1389           j++;
   1390         }
   1391       }
   1392 
   1393       if (!replaced) {
   1394         // Not a recognized escape sequence, leave as-is
   1395         System.arraycopy(chars, i, escaped, pos, j - i);
   1396         pos += j - i;
   1397       }
   1398       i = j;
   1399     }
   1400     return new String(escaped, 0, pos);
   1401   }
   1402 
   1403   // Escaper for < and > only.
   1404   private static final CharEscaper LT_GT_ESCAPE =
   1405       new CharEscaperBuilder()
   1406         .addEscape('<', "&lt;")
   1407         .addEscape('>', "&gt;")
   1408         .toEscaper();
   1409 
   1410   private static final Pattern htmlTagPattern =
   1411       Pattern.compile("</?[a-zA-Z][^>]*>");
   1412 
   1413   /**
   1414    * Given a <code>String</code>, returns an equivalent <code>String</code> with
   1415    * all HTML tags stripped. Note that HTML entities, such as "&amp;amp;" will
   1416    * still be preserved.
   1417    */
   1418   public static String stripHtmlTags(String string) {
   1419     if ((string == null) || "".equals(string)) {
   1420       return string;
   1421     }
   1422     String stripped = htmlTagPattern.matcher(string).replaceAll("");
   1423     /*
   1424      * Certain inputs result in a well-formed HTML:
   1425      * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script>
   1426      * The following step ensures that no HTML can slip through by replacing all
   1427      * < and > characters with &lt; and &gt; after HTML tags were stripped.
   1428      */
   1429     return LT_GT_ESCAPE.escape(stripped);
   1430   }
   1431 
   1432   /**
   1433    * We escape some characters in s to be able to insert strings into JavaScript
   1434    * code. Also, make sure that we don't write out {@code -->} or
   1435    * {@code </script>}, which may close a script tag, or any char in ["'>] which
   1436    * might close a tag or attribute if seen inside an attribute.
   1437    */
   1438   public static String javaScriptEscape(CharSequence s) {
   1439     return javaScriptEscapeHelper(s, false);
   1440   }
   1441 
   1442   /**
   1443    * We escape some characters in s to be able to insert strings into JavaScript
   1444    * code. Also, make sure that we don't write out {@code -->} or
   1445    * {@code </script>}, which may close a script tag, or any char in ["'>] which
   1446    * might close a tag or attribute if seen inside an attribute.
   1447    * Turns all non-ascii characters into ASCII javascript escape sequences
   1448    * (eg \\uhhhh or \ooo).
   1449    */
   1450   public static String javaScriptEscapeToAscii(CharSequence s) {
   1451     return javaScriptEscapeHelper(s, true);
   1452   }
   1453 
  /**
   * Represents the type of javascript escaping to perform.  Each enum below
   * determines whether to use octal escapes and how to handle quotes.
   */
  public static enum JsEscapingMode {
    /** No octal escapes, pass-through ', and escape " as \". */
    JSON,

    /** Octal escapes, escapes " and ' to \42 and \47, respectively. */
    EMBEDDABLE_JS,

    /** Octal escapes, escapes ' and " to \' and \". */
    MINIMAL_JS
  }
   1468 
   1469   /**
   1470    * Helper for javaScriptEscape and javaScriptEscapeToAscii
   1471    */
   1472   private static String javaScriptEscapeHelper(CharSequence s,
   1473                                                boolean escapeToAscii) {
   1474     StringBuilder sb = new StringBuilder(s.length() * 9 / 8);
   1475     try {
   1476       escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb);
   1477     } catch (IOException ex) {
   1478       // StringBuilder.append does not throw IOExceptions.
   1479       throw new RuntimeException(ex);
   1480     }
   1481     return sb.toString();
   1482   }
   1483 
   1484   /**
   1485    * Appends the javascript string literal equivalent of plainText to the given
   1486    * out buffer.
   1487    * @param plainText the string to escape.
   1488    * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e]
   1489    *   <br>
   1490    *   Full escaping of unicode entites isn't required but this makes
   1491    *   sure that unicode strings will survive regardless of the
   1492    *   content-encoding of the javascript file which is important when
   1493    *   we use this function to autogenerated javascript source files.
   1494    *   This is disabled by default because it makes non-latin strings very long.
   1495    *   <br>
   1496    *   If you seem to have trouble with character-encodings, maybe
   1497    *   turn this on to see if the problem goes away.  If so, you need
   1498    *   to specify a character encoding for your javascript somewhere.
   1499    * @param jsEscapingMode determines the type of escaping to perform.
   1500    * @param out the buffer to append output to.
   1501    */
   1502   /*
   1503    * To avoid fallthrough, we would have to either use a hybrid switch-case/if
   1504    * approach (which would obscure our special handling for ' and "), duplicate
   1505    * the content of the default case, or pass a half-dozen parameters to a
   1506    * helper method containing the code from the default case.
   1507    */
   1508   @SuppressWarnings("fallthrough")
   1509   public static void escapeStringBody(
   1510       CharSequence plainText, boolean escapeToAscii,
   1511       JsEscapingMode jsEscapingMode, Appendable out)
   1512       throws IOException {
   1513     int pos = 0;  // Index just past the last char in plainText written to out.
   1514     int len = plainText.length();
   1515     for (int codePoint, charCount, i = 0; i < len; i += charCount) {
   1516       codePoint = Character.codePointAt(plainText, i);
   1517       charCount = Character.charCount(codePoint);
   1518 
   1519       if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) {
   1520         continue;
   1521       }
   1522 
   1523       out.append(plainText, pos, i);
   1524       pos = i + charCount;
   1525       switch (codePoint) {
   1526         case '\b': out.append("\\b"); break;
   1527         case '\t': out.append("\\t"); break;
   1528         case '\n': out.append("\\n"); break;
   1529         case '\f': out.append("\\f"); break;
   1530         case '\r': out.append("\\r"); break;
   1531         case '\\': out.append("\\\\"); break;
   1532         case '"': case '\'':
   1533           if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) {
   1534             // JSON does not escape a single quote (and it should be surrounded
   1535             // by double quotes).
   1536             out.append((char) codePoint);
   1537             break;
   1538           } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) {
   1539             out.append('\\').append((char) codePoint);
   1540             break;
   1541           }
   1542           // fall through
   1543         default:
   1544           if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) {
   1545             appendHexJavaScriptRepresentation(codePoint, out);
   1546           } else {
   1547             // Output the minimal octal encoding.  We can't use an encoding
   1548             // shorter than three digits if the next digit is a valid octal
   1549             // digit.
   1550             boolean pad = i + charCount >= len
   1551                 || isOctal(plainText.charAt(i + charCount));
   1552             appendOctalJavaScriptRepresentation((char) codePoint, pad, out);
   1553           }
   1554           break;
   1555       }
   1556     }
   1557     out.append(plainText, pos, len);
   1558   }
   1559 
   1560   /**
   1561    * Helper for escapeStringBody, which decides whether to escape a character.
   1562    */
   1563   private static boolean shouldEscapeChar(int codePoint,
   1564       boolean escapeToAscii, JsEscapingMode jsEscapingMode) {
   1565     // If non-ASCII chars should be escaped, identify non-ASCII code points.
   1566     if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) {
   1567       return true;
   1568     }
   1569 
   1570     // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS
   1571     // escaping rules will escape more characters than needed for JSON,
   1572     // but it is safe to escape any character in JSON.
   1573     // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be
   1574     //               shown that this change in legacy behavior is safe.
   1575     if (jsEscapingMode == JsEscapingMode.JSON) {
   1576       return mustEscapeCharInJsonString(codePoint)
   1577           || mustEscapeCharInJsString(codePoint);
   1578     }
   1579 
   1580     // Finally, just check the default JS escaping rules.
   1581     return mustEscapeCharInJsString(codePoint);
   1582   }
   1583 
   1584   /**
   1585    * Returns a javascript representation of the character in a hex escaped
   1586    * format.
   1587    *
   1588    * @param codePoint The codepoint to append.
   1589    * @param out The buffer to which the hex representation should be appended.
   1590    */
   1591   private static void appendHexJavaScriptRepresentation(
   1592       int codePoint, Appendable out)
   1593       throws IOException {
   1594     if (Character.isSupplementaryCodePoint(codePoint)) {
   1595       // Handle supplementary unicode values which are not representable in
   1596       // javascript.  We deal with these by escaping them as two 4B sequences
   1597       // so that they will round-trip properly when sent from java to javascript
   1598       // and back.
   1599       char[] surrogates = Character.toChars(codePoint);
   1600       appendHexJavaScriptRepresentation(surrogates[0], out);
   1601       appendHexJavaScriptRepresentation(surrogates[1], out);
   1602       return;
   1603     }
   1604     out.append("\\u")
   1605         .append(HEX_CHARS[(codePoint >>> 12) & 0xf])
   1606         .append(HEX_CHARS[(codePoint >>> 8) & 0xf])
   1607         .append(HEX_CHARS[(codePoint >>> 4) & 0xf])
   1608         .append(HEX_CHARS[codePoint & 0xf]);
   1609   }
   1610 
   1611   /**
   1612    * Returns a javascript representation of the character in a hex escaped
   1613    * format. Although this is a rather specific method, it is made public
   1614    * because it is also used by the JSCompiler.
   1615    *
   1616    * @param ch The character to append.
   1617    * @param pad true to force use of the full 3 digit representation.
   1618    * @param out The buffer to which the hex representation should be appended.
   1619    */
   1620   private static void appendOctalJavaScriptRepresentation(
   1621       char ch, boolean pad, Appendable out) throws IOException {
   1622     if (ch >= 0100
   1623         // Be paranoid at the end of a string since someone might call
   1624         // this method again with another string segment.
   1625         || pad) {
   1626       out.append('\\')
   1627           .append(OCTAL_CHARS[(ch >>> 6) & 0x7])
   1628           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
   1629           .append(OCTAL_CHARS[ch & 0x7]);
   1630     } else if (ch >= 010) {
   1631       out.append('\\')
   1632           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
   1633           .append(OCTAL_CHARS[ch & 0x7]);
   1634     } else {
   1635       out.append('\\')
   1636           .append(OCTAL_CHARS[ch & 0x7]);
   1637     }
   1638   }
   1639 
  /**
   * Appends the javascript hex escape for {@code c} to {@code sb}.
   * Although this is a rather specific method, it is made public
   * because it is also used by the JSCompiler.
   *
   * @param sb the builder to append to
   * @param c the character to write in escaped form
   * @see #appendHexJavaScriptRepresentation(int, Appendable)
   */
  public static void appendHexJavaScriptRepresentation(StringBuilder sb,
                                                       char c) {
    try {
      appendHexJavaScriptRepresentation(c, sb);
    } catch (IOException ex) {
      // StringBuilder does not throw IOException.
      throw new RuntimeException(ex);
    }
  }
   1655 
   1656   /**
   1657    * Undo escaping as performed in javaScriptEscape(.)
   1658    * Throws an IllegalArgumentException if the string contains
   1659    * bad escaping.
   1660    */
   1661   public static String javaScriptUnescape(String s) {
   1662     StringBuilder sb = new StringBuilder(s.length());
   1663     for (int i = 0; i < s.length(); ) {
   1664       char c = s.charAt(i);
   1665       if (c == '\\') {
   1666         i = javaScriptUnescapeHelper(s, i + 1, sb);
   1667       } else {
   1668         sb.append(c);
   1669         i++;
   1670       }
   1671     }
   1672     return sb.toString();
   1673   }
   1674 
   1675   /**
   1676    * Looks for an escape code starting at index i of s,
   1677    * and appends it to sb.
   1678    * @return the index of the first character in s
   1679    * after the escape code.
   1680    * @throws IllegalArgumentException if the escape code
   1681    * is invalid
   1682    */
   1683   private static int javaScriptUnescapeHelper(String s, int i,
   1684                                               StringBuilder sb) {
   1685     if (i >= s.length()) {
   1686       throw new IllegalArgumentException(
   1687           "End-of-string after escape character in [" + s + "]");
   1688     }
   1689 
   1690     char c = s.charAt(i++);
   1691     switch (c) {
   1692       case 'n': sb.append('\n'); break;
   1693       case 'r': sb.append('\r'); break;
   1694       case 't': sb.append('\t'); break;
   1695       case 'b': sb.append('\b'); break;
   1696       case 'f': sb.append('\f'); break;
   1697       case '\\':
   1698       case '\"':
   1699       case '\'':
   1700       case '>':
   1701         sb.append(c);
   1702         break;
   1703       case '0': case '1': case '2': case '3':
   1704       case '4': case '5': case '6': case '7':
   1705         --i;  // backup to first octal digit
   1706         int nOctalDigits = 1;
   1707         int digitLimit = c < '4' ? 3 : 2;
   1708         while (nOctalDigits < digitLimit && i + nOctalDigits < s.length()
   1709                && isOctal(s.charAt(i + nOctalDigits))) {
   1710           ++nOctalDigits;
   1711         }
   1712         sb.append(
   1713             (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
   1714         i += nOctalDigits;
   1715         break;
   1716       case 'x':
   1717       case 'u':
   1718         String hexCode;
   1719         int nHexDigits = (c == 'u' ? 4 : 2);
   1720         try {
   1721           hexCode = s.substring(i, i + nHexDigits);
   1722         } catch (IndexOutOfBoundsException ioobe) {
   1723           throw new IllegalArgumentException(
   1724               "Invalid unicode sequence [" + s.substring(i) + "] at index " + i
   1725               + " in [" + s + "]");
   1726         }
   1727         int unicodeValue;
   1728         try {
   1729           unicodeValue = Integer.parseInt(hexCode, 16);
   1730         } catch (NumberFormatException nfe) {
   1731           throw new IllegalArgumentException(
   1732               "Invalid unicode sequence [" + hexCode + "] at index " + i +
   1733               " in [" + s + "]");
   1734         }
   1735         sb.append((char) unicodeValue);
   1736         i += nHexDigits;
   1737         break;
   1738       default:
   1739         throw new IllegalArgumentException(
   1740             "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"
   1741             );
   1742     }
   1743 
   1744     return i;
   1745   }
   1746 
  // Characters that xmlCDataEscape removes from its input: the C0 control
  // characters (0x00 through 0x1F) other than \t, \n, and \r, plus 0xFFFE
  // and 0xFFFF.
  private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf(
      "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
      "\u0008\u000B\u000C\u000E\u000F" +
      "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
      "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
      "\uFFFE\uFFFF");
   1754 
   1755   /**
   1756    * Escape a string that is meant to be embedded in a CDATA section.
   1757    * The returned string is guaranteed to be valid CDATA content.
   1758    * The syntax of CDATA sections is the following:
   1759    * <blockquote>
   1760    *   <code>&lt;[!CDATA[...]]&gt;</code>
   1761    * </blockquote>
   1762    * The only invalid character sequence in a CDATA tag is "]]&gt;".
   1763    * If this sequence is present in the input string, we replace
   1764    * it by closing the current CDATA field, then write ']]&amp;gt;',
   1765    * then reopen a new CDATA section.
   1766    */
   1767   public static String xmlCDataEscape(String s) {
   1768      // Make sure there are no illegal control characters.
   1769      s = CONTROL_MATCHER.removeFrom(s);
   1770     // Return the original reference if the string doesn't have a match.
   1771     int found = s.indexOf("]]>");
   1772     if (found == -1) {
   1773       return s;
   1774     }
   1775 
   1776     // For each occurrence of "]]>", append a string that adds "]]&gt;" after
   1777     // the end of the CDATA which has just been closed, then opens a new CDATA.
   1778     StringBuilder sb = new StringBuilder();
   1779     int prev = 0;
   1780     do {
   1781       sb.append(s.substring(prev, found + 3));
   1782       sb.append("]]&gt;<![CDATA[");
   1783       prev = found + 3;
   1784     } while ((found = s.indexOf("]]>", prev)) != -1);
   1785     sb.append(s.substring(prev));
   1786     return sb.toString();
   1787   }
   1788 
  /**
   * Escapes some characters in {@code s} so that the string can be inserted
   * into Java code. Newline, carriage return, tab, backslash and both quote
   * characters are backslash-escaped, and {@code &}, {@code <} and {@code >}
   * are additionally replaced by HTML entities.
   *
   * @param s the text to escape
   * @return the escaped text
   * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link
   * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()}
   * instead. This method combines two forms of escaping in a way that's rarely
   * desired.
   */
  @Deprecated
  public static String javaEscape(String s) {
    return JAVA_ESCAPE.escape(s);
  }
   1801 
  // Escaper behind javaEscape(): backslash-escapes newline, carriage return,
  // tab, backslash and both quote characters, and also rewrites &, < and >
  // as the HTML entities &amp;, &lt; and &gt;.
  private static final CharEscaper JAVA_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('\n', "\\n")
        .addEscape('\r', "\\r")
        .addEscape('\t', "\\t")
        .addEscape('\\', "\\\\")
        .addEscape('\"', "\\\"")
        .addEscape('&', "&amp;")
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .addEscape('\'', "\\\'")
        .toEscaper();
   1815 
  /**
   * Escapes the special characters from a string so it can be used as part of
   * a regex pattern. This method is for use on gnu.regexp style regular
   * expressions. Each of the characters {@code ( ) | * + ? . { } [ ] $ ^} and
   * the backslash is prefixed with a backslash.
   *
   * @param s the text to escape
   * @return the escaped text
   * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not
   * be compatible with gnu.regexp style regular expressions.
   */
  @Deprecated
  public static String regexEscape(String s) {
    return REGEX_ESCAPE.escape(s);
  }
   1828 
  // Escaper behind regexEscape(): puts a backslash in front of each of the
  // regex characters listed below.
  private static final CharEscaper REGEX_ESCAPE =
      new CharEscaperBuilder()
        .addEscape('(', "\\(")
        .addEscape(')', "\\)")
        .addEscape('|', "\\|")
        .addEscape('*', "\\*")
        .addEscape('+', "\\+")
        .addEscape('?', "\\?")
        .addEscape('.', "\\.")
        .addEscape('{', "\\{")
        .addEscape('}', "\\}")
        .addEscape('[', "\\[")
        .addEscape(']', "\\]")
        .addEscape('$', "\\$")
        .addEscape('^', "\\^")
        .addEscape('\\', "\\\\")
        .toEscaper();
   1847 
   1848   /**
   1849    *  If you want to preserve the exact
   1850    * current (odd) behavior when {@code doStrip} is {@code true}, use
   1851    * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on
   1852    * the splitter.
   1853    *
   1854    * @param in what to process
   1855    * @param delimiter the delimiting string
   1856    * @return the tokens
   1857    * @deprecated see the detailed instructions under
   1858    *     {@link #split(String, String, boolean)}
   1859    */
   1860   @Deprecated
   1861   public static LinkedList<String> string2List(
   1862       String in, String delimiter, boolean doStrip) {
   1863     if (in == null) {
   1864       return null;
   1865     }
   1866 
   1867     LinkedList<String> out = new LinkedList<String>();
   1868     string2Collection(in, delimiter, doStrip, out);
   1869     return out;
   1870   }
   1871 
   1872   /**
   1873    * See the detailed instructions under {@link
   1874    * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to
   1875    * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to
   1876    * preserve the exact current (odd) behavior when {@code doStrip} is {@code
   1877    * true}, use {@code
   1878    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
   1879    * splitter.
   1880    *
   1881    * @param in what to process
   1882    * @param delimiter the delimiting string
   1883    * @param doStrip to strip the substrings before adding to the list
   1884    * @return the tokens
   1885    * @deprecated see the detailed instructions under
   1886    *     {@link #split(String, String, boolean)}
   1887    */
   1888   @Deprecated
   1889   public static Set<String> string2Set(
   1890        String in, String delimiter, boolean doStrip) {
   1891     if (in == null) {
   1892       return null;
   1893     }
   1894 
   1895     HashSet<String> out = new HashSet<String>();
   1896     string2Collection(in, delimiter, doStrip, out);
   1897     return out;
   1898   }
   1899 
   1900   /**
   1901    * See the detailed instructions under {@link
   1902    * #split(String, String, boolean)}. If you want to preserve the exact current
   1903    * (odd) behavior when {@code doStrip} is {@code true}, use {@code
   1904    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
   1905    * splitter.
   1906    *
   1907    * @param in The delimited input string to process
   1908    * @param delimiter The string delimiting entries in the input string.
   1909    * @param doStrip whether to strip the substrings before adding to the
   1910    *          collection
   1911    * @param collection The collection to which the strings will be added. If
   1912    *          <code>null</code>, a new <code>List</code> will be created.
   1913    * @return The collection to which the substrings were added. This is
   1914    *         syntactic sugar to allow call chaining.
   1915    * @deprecated see the detailed instructions under
   1916    *     {@link #split(String, String, boolean)}
   1917    */
   1918   @Deprecated
   1919   public static Collection<String> string2Collection(
   1920       String in,
   1921       String delimiter,
   1922       boolean doStrip,
   1923       Collection<String> collection) {
   1924     if (in == null) {
   1925       return null;
   1926     }
   1927     if (collection == null) {
   1928       collection = new ArrayList<String>();
   1929     }
   1930     if (delimiter == null || delimiter.length() == 0) {
   1931       collection.add(in);
   1932       return collection;
   1933     }
   1934 
   1935     int fromIndex = 0;
   1936     int pos;
   1937     while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) {
   1938       String interim = in.substring(fromIndex, pos);
   1939       if (doStrip) {
   1940         interim = strip(interim);
   1941       }
   1942       if (!doStrip || interim.length() > 0) {
   1943         collection.add(interim);
   1944       }
   1945 
   1946       fromIndex = pos + delimiter.length();
   1947     }
   1948 
   1949     String interim = in.substring(fromIndex);
   1950     if (doStrip) {
   1951       interim = strip(interim);
   1952     }
   1953     if (!doStrip || interim.length() > 0) {
   1954       collection.add(interim);
   1955     }
   1956 
   1957     return collection;
   1958   }
   1959 
   1960   /**
   1961    * This converts a string to a Map. It will first split the string into
   1962    * entries using delimEntry. Then each entry is split into a key and a value
   1963    * using delimKey. By default we strip the keys. Use doStripEntry to strip
   1964    * also the entries.
   1965    *
   1966    * Note that this method returns a {@link HashMap}, which means that entries
   1967    * will be in no particular order. See {@link #stringToOrderedMap}.
   1968    *
   1969    * @param in the string to be processed
   1970    * @param delimEntry delimiter for the entries
   1971    * @param delimKey delimiter between keys and values
   1972    * @param doStripEntry strip entries before inserting in the map
   1973    *
   1974    * @return HashMap
   1975    */
   1976   public static HashMap<String, String> string2Map(
   1977       String in, String delimEntry, String delimKey,
   1978       boolean doStripEntry) {
   1979     if (in == null) {
   1980       return null;
   1981     }
   1982 
   1983     return stringToMapImpl(new HashMap<String, String>(), in, delimEntry,
   1984         delimKey, doStripEntry);
   1985   }
   1986 
   1987   /**
   1988    * This converts a string to a Map, with entries in the same order as the
   1989    * key/value pairs in the input string. It will first split the string into
   1990    * entries using delimEntry. Then each entry is split into a key and a value
   1991    * using delimKey. By default we strip the keys. Use doStripEntry to strip
   1992    * also the entries.
   1993    *
   1994    * @param in the string to be processed
   1995    * @param delimEntry delimiter for the entries
   1996    * @param delimKey delimiter between keys and values
   1997    * @param doStripEntry strip entries before inserting in the map
   1998    *
   1999    * @return key/value pairs as a Map, in order
   2000    */
   2001   public static Map<String, String> stringToOrderedMap(
   2002       String in, String delimEntry, String delimKey,
   2003       boolean doStripEntry) {
   2004     if (in == null) {
   2005       return null;
   2006     }
   2007 
   2008     return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry,
   2009         delimKey, doStripEntry);
   2010   }
   2011 
   2012   /**
   2013    * This adds key/value pairs from the given string to the given Map.
   2014    * It will first split the string into entries using delimEntry. Then each
   2015    * entry is split into a key and a value using delimKey. By default we
   2016    * strip the keys. Use doStripEntry to strip also the entries.
   2017    *
   2018    * @param out - Map to output into
   2019    * @param in - the string to be processed
   2020    * @param delimEntry - delimiter for the entries
   2021    * @param delimKey - delimiter between keys and values
   2022    * @param doStripEntry - strip entries before inserting in the map
   2023    * @return out, for caller's convenience
   2024    */
   2025   private static <T extends Map<String, String>> T stringToMapImpl(T out,
   2026       String in, String delimEntry, String delimKey, boolean doStripEntry) {
   2027 
   2028     if (isEmpty(delimEntry) || isEmpty(delimKey)) {
   2029       out.put(strip(in), "");
   2030       return out;
   2031     }
   2032 
   2033     Iterator<String> it = string2List(in, delimEntry, false).iterator();
   2034     int len = delimKey.length();
   2035     while (it.hasNext()) {
   2036       String entry = it.next();
   2037       int pos = entry.indexOf(delimKey);
   2038       if (pos > 0) {
   2039         String value = entry.substring(pos + len);
   2040         if (doStripEntry) {
   2041           value = strip(value);
   2042         }
   2043         out.put(strip(entry.substring(0, pos)), value);
   2044       } else {
   2045         out.put(strip(entry), "");
   2046       }
   2047     }
   2048 
   2049     return out;
   2050   }
   2051 
   2052   /**
   2053    * This function concatenates the elements of a Map in a string with form
   2054    *  "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>"
   2055    *
   2056    * @param in - the map to be converted
   2057    * @param sepKey - the separator to put between key and value
   2058    * @param sepEntry - the separator to put between map entries
   2059    * @return String
   2060    * @deprecated create a {@link MapJoiner}, for example {@code
   2061    *     Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your
   2062    *     map is non-null and use this map joiner's {@link MapJoiner#join(Map)}
   2063    *     method. To preserve behavior exactly, just in-line this method call.
   2064    */
   2065   @Deprecated public static <K, V> String map2String(
   2066       Map<K, V> in, String sepKey, String sepEntry) {
   2067     return (in == null) ? null : Joiner
   2068         .on(sepEntry)
   2069         .useForNull("null")
   2070         .withKeyValueSeparator(sepKey)
   2071         .join(in);
   2072   }
   2073 
   2074   /**
   2075    * Given a map, creates and returns a new map in which all keys are the
   2076    * lower-cased version of each key.
   2077    *
   2078    * @param map A map containing String keys to be lowercased
   2079    * @throws IllegalArgumentException if the map contains duplicate string keys
   2080    *           after lower casing
   2081    */
   2082   public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) {
   2083     Map<String, V> result = new HashMap<String, V>(map.size());
   2084     for (Map.Entry<String, V> entry : map.entrySet()) {
   2085       String key = entry.getKey();
   2086       if (result.containsKey(key.toLowerCase())) {
   2087         throw new IllegalArgumentException(
   2088             "Duplicate string key in map when lower casing");
   2089       }
   2090       result.put(key.toLowerCase(), entry.getValue());
   2091     }
   2092     return result;
   2093   }
   2094 
   2095   /**
   2096    * Replaces any string of adjacent whitespace characters with the whitespace
   2097    * character " ".
   2098    *
   2099    * @param str the string you want to munge
   2100    * @return String with no more excessive whitespace!
   2101    * @deprecated ensure the string is not null and use {@code
   2102    *     CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider
   2103    *     whether you really want the legacy whitespace definition, or something
   2104    *     more standard like {@link CharMatcher#WHITESPACE}.
   2105    */
   2106   @Deprecated public static String collapseWhitespace(String str) {
   2107     return (str == null) ? null
   2108         : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ');
   2109   }
   2110 
   2111   /**
   2112    * Replaces any string of matched characters with the supplied string.<p>
   2113    *
   2114    * This is a more general version of collapseWhitespace.
   2115    *
   2116    * <pre>
   2117    *   E.g. collapse("hello     world", " ", "::")
   2118    *   will return the following string: "hello::world"
   2119    * </pre>
   2120    *
   2121    * @param str the string you want to munge
   2122    * @param chars all of the characters to be considered for munge
   2123    * @param replacement the replacement string
   2124    * @return munged and replaced string.
   2125    * @deprecated if {@code replacement} is the empty string, use {@link
   2126    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
   2127    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
   2128    *     replacement strings use {@link String#replaceAll(String, String)} with
   2129    *     a regular expression that matches one or more occurrences of {@code
   2130    *     chars}. In all cases you must first ensure that {@code str} is not
   2131    *     null.
   2132    */
   2133   @Deprecated public static String collapse(
   2134       String str, String chars, String replacement) {
   2135     if (str == null) {
   2136       return null;
   2137     }
   2138 
   2139     StringBuilder newStr = new StringBuilder();
   2140 
   2141     boolean prevCharMatched = false;
   2142     char c;
   2143     for (int i = 0; i < str.length(); i++) {
   2144       c = str.charAt(i);
   2145       if (chars.indexOf(c) != -1) {
   2146         // this character is matched
   2147         if (prevCharMatched) {
   2148           // apparently a string of matched chars, so don't append anything
   2149           // to the string
   2150           continue;
   2151         }
   2152         prevCharMatched = true;
   2153         newStr.append(replacement);
   2154       } else {
   2155         prevCharMatched = false;
   2156         newStr.append(c);
   2157       }
   2158     }
   2159 
   2160     return newStr.toString();
   2161   }
   2162 
   2163   /**
   2164    * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and
   2165    * 0x7F to 0x9F) replaced by the supplied string.  ISO control characters are
   2166    * identified via {@link Character#isISOControl(char)}.
   2167    *
   2168    * @param str the string you want to strip of ISO control chars
   2169    * @param replacement the replacement string
   2170    * @return a String with all control characters replaced by the replacement
   2171    * string, or null if input is null.
   2172    * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code
   2173    *     replacement} is the empty string, use {@link
   2174    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
   2175    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
   2176    *     replacement strings use
   2177    *     {@code str.replaceAll("\p{Cntrl}+", replacement)}.
   2178    *     In all cases you must first ensure that {@code str} is not null.
   2179    */
   2180   @Deprecated public static String collapseControlChars(
   2181       String str, String replacement) {
   2182     /*
   2183      * We re-implement the StringUtil.collapse() loop here rather than call
   2184      * collapse() with an input String of control chars, because matching via
   2185      * isISOControl() is about 10x faster.
   2186      */
   2187     if (str == null) {
   2188       return null;
   2189     }
   2190 
   2191     StringBuilder newStr = new StringBuilder();
   2192 
   2193     boolean prevCharMatched = false;
   2194     char c;
   2195     for (int i = 0; i < str.length(); i++) {
   2196       c = str.charAt(i);
   2197       if (Character.isISOControl(c)) {
   2198         // this character is matched
   2199         if (prevCharMatched) {
   2200           // apparently a string of matched chars, so don't append anything
   2201           // to the string
   2202           continue;
   2203         }
   2204         prevCharMatched = true;
   2205         newStr.append(replacement);
   2206       } else {
   2207         prevCharMatched = false;
   2208         newStr.append(c);
   2209       }
   2210     }
   2211 
   2212     return newStr.toString();
   2213   }
   2214 
   2215   /**
   2216    * Read a String of up to maxLength bytes from an InputStream.
   2217    *
   2218    * <p>Note that this method uses the default platform encoding, and expects
   2219    * that encoding to be single-byte, which is not always the case. Its use
   2220    * is discouraged. For reading the entire stream (maxLength == -1) you can use:
   2221    * <pre>
   2222    *   CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1))
   2223    * </pre>
   2224    * {@code CharStreams} is in the {@code com.google.common.io} package.
   2225    *
   2226    * <p>For maxLength >= 0 a literal translation would be
   2227    * <pre>
   2228    *   CharStreams.toString(new InputStreamReader(
   2229    *       new LimitInputStream(is, maxLength), Charsets.ISO_8859_1))
   2230    * </pre>
   2231    * For multi-byte encodings that is broken because the limit could end in
   2232    * the middle of the character--it would be better to limit the reader than
   2233    * the underlying stream.
   2234    *
   2235    * @param is input stream
   2236    * @param maxLength max number of bytes to read from "is". If this is -1, we
   2237    *          read everything.
   2238    *
   2239    * @return String up to maxLength bytes, read from "is"
   2240    * @deprecated see the advice above
   2241    */
   2242   @Deprecated public static String stream2String(InputStream is, int maxLength)
   2243       throws IOException {
   2244     byte[] buffer = new byte[4096];
   2245     StringWriter sw = new StringWriter();
   2246     int totalRead = 0;
   2247     int read = 0;
   2248 
   2249     do {
   2250       sw.write(new String(buffer, 0, read));
   2251       totalRead += read;
   2252       read = is.read(buffer, 0, buffer.length);
   2253     } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1));
   2254 
   2255     return sw.toString();
   2256   }
   2257 
   2258   /**
   2259    * Parse a list of substrings separated by a given delimiter. The delimiter
   2260    * can also appear in substrings (just double them):
   2261    *
   2262    * parseDelimitedString("this|is", '|') returns ["this","is"]
   2263    * parseDelimitedString("this||is", '|') returns ["this|is"]
   2264    *
   2265    * @param list String containing delimited substrings
   2266    * @param delimiter Delimiter (anything except ' ' is allowed)
   2267    *
   2268    * @return String[] A String array of parsed substrings
   2269    */
   2270   public static String[] parseDelimitedList(String list,
   2271                                             char delimiter) {
   2272     String delim = "" + delimiter;
   2273     // Append a sentinel of delimiter + space
   2274     // (see comments below for more info)
   2275     StringTokenizer st = new StringTokenizer(list + delim + " ",
   2276                                              delim,
   2277                                              true);
   2278     ArrayList<String> v = new ArrayList<String>();
   2279     String lastToken = "";
   2280     StringBuilder word = new StringBuilder();
   2281 
   2282     // We keep a sliding window of 2 tokens
   2283     //
   2284     // delimiter : delimiter -> append delimiter to current word
   2285     //                          and clear most recent token
   2286     //                          (so delim : delim : delim will not
   2287     //                          be treated as two escaped delims.)
   2288     //
   2289     // tok : delimiter -> append tok to current word
   2290     //
   2291     // delimiter : tok -> add current word to list, and clear it.
   2292     //                    (We append a sentinel that conforms to this
   2293     //                    pattern to make sure we've pushed every parsed token)
   2294     while (st.hasMoreTokens()) {
   2295       String tok = st.nextToken();
   2296       if (lastToken != null) {
   2297         if (tok.equals(delim)) {
   2298           word.append(lastToken);
   2299           if (lastToken.equals(delim)) { tok = null; }
   2300         } else {
   2301           if (word.length() != 0) {
   2302             v.add(word.toString());
   2303           }
   2304           word.setLength(0);
   2305         }
   2306       }
   2307       lastToken = tok;
   2308     }
   2309 
   2310     return v.toArray(new String[0]);
   2311   }
   2312 
   2313   /**
   2314    * Compares two strings, guarding against nulls.
   2315    *
   2316    * @param nullsAreGreater true if nulls should be greater than any string,
   2317    *  false is less than.
   2318    * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with
   2319    *     {@link com.google.common.collect.Ordering#nullsFirst()} or
   2320    *     {@link com.google.common.collect.Ordering#nullsLast()} if
   2321    *     needed
   2322    */
   2323   @Deprecated public static int compareToIgnoreCase(String s1, String s2,
   2324       boolean nullsAreGreater) {
   2325     if (s1 == s2) {
   2326       return 0; // Either both the same String, or both null
   2327     }
   2328     if (s1 == null) {
   2329       return nullsAreGreater ? 1 : -1;
   2330     }
   2331     if (s2 == null) {
   2332       return nullsAreGreater ? -1 : 1;
   2333     }
   2334     return s1.compareToIgnoreCase(s2);
   2335   }
   2336 
   2337   /**
   2338    * Splits s with delimiters in delimiter and returns the last token
   2339    */
   2340   public static String lastToken(String s, String delimiter) {
   2341     return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1);
   2342   }
   2343 
   2344   private static final Pattern characterReferencePattern =
   2345       Pattern.compile("&#?[a-zA-Z0-9]{1,8};");
   2346 
   2347   /**
   2348    * Determines if a string contains what looks like an html character
   2349    * reference. Useful for deciding whether unescaping is necessary.
   2350    */
   2351   public static boolean containsCharRef(String s) {
   2352     return characterReferencePattern.matcher(s).find();
   2353   }
   2354 
   2355   /**
   2356    * Determines if a string is a Hebrew word. A string is considered to be
   2357    * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters.
   2358    */
   2359   public static boolean isHebrew(String s) {
   2360     int len = s.length();
   2361     for (int i = 0; i < len; ++i) {
   2362       if (isHebrew(s.codePointAt(i))) {
   2363         return true;
   2364       }
   2365     }
   2366     return false;
   2367   }
   2368 
   2369   /**
   2370    * Determines if a character is a Hebrew character.
   2371    */
   2372   public static boolean isHebrew(int codePoint) {
   2373     return Character.UnicodeBlock.HEBREW.equals(
   2374                Character.UnicodeBlock.of(codePoint));
   2375   }
   2376 
   /**
    * Determines if a string is a CJK word. A string is considered to be CJK
    * if {@link #isCjk(int)} is true for the code point read at any of its
    * char indices.
    *
    * @param s the string to inspect
    * @return true if at least one position of {@code s} holds a CJK character
    */
   public static boolean isCjk(String s) {
     for (int index = 0, end = s.length(); index < end; index++) {
       int codePoint = s.codePointAt(index);
       if (isCjk(codePoint)) {
         return true;
       }
     }
     return false;
   }

   /**
    * Unicode code blocks containing CJK characters.
    */
   private static final Set<Character.UnicodeBlock> CJK_BLOCKS;
   static {
     Set<Character.UnicodeBlock> blocks = new HashSet<Character.UnicodeBlock>();
     Collections.addAll(blocks,
         Character.UnicodeBlock.HANGUL_JAMO,
         Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT,
         Character.UnicodeBlock.KANGXI_RADICALS,
         Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION,
         Character.UnicodeBlock.HIRAGANA,
         Character.UnicodeBlock.KATAKANA,
         Character.UnicodeBlock.BOPOMOFO,
         Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO,
         Character.UnicodeBlock.KANBUN,
         Character.UnicodeBlock.BOPOMOFO_EXTENDED,
         Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS,
         Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS,
         Character.UnicodeBlock.CJK_COMPATIBILITY,
         Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
         Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
         Character.UnicodeBlock.HANGUL_SYLLABLES,
         Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
         Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
         Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
         Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
         Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
     CJK_BLOCKS = Collections.unmodifiableSet(blocks);
   }

   /**
    * Determines if a character is a CJK ideograph or a character typically
    * used only in CJK text.
    *
    * Note: This function cannot handle supplementary characters. To handle all
    * Unicode characters, including supplementary characters, use the function
    * {@link #isCjk(int)}.
    *
    * @param ch the UTF-16 char to classify
    * @return the result of {@link #isCjk(int)} for the char's numeric value
    */
   public static boolean isCjk(char ch) {
     int codePoint = ch;
     return isCjk(codePoint);
   }

   /**
    * Determines if a character is a CJK ideograph or a character typically
    * used only in CJK text, by looking its Unicode block up in the set of
    * CJK blocks.
    *
    * @param codePoint the Unicode code point to classify
    * @return true if the code point's block is one of the CJK blocks
    */
   public static boolean isCjk(int codePoint) {
     // Time-saving early exit for all Latin-1 characters (nothing set above
     // the low eight bits).
     if ((codePoint >>> 8) == 0) {
       return false;
     }

     Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
     return CJK_BLOCKS.contains(block);
   }
   2445 
   2446   /**
   2447    * Returns the approximate display width of the string, measured in units of
   2448    * ascii characters.
   2449    *
   2450    * @see StringUtil#displayWidth(char)
   2451    */
   2452   public static int displayWidth(String s) {
   2453     // TODO(kevinb): could reimplement this as
   2454     // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s);
   2455     int width = 0;
   2456     int len = s.length();
   2457     for (int i = 0; i < len; ++i) {
   2458       width += displayWidth(s.charAt(i));
   2459     }
   2460     return width;
   2461   }
   2462 
   2463   /**
   2464    * Returns the approximate display width of the character, measured
   2465    * in units of ascii characters.
   2466    *
   2467    * This method should err on the side of caution. By default, characters
   2468    * are assumed to have width 2; this covers CJK ideographs, various
   2469    * symbols and miscellaneous weird scripts. Given below are some Unicode
   2470    * ranges for which it seems safe to assume that no character is
   2471    * substantially wider than an ascii character:
   2472    *   - Latin, extended Latin, even more extended Latin.
   2473    *   - Greek, extended Greek, Cyrillic.
   2474    *   - Some symbols (including currency symbols) and punctuation.
   2475    *   - Half-width Katakana and Hangul.
   2476    *   - Hebrew
   2477    *   - Arabic
   2478    *   - Thai
   2479    * Characters in these ranges are given a width of 1.
   2480    *
   2481    * IMPORTANT: this function has analogs in C++ (encodingutils.cc,
   2482    * named UnicodeCharWidth) and JavaScript
   2483    * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js),
   2484    * which need to be updated if you change the implementation here.
   2485    */
   2486   public static int displayWidth(char ch) {
   2487     if (ch <= '\u04f9' ||   // CYRILLIC SMALL LETTER YERU WITH DIAERESIS
   2488         ch == '\u05be' ||   // HEBREW PUNCTUATION MAQAF
   2489         (ch >= '\u05d0' && ch <= '\u05ea') ||  // HEBREW LETTER ALEF ... TAV
   2490         ch == '\u05F3' ||   // HEBREW PUNCTUATION GERESH
   2491         ch == '\u05f4' ||   // HEBREW PUNCTUATION GERSHAYIM
   2492         (ch >= '\u0600' && ch <= '\u06ff') || // Block=Arabic
   2493         (ch >= '\u0750' && ch <= '\u077f') || // Block=Arabic_Supplement
   2494         (ch >= '\ufb50' && ch <= '\ufdff') || // Block=Arabic_Presentation_Forms-A
   2495         (ch >= '\ufe70' && ch <= '\ufeff') || // Block=Arabic_Presentation_Forms-B
   2496         (ch >= '\u1e00' && ch <= '\u20af') || /* LATIN CAPITAL LETTER A WITH RING BELOW
   2497                                                  ... DRACHMA SIGN */
   2498         (ch >= '\u2100' && ch <= '\u213a') || // ACCOUNT OF ... ROTATED CAPITAL Q
   2499         (ch >= '\u0e00' && ch <= '\u0e7f') || // Thai
   2500         (ch >= '\uff61' && ch <= '\uffdc')) { /* HALFWIDTH IDEOGRAPHIC FULL STOP
   2501                                                  ... HALFWIDTH HANGUL LETTER I */
   2502       return 1;
   2503     }
   2504     return 2;
   2505   }
   2506 
   2507   /**
   2508    * @return a string representation of the given native array.
   2509    */
   2510   public static String toString(float[] iArray) {
   2511     if (iArray == null) {
   2512       return "NULL";
   2513     }
   2514 
   2515     StringBuilder buffer = new StringBuilder();
   2516     buffer.append("[");
   2517     for (int i = 0; i < iArray.length; i++) {
   2518       buffer.append(iArray[i]);
   2519       if (i != (iArray.length - 1)) {
   2520         buffer.append(", ");
   2521       }
   2522     }
   2523     buffer.append("]");
   2524     return buffer.toString();
   2525   }
   2526 
   2527   /**
   2528    * @return a string representation of the given native array.
   2529    */
   2530   public static String toString(long[] iArray) {
   2531     if (iArray == null) {
   2532       return "NULL";
   2533     }
   2534 
   2535     StringBuilder buffer = new StringBuilder();
   2536     buffer.append("[");
   2537     for (int i = 0; i < iArray.length; i++) {
   2538       buffer.append(iArray[i]);
   2539       if (i != (iArray.length - 1)) {
   2540         buffer.append(", ");
   2541       }
   2542     }
   2543     buffer.append("]");
   2544     return buffer.toString();
   2545   }
   2546 
   2547   /**
   2548    * @return a string representation of the given native array
   2549    */
   2550   public static String toString(int[] iArray) {
   2551     if (iArray == null) {
   2552       return "NULL";
   2553     }
   2554 
   2555     StringBuilder buffer = new StringBuilder();
   2556     buffer.append("[");
   2557     for (int i = 0; i < iArray.length; i++) {
   2558       buffer.append(iArray[i]);
   2559       if (i != (iArray.length - 1)) {
   2560         buffer.append(", ");
   2561       }
   2562     }
   2563     buffer.append("]");
   2564     return buffer.toString();
   2565   }
   2566 
   2567   /**
   2568    * @return a string representation of the given array.
   2569    */
   2570   public static String toString(String[] iArray) {
   2571     if (iArray == null) { return "NULL"; }
   2572 
   2573     StringBuilder buffer = new StringBuilder();
   2574     buffer.append("[");
   2575     for (int i = 0; i < iArray.length; i++) {
   2576       buffer.append("'").append(iArray[i]).append("'");
   2577       if (i != iArray.length - 1) {
   2578         buffer.append(", ");
   2579       }
   2580     }
   2581     buffer.append("]");
   2582 
   2583     return buffer.toString();
   2584   }
   2585 
   2586   /**
   2587    * Returns the string, in single quotes, or "NULL". Intended only for
   2588    * logging.
   2589    *
   2590    * @param s the string
   2591    * @return the string, in single quotes, or the string "null" if it's null.
   2592    */
   2593   public static String toString(String s) {
   2594     if (s == null) {
   2595       return "NULL";
   2596     } else {
   2597       return new StringBuilder(s.length() + 2).append("'").append(s)
   2598                                               .append("'").toString();
   2599     }
   2600   }
   2601 
   2602   /**
   2603    * @return a string representation of the given native array
   2604    */
   2605   public static String toString(int[][] iArray) {
   2606     if (iArray == null) {
   2607       return "NULL";
   2608     }
   2609 
   2610     StringBuilder buffer = new StringBuilder();
   2611     buffer.append("[");
   2612     for (int i = 0; i < iArray.length; i++) {
   2613       buffer.append("[");
   2614       for (int j = 0; j < iArray[i].length; j++) {
   2615         buffer.append(iArray[i][j]);
   2616         if (j != (iArray[i].length - 1)) {
   2617           buffer.append(", ");
   2618         }
   2619       }
   2620       buffer.append("]");
   2621       if (i != iArray.length - 1) {
   2622         buffer.append(" ");
   2623       }
   2624     }
   2625     buffer.append("]");
   2626     return buffer.toString();
   2627   }
   2628 
   2629   /**
   2630    * @return a string representation of the given native array.
   2631    */
   2632   public static String toString(long[][] iArray) {
   2633     if (iArray == null) { return "NULL"; }
   2634 
   2635     StringBuilder buffer = new StringBuilder();
   2636     buffer.append("[");
   2637     for (int i = 0; i < iArray.length; i++) {
   2638       buffer.append("[");
   2639       for (int j = 0; j < iArray[i].length; j++) {
   2640         buffer.append(iArray[i][j]);
   2641         if (j != (iArray[i].length - 1)) {
   2642           buffer.append(", ");
   2643         }
   2644       }
   2645       buffer.append("]");
   2646       if (i != iArray.length - 1) {
   2647         buffer.append(" ");
   2648       }
   2649     }
   2650     buffer.append("]");
   2651     return buffer.toString();
   2652   }
   2653 
   2654   /**
   2655    * @return a String representation of the given object array.
   2656    * The strings are obtained by calling toString() on the
   2657    * underlying objects.
   2658    */
   2659   public static String toString(Object[] obj) {
   2660     if (obj == null) { return "NULL"; }
   2661     StringBuilder tmp = new StringBuilder();
   2662     tmp.append("[");
   2663     for (int i = 0; i < obj.length; i++) {
   2664       tmp.append(obj[i].toString());
   2665       if (i != obj.length - 1) {
   2666         tmp.append(",");
   2667       }
   2668     }
   2669     tmp.append("]");
   2670     return tmp.toString();
   2671   }
   2672 
   2673   private static final char[] HEX_CHARS
   2674       = { '0', '1', '2', '3', '4', '5', '6', '7',
   2675           '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
   2676   private static final char[] OCTAL_CHARS = HEX_CHARS;  // ignore the last 8 :)
   2677 
   2678   /**
   2679    * Convert a byte array to a hex-encoding string: "a33bff00..."
   2680    *
   2681    * @deprecated Use {@link ByteArrays#toHexString}.
   2682    */
   2683   @Deprecated public static String bytesToHexString(final byte[] bytes) {
   2684     return ByteArrays.toHexString(bytes);
   2685   }
   2686 
   2687   /**
   2688    * Convert a byte array to a hex-encoding string with the specified
   2689    * delimiter: "a3&lt;delimiter&gt;3b&lt;delimiter&gt;ff..."
   2690    */
   2691   public static String bytesToHexString(final byte[] bytes,
   2692       Character delimiter) {
   2693     StringBuilder hex =
   2694       new StringBuilder(bytes.length * (delimiter == null ? 2 : 3));
   2695     int nibble1, nibble2;
   2696     for (int i = 0; i < bytes.length; i++) {
   2697       nibble1 = (bytes[i] >>> 4) & 0xf;
   2698       nibble2 = bytes[i] & 0xf;
   2699       if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); }
   2700       hex.append(HEX_CHARS[nibble1]);
   2701       hex.append(HEX_CHARS[nibble2]);
   2702     }
   2703     return hex.toString();
   2704   }
   2705 
   2706   /**
   2707    * Safely convert the string to uppercase.
   2708    * @return upper case representation of the String; or null if
   2709    * the input string is null.
   2710    */
   2711   public static String toUpperCase(String src) {
   2712     if (src == null) {
   2713       return null;
   2714     } else {
   2715       return src.toUpperCase();
   2716     }
   2717   }
   2718 
   2719   /**
   2720    * Safely convert the string to lowercase.
   2721    * @return lower case representation of the String; or null if
   2722    * the input string is null.
   2723    */
   2724   public static String toLowerCase(String src) {
   2725     if (src == null) {
   2726       return null;
   2727     } else {
   2728       return src.toLowerCase();
   2729     }
   2730   }
   2731 
   2732   private static final Pattern dbSpecPattern =
   2733       Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)");
   2734 
   2735   /**
   2736    * @param dbSpecComponent a single component of a DBDescriptor spec
   2737    * (e.g. the host or database component). The expected format of the string is:
   2738    * <br>
   2739    *             <center>(prefix){(digits),(digits)}(suffix)</center>
   2740    * </br>
   2741    * @return a shard expansion of the given String.
   2742    * Note that unless the pattern is matched exactly, no expansion is
   2743    * performed and the original string is returned unaltered.
   2744    * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'.
   2745    * Note that this method is added to StringUtil instead of
   2746    * DBDescriptor to better encapsulate the choice of regexp implementation.
   2747    * @throws IllegalArgumentException if the string does not parse.
   2748    */
   2749   public static String expandShardNames(String dbSpecComponent)
   2750       throws IllegalArgumentException, IllegalStateException {
   2751 
   2752     Matcher matcher = dbSpecPattern.matcher(dbSpecComponent);
   2753     if (matcher.find()) {
   2754       try {
   2755         String prefix = dbSpecComponent.substring(
   2756           matcher.start(1), matcher.end(1));
   2757         int minShard =
   2758           Integer.parseInt(
   2759             dbSpecComponent.substring(
   2760               matcher.start(2), matcher.end(2)));
   2761         int maxShard =
   2762           Integer.parseInt(
   2763             dbSpecComponent.substring(
   2764               matcher.start(3), matcher.end(3)));
   2765         String suffix = dbSpecComponent.substring(
   2766           matcher.start(4), matcher.end(4));
   2767         //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix);
   2768         if (minShard > maxShard) {
   2769           throw new IllegalArgumentException(
   2770             "Maximum shard must be greater than or equal to " +
   2771             "the minimum shard");
   2772         }
   2773         StringBuilder tmp = new StringBuilder();
   2774         for (int shard = minShard; shard <= maxShard; shard++) {
   2775           tmp.append(prefix).append(shard).append(suffix);
   2776           if (shard != maxShard) {
   2777             tmp.append(",");
   2778           }
   2779         }
   2780         return tmp.toString();
   2781       } catch (NumberFormatException nfex) {
   2782         throw new IllegalArgumentException(
   2783           "Malformed DB specification component: " + dbSpecComponent);
   2784       }
   2785     } else {
   2786       return dbSpecComponent;
   2787     }
   2788   }
   2789 
   2790 
   2791   /**
   2792   * Returns a string that is equivalent to the specified string with its
   2793   * first character converted to uppercase as by {@link String#toUpperCase()}.
   2794   * The returned string will have the same value as the specified string if
   2795   * its first character is non-alphabetic, if its first character is already
   2796   * uppercase, or if the specified string is of length 0.
   2797   *
   2798   * <p>For example:
   2799   * <pre>
   2800   *    capitalize("foo bar").equals("Foo bar");
   2801   *    capitalize("2b or not 2b").equals("2b or not 2b")
   2802   *    capitalize("Foo bar").equals("Foo bar");
   2803   *    capitalize("").equals("");
   2804   * </pre>
   2805   *
   2806   * @param s the string whose first character is to be uppercased
   2807   * @return a string equivalent to <tt>s</tt> with its first character
   2808   *     converted to uppercase
   2809   * @throws NullPointerException if <tt>s</tt> is null
   2810   */
   2811   public static String capitalize(String s) {
   2812     if (s.length() == 0) {
   2813       return s;
   2814     }
   2815     char first = s.charAt(0);
   2816     char capitalized = Character.toUpperCase(first);
   2817     return (first == capitalized)
   2818         ? s
   2819         : capitalized + s.substring(1);
   2820   }
   2821 
   2822   /**
   2823    * Examine a string to see if it starts with a given prefix (case
   2824    * insensitive). Just like String.startsWith() except doesn't
   2825    * respect case. Strings are compared in the same way as in
   2826    * {@link String#equalsIgnoreCase}.
   2827    *
   2828    * @param str the string to examine
   2829    * @param prefix the prefix to look for
   2830    * @return a boolean indicating if str starts with prefix (case insensitive)
   2831    */
   2832   public static boolean startsWithIgnoreCase(String str, String prefix) {
   2833     return str.regionMatches(true, 0, prefix, 0, prefix.length());
   2834   }
   2835 
   2836   /**
   2837    * Examine a string to see if it ends with a given suffix (case
   2838    * insensitive). Just like String.endsWith() except doesn't respect
   2839    * case. Strings are compared in the same way as in
   2840    * {@link String#equalsIgnoreCase}.
   2841    *
   2842    * @param str the string to examine
   2843    * @param suffix the suffix to look for
   2844    * @return a boolean indicating if str ends with suffix (case insensitive)
   2845    */
   2846   public static boolean endsWithIgnoreCase(String str, String suffix) {
   2847     int len = suffix.length();
   2848     return str.regionMatches(true, str.length() - len, suffix, 0, len);
   2849   }
   2850 
   2851   /**
   2852    * @param c one codePoint
   2853    * @return the number of bytes needed to encode this codePoint in UTF-8
   2854    */
   2855   private static int bytesUtf8(int c) {
   2856     if (c < 0x80) {
   2857       return 1;
   2858     } else if (c < 0x00800) {
   2859       return 2;
   2860     } else if (c < 0x10000) {
   2861       return 3;
   2862     } else if (c < 0x200000) {
   2863       return 4;
   2864 
   2865     // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF,
   2866     // so if the caller respects this RFC, this should not happen
   2867     } else if (c < 0x4000000) {
   2868       return 5;
   2869     } else {
   2870       return 6;
   2871     }
   2872   }
   2873 
   2874   /**
   2875    * @param str a string
   2876    * @return the number of bytes required to represent this string in UTF-8
   2877    */
   2878   public static int bytesStorage(String str) {
   2879     // offsetByCodePoint has a bug if its argument is the result of a
   2880     // call to substring. To avoid this, we create a new String
   2881     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
   2882     String s = new String(str);
   2883 
   2884     int len = 0;
   2885     for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) {
   2886       len += bytesUtf8(s.codePointAt(i));
   2887     }
   2888     return len;
   2889   }
   2890 
   2891   /**
   2892    * @param str a string
   2893    * @param maxbytes
   2894    * @return the beginning of the string, so that it uses less than
   2895    *     maxbytes bytes in UTF-8
   2896    * @throws IndexOutOfBoundsException if maxbytes is negative
   2897    */
   2898   public static String truncateStringForUtf8Storage(String str, int maxbytes) {
   2899     if (maxbytes < 0) {
   2900       throw new IndexOutOfBoundsException();
   2901     }
   2902 
   2903     // offsetByCodePoint has a bug if its argument is the result of a
   2904     // call to substring. To avoid this, we create a new String
   2905     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
   2906     // TODO(cquinn): should be fixed as of 1.5.0_01
   2907     String s = new String(str);
   2908 
   2909     int codepoints = 0;
   2910     int bytesUsed = 0;
   2911     for (codepoints = 0; codepoints < s.length();
   2912         codepoints = s.offsetByCodePoints(codepoints, 1)) {
   2913       int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints));
   2914       if (bytesUsed + glyphBytes > maxbytes) {
   2915         break;
   2916       }
   2917       bytesUsed += glyphBytes;
   2918     }
   2919     return s.substring(0, codepoints);
   2920   }
   2921 
   2922   /**
   2923    * If the given string is of length {@code maxLength} or less, then it is
   2924    * returned as is.
   2925    * If the string is longer than {@code maxLength}, the returned string is
   2926    * truncated before the last space character on or before
   2927    * {@code source.charAt(maxLength)}. If the string has no spaces, the
   2928    * returned string is truncated to {@code maxLength}.
   2929    *
   2930    * @param source the string to truncate if necessary
   2931    * @param maxLength
   2932    * @return the original string if its length is less than or equal to
   2933    *     maxLength, otherwise a truncated string as mentioned above
   2934    */
   2935   public static String truncateIfNecessary(String source, int maxLength) {
   2936     if (source.length() <= maxLength) {
   2937       return source;
   2938     }
   2939     String str = unicodePreservingSubstring(source, 0, maxLength);
   2940 
   2941     @SuppressWarnings("deprecation") // we'll make this go away before that does
   2942     CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE;
   2943     String truncated = whitespaceMatcher.trimTrailingFrom(str);
   2944 
   2945     // We may have had multiple spaces at maxLength, which were stripped away
   2946     if (truncated.length() < maxLength) {
   2947       return truncated;
   2948     }
   2949     // We have a truncated string of length maxLength. If the next char was a
   2950     // space, we truncated at a word boundary, so we can return immediately
   2951     if (Character.isSpaceChar(source.charAt(maxLength))) {
   2952       return truncated;
   2953     }
   2954     // We truncated in the middle of the word. Try to truncate before
   2955     // the last space, if it exists. Otherwise, return the truncated string
   2956     for (int i = truncated.length() - 1; i >= 0; --i) {
   2957       if (Character.isSpaceChar(truncated.charAt(i))) {
   2958         String substr = truncated.substring(0, i);
   2959         return whitespaceMatcher.trimTrailingFrom(substr);
   2960       }
   2961     }
   2962     return truncated;
   2963   }
   2964 
   2965   /**
   2966    * If this given string is of length {@code maxLength} or less, it will
   2967    * be returned as-is.
   2968    * Otherwise it will be trucated to {@code maxLength}, regardless of whether
   2969    * there are any space characters in the String. If an ellipsis is requested
   2970    * to be appended to the truncated String, the String will be truncated so
   2971    * that the ellipsis will also fit within maxLength.
   2972    * If no truncation was necessary, no ellipsis will be added.
   2973    *
   2974    * @param source the String to truncate if necessary
   2975    * @param maxLength the maximum number of characters to keep
   2976    * @param addEllipsis if true, and if the String had to be truncated,
   2977    *     add "..." to the end of the String before returning. Additionally,
   2978    *     the ellipsis will only be added if maxLength is greater than 3.
   2979    * @return the original string if its length is less than or equal to
   2980    *     maxLength, otherwise a truncated string as mentioned above
   2981    */
   2982   public static String truncateAtMaxLength(String source, int maxLength,
   2983       boolean addEllipsis) {
   2984 
   2985     if (source.length() <= maxLength) {
   2986       return source;
   2987     }
   2988     if (addEllipsis && maxLength > 3) {
   2989       return unicodePreservingSubstring(source, 0, maxLength - 3) + "...";
   2990     }
   2991     return unicodePreservingSubstring(source, 0, maxLength);
   2992   }
   2993 
   2994   /**
   2995    * Normalizes {@code index} such that it respects Unicode character
   2996    * boundaries in {@code str}.
   2997    *
   2998    * <p>If {@code index} is the low surrogate of a unicode character,
   2999    * the method returns {@code index - 1}. Otherwise, {@code index} is
   3000    * returned.
   3001    *
   3002    * <p>In the case in which {@code index} falls in an invalid surrogate pair
   3003    * (e.g. consecutive low surrogates, consecutive high surrogates), or if
   3004    * if it is not a valid index into {@code str}, the original value of
   3005    * {@code index} is returned.
   3006    *
   3007    * @param str the String
   3008    * @param index the index to be normalized
   3009    * @return a normalized index that does not split a Unicode character
   3010    */
   3011   public static int unicodePreservingIndex(String str, int index) {
   3012     if (index > 0 && index < str.length()) {
   3013       if (Character.isHighSurrogate(str.charAt(index - 1)) &&
   3014           Character.isLowSurrogate(str.charAt(index))) {
   3015         return index - 1;
   3016       }
   3017     }
   3018     return index;
   3019   }
   3020 
   3021   /**
   3022    * Returns a substring of {@code str} that respects Unicode character
   3023    * boundaries.
   3024    *
   3025    * <p>The string will never be split between a [high, low] surrogate pair,
   3026    * as defined by {@link Character#isHighSurrogate} and
   3027    * {@link Character#isLowSurrogate}.
   3028    *
   3029    * <p>If {@code begin} or {@code end} are the low surrogate of a unicode
   3030    * character, it will be offset by -1.
   3031    *
   3032    * <p>This behavior guarantees that
   3033    * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) +
   3034    *     StringUtil.unicodePreservingSubstring(str, n, str.length())) } is
   3035    * true for all {@code n}.
   3036    * </pre>
   3037    *
   3038    * <p>This means that unlike {@link String#substring(int, int)}, the length of
   3039    * the returned substring may not necessarily be equivalent to
   3040    * {@code end - begin}.
   3041    *
   3042    * @param str the original String
   3043    * @param begin the beginning index, inclusive
   3044    * @param end the ending index, exclusive
   3045    * @return the specified substring, possibly adjusted in order to not
   3046    *   split unicode surrogate pairs
   3047    * @throws IndexOutOfBoundsException if the {@code begin} is negative,
   3048    *   or {@code end} is larger than the length of {@code str}, or
   3049    *   {@code begin} is larger than {@code end}
   3050    */
   3051   public static String unicodePreservingSubstring(
   3052       String str, int begin, int end) {
   3053     return str.substring(unicodePreservingIndex(str, begin),
   3054         unicodePreservingIndex(str, end));
   3055   }
   3056 
   3057   /**
   3058    * Equivalent to:
   3059    *
   3060    * <pre>
   3061    * {@link #unicodePreservingSubstring(String, int, int)}(
   3062    *     str, begin, str.length())
   3063    * </pre>
   3064    */
   3065   public static String unicodePreservingSubstring(String str, int begin) {
   3066     return unicodePreservingSubstring(str, begin, str.length());
   3067   }
   3068 
   3069   /**
   3070    * True iff the given character needs to be escaped in a javascript string
   3071    * literal.
   3072    * <p>
   3073    * We need to escape the following characters in javascript string literals.
   3074    * <dl>
   3075    * <dt> \           <dd> the escape character
   3076    * <dt> ', "        <dd> string delimiters.
   3077    *                       TODO(msamuel): what about backticks (`) which are
   3078    *                       non-standard but recognized as attribute delimiters.
   3079    * <dt> &, <, >, =  <dd> so that a string literal can be embedded in XHTML
   3080    *                       without further escaping.
   3081    * </dl>
   3082    * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7
   3083    * attacks?
   3084    * <p>
   3085    * Unicode format control characters (category Cf) must be escaped since they
   3086    * are removed by javascript parser in a pre-lex pass.
   3087    * <br>According to EcmaScript 262 Section 7.1:
   3088    * <blockquote>
   3089    *     The format control characters can occur anywhere in the source text of
   3090    *     an ECMAScript program. These characters are removed from the source
   3091    *     text before applying the lexical grammar.
   3092    * </blockquote>
   3093    * <p>
   3094    * Additionally, line terminators are not allowed to appear inside strings
   3095    * and Section 7.3 says
   3096    * <blockquote>
   3097    *     The following characters are considered to be line terminators:<pre>
   3098    *         Code Point Value   Name                  Formal Name
   3099    *         \u000A             Line Feed             [LF]
   3100    *         \u000D             Carriage Return       [CR]
   3101    *         \u2028             Line separator        [LS]
   3102    *         \u2029             Paragraph separator   [PS]
   3103    * </pre></blockquote>
   3104    *
   3105    * @param codepoint a char instead of an int since the javascript language
   3106    *    does not support extended unicode.
   3107    */
   3108   static boolean mustEscapeCharInJsString(int codepoint) {
   3109     return JS_ESCAPE_CHARS.contains(codepoint);
   3110   }
   3111 
   3112   /**
   3113    * True iff the given character needs to be escaped in a JSON string literal.
   3114    * <p>
   3115    * We need to escape the following characters in JSON string literals.
   3116    * <dl>
   3117    * <dt> \           <dd> the escape character
   3118    * <dt> "           <dd> string delimiter
   3119    * <dt> 0x00 - 0x1F <dd> control characters
   3120    * </dl>
   3121    * <p>
   3122    * See EcmaScript 262 Section 15.12.1 for the full JSON grammar.
   3123    */
   3124   static boolean mustEscapeCharInJsonString(int codepoint) {
   3125     return JSON_ESCAPE_CHARS.contains(codepoint);
   3126   }
   3127 
   3128   /**
   3129    * Builds a small set of code points.
   3130    * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's
   3131    * {@code UnicodeSet}.
   3132    * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}.
   3133    */
   3134   private static class UnicodeSetBuilder {
   3135     Set<Integer> codePointSet = new HashSet<Integer>();
   3136 
   3137     UnicodeSetBuilder addCodePoint(int c) {
   3138       codePointSet.add(c);
   3139       return this;
   3140     }
   3141 
   3142     UnicodeSetBuilder addRange(int from, int to) {
   3143       for (int i = from; i <= to; i++) {
   3144         codePointSet.add(i);
   3145       }
   3146       return this;
   3147     }
   3148 
   3149     Set<Integer> create() {
   3150       return codePointSet;
   3151     }
   3152   }
   3153 
   3154   private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder()
   3155       // All characters in the class of format characters, [:Cf:].
   3156       // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp.
   3157       .addCodePoint(0xAD)
   3158       .addRange(0x600, 0x603)
   3159       .addCodePoint(0x6DD)
   3160       .addCodePoint(0x070F)
   3161       .addRange(0x17B4, 0x17B5)
   3162       .addRange(0x200B, 0x200F)
   3163       .addRange(0x202A, 0x202E)
   3164       .addRange(0x2060, 0x2064)
   3165       .addRange(0x206A, 0x206F)
   3166       .addCodePoint(0xFEFF)
   3167       .addRange(0xFFF9, 0xFFFB)
   3168       .addRange(0x0001D173, 0x0001D17A)
   3169       .addCodePoint(0x000E0001)
   3170       .addRange(0x000E0020, 0x000E007F)
   3171       // Plus characters mentioned in the docs of mustEscapeCharInJsString().
   3172       .addCodePoint(0x0000)
   3173       .addCodePoint(0x000A)
   3174       .addCodePoint(0x000D)
   3175       .addRange(0x2028, 0x2029)
   3176       .addCodePoint(0x0085)
   3177       .addCodePoint(Character.codePointAt("'", 0))
   3178       .addCodePoint(Character.codePointAt("\"", 0))
   3179       .addCodePoint(Character.codePointAt("&", 0))
   3180       .addCodePoint(Character.codePointAt("<", 0))
   3181       .addCodePoint(Character.codePointAt(">", 0))
   3182       .addCodePoint(Character.codePointAt("=", 0))
   3183       .addCodePoint(Character.codePointAt("\\", 0))
   3184       .create();
   3185 
   3186   private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder()
   3187       .addCodePoint(Character.codePointAt("\"", 0))
   3188       .addCodePoint(Character.codePointAt("\\", 0))
   3189       .addRange(0x0000, 0x001F)
   3190       .create();
   3191 
   3192   /**
   3193    * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead.
   3194    */
   3195   public static String xmlEscape(String s) {
   3196     return CharEscapers.xmlEscaper().escape(s);
   3197   }
   3198 
   3199   /**
   3200    * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead.
   3201    */
   3202   public static String htmlEscape(String s) {
   3203     return CharEscapers.asciiHtmlEscaper().escape(s);
   3204   }
   3205 }