Home | History | Annotate | Download | only in base
      1 /**
      2  * Copyright (c) 2000, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.android.mail.common.base;
     18 
     19 import static com.google.android.mail.common.base.Preconditions.checkArgument;
     20 
     21 import com.google.common.base.Joiner;
     22 import com.google.common.base.Joiner.MapJoiner;
     23 
     24 import java.io.IOException;
     25 import java.io.InputStream;
     26 import java.io.StringWriter;
     27 import java.util.ArrayList;
     28 import java.util.Collection;
     29 import java.util.Collections;
     30 import java.util.HashMap;
     31 import java.util.HashSet;
     32 import java.util.Iterator;
     33 import java.util.LinkedHashMap;
     34 import java.util.LinkedList;
     35 import java.util.List;
     36 import java.util.Map;
     37 import java.util.Set;
     38 import java.util.StringTokenizer;
     39 import java.util.regex.Matcher;
     40 import java.util.regex.Pattern;
     41 
     42 /**
     43  * Static utility methods and constants pertaining to {@code String} or {@code
     44  * CharSequence} instances.
     45  */
     46 public final class StringUtil {
  /** Private constructor: this utility class is not meant to be instantiated. */
  private StringUtil() {} // COV_NF_LINE
     48 
  /**
   * A completely arbitrary selection of eight whitespace characters. See
   * <a href="http://go/white+space">this spreadsheet</a> for more details
   * about whitespace characters.
   *
   * <p>In order, the eight characters are: space, carriage return, line feed,
   * tab, ideographic space (U+3000), no-break space (U+00A0), figure space
   * (U+2007) and narrow no-break space (U+202F).
   *
   * @deprecated Rewrite your code to use {@link CharMatcher#WHITESPACE}, or
   *     consider the precise set of characters you want to match and construct
   *     the right explicit {@link CharMatcher} or {@link String} for your own
   *     purposes.
   */
  @Deprecated
  public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F";

  /**
   * A string containing the carriage return and linefeed characters, in that
   * order.
   */
  public static final String LINE_BREAKS = "\r\n";
     64 
  /**
   * Old location of {@link Strings#isNullOrEmpty}; this method will be
   * deprecated soon. It forwards its argument to that method and returns the
   * result unchanged.
   *
   * @param string the string reference to check
   * @return the value of {@code Strings.isNullOrEmpty(string)}
   */
  public static boolean isEmpty(String string) {
    return Strings.isNullOrEmpty(string);
  }
     72 
     73   /**
     74    * Returns {@code true} if the given string is null, empty, or comprises only
     75    * whitespace characters, as defined by {@link CharMatcher#WHITESPACE}.
     76    *
     77    * <p><b>Warning:</b> there are many competing definitions of "whitespace";
     78    * please see <a href="http://go/white+space">this spreadsheet</a> for
     79    * details.
     80    *
     81    * @param string the string reference to check
     82    * @return {@code true} if {@code string} is null, empty, or consists of
     83    *     whitespace characters only
     84    */
     85   public static boolean isEmptyOrWhitespace(String string) {
     86     return string == null || CharMatcher.WHITESPACE.matchesAllOf(string);
     87   }
     88 
  /**
   * Old location of {@link Strings#nullToEmpty}; this method will be
   * deprecated soon. It forwards its argument to that method and returns the
   * result unchanged.
   *
   * @param string the string reference to convert
   * @return the value of {@code Strings.nullToEmpty(string)}
   */
  public static String makeSafe(String string) {
    return Strings.nullToEmpty(string);
  }
     96 
  /**
   * Old location of {@link Strings#emptyToNull}; this method will be
   * deprecated soon. It forwards its argument to that method and returns the
   * result unchanged.
   *
   * @param string the string reference to convert
   * @return the value of {@code Strings.emptyToNull(string)}
   */
  public static String toNullIfEmpty(String string) {
    return Strings.emptyToNull(string);
  }
    104 
    105   /**
    106    * Returns the given string if it is nonempty and contains at least one
    107    * non-whitespace character; {@code null} otherwise. See comment in {@link
    108    * #isEmptyOrWhitespace} on the definition of whitespace.
    109    *
    110    * @param string the string to test and possibly return
    111    * @return {@code null} if {@code string} is null, empty, or contains only
    112    *     whitespace characters; {@code string} itself otherwise
    113    */
    114   public static String toNullIfEmptyOrWhitespace(
    115       String string) {
    116     return isEmptyOrWhitespace(string) ? null : string;
    117   }
    118 
  /**
   * Old location of {@link Strings#repeat}; this method will be deprecated
   * soon. It forwards both arguments to that method and returns the result
   * unchanged.
   *
   * @param string the string to repeat
   * @param count the repetition count handed to {@code Strings.repeat}
   * @return the value of {@code Strings.repeat(string, count)}
   */
  public static String repeat(String string, int count) {
    return Strings.repeat(string, count);
  }
    126 
    127   /**
    128    * Return the first index in the string of any of the specified characters,
    129    * starting at a given index, or {@code -1} if none of the characters is
    130    * present.
    131    *
    132    * @param string the non-null character sequence to look in
    133    * @param chars a non-null character sequence containing the set of characters
    134    *     to look for. If empty, this method will find no matches and return
    135    *     {@code -1}
    136    * @param fromIndex the index of the first character to examine in the input
    137    *     string. If negative, the entire string will be searched. If greater
    138    *     than or equal to the string length, no characters will be searched and
    139    *     {@code -1} will be returned.
    140    * @return the index of the first match, or {@code -1} if no match was found.
    141    *     Guaranteed to be either {@code -1} or a number greater than or equal to
    142    *     {@code fromIndex}
    143    * @throws NullPointerException if any argument is null
    144    */
    145   // author: pault
    146   public static int indexOfChars(
    147       CharSequence string, CharSequence chars, int fromIndex) {
    148     if (fromIndex >= string.length()) {
    149       return -1;
    150     }
    151 
    152     /*
    153      * Prepare lookup structures for the characters. TODO(pault): This loop
    154      * could be factored into another method to allow caching of the resulting
    155      * struct if a use-case of very large character sets exists.
    156      */
    157     Set<Character> charSet = Collections.emptySet();
    158     boolean[] charArray = new boolean[128];
    159     for (int i = 0; i < chars.length(); i++) {
    160       char c = chars.charAt(i);
    161       if (c < 128) {
    162         charArray[c] = true;
    163       } else {
    164         if (charSet.isEmpty()) {
    165           charSet = new HashSet<Character>();
    166         }
    167         charSet.add(c);
    168       }
    169     }
    170 
    171     // Scan the string for matches
    172     for (int i = Math.max(fromIndex, 0); i < string.length(); i++) {
    173       char c = string.charAt(i);
    174       if (c < 128) {
    175         if (charArray[c]) {
    176           return i;
    177         }
    178       } else if (charSet.contains(c)) {
    179         return i;
    180       }
    181     }
    182     return -1;
    183   }
    184 
    185 /*
    186  * -------------------------------------------------------------------
    187  * This marks the end of the code that has been written or rewritten
    188  * in 2008 to the quality standards of the Java core libraries group.
    189  * Code below this point is still awaiting cleanup (you can help!).
    190  * See http://wiki/Nonconf/JavaCoreLibrariesStandards.
    191  * -------------------------------------------------------------------
    192  */
    193 
    194 
  /**
   * Splits a string on any of the given delimiter characters without trimming
   * the resulting tokens. Equivalent to calling
   * {@link #split(String, String, boolean)} with {@code trimTokens} set to
   * {@code false}.
   *
   * @param str the string to split.  Must not be null.
   * @param delims the delimiter characters. Each character in the
   *        string is individually treated as a delimiter.
   * @return an array of tokens. Will not return null. Individual tokens
   *        do not have leading/trailing whitespace removed.
   * @deprecated see the detailed instructions under
   *     {@link #split(String, String, boolean)}
   */
  @Deprecated
  public static String[] split(String str, String delims) {
    return split(str, delims, false);
  }
    208 
    209   /**
    210    * This method is deprecated because it is too inflexible, providing
    211    * only a very specific set of behaviors that almost never matches exactly
    212    * what you intend. Prefer using a {@link Splitter}, which is more flexible
    213    * and consistent in the way it handles trimming and empty tokens.
    214    *
    215    * <ul>
    216    * <li>Create a {@link Splitter} using {@link Splitter#on(CharMatcher)} such
    217    *     as {@code Splitter.on(CharMatcher.anyOf(delims))}.
    218    * <li><i>If</i> you need whitespace trimmed from the ends of each segment,
    219    *     adding {@code .trimResults()} to your splitter definition should work
    220    *     in most cases. To match the exact behavior of this method, use
    221    *     {@code .trimResults(CharMatcher.inRange('\0', ' '))}.
    222    * <li>This method silently ignores empty tokens in the input, but allows
    223    *     empty tokens to appear in the output if {@code trimTokens} is
    224    *     {@code true}. Adding {@code .omitEmptyStrings()} to your splitter
    225    *     definition will filter empty tokens out but will do so <i>after</i>
    226    *     having performed trimming. If you absolutely require this method's
    227    *     behavior in this respect, Splitter is not able to match it.
    228    * <li>If you need the result as an array, use {@link
    229    *     com.google.common.collect.Iterables#toArray(Iterable, Class)} on the
    230    *     {@code Iterable<String>} returned by {@link Splitter#split}.
    231    * </ul>
    232    *
    233    * @param str the string to split.  Must not be null.
    234    * @param delims the delimiter characters. Each character in the string
    235    *        is individually treated as a delimiter.
    236    * @param trimTokens if true, leading/trailing whitespace is removed
    237    *        from the tokens
    238    * @return an array of tokens. Will not return null.
    239    * @deprecated
    240    */
    241   @Deprecated
    242   public static String[] split(
    243       String str, String delims, boolean trimTokens) {
    244     StringTokenizer tokenizer = new StringTokenizer(str, delims);
    245     int n = tokenizer.countTokens();
    246     String[] list = new String[n];
    247     for (int i = 0; i < n; i++) {
    248       if (trimTokens) {
    249         list[i] = tokenizer.nextToken().trim();
    250       } else {
    251         list[i] = tokenizer.nextToken();
    252       }
    253     }
    254     return list;
    255   }
    256 
  /**
   * Trim whitespace characters from only the beginning of a string.
   * This is a convenience method, it simply calls trimStart(s, null), i.e.
   * no characters other than whitespace are trimmed.
   *
   * @param s String to be trimmed
   * @return String with whitespace characters removed from the beginning
   */
  public static String trimStart(String s) {
    return trimStart(s, null);
  }
    267 
    268   /**
    269    * Trim characters from only the beginning of a string.
    270    * This method will remove all whitespace characters
    271    * (defined by Character.isWhitespace(char), in addition to the characters
    272    * provided, from the end of the provided string.
    273    *
    274    * @param s String to be trimmed
    275    * @param extraChars Characters in addition to whitespace characters that
    276    *                   should be trimmed.  May be null.
    277    * @return String with whitespace and characters in extraChars removed
    278    *                   from the beginning
    279    */
    280   public static String trimStart(String s, String extraChars) {
    281     int trimCount = 0;
    282     while (trimCount < s.length()) {
    283       char ch = s.charAt(trimCount);
    284       if (Character.isWhitespace(ch)
    285         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
    286         trimCount++;
    287       } else {
    288         break;
    289       }
    290     }
    291 
    292     if (trimCount == 0) {
    293       return s;
    294     }
    295     return s.substring(trimCount);
    296   }
    297 
  /**
   * Trim whitespace characters from only the end of a string.
   * This is a convenience method, it simply calls trimEnd(s, null), i.e.
   * no characters other than whitespace are trimmed.
   *
   * @param s String to be trimmed
   * @return String with whitespace characters removed from the end
   */
  public static String trimEnd(String s) {
    return trimEnd(s, null);
  }
    308 
    309   /**
    310    * Trim characters from only the end of a string.
    311    * This method will remove all whitespace characters
    312    * (defined by Character.isWhitespace(char), in addition to the characters
    313    * provided, from the end of the provided string.
    314    *
    315    * @param s String to be trimmed
    316    * @param extraChars Characters in addition to whitespace characters that
    317    *                   should be trimmed.  May be null.
    318    * @return String with whitespace and characters in extraChars removed
    319    *                   from the end
    320    */
    321   public static String trimEnd(String s, String extraChars) {
    322     int trimCount = 0;
    323     while (trimCount < s.length()) {
    324       char ch = s.charAt(s.length() - trimCount - 1);
    325       if (Character.isWhitespace(ch)
    326         || (extraChars != null && extraChars.indexOf(ch) >= 0)) {
    327         trimCount++;
    328       } else {
    329         break;
    330       }
    331     }
    332 
    333     if (trimCount == 0) {
    334       return s;
    335     }
    336     return s.substring(0, s.length() - trimCount);
    337   }
    338 
  /**
   * Splits a string on any of the given delimiter characters and trims each
   * resulting token. Equivalent to calling
   * {@link #split(String, String, boolean)} with {@code trimTokens} set to
   * {@code true}.
   *
   * @param str the string to split.  Must not be null.
   * @param delims the delimiter characters. Each character in the
   *        string is individually treated as a delimiter.
   * @return an array of tokens. Will not return null. Leading/trailing
   *        whitespace is removed from the tokens.
   * @deprecated see the detailed instructions under
   *     {@link #split(String, String, boolean)}
   */
  @Deprecated
  public static String[] splitAndTrim(String str, String delims) {
    return split(str, delims, true);
  }
    352 
    353   /** Parse comma-separated list of ints and return as array. */
    354   public static int[] splitInts(String str) throws IllegalArgumentException {
    355     StringTokenizer tokenizer = new StringTokenizer(str, ",");
    356     int n = tokenizer.countTokens();
    357     int[] list = new int[n];
    358     for (int i = 0; i < n; i++) {
    359       String token = tokenizer.nextToken();
    360       list[i] = Integer.parseInt(token);
    361     }
    362     return list;
    363   }
    364 
    365   /** Parse comma-separated list of longs and return as array. */
    366   public static long[] splitLongs(String str) throws IllegalArgumentException {
    367     StringTokenizer tokenizer = new StringTokenizer(str, ",");
    368     int n = tokenizer.countTokens();
    369     long[] list = new long[n];
    370     for (int i = 0; i < n; i++) {
    371       String token = tokenizer.nextToken();
    372       list[i] = Long.parseLong(token);
    373     }
    374     return list;
    375   }
    376 
  /**
   * Replaces every occurrence of {@code what} in {@code str} with
   * {@code with}, by delegating to
   * {@link String#replace(CharSequence, CharSequence)} after rejecting an
   * empty {@code what}.
   *
   * @param str the string to process
   * @param what to replace; must not be empty (enforced through
   *     {@code checkArgument}, which presumably throws
   *     {@link IllegalArgumentException} - confirm against
   *     {@code Preconditions})
   * @param with replace with this
   * @return String str where 'what' was replaced with 'with'
   *
   * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}.
   */
  @Deprecated
  public static String replace(
      String str, CharSequence what, CharSequence with) {
    // Have to check this argument, for compatibility with the old impl.
    // For the record, String.replace() is capable of handling an empty target
    // string... but it does something kind of weird in that case.
    checkArgument(what.length() > 0);
    return str.replace(what, with);
  }
    395 
    396   private static final Splitter NEWLINE_SPLITTER =
    397       Splitter.on('\n').omitEmptyStrings();
    398 
    399   /**
    400    * Reformats the given string to a fixed width by inserting carriage returns
    401    * and trimming unnecessary whitespace. See
    402    * {@link #fixedWidth(String[], int)} for details. The {@code str} argument
    403    * to this method will be split on newline characters ({@code '\n'}) only
    404    * (regardless of platform).  An array of resulting non-empty strings is
    405    * then passed to {@link #fixedWidth(String[], int)} as the {@code lines}
    406    * parameter.
    407    *
    408    * @param str the string to format
    409    * @param width the fixed width (in characters)
    410    */
    411   public static String fixedWidth(String str, int width) {
    412     List<String> lines = new ArrayList<String>();
    413 
    414     for (String line : NEWLINE_SPLITTER.split(str)) {
    415       lines.add(line);
    416     }
    417 
    418     String[] lineArray = lines.toArray(new String[0]);
    419     return fixedWidth(lineArray, width);
    420   }
    421 
    422   /**
    423    * Reformats the given array of lines to a fixed width by inserting
    424    * newlines and trimming unnecessary whitespace.  This uses simple
    425    * whitespace-based splitting, not sophisticated internationalized
    426    * line breaking.  Newlines within a line are treated like any other
    427    * whitespace.  Lines which are already short enough will be passed
    428    * through unmodified.
    429    *
    430    * <p>Only breaking whitespace characters (those which match
    431    * {@link CharMatcher#BREAKING_WHITESPACE}) are treated as whitespace by
    432    * this method. Non-breaking whitespace characters will be considered as
    433    * ordinary characters which are connected to any other adjacent
    434    * non-whitespace characters, and will therefore appear in the returned
    435    * string in their original context.
    436    *
    437    * @param lines array of lines to format
    438    * @param width the fixed width (in characters)
    439    */
    440   public static String fixedWidth(String[] lines, int width) {
    441     List<String> formattedLines = new ArrayList<String>();
    442 
    443     for (String line : lines) {
    444       formattedLines.add(formatLineToFixedWidth(line, width));
    445     }
    446 
    447     return Joiner.on('\n').join(formattedLines);
    448   }
    449 
    450   private static final Splitter TO_WORDS =
    451       Splitter.on(CharMatcher.BREAKING_WHITESPACE).omitEmptyStrings();
    452 
    453   /**
    454    * Helper method for {@link #fixedWidth(String[], int)}
    455    */
    456   private static String formatLineToFixedWidth(String line, int width) {
    457     if (line.length() <= width) {
    458       return line;
    459     }
    460 
    461     StringBuilder builder = new StringBuilder();
    462     int col = 0;
    463 
    464     for (String word : TO_WORDS.split(line)) {
    465       if (col == 0) {
    466         col = word.length();
    467       } else {
    468         int newCol = col + word.length() + 1;  // +1 for the space
    469 
    470         if (newCol <= width) {
    471           builder.append(' ');
    472           col = newCol;
    473         } else {
    474           builder.append('\n');
    475           col = word.length();
    476         }
    477       }
    478 
    479       builder.append(word);
    480     }
    481 
    482     return builder.toString();
    483   }
    484 
  /**
   * Splits the argument original into a list of substrings.  All the
   * substrings in the returned list (except possibly the last) will
   * have length lineLen. The pieces are produced by
   * {@code Splitter.fixedLength(lineLen)} and copied into a new list.
   *
   * @param lineLen  the length of the substrings to put in the list
   * @param original the original string
   *
   * @return a list of strings of length lineLen that together make up the
   *     original string
   * @deprecated use {@code Splitter.fixedLength(lineLen).split(original)}
   *     (note that it returns an {@code Iterable}, not a {@code List})
   */
  @Deprecated
  public static List<String> fixedSplit(String original, int lineLen) {
    List<String> output = new ArrayList<String>();
    for (String elem : Splitter.fixedLength(lineLen).split(original)) {
      output.add(elem);
    }
    return output;
  }
    506 
    507   /**
    508    * Indents the given String per line.
    509    * @param iString the string to indent
    510    * @param iIndentDepth the depth of the indentation
    511    * @return the indented string
    512    */
    513   public static String indent(String iString, int iIndentDepth) {
    514     StringBuilder spacer = new StringBuilder();
    515     spacer.append("\n");
    516     for (int i = 0; i < iIndentDepth; i++) {
    517       spacer.append("  ");
    518     }
    519     return iString.replace("\n", spacer.toString());
    520   }
    521 
    522   /**
    523    * This is a both way strip.
    524    *
    525    * @param str the string to strip
    526    * @param left strip from left
    527    * @param right strip from right
    528    * @param what character(s) to strip
    529    * @return the stripped string
    530    * @deprecated ensure the string is not null and use
    531    *  <ul>
    532    *    <li> {@code CharMatcher.anyOf(what).trimFrom(str)}
    533    *        if {@code left == true} and {@code right == true}
    534    *    <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)}
    535    *        if {@code left == true} and {@code right == false}
    536    *    <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)}
    537    *        if {@code left == false} and {@code right == true}
    538    *  </ul>
    539    */
    540   @Deprecated
    541   public static String megastrip(String str,
    542                                  boolean left, boolean right,
    543                                  String what) {
    544     if (str == null) {
    545       return null;
    546     }
    547 
    548     CharMatcher matcher = CharMatcher.anyOf(what);
    549     if (left) {
    550       if (right) {
    551         return matcher.trimFrom(str);
    552       }
    553       return matcher.trimLeadingFrom(str);
    554     }
    555     if (right) {
    556       return matcher.trimTrailingFrom(str);
    557     }
    558     return str;
    559   }
    560 
    561   /** strip - strips both ways
    562    *
    563    * @param str what to strip
    564    * @return String the striped string
    565    * @deprecated ensure the string is not null and use {@code
    566    *     CharMatcher.LEGACY_WHITESPACE.trimFrom(str)}; also consider whether you
    567    *     really want the legacy whitespace definition, or something more
    568    *     standard like {@link CharMatcher#WHITESPACE}.
    569    */
    570   @SuppressWarnings("deprecation") // this is deprecated itself
    571   @Deprecated public static String strip(String str) {
    572     return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimFrom(str);
    573   }
    574 
    575   /** Strip white spaces from both end, and collapse white spaces
    576    * in the middle.
    577    *
    578    * @param str what to strip
    579    * @return String the striped and collapsed string
    580    * @deprecated ensure the string is not null and use {@code
    581    *     CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ')}; also
    582    *     consider whether you really want the legacy whitespace definition, or
    583    *     something more standard like {@link CharMatcher#WHITESPACE}.
    584    */
    585   @SuppressWarnings("deprecation") // this is deprecated itself
    586   @Deprecated public static String stripAndCollapse(String str) {
    587     return (str == null) ? null
    588         : CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(str, ' ');
    589   }
    590 
    591   /**
    592    * Give me a string and a potential prefix, and I return the string
    593    * following the prefix if the prefix matches, else null.
    594    * Analogous to the c++ functions strprefix and var_strprefix.
    595    *
    596    * @param str the string to strip
    597    * @param prefix the expected prefix
    598    * @return the stripped string or <code>null</code> if the string
    599    * does not start with the prefix
    600    */
    601   public static String stripPrefix(String str, String prefix) {
    602     return str.startsWith(prefix)
    603         ? str.substring(prefix.length())
    604         : null;
    605   }
    606 
    607   /**
    608    * Case insensitive version of stripPrefix. Strings are compared in
    609    * the same way as in {@link String#equalsIgnoreCase}.
    610    * Analogous to the c++ functions strcaseprefix and var_strcaseprefix.
    611    *
    612    * @param str the string to strip
    613    * @param prefix the expected prefix
    614    * @return the stripped string or <code>null</code> if the string
    615    * does not start with the prefix
    616    */
    617   public static String stripPrefixIgnoreCase(String str, String prefix) {
    618     return startsWithIgnoreCase(str, prefix)
    619         ? str.substring(prefix.length())
    620         : null;
    621   }
    622 
    623   /**
    624    * Give me a string and a potential suffix, and I return the string
    625    * before the suffix if the suffix matches, else null.
    626    * Analogous to the c++ function strsuffix.
    627    *
    628    * @param str the string to strip
    629    * @param suffix the expected suffix
    630    * @return the stripped string or <code>null</code> if the string
    631    * does not end with the suffix
    632    */
    633   public static String stripSuffix(String str, String suffix) {
    634     return str.endsWith(suffix)
    635         ? str.substring(0, str.length() - suffix.length())
    636         : null;
    637   }
    638 
    639   /**
    640    * Case insensitive version of stripSuffix. Strings are compared in
    641    * the same way as in {@link String#equalsIgnoreCase}.
    642    * Analogous to the c++ function strcasesuffix.
    643    *
    644    * @param str the string to strip
    645    * @param suffix the expected suffix
    646    * @return the stripped string or <code>null</code> if the string
    647    * does not end with the suffix
    648    */
    649   public static String stripSuffixIgnoreCase(
    650       String str, String suffix) {
    651     return endsWithIgnoreCase(str, suffix)
    652         ? str.substring(0, str.length() - suffix.length())
    653         : null;
    654   }
    655 
  /**
   * Strips all non-digit characters from a string, by retaining only the
   * characters matched by {@code CharMatcher.JAVA_DIGIT}.
   *
   * The resulting string will only contain characters for which isDigit()
   * returns true.
   *
   * @param str the string to strip
   * @return a string consisting of digits only, or an empty string
   * @deprecated use {@code CharMatcher.JAVA_DIGIT.retainFrom(str)} (also
   *     consider whether this is really the definition of "digit" you wish to
   *     use)
   */
  @Deprecated public static String stripNonDigits(String str) {
    return CharMatcher.JAVA_DIGIT.retainFrom(str);
  }
    671 
    672   /**
    673    * Finds the last index in str of a character not in the characters
    674    * in 'chars' (similar to ANSI string.find_last_not_of).
    675    *
    676    * Returns -1 if no such character can be found.
    677    *
    678    * <p><b>Note:</b> If {@code fromIndex} is zero, use {@link CharMatcher}
    679    * instead for this: {@code CharMatcher.noneOf(chars).lastIndexIn(str)}.
    680    */
    681   // TODO(kevinb): after adding fromIndex versions of (last)IndexOf to
    682   // CharMatcher, deprecate this
    683   public static int lastIndexNotOf(String str, String chars, int fromIndex) {
    684     fromIndex = Math.min(fromIndex, str.length() - 1);
    685 
    686     for (int pos = fromIndex; pos >= 0; pos--) {
    687       if (chars.indexOf(str.charAt(pos)) < 0) {
    688         return pos;
    689       }
    690     }
    691 
    692     return -1;
    693   }
    694 
  /**
   * Like String.replace() except that it accepts any number of old chars.
   * Replaces any occurrences of 'oldchars' in 'str' with 'newchar'.
   * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello  world "
   *
   * @param str the string to process
   * @param oldchars the characters to replace; each one is matched
   *     individually
   * @param newchar the replacement character
   * @return the value of
   *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
   * @deprecated use {@code CharMatcher#replaceFrom(String, char)}, for example
   *     {@code CharMatcher.anyOf(oldchars).replaceFrom(str, newchar)}
   */
  @Deprecated public static String replaceChars(
      String str, CharSequence oldchars, char newchar) {
    return CharMatcher.anyOf(oldchars).replaceFrom(str, newchar);
  }
    707 
  /**
   * Remove any occurrences of 'oldchars' in 'str'.
   * Example: removeChars("Hello, world!", ",!") returns "Hello world"
   *
   * @param str the string to process
   * @param oldchars the characters to remove; each one is matched
   *     individually
   * @return the value of {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
   * @deprecated use {@link CharMatcher#removeFrom(CharSequence)}, for example
   *     {@code CharMatcher.anyOf(oldchars).removeFrom(str)}
   */
  @Deprecated public static String removeChars(
      String str, CharSequence oldchars) {
    return CharMatcher.anyOf(oldchars).removeFrom(str);
  }
    719 
    720   // See http://www.microsoft.com/typography/unicode/1252.htm
    721   private static final CharMatcher FANCY_SINGLE_QUOTE
    722       = CharMatcher.anyOf("\u0091\u0092\u2018\u2019");
    723   private static final CharMatcher FANCY_DOUBLE_QUOTE
    724       = CharMatcher.anyOf("\u0093\u0094\u201c\u201d");
    725 
    726   /**
    727    * Replaces microsoft "smart quotes" (curly " and ') with their
    728    * ascii counterparts.
    729    */
    730   public static String replaceSmartQuotes(String str) {
    731     String tmp = FANCY_SINGLE_QUOTE.replaceFrom(str, '\'');
    732     return FANCY_DOUBLE_QUOTE.replaceFrom(tmp, '"');
    733   }
    734 
    735   /**
    736    * Convert a string of hex digits to a byte array, with the first
    737    * byte in the array being the MSB. The string passed in should be
    738    * just the raw digits (upper or lower case), with no leading
    739    * or trailing characters (like '0x' or 'h').
    740    * An odd number of characters is supported.
    741    * If the string is empty, an empty array will be returned.
    742    *
    743    * This is significantly faster than using
    744    *   new BigInteger(str, 16).toByteArray();
    745    * especially with larger strings. Here are the results of some
    746    * microbenchmarks done on a P4 2.8GHz 2GB RAM running
    747    * linux 2.4.22-gg11 and JDK 1.5 with an optimized build:
    748    *
    749    * String length        hexToBytes (usec)   BigInteger
    750    * -----------------------------------------------------
    751    * 16                       0.570                 1.43
    752    * 256                      8.21                 44.4
    753    * 1024                    32.8                 526
    754    * 16384                  546                121000
    755    */
    756   public static byte[] hexToBytes(CharSequence str) {
    757     byte[] bytes = new byte[(str.length() + 1) / 2];
    758     if (str.length() == 0) {
    759       return bytes;
    760     }
    761     bytes[0] = 0;
    762     int nibbleIdx = (str.length() % 2);
    763     for (int i = 0; i < str.length(); i++) {
    764       char c = str.charAt(i);
    765       if (!isHex(c)) {
    766         throw new IllegalArgumentException("string contains non-hex chars");
    767       }
    768       if ((nibbleIdx % 2) == 0) {
    769         bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4);
    770       } else {
    771         bytes[nibbleIdx >> 1] += (byte) hexValue(c);
    772       }
    773       nibbleIdx++;
    774     }
    775     return bytes;
    776   }
    777 
    778   /**
    779    * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed).
    780    */
    781   public static String convertEOLToLF(String input) {
    782     StringBuilder res = new StringBuilder(input.length());
    783     char[] s = input.toCharArray();
    784     int from = 0;
    785     final int end = s.length;
    786     for (int i = 0; i < end; i++) {
    787       if (s[i] == '\r') {
    788         res.append(s, from, i - from);
    789         res.append('\n');
    790         if (i + 1 < end && s[i + 1] == '\n') {
    791           i++;
    792         }
    793 
    794         from = i + 1;
    795       }
    796     }
    797 
    798     if (from == 0) {   // no \r!
    799       return input;
    800     }
    801 
    802     res.append(s, from, end - from);
    803     return res.toString();
    804   }
    805 
    806   /**
    807    * Old location of {@link Strings#padStart}; this method will be deprecated
    808    * soon.
    809    */
    810   public static String padLeft(String s, int len, char padChar) {
    811     return Strings.padStart(s, len, padChar);
    812   }
    813 
    814   /**
    815    * Old location of {@link Strings#padEnd}; this method will be deprecated
    816    * soon.
    817    */
    818   public static String padRight(String s, int len, char padChar) {
    819     return Strings.padEnd(s, len, padChar);
    820   }
    821 
    822   /**
    823    * Returns a string consisting of "s", with each of the first "len" characters
    824    * replaced by "maskChar" character.
    825    */
    826   public static String maskLeft(String s, int len, char maskChar) {
    827     if (len <= 0) {
    828       return s;
    829     }
    830     len = Math.min(len, s.length());
    831     StringBuilder sb = new StringBuilder();
    832     for (int i = 0; i < len; i++) {
    833       sb.append(maskChar);
    834     }
    835     sb.append(s.substring(len));
    836     return sb.toString();
    837   }
    838 
    839   private static boolean isOctal(char c) {
    840     return (c >= '0') && (c <= '7');
    841   }
    842 
    843   private static boolean isHex(char c) {
    844     return ((c >= '0') && (c <= '9')) ||
    845            ((c >= 'a') && (c <= 'f')) ||
    846            ((c >= 'A') && (c <= 'F'));
    847   }
    848 
    849   private static int hexValue(char c) {
    850     if ((c >= '0') && (c <= '9')) {
    851       return (c - '0');
    852     } else if ((c >= 'a') && (c <= 'f')) {
    853       return (c - 'a') + 10;
    854     } else {
    855       return (c - 'A') + 10;
    856     }
    857   }
    858 
    859   /**
    860    * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the
    861    * resulting string.
    862    */
    863   public static String unescapeCString(String s) {
    864     if (s.indexOf('\\') < 0) {
    865       // Fast path: nothing to unescape
    866       return s;
    867     }
    868 
    869     StringBuilder sb = new StringBuilder();
    870     int len = s.length();
    871     for (int i = 0; i < len;) {
    872       char c = s.charAt(i++);
    873       if (c == '\\' && (i < len)) {
    874         c = s.charAt(i++);
    875         switch (c) {
    876           case 'a':  c = '\007';  break;
    877           case 'b':  c = '\b';    break;
    878           case 'f':  c = '\f';    break;
    879           case 'n':  c = '\n';    break;
    880           case 'r':  c = '\r';    break;
    881           case 't':  c = '\t';    break;
    882           case 'v':  c = '\013';  break;
    883           case '\\': c = '\\';    break;
    884           case '?':  c = '?';     break;
    885           case '\'': c = '\'';    break;
    886           case '"':  c = '\"';    break;
    887 
    888           default: {
    889             if ((c == 'x') && (i < len) && isHex(s.charAt(i))) {
    890               // "\xXX"
    891               int v = hexValue(s.charAt(i++));
    892               if ((i < len) && isHex(s.charAt(i))) {
    893                 v = v * 16 + hexValue(s.charAt(i++));
    894               }
    895               c = (char) v;
    896             } else if (isOctal(c)) {
    897               // "\OOO"
    898               int v = (c - '0');
    899               if ((i < len) && isOctal(s.charAt(i))) {
    900                 v = v * 8 + (s.charAt(i++) - '0');
    901               }
    902               if ((i < len) && isOctal(s.charAt(i))) {
    903                 v = v * 8 + (s.charAt(i++) - '0');
    904               }
    905               c = (char) v;
    906             } else {
    907               // Propagate unknown escape sequences.
    908               sb.append('\\');
    909             }
    910             break;
    911           }
    912         }
    913       }
    914       sb.append(c);
    915     }
    916     return sb.toString();
    917   }
    918 
    919   /**
    920    * Unescape any MySQL escape sequences.
    921    * See MySQL language reference Chapter 6 at
    922    * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>.
    923    * This function will <strong>not</strong> work for other SQL-like
    924    * dialects.
    925    * @param s string to unescape, with the surrounding quotes.
    926    * @return unescaped string, without the surrounding quotes.
    927    * @exception IllegalArgumentException if s is not a valid MySQL string.
    928    */
    929   public static String unescapeMySQLString(String s)
    930       throws IllegalArgumentException {
    931     // note: the same buffer is used for both reading and writing
    932     // it works because the writer can never outrun the reader
    933     char chars[] = s.toCharArray();
    934 
    935     // the string must be quoted 'like this' or "like this"
    936     if (chars.length < 2 || chars[0] != chars[chars.length - 1] ||
    937         (chars[0] != '\'' && chars[0] != '"')) {
    938       throw new IllegalArgumentException("not a valid MySQL string: " + s);
    939     }
    940 
    941     // parse the string and decode the backslash sequences; in addition,
    942     // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "'
    943     int j = 1;  // write position in the string (never exceeds read position)
    944     int f = 0;  // state: 0 (normal), 1 (backslash), 2 (quote)
    945     for (int i = 1; i < chars.length - 1; i++) {
    946       if (f == 0) {             // previous character was normal
    947         if (chars[i] == '\\') {
    948           f = 1;  // backslash
    949         } else if (chars[i] == chars[0]) {
    950           f = 2;  // quoting character
    951         } else {
    952           chars[j++] = chars[i];
    953         }
    954       } else if (f == 1) {      // previous character was a backslash
    955         switch (chars[i]) {
    956           case '0':   chars[j++] = '\0';   break;
    957           case '\'':  chars[j++] = '\'';   break;
    958           case '"':   chars[j++] = '"';    break;
    959           case 'b':   chars[j++] = '\b';   break;
    960           case 'n':   chars[j++] = '\n';   break;
    961           case 'r':   chars[j++] = '\r';   break;
    962           case 't':   chars[j++] = '\t';   break;
    963           case 'z':   chars[j++] = '\032'; break;
    964           case '\\':  chars[j++] = '\\';   break;
    965           default:
    966             // if the character is not special, backslash disappears
    967             chars[j++] = chars[i];
    968             break;
    969         }
    970         f = 0;
    971       } else {                  // previous character was a quote
    972         // quoting characters must be doubled inside a string
    973         if (chars[i] != chars[0]) {
    974           throw new IllegalArgumentException("not a valid MySQL string: " + s);
    975         }
    976         chars[j++] = chars[0];
    977         f = 0;
    978       }
    979     }
    980     // string contents cannot end with a special character
    981     if (f != 0) {
    982       throw new IllegalArgumentException("not a valid MySQL string: " + s);
    983     }
    984 
    985     // done
    986     return new String(chars, 1, j - 1);
    987   }
    988 
    989   // TODO(pbarry): move all HTML methods to common.html package
    990 
    991   static final Map<String, Character> ESCAPE_STRINGS;
    992   static final Set<Character> HEX_LETTERS;
    993 
    994   static {
    995     // HTML character entity references as defined in HTML 4
    996     // see http://www.w3.org/TR/REC-html40/sgml/entities.html
    997     ESCAPE_STRINGS = new HashMap<String, Character>(252);
    998 
    999     ESCAPE_STRINGS.put("&nbsp", '\u00A0');
   1000     ESCAPE_STRINGS.put("&iexcl", '\u00A1');
   1001     ESCAPE_STRINGS.put("&cent", '\u00A2');
   1002     ESCAPE_STRINGS.put("&pound", '\u00A3');
   1003     ESCAPE_STRINGS.put("&curren", '\u00A4');
   1004     ESCAPE_STRINGS.put("&yen", '\u00A5');
   1005     ESCAPE_STRINGS.put("&brvbar", '\u00A6');
   1006     ESCAPE_STRINGS.put("&sect", '\u00A7');
   1007     ESCAPE_STRINGS.put("&uml", '\u00A8');
   1008     ESCAPE_STRINGS.put("&copy", '\u00A9');
   1009     ESCAPE_STRINGS.put("&ordf", '\u00AA');
   1010     ESCAPE_STRINGS.put("&laquo", '\u00AB');
   1011     ESCAPE_STRINGS.put("&not", '\u00AC');
   1012     ESCAPE_STRINGS.put("&shy", '\u00AD');
   1013     ESCAPE_STRINGS.put("&reg", '\u00AE');
   1014     ESCAPE_STRINGS.put("&macr", '\u00AF');
   1015     ESCAPE_STRINGS.put("&deg", '\u00B0');
   1016     ESCAPE_STRINGS.put("&plusmn", '\u00B1');
   1017     ESCAPE_STRINGS.put("&sup2", '\u00B2');
   1018     ESCAPE_STRINGS.put("&sup3", '\u00B3');
   1019     ESCAPE_STRINGS.put("&acute", '\u00B4');
   1020     ESCAPE_STRINGS.put("&micro", '\u00B5');
   1021     ESCAPE_STRINGS.put("&para", '\u00B6');
   1022     ESCAPE_STRINGS.put("&middot", '\u00B7');
   1023     ESCAPE_STRINGS.put("&cedil", '\u00B8');
   1024     ESCAPE_STRINGS.put("&sup1", '\u00B9');
   1025     ESCAPE_STRINGS.put("&ordm", '\u00BA');
   1026     ESCAPE_STRINGS.put("&raquo", '\u00BB');
   1027     ESCAPE_STRINGS.put("&frac14", '\u00BC');
   1028     ESCAPE_STRINGS.put("&frac12", '\u00BD');
   1029     ESCAPE_STRINGS.put("&frac34", '\u00BE');
   1030     ESCAPE_STRINGS.put("&iquest", '\u00BF');
   1031     ESCAPE_STRINGS.put("&Agrave", '\u00C0');
   1032     ESCAPE_STRINGS.put("&Aacute", '\u00C1');
   1033     ESCAPE_STRINGS.put("&Acirc", '\u00C2');
   1034     ESCAPE_STRINGS.put("&Atilde", '\u00C3');
   1035     ESCAPE_STRINGS.put("&Auml", '\u00C4');
   1036     ESCAPE_STRINGS.put("&Aring", '\u00C5');
   1037     ESCAPE_STRINGS.put("&AElig", '\u00C6');
   1038     ESCAPE_STRINGS.put("&Ccedil", '\u00C7');
   1039     ESCAPE_STRINGS.put("&Egrave", '\u00C8');
   1040     ESCAPE_STRINGS.put("&Eacute", '\u00C9');
   1041     ESCAPE_STRINGS.put("&Ecirc", '\u00CA');
   1042     ESCAPE_STRINGS.put("&Euml", '\u00CB');
   1043     ESCAPE_STRINGS.put("&Igrave", '\u00CC');
   1044     ESCAPE_STRINGS.put("&Iacute", '\u00CD');
   1045     ESCAPE_STRINGS.put("&Icirc", '\u00CE');
   1046     ESCAPE_STRINGS.put("&Iuml", '\u00CF');
   1047     ESCAPE_STRINGS.put("&ETH", '\u00D0');
   1048     ESCAPE_STRINGS.put("&Ntilde", '\u00D1');
   1049     ESCAPE_STRINGS.put("&Ograve", '\u00D2');
   1050     ESCAPE_STRINGS.put("&Oacute", '\u00D3');
   1051     ESCAPE_STRINGS.put("&Ocirc", '\u00D4');
   1052     ESCAPE_STRINGS.put("&Otilde", '\u00D5');
   1053     ESCAPE_STRINGS.put("&Ouml", '\u00D6');
   1054     ESCAPE_STRINGS.put("&times", '\u00D7');
   1055     ESCAPE_STRINGS.put("&Oslash", '\u00D8');
   1056     ESCAPE_STRINGS.put("&Ugrave", '\u00D9');
   1057     ESCAPE_STRINGS.put("&Uacute", '\u00DA');
   1058     ESCAPE_STRINGS.put("&Ucirc", '\u00DB');
   1059     ESCAPE_STRINGS.put("&Uuml", '\u00DC');
   1060     ESCAPE_STRINGS.put("&Yacute", '\u00DD');
   1061     ESCAPE_STRINGS.put("&THORN", '\u00DE');
   1062     ESCAPE_STRINGS.put("&szlig", '\u00DF');
   1063     ESCAPE_STRINGS.put("&agrave", '\u00E0');
   1064     ESCAPE_STRINGS.put("&aacute", '\u00E1');
   1065     ESCAPE_STRINGS.put("&acirc", '\u00E2');
   1066     ESCAPE_STRINGS.put("&atilde", '\u00E3');
   1067     ESCAPE_STRINGS.put("&auml", '\u00E4');
   1068     ESCAPE_STRINGS.put("&aring", '\u00E5');
   1069     ESCAPE_STRINGS.put("&aelig", '\u00E6');
   1070     ESCAPE_STRINGS.put("&ccedil", '\u00E7');
   1071     ESCAPE_STRINGS.put("&egrave", '\u00E8');
   1072     ESCAPE_STRINGS.put("&eacute", '\u00E9');
   1073     ESCAPE_STRINGS.put("&ecirc", '\u00EA');
   1074     ESCAPE_STRINGS.put("&euml", '\u00EB');
   1075     ESCAPE_STRINGS.put("&igrave", '\u00EC');
   1076     ESCAPE_STRINGS.put("&iacute", '\u00ED');
   1077     ESCAPE_STRINGS.put("&icirc", '\u00EE');
   1078     ESCAPE_STRINGS.put("&iuml", '\u00EF');
   1079     ESCAPE_STRINGS.put("&eth", '\u00F0');
   1080     ESCAPE_STRINGS.put("&ntilde", '\u00F1');
   1081     ESCAPE_STRINGS.put("&ograve", '\u00F2');
   1082     ESCAPE_STRINGS.put("&oacute", '\u00F3');
   1083     ESCAPE_STRINGS.put("&ocirc", '\u00F4');
   1084     ESCAPE_STRINGS.put("&otilde", '\u00F5');
   1085     ESCAPE_STRINGS.put("&ouml", '\u00F6');
   1086     ESCAPE_STRINGS.put("&divide", '\u00F7');
   1087     ESCAPE_STRINGS.put("&oslash", '\u00F8');
   1088     ESCAPE_STRINGS.put("&ugrave", '\u00F9');
   1089     ESCAPE_STRINGS.put("&uacute", '\u00FA');
   1090     ESCAPE_STRINGS.put("&ucirc", '\u00FB');
   1091     ESCAPE_STRINGS.put("&uuml", '\u00FC');
   1092     ESCAPE_STRINGS.put("&yacute", '\u00FD');
   1093     ESCAPE_STRINGS.put("&thorn", '\u00FE');
   1094     ESCAPE_STRINGS.put("&yuml", '\u00FF');
   1095     ESCAPE_STRINGS.put("&fnof", '\u0192');
   1096     ESCAPE_STRINGS.put("&Alpha", '\u0391');
   1097     ESCAPE_STRINGS.put("&Beta", '\u0392');
   1098     ESCAPE_STRINGS.put("&Gamma", '\u0393');
   1099     ESCAPE_STRINGS.put("&Delta", '\u0394');
   1100     ESCAPE_STRINGS.put("&Epsilon", '\u0395');
   1101     ESCAPE_STRINGS.put("&Zeta", '\u0396');
   1102     ESCAPE_STRINGS.put("&Eta", '\u0397');
   1103     ESCAPE_STRINGS.put("&Theta", '\u0398');
   1104     ESCAPE_STRINGS.put("&Iota", '\u0399');
   1105     ESCAPE_STRINGS.put("&Kappa", '\u039A');
   1106     ESCAPE_STRINGS.put("&Lambda", '\u039B');
   1107     ESCAPE_STRINGS.put("&Mu", '\u039C');
   1108     ESCAPE_STRINGS.put("&Nu", '\u039D');
   1109     ESCAPE_STRINGS.put("&Xi", '\u039E');
   1110     ESCAPE_STRINGS.put("&Omicron", '\u039F');
   1111     ESCAPE_STRINGS.put("&Pi", '\u03A0');
   1112     ESCAPE_STRINGS.put("&Rho", '\u03A1');
   1113     ESCAPE_STRINGS.put("&Sigma", '\u03A3');
   1114     ESCAPE_STRINGS.put("&Tau", '\u03A4');
   1115     ESCAPE_STRINGS.put("&Upsilon", '\u03A5');
   1116     ESCAPE_STRINGS.put("&Phi", '\u03A6');
   1117     ESCAPE_STRINGS.put("&Chi", '\u03A7');
   1118     ESCAPE_STRINGS.put("&Psi", '\u03A8');
   1119     ESCAPE_STRINGS.put("&Omega", '\u03A9');
   1120     ESCAPE_STRINGS.put("&alpha", '\u03B1');
   1121     ESCAPE_STRINGS.put("&beta", '\u03B2');
   1122     ESCAPE_STRINGS.put("&gamma", '\u03B3');
   1123     ESCAPE_STRINGS.put("&delta", '\u03B4');
   1124     ESCAPE_STRINGS.put("&epsilon", '\u03B5');
   1125     ESCAPE_STRINGS.put("&zeta", '\u03B6');
   1126     ESCAPE_STRINGS.put("&eta", '\u03B7');
   1127     ESCAPE_STRINGS.put("&theta", '\u03B8');
   1128     ESCAPE_STRINGS.put("&iota", '\u03B9');
   1129     ESCAPE_STRINGS.put("&kappa", '\u03BA');
   1130     ESCAPE_STRINGS.put("&lambda", '\u03BB');
   1131     ESCAPE_STRINGS.put("&mu", '\u03BC');
   1132     ESCAPE_STRINGS.put("&nu", '\u03BD');
   1133     ESCAPE_STRINGS.put("&xi", '\u03BE');
   1134     ESCAPE_STRINGS.put("&omicron", '\u03BF');
   1135     ESCAPE_STRINGS.put("&pi", '\u03C0');
   1136     ESCAPE_STRINGS.put("&rho", '\u03C1');
   1137     ESCAPE_STRINGS.put("&sigmaf", '\u03C2');
   1138     ESCAPE_STRINGS.put("&sigma", '\u03C3');
   1139     ESCAPE_STRINGS.put("&tau", '\u03C4');
   1140     ESCAPE_STRINGS.put("&upsilon", '\u03C5');
   1141     ESCAPE_STRINGS.put("&phi", '\u03C6');
   1142     ESCAPE_STRINGS.put("&chi", '\u03C7');
   1143     ESCAPE_STRINGS.put("&psi", '\u03C8');
   1144     ESCAPE_STRINGS.put("&omega", '\u03C9');
   1145     ESCAPE_STRINGS.put("&thetasym", '\u03D1');
   1146     ESCAPE_STRINGS.put("&upsih", '\u03D2');
   1147     ESCAPE_STRINGS.put("&piv", '\u03D6');
   1148     ESCAPE_STRINGS.put("&bull", '\u2022');
   1149     ESCAPE_STRINGS.put("&hellip", '\u2026');
   1150     ESCAPE_STRINGS.put("&prime", '\u2032');
   1151     ESCAPE_STRINGS.put("&Prime", '\u2033');
   1152     ESCAPE_STRINGS.put("&oline", '\u203E');
   1153     ESCAPE_STRINGS.put("&frasl", '\u2044');
   1154     ESCAPE_STRINGS.put("&weierp", '\u2118');
   1155     ESCAPE_STRINGS.put("&image", '\u2111');
   1156     ESCAPE_STRINGS.put("&real", '\u211C');
   1157     ESCAPE_STRINGS.put("&trade", '\u2122');
   1158     ESCAPE_STRINGS.put("&alefsym", '\u2135');
   1159     ESCAPE_STRINGS.put("&larr", '\u2190');
   1160     ESCAPE_STRINGS.put("&uarr", '\u2191');
   1161     ESCAPE_STRINGS.put("&rarr", '\u2192');
   1162     ESCAPE_STRINGS.put("&darr", '\u2193');
   1163     ESCAPE_STRINGS.put("&harr", '\u2194');
   1164     ESCAPE_STRINGS.put("&crarr", '\u21B5');
   1165     ESCAPE_STRINGS.put("&lArr", '\u21D0');
   1166     ESCAPE_STRINGS.put("&uArr", '\u21D1');
   1167     ESCAPE_STRINGS.put("&rArr", '\u21D2');
   1168     ESCAPE_STRINGS.put("&dArr", '\u21D3');
   1169     ESCAPE_STRINGS.put("&hArr", '\u21D4');
   1170     ESCAPE_STRINGS.put("&forall", '\u2200');
   1171     ESCAPE_STRINGS.put("&part", '\u2202');
   1172     ESCAPE_STRINGS.put("&exist", '\u2203');
   1173     ESCAPE_STRINGS.put("&empty", '\u2205');
   1174     ESCAPE_STRINGS.put("&nabla", '\u2207');
   1175     ESCAPE_STRINGS.put("&isin", '\u2208');
   1176     ESCAPE_STRINGS.put("&notin", '\u2209');
   1177     ESCAPE_STRINGS.put("&ni", '\u220B');
   1178     ESCAPE_STRINGS.put("&prod", '\u220F');
   1179     ESCAPE_STRINGS.put("&sum", '\u2211');
   1180     ESCAPE_STRINGS.put("&minus", '\u2212');
   1181     ESCAPE_STRINGS.put("&lowast", '\u2217');
   1182     ESCAPE_STRINGS.put("&radic", '\u221A');
   1183     ESCAPE_STRINGS.put("&prop", '\u221D');
   1184     ESCAPE_STRINGS.put("&infin", '\u221E');
   1185     ESCAPE_STRINGS.put("&ang", '\u2220');
   1186     ESCAPE_STRINGS.put("&and", '\u2227');
   1187     ESCAPE_STRINGS.put("&or", '\u2228');
   1188     ESCAPE_STRINGS.put("&cap", '\u2229');
   1189     ESCAPE_STRINGS.put("&cup", '\u222A');
   1190     ESCAPE_STRINGS.put("&int", '\u222B');
   1191     ESCAPE_STRINGS.put("&there4", '\u2234');
   1192     ESCAPE_STRINGS.put("&sim", '\u223C');
   1193     ESCAPE_STRINGS.put("&cong", '\u2245');
   1194     ESCAPE_STRINGS.put("&asymp", '\u2248');
   1195     ESCAPE_STRINGS.put("&ne", '\u2260');
   1196     ESCAPE_STRINGS.put("&equiv", '\u2261');
   1197     ESCAPE_STRINGS.put("&le", '\u2264');
   1198     ESCAPE_STRINGS.put("&ge", '\u2265');
   1199     ESCAPE_STRINGS.put("&sub", '\u2282');
   1200     ESCAPE_STRINGS.put("&sup", '\u2283');
   1201     ESCAPE_STRINGS.put("&nsub", '\u2284');
   1202     ESCAPE_STRINGS.put("&sube", '\u2286');
   1203     ESCAPE_STRINGS.put("&supe", '\u2287');
   1204     ESCAPE_STRINGS.put("&oplus", '\u2295');
   1205     ESCAPE_STRINGS.put("&otimes", '\u2297');
   1206     ESCAPE_STRINGS.put("&perp", '\u22A5');
   1207     ESCAPE_STRINGS.put("&sdot", '\u22C5');
   1208     ESCAPE_STRINGS.put("&lceil", '\u2308');
   1209     ESCAPE_STRINGS.put("&rceil", '\u2309');
   1210     ESCAPE_STRINGS.put("&lfloor", '\u230A');
   1211     ESCAPE_STRINGS.put("&rfloor", '\u230B');
   1212     ESCAPE_STRINGS.put("&lang", '\u2329');
   1213     ESCAPE_STRINGS.put("&rang", '\u232A');
   1214     ESCAPE_STRINGS.put("&loz", '\u25CA');
   1215     ESCAPE_STRINGS.put("&spades", '\u2660');
   1216     ESCAPE_STRINGS.put("&clubs", '\u2663');
   1217     ESCAPE_STRINGS.put("&hearts", '\u2665');
   1218     ESCAPE_STRINGS.put("&diams", '\u2666');
   1219     ESCAPE_STRINGS.put("&quot", '\u0022');
   1220     ESCAPE_STRINGS.put("&amp", '\u0026');
   1221     ESCAPE_STRINGS.put("&lt", '\u003C');
   1222     ESCAPE_STRINGS.put("&gt", '\u003E');
   1223     ESCAPE_STRINGS.put("&OElig", '\u0152');
   1224     ESCAPE_STRINGS.put("&oelig", '\u0153');
   1225     ESCAPE_STRINGS.put("&Scaron", '\u0160');
   1226     ESCAPE_STRINGS.put("&scaron", '\u0161');
   1227     ESCAPE_STRINGS.put("&Yuml", '\u0178');
   1228     ESCAPE_STRINGS.put("&circ", '\u02C6');
   1229     ESCAPE_STRINGS.put("&tilde", '\u02DC');
   1230     ESCAPE_STRINGS.put("&ensp", '\u2002');
   1231     ESCAPE_STRINGS.put("&emsp", '\u2003');
   1232     ESCAPE_STRINGS.put("&thinsp", '\u2009');
   1233     ESCAPE_STRINGS.put("&zwnj", '\u200C');
   1234     ESCAPE_STRINGS.put("&zwj", '\u200D');
   1235     ESCAPE_STRINGS.put("&lrm", '\u200E');
   1236     ESCAPE_STRINGS.put("&rlm", '\u200F');
   1237     ESCAPE_STRINGS.put("&ndash", '\u2013');
   1238     ESCAPE_STRINGS.put("&mdash", '\u2014');
   1239     ESCAPE_STRINGS.put("&lsquo", '\u2018');
   1240     ESCAPE_STRINGS.put("&rsquo", '\u2019');
   1241     ESCAPE_STRINGS.put("&sbquo", '\u201A');
   1242     ESCAPE_STRINGS.put("&ldquo", '\u201C');
   1243     ESCAPE_STRINGS.put("&rdquo", '\u201D');
   1244     ESCAPE_STRINGS.put("&bdquo", '\u201E');
   1245     ESCAPE_STRINGS.put("&dagger", '\u2020');
   1246     ESCAPE_STRINGS.put("&Dagger", '\u2021');
   1247     ESCAPE_STRINGS.put("&permil", '\u2030');
   1248     ESCAPE_STRINGS.put("&lsaquo", '\u2039');
   1249     ESCAPE_STRINGS.put("&rsaquo", '\u203A');
   1250     ESCAPE_STRINGS.put("&euro", '\u20AC');
   1251 
   1252     HEX_LETTERS = new HashSet<Character>(12);
   1253 
   1254     HEX_LETTERS.add('a');
   1255     HEX_LETTERS.add('A');
   1256     HEX_LETTERS.add('b');
   1257     HEX_LETTERS.add('B');
   1258     HEX_LETTERS.add('c');
   1259     HEX_LETTERS.add('C');
   1260     HEX_LETTERS.add('d');
   1261     HEX_LETTERS.add('D');
   1262     HEX_LETTERS.add('e');
   1263     HEX_LETTERS.add('E');
   1264     HEX_LETTERS.add('f');
   1265     HEX_LETTERS.add('F');
   1266   }
   1267 
   1268   /**
   1269    * <p>
   1270    * Replace all the occurences of HTML escape strings with the
   1271    * respective characters.
   1272    * </p>
   1273    * <p>
   1274    * The default mode is strict (requiring semicolons).
   1275    * </p>
   1276    *
   1277    * @param s a <code>String</code> value
   1278    * @return a <code>String</code> value
   1279    * @throws NullPointerException if the input string is null.
   1280    */
   1281   public static final String unescapeHTML(String s) {
   1282     return unescapeHTML(s, false);
   1283   }
   1284 
   1285   /**
   1286    * Replace all the occurences of HTML escape strings with the
   1287    * respective characters.
   1288    *
   1289    * @param s a <code>String</code> value
   1290    * @param emulateBrowsers a <code>Boolean</code> value that tells the method
   1291    *     to allow entity refs not terminated with a semicolon to be unescaped.
   1292    *     (a quirk of this feature, and some browsers, is that an explicit
   1293    *     terminating character is needed - e.g., &lt$ would be unescaped, but
   1294    *     not &ltab - see the tests for a more in-depth description of browsers)
   1295    * @return a <code>String</code> value
   1296    * @throws NullPointerException if the input string is null.
   1297    */
   1298   public static final String unescapeHTML(String s, boolean emulateBrowsers) {
   1299 
   1300     // See if there are any '&' in the string since that is what we look
   1301     // for to escape. If there isn't, then we don't need to escape this string
   1302     // Based on similar technique used in the escape function.
   1303     int index = s.indexOf('&');
   1304     if (index == -1) {
   1305       // Nothing to escape. Return the original string.
   1306       return s;
   1307     }
   1308 
   1309     // We found an escaped character. Start slow escaping from there.
   1310     char[] chars = s.toCharArray();
   1311     char[] escaped = new char[chars.length];
   1312     System.arraycopy(chars, 0, escaped, 0, index);
   1313 
   1314     // Note: escaped[pos] = end of the escaped char array.
   1315     int pos = index;
   1316 
   1317     for (int i = index; i < chars.length;) {
   1318       if (chars[i] != '&') {
   1319         escaped[pos++] = chars[i++];
   1320         continue;
   1321       }
   1322 
   1323       // Allow e.g. &#123;
   1324       int j = i + 1;
   1325       boolean isNumericEntity = false;
   1326       if (j < chars.length && chars[j] == '#') {
   1327         j++;
   1328         isNumericEntity = true;
   1329       }
   1330 
   1331       // if it's numeric, also check for hex
   1332       boolean isHexEntity = false;
   1333       if (j < chars.length && (chars[j] == 'x' || chars[j] == 'X')) {
   1334         j++;
   1335         isHexEntity = true;
   1336       }
   1337 
   1338       // Scan until we find a char that is not valid for this sequence.
   1339       for (; j < chars.length; j++) {
   1340         char ch = chars[j];
   1341         boolean isDigit = Character.isDigit(ch);
   1342         if (isNumericEntity) {
   1343           // non-hex numeric sequence end condition
   1344           if (!isHexEntity && !isDigit) {
   1345             break;
   1346           }
   1347           // hex sequence end contition
   1348           if (isHexEntity && !isDigit && !HEX_LETTERS.contains(ch)) {
   1349             break;
   1350           }
   1351         }
   1352         // anything other than a digit or letter is always an end condition
   1353         if (!isDigit && !Character.isLetter(ch)) {
   1354           break;
   1355         }
   1356       }
   1357 
   1358       boolean replaced = false;
   1359       if ((j <= chars.length && emulateBrowsers) ||
   1360           (j < chars.length && chars[j] == ';')) {
   1361         // Check for &#D; and &#xD; pattern
   1362         if (i + 2 < chars.length && s.charAt(i + 1) == '#') {
   1363           try {
   1364             long charcode = 0;
   1365             char ch = s.charAt(i + 2);
   1366             if (isHexEntity) {
   1367               charcode = Long.parseLong(
   1368                   new String(chars, i + 3, j - i - 3), 16);
   1369             } else if (Character.isDigit(ch)) {
   1370               charcode = Long.parseLong(
   1371                   new String(chars, i + 2, j - i - 2));
   1372             }
   1373             // D800 to DFFF are for UTF16 surrogate pairs, and are not valid HTML entities
   1374             // Code points 0xFFFE and 0xFFFF are unicode noncharacters
   1375             if ((charcode > 0 && charcode < 0xD800) || (charcode > 0xDFFF && charcode < 0xFFFE)) {
   1376               escaped[pos++] = (char) charcode;
   1377               replaced = true;
   1378             } else if (charcode >= 0x10000 && charcode < 0x110000) {
   1379               // These characters are represented as surrogate pairs in UTF16
   1380               escaped[pos++] = (char) ((charcode - 0x10000) / 0x400 + 0xD800);
   1381               escaped[pos++] = (char) ((charcode - 0x10000) % 0x400 + 0xDC00);
   1382               replaced = true;
   1383             }
   1384           } catch (NumberFormatException ex) {
   1385             // Failed, not replaced.
   1386           }
   1387         } else {
   1388           String key = new String(chars, i, j - i);
   1389           Character repl = ESCAPE_STRINGS.get(key);
   1390           if (repl != null) {
   1391             escaped[pos++] = repl;
   1392             replaced = true;
   1393           }
   1394         }
   1395         // Skip over ';'
   1396         if (j < chars.length && chars[j] == ';') {
   1397           j++;
   1398         }
   1399       }
   1400 
   1401       if (!replaced) {
   1402         // Not a recognized escape sequence, leave as-is
   1403         System.arraycopy(chars, i, escaped, pos, j - i);
   1404         pos += j - i;
   1405       }
   1406       i = j;
   1407     }
   1408     return new String(escaped, 0, pos);
   1409   }
   1410 
   1411   // Escaper for < and > only.
   1412   private static final CharEscaper LT_GT_ESCAPE =
   1413       new CharEscaperBuilder()
   1414         .addEscape('<', "&lt;")
   1415         .addEscape('>', "&gt;")
   1416         .toEscaper();
   1417 
   1418   private static final Pattern htmlTagPattern =
   1419       Pattern.compile("</?[a-zA-Z][^>]*>");
   1420 
   1421   /**
   1422    * Given a <code>String</code>, returns an equivalent <code>String</code> with
   1423    * all HTML tags stripped. Note that HTML entities, such as "&amp;amp;" will
   1424    * still be preserved.
   1425    */
   1426   public static String stripHtmlTags(String string) {
   1427     if ((string == null) || "".equals(string)) {
   1428       return string;
   1429     }
   1430     String stripped = htmlTagPattern.matcher(string).replaceAll("");
   1431     /*
   1432      * Certain inputs result in a well-formed HTML:
   1433      * <<X>script>alert(0)<</X>/script> results in <script>alert(0)</script>
   1434      * The following step ensures that no HTML can slip through by replacing all
   1435      * < and > characters with &lt; and &gt; after HTML tags were stripped.
   1436      */
   1437     return LT_GT_ESCAPE.escape(stripped);
   1438   }
   1439 
   1440   /**
   1441    * We escape some characters in s to be able to insert strings into JavaScript
   1442    * code. Also, make sure that we don't write out {@code -->} or
   1443    * {@code </script>}, which may close a script tag, or any char in ["'>] which
   1444    * might close a tag or attribute if seen inside an attribute.
   1445    */
   1446   public static String javaScriptEscape(CharSequence s) {
   1447     return javaScriptEscapeHelper(s, false);
   1448   }
   1449 
   1450   /**
   1451    * We escape some characters in s to be able to insert strings into JavaScript
   1452    * code. Also, make sure that we don't write out {@code -->} or
   1453    * {@code </script>}, which may close a script tag, or any char in ["'>] which
   1454    * might close a tag or attribute if seen inside an attribute.
   1455    * Turns all non-ascii characters into ASCII javascript escape sequences
   1456    * (eg \\uhhhh or \ooo).
   1457    */
   1458   public static String javaScriptEscapeToAscii(CharSequence s) {
   1459     return javaScriptEscapeHelper(s, true);
   1460   }
   1461 
   1462   /**
   1463    * Represents the type of javascript escaping to perform.  Each enum below
   1464    * determines whether to use octal escapes and how to handle quotes.
   1465    */
   1466   public static enum JsEscapingMode {
   1467     /** No octal escapes, pass-through ', and escape " as \". */
   1468     JSON,
   1469 
   1470     /** Octal escapes, escapes ' and " to \42 and \47, respectively. */
   1471     EMBEDDABLE_JS,
   1472 
   1473     /** Octal escapes, escapes ' and " to \' and \". */
   1474     MINIMAL_JS
   1475   }
   1476 
   1477   /**
   1478    * Helper for javaScriptEscape and javaScriptEscapeToAscii
   1479    */
   1480   private static String javaScriptEscapeHelper(CharSequence s,
   1481                                                boolean escapeToAscii) {
   1482     StringBuilder sb = new StringBuilder(s.length() * 9 / 8);
   1483     try {
   1484       escapeStringBody(s, escapeToAscii, JsEscapingMode.EMBEDDABLE_JS, sb);
   1485     } catch (IOException ex) {
   1486       // StringBuilder.append does not throw IOExceptions.
   1487       throw new RuntimeException(ex);
   1488     }
   1489     return sb.toString();
   1490   }
   1491 
   1492   /**
   1493    * Appends the javascript string literal equivalent of plainText to the given
   1494    * out buffer.
   1495    * @param plainText the string to escape.
   1496    * @param escapeToAscii true to encode all characters not in ascii [\x20-\x7e]
   1497    *   <br>
   1498    *   Full escaping of unicode entites isn't required but this makes
   1499    *   sure that unicode strings will survive regardless of the
   1500    *   content-encoding of the javascript file which is important when
   1501    *   we use this function to autogenerated javascript source files.
   1502    *   This is disabled by default because it makes non-latin strings very long.
   1503    *   <br>
   1504    *   If you seem to have trouble with character-encodings, maybe
   1505    *   turn this on to see if the problem goes away.  If so, you need
   1506    *   to specify a character encoding for your javascript somewhere.
   1507    * @param jsEscapingMode determines the type of escaping to perform.
   1508    * @param out the buffer to append output to.
   1509    */
   1510   /*
   1511    * To avoid fallthrough, we would have to either use a hybrid switch-case/if
   1512    * approach (which would obscure our special handling for ' and "), duplicate
   1513    * the content of the default case, or pass a half-dozen parameters to a
   1514    * helper method containing the code from the default case.
   1515    */
   1516   @SuppressWarnings("fallthrough")
   1517   public static void escapeStringBody(
   1518       CharSequence plainText, boolean escapeToAscii,
   1519       JsEscapingMode jsEscapingMode, Appendable out)
   1520       throws IOException {
   1521     int pos = 0;  // Index just past the last char in plainText written to out.
   1522     int len = plainText.length();
   1523     for (int codePoint, charCount, i = 0; i < len; i += charCount) {
   1524       codePoint = Character.codePointAt(plainText, i);
   1525       charCount = Character.charCount(codePoint);
   1526 
   1527       if (!shouldEscapeChar(codePoint, escapeToAscii, jsEscapingMode)) {
   1528         continue;
   1529       }
   1530 
   1531       out.append(plainText, pos, i);
   1532       pos = i + charCount;
   1533       switch (codePoint) {
   1534         case '\b': out.append("\\b"); break;
   1535         case '\t': out.append("\\t"); break;
   1536         case '\n': out.append("\\n"); break;
   1537         case '\f': out.append("\\f"); break;
   1538         case '\r': out.append("\\r"); break;
   1539         case '\\': out.append("\\\\"); break;
   1540         case '"': case '\'':
   1541           if (jsEscapingMode == JsEscapingMode.JSON && '\'' == codePoint) {
   1542             // JSON does not escape a single quote (and it should be surrounded
   1543             // by double quotes).
   1544             out.append((char) codePoint);
   1545             break;
   1546           } else if (jsEscapingMode != JsEscapingMode.EMBEDDABLE_JS) {
   1547             out.append('\\').append((char) codePoint);
   1548             break;
   1549           }
   1550           // fall through
   1551         default:
   1552           if (codePoint >= 0x100 || jsEscapingMode == JsEscapingMode.JSON) {
   1553             appendHexJavaScriptRepresentation(codePoint, out);
   1554           } else {
   1555             // Output the minimal octal encoding.  We can't use an encoding
   1556             // shorter than three digits if the next digit is a valid octal
   1557             // digit.
   1558             boolean pad = i + charCount >= len
   1559                 || isOctal(plainText.charAt(i + charCount));
   1560             appendOctalJavaScriptRepresentation((char) codePoint, pad, out);
   1561           }
   1562           break;
   1563       }
   1564     }
   1565     out.append(plainText, pos, len);
   1566   }
   1567 
   1568   /**
   1569    * Helper for escapeStringBody, which decides whether to escape a character.
   1570    */
   1571   private static boolean shouldEscapeChar(int codePoint,
   1572       boolean escapeToAscii, JsEscapingMode jsEscapingMode) {
   1573     // If non-ASCII chars should be escaped, identify non-ASCII code points.
   1574     if (escapeToAscii && (codePoint < 0x20 || codePoint > 0x7e)) {
   1575       return true;
   1576     }
   1577 
   1578     // If in JSON escaping mode, check JSON *and* JS escaping rules. The JS
   1579     // escaping rules will escape more characters than needed for JSON,
   1580     // but it is safe to escape any character in JSON.
   1581     // TODO(bbavar): Remove unnecessary escaping for JSON, as long as it can be
   1582     //               shown that this change in legacy behavior is safe.
   1583     if (jsEscapingMode == JsEscapingMode.JSON) {
   1584       return mustEscapeCharInJsonString(codePoint)
   1585           || mustEscapeCharInJsString(codePoint);
   1586     }
   1587 
   1588     // Finally, just check the default JS escaping rules.
   1589     return mustEscapeCharInJsString(codePoint);
   1590   }
   1591 
   1592   /**
   1593    * Returns a javascript representation of the character in a hex escaped
   1594    * format.
   1595    *
   1596    * @param codePoint The codepoint to append.
   1597    * @param out The buffer to which the hex representation should be appended.
   1598    */
   1599   private static void appendHexJavaScriptRepresentation(
   1600       int codePoint, Appendable out)
   1601       throws IOException {
   1602     if (Character.isSupplementaryCodePoint(codePoint)) {
   1603       // Handle supplementary unicode values which are not representable in
   1604       // javascript.  We deal with these by escaping them as two 4B sequences
   1605       // so that they will round-trip properly when sent from java to javascript
   1606       // and back.
   1607       char[] surrogates = Character.toChars(codePoint);
   1608       appendHexJavaScriptRepresentation(surrogates[0], out);
   1609       appendHexJavaScriptRepresentation(surrogates[1], out);
   1610       return;
   1611     }
   1612     out.append("\\u")
   1613         .append(HEX_CHARS[(codePoint >>> 12) & 0xf])
   1614         .append(HEX_CHARS[(codePoint >>> 8) & 0xf])
   1615         .append(HEX_CHARS[(codePoint >>> 4) & 0xf])
   1616         .append(HEX_CHARS[codePoint & 0xf]);
   1617   }
   1618 
   1619   /**
   1620    * Returns a javascript representation of the character in a hex escaped
   1621    * format. Although this is a rather specific method, it is made public
   1622    * because it is also used by the JSCompiler.
   1623    *
   1624    * @param ch The character to append.
   1625    * @param pad true to force use of the full 3 digit representation.
   1626    * @param out The buffer to which the hex representation should be appended.
   1627    */
   1628   private static void appendOctalJavaScriptRepresentation(
   1629       char ch, boolean pad, Appendable out) throws IOException {
   1630     if (ch >= 0100
   1631         // Be paranoid at the end of a string since someone might call
   1632         // this method again with another string segment.
   1633         || pad) {
   1634       out.append('\\')
   1635           .append(OCTAL_CHARS[(ch >>> 6) & 0x7])
   1636           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
   1637           .append(OCTAL_CHARS[ch & 0x7]);
   1638     } else if (ch >= 010) {
   1639       out.append('\\')
   1640           .append(OCTAL_CHARS[(ch >>> 3) & 0x7])
   1641           .append(OCTAL_CHARS[ch & 0x7]);
   1642     } else {
   1643       out.append('\\')
   1644           .append(OCTAL_CHARS[ch & 0x7]);
   1645     }
   1646   }
   1647 
   1648   /**
   1649    * Although this is a rather specific method, it is made public
   1650    * because it is also used by the JSCompiler.
   1651    *
   1652    * @see #appendHexJavaScriptRepresentation(int, Appendable)
   1653    */
   1654   public static void appendHexJavaScriptRepresentation(StringBuilder sb,
   1655                                                        char c) {
   1656     try {
   1657       appendHexJavaScriptRepresentation(c, sb);
   1658     } catch (IOException ex) {
   1659       // StringBuilder does not throw IOException.
   1660       throw new RuntimeException(ex);
   1661     }
   1662   }
   1663 
   1664   /**
   1665    * Undo escaping as performed in javaScriptEscape(.)
   1666    * Throws an IllegalArgumentException if the string contains
   1667    * bad escaping.
   1668    */
   1669   public static String javaScriptUnescape(String s) {
   1670     StringBuilder sb = new StringBuilder(s.length());
   1671     for (int i = 0; i < s.length(); ) {
   1672       char c = s.charAt(i);
   1673       if (c == '\\') {
   1674         i = javaScriptUnescapeHelper(s, i + 1, sb);
   1675       } else {
   1676         sb.append(c);
   1677         i++;
   1678       }
   1679     }
   1680     return sb.toString();
   1681   }
   1682 
   1683   /**
   1684    * Looks for an escape code starting at index i of s,
   1685    * and appends it to sb.
   1686    * @return the index of the first character in s
   1687    * after the escape code.
   1688    * @throws IllegalArgumentException if the escape code
   1689    * is invalid
   1690    */
   1691   private static int javaScriptUnescapeHelper(String s, int i,
   1692                                               StringBuilder sb) {
   1693     if (i >= s.length()) {
   1694       throw new IllegalArgumentException(
   1695           "End-of-string after escape character in [" + s + "]");
   1696     }
   1697 
   1698     char c = s.charAt(i++);
   1699     switch (c) {
   1700       case 'n': sb.append('\n'); break;
   1701       case 'r': sb.append('\r'); break;
   1702       case 't': sb.append('\t'); break;
   1703       case 'b': sb.append('\b'); break;
   1704       case 'f': sb.append('\f'); break;
   1705       case '\\':
   1706       case '\"':
   1707       case '\'':
   1708       case '>':
   1709         sb.append(c);
   1710         break;
   1711       case '0': case '1': case '2': case '3':
   1712       case '4': case '5': case '6': case '7':
   1713         --i;  // backup to first octal digit
   1714         int nOctalDigits = 1;
   1715         int digitLimit = c < '4' ? 3 : 2;
   1716         while (nOctalDigits < digitLimit && i + nOctalDigits < s.length()
   1717                && isOctal(s.charAt(i + nOctalDigits))) {
   1718           ++nOctalDigits;
   1719         }
   1720         sb.append(
   1721             (char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
   1722         i += nOctalDigits;
   1723         break;
   1724       case 'x':
   1725       case 'u':
   1726         String hexCode;
   1727         int nHexDigits = (c == 'u' ? 4 : 2);
   1728         try {
   1729           hexCode = s.substring(i, i + nHexDigits);
   1730         } catch (IndexOutOfBoundsException ioobe) {
   1731           throw new IllegalArgumentException(
   1732               "Invalid unicode sequence [" + s.substring(i) + "] at index " + i
   1733               + " in [" + s + "]");
   1734         }
   1735         int unicodeValue;
   1736         try {
   1737           unicodeValue = Integer.parseInt(hexCode, 16);
   1738         } catch (NumberFormatException nfe) {
   1739           throw new IllegalArgumentException(
   1740               "Invalid unicode sequence [" + hexCode + "] at index " + i +
   1741               " in [" + s + "]");
   1742         }
   1743         sb.append((char) unicodeValue);
   1744         i += nHexDigits;
   1745         break;
   1746       default:
   1747         throw new IllegalArgumentException(
   1748             "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"
   1749             );
   1750     }
   1751 
   1752     return i;
   1753   }
   1754 
   1755   // C0 control characters except \t, \n, and \r and 0xFFFE and 0xFFFF
   1756   private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf(
   1757       "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
   1758       "\u0008\u000B\u000C\u000E\u000F" +
   1759       "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
   1760       "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
   1761       "\uFFFE\uFFFF");
   1762 
   1763   /**
   1764    * Escape a string that is meant to be embedded in a CDATA section.
   1765    * The returned string is guaranteed to be valid CDATA content.
   1766    * The syntax of CDATA sections is the following:
   1767    * <blockquote>
   1768    *   <code>&lt;[!CDATA[...]]&gt;</code>
   1769    * </blockquote>
   1770    * The only invalid character sequence in a CDATA tag is "]]&gt;".
   1771    * If this sequence is present in the input string, we replace
   1772    * it by closing the current CDATA field, then write ']]&amp;gt;',
   1773    * then reopen a new CDATA section.
   1774    */
   1775   public static String xmlCDataEscape(String s) {
   1776      // Make sure there are no illegal control characters.
   1777      s = CONTROL_MATCHER.removeFrom(s);
   1778     // Return the original reference if the string doesn't have a match.
   1779     int found = s.indexOf("]]>");
   1780     if (found == -1) {
   1781       return s;
   1782     }
   1783 
   1784     // For each occurrence of "]]>", append a string that adds "]]&gt;" after
   1785     // the end of the CDATA which has just been closed, then opens a new CDATA.
   1786     StringBuilder sb = new StringBuilder();
   1787     int prev = 0;
   1788     do {
   1789       sb.append(s.substring(prev, found + 3));
   1790       sb.append("]]&gt;<![CDATA[");
   1791       prev = found + 3;
   1792     } while ((found = s.indexOf("]]>", prev)) != -1);
   1793     sb.append(s.substring(prev));
   1794     return sb.toString();
   1795   }
   1796 
   1797   /**
   1798    * We escape some characters in s to be able to insert strings into Java code
   1799    *
   1800    * @deprecated Use {@link CharEscapers#asciiHtmlEscaper()} and {@link
   1801    * CharEscapers#javaCharEscaper()} or {@link CharEscapers#javaStringEscaper()}
   1802    * instead. This method combines two forms of escaping in a way that's rarely
   1803    * desired.
   1804    */
   1805   @Deprecated
   1806   public static String javaEscape(String s) {
   1807     return JAVA_ESCAPE.escape(s);
   1808   }
   1809 
   1810   // Java escaper.
   1811   private static final CharEscaper JAVA_ESCAPE =
   1812       new CharEscaperBuilder()
   1813         .addEscape('\n', "\\n")
   1814         .addEscape('\r', "\\r")
   1815         .addEscape('\t', "\\t")
   1816         .addEscape('\\', "\\\\")
   1817         .addEscape('\"', "\\\"")
   1818         .addEscape('&', "&amp;")
   1819         .addEscape('<', "&lt;")
   1820         .addEscape('>', "&gt;")
   1821         .addEscape('\'', "\\\'")
   1822         .toEscaper();
   1823 
   1824   /**
   1825    * Escapes the special characters from a string so it can be used as part of
   1826    * a regex pattern. This method is for use on gnu.regexp style regular
   1827    * expressions.
   1828    *
   1829    * @deprecated Use {@link Pattern#quote(String)} instead. Note that it may not
   1830    * be compatible with gnu.regexp style regular expressions.
   1831    */
   1832   @Deprecated
   1833   public static String regexEscape(String s) {
   1834     return REGEX_ESCAPE.escape(s);
   1835   }
   1836 
   1837   // Regex escaper escapes all regex characters.
   1838   private static final CharEscaper REGEX_ESCAPE =
   1839       new CharEscaperBuilder()
   1840         .addEscape('(', "\\(")
   1841         .addEscape(')', "\\)")
   1842         .addEscape('|', "\\|")
   1843         .addEscape('*', "\\*")
   1844         .addEscape('+', "\\+")
   1845         .addEscape('?', "\\?")
   1846         .addEscape('.', "\\.")
   1847         .addEscape('{', "\\{")
   1848         .addEscape('}', "\\}")
   1849         .addEscape('[', "\\[")
   1850         .addEscape(']', "\\]")
   1851         .addEscape('$', "\\$")
   1852         .addEscape('^', "\\^")
   1853         .addEscape('\\', "\\\\")
   1854         .toEscaper();
   1855 
   1856   /**
   1857    *  If you want to preserve the exact
   1858    * current (odd) behavior when {@code doStrip} is {@code true}, use
   1859    * {@code .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on
   1860    * the splitter.
   1861    *
   1862    * @param in what to process
   1863    * @param delimiter the delimiting string
   1864    * @return the tokens
   1865    * @deprecated see the detailed instructions under
   1866    *     {@link #split(String, String, boolean)}
   1867    */
   1868   @Deprecated
   1869   public static LinkedList<String> string2List(
   1870       String in, String delimiter, boolean doStrip) {
   1871     if (in == null) {
   1872       return null;
   1873     }
   1874 
   1875     LinkedList<String> out = new LinkedList<String>();
   1876     string2Collection(in, delimiter, doStrip, out);
   1877     return out;
   1878   }
   1879 
   1880   /**
   1881    * See the detailed instructions under {@link
   1882    * #split(String, String, boolean)}. Pass the resulting {@code Iterable} to
   1883    * {@link com.google.common.collect.Sets#newHashSet(Iterable)}. If you want to
   1884    * preserve the exact current (odd) behavior when {@code doStrip} is {@code
   1885    * true}, use {@code
   1886    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
   1887    * splitter.
   1888    *
   1889    * @param in what to process
   1890    * @param delimiter the delimiting string
   1891    * @param doStrip to strip the substrings before adding to the list
   1892    * @return the tokens
   1893    * @deprecated see the detailed instructions under
   1894    *     {@link #split(String, String, boolean)}
   1895    */
   1896   @Deprecated
   1897   public static Set<String> string2Set(
   1898        String in, String delimiter, boolean doStrip) {
   1899     if (in == null) {
   1900       return null;
   1901     }
   1902 
   1903     HashSet<String> out = new HashSet<String>();
   1904     string2Collection(in, delimiter, doStrip, out);
   1905     return out;
   1906   }
   1907 
   1908   /**
   1909    * See the detailed instructions under {@link
   1910    * #split(String, String, boolean)}. If you want to preserve the exact current
   1911    * (odd) behavior when {@code doStrip} is {@code true}, use {@code
   1912    * .trimResults(CharMatcher.LEGACY_WHITESPACE).omitEmptyStrings()} on the
   1913    * splitter.
   1914    *
   1915    * @param in The delimited input string to process
   1916    * @param delimiter The string delimiting entries in the input string.
   1917    * @param doStrip whether to strip the substrings before adding to the
   1918    *          collection
   1919    * @param collection The collection to which the strings will be added. If
   1920    *          <code>null</code>, a new <code>List</code> will be created.
   1921    * @return The collection to which the substrings were added. This is
   1922    *         syntactic sugar to allow call chaining.
   1923    * @deprecated see the detailed instructions under
   1924    *     {@link #split(String, String, boolean)}
   1925    */
   1926   @Deprecated
   1927   public static Collection<String> string2Collection(
   1928       String in,
   1929       String delimiter,
   1930       boolean doStrip,
   1931       Collection<String> collection) {
   1932     if (in == null) {
   1933       return null;
   1934     }
   1935     if (collection == null) {
   1936       collection = new ArrayList<String>();
   1937     }
   1938     if (delimiter == null || delimiter.length() == 0) {
   1939       collection.add(in);
   1940       return collection;
   1941     }
   1942 
   1943     int fromIndex = 0;
   1944     int pos;
   1945     while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) {
   1946       String interim = in.substring(fromIndex, pos);
   1947       if (doStrip) {
   1948         interim = strip(interim);
   1949       }
   1950       if (!doStrip || interim.length() > 0) {
   1951         collection.add(interim);
   1952       }
   1953 
   1954       fromIndex = pos + delimiter.length();
   1955     }
   1956 
   1957     String interim = in.substring(fromIndex);
   1958     if (doStrip) {
   1959       interim = strip(interim);
   1960     }
   1961     if (!doStrip || interim.length() > 0) {
   1962       collection.add(interim);
   1963     }
   1964 
   1965     return collection;
   1966   }
   1967 
   1968   /**
   1969    * This converts a string to a Map. It will first split the string into
   1970    * entries using delimEntry. Then each entry is split into a key and a value
   1971    * using delimKey. By default we strip the keys. Use doStripEntry to strip
   1972    * also the entries.
   1973    *
   1974    * Note that this method returns a {@link HashMap}, which means that entries
   1975    * will be in no particular order. See {@link #stringToOrderedMap}.
   1976    *
   1977    * @param in the string to be processed
   1978    * @param delimEntry delimiter for the entries
   1979    * @param delimKey delimiter between keys and values
   1980    * @param doStripEntry strip entries before inserting in the map
   1981    *
   1982    * @return HashMap
   1983    */
   1984   public static HashMap<String, String> string2Map(
   1985       String in, String delimEntry, String delimKey,
   1986       boolean doStripEntry) {
   1987     if (in == null) {
   1988       return null;
   1989     }
   1990 
   1991     return stringToMapImpl(new HashMap<String, String>(), in, delimEntry,
   1992         delimKey, doStripEntry);
   1993   }
   1994 
   1995   /**
   1996    * This converts a string to a Map, with entries in the same order as the
   1997    * key/value pairs in the input string. It will first split the string into
   1998    * entries using delimEntry. Then each entry is split into a key and a value
   1999    * using delimKey. By default we strip the keys. Use doStripEntry to strip
   2000    * also the entries.
   2001    *
   2002    * @param in the string to be processed
   2003    * @param delimEntry delimiter for the entries
   2004    * @param delimKey delimiter between keys and values
   2005    * @param doStripEntry strip entries before inserting in the map
   2006    *
   2007    * @return key/value pairs as a Map, in order
   2008    */
   2009   public static Map<String, String> stringToOrderedMap(
   2010       String in, String delimEntry, String delimKey,
   2011       boolean doStripEntry) {
   2012     if (in == null) {
   2013       return null;
   2014     }
   2015 
   2016     return stringToMapImpl(new LinkedHashMap<String, String>(), in, delimEntry,
   2017         delimKey, doStripEntry);
   2018   }
   2019 
   2020   /**
   2021    * This adds key/value pairs from the given string to the given Map.
   2022    * It will first split the string into entries using delimEntry. Then each
   2023    * entry is split into a key and a value using delimKey. By default we
   2024    * strip the keys. Use doStripEntry to strip also the entries.
   2025    *
   2026    * @param out - Map to output into
   2027    * @param in - the string to be processed
   2028    * @param delimEntry - delimiter for the entries
   2029    * @param delimKey - delimiter between keys and values
   2030    * @param doStripEntry - strip entries before inserting in the map
   2031    * @return out, for caller's convenience
   2032    */
   2033   private static <T extends Map<String, String>> T stringToMapImpl(T out,
   2034       String in, String delimEntry, String delimKey, boolean doStripEntry) {
   2035 
   2036     if (isEmpty(delimEntry) || isEmpty(delimKey)) {
   2037       out.put(strip(in), "");
   2038       return out;
   2039     }
   2040 
   2041     Iterator<String> it = string2List(in, delimEntry, false).iterator();
   2042     int len = delimKey.length();
   2043     while (it.hasNext()) {
   2044       String entry = it.next();
   2045       int pos = entry.indexOf(delimKey);
   2046       if (pos > 0) {
   2047         String value = entry.substring(pos + len);
   2048         if (doStripEntry) {
   2049           value = strip(value);
   2050         }
   2051         out.put(strip(entry.substring(0, pos)), value);
   2052       } else {
   2053         out.put(strip(entry), "");
   2054       }
   2055     }
   2056 
   2057     return out;
   2058   }
   2059 
   2060   /**
   2061    * This function concatenates the elements of a Map in a string with form
   2062    *  "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>"
   2063    *
   2064    * @param in - the map to be converted
   2065    * @param sepKey - the separator to put between key and value
   2066    * @param sepEntry - the separator to put between map entries
   2067    * @return String
   2068    * @deprecated create a {@link MapJoiner}, for example {@code
   2069    *     Joiner.on(sepEntry).withKeyValueSeparator(sepKey)}. Ensure that your
   2070    *     map is non-null and use this map joiner's {@link MapJoiner#join(Map)}
   2071    *     method. To preserve behavior exactly, just in-line this method call.
   2072    */
   2073   @Deprecated public static <K, V> String map2String(
   2074       Map<K, V> in, String sepKey, String sepEntry) {
   2075     return (in == null) ? null : Joiner
   2076         .on(sepEntry)
   2077         .useForNull("null")
   2078         .withKeyValueSeparator(sepKey)
   2079         .join(in);
   2080   }
   2081 
   2082   /**
   2083    * Given a map, creates and returns a new map in which all keys are the
   2084    * lower-cased version of each key.
   2085    *
   2086    * @param map A map containing String keys to be lowercased
   2087    * @throws IllegalArgumentException if the map contains duplicate string keys
   2088    *           after lower casing
   2089    */
   2090   public static <V> Map<String, V> lowercaseKeys(Map<String, V> map) {
   2091     Map<String, V> result = new HashMap<String, V>(map.size());
   2092     for (Map.Entry<String, V> entry : map.entrySet()) {
   2093       String key = entry.getKey();
   2094       if (result.containsKey(key.toLowerCase())) {
   2095         throw new IllegalArgumentException(
   2096             "Duplicate string key in map when lower casing");
   2097       }
   2098       result.put(key.toLowerCase(), entry.getValue());
   2099     }
   2100     return result;
   2101   }
   2102 
   2103   /**
   2104    * Replaces any string of adjacent whitespace characters with the whitespace
   2105    * character " ".
   2106    *
   2107    * @param str the string you want to munge
   2108    * @return String with no more excessive whitespace!
   2109    * @deprecated ensure the string is not null and use {@code
   2110    *     CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ')}; also consider
   2111    *     whether you really want the legacy whitespace definition, or something
   2112    *     more standard like {@link CharMatcher#WHITESPACE}.
   2113    */
   2114   @Deprecated public static String collapseWhitespace(String str) {
   2115     return (str == null) ? null
   2116         : CharMatcher.LEGACY_WHITESPACE.collapseFrom(str, ' ');
   2117   }
   2118 
   2119   /**
   2120    * Replaces any string of matched characters with the supplied string.<p>
   2121    *
   2122    * This is a more general version of collapseWhitespace.
   2123    *
   2124    * <pre>
   2125    *   E.g. collapse("hello     world", " ", "::")
   2126    *   will return the following string: "hello::world"
   2127    * </pre>
   2128    *
   2129    * @param str the string you want to munge
   2130    * @param chars all of the characters to be considered for munge
   2131    * @param replacement the replacement string
   2132    * @return munged and replaced string.
   2133    * @deprecated if {@code replacement} is the empty string, use {@link
   2134    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
   2135    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
   2136    *     replacement strings use {@link String#replaceAll(String, String)} with
   2137    *     a regular expression that matches one or more occurrences of {@code
   2138    *     chars}. In all cases you must first ensure that {@code str} is not
   2139    *     null.
   2140    */
   2141   @Deprecated public static String collapse(
   2142       String str, String chars, String replacement) {
   2143     if (str == null) {
   2144       return null;
   2145     }
   2146 
   2147     StringBuilder newStr = new StringBuilder();
   2148 
   2149     boolean prevCharMatched = false;
   2150     char c;
   2151     for (int i = 0; i < str.length(); i++) {
   2152       c = str.charAt(i);
   2153       if (chars.indexOf(c) != -1) {
   2154         // this character is matched
   2155         if (prevCharMatched) {
   2156           // apparently a string of matched chars, so don't append anything
   2157           // to the string
   2158           continue;
   2159         }
   2160         prevCharMatched = true;
   2161         newStr.append(replacement);
   2162       } else {
   2163         prevCharMatched = false;
   2164         newStr.append(c);
   2165       }
   2166     }
   2167 
   2168     return newStr.toString();
   2169   }
   2170 
   2171   /**
   2172    * Returns a string with all sequences of ISO control chars (0x00 to 0x1F and
   2173    * 0x7F to 0x9F) replaced by the supplied string.  ISO control characters are
   2174    * identified via {@link Character#isISOControl(char)}.
   2175    *
   2176    * @param str the string you want to strip of ISO control chars
   2177    * @param replacement the replacement string
   2178    * @return a String with all control characters replaced by the replacement
   2179    * string, or null if input is null.
   2180    * @deprecated use {@link CharMatcher#JAVA_ISO_CONTROL}. If {@code
   2181    *     replacement} is the empty string, use {@link
   2182    *     CharMatcher#removeFrom(CharSequence)}; if it is a single character,
   2183    *     use {@link CharMatcher#collapseFrom(CharSequence, char)}; for longer
   2184    *     replacement strings use
   2185    *     {@code str.replaceAll("\p{Cntrl}+", replacement)}.
   2186    *     In all cases you must first ensure that {@code str} is not null.
   2187    */
   2188   @Deprecated public static String collapseControlChars(
   2189       String str, String replacement) {
   2190     /*
   2191      * We re-implement the StringUtil.collapse() loop here rather than call
   2192      * collapse() with an input String of control chars, because matching via
   2193      * isISOControl() is about 10x faster.
   2194      */
   2195     if (str == null) {
   2196       return null;
   2197     }
   2198 
   2199     StringBuilder newStr = new StringBuilder();
   2200 
   2201     boolean prevCharMatched = false;
   2202     char c;
   2203     for (int i = 0; i < str.length(); i++) {
   2204       c = str.charAt(i);
   2205       if (Character.isISOControl(c)) {
   2206         // this character is matched
   2207         if (prevCharMatched) {
   2208           // apparently a string of matched chars, so don't append anything
   2209           // to the string
   2210           continue;
   2211         }
   2212         prevCharMatched = true;
   2213         newStr.append(replacement);
   2214       } else {
   2215         prevCharMatched = false;
   2216         newStr.append(c);
   2217       }
   2218     }
   2219 
   2220     return newStr.toString();
   2221   }
   2222 
   2223   /**
   2224    * Read a String of up to maxLength bytes from an InputStream.
   2225    *
   2226    * <p>Note that this method uses the default platform encoding, and expects
   2227    * that encoding to be single-byte, which is not always the case. Its use
   2228    * is discouraged. For reading the entire stream (maxLength == -1) you can use:
   2229    * <pre>
   2230    *   CharStreams.toString(new InputStreamReader(is, Charsets.ISO_8859_1))
   2231    * </pre>
   2232    * {@code CharStreams} is in the {@code com.google.common.io} package.
   2233    *
   2234    * <p>For maxLength >= 0 a literal translation would be
   2235    * <pre>
   2236    *   CharStreams.toString(new InputStreamReader(
   2237    *       new LimitInputStream(is, maxLength), Charsets.ISO_8859_1))
   2238    * </pre>
   2239    * For multi-byte encodings that is broken because the limit could end in
   2240    * the middle of the character--it would be better to limit the reader than
   2241    * the underlying stream.
   2242    *
   2243    * @param is input stream
   2244    * @param maxLength max number of bytes to read from "is". If this is -1, we
   2245    *          read everything.
   2246    *
   2247    * @return String up to maxLength bytes, read from "is"
   2248    * @deprecated see the advice above
   2249    */
   2250   @Deprecated public static String stream2String(InputStream is, int maxLength)
   2251       throws IOException {
   2252     byte[] buffer = new byte[4096];
   2253     StringWriter sw = new StringWriter();
   2254     int totalRead = 0;
   2255     int read = 0;
   2256 
   2257     do {
   2258       sw.write(new String(buffer, 0, read));
   2259       totalRead += read;
   2260       read = is.read(buffer, 0, buffer.length);
   2261     } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1));
   2262 
   2263     return sw.toString();
   2264   }
   2265 
   2266   /**
   2267    * Parse a list of substrings separated by a given delimiter. The delimiter
   2268    * can also appear in substrings (just double them):
   2269    *
   2270    * parseDelimitedString("this|is", '|') returns ["this","is"]
   2271    * parseDelimitedString("this||is", '|') returns ["this|is"]
   2272    *
   2273    * @param list String containing delimited substrings
   2274    * @param delimiter Delimiter (anything except ' ' is allowed)
   2275    *
   2276    * @return String[] A String array of parsed substrings
   2277    */
   2278   public static String[] parseDelimitedList(String list,
   2279                                             char delimiter) {
   2280     String delim = "" + delimiter;
   2281     // Append a sentinel of delimiter + space
   2282     // (see comments below for more info)
   2283     StringTokenizer st = new StringTokenizer(list + delim + " ",
   2284                                              delim,
   2285                                              true);
   2286     ArrayList<String> v = new ArrayList<String>();
   2287     String lastToken = "";
   2288     StringBuilder word = new StringBuilder();
   2289 
   2290     // We keep a sliding window of 2 tokens
   2291     //
   2292     // delimiter : delimiter -> append delimiter to current word
   2293     //                          and clear most recent token
   2294     //                          (so delim : delim : delim will not
   2295     //                          be treated as two escaped delims.)
   2296     //
   2297     // tok : delimiter -> append tok to current word
   2298     //
   2299     // delimiter : tok -> add current word to list, and clear it.
   2300     //                    (We append a sentinel that conforms to this
   2301     //                    pattern to make sure we've pushed every parsed token)
   2302     while (st.hasMoreTokens()) {
   2303       String tok = st.nextToken();
   2304       if (lastToken != null) {
   2305         if (tok.equals(delim)) {
   2306           word.append(lastToken);
   2307           if (lastToken.equals(delim)) { tok = null; }
   2308         } else {
   2309           if (word.length() != 0) {
   2310             v.add(word.toString());
   2311           }
   2312           word.setLength(0);
   2313         }
   2314       }
   2315       lastToken = tok;
   2316     }
   2317 
   2318     return v.toArray(new String[0]);
   2319   }
   2320 
   2321   /**
   2322    * Compares two strings, guarding against nulls.
   2323    *
   2324    * @param nullsAreGreater true if nulls should be greater than any string,
   2325    *  false is less than.
   2326    * @deprecated use {@link String#CASE_INSENSITIVE_ORDER}, together with
   2327    *     {@link com.google.common.collect.Ordering#nullsFirst()} or
   2328    *     {@link com.google.common.collect.Ordering#nullsLast()} if
   2329    *     needed
   2330    */
   2331   @Deprecated public static int compareToIgnoreCase(String s1, String s2,
   2332       boolean nullsAreGreater) {
   2333     if (s1 == s2) {
   2334       return 0; // Either both the same String, or both null
   2335     }
   2336     if (s1 == null) {
   2337       return nullsAreGreater ? 1 : -1;
   2338     }
   2339     if (s2 == null) {
   2340       return nullsAreGreater ? -1 : 1;
   2341     }
   2342     return s1.compareToIgnoreCase(s2);
   2343   }
   2344 
   2345   /**
   2346    * Splits s with delimiters in delimiter and returns the last token
   2347    */
   2348   public static String lastToken(String s, String delimiter) {
   2349     return s.substring(CharMatcher.anyOf(delimiter).lastIndexIn(s) + 1);
   2350   }
   2351 
   2352   private static final Pattern characterReferencePattern =
   2353       Pattern.compile("&#?[a-zA-Z0-9]{1,8};");
   2354 
   2355   /**
   2356    * Determines if a string contains what looks like an html character
   2357    * reference. Useful for deciding whether unescaping is necessary.
   2358    */
   2359   public static boolean containsCharRef(String s) {
   2360     return characterReferencePattern.matcher(s).find();
   2361   }
   2362 
   2363   /**
   2364    * Determines if a string is a Hebrew word. A string is considered to be
   2365    * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters.
   2366    */
   2367   public static boolean isHebrew(String s) {
   2368     int len = s.length();
   2369     for (int i = 0; i < len; ++i) {
   2370       if (isHebrew(s.codePointAt(i))) {
   2371         return true;
   2372       }
   2373     }
   2374     return false;
   2375   }
   2376 
   2377   /**
   2378    * Determines if a character is a Hebrew character.
   2379    */
   2380   public static boolean isHebrew(int codePoint) {
   2381     return Character.UnicodeBlock.HEBREW.equals(
   2382                Character.UnicodeBlock.of(codePoint));
   2383   }
   2384 
   2385   /**
   2386    * Determines if a string is a CJK word. A string is considered to be CJK
   2387    * if {@link #isCjk(char)} is true for any of its characters.
   2388    */
   2389   public static boolean isCjk(String s) {
   2390     int len = s.length();
   2391     for (int i = 0; i < len; ++i) {
   2392       if (isCjk(s.codePointAt(i))) {
   2393         return true;
   2394       }
   2395     }
   2396     return false;
   2397   }
   2398 
   /**
    * Unicode code blocks containing CJK characters.
    */
   private static final Set<Character.UnicodeBlock> CJK_BLOCKS;
   static {
     Set<Character.UnicodeBlock> blocks = new HashSet<Character.UnicodeBlock>();
     Collections.addAll(blocks,
         Character.UnicodeBlock.HANGUL_JAMO,
         Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT,
         Character.UnicodeBlock.KANGXI_RADICALS,
         Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION,
         Character.UnicodeBlock.HIRAGANA,
         Character.UnicodeBlock.KATAKANA,
         Character.UnicodeBlock.BOPOMOFO,
         Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO,
         Character.UnicodeBlock.KANBUN,
         Character.UnicodeBlock.BOPOMOFO_EXTENDED,
         Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS,
         Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS,
         Character.UnicodeBlock.CJK_COMPATIBILITY,
         Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
         Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
         Character.UnicodeBlock.HANGUL_SYLLABLES,
         Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
         Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
         Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
         Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
         Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
     CJK_BLOCKS = Collections.unmodifiableSet(blocks);
   }

   /**
    * Determines if a character is a CJK ideograph or a character typically
    * used only in CJK text.
    *
    * Note: This function cannot handle supplementary characters. To handle all
    * Unicode characters, including supplementary characters, use the function
    * {@link #isCjk(int)}.
    *
    * @param ch the UTF-16 code unit to test
    * @return the result of {@link #isCjk(int)} for the value of {@code ch}
    */
   public static boolean isCjk(char ch) {
     int codePoint = ch;
     return isCjk(codePoint);
   }

   /**
    * Determines if a character is a CJK ideograph or a character typically
    * used only in CJK text.
    *
    * @param codePoint the Unicode code point to test
    * @return true if the code point's Unicode block is one of the blocks
    *     listed in {@code CJK_BLOCKS}
    */
   public static boolean isCjk(int codePoint) {
     // Time-saving early exit: Latin-1 (0x00 to 0xFF) never needs the
     // block lookup.
     boolean isLatin1 = (codePoint >= 0 && codePoint <= 0xFF);
     if (isLatin1) {
       return false;
     }

     Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
     return CJK_BLOCKS.contains(block);
   }
   2453 
   2454   /**
   2455    * Returns the approximate display width of the string, measured in units of
   2456    * ascii characters.
   2457    *
   2458    * @see StringUtil#displayWidth(char)
   2459    */
   2460   public static int displayWidth(String s) {
   2461     // TODO(kevinb): could reimplement this as
   2462     // return s.length() * 2 - CharMatcher.SINGLE_WIDTH.countIn(s);
   2463     int width = 0;
   2464     int len = s.length();
   2465     for (int i = 0; i < len; ++i) {
   2466       width += displayWidth(s.charAt(i));
   2467     }
   2468     return width;
   2469   }
   2470 
   2471   /**
   2472    * Returns the approximate display width of the character, measured
   2473    * in units of ascii characters.
   2474    *
   2475    * This method should err on the side of caution. By default, characters
   2476    * are assumed to have width 2; this covers CJK ideographs, various
   2477    * symbols and miscellaneous weird scripts. Given below are some Unicode
   2478    * ranges for which it seems safe to assume that no character is
   2479    * substantially wider than an ascii character:
   2480    *   - Latin, extended Latin, even more extended Latin.
   2481    *   - Greek, extended Greek, Cyrillic.
   2482    *   - Some symbols (including currency symbols) and punctuation.
   2483    *   - Half-width Katakana and Hangul.
   2484    *   - Hebrew
   2485    *   - Arabic
   2486    *   - Thai
   2487    * Characters in these ranges are given a width of 1.
   2488    *
   2489    * IMPORTANT: this function has analogs in C++ (encodingutils.cc,
   2490    * named UnicodeCharWidth) and JavaScript
   2491    * (java/com/google/ads/common/frontend/adwordsbase/resources/CreateAdUtil.js),
   2492    * which need to be updated if you change the implementation here.
   2493    */
   2494   public static int displayWidth(char ch) {
   2495     if (ch <= '\u04f9' ||   // CYRILLIC SMALL LETTER YERU WITH DIAERESIS
   2496         ch == '\u05be' ||   // HEBREW PUNCTUATION MAQAF
   2497         (ch >= '\u05d0' && ch <= '\u05ea') ||  // HEBREW LETTER ALEF ... TAV
   2498         ch == '\u05F3' ||   // HEBREW PUNCTUATION GERESH
   2499         ch == '\u05f4' ||   // HEBREW PUNCTUATION GERSHAYIM
   2500         (ch >= '\u0600' && ch <= '\u06ff') || // Block=Arabic
   2501         (ch >= '\u0750' && ch <= '\u077f') || // Block=Arabic_Supplement
   2502         (ch >= '\ufb50' && ch <= '\ufdff') || // Block=Arabic_Presentation_Forms-A
   2503         (ch >= '\ufe70' && ch <= '\ufeff') || // Block=Arabic_Presentation_Forms-B
   2504         (ch >= '\u1e00' && ch <= '\u20af') || /* LATIN CAPITAL LETTER A WITH RING BELOW
   2505                                                  ... DRACHMA SIGN */
   2506         (ch >= '\u2100' && ch <= '\u213a') || // ACCOUNT OF ... ROTATED CAPITAL Q
   2507         (ch >= '\u0e00' && ch <= '\u0e7f') || // Thai
   2508         (ch >= '\uff61' && ch <= '\uffdc')) { /* HALFWIDTH IDEOGRAPHIC FULL STOP
   2509                                                  ... HALFWIDTH HANGUL LETTER I */
   2510       return 1;
   2511     }
   2512     return 2;
   2513   }
   2514 
   2515   /**
   2516    * @return a string representation of the given native array.
   2517    */
   2518   public static String toString(float[] iArray) {
   2519     if (iArray == null) {
   2520       return "NULL";
   2521     }
   2522 
   2523     StringBuilder buffer = new StringBuilder();
   2524     buffer.append("[");
   2525     for (int i = 0; i < iArray.length; i++) {
   2526       buffer.append(iArray[i]);
   2527       if (i != (iArray.length - 1)) {
   2528         buffer.append(", ");
   2529       }
   2530     }
   2531     buffer.append("]");
   2532     return buffer.toString();
   2533   }
   2534 
   2535   /**
   2536    * @return a string representation of the given native array.
   2537    */
   2538   public static String toString(long[] iArray) {
   2539     if (iArray == null) {
   2540       return "NULL";
   2541     }
   2542 
   2543     StringBuilder buffer = new StringBuilder();
   2544     buffer.append("[");
   2545     for (int i = 0; i < iArray.length; i++) {
   2546       buffer.append(iArray[i]);
   2547       if (i != (iArray.length - 1)) {
   2548         buffer.append(", ");
   2549       }
   2550     }
   2551     buffer.append("]");
   2552     return buffer.toString();
   2553   }
   2554 
   2555   /**
   2556    * @return a string representation of the given native array
   2557    */
   2558   public static String toString(int[] iArray) {
   2559     if (iArray == null) {
   2560       return "NULL";
   2561     }
   2562 
   2563     StringBuilder buffer = new StringBuilder();
   2564     buffer.append("[");
   2565     for (int i = 0; i < iArray.length; i++) {
   2566       buffer.append(iArray[i]);
   2567       if (i != (iArray.length - 1)) {
   2568         buffer.append(", ");
   2569       }
   2570     }
   2571     buffer.append("]");
   2572     return buffer.toString();
   2573   }
   2574 
   2575   /**
   2576    * @return a string representation of the given array.
   2577    */
   2578   public static String toString(String[] iArray) {
   2579     if (iArray == null) { return "NULL"; }
   2580 
   2581     StringBuilder buffer = new StringBuilder();
   2582     buffer.append("[");
   2583     for (int i = 0; i < iArray.length; i++) {
   2584       buffer.append("'").append(iArray[i]).append("'");
   2585       if (i != iArray.length - 1) {
   2586         buffer.append(", ");
   2587       }
   2588     }
   2589     buffer.append("]");
   2590 
   2591     return buffer.toString();
   2592   }
   2593 
   2594   /**
   2595    * Returns the string, in single quotes, or "NULL". Intended only for
   2596    * logging.
   2597    *
   2598    * @param s the string
   2599    * @return the string, in single quotes, or the string "null" if it's null.
   2600    */
   2601   public static String toString(String s) {
   2602     if (s == null) {
   2603       return "NULL";
   2604     } else {
   2605       return new StringBuilder(s.length() + 2).append("'").append(s)
   2606                                               .append("'").toString();
   2607     }
   2608   }
   2609 
   2610   /**
   2611    * @return a string representation of the given native array
   2612    */
   2613   public static String toString(int[][] iArray) {
   2614     if (iArray == null) {
   2615       return "NULL";
   2616     }
   2617 
   2618     StringBuilder buffer = new StringBuilder();
   2619     buffer.append("[");
   2620     for (int i = 0; i < iArray.length; i++) {
   2621       buffer.append("[");
   2622       for (int j = 0; j < iArray[i].length; j++) {
   2623         buffer.append(iArray[i][j]);
   2624         if (j != (iArray[i].length - 1)) {
   2625           buffer.append(", ");
   2626         }
   2627       }
   2628       buffer.append("]");
   2629       if (i != iArray.length - 1) {
   2630         buffer.append(" ");
   2631       }
   2632     }
   2633     buffer.append("]");
   2634     return buffer.toString();
   2635   }
   2636 
   2637   /**
   2638    * @return a string representation of the given native array.
   2639    */
   2640   public static String toString(long[][] iArray) {
   2641     if (iArray == null) { return "NULL"; }
   2642 
   2643     StringBuilder buffer = new StringBuilder();
   2644     buffer.append("[");
   2645     for (int i = 0; i < iArray.length; i++) {
   2646       buffer.append("[");
   2647       for (int j = 0; j < iArray[i].length; j++) {
   2648         buffer.append(iArray[i][j]);
   2649         if (j != (iArray[i].length - 1)) {
   2650           buffer.append(", ");
   2651         }
   2652       }
   2653       buffer.append("]");
   2654       if (i != iArray.length - 1) {
   2655         buffer.append(" ");
   2656       }
   2657     }
   2658     buffer.append("]");
   2659     return buffer.toString();
   2660   }
   2661 
   2662   /**
   2663    * @return a String representation of the given object array.
   2664    * The strings are obtained by calling toString() on the
   2665    * underlying objects.
   2666    */
   2667   public static String toString(Object[] obj) {
   2668     if (obj == null) { return "NULL"; }
   2669     StringBuilder tmp = new StringBuilder();
   2670     tmp.append("[");
   2671     for (int i = 0; i < obj.length; i++) {
   2672       tmp.append(obj[i].toString());
   2673       if (i != obj.length - 1) {
   2674         tmp.append(",");
   2675       }
   2676     }
   2677     tmp.append("]");
   2678     return tmp.toString();
   2679   }
   2680 
   2681   private static final char[] HEX_CHARS
   2682       = { '0', '1', '2', '3', '4', '5', '6', '7',
   2683           '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
   2684   private static final char[] OCTAL_CHARS = HEX_CHARS;  // ignore the last 8 :)
   2685 
   2686   /**
   2687    * Convert a byte array to a hex-encoding string: "a33bff00..."
   2688    *
   2689    * @deprecated Use {@link ByteArrays#toHexString}.
   2690    */
   2691   @Deprecated public static String bytesToHexString(final byte[] bytes) {
   2692     return ByteArrays.toHexString(bytes);
   2693   }
   2694 
   2695   /**
   2696    * Convert a byte array to a hex-encoding string with the specified
   2697    * delimiter: "a3&lt;delimiter&gt;3b&lt;delimiter&gt;ff..."
   2698    */
   2699   public static String bytesToHexString(final byte[] bytes,
   2700       Character delimiter) {
   2701     StringBuilder hex =
   2702       new StringBuilder(bytes.length * (delimiter == null ? 2 : 3));
   2703     int nibble1, nibble2;
   2704     for (int i = 0; i < bytes.length; i++) {
   2705       nibble1 = (bytes[i] >>> 4) & 0xf;
   2706       nibble2 = bytes[i] & 0xf;
   2707       if (i > 0 && delimiter != null) { hex.append(delimiter.charValue()); }
   2708       hex.append(HEX_CHARS[nibble1]);
   2709       hex.append(HEX_CHARS[nibble2]);
   2710     }
   2711     return hex.toString();
   2712   }
   2713 
   2714   /**
   2715    * Safely convert the string to uppercase.
   2716    * @return upper case representation of the String; or null if
   2717    * the input string is null.
   2718    */
   2719   public static String toUpperCase(String src) {
   2720     if (src == null) {
   2721       return null;
   2722     } else {
   2723       return src.toUpperCase();
   2724     }
   2725   }
   2726 
   2727   /**
   2728    * Safely convert the string to lowercase.
   2729    * @return lower case representation of the String; or null if
   2730    * the input string is null.
   2731    */
   2732   public static String toLowerCase(String src) {
   2733     if (src == null) {
   2734       return null;
   2735     } else {
   2736       return src.toLowerCase();
   2737     }
   2738   }
   2739 
   2740   private static final Pattern dbSpecPattern =
   2741       Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)");
   2742 
   2743   /**
   2744    * @param dbSpecComponent a single component of a DBDescriptor spec
   2745    * (e.g. the host or database component). The expected format of the string is:
   2746    * <br>
   2747    *             <center>(prefix){(digits),(digits)}(suffix)</center>
   2748    * </br>
   2749    * @return a shard expansion of the given String.
   2750    * Note that unless the pattern is matched exactly, no expansion is
   2751    * performed and the original string is returned unaltered.
   2752    * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'.
   2753    * Note that this method is added to StringUtil instead of
   2754    * DBDescriptor to better encapsulate the choice of regexp implementation.
   2755    * @throws IllegalArgumentException if the string does not parse.
   2756    */
   2757   public static String expandShardNames(String dbSpecComponent)
   2758       throws IllegalArgumentException, IllegalStateException {
   2759 
   2760     Matcher matcher = dbSpecPattern.matcher(dbSpecComponent);
   2761     if (matcher.find()) {
   2762       try {
   2763         String prefix = dbSpecComponent.substring(
   2764           matcher.start(1), matcher.end(1));
   2765         int minShard =
   2766           Integer.parseInt(
   2767             dbSpecComponent.substring(
   2768               matcher.start(2), matcher.end(2)));
   2769         int maxShard =
   2770           Integer.parseInt(
   2771             dbSpecComponent.substring(
   2772               matcher.start(3), matcher.end(3)));
   2773         String suffix = dbSpecComponent.substring(
   2774           matcher.start(4), matcher.end(4));
   2775         //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix);
   2776         if (minShard > maxShard) {
   2777           throw new IllegalArgumentException(
   2778             "Maximum shard must be greater than or equal to " +
   2779             "the minimum shard");
   2780         }
   2781         StringBuilder tmp = new StringBuilder();
   2782         for (int shard = minShard; shard <= maxShard; shard++) {
   2783           tmp.append(prefix).append(shard).append(suffix);
   2784           if (shard != maxShard) {
   2785             tmp.append(",");
   2786           }
   2787         }
   2788         return tmp.toString();
   2789       } catch (NumberFormatException nfex) {
   2790         throw new IllegalArgumentException(
   2791           "Malformed DB specification component: " + dbSpecComponent);
   2792       }
   2793     } else {
   2794       return dbSpecComponent;
   2795     }
   2796   }
   2797 
   2798 
   2799   /**
   2800   * Returns a string that is equivalent to the specified string with its
   2801   * first character converted to uppercase as by {@link String#toUpperCase()}.
   2802   * The returned string will have the same value as the specified string if
   2803   * its first character is non-alphabetic, if its first character is already
   2804   * uppercase, or if the specified string is of length 0.
   2805   *
   2806   * <p>For example:
   2807   * <pre>
   2808   *    capitalize("foo bar").equals("Foo bar");
   2809   *    capitalize("2b or not 2b").equals("2b or not 2b")
   2810   *    capitalize("Foo bar").equals("Foo bar");
   2811   *    capitalize("").equals("");
   2812   * </pre>
   2813   *
   2814   * @param s the string whose first character is to be uppercased
   2815   * @return a string equivalent to <tt>s</tt> with its first character
   2816   *     converted to uppercase
   2817   * @throws NullPointerException if <tt>s</tt> is null
   2818   */
   2819   public static String capitalize(String s) {
   2820     if (s.length() == 0) {
   2821       return s;
   2822     }
   2823     char first = s.charAt(0);
   2824     char capitalized = Character.toUpperCase(first);
   2825     return (first == capitalized)
   2826         ? s
   2827         : capitalized + s.substring(1);
   2828   }
   2829 
   2830   /**
   2831    * Examine a string to see if it starts with a given prefix (case
   2832    * insensitive). Just like String.startsWith() except doesn't
   2833    * respect case. Strings are compared in the same way as in
   2834    * {@link String#equalsIgnoreCase}.
   2835    *
   2836    * @param str the string to examine
   2837    * @param prefix the prefix to look for
   2838    * @return a boolean indicating if str starts with prefix (case insensitive)
   2839    */
   2840   public static boolean startsWithIgnoreCase(String str, String prefix) {
   2841     return str.regionMatches(true, 0, prefix, 0, prefix.length());
   2842   }
   2843 
   2844   /**
   2845    * Examine a string to see if it ends with a given suffix (case
   2846    * insensitive). Just like String.endsWith() except doesn't respect
   2847    * case. Strings are compared in the same way as in
   2848    * {@link String#equalsIgnoreCase}.
   2849    *
   2850    * @param str the string to examine
   2851    * @param suffix the suffix to look for
   2852    * @return a boolean indicating if str ends with suffix (case insensitive)
   2853    */
   2854   public static boolean endsWithIgnoreCase(String str, String suffix) {
   2855     int len = suffix.length();
   2856     return str.regionMatches(true, str.length() - len, suffix, 0, len);
   2857   }
   2858 
   2859   /**
   2860    * @param c one codePoint
   2861    * @return the number of bytes needed to encode this codePoint in UTF-8
   2862    */
   2863   private static int bytesUtf8(int c) {
   2864     if (c < 0x80) {
   2865       return 1;
   2866     } else if (c < 0x00800) {
   2867       return 2;
   2868     } else if (c < 0x10000) {
   2869       return 3;
   2870     } else if (c < 0x200000) {
   2871       return 4;
   2872 
   2873     // RFC 3629 forbids the use of UTF-8 for codePoint greater than 0x10FFFF,
   2874     // so if the caller respects this RFC, this should not happen
   2875     } else if (c < 0x4000000) {
   2876       return 5;
   2877     } else {
   2878       return 6;
   2879     }
   2880   }
   2881 
   2882   /**
   2883    * @param str a string
   2884    * @return the number of bytes required to represent this string in UTF-8
   2885    */
   2886   public static int bytesStorage(String str) {
   2887     // offsetByCodePoint has a bug if its argument is the result of a
   2888     // call to substring. To avoid this, we create a new String
   2889     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
   2890     String s = new String(str);
   2891 
   2892     int len = 0;
   2893     for (int i = 0; i < s.length(); i = s.offsetByCodePoints(i, 1)) {
   2894       len += bytesUtf8(s.codePointAt(i));
   2895     }
   2896     return len;
   2897   }
   2898 
   2899   /**
   2900    * @param str a string
   2901    * @param maxbytes
   2902    * @return the beginning of the string, so that it uses less than
   2903    *     maxbytes bytes in UTF-8
   2904    * @throws IndexOutOfBoundsException if maxbytes is negative
   2905    */
   2906   public static String truncateStringForUtf8Storage(String str, int maxbytes) {
   2907     if (maxbytes < 0) {
   2908       throw new IndexOutOfBoundsException();
   2909     }
   2910 
   2911     // offsetByCodePoint has a bug if its argument is the result of a
   2912     // call to substring. To avoid this, we create a new String
   2913     // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6242664
   2914     // TODO(cquinn): should be fixed as of 1.5.0_01
   2915     String s = new String(str);
   2916 
   2917     int codepoints = 0;
   2918     int bytesUsed = 0;
   2919     for (codepoints = 0; codepoints < s.length();
   2920         codepoints = s.offsetByCodePoints(codepoints, 1)) {
   2921       int glyphBytes = StringUtil.bytesUtf8(s.codePointAt(codepoints));
   2922       if (bytesUsed + glyphBytes > maxbytes) {
   2923         break;
   2924       }
   2925       bytesUsed += glyphBytes;
   2926     }
   2927     return s.substring(0, codepoints);
   2928   }
   2929 
   2930   /**
   2931    * If the given string is of length {@code maxLength} or less, then it is
   2932    * returned as is.
   2933    * If the string is longer than {@code maxLength}, the returned string is
   2934    * truncated before the last space character on or before
   2935    * {@code source.charAt(maxLength)}. If the string has no spaces, the
   2936    * returned string is truncated to {@code maxLength}.
   2937    *
   2938    * @param source the string to truncate if necessary
   2939    * @param maxLength
   2940    * @return the original string if its length is less than or equal to
   2941    *     maxLength, otherwise a truncated string as mentioned above
   2942    */
   2943   public static String truncateIfNecessary(String source, int maxLength) {
   2944     if (source.length() <= maxLength) {
   2945       return source;
   2946     }
   2947     String str = unicodePreservingSubstring(source, 0, maxLength);
   2948 
   2949     @SuppressWarnings("deprecation") // we'll make this go away before that does
   2950     CharMatcher whitespaceMatcher = CharMatcher.LEGACY_WHITESPACE;
   2951     String truncated = whitespaceMatcher.trimTrailingFrom(str);
   2952 
   2953     // We may have had multiple spaces at maxLength, which were stripped away
   2954     if (truncated.length() < maxLength) {
   2955       return truncated;
   2956     }
   2957     // We have a truncated string of length maxLength. If the next char was a
   2958     // space, we truncated at a word boundary, so we can return immediately
   2959     if (Character.isSpaceChar(source.charAt(maxLength))) {
   2960       return truncated;
   2961     }
   2962     // We truncated in the middle of the word. Try to truncate before
   2963     // the last space, if it exists. Otherwise, return the truncated string
   2964     for (int i = truncated.length() - 1; i >= 0; --i) {
   2965       if (Character.isSpaceChar(truncated.charAt(i))) {
   2966         String substr = truncated.substring(0, i);
   2967         return whitespaceMatcher.trimTrailingFrom(substr);
   2968       }
   2969     }
   2970     return truncated;
   2971   }
   2972 
   2973   /**
   2974    * If this given string is of length {@code maxLength} or less, it will
   2975    * be returned as-is.
   * Otherwise it will be truncated to {@code maxLength}, regardless of whether
   2977    * there are any space characters in the String. If an ellipsis is requested
   2978    * to be appended to the truncated String, the String will be truncated so
   2979    * that the ellipsis will also fit within maxLength.
   2980    * If no truncation was necessary, no ellipsis will be added.
   2981    *
   2982    * @param source the String to truncate if necessary
   2983    * @param maxLength the maximum number of characters to keep
   2984    * @param addEllipsis if true, and if the String had to be truncated,
   2985    *     add "..." to the end of the String before returning. Additionally,
   2986    *     the ellipsis will only be added if maxLength is greater than 3.
   2987    * @return the original string if its length is less than or equal to
   2988    *     maxLength, otherwise a truncated string as mentioned above
   2989    */
   2990   public static String truncateAtMaxLength(String source, int maxLength,
   2991       boolean addEllipsis) {
   2992 
   2993     if (source.length() <= maxLength) {
   2994       return source;
   2995     }
   2996     if (addEllipsis && maxLength > 3) {
   2997       return unicodePreservingSubstring(source, 0, maxLength - 3) + "...";
   2998     }
   2999     return unicodePreservingSubstring(source, 0, maxLength);
   3000   }
   3001 
   3002   /**
   3003    * Normalizes {@code index} such that it respects Unicode character
   3004    * boundaries in {@code str}.
   3005    *
   3006    * <p>If {@code index} is the low surrogate of a unicode character,
   3007    * the method returns {@code index - 1}. Otherwise, {@code index} is
   3008    * returned.
   3009    *
   3010    * <p>In the case in which {@code index} falls in an invalid surrogate pair
   * (e.g. consecutive low surrogates, consecutive high surrogates), or
   * if it is not a valid index into {@code str}, the original value of
   3013    * {@code index} is returned.
   3014    *
   3015    * @param str the String
   3016    * @param index the index to be normalized
   3017    * @return a normalized index that does not split a Unicode character
   3018    */
   3019   public static int unicodePreservingIndex(String str, int index) {
   3020     if (index > 0 && index < str.length()) {
   3021       if (Character.isHighSurrogate(str.charAt(index - 1)) &&
   3022           Character.isLowSurrogate(str.charAt(index))) {
   3023         return index - 1;
   3024       }
   3025     }
   3026     return index;
   3027   }
   3028 
   3029   /**
   3030    * Returns a substring of {@code str} that respects Unicode character
   3031    * boundaries.
   3032    *
   3033    * <p>The string will never be split between a [high, low] surrogate pair,
   3034    * as defined by {@link Character#isHighSurrogate} and
   3035    * {@link Character#isLowSurrogate}.
   3036    *
   3037    * <p>If {@code begin} or {@code end} are the low surrogate of a unicode
   3038    * character, it will be offset by -1.
   3039    *
   3040    * <p>This behavior guarantees that
   3041    * {@code str.equals(StringUtil.unicodePreservingSubstring(str, 0, n) +
   3042    *     StringUtil.unicodePreservingSubstring(str, n, str.length())) } is
   * true for all {@code n}.
   3045    *
   3046    * <p>This means that unlike {@link String#substring(int, int)}, the length of
   3047    * the returned substring may not necessarily be equivalent to
   3048    * {@code end - begin}.
   3049    *
   3050    * @param str the original String
   3051    * @param begin the beginning index, inclusive
   3052    * @param end the ending index, exclusive
   3053    * @return the specified substring, possibly adjusted in order to not
   3054    *   split unicode surrogate pairs
   3055    * @throws IndexOutOfBoundsException if the {@code begin} is negative,
   3056    *   or {@code end} is larger than the length of {@code str}, or
   3057    *   {@code begin} is larger than {@code end}
   3058    */
   3059   public static String unicodePreservingSubstring(
   3060       String str, int begin, int end) {
   3061     return str.substring(unicodePreservingIndex(str, begin),
   3062         unicodePreservingIndex(str, end));
   3063   }
   3064 
   3065   /**
   3066    * Equivalent to:
   3067    *
   3068    * <pre>
   3069    * {@link #unicodePreservingSubstring(String, int, int)}(
   3070    *     str, begin, str.length())
   3071    * </pre>
   3072    */
   3073   public static String unicodePreservingSubstring(String str, int begin) {
   3074     return unicodePreservingSubstring(str, begin, str.length());
   3075   }
   3076 
   3077   /**
   3078    * True iff the given character needs to be escaped in a javascript string
   3079    * literal.
   3080    * <p>
   3081    * We need to escape the following characters in javascript string literals.
   3082    * <dl>
   3083    * <dt> \           <dd> the escape character
   3084    * <dt> ', "        <dd> string delimiters.
   3085    *                       TODO(msamuel): what about backticks (`) which are
   3086    *                       non-standard but recognized as attribute delimiters.
   3087    * <dt> &, <, >, =  <dd> so that a string literal can be embedded in XHTML
   3088    *                       without further escaping.
   3089    * </dl>
   3090    * TODO(msamuel): If we're being paranoid, should we escape + to avoid UTF-7
   3091    * attacks?
   3092    * <p>
   3093    * Unicode format control characters (category Cf) must be escaped since they
   3094    * are removed by javascript parser in a pre-lex pass.
   3095    * <br>According to EcmaScript 262 Section 7.1:
   3096    * <blockquote>
   3097    *     The format control characters can occur anywhere in the source text of
   3098    *     an ECMAScript program. These characters are removed from the source
   3099    *     text before applying the lexical grammar.
   3100    * </blockquote>
   3101    * <p>
   3102    * Additionally, line terminators are not allowed to appear inside strings
   3103    * and Section 7.3 says
   3104    * <blockquote>
   3105    *     The following characters are considered to be line terminators:<pre>
   3106    *         Code Point Value   Name                  Formal Name
   3107    *         \u000A             Line Feed             [LF]
   3108    *         \u000D             Carriage Return       [CR]
   3109    *         \u2028             Line separator        [LS]
   3110    *         \u2029             Paragraph separator   [PS]
   3111    * </pre></blockquote>
   3112    *
   * @param codepoint the Unicode code point to test; it is an {@code int} so
   *    that code points outside the 16-bit {@code char} range can be checked.
   3115    */
   3116   static boolean mustEscapeCharInJsString(int codepoint) {
   3117     return JS_ESCAPE_CHARS.contains(codepoint);
   3118   }
   3119 
   3120   /**
   3121    * True iff the given character needs to be escaped in a JSON string literal.
   3122    * <p>
   3123    * We need to escape the following characters in JSON string literals.
   3124    * <dl>
   3125    * <dt> \           <dd> the escape character
   3126    * <dt> "           <dd> string delimiter
   3127    * <dt> 0x00 - 0x1F <dd> control characters
   3128    * </dl>
   3129    * <p>
   3130    * See EcmaScript 262 Section 15.12.1 for the full JSON grammar.
   3131    */
   3132   static boolean mustEscapeCharInJsonString(int codepoint) {
   3133     return JSON_ESCAPE_CHARS.contains(codepoint);
   3134   }
   3135 
   3136   /**
   3137    * Builds a small set of code points.
   3138    * {@code com.google.common.base} cannot depend on ICU4J, thus avoiding ICU's
   3139    * {@code UnicodeSet}.
   3140    * For all other purposes, please use {@code com.ibm.icu.text.UnicodeSet}.
   3141    */
   3142   private static class UnicodeSetBuilder {
   3143     Set<Integer> codePointSet = new HashSet<Integer>();
   3144 
   3145     UnicodeSetBuilder addCodePoint(int c) {
   3146       codePointSet.add(c);
   3147       return this;
   3148     }
   3149 
   3150     UnicodeSetBuilder addRange(int from, int to) {
   3151       for (int i = from; i <= to; i++) {
   3152         codePointSet.add(i);
   3153       }
   3154       return this;
   3155     }
   3156 
   3157     Set<Integer> create() {
   3158       return codePointSet;
   3159     }
   3160   }
   3161 
   3162   private static final Set<Integer> JS_ESCAPE_CHARS = new UnicodeSetBuilder()
   3163       // All characters in the class of format characters, [:Cf:].
   3164       // Source: http://unicode.org/cldr/utility/list-unicodeset.jsp.
   3165       .addCodePoint(0xAD)
   3166       .addRange(0x600, 0x603)
   3167       .addCodePoint(0x6DD)
   3168       .addCodePoint(0x070F)
   3169       .addRange(0x17B4, 0x17B5)
   3170       .addRange(0x200B, 0x200F)
   3171       .addRange(0x202A, 0x202E)
   3172       .addRange(0x2060, 0x2064)
   3173       .addRange(0x206A, 0x206F)
   3174       .addCodePoint(0xFEFF)
   3175       .addRange(0xFFF9, 0xFFFB)
   3176       .addRange(0x0001D173, 0x0001D17A)
   3177       .addCodePoint(0x000E0001)
   3178       .addRange(0x000E0020, 0x000E007F)
   3179       // Plus characters mentioned in the docs of mustEscapeCharInJsString().
   3180       .addCodePoint(0x0000)
   3181       .addCodePoint(0x000A)
   3182       .addCodePoint(0x000D)
   3183       .addRange(0x2028, 0x2029)
   3184       .addCodePoint(0x0085)
   3185       .addCodePoint(Character.codePointAt("'", 0))
   3186       .addCodePoint(Character.codePointAt("\"", 0))
   3187       .addCodePoint(Character.codePointAt("&", 0))
   3188       .addCodePoint(Character.codePointAt("<", 0))
   3189       .addCodePoint(Character.codePointAt(">", 0))
   3190       .addCodePoint(Character.codePointAt("=", 0))
   3191       .addCodePoint(Character.codePointAt("\\", 0))
   3192       .create();
   3193 
   3194   private static final Set<Integer> JSON_ESCAPE_CHARS = new UnicodeSetBuilder()
   3195       .addCodePoint(Character.codePointAt("\"", 0))
   3196       .addCodePoint(Character.codePointAt("\\", 0))
   3197       .addRange(0x0000, 0x001F)
   3198       .create();
   3199 
   3200   /**
   3201    * <b>To be deprecated:</b> use {@link CharEscapers#xmlEscaper()} instead.
   3202    */
  public static String xmlEscape(String s) {
    // Thin convenience wrapper: all escaping is done by the escaper that
    // CharEscapers.xmlEscaper() returns.
    return CharEscapers.xmlEscaper().escape(s);
  }
   3206 
   3207   /**
   3208    * <b>To be deprecated:</b> use {@link CharEscapers#asciiHtmlEscaper()} instead.
   3209    */
  public static String htmlEscape(String s) {
    // Thin convenience wrapper: all escaping is done by the escaper that
    // CharEscapers.asciiHtmlEscaper() returns.
    return CharEscapers.asciiHtmlEscaper().escape(s);
  }
   3213 }