Home | History | Annotate | Download | only in base
      1 /**
      2  * Copyright (c) 2006, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.google.android.mail.common.base;
     17 
     18 import static com.google.android.mail.common.base.Preconditions.checkNotNull;
     19 
     20 import java.io.IOException;
     21 
     22 /**
     23  * Utility functions for dealing with {@code CharEscaper}s, and some commonly
     24  * used {@code CharEscaper} instances.
     25  *
     26  * @author sven (at) google.com (Sven Mawson)
     27  * @author laurence (at) google.com (Laurence Gonsalves)
     28  */
     29 public final class CharEscapers {
     30   private CharEscapers() {}
     31 
     32   // TODO(matevossian): To implementors of escapers --
     33   //                    For each xxxEscaper method, please add links to external
     34   //                    reference pages that we consider authoritative for what
     35   //                    that escaper should exactly be doing.
     36 
     37   /**
     38    * Performs no escaping.
     39    */
     40   private static final CharEscaper NULL_ESCAPER = new CharEscaper() {
     41       @Override
     42     public String escape(String string) {
     43         checkNotNull(string);
     44         return string;
     45       }
     46 
     47       @Override
     48       public Appendable escape(final Appendable out) {
     49         checkNotNull(out);
     50 
     51         // we can't simply return out because the CharEscaper contract says that
     52         // the returned Appendable will throw a NullPointerException if asked to
     53         // append null.
     54         return new Appendable() {
     55             @Override public Appendable append(CharSequence csq) throws IOException {
     56               checkNotNull(csq);
     57               out.append(csq);
     58               return this;
     59             }
     60 
     61             @Override public Appendable append(CharSequence csq, int start, int end)
     62                 throws IOException {
     63               checkNotNull(csq);
     64               out.append(csq, start, end);
     65               return this;
     66             }
     67 
     68             @Override public Appendable append(char c) throws IOException {
     69               out.append(c);
     70               return this;
     71             }
     72           };
     73       }
     74 
     75       @Override
     76       protected char[] escape(char c) {
     77         return null;
     78       }
     79     };
     80 
     81   /**
     82    * Returns a {@link CharEscaper} that does no escaping.
     83    */
     84   public static CharEscaper nullEscaper() {
     85     return NULL_ESCAPER;
     86   }
     87 
     88   /**
     89    * Returns a {@link CharEscaper} instance that escapes special characters in a
     90    * string so it can safely be included in an XML document in either element
     91    * content or attribute values.
     92    *
     93    * <p><b>Note</b></p>: silently removes null-characters and control
     94    * characters, as there is no way to represent them in XML.
     95    */
     96   public static CharEscaper xmlEscaper() {
     97     return XML_ESCAPER;
     98   }
     99 
    100   /**
    101    * Escapes special characters from a string so it can safely be included in an
    102    * XML document in either element content or attribute values.  Also removes
    103    * null-characters and control characters, as there is no way to represent
    104    * them in XML.
    105    */
    106   private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder()
    107       .addEscape('"', "&quot;")
    108       .addEscape('\'', "&apos;")
    109       .toEscaper();
    110 
    111   /**
    112    * Returns a {@link CharEscaper} instance that escapes special characters in a
    113    * string so it can safely be included in an XML document in element content.
    114    *
    115    * <p><b>Note</b></p>: double and single quotes are not escaped, so it is not
    116    * safe to use this escaper to escape attribute values. Use the
    117    * {@link #xmlEscaper()} escaper to escape attribute values or if you are
    118    * unsure. Also silently removes non-whitespace control characters, as there
    119    * is no way to represent them in XML.
    120    */
    121   public static CharEscaper xmlContentEscaper() {
    122     return XML_CONTENT_ESCAPER;
    123   }
    124 
    125   /**
    126    * Escapes special characters from a string so it can safely be included in an
    127    * XML document in element content.  Note that quotes are <em>not</em>
    128    * escaped, so <em>this is not safe for use in attribute values</em>. Use
    129    * {@link #XML_ESCAPER} for attribute values, or if you are unsure.  Also
    130    * removes non-whitespace control characters, as there is no way to represent
    131    * them in XML.
    132    */
    133   private static final CharEscaper XML_CONTENT_ESCAPER =
    134       newBasicXmlEscapeBuilder().toEscaper();
    135 
    136   /**
    137    * Returns a {@link CharEscaper} instance that escapes special characters in a
    138    * string so it can safely be included in an HTML document in either element
    139    * content or attribute values.
    140    *
    141    * <p><b>Note</b></p>: alters non-ASCII and control characters.
    142    *
    143    * The entity list was taken from:
    144    * <a href="http://www.w3.org/TR/html4/sgml/entities.html">here</a>
    145    */
    146   public static CharEscaper htmlEscaper() {
    147     return HtmlEscaperHolder.HTML_ESCAPER;
    148   }
    149 
    150   /**
    151    * A lazy initialization holder for HTML_ESCAPER.
    152    */
    153   private static class HtmlEscaperHolder {
    154     private static final CharEscaper HTML_ESCAPER
    155         = new HtmlCharEscaper(new CharEscaperBuilder()
    156             .addEscape('"',      "&quot;")
    157             .addEscape('\'',     "&#39;")
    158             .addEscape('&',      "&amp;")
    159             .addEscape('<',      "&lt;")
    160             .addEscape('>',      "&gt;")
    161             .addEscape('\u00A0', "&nbsp;")
    162             .addEscape('\u00A1', "&iexcl;")
    163             .addEscape('\u00A2', "&cent;")
    164             .addEscape('\u00A3', "&pound;")
    165             .addEscape('\u00A4', "&curren;")
    166             .addEscape('\u00A5', "&yen;")
    167             .addEscape('\u00A6', "&brvbar;")
    168             .addEscape('\u00A7', "&sect;")
    169             .addEscape('\u00A8', "&uml;")
    170             .addEscape('\u00A9', "&copy;")
    171             .addEscape('\u00AA', "&ordf;")
    172             .addEscape('\u00AB', "&laquo;")
    173             .addEscape('\u00AC', "&not;")
    174             .addEscape('\u00AD', "&shy;")
    175             .addEscape('\u00AE', "&reg;")
    176             .addEscape('\u00AF', "&macr;")
    177             .addEscape('\u00B0', "&deg;")
    178             .addEscape('\u00B1', "&plusmn;")
    179             .addEscape('\u00B2', "&sup2;")
    180             .addEscape('\u00B3', "&sup3;")
    181             .addEscape('\u00B4', "&acute;")
    182             .addEscape('\u00B5', "&micro;")
    183             .addEscape('\u00B6', "&para;")
    184             .addEscape('\u00B7', "&middot;")
    185             .addEscape('\u00B8', "&cedil;")
    186             .addEscape('\u00B9', "&sup1;")
    187             .addEscape('\u00BA', "&ordm;")
    188             .addEscape('\u00BB', "&raquo;")
    189             .addEscape('\u00BC', "&frac14;")
    190             .addEscape('\u00BD', "&frac12;")
    191             .addEscape('\u00BE', "&frac34;")
    192             .addEscape('\u00BF', "&iquest;")
    193             .addEscape('\u00C0', "&Agrave;")
    194             .addEscape('\u00C1', "&Aacute;")
    195             .addEscape('\u00C2', "&Acirc;")
    196             .addEscape('\u00C3', "&Atilde;")
    197             .addEscape('\u00C4', "&Auml;")
    198             .addEscape('\u00C5', "&Aring;")
    199             .addEscape('\u00C6', "&AElig;")
    200             .addEscape('\u00C7', "&Ccedil;")
    201             .addEscape('\u00C8', "&Egrave;")
    202             .addEscape('\u00C9', "&Eacute;")
    203             .addEscape('\u00CA', "&Ecirc;")
    204             .addEscape('\u00CB', "&Euml;")
    205             .addEscape('\u00CC', "&Igrave;")
    206             .addEscape('\u00CD', "&Iacute;")
    207             .addEscape('\u00CE', "&Icirc;")
    208             .addEscape('\u00CF', "&Iuml;")
    209             .addEscape('\u00D0', "&ETH;")
    210             .addEscape('\u00D1', "&Ntilde;")
    211             .addEscape('\u00D2', "&Ograve;")
    212             .addEscape('\u00D3', "&Oacute;")
    213             .addEscape('\u00D4', "&Ocirc;")
    214             .addEscape('\u00D5', "&Otilde;")
    215             .addEscape('\u00D6', "&Ouml;")
    216             .addEscape('\u00D7', "&times;")
    217             .addEscape('\u00D8', "&Oslash;")
    218             .addEscape('\u00D9', "&Ugrave;")
    219             .addEscape('\u00DA', "&Uacute;")
    220             .addEscape('\u00DB', "&Ucirc;")
    221             .addEscape('\u00DC', "&Uuml;")
    222             .addEscape('\u00DD', "&Yacute;")
    223             .addEscape('\u00DE', "&THORN;")
    224             .addEscape('\u00DF', "&szlig;")
    225             .addEscape('\u00E0', "&agrave;")
    226             .addEscape('\u00E1', "&aacute;")
    227             .addEscape('\u00E2', "&acirc;")
    228             .addEscape('\u00E3', "&atilde;")
    229             .addEscape('\u00E4', "&auml;")
    230             .addEscape('\u00E5', "&aring;")
    231             .addEscape('\u00E6', "&aelig;")
    232             .addEscape('\u00E7', "&ccedil;")
    233             .addEscape('\u00E8', "&egrave;")
    234             .addEscape('\u00E9', "&eacute;")
    235             .addEscape('\u00EA', "&ecirc;")
    236             .addEscape('\u00EB', "&euml;")
    237             .addEscape('\u00EC', "&igrave;")
    238             .addEscape('\u00ED', "&iacute;")
    239             .addEscape('\u00EE', "&icirc;")
    240             .addEscape('\u00EF', "&iuml;")
    241             .addEscape('\u00F0', "&eth;")
    242             .addEscape('\u00F1', "&ntilde;")
    243             .addEscape('\u00F2', "&ograve;")
    244             .addEscape('\u00F3', "&oacute;")
    245             .addEscape('\u00F4', "&ocirc;")
    246             .addEscape('\u00F5', "&otilde;")
    247             .addEscape('\u00F6', "&ouml;")
    248             .addEscape('\u00F7', "&divide;")
    249             .addEscape('\u00F8', "&oslash;")
    250             .addEscape('\u00F9', "&ugrave;")
    251             .addEscape('\u00FA', "&uacute;")
    252             .addEscape('\u00FB', "&ucirc;")
    253             .addEscape('\u00FC', "&uuml;")
    254             .addEscape('\u00FD', "&yacute;")
    255             .addEscape('\u00FE', "&thorn;")
    256             .addEscape('\u00FF', "&yuml;")
    257             .addEscape('\u0152', "&OElig;")
    258             .addEscape('\u0153', "&oelig;")
    259             .addEscape('\u0160', "&Scaron;")
    260             .addEscape('\u0161', "&scaron;")
    261             .addEscape('\u0178', "&Yuml;")
    262             .addEscape('\u0192', "&fnof;")
    263             .addEscape('\u02C6', "&circ;")
    264             .addEscape('\u02DC', "&tilde;")
    265             .addEscape('\u0391', "&Alpha;")
    266             .addEscape('\u0392', "&Beta;")
    267             .addEscape('\u0393', "&Gamma;")
    268             .addEscape('\u0394', "&Delta;")
    269             .addEscape('\u0395', "&Epsilon;")
    270             .addEscape('\u0396', "&Zeta;")
    271             .addEscape('\u0397', "&Eta;")
    272             .addEscape('\u0398', "&Theta;")
    273             .addEscape('\u0399', "&Iota;")
    274             .addEscape('\u039A', "&Kappa;")
    275             .addEscape('\u039B', "&Lambda;")
    276             .addEscape('\u039C', "&Mu;")
    277             .addEscape('\u039D', "&Nu;")
    278             .addEscape('\u039E', "&Xi;")
    279             .addEscape('\u039F', "&Omicron;")
    280             .addEscape('\u03A0', "&Pi;")
    281             .addEscape('\u03A1', "&Rho;")
    282             .addEscape('\u03A3', "&Sigma;")
    283             .addEscape('\u03A4', "&Tau;")
    284             .addEscape('\u03A5', "&Upsilon;")
    285             .addEscape('\u03A6', "&Phi;")
    286             .addEscape('\u03A7', "&Chi;")
    287             .addEscape('\u03A8', "&Psi;")
    288             .addEscape('\u03A9', "&Omega;")
    289             .addEscape('\u03B1', "&alpha;")
    290             .addEscape('\u03B2', "&beta;")
    291             .addEscape('\u03B3', "&gamma;")
    292             .addEscape('\u03B4', "&delta;")
    293             .addEscape('\u03B5', "&epsilon;")
    294             .addEscape('\u03B6', "&zeta;")
    295             .addEscape('\u03B7', "&eta;")
    296             .addEscape('\u03B8', "&theta;")
    297             .addEscape('\u03B9', "&iota;")
    298             .addEscape('\u03BA', "&kappa;")
    299             .addEscape('\u03BB', "&lambda;")
    300             .addEscape('\u03BC', "&mu;")
    301             .addEscape('\u03BD', "&nu;")
    302             .addEscape('\u03BE', "&xi;")
    303             .addEscape('\u03BF', "&omicron;")
    304             .addEscape('\u03C0', "&pi;")
    305             .addEscape('\u03C1', "&rho;")
    306             .addEscape('\u03C2', "&sigmaf;")
    307             .addEscape('\u03C3', "&sigma;")
    308             .addEscape('\u03C4', "&tau;")
    309             .addEscape('\u03C5', "&upsilon;")
    310             .addEscape('\u03C6', "&phi;")
    311             .addEscape('\u03C7', "&chi;")
    312             .addEscape('\u03C8', "&psi;")
    313             .addEscape('\u03C9', "&omega;")
    314             .addEscape('\u03D1', "&thetasym;")
    315             .addEscape('\u03D2', "&upsih;")
    316             .addEscape('\u03D6', "&piv;")
    317             .addEscape('\u2002', "&ensp;")
    318             .addEscape('\u2003', "&emsp;")
    319             .addEscape('\u2009', "&thinsp;")
    320             .addEscape('\u200C', "&zwnj;")
    321             .addEscape('\u200D', "&zwj;")
    322             .addEscape('\u200E', "&lrm;")
    323             .addEscape('\u200F', "&rlm;")
    324             .addEscape('\u2013', "&ndash;")
    325             .addEscape('\u2014', "&mdash;")
    326             .addEscape('\u2018', "&lsquo;")
    327             .addEscape('\u2019', "&rsquo;")
    328             .addEscape('\u201A', "&sbquo;")
    329             .addEscape('\u201C', "&ldquo;")
    330             .addEscape('\u201D', "&rdquo;")
    331             .addEscape('\u201E', "&bdquo;")
    332             .addEscape('\u2020', "&dagger;")
    333             .addEscape('\u2021', "&Dagger;")
    334             .addEscape('\u2022', "&bull;")
    335             .addEscape('\u2026', "&hellip;")
    336             .addEscape('\u2030', "&permil;")
    337             .addEscape('\u2032', "&prime;")
    338             .addEscape('\u2033', "&Prime;")
    339             .addEscape('\u2039', "&lsaquo;")
    340             .addEscape('\u203A', "&rsaquo;")
    341             .addEscape('\u203E', "&oline;")
    342             .addEscape('\u2044', "&frasl;")
    343             .addEscape('\u20AC', "&euro;")
    344             .addEscape('\u2111', "&image;")
    345             .addEscape('\u2118', "&weierp;")
    346             .addEscape('\u211C', "&real;")
    347             .addEscape('\u2122', "&trade;")
    348             .addEscape('\u2135', "&alefsym;")
    349             .addEscape('\u2190', "&larr;")
    350             .addEscape('\u2191', "&uarr;")
    351             .addEscape('\u2192', "&rarr;")
    352             .addEscape('\u2193', "&darr;")
    353             .addEscape('\u2194', "&harr;")
    354             .addEscape('\u21B5', "&crarr;")
    355             .addEscape('\u21D0', "&lArr;")
    356             .addEscape('\u21D1', "&uArr;")
    357             .addEscape('\u21D2', "&rArr;")
    358             .addEscape('\u21D3', "&dArr;")
    359             .addEscape('\u21D4', "&hArr;")
    360             .addEscape('\u2200', "&forall;")
    361             .addEscape('\u2202', "&part;")
    362             .addEscape('\u2203', "&exist;")
    363             .addEscape('\u2205', "&empty;")
    364             .addEscape('\u2207', "&nabla;")
    365             .addEscape('\u2208', "&isin;")
    366             .addEscape('\u2209', "&notin;")
    367             .addEscape('\u220B', "&ni;")
    368             .addEscape('\u220F', "&prod;")
    369             .addEscape('\u2211', "&sum;")
    370             .addEscape('\u2212', "&minus;")
    371             .addEscape('\u2217', "&lowast;")
    372             .addEscape('\u221A', "&radic;")
    373             .addEscape('\u221D', "&prop;")
    374             .addEscape('\u221E', "&infin;")
    375             .addEscape('\u2220', "&ang;")
    376             .addEscape('\u2227', "&and;")
    377             .addEscape('\u2228', "&or;")
    378             .addEscape('\u2229', "&cap;")
    379             .addEscape('\u222A', "&cup;")
    380             .addEscape('\u222B', "&int;")
    381             .addEscape('\u2234', "&there4;")
    382             .addEscape('\u223C', "&sim;")
    383             .addEscape('\u2245', "&cong;")
    384             .addEscape('\u2248', "&asymp;")
    385             .addEscape('\u2260', "&ne;")
    386             .addEscape('\u2261', "&equiv;")
    387             .addEscape('\u2264', "&le;")
    388             .addEscape('\u2265', "&ge;")
    389             .addEscape('\u2282', "&sub;")
    390             .addEscape('\u2283', "&sup;")
    391             .addEscape('\u2284', "&nsub;")
    392             .addEscape('\u2286', "&sube;")
    393             .addEscape('\u2287', "&supe;")
    394             .addEscape('\u2295', "&oplus;")
    395             .addEscape('\u2297', "&otimes;")
    396             .addEscape('\u22A5', "&perp;")
    397             .addEscape('\u22C5', "&sdot;")
    398             .addEscape('\u2308', "&lceil;")
    399             .addEscape('\u2309', "&rceil;")
    400             .addEscape('\u230A', "&lfloor;")
    401             .addEscape('\u230B', "&rfloor;")
    402             .addEscape('\u2329', "&lang;")
    403             .addEscape('\u232A', "&rang;")
    404             .addEscape('\u25CA', "&loz;")
    405             .addEscape('\u2660', "&spades;")
    406             .addEscape('\u2663', "&clubs;")
    407             .addEscape('\u2665', "&hearts;")
    408             .addEscape('\u2666', "&diams;")
    409             .toArray());
    410   }
    411 
    412   /**
    413    * Returns a {@link CharEscaper} instance that escapes special characters in a
    414    * string so it can safely be included in an HTML document in either element
    415    * content or attribute values.
    416    *
    417    * <p><b>Note</b></p>: does not alter non-ASCII and control characters.
    418    */
    419   public static CharEscaper asciiHtmlEscaper() {
    420     return ASCII_HTML_ESCAPER;
    421   }
    422 
    423   /**
    424    * Escapes special characters from a string so it can safely be included in an
    425    * HTML document in either element content or attribute values. Does
    426    * <em>not</em> alter non-ASCII characters or control characters.
    427    */
    428   private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder()
    429       .addEscape('"', "&quot;")
    430       .addEscape('\'', "&#39;")
    431       .addEscape('&', "&amp;")
    432       .addEscape('<', "&lt;")
    433       .addEscape('>', "&gt;")
    434       .toEscaper();
    435 
    436   /**
    437    * Returns an {@link Escaper} instance that escapes Java chars so they can be
    438    * safely included in URIs. For details on escaping URIs, see section 2.4 of
    439    * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
    440    *
    441    * <p>When encoding a String, the following rules apply:
    442    * <ul>
    443    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
    444    *     through "9" remain the same.
    445    * <li>The special characters ".", "-", "*", and "_" remain the same.
    446    * <li>The space character " " is converted into a plus sign "+".
    447    * <li>All other characters are converted into one or more bytes using UTF-8
    448    *     encoding and each byte is then represented by the 3-character string
    449    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
    450    *     representation of the byte value.
    451    * <ul>
    452    *
    453    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
    454    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
    455    * RFC 3986</a>:<br>
    456    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
    457    * for all percent-encodings."</i>
    458    *
    459    * <p>This escaper has identical behavior to (but is potentially much faster
    460    * than):
    461    * <ul>
    462    * <li>{@link com.google.httputil.FastURLEncoder#encode(String)}
    463    * <li>{@link com.google.httputil.FastURLEncoder#encode(String,String)}
    464    *     with the encoding name "UTF-8"
    465    * <li>{@link java.net.URLEncoder#encode(String, String)}
    466    *     with the encoding name "UTF-8"
    467    * </ul>
    468    *
    469    * <p>This method is equivalent to {@code uriEscaper(true)}.
    470    */
    471   public static Escaper uriEscaper() {
    472     return uriEscaper(true);
    473   }
    474 
    475   /**
    476    * Returns an {@link Escaper} instance that escapes Java chars so they can be
    477    * safely included in URI path segments. For details on escaping URIs, see
    478    * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
    479    *
    480    * <p>When encoding a String, the following rules apply:
    481    * <ul>
    482    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
    483    *     through "9" remain the same.
    484    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
    485    * <li>The general delimiters "@" and ":" remain the same.
    486    * <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", ",", ";",
    487    *     and "=" remain the same.
    488    * <li>The space character " " is converted into %20.
    489    * <li>All other characters are converted into one or more bytes using UTF-8
    490    *     encoding and each byte is then represented by the 3-character string
    491    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
    492    *     representation of the byte value.
    493    * </ul>
    494    *
    495    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
    496    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
    497    * RFC 3986</a>:<br>
    498    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
    499    * for all percent-encodings."</i>
    500    */
    501   public static Escaper uriPathEscaper() {
    502     return URI_PATH_ESCAPER;
    503   }
    504 
    505   /**
    506    * Returns an {@link Escaper} instance that escapes Java chars so they can be
    507    * safely included in URI query string segments. When the query string
    508    * consists of a sequence of name=value pairs separated by &amp;, the names
    509    * and values should be individually encoded. If you escape an entire query
    510    * string in one pass with this escaper, then the "=" and "&amp;" characters
    511    * used as separators will also be escaped.
    512    *
    513    * <p>This escaper is also suitable for escaping fragment identifiers.
    514    *
    515    * <p>For details on escaping URIs, see
    516    * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
    517    *
    518    * <p>When encoding a String, the following rules apply:
    519    * <ul>
    520    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
    521    *     through "9" remain the same.
    522    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
    523    * <li>The general delimiters "@" and ":" remain the same.
    524    * <li>The path delimiters "/" and "?" remain the same.
    525    * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";",
    526    *     remain the same.
    527    * <li>The space character " " is converted into %20.
    528    * <li>The equals sign "=" is converted into %3D.
    529    * <li>The ampersand "&amp;" is converted into %26.
    530    * <li>All other characters are converted into one or more bytes using UTF-8
    531    *     encoding and each byte is then represented by the 3-character string
    532    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
    533    *     representation of the byte value.
    534    * </ul>
    535    *
    536    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
    537    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
    538    * RFC 3986</a>:<br>
    539    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
    540    * for all percent-encodings."</i>
    541    *
    542    * <p>This method is equivalent to {@code uriQueryStringEscaper(false)}.
    543    */
    544   public static Escaper uriQueryStringEscaper() {
    545     return uriQueryStringEscaper(false);
    546   }
    547 
    548   /**
    549    * Returns a {@link Escaper} instance that escapes Java characters so they can
    550    * be safely included in URIs. For details on escaping URIs, see section 2.4
    551    * of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
    552    *
    553    * <p>When encoding a String, the following rules apply:
    554    * <ul>
    555    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
    556    *     through "9" remain the same.
    557    * <li>The special characters ".", "-", "*", and "_" remain the same.
    558    * <li>If {@code plusForSpace} was specified, the space character " " is
    559    *     converted into a plus sign "+". Otherwise it is converted into "%20".
    560    * <li>All other characters are converted into one or more bytes using UTF-8
    561    *     encoding and each byte is then represented by the 3-character string
    562    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
    563    *     representation of the byte value.
    564    * </ul>
    565    *
    566    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
    567    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
    568    * RFC 3986</a>:<br>
    569    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
    570    * for all percent-encodings."</i>
    571    *
    572    * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
    573    *        it is escaped to {@code %20}. Although common, the escaping of
    574    *        spaces as plus signs has a very ambiguous status in the relevant
    575    *        specifications. You should prefer {@code %20} unless you are doing
    576    *        exact character-by-character comparisons of URLs and backwards
    577    *        compatibility requires you to use plus signs.
    578    *
    579    * @see #uriEscaper()
    580    */
    581   public static Escaper uriEscaper(boolean plusForSpace) {
    582     return plusForSpace ? URI_ESCAPER : URI_ESCAPER_NO_PLUS;
    583   }
    584 
    585   /**
    586    * Returns an {@link Escaper} instance that escapes Java chars so they can be
    587    * safely included in URI query string segments. When the query string
    588    * consists of a sequence of name=value pairs separated by &amp;, the names
    589    * and values should be individually encoded. If you escape an entire query
    590    * string in one pass with this escaper, then the "=" and "&amp;" characters
    591    * used as separators will also be escaped.
    592    *
    593    * <p>This escaper is also suitable for escaping fragment identifiers.
    594    *
    595    * <p>For details on escaping URIs, see
    596    * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
    597    *
    598    * <p>When encoding a String, the following rules apply:
    599    * <ul>
    600    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
    601    *     through "9" remain the same.
    602    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
    603    * <li>The general delimiters "@" and ":" remain the same.
    604    * <li>The path delimiters "/" and "?" remain the same.
    605    * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";",
    606    *     remain the same.
    607    * <li>If {@code plusForSpace} was specified, the space character " " is
    608    *     converted into a plus sign "+". Otherwise it is converted into "%20".
    609    * <li>The equals sign "=" is converted into %3D.
    610    * <li>The ampersand "&amp;" is converted into %26.
    611    * <li>All other characters are converted into one or more bytes using UTF-8
    612    *     encoding and each byte is then represented by the 3-character string
    613    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
    614    *     representation of the byte value.
    615    * </ul>
    616    *
    617    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
    618    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
    619    * RFC 3986</a>:<br>
    620    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
    621    * for all percent-encodings."</i>
    622    *
    623    * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise
    624    *        it is escaped to {@code %20}. Although common, the escaping of
    625    *        spaces as plus signs has a very ambiguous status in the relevant
    626    *        specifications. You should prefer {@code %20} unless you are doing
    627    *        exact character-by-character comparisons of URLs and backwards
    628    *        compatibility requires you to use plus signs.
    629    *
    630    * @see #uriQueryStringEscaper()
    631    */
    632   public static Escaper uriQueryStringEscaper(boolean plusForSpace) {
    633     return plusForSpace ?
    634            URI_QUERY_STRING_ESCAPER_WITH_PLUS : URI_QUERY_STRING_ESCAPER;
    635   }
    636 
    637   private static final Escaper URI_ESCAPER =
    638       new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true);
    639 
    640   private static final Escaper URI_ESCAPER_NO_PLUS =
    641       new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false);
    642 
    643   private static final Escaper URI_PATH_ESCAPER =
    644       new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false);
    645 
    646   private static final Escaper URI_QUERY_STRING_ESCAPER =
    647       new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false);
    648 
    649   private static final Escaper URI_QUERY_STRING_ESCAPER_WITH_PLUS =
    650       new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, true);
    651 
    652   /**
    653    * Returns a {@link Escaper} instance that escapes Java characters in a manner
    654    * compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape}
    655    * set).
    656    *
    657    * <p>When encoding a String, the following rules apply:
    658    * <ul>
    659    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
    660    * through "9" remain the same.
    661    * <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/"
    662    * and ":" remain the same.
    663    * <li>The space character " " is converted into a plus sign "+".
    664    * <li>All other characters are converted into one or more bytes using UTF-8
    665    *     encoding and each byte is then represented by the 3-character string
    666    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
    667    *     representation of the byte value.
    668    * </ul>
    669    *
    670    * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase
    671    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
    672    * RFC 3986</a>:<br>
    673    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
    674    * for all percent-encodings."</i>
    675    *
    676    * <p><b>Note</b>: This escaper is a special case and is <em>not
    677    * compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt">
    678    * RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is
    679    * only provided for certain limited use cases and you should favor using
    680    * {@link #uriEscaper()} whenever possible.
    681    */
    682   public static Escaper cppUriEscaper() {
    683     return CPP_URI_ESCAPER;
    684   }
    685 
    686   // Based on comments from FastURLEncoder:
    687   // These octets mimic the ones escaped by the C++ webutil/url URL class --
    688   // the kGoogle1Escape set.
    689   // To produce the same escaping as C++, use this set with the plusForSpace
    690   // option.
    691   // WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here.
    692   private static final Escaper CPP_URI_ESCAPER =
    693       new PercentEscaper("!()*-._~,/:", true);
    694 
    695   /**
    696    * Returns a {@link CharEscaper} instance that escapes special characters in a
    697    * string so it can safely be included in a Java string literal.
    698    *
    699    * <p><b>Note</b></p>: does not escape single quotes, so use the escaper
    700    * returned by {@link #javaCharEscaper()} if you are generating char
    701    * literals or if you are unsure.
    702    */
    703   public static CharEscaper javaStringEscaper() {
    704     return JAVA_STRING_ESCAPER;
    705   }
    706 
    707   /**
    708    * Escapes special characters from a string so it can safely be included in a
    709    * Java string literal. Does <em>not</em> escape single-quotes, so use
    710    * JAVA_CHAR_ESCAPE if you are generating char literals, or if you are unsure.
    711    *
    712    * <p>Note that non-ASCII characters will be octal or Unicode escaped.
    713    */
    714   private static final CharEscaper JAVA_STRING_ESCAPER
    715       = new JavaCharEscaper(new CharEscaperBuilder()
    716           .addEscape('\b', "\\b")
    717           .addEscape('\f', "\\f")
    718           .addEscape('\n', "\\n")
    719           .addEscape('\r', "\\r")
    720           .addEscape('\t', "\\t")
    721           .addEscape('\"', "\\\"")
    722           .addEscape('\\', "\\\\")
    723           .toArray());
    724 
    725   /**
    726    * Returns a {@link CharEscaper} instance that escapes special characters in a
    727    * string so it can safely be included in a Java char or string literal. The
    728    * behavior of this escaper is the same as that of the
    729    * {@link #javaStringEscaper()}, except it also escapes single quotes.
    730    */
    731   public static CharEscaper javaCharEscaper() {
    732     return JAVA_CHAR_ESCAPER;
    733   }
    734 
    735   /**
    736    * Escapes special characters from a string so it can safely be included in a
    737    * Java char literal or string literal.
    738    *
    739    * <p>Note that non-ASCII characters will be octal or Unicode escaped.
    740    *
    741    * <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes
    742    * single quotes.
    743    */
    744   private static final CharEscaper JAVA_CHAR_ESCAPER
    745       = new JavaCharEscaper(new CharEscaperBuilder()
    746           .addEscape('\b', "\\b")
    747           .addEscape('\f', "\\f")
    748           .addEscape('\n', "\\n")
    749           .addEscape('\r', "\\r")
    750           .addEscape('\t', "\\t")
    751           .addEscape('\'', "\\'")
    752           .addEscape('\"', "\\\"")
    753           .addEscape('\\', "\\\\")
    754           .toArray());
    755 
    756   /**
    757    * Returns a {@link CharEscaper} instance that replaces non-ASCII characters
    758    * in a string with their Unicode escape sequences ({@code \\uxxxx} where
    759    * {@code xxxx} is a hex number). Existing escape sequences won't be affected.
    760    */
    761   public static CharEscaper javaStringUnicodeEscaper() {
    762     return JAVA_STRING_UNICODE_ESCAPER;
    763   }
    764 
    765   /**
    766    * Escapes each non-ASCII character in with its Unicode escape sequence
    767    * {@code \\uxxxx} where {@code xxxx} is a hex number. Existing escape
    768    * sequences won't be affected.
    769    */
    770   private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER
    771       = new CharEscaper() {
    772           @Override protected char[] escape(char c) {
    773             if (c <= 127) {
    774               return null;
    775             }
    776 
    777             char[] r = new char[6];
    778             r[5] = HEX_DIGITS[c & 15];
    779             c >>>= 4;
    780             r[4] = HEX_DIGITS[c & 15];
    781             c >>>= 4;
    782             r[3] = HEX_DIGITS[c & 15];
    783             c >>>= 4;
    784             r[2] = HEX_DIGITS[c & 15];
    785             r[1] = 'u';
    786             r[0] = '\\';
    787             return r;
    788           }
    789         };
    790 
    791   /**
    792    * Returns a {@link CharEscaper} instance that escapes special characters from
    793    * a string so it can safely be included in a Python string literal. Does not
    794    * have any special handling for non-ASCII characters.
    795    */
    796   public static CharEscaper pythonEscaper() {
    797     return PYTHON_ESCAPER;
    798   }
    799 
    800   /**
    801    * Escapes special characters in a string so it can safely be included in a
    802    * Python string literal. Does not have any special handling for non-ASCII
    803    * characters.
    804    */
    805   private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder()
    806       // TODO(laurence): perhaps this should escape non-ASCII characters?
    807       .addEscape('\n', "\\n")
    808       .addEscape('\r', "\\r")
    809       .addEscape('\t', "\\t")
    810       .addEscape('\\', "\\\\")
    811       .addEscape('\"', "\\\"")
    812       .addEscape('\'', "\\\'")
    813       .toEscaper();
    814 
    815   /**
    816    * Returns a {@link CharEscaper} instance that escapes non-ASCII characters in
    817    * a string so it can safely be included in a Javascript string literal.
    818    * Non-ASCII characters are replaced with their ASCII javascript escape
    819    * sequences (e.g., \\uhhhh or \xhh).
    820    */
    821   public static CharEscaper javascriptEscaper() {
    822     return JAVASCRIPT_ESCAPER;
    823   }
    824 
    825   /**
    826    * {@code CharEscaper} to escape javascript strings. Turns all non-ASCII
    827    * characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh).
    828    */
    829   private static final CharEscaper JAVASCRIPT_ESCAPER
    830       = new JavascriptCharEscaper(new CharEscaperBuilder()
    831           .addEscape('\'', "\\x27")
    832           .addEscape('"',  "\\x22")
    833           .addEscape('<',  "\\x3c")
    834           .addEscape('=',  "\\x3d")
    835           .addEscape('>',  "\\x3e")
    836           .addEscape('&',  "\\x26")
    837           .addEscape('\b', "\\b")
    838           .addEscape('\t', "\\t")
    839           .addEscape('\n', "\\n")
    840           .addEscape('\f', "\\f")
    841           .addEscape('\r', "\\r")
    842           .addEscape('\\', "\\\\")
    843           .toArray());
    844 
    845   private static CharEscaperBuilder newBasicXmlEscapeBuilder() {
    846     return new CharEscaperBuilder()
    847         .addEscape('&', "&amp;")
    848         .addEscape('<', "&lt;")
    849         .addEscape('>', "&gt;")
    850         .addEscapes(new char[] {
    851             '\000', '\001', '\002', '\003', '\004',
    852             '\005', '\006', '\007', '\010', '\013',
    853             '\014', '\016', '\017', '\020', '\021',
    854             '\022', '\023', '\024', '\025', '\026',
    855             '\027', '\030', '\031', '\032', '\033',
    856             '\034', '\035', '\036', '\037'}, "");
    857   }
    858 
    859   /**
    860    * Returns a composite {@link CharEscaper} instance that tries to escape
    861    * characters using a primary {@code CharEscaper} first and falls back to a
    862    * secondary one if there is no escaping.
    863    *
    864    * <p>The returned escaper will attempt to escape each character using the
    865    * primary escaper, and if the primary escaper has no escaping for that
    866    * character, it will use the secondary escaper. If the secondary escaper has
    867    * no escaping for a character either, the original character will be used.
    868    * If the primary escaper has an escape for a character, the secondary escaper
    869    * will not be used at all for that character; the escaped output of the
    870    * primary is not run through the secondary. For a case where you would like
    871    * to first escape with one escaper, and then with another, it is recommended
    872    * that you call each escaper in order.
    873    *
    874    * @param primary The primary {@code CharEscaper} to use
    875    * @param secondary The secondary {@code CharEscaper} to use if the first one
    876    *     has no escaping rule for a character
    877    * @throws NullPointerException if any of the arguments is null
    878    */
    879   public static CharEscaper fallThrough(CharEscaper primary,
    880       CharEscaper secondary) {
    881     checkNotNull(primary);
    882     checkNotNull(secondary);
    883     return new FallThroughCharEscaper(primary, secondary);
    884   }
    885 
    886   /**
    887    * A fast {@link CharEscaper} that uses an array of replacement characters and
    888    * a range of safe characters. It overrides {@link #escape(String)} to improve
    889    * performance. Rough benchmarking shows that this almost doubles the speed
    890    * when processing strings that do not require escaping (providing the escape
    891    * test itself is efficient).
    892    */
    893   private static abstract class FastCharEscaper extends CharEscaper {
    894 
    895     protected final char[][] replacements;
    896     protected final int replacementLength;
    897     protected final char safeMin;
    898     protected final char safeMax;
    899 
    900     public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) {
    901       this.replacements = replacements;
    902       this.replacementLength = replacements.length;
    903       this.safeMin = safeMin;
    904       this.safeMax = safeMax;
    905     }
    906 
    907     /** Overridden for performance (see {@link FastCharEscaper}). */
    908     @Override public String escape(String s) {
    909       int slen = s.length();
    910       for (int index = 0; index < slen; index++) {
    911         char c = s.charAt(index);
    912         if ((c < replacementLength && replacements[c] != null)
    913             || c < safeMin || c > safeMax) {
    914           return escapeSlow(s, index);
    915         }
    916       }
    917       return s;
    918     }
    919   }
    920 
    921   /**
    922    * Escaper for Java character escaping, contains both an array and a
    923    * backup function.  We're not overriding the array decorator because we
    924    * want to keep this as fast as possible, so no calls to super.escape first.
    925    */
    926   private static class JavaCharEscaper extends FastCharEscaper {
    927 
    928     public JavaCharEscaper(char[][] replacements) {
    929       super(replacements, ' ', '~');
    930     }
    931 
    932     @Override protected char[] escape(char c) {
    933       // First check if our array has a valid escaping.
    934       if (c < replacementLength) {
    935         char[] r = replacements[c];
    936         if (r != null) {
    937           return r;
    938         }
    939       }
    940 
    941       // This range is un-escaped.
    942       if (safeMin <= c && c <= safeMax) {
    943         return null;
    944       }
    945 
    946       if (c <= 0xFF) {
    947         // Convert c to an octal-escaped string.
    948         // Equivalent to String.format("\\%03o", (int)c);
    949         char[] r = new char[4];
    950         r[0] = '\\';
    951         r[3] = HEX_DIGITS[c & 7];
    952         c >>>= 3;
    953         r[2] = HEX_DIGITS[c & 7];
    954         c >>>= 3;
    955         r[1] = HEX_DIGITS[c & 7];
    956         return r;
    957       }
    958 
    959       // Convert c to a hex-escaped string.
    960       // Equivalent to String.format("\\u%04x", (int)c);
    961       char[] r = new char[6];
    962       r[0] = '\\';
    963       r[1] = 'u';
    964       r[5] = HEX_DIGITS[c & 15];
    965       c >>>= 4;
    966       r[4] = HEX_DIGITS[c & 15];
    967       c >>>= 4;
    968       r[3] = HEX_DIGITS[c & 15];
    969       c >>>= 4;
    970       r[2] = HEX_DIGITS[c & 15];
    971       return r;
    972     }
    973   }
    974 
    975   /**
    976    * Escaper for javascript character escaping, contains both an array and a
    977    * backup function. We're not overriding the array decorator because we
    978    * want to keep this as fast as possible, so no calls to super.escape first.
    979    */
    980   private static class JavascriptCharEscaper extends FastCharEscaper {
    981 
    982     public JavascriptCharEscaper(char[][] replacements) {
    983       super(replacements, ' ', '~');
    984     }
    985 
    986     @Override protected char[] escape(char c) {
    987       // First check if our array has a valid escaping.
    988       if (c < replacementLength) {
    989         char[] r = replacements[c];
    990         if (r != null) {
    991           return r;
    992         }
    993       }
    994 
    995       // This range is unescaped.
    996       if (safeMin <= c && c <= safeMax) {
    997         return null;
    998       }
    999 
   1000       // we can do a 2 digit hex escape for chars less that 0x100
   1001       if (c < 0x100) {
   1002         char[] r = new char[4];
   1003         r[3] = HEX_DIGITS[c & 0xf];
   1004         c >>>= 4;
   1005         r[2] = HEX_DIGITS[c & 0xf];
   1006         r[1] = 'x';
   1007         r[0] = '\\';
   1008         return r;
   1009       }
   1010 
   1011       // 4 digit hex escape everything else
   1012       char[] r = new char[6];
   1013       r[5] = HEX_DIGITS[c & 0xf];
   1014       c >>>= 4;
   1015       r[4] = HEX_DIGITS[c & 0xf];
   1016       c >>>= 4;
   1017       r[3] = HEX_DIGITS[c & 0xf];
   1018       c >>>= 4;
   1019       r[2] = HEX_DIGITS[c & 0xf];
   1020       r[1] = 'u';
   1021       r[0] = '\\';
   1022       return r;
   1023     }
   1024   }
   1025 
   1026   /**
   1027    * Escaper for HTML character escaping, contains both an array and a
   1028    * backup function.  We're not overriding the array decorator because we
   1029    * want to keep this as fast as possible, so no calls to super.escape first.
   1030    */
   1031   private static class HtmlCharEscaper extends FastCharEscaper {
   1032 
   1033     public HtmlCharEscaper(char[][] replacements) {
   1034       super(replacements, Character.MIN_VALUE, '~');
   1035     }
   1036 
   1037     @Override protected char[] escape(char c) {
   1038       // First check if our array has a valid escaping.
   1039       if (c < replacementLength) {
   1040         char[] r = replacements[c];
   1041         if (r != null) {
   1042           return r;
   1043         }
   1044       }
   1045 
   1046       // ~ is ASCII 126, the highest value char that does not need
   1047       // to be escaped
   1048       if (c <= safeMax) {
   1049         return null;
   1050       }
   1051 
   1052       int index;
   1053       if (c < 1000) {
   1054         index = 4;
   1055       } else if (c < 10000) {
   1056         index = 5;
   1057       } else {
   1058         index = 6;
   1059       }
   1060       char[] result = new char[index + 2];
   1061       result[0] = '&';
   1062       result[1] = '#';
   1063       result[index + 1] = ';';
   1064 
   1065       // TODO(sven): Convert this to a sequence of shifts/additions
   1066       // to avoid the division and modulo operators.
   1067       int intValue = c;
   1068       for (; index > 1; index--) {
   1069         result[index] = HEX_DIGITS[intValue % 10];
   1070         intValue /= 10;
   1071       }
   1072       return result;
   1073     }
   1074   }
   1075 
   1076   /**
   1077    * A composite {@code CharEscaper} object that tries to escape characters
   1078    * using a primary {@code CharEscaper} first and falls back to a secondary
   1079    * one if there is no escaping.
   1080    */
   1081   private static class FallThroughCharEscaper extends CharEscaper {
   1082 
   1083     private final CharEscaper primary;
   1084     private final CharEscaper secondary;
   1085 
   1086     public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) {
   1087       this.primary = primary;
   1088       this.secondary = secondary;
   1089     }
   1090 
   1091     @Override
   1092     protected char[] escape(char c) {
   1093       char result[] = primary.escape(c);
   1094       if (result == null) {
   1095         result = secondary.escape(c);
   1096       }
   1097       return result;
   1098     }
   1099   }
   1100 
   1101   private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
   1102 }