Home | History | Annotate | Download | only in base
      1 /**
      2  * Copyright (c) 2008, Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.android.mail.common.base;
     18 
     19 import static com.google.android.mail.common.base.Preconditions.checkNotNull;
     20 
     21 /**
     22  * A {@code UnicodeEscaper} that escapes some set of Java characters using
     23  * the URI percent encoding scheme. The set of safe characters (those which
     24  * remain unescaped) can be specified on construction.
     25  *
     26  * <p>For details on escaping URIs for use in web pages, see section 2.4 of
     27  * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
     28  *
     29  * <p>In most cases this class should not need to be used directly. If you
     30  * have no special requirements for escaping your URIs, you should use either
     31  * {@link CharEscapers#uriEscaper()} or
     32  * {@link CharEscapers#uriEscaper(boolean)}.
     33  *
     34  * <p>When encoding a String, the following rules apply:
     35  * <ul>
     36  * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
     37  * through "9" remain the same.
     38  * <li>Any additionally specified safe characters remain the same.
     39  * <li>If {@code plusForSpace} was specified, the space character " " is
     40  * converted into a plus sign "+".
     41  * <li>All other characters are converted into one or more bytes using UTF-8
     42  *     encoding and each byte is then represented by the 3-character string
     43  *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
     44  *     of the byte value.
     45  * </ul>
     46  *
     47  * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
     48  * "~", "*", "'", "(" and ")". It goes on to state:
     49  *
     50  * <p><i>Unreserved characters can be escaped without changing the semantics
     51  * of the URI, but this should not be done unless the URI is being used
     52  * in a context that does not allow the unescaped character to appear.</i>
     53  *
     54  * <p>For performance reasons the only currently supported character encoding of
     55  * this class is UTF-8.
     56  *
     57  * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From
     58  * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
     59  * <i>"URI producers and normalizers should use uppercase hexadecimal digits
     60  * for all percent-encodings."</i>
     61  *
     62  * @author dbeaumont (at) google.com (David Beaumont)
     63  */
     64 public class PercentEscaper extends UnicodeEscaper {
     65   /**
     66    * A string of safe characters that mimics the behavior of
     67    * {@link java.net.URLEncoder}.
     68    *
     69    * TODO(dbeaumont): Fix escapers to be compliant with RFC 3986
     70    */
     71   public static final String SAFECHARS_URLENCODER = "-_.*";
     72 
     73   /**
     74    * A string of characters that do not need to be encoded when used in URI
     75    * path segments, as specified in RFC 3986. Note that some of these
     76    * characters do need to be escaped when used in other parts of the URI.
     77    */
     78   public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
     79 
     80   /**
     81    * A string of characters that do not need to be encoded when used in URI
     82    * query strings, as specified in RFC 3986. Note that some of these
     83    * characters do need to be escaped when used in other parts of the URI.
     84    */
     85   public static final String SAFEQUERYSTRINGCHARS_URLENCODER
     86       = "-_.!~*'()@:$,;/?:";
     87 
     88   // In some uri escapers spaces are escaped to '+'
     89   private static final char[] URI_ESCAPED_SPACE = { '+' };
     90 
     91   // TODO(dbeaumont): Remove this once UriEscaper uses lower case
     92   private static final char[] UPPER_HEX_DIGITS =
     93       "0123456789ABCDEF".toCharArray();
     94 
     95   /**
     96    * If true we should convert space to the {@code +} character.
     97    */
     98   private final boolean plusForSpace;
     99 
    100   /**
    101    * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
    102    * true then {@code c} should remain unmodified in the output. If
    103    * {@code c > safeOctets.length} then it should be escaped.
    104    */
    105   private final boolean[] safeOctets;
    106 
    107   /**
    108    * Constructs a URI escaper with the specified safe characters and optional
    109    * handling of the space character.
    110    *
    111    * @param safeChars a non null string specifying additional safe characters
    112    *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
    113    *        should not be specified here)
    114    * @param plusForSpace true if ASCII space should be escaped to {@code +}
    115    *        rather than {@code %20}
    116    * @throws IllegalArgumentException if any of the parameters were invalid
    117    */
    118   public PercentEscaper(String safeChars, boolean plusForSpace) {
    119     checkNotNull(safeChars);  // eager for GWT.
    120 
    121     // Avoid any misunderstandings about the behavior of this escaper
    122     if (safeChars.matches(".*[0-9A-Za-z].*")) {
    123       throw new IllegalArgumentException(
    124           "Alphanumeric characters are always 'safe' and should not be " +
    125           "explicitly specified");
    126     }
    127     // Avoid ambiguous parameters. Safe characters are never modified so if
    128     // space is a safe character then setting plusForSpace is meaningless.
    129     if (plusForSpace && safeChars.contains(" ")) {
    130       throw new IllegalArgumentException(
    131           "plusForSpace cannot be specified when space is a 'safe' character");
    132     }
    133     if (safeChars.contains("%")) {
    134       throw new IllegalArgumentException(
    135           "The '%' character cannot be specified as 'safe'");
    136     }
    137     this.plusForSpace = plusForSpace;
    138     this.safeOctets = createSafeOctets(safeChars);
    139   }
    140 
    141   /**
    142    * Creates a boolean[] with entries corresponding to the character values
    143    * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
    144    * is as small as is required to hold the given character information.
    145    */
    146   private static boolean[] createSafeOctets(String safeChars) {
    147     int maxChar = 'z';
    148     char[] safeCharArray = safeChars.toCharArray();
    149     for (char c : safeCharArray) {
    150       maxChar = Math.max(c, maxChar);
    151     }
    152     boolean[] octets = new boolean[maxChar + 1];
    153     for (int c = '0'; c <= '9'; c++) {
    154       octets[c] = true;
    155     }
    156     for (int c = 'A'; c <= 'Z'; c++) {
    157       octets[c] = true;
    158     }
    159     for (int c = 'a'; c <= 'z'; c++) {
    160       octets[c] = true;
    161     }
    162     for (char c : safeCharArray) {
    163       octets[c] = true;
    164     }
    165     return octets;
    166   }
    167 
    168   /*
    169    * Overridden for performance. For unescaped strings this improved the
    170    * performance of the uri escaper from ~760ns to ~400ns as measured by
    171    * {@link CharEscapersBenchmark}.
    172    */
    173   @Override
    174   protected int nextEscapeIndex(CharSequence csq, int index, int end) {
    175     for (; index < end; index++) {
    176       char c = csq.charAt(index);
    177       if (c >= safeOctets.length || !safeOctets[c]) {
    178         break;
    179       }
    180     }
    181     return index;
    182   }
    183 
    184   /*
    185    * Overridden for performance. For unescaped strings this improved the
    186    * performance of the uri escaper from ~400ns to ~170ns as measured by
    187    * {@link CharEscapersBenchmark}.
    188    */
    189   @Override
    190   public String escape(String s) {
    191     checkNotNull(s);
    192     int slen = s.length();
    193     for (int index = 0; index < slen; index++) {
    194       char c = s.charAt(index);
    195       if (c >= safeOctets.length || !safeOctets[c]) {
    196         return escapeSlow(s, index);
    197       }
    198     }
    199     return s;
    200   }
    201 
    202   /**
    203    * Escapes the given Unicode code point in UTF-8.
    204    */
    205   @Override
    206   protected char[] escape(int cp) {
    207     // We should never get negative values here but if we do it will throw an
    208     // IndexOutOfBoundsException, so at least it will get spotted.
    209     if (cp < safeOctets.length && safeOctets[cp]) {
    210       return null;
    211     } else if (cp == ' ' && plusForSpace) {
    212       return URI_ESCAPED_SPACE;
    213     } else if (cp <= 0x7F) {
    214       // Single byte UTF-8 characters
    215       // Start with "%--" and fill in the blanks
    216       char[] dest = new char[3];
    217       dest[0] = '%';
    218       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
    219       dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
    220       return dest;
    221     } else if (cp <= 0x7ff) {
    222       // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
    223       // Start with "%--%--" and fill in the blanks
    224       char[] dest = new char[6];
    225       dest[0] = '%';
    226       dest[3] = '%';
    227       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
    228       cp >>>= 4;
    229       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
    230       cp >>>= 2;
    231       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
    232       cp >>>= 4;
    233       dest[1] = UPPER_HEX_DIGITS[0xC | cp];
    234       return dest;
    235     } else if (cp <= 0xffff) {
    236       // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
    237       // Start with "%E-%--%--" and fill in the blanks
    238       char[] dest = new char[9];
    239       dest[0] = '%';
    240       dest[1] = 'E';
    241       dest[3] = '%';
    242       dest[6] = '%';
    243       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
    244       cp >>>= 4;
    245       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
    246       cp >>>= 2;
    247       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
    248       cp >>>= 4;
    249       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
    250       cp >>>= 2;
    251       dest[2] = UPPER_HEX_DIGITS[cp];
    252       return dest;
    253     } else if (cp <= 0x10ffff) {
    254       char[] dest = new char[12];
    255       // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
    256       // Start with "%F-%--%--%--" and fill in the blanks
    257       dest[0] = '%';
    258       dest[1] = 'F';
    259       dest[3] = '%';
    260       dest[6] = '%';
    261       dest[9] = '%';
    262       dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
    263       cp >>>= 4;
    264       dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
    265       cp >>>= 2;
    266       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
    267       cp >>>= 4;
    268       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
    269       cp >>>= 2;
    270       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
    271       cp >>>= 4;
    272       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
    273       cp >>>= 2;
    274       dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
    275       return dest;
    276     } else {
    277       // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
    278       throw new IllegalArgumentException(
    279           "Invalid unicode character value " + cp);
    280     }
    281   }
    282 }