1 /** 2 * Copyright (c) 2008, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.mail.common.base; 18 19 import static com.google.android.mail.common.base.Preconditions.checkNotNull; 20 21 /** 22 * A {@code UnicodeEscaper} that escapes some set of Java characters using 23 * the URI percent encoding scheme. The set of safe characters (those which 24 * remain unescaped) can be specified on construction. 25 * 26 * <p>For details on escaping URIs for use in web pages, see section 2.4 of 27 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. 28 * 29 * <p>In most cases this class should not need to be used directly. If you 30 * have no special requirements for escaping your URIs, you should use either 31 * {@link CharEscapers#uriEscaper()} or 32 * {@link CharEscapers#uriEscaper(boolean)}. 33 * 34 * <p>When encoding a String, the following rules apply: 35 * <ul> 36 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 37 * through "9" remain the same. 38 * <li>Any additionally specified safe characters remain the same. 39 * <li>If {@code plusForSpace} was specified, the space character " " is 40 * converted into a plus sign "+". 41 * <li>All other characters are converted into one or more bytes using UTF-8 42 * encoding and each byte is then represented by the 3-character string 43 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation 44 * of the byte value. 45 * </ul> 46 * 47 * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!", 48 * "~", "*", "'", "(" and ")". It goes on to state: 49 * 50 * <p><i>Unreserved characters can be escaped without changing the semantics 51 * of the URI, but this should not be done unless the URI is being used 52 * in a context that does not allow the unescaped character to appear.</i> 53 * 54 * <p>For performance reasons the only currently supported character encoding of 55 * this class is UTF-8. 56 * 57 * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From 58 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br> 59 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 60 * for all percent-encodings."</i> 61 * 62 * @author dbeaumont (at) google.com (David Beaumont) 63 */ 64 public class PercentEscaper extends UnicodeEscaper { 65 /** 66 * A string of safe characters that mimics the behavior of 67 * {@link java.net.URLEncoder}. 68 * 69 * TODO(dbeaumont): Fix escapers to be compliant with RFC 3986 70 */ 71 public static final String SAFECHARS_URLENCODER = "-_.*"; 72 73 /** 74 * A string of characters that do not need to be encoded when used in URI 75 * path segments, as specified in RFC 3986. Note that some of these 76 * characters do need to be escaped when used in other parts of the URI. 77 */ 78 public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;="; 79 80 /** 81 * A string of characters that do not need to be encoded when used in URI 82 * query strings, as specified in RFC 3986. Note that some of these 83 * characters do need to be escaped when used in other parts of the URI. 84 */ 85 public static final String SAFEQUERYSTRINGCHARS_URLENCODER 86 = "-_.!~*'()@:$,;/?:"; 87 88 // In some uri escapers spaces are escaped to '+' 89 private static final char[] URI_ESCAPED_SPACE = { '+' }; 90 91 // TODO(dbeaumont): Remove this once UriEscaper uses lower case 92 private static final char[] UPPER_HEX_DIGITS = 93 "0123456789ABCDEF".toCharArray(); 94 95 /** 96 * If true we should convert space to the {@code +} character. 97 */ 98 private final boolean plusForSpace; 99 100 /** 101 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is 102 * true then {@code c} should remain unmodified in the output. If 103 * {@code c > safeOctets.length} then it should be escaped. 104 */ 105 private final boolean[] safeOctets; 106 107 /** 108 * Constructs a URI escaper with the specified safe characters and optional 109 * handling of the space character. 110 * 111 * @param safeChars a non null string specifying additional safe characters 112 * for this escaper (the ranges 0..9, a..z and A..Z are always safe and 113 * should not be specified here) 114 * @param plusForSpace true if ASCII space should be escaped to {@code +} 115 * rather than {@code %20} 116 * @throws IllegalArgumentException if any of the parameters were invalid 117 */ 118 public PercentEscaper(String safeChars, boolean plusForSpace) { 119 checkNotNull(safeChars); // eager for GWT. 120 121 // Avoid any misunderstandings about the behavior of this escaper 122 if (safeChars.matches(".*[0-9A-Za-z].*")) { 123 throw new IllegalArgumentException( 124 "Alphanumeric characters are always 'safe' and should not be " + 125 "explicitly specified"); 126 } 127 // Avoid ambiguous parameters. Safe characters are never modified so if 128 // space is a safe character then setting plusForSpace is meaningless. 129 if (plusForSpace && safeChars.contains(" ")) { 130 throw new IllegalArgumentException( 131 "plusForSpace cannot be specified when space is a 'safe' character"); 132 } 133 if (safeChars.contains("%")) { 134 throw new IllegalArgumentException( 135 "The '%' character cannot be specified as 'safe'"); 136 } 137 this.plusForSpace = plusForSpace; 138 this.safeOctets = createSafeOctets(safeChars); 139 } 140 141 /** 142 * Creates a boolean[] with entries corresponding to the character values 143 * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array 144 * is as small as is required to hold the given character information. 145 */ 146 private static boolean[] createSafeOctets(String safeChars) { 147 int maxChar = 'z'; 148 char[] safeCharArray = safeChars.toCharArray(); 149 for (char c : safeCharArray) { 150 maxChar = Math.max(c, maxChar); 151 } 152 boolean[] octets = new boolean[maxChar + 1]; 153 for (int c = '0'; c <= '9'; c++) { 154 octets[c] = true; 155 } 156 for (int c = 'A'; c <= 'Z'; c++) { 157 octets[c] = true; 158 } 159 for (int c = 'a'; c <= 'z'; c++) { 160 octets[c] = true; 161 } 162 for (char c : safeCharArray) { 163 octets[c] = true; 164 } 165 return octets; 166 } 167 168 /* 169 * Overridden for performance. For unescaped strings this improved the 170 * performance of the uri escaper from ~760ns to ~400ns as measured by 171 * {@link CharEscapersBenchmark}. 172 */ 173 @Override 174 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 175 for (; index < end; index++) { 176 char c = csq.charAt(index); 177 if (c >= safeOctets.length || !safeOctets[c]) { 178 break; 179 } 180 } 181 return index; 182 } 183 184 /* 185 * Overridden for performance. For unescaped strings this improved the 186 * performance of the uri escaper from ~400ns to ~170ns as measured by 187 * {@link CharEscapersBenchmark}. 188 */ 189 @Override 190 public String escape(String s) { 191 checkNotNull(s); 192 int slen = s.length(); 193 for (int index = 0; index < slen; index++) { 194 char c = s.charAt(index); 195 if (c >= safeOctets.length || !safeOctets[c]) { 196 return escapeSlow(s, index); 197 } 198 } 199 return s; 200 } 201 202 /** 203 * Escapes the given Unicode code point in UTF-8. 204 */ 205 @Override 206 protected char[] escape(int cp) { 207 // We should never get negative values here but if we do it will throw an 208 // IndexOutOfBoundsException, so at least it will get spotted. 209 if (cp < safeOctets.length && safeOctets[cp]) { 210 return null; 211 } else if (cp == ' ' && plusForSpace) { 212 return URI_ESCAPED_SPACE; 213 } else if (cp <= 0x7F) { 214 // Single byte UTF-8 characters 215 // Start with "%--" and fill in the blanks 216 char[] dest = new char[3]; 217 dest[0] = '%'; 218 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 219 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 220 return dest; 221 } else if (cp <= 0x7ff) { 222 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 223 // Start with "%--%--" and fill in the blanks 224 char[] dest = new char[6]; 225 dest[0] = '%'; 226 dest[3] = '%'; 227 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 228 cp >>>= 4; 229 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 230 cp >>>= 2; 231 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 232 cp >>>= 4; 233 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 234 return dest; 235 } else if (cp <= 0xffff) { 236 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 237 // Start with "%E-%--%--" and fill in the blanks 238 char[] dest = new char[9]; 239 dest[0] = '%'; 240 dest[1] = 'E'; 241 dest[3] = '%'; 242 dest[6] = '%'; 243 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 244 cp >>>= 4; 245 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 246 cp >>>= 2; 247 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 248 cp >>>= 4; 249 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 250 cp >>>= 2; 251 dest[2] = UPPER_HEX_DIGITS[cp]; 252 return dest; 253 } else if (cp <= 0x10ffff) { 254 char[] dest = new char[12]; 255 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 256 // Start with "%F-%--%--%--" and fill in the blanks 257 dest[0] = '%'; 258 dest[1] = 'F'; 259 dest[3] = '%'; 260 dest[6] = '%'; 261 dest[9] = '%'; 262 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 263 cp >>>= 4; 264 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 265 cp >>>= 2; 266 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 267 cp >>>= 4; 268 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 269 cp >>>= 2; 270 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 271 cp >>>= 4; 272 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 273 cp >>>= 2; 274 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 275 return dest; 276 } else { 277 // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 278 throw new IllegalArgumentException( 279 "Invalid unicode character value " + cp); 280 } 281 } 282 }