1 /** 2 * Copyright (c) 2008, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.mail.common.base; 18 19 import static com.google.android.mail.common.base.Preconditions.checkNotNull; 20 import static com.google.android.mail.common.base.Preconditions.checkPositionIndexes; 21 22 import java.io.IOException; 23 24 /** 25 * An {@link Escaper} that converts literal text into a format safe for 26 * inclusion in a particular context (such as an XML document). Typically (but 27 * not always), the inverse process of "unescaping" the text is performed 28 * automatically by the relevant parser. 29 * 30 * <p>For example, an XML escaper would convert the literal string {@code 31 * "Foo<Bar>"} into {@code "Foo<Bar>"} to prevent {@code "<Bar>"} from 32 * being confused with an XML tag. When the resulting XML document is parsed, 33 * the parser API will return this text as the original literal string {@code 34 * "Foo<Bar>"}. 35 * 36 * <p><b>Note:</b> This class is similar to {@link CharEscaper} but with one 37 * very important difference. A CharEscaper can only process Java 38 * <a href="http://en.wikipedia.org/wiki/UTF-16">UTF16</a> characters in 39 * isolation and may not cope when it encounters surrogate pairs. This class 40 * facilitates the correct escaping of all Unicode characters. 41 * 42 * <p>As there are important reasons, including potential security issues, to 43 * handle Unicode correctly if you are considering implementing a new escaper 44 * you should favor using UnicodeEscaper wherever possible. 45 * 46 * <p>A {@code UnicodeEscaper} instance is required to be stateless, and safe 47 * when used concurrently by multiple threads. 48 * 49 * <p>Several popular escapers are defined as constants in the class {@link 50 * CharEscapers}. To create your own escapers extend this class and implement 51 * the {@link #escape(int)} method. 52 * 53 * @author dbeaumont (at) google.com (David Beaumont) 54 */ 55 public abstract class UnicodeEscaper extends Escaper { 56 /** The amount of padding (chars) to use when growing the escape buffer. */ 57 private static final int DEST_PAD = 32; 58 59 /** 60 * Returns the escaped form of the given Unicode code point, or {@code null} 61 * if this code point does not need to be escaped. When called as part of an 62 * escaping operation, the given code point is guaranteed to be in the range 63 * {@code 0 <= cp <= Character#MAX_CODE_POINT}. 64 * 65 * <p>If an empty array is returned, this effectively strips the input 66 * character from the resulting text. 67 * 68 * <p>If the character does not need to be escaped, this method should return 69 * {@code null}, rather than an array containing the character representation 70 * of the code point. This enables the escaping algorithm to perform more 71 * efficiently. 72 * 73 * <p>If the implementation of this method cannot correctly handle a 74 * particular code point then it should either throw an appropriate runtime 75 * exception or return a suitable replacement character. It must never 76 * silently discard invalid input as this may constitute a security risk. 77 * 78 * @param cp the Unicode code point to escape if necessary 79 * @return the replacement characters, or {@code null} if no escaping was 80 * needed 81 */ 82 protected abstract char[] escape(int cp); 83 84 /** 85 * Scans a sub-sequence of characters from a given {@link CharSequence}, 86 * returning the index of the next character that requires escaping. 87 * 88 * <p><b>Note:</b> When implementing an escaper, it is a good idea to override 89 * this method for efficiency. The base class implementation determines 90 * successive Unicode code points and invokes {@link #escape(int)} for each of 91 * them. If the semantics of your escaper are such that code points in the 92 * supplementary range are either all escaped or all unescaped, this method 93 * can be implemented more efficiently using {@link CharSequence#charAt(int)}. 94 * 95 * <p>Note however that if your escaper does not escape characters in the 96 * supplementary range, you should either continue to validate the correctness 97 * of any surrogate characters encountered or provide a clear warning to users 98 * that your escaper does not validate its input. 99 * 100 * <p>See {@link PercentEscaper} for an example. 101 * 102 * @param csq a sequence of characters 103 * @param start the index of the first character to be scanned 104 * @param end the index immediately after the last character to be scanned 105 * @throws IllegalArgumentException if the scanned sub-sequence of {@code csq} 106 * contains invalid surrogate pairs 107 */ 108 protected int nextEscapeIndex(CharSequence csq, int start, int end) { 109 int index = start; 110 while (index < end) { 111 int cp = codePointAt(csq, index, end); 112 if (cp < 0 || escape(cp) != null) { 113 break; 114 } 115 index += Character.isSupplementaryCodePoint(cp) ? 2 : 1; 116 } 117 return index; 118 } 119 120 /** 121 * Returns the escaped form of a given literal string. 122 * 123 * <p>If you are escaping input in arbitrary successive chunks, then it is not 124 * generally safe to use this method. If an input string ends with an 125 * unmatched high surrogate character, then this method will throw 126 * {@link IllegalArgumentException}. You should either ensure your input is 127 * valid <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> before 128 * calling this method or use an escaped {@link Appendable} (as returned by 129 * {@link #escape(Appendable)}) which can cope with arbitrarily split input. 130 * 131 * <p><b>Note:</b> When implementing an escaper it is a good idea to override 132 * this method for efficiency by inlining the implementation of 133 * {@link #nextEscapeIndex(CharSequence, int, int)} directly. Doing this for 134 * {@link PercentEscaper} more than doubled the performance for unescaped 135 * strings (as measured by {@link CharEscapersBenchmark}). 136 * 137 * @param string the literal string to be escaped 138 * @return the escaped form of {@code string} 139 * @throws NullPointerException if {@code string} is null 140 * @throws IllegalArgumentException if invalid surrogate characters are 141 * encountered 142 */ 143 @Override 144 public String escape(String string) { 145 checkNotNull(string); 146 int end = string.length(); 147 int index = nextEscapeIndex(string, 0, end); 148 return index == end ? string : escapeSlow(string, index); 149 } 150 151 /** 152 * Returns the escaped form of a given literal string, starting at the given 153 * index. This method is called by the {@link #escape(String)} method when it 154 * discovers that escaping is required. It is protected to allow subclasses 155 * to override the fastpath escaping function to inline their escaping test. 156 * See {@link CharEscaperBuilder} for an example usage. 157 * 158 * <p>This method is not reentrant and may only be invoked by the top level 159 * {@link #escape(String)} method. 160 * 161 * @param s the literal string to be escaped 162 * @param index the index to start escaping from 163 * @return the escaped form of {@code string} 164 * @throws NullPointerException if {@code string} is null 165 * @throws IllegalArgumentException if invalid surrogate characters are 166 * encountered 167 */ 168 protected final String escapeSlow(String s, int index) { 169 int end = s.length(); 170 171 // Get a destination buffer and setup some loop variables. 172 char[] dest = Platform.charBufferFromThreadLocal(); 173 int destIndex = 0; 174 int unescapedChunkStart = 0; 175 176 while (index < end) { 177 int cp = codePointAt(s, index, end); 178 if (cp < 0) { 179 throw new IllegalArgumentException( 180 "Trailing high surrogate at end of input"); 181 } 182 // It is possible for this to return null because nextEscapeIndex() may 183 // (for performance reasons) yield some false positives but it must never 184 // give false negatives. 185 char[] escaped = escape(cp); 186 int nextIndex = index + (Character.isSupplementaryCodePoint(cp) ? 2 : 1); 187 if (escaped != null) { 188 int charsSkipped = index - unescapedChunkStart; 189 190 // This is the size needed to add the replacement, not the full 191 // size needed by the string. We only regrow when we absolutely must. 192 int sizeNeeded = destIndex + charsSkipped + escaped.length; 193 if (dest.length < sizeNeeded) { 194 int destLength = sizeNeeded + (end - index) + DEST_PAD; 195 dest = growBuffer(dest, destIndex, destLength); 196 } 197 // If we have skipped any characters, we need to copy them now. 198 if (charsSkipped > 0) { 199 s.getChars(unescapedChunkStart, index, dest, destIndex); 200 destIndex += charsSkipped; 201 } 202 if (escaped.length > 0) { 203 System.arraycopy(escaped, 0, dest, destIndex, escaped.length); 204 destIndex += escaped.length; 205 } 206 // If we dealt with an escaped character, reset the unescaped range. 207 unescapedChunkStart = nextIndex; 208 } 209 index = nextEscapeIndex(s, nextIndex, end); 210 } 211 212 // Process trailing unescaped characters - no need to account for escaped 213 // length or padding the allocation. 214 int charsSkipped = end - unescapedChunkStart; 215 if (charsSkipped > 0) { 216 int endIndex = destIndex + charsSkipped; 217 if (dest.length < endIndex) { 218 dest = growBuffer(dest, destIndex, endIndex); 219 } 220 s.getChars(unescapedChunkStart, end, dest, destIndex); 221 destIndex = endIndex; 222 } 223 return new String(dest, 0, destIndex); 224 } 225 226 /** 227 * Returns an {@code Appendable} instance which automatically escapes all 228 * text appended to it before passing the resulting text to an underlying 229 * {@code Appendable}. 230 * 231 * <p>Unlike {@link #escape(String)} it is permitted to append arbitrarily 232 * split input to this Appendable, including input that is split over a 233 * surrogate pair. In this case the pending high surrogate character will not 234 * be processed until the corresponding low surrogate is appended. This means 235 * that a trailing high surrogate character at the end of the input cannot be 236 * detected and will be silently ignored. This is unavoidable since the 237 * Appendable interface has no {@code close()} method, and it is impossible to 238 * determine when the last characters have been appended. 239 * 240 * <p>The methods of the returned object will propagate any exceptions thrown 241 * by the underlying {@code Appendable}. 242 * 243 * <p>For well formed <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> 244 * the escaping behavior is identical to that of {@link #escape(String)} and 245 * the following code is equivalent to (but much slower than) 246 * {@code escaper.escape(string)}: <pre>{@code 247 * 248 * StringBuilder sb = new StringBuilder(); 249 * escaper.escape(sb).append(string); 250 * return sb.toString();}</pre> 251 * 252 * @param out the underlying {@code Appendable} to append escaped output to 253 * @return an {@code Appendable} which passes text to {@code out} after 254 * escaping it 255 * @throws NullPointerException if {@code out} is null 256 * @throws IllegalArgumentException if invalid surrogate characters are 257 * encountered 258 * 259 * TODO(dbeaumont): Maybe return a Writer here so we have a close() method 260 */ 261 @Override 262 public Appendable escape(final Appendable out) { 263 checkNotNull(out); 264 265 return new Appendable() { 266 char pendingHighSurrogate = 0; 267 268 @Override 269 public Appendable append(CharSequence csq) throws IOException { 270 return append(csq, 0, csq.length()); 271 } 272 273 @Override 274 public Appendable append(CharSequence csq, int start, int end) 275 throws IOException { 276 checkNotNull(csq); 277 checkPositionIndexes(start, end, csq.length()); 278 279 // If there is a pending high surrogate, handle it and start at the 280 // next character. 281 if (pendingHighSurrogate != 0 && start < end) { 282 completeSurrogatePair(csq.charAt(start++)); 283 } 284 285 if (start < end) { 286 // If the string ends with a high surrogate, store it for the next 287 // append, and skip that character from the current escaping. 288 char last = csq.charAt(end - 1); 289 if (Character.isHighSurrogate(last)) { 290 pendingHighSurrogate = last; 291 end--; 292 } 293 294 // Escape the subsequence from start to end, which cannot legally 295 // contain an unpaired surrogate 296 out.append(escape(csq.subSequence(start, end).toString())); 297 } 298 return this; 299 } 300 301 @Override 302 public Appendable append(char c) throws IOException { 303 if (pendingHighSurrogate != 0) { 304 completeSurrogatePair(c); 305 } else if (Character.isHighSurrogate(c)) { 306 pendingHighSurrogate = c; 307 } else { 308 if (Character.isLowSurrogate(c)) { 309 throw new IllegalArgumentException( 310 "Unexpected low surrogate character '" + c + 311 "' with value " + (int) c); 312 } 313 // This is a normal (non surrogate) char. 314 char[] escaped = escape(c); 315 if (escaped != null) { 316 outputChars(escaped); 317 } else { 318 out.append(c); 319 } 320 } 321 return this; 322 } 323 324 /** 325 * Our last append operation ended halfway through a surrogate pair so we 326 * complete the surrogate pair using {@code c}, which must be a low 327 * surrogate. 328 */ 329 private void completeSurrogatePair(char c) throws IOException { 330 if (!Character.isLowSurrogate(c)) { 331 throw new IllegalArgumentException( 332 "Expected low surrogate character but got '" + c + 333 "' with value " + (int) c); 334 } 335 char[] escaped = escape( 336 Character.toCodePoint(pendingHighSurrogate, c)); 337 if (escaped != null) { 338 outputChars(escaped); 339 } else { 340 out.append(pendingHighSurrogate); 341 out.append(c); 342 } 343 pendingHighSurrogate = 0; 344 } 345 346 /** 347 * Output some characters to the underlying appendable. 348 */ 349 private void outputChars(char[] chars) throws IOException { 350 for (int n = 0; n < chars.length; n++) { 351 out.append(chars[n]); 352 } 353 } 354 }; 355 } 356 357 /** 358 * Returns the Unicode code point of the character at the given index. 359 * 360 * <p>Unlike {@link Character#codePointAt(CharSequence, int)} or 361 * {@link String#codePointAt(int)} this method will never fail silently when 362 * encountering an invalid surrogate pair. 363 * 364 * <p>The behaviour of this method is as follows: 365 * <ol> 366 * <li>If {@code index >= end}, {@link IndexOutOfBoundsException} is thrown. 367 * <li><b>If the character at the specified index is not a surrogate, it is 368 * returned.</b> 369 * <li>If the first character was a high surrogate value, then an attempt is 370 * made to read the next character. 371 * <ol> 372 * <li><b>If the end of the sequence was reached, the negated value of 373 * the trailing high surrogate is returned.</b> 374 * <li><b>If the next character was a valid low surrogate, the code point 375 * value of the high/low surrogate pair is returned.</b> 376 * <li>If the next character was not a low surrogate value, then 377 * {@link IllegalArgumentException} is thrown. 378 * </ol> 379 * <li>If the first character was a low surrogate value, 380 * {@link IllegalArgumentException} is thrown. 381 * </ol> 382 * 383 * @param seq the sequence of characters from which to decode the code point 384 * @param index the index of the first character to decode 385 * @param end the index beyond the last valid character to decode 386 * @return the Unicode code point for the given index or the negated value of 387 * the trailing high surrogate character at the end of the sequence 388 */ 389 protected static final int codePointAt(CharSequence seq, int index, int end) { 390 if (index < end) { 391 char c1 = seq.charAt(index++); 392 if (c1 < Character.MIN_HIGH_SURROGATE || 393 c1 > Character.MAX_LOW_SURROGATE) { 394 // Fast path (first test is probably all we need to do) 395 return c1; 396 } else if (c1 <= Character.MAX_HIGH_SURROGATE) { 397 // If the high surrogate was the last character, return its inverse 398 if (index == end) { 399 return -c1; 400 } 401 // Otherwise look for the low surrogate following it 402 char c2 = seq.charAt(index); 403 if (Character.isLowSurrogate(c2)) { 404 return Character.toCodePoint(c1, c2); 405 } 406 throw new IllegalArgumentException( 407 "Expected low surrogate but got char '" + c2 + 408 "' with value " + (int) c2 + " at index " + index); 409 } else { 410 throw new IllegalArgumentException( 411 "Unexpected low surrogate character '" + c1 + 412 "' with value " + (int) c1 + " at index " + (index - 1)); 413 } 414 } 415 throw new IndexOutOfBoundsException("Index exceeds specified range"); 416 } 417 418 /** 419 * Helper method to grow the character buffer as needed, this only happens 420 * once in a while so it's ok if it's in a method call. If the index passed 421 * in is 0 then no copying will be done. 422 */ 423 private static final char[] growBuffer(char[] dest, int index, int size) { 424 char[] copy = new char[size]; 425 if (index > 0) { 426 System.arraycopy(dest, 0, copy, 0, index); 427 } 428 return copy; 429 } 430 }