1 /** 2 * Copyright (c) 2006, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.android.mail.common.base; 17 18 import static com.google.android.mail.common.base.Preconditions.checkNotNull; 19 20 import java.io.IOException; 21 22 /** 23 * Utility functions for dealing with {@code CharEscaper}s, and some commonly 24 * used {@code CharEscaper} instances. 25 * 26 * @author sven (at) google.com (Sven Mawson) 27 * @author laurence (at) google.com (Laurence Gonsalves) 28 */ 29 public final class CharEscapers { 30 private CharEscapers() {} 31 32 // TODO(matevossian): To implementors of escapers -- 33 // For each xxxEscaper method, please add links to external 34 // reference pages that we consider authoritative for what 35 // that escaper should exactly be doing. 36 37 /** 38 * Performs no escaping. 39 */ 40 private static final CharEscaper NULL_ESCAPER = new CharEscaper() { 41 @Override 42 public String escape(String string) { 43 checkNotNull(string); 44 return string; 45 } 46 47 @Override 48 public Appendable escape(final Appendable out) { 49 checkNotNull(out); 50 51 // we can't simply return out because the CharEscaper contract says that 52 // the returned Appendable will throw a NullPointerException if asked to 53 // append null. 
54 return new Appendable() { 55 @Override public Appendable append(CharSequence csq) throws IOException { 56 checkNotNull(csq); 57 out.append(csq); 58 return this; 59 } 60 61 @Override public Appendable append(CharSequence csq, int start, int end) 62 throws IOException { 63 checkNotNull(csq); 64 out.append(csq, start, end); 65 return this; 66 } 67 68 @Override public Appendable append(char c) throws IOException { 69 out.append(c); 70 return this; 71 } 72 }; 73 } 74 75 @Override 76 protected char[] escape(char c) { 77 return null; 78 } 79 }; 80 81 /** 82 * Returns a {@link CharEscaper} that does no escaping. 83 */ 84 public static CharEscaper nullEscaper() { 85 return NULL_ESCAPER; 86 } 87 88 /** 89 * Returns a {@link CharEscaper} instance that escapes special characters in a 90 * string so it can safely be included in an XML document in either element 91 * content or attribute values. 92 * 93 * <p><b>Note</b></p>: silently removes null-characters and control 94 * characters, as there is no way to represent them in XML. 95 */ 96 public static CharEscaper xmlEscaper() { 97 return XML_ESCAPER; 98 } 99 100 /** 101 * Escapes special characters from a string so it can safely be included in an 102 * XML document in either element content or attribute values. Also removes 103 * null-characters and control characters, as there is no way to represent 104 * them in XML. 105 */ 106 private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder() 107 .addEscape('"', """) 108 .addEscape('\'', "'") 109 .toEscaper(); 110 111 /** 112 * Returns a {@link CharEscaper} instance that escapes special characters in a 113 * string so it can safely be included in an XML document in element content. 114 * 115 * <p><b>Note</b></p>: double and single quotes are not escaped, so it is not 116 * safe to use this escaper to escape attribute values. Use the 117 * {@link #xmlEscaper()} escaper to escape attribute values or if you are 118 * unsure. 
Also silently removes non-whitespace control characters, as there 119 * is no way to represent them in XML. 120 */ 121 public static CharEscaper xmlContentEscaper() { 122 return XML_CONTENT_ESCAPER; 123 } 124 125 /** 126 * Escapes special characters from a string so it can safely be included in an 127 * XML document in element content. Note that quotes are <em>not</em> 128 * escaped, so <em>this is not safe for use in attribute values</em>. Use 129 * {@link #XML_ESCAPER} for attribute values, or if you are unsure. Also 130 * removes non-whitespace control characters, as there is no way to represent 131 * them in XML. 132 */ 133 private static final CharEscaper XML_CONTENT_ESCAPER = 134 newBasicXmlEscapeBuilder().toEscaper(); 135 136 /** 137 * Returns a {@link CharEscaper} instance that escapes special characters in a 138 * string so it can safely be included in an HTML document in either element 139 * content or attribute values. 140 * 141 * <p><b>Note</b></p>: alters non-ASCII and control characters. 142 * 143 * The entity list was taken from: 144 * <a href="http://www.w3.org/TR/html4/sgml/entities.html">here</a> 145 */ 146 public static CharEscaper htmlEscaper() { 147 return HtmlEscaperHolder.HTML_ESCAPER; 148 } 149 150 /** 151 * A lazy initialization holder for HTML_ESCAPER. 
152 */ 153 private static class HtmlEscaperHolder { 154 private static final CharEscaper HTML_ESCAPER 155 = new HtmlCharEscaper(new CharEscaperBuilder() 156 .addEscape('"', """) 157 .addEscape('\'', "'") 158 .addEscape('&', "&") 159 .addEscape('<', "<") 160 .addEscape('>', ">") 161 .addEscape('\u00A0', " ") 162 .addEscape('\u00A1', "¡") 163 .addEscape('\u00A2', "¢") 164 .addEscape('\u00A3', "£") 165 .addEscape('\u00A4', "¤") 166 .addEscape('\u00A5', "¥") 167 .addEscape('\u00A6', "¦") 168 .addEscape('\u00A7', "§") 169 .addEscape('\u00A8', "¨") 170 .addEscape('\u00A9', "©") 171 .addEscape('\u00AA', "ª") 172 .addEscape('\u00AB', "«") 173 .addEscape('\u00AC', "¬") 174 .addEscape('\u00AD', "­") 175 .addEscape('\u00AE', "®") 176 .addEscape('\u00AF', "¯") 177 .addEscape('\u00B0', "°") 178 .addEscape('\u00B1', "±") 179 .addEscape('\u00B2', "²") 180 .addEscape('\u00B3', "³") 181 .addEscape('\u00B4', "´") 182 .addEscape('\u00B5', "µ") 183 .addEscape('\u00B6', "¶") 184 .addEscape('\u00B7', "·") 185 .addEscape('\u00B8', "¸") 186 .addEscape('\u00B9', "¹") 187 .addEscape('\u00BA', "º") 188 .addEscape('\u00BB', "»") 189 .addEscape('\u00BC', "¼") 190 .addEscape('\u00BD', "½") 191 .addEscape('\u00BE', "¾") 192 .addEscape('\u00BF', "¿") 193 .addEscape('\u00C0', "À") 194 .addEscape('\u00C1', "Á") 195 .addEscape('\u00C2', "Â") 196 .addEscape('\u00C3', "Ã") 197 .addEscape('\u00C4', "Ä") 198 .addEscape('\u00C5', "Å") 199 .addEscape('\u00C6', "Æ") 200 .addEscape('\u00C7', "Ç") 201 .addEscape('\u00C8', "È") 202 .addEscape('\u00C9', "É") 203 .addEscape('\u00CA', "Ê") 204 .addEscape('\u00CB', "Ë") 205 .addEscape('\u00CC', "Ì") 206 .addEscape('\u00CD', "Í") 207 .addEscape('\u00CE', "Î") 208 .addEscape('\u00CF', "Ï") 209 .addEscape('\u00D0', "Ð") 210 .addEscape('\u00D1', "Ñ") 211 .addEscape('\u00D2', "Ò") 212 .addEscape('\u00D3', "Ó") 213 .addEscape('\u00D4', "Ô") 214 .addEscape('\u00D5', "Õ") 215 .addEscape('\u00D6', "Ö") 216 .addEscape('\u00D7', "×") 217 .addEscape('\u00D8', "Ø") 218 
.addEscape('\u00D9', "Ù") 219 .addEscape('\u00DA', "Ú") 220 .addEscape('\u00DB', "Û") 221 .addEscape('\u00DC', "Ü") 222 .addEscape('\u00DD', "Ý") 223 .addEscape('\u00DE', "Þ") 224 .addEscape('\u00DF', "ß") 225 .addEscape('\u00E0', "à") 226 .addEscape('\u00E1', "á") 227 .addEscape('\u00E2', "â") 228 .addEscape('\u00E3', "ã") 229 .addEscape('\u00E4', "ä") 230 .addEscape('\u00E5', "å") 231 .addEscape('\u00E6', "æ") 232 .addEscape('\u00E7', "ç") 233 .addEscape('\u00E8', "è") 234 .addEscape('\u00E9', "é") 235 .addEscape('\u00EA', "ê") 236 .addEscape('\u00EB', "ë") 237 .addEscape('\u00EC', "ì") 238 .addEscape('\u00ED', "í") 239 .addEscape('\u00EE', "î") 240 .addEscape('\u00EF', "ï") 241 .addEscape('\u00F0', "ð") 242 .addEscape('\u00F1', "ñ") 243 .addEscape('\u00F2', "ò") 244 .addEscape('\u00F3', "ó") 245 .addEscape('\u00F4', "ô") 246 .addEscape('\u00F5', "õ") 247 .addEscape('\u00F6', "ö") 248 .addEscape('\u00F7', "÷") 249 .addEscape('\u00F8', "ø") 250 .addEscape('\u00F9', "ù") 251 .addEscape('\u00FA', "ú") 252 .addEscape('\u00FB', "û") 253 .addEscape('\u00FC', "ü") 254 .addEscape('\u00FD', "ý") 255 .addEscape('\u00FE', "þ") 256 .addEscape('\u00FF', "ÿ") 257 .addEscape('\u0152', "Œ") 258 .addEscape('\u0153', "œ") 259 .addEscape('\u0160', "Š") 260 .addEscape('\u0161', "š") 261 .addEscape('\u0178', "Ÿ") 262 .addEscape('\u0192', "ƒ") 263 .addEscape('\u02C6', "ˆ") 264 .addEscape('\u02DC', "˜") 265 .addEscape('\u0391', "Α") 266 .addEscape('\u0392', "Β") 267 .addEscape('\u0393', "Γ") 268 .addEscape('\u0394', "Δ") 269 .addEscape('\u0395', "Ε") 270 .addEscape('\u0396', "Ζ") 271 .addEscape('\u0397', "Η") 272 .addEscape('\u0398', "Θ") 273 .addEscape('\u0399', "Ι") 274 .addEscape('\u039A', "Κ") 275 .addEscape('\u039B', "Λ") 276 .addEscape('\u039C', "Μ") 277 .addEscape('\u039D', "Ν") 278 .addEscape('\u039E', "Ξ") 279 .addEscape('\u039F', "Ο") 280 .addEscape('\u03A0', "Π") 281 .addEscape('\u03A1', "Ρ") 282 .addEscape('\u03A3', "Σ") 283 .addEscape('\u03A4', "Τ") 284 
.addEscape('\u03A5', "Υ") 285 .addEscape('\u03A6', "Φ") 286 .addEscape('\u03A7', "Χ") 287 .addEscape('\u03A8', "Ψ") 288 .addEscape('\u03A9', "Ω") 289 .addEscape('\u03B1', "α") 290 .addEscape('\u03B2', "β") 291 .addEscape('\u03B3', "γ") 292 .addEscape('\u03B4', "δ") 293 .addEscape('\u03B5', "ε") 294 .addEscape('\u03B6', "ζ") 295 .addEscape('\u03B7', "η") 296 .addEscape('\u03B8', "θ") 297 .addEscape('\u03B9', "ι") 298 .addEscape('\u03BA', "κ") 299 .addEscape('\u03BB', "λ") 300 .addEscape('\u03BC', "μ") 301 .addEscape('\u03BD', "ν") 302 .addEscape('\u03BE', "ξ") 303 .addEscape('\u03BF', "ο") 304 .addEscape('\u03C0', "π") 305 .addEscape('\u03C1', "ρ") 306 .addEscape('\u03C2', "ς") 307 .addEscape('\u03C3', "σ") 308 .addEscape('\u03C4', "τ") 309 .addEscape('\u03C5', "υ") 310 .addEscape('\u03C6', "φ") 311 .addEscape('\u03C7', "χ") 312 .addEscape('\u03C8', "ψ") 313 .addEscape('\u03C9', "ω") 314 .addEscape('\u03D1', "ϑ") 315 .addEscape('\u03D2', "ϒ") 316 .addEscape('\u03D6', "ϖ") 317 .addEscape('\u2002', " ") 318 .addEscape('\u2003', " ") 319 .addEscape('\u2009', " ") 320 .addEscape('\u200C', "‌") 321 .addEscape('\u200D', "‍") 322 .addEscape('\u200E', "‎") 323 .addEscape('\u200F', "‏") 324 .addEscape('\u2013', "–") 325 .addEscape('\u2014', "—") 326 .addEscape('\u2018', "‘") 327 .addEscape('\u2019', "’") 328 .addEscape('\u201A', "‚") 329 .addEscape('\u201C', "“") 330 .addEscape('\u201D', "”") 331 .addEscape('\u201E', "„") 332 .addEscape('\u2020', "†") 333 .addEscape('\u2021', "‡") 334 .addEscape('\u2022', "•") 335 .addEscape('\u2026', "…") 336 .addEscape('\u2030', "‰") 337 .addEscape('\u2032', "′") 338 .addEscape('\u2033', "″") 339 .addEscape('\u2039', "‹") 340 .addEscape('\u203A', "›") 341 .addEscape('\u203E', "‾") 342 .addEscape('\u2044', "⁄") 343 .addEscape('\u20AC', "€") 344 .addEscape('\u2111', "ℑ") 345 .addEscape('\u2118', "℘") 346 .addEscape('\u211C', "ℜ") 347 .addEscape('\u2122', "™") 348 .addEscape('\u2135', "ℵ") 349 .addEscape('\u2190', "←") 350 
.addEscape('\u2191', "↑") 351 .addEscape('\u2192', "→") 352 .addEscape('\u2193', "↓") 353 .addEscape('\u2194', "↔") 354 .addEscape('\u21B5', "↵") 355 .addEscape('\u21D0', "⇐") 356 .addEscape('\u21D1', "⇑") 357 .addEscape('\u21D2', "⇒") 358 .addEscape('\u21D3', "⇓") 359 .addEscape('\u21D4', "⇔") 360 .addEscape('\u2200', "∀") 361 .addEscape('\u2202', "∂") 362 .addEscape('\u2203', "∃") 363 .addEscape('\u2205', "∅") 364 .addEscape('\u2207', "∇") 365 .addEscape('\u2208', "∈") 366 .addEscape('\u2209', "∉") 367 .addEscape('\u220B', "∋") 368 .addEscape('\u220F', "∏") 369 .addEscape('\u2211', "∑") 370 .addEscape('\u2212', "−") 371 .addEscape('\u2217', "∗") 372 .addEscape('\u221A', "√") 373 .addEscape('\u221D', "∝") 374 .addEscape('\u221E', "∞") 375 .addEscape('\u2220', "∠") 376 .addEscape('\u2227', "∧") 377 .addEscape('\u2228', "∨") 378 .addEscape('\u2229', "∩") 379 .addEscape('\u222A', "∪") 380 .addEscape('\u222B', "∫") 381 .addEscape('\u2234', "∴") 382 .addEscape('\u223C', "∼") 383 .addEscape('\u2245', "≅") 384 .addEscape('\u2248', "≈") 385 .addEscape('\u2260', "≠") 386 .addEscape('\u2261', "≡") 387 .addEscape('\u2264', "≤") 388 .addEscape('\u2265', "≥") 389 .addEscape('\u2282', "⊂") 390 .addEscape('\u2283', "⊃") 391 .addEscape('\u2284', "⊄") 392 .addEscape('\u2286', "⊆") 393 .addEscape('\u2287', "⊇") 394 .addEscape('\u2295', "⊕") 395 .addEscape('\u2297', "⊗") 396 .addEscape('\u22A5', "⊥") 397 .addEscape('\u22C5', "⋅") 398 .addEscape('\u2308', "⌈") 399 .addEscape('\u2309', "⌉") 400 .addEscape('\u230A', "⌊") 401 .addEscape('\u230B', "⌋") 402 .addEscape('\u2329', "⟨") 403 .addEscape('\u232A', "⟩") 404 .addEscape('\u25CA', "◊") 405 .addEscape('\u2660', "♠") 406 .addEscape('\u2663', "♣") 407 .addEscape('\u2665', "♥") 408 .addEscape('\u2666', "♦") 409 .toArray()); 410 } 411 412 /** 413 * Returns a {@link CharEscaper} instance that escapes special characters in a 414 * string so it can safely be included in an HTML document in either element 415 * content or attribute values. 
416 * 417 * <p><b>Note</b></p>: does not alter non-ASCII and control characters. 418 */ 419 public static CharEscaper asciiHtmlEscaper() { 420 return ASCII_HTML_ESCAPER; 421 } 422 423 /** 424 * Escapes special characters from a string so it can safely be included in an 425 * HTML document in either element content or attribute values. Does 426 * <em>not</em> alter non-ASCII characters or control characters. 427 */ 428 private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder() 429 .addEscape('"', """) 430 .addEscape('\'', "'") 431 .addEscape('&', "&") 432 .addEscape('<', "<") 433 .addEscape('>', ">") 434 .toEscaper(); 435 436 /** 437 * Returns an {@link Escaper} instance that escapes Java chars so they can be 438 * safely included in URIs. For details on escaping URIs, see section 2.4 of 439 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. 440 * 441 * <p>When encoding a String, the following rules apply: 442 * <ul> 443 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 444 * through "9" remain the same. 445 * <li>The special characters ".", "-", "*", and "_" remain the same. 446 * <li>The space character " " is converted into a plus sign "+". 447 * <li>All other characters are converted into one or more bytes using UTF-8 448 * encoding and each byte is then represented by the 3-character string 449 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal 450 * representation of the byte value. 451 * <ul> 452 * 453 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase 454 * hexadecimal sequences. 
From <a href="http://www.ietf.org/rfc/rfc3986.txt"> 455 * RFC 3986</a>:<br> 456 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 457 * for all percent-encodings."</i> 458 * 459 * <p>This escaper has identical behavior to (but is potentially much faster 460 * than): 461 * <ul> 462 * <li>{@link com.google.httputil.FastURLEncoder#encode(String)} 463 * <li>{@link com.google.httputil.FastURLEncoder#encode(String,String)} 464 * with the encoding name "UTF-8" 465 * <li>{@link java.net.URLEncoder#encode(String, String)} 466 * with the encoding name "UTF-8" 467 * </ul> 468 * 469 * <p>This method is equivalent to {@code uriEscaper(true)}. 470 */ 471 public static Escaper uriEscaper() { 472 return uriEscaper(true); 473 } 474 475 /** 476 * Returns an {@link Escaper} instance that escapes Java chars so they can be 477 * safely included in URI path segments. For details on escaping URIs, see 478 * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. 479 * 480 * <p>When encoding a String, the following rules apply: 481 * <ul> 482 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 483 * through "9" remain the same. 484 * <li>The unreserved characters ".", "-", "~", and "_" remain the same. 485 * <li>The general delimiters "@" and ":" remain the same. 486 * <li>The subdelimiters "!", "$", "&", "'", "(", ")", "*", ",", ";", 487 * and "=" remain the same. 488 * <li>The space character " " is converted into %20. 489 * <li>All other characters are converted into one or more bytes using UTF-8 490 * encoding and each byte is then represented by the 3-character string 491 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal 492 * representation of the byte value. 493 * </ul> 494 * 495 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase 496 * hexadecimal sequences. 
From <a href="http://www.ietf.org/rfc/rfc3986.txt"> 497 * RFC 3986</a>:<br> 498 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 499 * for all percent-encodings."</i> 500 */ 501 public static Escaper uriPathEscaper() { 502 return URI_PATH_ESCAPER; 503 } 504 505 /** 506 * Returns an {@link Escaper} instance that escapes Java chars so they can be 507 * safely included in URI query string segments. When the query string 508 * consists of a sequence of name=value pairs separated by &, the names 509 * and values should be individually encoded. If you escape an entire query 510 * string in one pass with this escaper, then the "=" and "&" characters 511 * used as separators will also be escaped. 512 * 513 * <p>This escaper is also suitable for escaping fragment identifiers. 514 * 515 * <p>For details on escaping URIs, see 516 * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. 517 * 518 * <p>When encoding a String, the following rules apply: 519 * <ul> 520 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 521 * through "9" remain the same. 522 * <li>The unreserved characters ".", "-", "~", and "_" remain the same. 523 * <li>The general delimiters "@" and ":" remain the same. 524 * <li>The path delimiters "/" and "?" remain the same. 525 * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";", 526 * remain the same. 527 * <li>The space character " " is converted into %20. 528 * <li>The equals sign "=" is converted into %3D. 529 * <li>The ampersand "&" is converted into %26. 530 * <li>All other characters are converted into one or more bytes using UTF-8 531 * encoding and each byte is then represented by the 3-character string 532 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal 533 * representation of the byte value. 534 * </ul> 535 * 536 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase 537 * hexadecimal sequences. 
From <a href="http://www.ietf.org/rfc/rfc3986.txt"> 538 * RFC 3986</a>:<br> 539 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 540 * for all percent-encodings."</i> 541 * 542 * <p>This method is equivalent to {@code uriQueryStringEscaper(false)}. 543 */ 544 public static Escaper uriQueryStringEscaper() { 545 return uriQueryStringEscaper(false); 546 } 547 548 /** 549 * Returns a {@link Escaper} instance that escapes Java characters so they can 550 * be safely included in URIs. For details on escaping URIs, see section 2.4 551 * of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. 552 * 553 * <p>When encoding a String, the following rules apply: 554 * <ul> 555 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 556 * through "9" remain the same. 557 * <li>The special characters ".", "-", "*", and "_" remain the same. 558 * <li>If {@code plusForSpace} was specified, the space character " " is 559 * converted into a plus sign "+". Otherwise it is converted into "%20". 560 * <li>All other characters are converted into one or more bytes using UTF-8 561 * encoding and each byte is then represented by the 3-character string 562 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal 563 * representation of the byte value. 564 * </ul> 565 * 566 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase 567 * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> 568 * RFC 3986</a>:<br> 569 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 570 * for all percent-encodings."</i> 571 * 572 * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise 573 * it is escaped to {@code %20}. Although common, the escaping of 574 * spaces as plus signs has a very ambiguous status in the relevant 575 * specifications. 
You should prefer {@code %20} unless you are doing 576 * exact character-by-character comparisons of URLs and backwards 577 * compatibility requires you to use plus signs. 578 * 579 * @see #uriEscaper() 580 */ 581 public static Escaper uriEscaper(boolean plusForSpace) { 582 return plusForSpace ? URI_ESCAPER : URI_ESCAPER_NO_PLUS; 583 } 584 585 /** 586 * Returns an {@link Escaper} instance that escapes Java chars so they can be 587 * safely included in URI query string segments. When the query string 588 * consists of a sequence of name=value pairs separated by &, the names 589 * and values should be individually encoded. If you escape an entire query 590 * string in one pass with this escaper, then the "=" and "&" characters 591 * used as separators will also be escaped. 592 * 593 * <p>This escaper is also suitable for escaping fragment identifiers. 594 * 595 * <p>For details on escaping URIs, see 596 * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. 597 * 598 * <p>When encoding a String, the following rules apply: 599 * <ul> 600 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 601 * through "9" remain the same. 602 * <li>The unreserved characters ".", "-", "~", and "_" remain the same. 603 * <li>The general delimiters "@" and ":" remain the same. 604 * <li>The path delimiters "/" and "?" remain the same. 605 * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";", 606 * remain the same. 607 * <li>If {@code plusForSpace} was specified, the space character " " is 608 * converted into a plus sign "+". Otherwise it is converted into "%20". 609 * <li>The equals sign "=" is converted into %3D. 610 * <li>The ampersand "&" is converted into %26. 611 * <li>All other characters are converted into one or more bytes using UTF-8 612 * encoding and each byte is then represented by the 3-character string 613 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal 614 * representation of the byte value. 
615 * </ul> 616 * 617 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase 618 * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> 619 * RFC 3986</a>:<br> 620 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 621 * for all percent-encodings."</i> 622 * 623 * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise 624 * it is escaped to {@code %20}. Although common, the escaping of 625 * spaces as plus signs has a very ambiguous status in the relevant 626 * specifications. You should prefer {@code %20} unless you are doing 627 * exact character-by-character comparisons of URLs and backwards 628 * compatibility requires you to use plus signs. 629 * 630 * @see #uriQueryStringEscaper() 631 */ 632 public static Escaper uriQueryStringEscaper(boolean plusForSpace) { 633 return plusForSpace ? 634 URI_QUERY_STRING_ESCAPER_WITH_PLUS : URI_QUERY_STRING_ESCAPER; 635 } 636 637 private static final Escaper URI_ESCAPER = 638 new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true); 639 640 private static final Escaper URI_ESCAPER_NO_PLUS = 641 new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false); 642 643 private static final Escaper URI_PATH_ESCAPER = 644 new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false); 645 646 private static final Escaper URI_QUERY_STRING_ESCAPER = 647 new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false); 648 649 private static final Escaper URI_QUERY_STRING_ESCAPER_WITH_PLUS = 650 new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, true); 651 652 /** 653 * Returns a {@link Escaper} instance that escapes Java characters in a manner 654 * compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape} 655 * set). 
656 * 657 * <p>When encoding a String, the following rules apply: 658 * <ul> 659 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" 660 * through "9" remain the same. 661 * <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/" 662 * and ":" remain the same. 663 * <li>The space character " " is converted into a plus sign "+". 664 * <li>All other characters are converted into one or more bytes using UTF-8 665 * encoding and each byte is then represented by the 3-character string 666 * "%XY", where "XY" is the two-digit, uppercase, hexadecimal 667 * representation of the byte value. 668 * </ul> 669 * 670 * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase 671 * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> 672 * RFC 3986</a>:<br> 673 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 674 * for all percent-encodings."</i> 675 * 676 * <p><b>Note</b>: This escaper is a special case and is <em>not 677 * compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt"> 678 * RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is 679 * only provided for certain limited use cases and you should favor using 680 * {@link #uriEscaper()} whenever possible. 681 */ 682 public static Escaper cppUriEscaper() { 683 return CPP_URI_ESCAPER; 684 } 685 686 // Based on comments from FastURLEncoder: 687 // These octets mimic the ones escaped by the C++ webutil/url URL class -- 688 // the kGoogle1Escape set. 689 // To produce the same escaping as C++, use this set with the plusForSpace 690 // option. 691 // WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here. 692 private static final Escaper CPP_URI_ESCAPER = 693 new PercentEscaper("!()*-._~,/:", true); 694 695 /** 696 * Returns a {@link CharEscaper} instance that escapes special characters in a 697 * string so it can safely be included in a Java string literal. 
698 * 699 * <p><b>Note</b></p>: does not escape single quotes, so use the escaper 700 * returned by {@link #javaCharEscaper()} if you are generating char 701 * literals or if you are unsure. 702 */ 703 public static CharEscaper javaStringEscaper() { 704 return JAVA_STRING_ESCAPER; 705 } 706 707 /** 708 * Escapes special characters from a string so it can safely be included in a 709 * Java string literal. Does <em>not</em> escape single-quotes, so use 710 * JAVA_CHAR_ESCAPE if you are generating char literals, or if you are unsure. 711 * 712 * <p>Note that non-ASCII characters will be octal or Unicode escaped. 713 */ 714 private static final CharEscaper JAVA_STRING_ESCAPER 715 = new JavaCharEscaper(new CharEscaperBuilder() 716 .addEscape('\b', "\\b") 717 .addEscape('\f', "\\f") 718 .addEscape('\n', "\\n") 719 .addEscape('\r', "\\r") 720 .addEscape('\t', "\\t") 721 .addEscape('\"', "\\\"") 722 .addEscape('\\', "\\\\") 723 .toArray()); 724 725 /** 726 * Returns a {@link CharEscaper} instance that escapes special characters in a 727 * string so it can safely be included in a Java char or string literal. The 728 * behavior of this escaper is the same as that of the 729 * {@link #javaStringEscaper()}, except it also escapes single quotes. 730 */ 731 public static CharEscaper javaCharEscaper() { 732 return JAVA_CHAR_ESCAPER; 733 } 734 735 /** 736 * Escapes special characters from a string so it can safely be included in a 737 * Java char literal or string literal. 738 * 739 * <p>Note that non-ASCII characters will be octal or Unicode escaped. 740 * 741 * <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes 742 * single quotes. 
743 */ 744 private static final CharEscaper JAVA_CHAR_ESCAPER 745 = new JavaCharEscaper(new CharEscaperBuilder() 746 .addEscape('\b', "\\b") 747 .addEscape('\f', "\\f") 748 .addEscape('\n', "\\n") 749 .addEscape('\r', "\\r") 750 .addEscape('\t', "\\t") 751 .addEscape('\'', "\\'") 752 .addEscape('\"', "\\\"") 753 .addEscape('\\', "\\\\") 754 .toArray()); 755 756 /** 757 * Returns a {@link CharEscaper} instance that replaces non-ASCII characters 758 * in a string with their Unicode escape sequences ({@code \\uxxxx} where 759 * {@code xxxx} is a hex number). Existing escape sequences won't be affected. 760 */ 761 public static CharEscaper javaStringUnicodeEscaper() { 762 return JAVA_STRING_UNICODE_ESCAPER; 763 } 764 765 /** 766 * Escapes each non-ASCII character in with its Unicode escape sequence 767 * {@code \\uxxxx} where {@code xxxx} is a hex number. Existing escape 768 * sequences won't be affected. 769 */ 770 private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER 771 = new CharEscaper() { 772 @Override protected char[] escape(char c) { 773 if (c <= 127) { 774 return null; 775 } 776 777 char[] r = new char[6]; 778 r[5] = HEX_DIGITS[c & 15]; 779 c >>>= 4; 780 r[4] = HEX_DIGITS[c & 15]; 781 c >>>= 4; 782 r[3] = HEX_DIGITS[c & 15]; 783 c >>>= 4; 784 r[2] = HEX_DIGITS[c & 15]; 785 r[1] = 'u'; 786 r[0] = '\\'; 787 return r; 788 } 789 }; 790 791 /** 792 * Returns a {@link CharEscaper} instance that escapes special characters from 793 * a string so it can safely be included in a Python string literal. Does not 794 * have any special handling for non-ASCII characters. 795 */ 796 public static CharEscaper pythonEscaper() { 797 return PYTHON_ESCAPER; 798 } 799 800 /** 801 * Escapes special characters in a string so it can safely be included in a 802 * Python string literal. Does not have any special handling for non-ASCII 803 * characters. 
*/
  private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder()
      // TODO(laurence): perhaps this should escape non-ASCII characters?
      .addEscape('\n', "\\n")
      .addEscape('\r', "\\r")
      .addEscape('\t', "\\t")
      .addEscape('\\', "\\\\")
      .addEscape('\"', "\\\"")
      .addEscape('\'', "\\\'")
      .toEscaper();

  /**
   * Returns a {@link CharEscaper} instance that escapes a string so it can
   * safely be included in a Javascript string literal. Characters outside the
   * printable ASCII range (space through tilde) are replaced with their ASCII
   * javascript escape sequences (e.g., \\uhhhh or \xhh); quotes, backslash,
   * {@code <}, {@code =}, {@code >}, {@code &} and the common control
   * characters are escaped as well.
   */
  public static CharEscaper javascriptEscaper() {
    return JAVASCRIPT_ESCAPER;
  }

  /**
   * {@code CharEscaper} to escape javascript strings. Turns all non-ASCII
   * characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh).
   * The table below lists the ASCII characters that get an explicit escape;
   * everything else is handled by {@link JavascriptCharEscaper}.
   */
  private static final CharEscaper JAVASCRIPT_ESCAPER
      = new JavascriptCharEscaper(new CharEscaperBuilder()
          .addEscape('\'', "\\x27")
          .addEscape('"',  "\\x22")
          .addEscape('<',  "\\x3c")
          .addEscape('=',  "\\x3d")
          .addEscape('>',  "\\x3e")
          .addEscape('&',  "\\x26")
          .addEscape('\b', "\\b")
          .addEscape('\t', "\\t")
          .addEscape('\n', "\\n")
          .addEscape('\f', "\\f")
          .addEscape('\r', "\\r")
          .addEscape('\\', "\\\\")
          .toArray());

  /**
   * Creates a builder pre-loaded with the escapes shared by the XML escapers:
   * the ampersand and the two angle brackets are replaced by their predefined
   * XML entity references (amp, lt, gt), and the listed control characters
   * are mapped to the empty string, i.e. removed. Tab, line feed and carriage
   * return (octal 011, 012, 015) are deliberately absent from the removal
   * list.
   *
   * @return a new builder that callers may extend with further escapes
   */
  private static CharEscaperBuilder newBasicXmlEscapeBuilder() {
    return new CharEscaperBuilder()
        // The replacement must be the entity reference, not the raw character;
        // mapping a character to itself would leave the output unescaped.
        .addEscape('&', "&amp;")
        .addEscape('<', "&lt;")
        .addEscape('>', "&gt;")
        .addEscapes(new char[] {
            '\000', '\001', '\002', '\003', '\004',
            '\005', '\006', '\007', '\010', '\013',
            '\014', '\016', '\017', '\020', '\021',
            '\022', '\023', '\024', '\025', '\026',
            '\027', '\030', '\031', '\032', '\033',
            '\034', '\035', '\036', '\037'}, "");
  }

  /**
   * Returns a composite {@link CharEscaper} instance that tries to escape
   * characters using a primary {@code CharEscaper} first and falls back to a
   * secondary one if there is no escaping.
   *
   * <p>The returned escaper will attempt to escape each character using the
   * primary escaper, and if the primary escaper has no escaping for that
   * character, it will use the secondary escaper. If the secondary escaper has
   * no escaping for a character either, the original character will be used.
   * If the primary escaper has an escape for a character, the secondary escaper
   * will not be used at all for that character; the escaped output of the
   * primary is not run through the secondary. For a case where you would like
   * to first escape with one escaper, and then with another, it is recommended
   * that you call each escaper in order.
   *
   * @param primary The primary {@code CharEscaper} to use
   * @param secondary The secondary {@code CharEscaper} to use if the first one
   *     has no escaping rule for a character
   * @return a new escaper combining {@code primary} and {@code secondary}
   * @throws NullPointerException if any of the arguments is null
   */
  public static CharEscaper fallThrough(CharEscaper primary,
      CharEscaper secondary) {
    checkNotNull(primary);
    checkNotNull(secondary);
    return new FallThroughCharEscaper(primary, secondary);
  }

  /**
   * A fast {@link CharEscaper} that uses an array of replacement characters and
   * a range of safe characters. It overrides {@link #escape(String)} to improve
   * performance. Rough benchmarking shows that this almost doubles the speed
   * when processing strings that do not require escaping (providing the escape
   * test itself is efficient).
   */
  private abstract static class FastCharEscaper extends CharEscaper {

    /** Replacement table indexed by character value; null means no entry. */
    protected final char[][] replacements;
    /** Cached {@code replacements.length}. */
    protected final int replacementLength;
    /** Lowest character of the range that is left unescaped. */
    protected final char safeMin;
    /** Highest character of the range that is left unescaped. */
    protected final char safeMax;

    public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) {
      this.replacements = replacements;
      this.replacementLength = replacements.length;
      this.safeMin = safeMin;
      this.safeMax = safeMax;
    }

    /** Overridden for performance (see {@link FastCharEscaper}). */
    @Override public String escape(String s) {
      int slen = s.length();
      for (int index = 0; index < slen; index++) {
        char c = s.charAt(index);
        // Hand over to the slow path at the first character that either has a
        // table entry or lies outside the safe range.
        if ((c < replacementLength && replacements[c] != null)
            || c < safeMin || c > safeMax) {
          return escapeSlow(s, index);
        }
      }
      // Nothing needed escaping: return the input itself.
      return s;
    }
  }

  /**
   * Escaper for Java character escaping, contains both an array and a
   * backup function.  We're not overriding the array decorator because we
   * want to keep this as fast as possible, so no calls to super.escape first.
   */
  private static class JavaCharEscaper extends FastCharEscaper {

    public JavaCharEscaper(char[][] replacements) {
      super(replacements, ' ', '~');
    }

    /**
     * Returns the table entry for {@code c} if there is one, {@code null} for
     * characters in the safe range, a three-digit octal escape for other
     * characters up to 0xFF, and a four-digit {@code \\u} escape otherwise.
     */
    @Override protected char[] escape(char c) {
      // First check if our array has a valid escaping.
      if (c < replacementLength) {
        char[] r = replacements[c];
        if (r != null) {
          return r;
        }
      }

      // This range is un-escaped.
      if (safeMin <= c && c <= safeMax) {
        return null;
      }

      if (c <= 0xFF) {
        // Convert c to an octal-escaped string.
        // Equivalent to String.format("\\%03o", (int)c);
        // Only indices 0-7 of HEX_DIGITS are used here, i.e. octal digits.
        char[] r = new char[4];
        r[0] = '\\';
        r[3] = HEX_DIGITS[c & 7];
        c >>>= 3;
        r[2] = HEX_DIGITS[c & 7];
        c >>>= 3;
        r[1] = HEX_DIGITS[c & 7];
        return r;
      }

      // Convert c to a hex-escaped string.
      // Equivalent to String.format("\\u%04x", (int)c);
      char[] r = new char[6];
      r[0] = '\\';
      r[1] = 'u';
      r[5] = HEX_DIGITS[c & 15];
      c >>>= 4;
      r[4] = HEX_DIGITS[c & 15];
      c >>>= 4;
      r[3] = HEX_DIGITS[c & 15];
      c >>>= 4;
      r[2] = HEX_DIGITS[c & 15];
      return r;
    }
  }

  /**
   * Escaper for javascript character escaping, contains both an array and a
   * backup function. We're not overriding the array decorator because we
   * want to keep this as fast as possible, so no calls to super.escape first.
   */
  private static class JavascriptCharEscaper extends FastCharEscaper {

    public JavascriptCharEscaper(char[][] replacements) {
      super(replacements, ' ', '~');
    }

    /**
     * Returns the table entry for {@code c} if there is one, {@code null} for
     * characters in the safe range, a two-digit {@code \x} escape for other
     * characters below 0x100, and a four-digit {@code \\u} escape otherwise.
     */
    @Override protected char[] escape(char c) {
      // First check if our array has a valid escaping.
      if (c < replacementLength) {
        char[] r = replacements[c];
        if (r != null) {
          return r;
        }
      }

      // This range is unescaped.
      if (safeMin <= c && c <= safeMax) {
        return null;
      }

      // we can do a 2 digit hex escape for chars less than 0x100
      if (c < 0x100) {
        char[] r = new char[4];
        r[3] = HEX_DIGITS[c & 0xf];
        c >>>= 4;
        r[2] = HEX_DIGITS[c & 0xf];
        r[1] = 'x';
        r[0] = '\\';
        return r;
      }

      // 4 digit hex escape everything else
      char[] r = new char[6];
      r[5] = HEX_DIGITS[c & 0xf];
      c >>>= 4;
      r[4] = HEX_DIGITS[c & 0xf];
      c >>>= 4;
      r[3] = HEX_DIGITS[c & 0xf];
      c >>>= 4;
      r[2] = HEX_DIGITS[c & 0xf];
      r[1] = 'u';
      r[0] = '\\';
      return r;
    }
  }

  /**
   * Escaper for HTML character escaping, contains both an array and a
   * backup function.  We're not overriding the array decorator because we
   * want to keep this as fast as possible, so no calls to super.escape first.
   */
  private static class HtmlCharEscaper extends FastCharEscaper {

    public HtmlCharEscaper(char[][] replacements) {
      super(replacements, Character.MIN_VALUE, '~');
    }

    /**
     * Returns the table entry for {@code c} if there is one, {@code null} for
     * characters up to {@code '~'}, and otherwise an ampersand-hash sequence
     * holding the decimal value of {@code c}, terminated by a semicolon.
     */
    @Override protected char[] escape(char c) {
      // First check if our array has a valid escaping.
      if (c < replacementLength) {
        char[] r = replacements[c];
        if (r != null) {
          return r;
        }
      }

      // ~ is ASCII 126, the highest value char that does not need
      // to be escaped
      if (c <= safeMax) {
        return null;
      }

      // index is the position of the last decimal digit in the result:
      // two prefix characters followed by 3, 4 or 5 digits.
      int index;
      if (c < 1000) {
        index = 4;
      } else if (c < 10000) {
        index = 5;
      } else {
        index = 6;
      }
      char[] result = new char[index + 2];
      result[0] = '&';
      result[1] = '#';
      result[index + 1] = ';';

      // TODO(sven): Convert this to a sequence of shifts/additions
      // to avoid the division and modulo operators.
      // Fill the digits right to left; only entries 0-9 of HEX_DIGITS are
      // used, so the table doubles as a decimal digit table.
      int intValue = c;
      for (; index > 1; index--) {
        result[index] = HEX_DIGITS[intValue % 10];
        intValue /= 10;
      }
      return result;
    }
  }

  /**
   * A composite {@code CharEscaper} object that tries to escape characters
   * using a primary {@code CharEscaper} first and falls back to a secondary
   * one if there is no escaping.
   */
  private static class FallThroughCharEscaper extends CharEscaper {

    private final CharEscaper primary;
    private final CharEscaper secondary;

    public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) {
      this.primary = primary;
      this.secondary = secondary;
    }

    /**
     * Returns the primary escaper's result for {@code c}, or the secondary
     * escaper's result when the primary returns {@code null}.
     */
    @Override
    protected char[] escape(char c) {
      char[] result = primary.escape(c);
      if (result == null) {
        result = secondary.escape(c);
      }
      return result;
    }
  }

  /**
   * Lower-case hexadecimal digits indexed by value. The leading entries are
   * also used as octal and decimal digit tables by the escapers above.
   */
  private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
}