1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package com.github.javaparser.utils; 18 19 import java.io.IOException; 20 import java.io.StringWriter; 21 import java.io.Writer; 22 import java.util.HashMap; 23 import java.util.HashSet; 24 25 /** 26 * Adapted from apache commons-lang3 project. 27 * <p> 28 * Unescapes escaped chars in strings. 29 */ 30 public class StringEscapeUtils { 31 32 private StringEscapeUtils() { 33 } 34 35 /** 36 * <p>Escapes the characters in a {@code String} using Java String rules.</p> 37 * <p> 38 * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p> 39 * <p> 40 * <p>So a tab becomes the characters {@code '\\'} and 41 * {@code 't'}.</p> 42 * <p> 43 * <p>The only difference between Java strings and JavaScript strings 44 * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p> 45 * <p> 46 * <p>Example:</p> 47 * <pre> 48 * input string: He didn't say, "Stop!" 49 * output string: He didn't say, \"Stop!\" 50 * </pre> 51 * 52 * @param input String to escape values in, may be null 53 * @return String with escaped values, {@code null} if null string input 54 */ 55 public static String escapeJava(final String input) { 56 return ESCAPE_JAVA.translate(input); 57 } 58 59 /** 60 * <p>Unescapes any Java literals found in the {@code String}. 61 * For example, it will turn a sequence of {@code '\'} and 62 * {@code 'n'} into a newline character, unless the {@code '\'} 63 * is preceded by another {@code '\'}.</p> 64 * 65 * @param input the {@code String} to unescape, may be null 66 * @return a new unescaped {@code String}, {@code null} if null string input 67 */ 68 public static String unescapeJava(final String input) { 69 return UNESCAPE_JAVA.translate(input); 70 } 71 72 private static final String[][] JAVA_CTRL_CHARS_UNESCAPE = { 73 {"\\b", "\b"}, 74 {"\\n", "\n"}, 75 {"\\t", "\t"}, 76 {"\\f", "\f"}, 77 {"\\r", "\r"} 78 }; 79 80 private static final String[][] JAVA_CTRL_CHARS_ESCAPE = { 81 {"\b", "\\b"}, 82 {"\n", "\\n"}, 83 {"\t", "\\t"}, 84 {"\f", "\\f"}, 85 {"\r", "\\r"} 86 }; 87 88 private static final CharSequenceTranslator ESCAPE_JAVA = 89 new AggregateTranslator( 90 new LookupTranslator( 91 new String[][]{ 92 {"\"", "\\\""}, 93 {"\\", "\\\\"}, 94 }), 95 new LookupTranslator(JAVA_CTRL_CHARS_ESCAPE.clone()) 96 ); 97 98 private static final CharSequenceTranslator UNESCAPE_JAVA = 99 new AggregateTranslator( 100 new OctalUnescaper(), 101 new UnicodeUnescaper(), 102 new LookupTranslator(JAVA_CTRL_CHARS_UNESCAPE.clone()), 103 new LookupTranslator( 104 new String[][]{ 105 {"\\\\", "\\"}, 106 {"\\\"", "\""}, 107 {"\\'", "'"}, 108 {"\\", ""} 109 }) 110 ); 111 112 /** 113 * Adapted from apache commons-lang3 project. 114 * <p> 115 * An API for translating text. 116 * Its core use is to escape and unescape text. Because escaping and unescaping 117 * is completely contextual, the API does not present two separate signatures. 118 * 119 * @since 3.0 120 */ 121 private static abstract class CharSequenceTranslator { 122 123 /** 124 * Translate a set of codepoints, represented by an int index into a CharSequence, 125 * into another set of codepoints. The number of codepoints consumed must be returned, 126 * and the only IOExceptions thrown must be from interacting with the Writer so that 127 * the top level API may reliably ignore StringWriter IOExceptions. 128 * 129 * @param input CharSequence that is being translated 130 * @param index int representing the current point of translation 131 * @param out Writer to translate the text to 132 * @return int count of codepoints consumed 133 * @throws IOException if and only if the Writer produces an IOException 134 */ 135 public abstract int translate(CharSequence input, int index, Writer out) throws IOException; 136 137 /** 138 * Helper for non-Writer usage. 139 * 140 * @param input CharSequence to be translated 141 * @return String output of translation 142 */ 143 public final String translate(final CharSequence input) { 144 if (input == null) { 145 return null; 146 } 147 try { 148 final StringWriter writer = new StringWriter(input.length() * 2); 149 translate(input, writer); 150 return writer.toString(); 151 } catch (final IOException ioe) { 152 // this should never ever happen while writing to a StringWriter 153 throw new RuntimeException(ioe); 154 } 155 } 156 157 /** 158 * Translate an input onto a Writer. This is intentionally final as its algorithm is 159 * tightly coupled with the abstract method of this class. 160 * 161 * @param input CharSequence that is being translated 162 * @param out Writer to translate the text to 163 * @throws IOException if and only if the Writer produces an IOException 164 */ 165 public final void translate(final CharSequence input, final Writer out) throws IOException { 166 if (out == null) { 167 throw new IllegalArgumentException("The Writer must not be null"); 168 } 169 if (input == null) { 170 return; 171 } 172 int pos = 0; 173 final int len = input.length(); 174 while (pos < len) { 175 final int consumed = translate(input, pos, out); 176 if (consumed == 0) { 177 // inlined implementation of Character.toChars(Character.codePointAt(input, pos)) 178 // avoids allocating temp char arrays and duplicate checks 179 char c1 = input.charAt(pos); 180 out.write(c1); 181 pos++; 182 if (Character.isHighSurrogate(c1) && pos < len) { 183 char c2 = input.charAt(pos); 184 if (Character.isLowSurrogate(c2)) { 185 out.write(c2); 186 pos++; 187 } 188 } 189 continue; 190 } 191 // contract with translators is that they have to understand codepoints 192 // and they just took care of a surrogate pair 193 for (int pt = 0; pt < consumed; pt++) { 194 pos += Character.charCount(Character.codePointAt(input, pos)); 195 } 196 } 197 } 198 199 /** 200 * Helper method to create a merger of this translator with another set of 201 * translators. Useful in customizing the standard functionality. 202 * 203 * @param translators CharSequenceTranslator array of translators to merge with this one 204 * @return CharSequenceTranslator merging this translator with the others 205 */ 206 public final CharSequenceTranslator with(final CharSequenceTranslator... translators) { 207 final CharSequenceTranslator[] newArray = new CharSequenceTranslator[translators.length + 1]; 208 newArray[0] = this; 209 System.arraycopy(translators, 0, newArray, 1, translators.length); 210 return new AggregateTranslator(newArray); 211 } 212 213 } 214 215 /** 216 * Adapted from apache commons-lang3 project. 217 * <p> 218 * Translates a value using a lookup table. 219 * 220 * @since 3.0 221 */ 222 private static class LookupTranslator extends CharSequenceTranslator { 223 224 private final HashMap<String, String> lookupMap; 225 private final HashSet<Character> prefixSet; 226 private final int shortest; 227 private final int longest; 228 229 /** 230 * Define the lookup table to be used in translation 231 * <p> 232 * Note that, as of Lang 3.1, the key to the lookup table is converted to a 233 * java.lang.String. This is because we need the key to support hashCode and 234 * equals(Object), allowing it to be the key for a HashMap. See LANG-882. 235 * 236 * @param lookup CharSequence[][] table of size [*][2] 237 */ 238 public LookupTranslator(final CharSequence[]... lookup) { 239 lookupMap = new HashMap<>(); 240 prefixSet = new HashSet<>(); 241 int _shortest = Integer.MAX_VALUE; 242 int _longest = 0; 243 if (lookup != null) { 244 for (final CharSequence[] seq : lookup) { 245 this.lookupMap.put(seq[0].toString(), seq[1].toString()); 246 this.prefixSet.add(seq[0].charAt(0)); 247 final int sz = seq[0].length(); 248 if (sz < _shortest) { 249 _shortest = sz; 250 } 251 if (sz > _longest) { 252 _longest = sz; 253 } 254 } 255 } 256 shortest = _shortest; 257 longest = _longest; 258 } 259 260 /** 261 * {@inheritDoc} 262 */ 263 @Override 264 public int translate(final CharSequence input, final int index, final Writer out) throws IOException { 265 // check if translation exists for the input at position index 266 if (prefixSet.contains(input.charAt(index))) { 267 int max = longest; 268 if (index + longest > input.length()) { 269 max = input.length() - index; 270 } 271 // implement greedy algorithm by trying maximum match first 272 for (int i = max; i >= shortest; i--) { 273 final CharSequence subSeq = input.subSequence(index, index + i); 274 final String result = lookupMap.get(subSeq.toString()); 275 276 if (result != null) { 277 out.write(result); 278 return i; 279 } 280 } 281 } 282 return 0; 283 } 284 } 285 286 /** 287 * Adapted from apache commons-lang3 project. 288 * <p> 289 * Executes a sequence of translators one after the other. Execution ends whenever 290 * the first translator consumes codepoints from the input. 291 * 292 * @since 3.0 293 */ 294 private static class AggregateTranslator extends CharSequenceTranslator { 295 296 private final CharSequenceTranslator[] translators; 297 298 /** 299 * Specify the translators to be used at creation time. 300 * 301 * @param translators CharSequenceTranslator array to aggregate 302 */ 303 public AggregateTranslator(final CharSequenceTranslator... translators) { 304 this.translators = translators == null ? null : translators.clone(); 305 } 306 307 /** 308 * The first translator to consume codepoints from the input is the 'winner'. 309 * Execution stops with the number of consumed codepoints being returned. 310 * {@inheritDoc} 311 */ 312 @Override 313 public int translate(final CharSequence input, final int index, final Writer out) throws IOException { 314 for (final CharSequenceTranslator translator : translators) { 315 final int consumed = translator.translate(input, index, out); 316 if (consumed != 0) { 317 return consumed; 318 } 319 } 320 return 0; 321 } 322 323 } 324 325 /** 326 * Adapted from apache commons-lang3 project. 327 * <p> 328 * Translate escaped octal Strings back to their octal values. 329 * <p> 330 * For example, "\45" should go back to being the specific value (a %). 331 * <p> 332 * Note that this currently only supports the viable range of octal for Java; namely 333 * 1 to 377. This is because parsing Java is the main use case. 334 * 335 * @since 3.0 336 */ 337 private static class OctalUnescaper extends CharSequenceTranslator { 338 339 /** 340 * {@inheritDoc} 341 */ 342 @Override 343 public int translate(final CharSequence input, final int index, final Writer out) throws IOException { 344 final int remaining = input.length() - index - 1; // how many characters left, ignoring the first \ 345 final StringBuilder builder = new StringBuilder(); 346 if (input.charAt(index) == '\\' && remaining > 0 && isOctalDigit(input.charAt(index + 1))) { 347 final int next = index + 1; 348 final int next2 = index + 2; 349 final int next3 = index + 3; 350 351 // we know this is good as we checked it in the if block above 352 builder.append(input.charAt(next)); 353 354 if (remaining > 1 && isOctalDigit(input.charAt(next2))) { 355 builder.append(input.charAt(next2)); 356 if (remaining > 2 && isZeroToThree(input.charAt(next)) && isOctalDigit(input.charAt(next3))) { 357 builder.append(input.charAt(next3)); 358 } 359 } 360 361 out.write(Integer.parseInt(builder.toString(), 8)); 362 return 1 + builder.length(); 363 } 364 return 0; 365 } 366 367 /** 368 * Checks if the given char is an octal digit. Octal digits are the character representations of the digits 0 to 369 * 7. 370 * 371 * @param ch the char to check 372 * @return true if the given char is the character representation of one of the digits from 0 to 7 373 */ 374 private boolean isOctalDigit(final char ch) { 375 return ch >= '0' && ch <= '7'; 376 } 377 378 /** 379 * Checks if the given char is the character representation of one of the digit from 0 to 3. 380 * 381 * @param ch the char to check 382 * @return true if the given char is the character representation of one of the digits from 0 to 3 383 */ 384 private boolean isZeroToThree(final char ch) { 385 return ch >= '0' && ch <= '3'; 386 } 387 } 388 389 /** 390 * Adapted from apache commons-lang3 project. 391 * <p> 392 * Translates escaped Unicode values of the form \\u+\d\d\d\d back to 393 * Unicode. It supports multiple 'u' characters and will work with or 394 * without the +. 395 * 396 * @since 3.0 397 */ 398 private static class UnicodeUnescaper extends CharSequenceTranslator { 399 400 /** 401 * {@inheritDoc} 402 */ 403 @Override 404 public int translate(final CharSequence input, final int index, final Writer out) throws IOException { 405 if (input.charAt(index) == '\\' && index + 1 < input.length() && input.charAt(index + 1) == 'u') { 406 // consume optional additional 'u' chars 407 int i = 2; 408 while (index + i < input.length() && input.charAt(index + i) == 'u') { 409 i++; 410 } 411 412 if (index + i < input.length() && input.charAt(index + i) == '+') { 413 i++; 414 } 415 416 if (index + i + 4 <= input.length()) { 417 // Get 4 hex digits 418 final CharSequence unicode = input.subSequence(index + i, index + i + 4); 419 420 try { 421 final int value = Integer.parseInt(unicode.toString(), 16); 422 out.write((char) value); 423 } catch (final NumberFormatException nfe) { 424 throw new IllegalArgumentException("Unable to parse unicode value: " + unicode, nfe); 425 } 426 return i + 4; 427 } 428 throw new IllegalArgumentException("Less than 4 hex digits in unicode value: '" + input.subSequence(index, input.length()) 429 + "' due to end of CharSequence"); 430 } 431 return 0; 432 } 433 } 434 435 }