Home | History | Annotate | Download | only in utils
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one or more
      3  * contributor license agreements.  See the NOTICE file distributed with
      4  * this work for additional information regarding copyright ownership.
      5  * The ASF licenses this file to You under the Apache License, Version 2.0
      6  * (the "License"); you may not use this file except in compliance with
      7  * the License.  You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 package com.github.javaparser.utils;
     18 
     19 import java.io.IOException;
     20 import java.io.StringWriter;
     21 import java.io.Writer;
     22 import java.util.HashMap;
     23 import java.util.HashSet;
     24 
     25 /**
     26  * Adapted from apache commons-lang3 project.
     27  * <p>
     28  * Unescapes escaped chars in strings.
     29  */
     30 public class StringEscapeUtils {
     31 
     32     private StringEscapeUtils() {
     33     }
     34 
     35     /**
     36      * <p>Escapes the characters in a {@code String} using Java String rules.</p>
     37      * <p>
     38      * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
     39      * <p>
     40      * <p>So a tab becomes the characters {@code '\\'} and
     41      * {@code 't'}.</p>
     42      * <p>
     43      * <p>The only difference between Java strings and JavaScript strings
     44      * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
     45      * <p>
     46      * <p>Example:</p>
     47      * <pre>
     48      * input string: He didn't say, "Stop!"
     49      * output string: He didn't say, \"Stop!\"
     50      * </pre>
     51      *
     52      * @param input String to escape values in, may be null
     53      * @return String with escaped values, {@code null} if null string input
     54      */
     55     public static String escapeJava(final String input) {
     56         return ESCAPE_JAVA.translate(input);
     57     }
     58 
     59     /**
     60      * <p>Unescapes any Java literals found in the {@code String}.
     61      * For example, it will turn a sequence of {@code '\'} and
     62      * {@code 'n'} into a newline character, unless the {@code '\'}
     63      * is preceded by another {@code '\'}.</p>
     64      *
     65      * @param input the {@code String} to unescape, may be null
     66      * @return a new unescaped {@code String}, {@code null} if null string input
     67      */
     68     public static String unescapeJava(final String input) {
     69         return UNESCAPE_JAVA.translate(input);
     70     }
     71 
     72     private static final String[][] JAVA_CTRL_CHARS_UNESCAPE = {
     73             {"\\b", "\b"},
     74             {"\\n", "\n"},
     75             {"\\t", "\t"},
     76             {"\\f", "\f"},
     77             {"\\r", "\r"}
     78     };
     79 
     80     private static final String[][] JAVA_CTRL_CHARS_ESCAPE = {
     81             {"\b", "\\b"},
     82             {"\n", "\\n"},
     83             {"\t", "\\t"},
     84             {"\f", "\\f"},
     85             {"\r", "\\r"}
     86     };
     87 
     88     private static final CharSequenceTranslator ESCAPE_JAVA =
     89             new AggregateTranslator(
     90                     new LookupTranslator(
     91                             new String[][]{
     92                                     {"\"", "\\\""},
     93                                     {"\\", "\\\\"},
     94                             }),
     95                     new LookupTranslator(JAVA_CTRL_CHARS_ESCAPE.clone())
     96             );
     97 
     98     private static final CharSequenceTranslator UNESCAPE_JAVA =
     99             new AggregateTranslator(
    100                     new OctalUnescaper(),
    101                     new UnicodeUnescaper(),
    102                     new LookupTranslator(JAVA_CTRL_CHARS_UNESCAPE.clone()),
    103                     new LookupTranslator(
    104                             new String[][]{
    105                                     {"\\\\", "\\"},
    106                                     {"\\\"", "\""},
    107                                     {"\\'", "'"},
    108                                     {"\\", ""}
    109                             })
    110             );
    111 
    112     /**
    113      * Adapted from apache commons-lang3 project.
    114      * <p>
    115      * An API for translating text.
    116      * Its core use is to escape and unescape text. Because escaping and unescaping
    117      * is completely contextual, the API does not present two separate signatures.
    118      *
    119      * @since 3.0
    120      */
    121     private static abstract class CharSequenceTranslator {
    122 
    123         /**
    124          * Translate a set of codepoints, represented by an int index into a CharSequence,
    125          * into another set of codepoints. The number of codepoints consumed must be returned,
    126          * and the only IOExceptions thrown must be from interacting with the Writer so that
    127          * the top level API may reliably ignore StringWriter IOExceptions.
    128          *
    129          * @param input CharSequence that is being translated
    130          * @param index int representing the current point of translation
    131          * @param out Writer to translate the text to
    132          * @return int count of codepoints consumed
    133          * @throws IOException if and only if the Writer produces an IOException
    134          */
    135         public abstract int translate(CharSequence input, int index, Writer out) throws IOException;
    136 
    137         /**
    138          * Helper for non-Writer usage.
    139          *
    140          * @param input CharSequence to be translated
    141          * @return String output of translation
    142          */
    143         public final String translate(final CharSequence input) {
    144             if (input == null) {
    145                 return null;
    146             }
    147             try {
    148                 final StringWriter writer = new StringWriter(input.length() * 2);
    149                 translate(input, writer);
    150                 return writer.toString();
    151             } catch (final IOException ioe) {
    152                 // this should never ever happen while writing to a StringWriter
    153                 throw new RuntimeException(ioe);
    154             }
    155         }
    156 
    157         /**
    158          * Translate an input onto a Writer. This is intentionally final as its algorithm is
    159          * tightly coupled with the abstract method of this class.
    160          *
    161          * @param input CharSequence that is being translated
    162          * @param out Writer to translate the text to
    163          * @throws IOException if and only if the Writer produces an IOException
    164          */
    165         public final void translate(final CharSequence input, final Writer out) throws IOException {
    166             if (out == null) {
    167                 throw new IllegalArgumentException("The Writer must not be null");
    168             }
    169             if (input == null) {
    170                 return;
    171             }
    172             int pos = 0;
    173             final int len = input.length();
    174             while (pos < len) {
    175                 final int consumed = translate(input, pos, out);
    176                 if (consumed == 0) {
    177                     // inlined implementation of Character.toChars(Character.codePointAt(input, pos))
    178                     // avoids allocating temp char arrays and duplicate checks
    179                     char c1 = input.charAt(pos);
    180                     out.write(c1);
    181                     pos++;
    182                     if (Character.isHighSurrogate(c1) && pos < len) {
    183                         char c2 = input.charAt(pos);
    184                         if (Character.isLowSurrogate(c2)) {
    185                             out.write(c2);
    186                             pos++;
    187                         }
    188                     }
    189                     continue;
    190                 }
    191                 // contract with translators is that they have to understand codepoints
    192                 // and they just took care of a surrogate pair
    193                 for (int pt = 0; pt < consumed; pt++) {
    194                     pos += Character.charCount(Character.codePointAt(input, pos));
    195                 }
    196             }
    197         }
    198 
    199         /**
    200          * Helper method to create a merger of this translator with another set of
    201          * translators. Useful in customizing the standard functionality.
    202          *
    203          * @param translators CharSequenceTranslator array of translators to merge with this one
    204          * @return CharSequenceTranslator merging this translator with the others
    205          */
    206         public final CharSequenceTranslator with(final CharSequenceTranslator... translators) {
    207             final CharSequenceTranslator[] newArray = new CharSequenceTranslator[translators.length + 1];
    208             newArray[0] = this;
    209             System.arraycopy(translators, 0, newArray, 1, translators.length);
    210             return new AggregateTranslator(newArray);
    211         }
    212 
    213     }
    214 
    215     /**
    216      * Adapted from apache commons-lang3 project.
    217      * <p>
    218      * Translates a value using a lookup table.
    219      *
    220      * @since 3.0
    221      */
    222     private static class LookupTranslator extends CharSequenceTranslator {
    223 
    224         private final HashMap<String, String> lookupMap;
    225         private final HashSet<Character> prefixSet;
    226         private final int shortest;
    227         private final int longest;
    228 
    229         /**
    230          * Define the lookup table to be used in translation
    231          * <p>
    232          * Note that, as of Lang 3.1, the key to the lookup table is converted to a
    233          * java.lang.String. This is because we need the key to support hashCode and
    234          * equals(Object), allowing it to be the key for a HashMap. See LANG-882.
    235          *
    236          * @param lookup CharSequence[][] table of size [*][2]
    237          */
    238         public LookupTranslator(final CharSequence[]... lookup) {
    239             lookupMap = new HashMap<>();
    240             prefixSet = new HashSet<>();
    241             int _shortest = Integer.MAX_VALUE;
    242             int _longest = 0;
    243             if (lookup != null) {
    244                 for (final CharSequence[] seq : lookup) {
    245                     this.lookupMap.put(seq[0].toString(), seq[1].toString());
    246                     this.prefixSet.add(seq[0].charAt(0));
    247                     final int sz = seq[0].length();
    248                     if (sz < _shortest) {
    249                         _shortest = sz;
    250                     }
    251                     if (sz > _longest) {
    252                         _longest = sz;
    253                     }
    254                 }
    255             }
    256             shortest = _shortest;
    257             longest = _longest;
    258         }
    259 
    260         /**
    261          * {@inheritDoc}
    262          */
    263         @Override
    264         public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
    265             // check if translation exists for the input at position index
    266             if (prefixSet.contains(input.charAt(index))) {
    267                 int max = longest;
    268                 if (index + longest > input.length()) {
    269                     max = input.length() - index;
    270                 }
    271                 // implement greedy algorithm by trying maximum match first
    272                 for (int i = max; i >= shortest; i--) {
    273                     final CharSequence subSeq = input.subSequence(index, index + i);
    274                     final String result = lookupMap.get(subSeq.toString());
    275 
    276                     if (result != null) {
    277                         out.write(result);
    278                         return i;
    279                     }
    280                 }
    281             }
    282             return 0;
    283         }
    284     }
    285 
    286     /**
    287      * Adapted from apache commons-lang3 project.
    288      * <p>
    289      * Executes a sequence of translators one after the other. Execution ends whenever
    290      * the first translator consumes codepoints from the input.
    291      *
    292      * @since 3.0
    293      */
    294     private static class AggregateTranslator extends CharSequenceTranslator {
    295 
    296         private final CharSequenceTranslator[] translators;
    297 
    298         /**
    299          * Specify the translators to be used at creation time.
    300          *
    301          * @param translators CharSequenceTranslator array to aggregate
    302          */
    303         public AggregateTranslator(final CharSequenceTranslator... translators) {
    304             this.translators = translators == null ? null : translators.clone();
    305         }
    306 
    307         /**
    308          * The first translator to consume codepoints from the input is the 'winner'.
    309          * Execution stops with the number of consumed codepoints being returned.
    310          * {@inheritDoc}
    311          */
    312         @Override
    313         public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
    314             for (final CharSequenceTranslator translator : translators) {
    315                 final int consumed = translator.translate(input, index, out);
    316                 if (consumed != 0) {
    317                     return consumed;
    318                 }
    319             }
    320             return 0;
    321         }
    322 
    323     }
    324 
    325     /**
    326      * Adapted from apache commons-lang3 project.
    327      * <p>
    328      * Translate escaped octal Strings back to their octal values.
    329      * <p>
    330      * For example, "\45" should go back to being the specific value (a %).
    331      * <p>
    332      * Note that this currently only supports the viable range of octal for Java; namely
    333      * 1 to 377. This is because parsing Java is the main use case.
    334      *
    335      * @since 3.0
    336      */
    337     private static class OctalUnescaper extends CharSequenceTranslator {
    338 
    339         /**
    340          * {@inheritDoc}
    341          */
    342         @Override
    343         public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
    344             final int remaining = input.length() - index - 1; // how many characters left, ignoring the first \
    345             final StringBuilder builder = new StringBuilder();
    346             if (input.charAt(index) == '\\' && remaining > 0 && isOctalDigit(input.charAt(index + 1))) {
    347                 final int next = index + 1;
    348                 final int next2 = index + 2;
    349                 final int next3 = index + 3;
    350 
    351                 // we know this is good as we checked it in the if block above
    352                 builder.append(input.charAt(next));
    353 
    354                 if (remaining > 1 && isOctalDigit(input.charAt(next2))) {
    355                     builder.append(input.charAt(next2));
    356                     if (remaining > 2 && isZeroToThree(input.charAt(next)) && isOctalDigit(input.charAt(next3))) {
    357                         builder.append(input.charAt(next3));
    358                     }
    359                 }
    360 
    361                 out.write(Integer.parseInt(builder.toString(), 8));
    362                 return 1 + builder.length();
    363             }
    364             return 0;
    365         }
    366 
    367         /**
    368          * Checks if the given char is an octal digit. Octal digits are the character representations of the digits 0 to
    369          * 7.
    370          *
    371          * @param ch the char to check
    372          * @return true if the given char is the character representation of one of the digits from 0 to 7
    373          */
    374         private boolean isOctalDigit(final char ch) {
    375             return ch >= '0' && ch <= '7';
    376         }
    377 
    378         /**
    379          * Checks if the given char is the character representation of one of the digit from 0 to 3.
    380          *
    381          * @param ch the char to check
    382          * @return true if the given char is the character representation of one of the digits from 0 to 3
    383          */
    384         private boolean isZeroToThree(final char ch) {
    385             return ch >= '0' && ch <= '3';
    386         }
    387     }
    388 
    389     /**
    390      * Adapted from apache commons-lang3 project.
    391      * <p>
    392      * Translates escaped Unicode values of the form \\u+\d\d\d\d back to
    393      * Unicode. It supports multiple 'u' characters and will work with or
    394      * without the +.
    395      *
    396      * @since 3.0
    397      */
    398     private static class UnicodeUnescaper extends CharSequenceTranslator {
    399 
    400         /**
    401          * {@inheritDoc}
    402          */
    403         @Override
    404         public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
    405             if (input.charAt(index) == '\\' && index + 1 < input.length() && input.charAt(index + 1) == 'u') {
    406                 // consume optional additional 'u' chars
    407                 int i = 2;
    408                 while (index + i < input.length() && input.charAt(index + i) == 'u') {
    409                     i++;
    410                 }
    411 
    412                 if (index + i < input.length() && input.charAt(index + i) == '+') {
    413                     i++;
    414                 }
    415 
    416                 if (index + i + 4 <= input.length()) {
    417                     // Get 4 hex digits
    418                     final CharSequence unicode = input.subSequence(index + i, index + i + 4);
    419 
    420                     try {
    421                         final int value = Integer.parseInt(unicode.toString(), 16);
    422                         out.write((char) value);
    423                     } catch (final NumberFormatException nfe) {
    424                         throw new IllegalArgumentException("Unable to parse unicode value: " + unicode, nfe);
    425                     }
    426                     return i + 4;
    427                 }
    428                 throw new IllegalArgumentException("Less than 4 hex digits in unicode value: '" + input.subSequence(index, input.length())
    429                         + "' due to end of CharSequence");
    430             }
    431             return 0;
    432         }
    433     }
    434 
    435 }