Home | History | Annotate | Download | only in escape
      1 /*
      2  * Copyright (C) 2009 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.escape;
     18 
     19 import static com.google.common.base.Preconditions.checkNotNull;
     20 
     21 import com.google.common.annotations.Beta;
     22 import com.google.common.annotations.GwtCompatible;
     23 
     24 import java.util.Map;
     25 
     26 import javax.annotation.Nullable;
     27 
     28 /**
     29  * A {@link UnicodeEscaper} that uses an array to quickly look up replacement
     30  * characters for a given code point. An additional safe range is provided that
     31  * determines whether code points without specific replacements are to be
     32  * considered safe and left unescaped or should be escaped in a general way.
     33  *
     34  * <p>A good example of usage of this class is for HTML escaping where the
     35  * replacement array contains information about the named HTML entities
     36  * such as {@code &amp;} and {@code &quot;} while {@link #escapeUnsafe} is
     37  * overridden to handle general escaping of the form {@code &#NNNNN;}.
     38  *
     39  * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is
     40  * proportional to the highest valued code point that requires escaping.
     41  * For example a replacement map containing the single character
     42  * '{@code \}{@code u1000}' will require approximately 16K of memory. If you
     43  * need to create multiple escaper instances that have the same character
     44  * replacement mapping consider using {@link ArrayBasedEscaperMap}.
     45  *
     46  * @author David Beaumont
     47  * @since 15.0
     48  */
     49 @Beta
     50 @GwtCompatible
     51 public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper {
     52   // The replacement array (see ArrayBasedEscaperMap).
     53   private final char[][] replacements;
     54   // The number of elements in the replacement array.
     55   private final int replacementsLength;
     56   // The first code point in the safe range.
     57   private final int safeMin;
     58   // The last code point in the safe range.
     59   private final int safeMax;
     60 
     61   // Cropped values used in the fast path range checks.
     62   private final char safeMinChar;
     63   private final char safeMaxChar;
     64 
     65   /**
     66    * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement
     67    * map and specified safe range. If {@code safeMax < safeMin} then no code
     68    * points are considered safe.
     69    *
     70    * <p>If a code point has no mapped replacement then it is checked against the
     71    * safe range. If it lies outside that, then {@link #escapeUnsafe} is
     72    * called, otherwise no escaping is performed.
     73    *
     74    * @param replacementMap a map of characters to their escaped representations
     75    * @param safeMin the lowest character value in the safe range
     76    * @param safeMax the highest character value in the safe range
     77    * @param unsafeReplacement the default replacement for unsafe characters or
     78    *     null if no default replacement is required
     79    */
     80   protected ArrayBasedUnicodeEscaper(Map<Character, String> replacementMap,
     81       int safeMin, int safeMax, @Nullable String unsafeReplacement) {
     82 
     83     this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax,
     84         unsafeReplacement);
     85   }
     86 
     87   /**
     88    * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement
     89    * map and specified safe range. If {@code safeMax < safeMin} then no code
     90    * points are considered safe. This initializer is useful when explicit
     91    * instances of ArrayBasedEscaperMap are used to allow the sharing of large
     92    * replacement mappings.
     93    *
     94    * <p>If a code point has no mapped replacement then it is checked against the
     95    * safe range. If it lies outside that, then {@link #escapeUnsafe} is
     96    * called, otherwise no escaping is performed.
     97    *
     98    * @param escaperMap the map of replacements
     99    * @param safeMin the lowest character value in the safe range
    100    * @param safeMax the highest character value in the safe range
    101    * @param unsafeReplacement the default replacement for unsafe characters or
    102    *     null if no default replacement is required
    103    */
    104   protected ArrayBasedUnicodeEscaper(ArrayBasedEscaperMap escaperMap,
    105       int safeMin, int safeMax, @Nullable String unsafeReplacement) {
    106 
    107     checkNotNull(escaperMap);  // GWT specific check (do not optimize)
    108     this.replacements = escaperMap.getReplacementArray();
    109     this.replacementsLength = replacements.length;
    110     if (safeMax < safeMin) {
    111       // If the safe range is empty, set the range limits to opposite extremes
    112       // to ensure the first test of either value will fail.
    113       safeMax = -1;
    114       safeMin = Integer.MAX_VALUE;
    115     }
    116     this.safeMin = safeMin;
    117     this.safeMax = safeMax;
    118 
    119     // This is a bit of a hack but lets us do quicker per-character checks in
    120     // the fast path code. The safe min/max values are very unlikely to extend
    121     // into the range of surrogate characters, but if they do we must not test
    122     // any values in that range. To see why, consider the case where:
    123     //   safeMin <= {hi,lo} <= safeMax
    124     // where {hi,lo} are characters forming a surrogate pair such that:
    125     //   codePointOf(hi, lo) > safeMax
    126     // which would result in the surrogate pair being (wrongly) considered safe.
    127     // If we clip the safe range used during the per-character tests so it is
    128     // below the values of characters in surrogate pairs, this cannot occur.
    129     // This approach does mean that we break out of the fast path code in cases
    130     // where we don't strictly need to, but this situation will almost never
    131     // occur in practice.
    132     if (safeMin >= Character.MIN_HIGH_SURROGATE) {
    133       // The safe range is empty or the all safe code points lie in or above the
    134       // surrogate range. Either way the character range is empty.
    135       this.safeMinChar = Character.MAX_VALUE;
    136       this.safeMaxChar = 0;
    137     } else {
    138       // The safe range is non empty and contains values below the surrogate
    139       // range but may extend above it. We may need to clip the maximum value.
    140       this.safeMinChar = (char) safeMin;
    141       this.safeMaxChar = (char) Math.min(safeMax,
    142                                          Character.MIN_HIGH_SURROGATE - 1);
    143     }
    144   }
    145 
    146   /*
    147    * This is overridden to improve performance. Rough benchmarking shows that
    148    * this almost doubles the speed when processing strings that do not require
    149    * any escaping.
    150    */
    151   @Override
    152   public final String escape(String s) {
    153     checkNotNull(s);  // GWT specific check (do not optimize)
    154     for (int i = 0; i < s.length(); i++) {
    155       char c = s.charAt(i);
    156       if ((c < replacementsLength && replacements[c] != null) ||
    157           c > safeMaxChar || c < safeMinChar) {
    158         return escapeSlow(s, i);
    159       }
    160     }
    161     return s;
    162   }
    163 
    164   /* Overridden for performance. */
    165   @Override
    166   protected final int nextEscapeIndex(CharSequence csq, int index, int end) {
    167     while (index < end) {
    168       char c = csq.charAt(index);
    169       if ((c < replacementsLength && replacements[c] != null) ||
    170           c > safeMaxChar || c < safeMinChar) {
    171         break;
    172       }
    173       index++;
    174     }
    175     return index;
    176   }
    177 
    178   /**
    179    * Escapes a single Unicode code point using the replacement array and safe
    180    * range values. If the given character does not have an explicit replacement
    181    * and lies outside the safe range then {@link #escapeUnsafe} is called.
    182    */
    183   @Override
    184   protected final char[] escape(int cp) {
    185     if (cp < replacementsLength) {
    186       char[] chars = replacements[cp];
    187       if (chars != null) {
    188         return chars;
    189       }
    190     }
    191     if (cp >= safeMin && cp <= safeMax) {
    192       return null;
    193     }
    194     return escapeUnsafe(cp);
    195   }
    196 
    197   /**
    198    * Escapes a code point that has no direct explicit value in the replacement
    199    * array and lies outside the stated safe range. Subclasses should override
    200    * this method to provide generalized escaping for code points if required.
    201    *
    202    * <p>Note that arrays returned by this method must not be modified once they
    203    * have been returned. However it is acceptable to return the same array
    204    * multiple times (even for different input characters).
    205    *
    206    * @param cp the Unicode code point to escape
    207    * @return the replacement characters, or {@code null} if no escaping was
    208    *         required
    209    */
    210   protected abstract char[] escapeUnsafe(int cp);
    211 }
    212