1 /* 2 * Copyright (C) 2009 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.escape; 18 19 import static com.google.common.base.Preconditions.checkNotNull; 20 21 import com.google.common.annotations.Beta; 22 import com.google.common.annotations.GwtCompatible; 23 24 import java.util.Map; 25 26 import javax.annotation.Nullable; 27 28 /** 29 * A {@link UnicodeEscaper} that uses an array to quickly look up replacement 30 * characters for a given code point. An additional safe range is provided that 31 * determines whether code points without specific replacements are to be 32 * considered safe and left unescaped or should be escaped in a general way. 33 * 34 * <p>A good example of usage of this class is for HTML escaping where the 35 * replacement array contains information about the named HTML entities 36 * such as {@code &} and {@code "} while {@link #escapeUnsafe} is 37 * overridden to handle general escaping of the form {@code &#NNNNN;}. 38 * 39 * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is 40 * proportional to the highest valued code point that requires escaping. 41 * For example a replacement map containing the single character 42 * '{@code \}{@code u1000}' will require approximately 16K of memory. If you 43 * need to create multiple escaper instances that have the same character 44 * replacement mapping consider using {@link ArrayBasedEscaperMap}. 45 * 46 * @author David Beaumont 47 * @since 15.0 48 */ 49 @Beta 50 @GwtCompatible 51 public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper { 52 // The replacement array (see ArrayBasedEscaperMap). 53 private final char[][] replacements; 54 // The number of elements in the replacement array. 55 private final int replacementsLength; 56 // The first code point in the safe range. 57 private final int safeMin; 58 // The last code point in the safe range. 59 private final int safeMax; 60 61 // Cropped values used in the fast path range checks. 62 private final char safeMinChar; 63 private final char safeMaxChar; 64 65 /** 66 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement 67 * map and specified safe range. If {@code safeMax < safeMin} then no code 68 * points are considered safe. 69 * 70 * <p>If a code point has no mapped replacement then it is checked against the 71 * safe range. If it lies outside that, then {@link #escapeUnsafe} is 72 * called, otherwise no escaping is performed. 73 * 74 * @param replacementMap a map of characters to their escaped representations 75 * @param safeMin the lowest character value in the safe range 76 * @param safeMax the highest character value in the safe range 77 * @param unsafeReplacement the default replacement for unsafe characters or 78 * null if no default replacement is required 79 */ 80 protected ArrayBasedUnicodeEscaper(Map<Character, String> replacementMap, 81 int safeMin, int safeMax, @Nullable String unsafeReplacement) { 82 83 this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax, 84 unsafeReplacement); 85 } 86 87 /** 88 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement 89 * map and specified safe range. If {@code safeMax < safeMin} then no code 90 * points are considered safe. This initializer is useful when explicit 91 * instances of ArrayBasedEscaperMap are used to allow the sharing of large 92 * replacement mappings. 93 * 94 * <p>If a code point has no mapped replacement then it is checked against the 95 * safe range. If it lies outside that, then {@link #escapeUnsafe} is 96 * called, otherwise no escaping is performed. 97 * 98 * @param escaperMap the map of replacements 99 * @param safeMin the lowest character value in the safe range 100 * @param safeMax the highest character value in the safe range 101 * @param unsafeReplacement the default replacement for unsafe characters or 102 * null if no default replacement is required 103 */ 104 protected ArrayBasedUnicodeEscaper(ArrayBasedEscaperMap escaperMap, 105 int safeMin, int safeMax, @Nullable String unsafeReplacement) { 106 107 checkNotNull(escaperMap); // GWT specific check (do not optimize) 108 this.replacements = escaperMap.getReplacementArray(); 109 this.replacementsLength = replacements.length; 110 if (safeMax < safeMin) { 111 // If the safe range is empty, set the range limits to opposite extremes 112 // to ensure the first test of either value will fail. 113 safeMax = -1; 114 safeMin = Integer.MAX_VALUE; 115 } 116 this.safeMin = safeMin; 117 this.safeMax = safeMax; 118 119 // This is a bit of a hack but lets us do quicker per-character checks in 120 // the fast path code. The safe min/max values are very unlikely to extend 121 // into the range of surrogate characters, but if they do we must not test 122 // any values in that range. To see why, consider the case where: 123 // safeMin <= {hi,lo} <= safeMax 124 // where {hi,lo} are characters forming a surrogate pair such that: 125 // codePointOf(hi, lo) > safeMax 126 // which would result in the surrogate pair being (wrongly) considered safe. 127 // If we clip the safe range used during the per-character tests so it is 128 // below the values of characters in surrogate pairs, this cannot occur. 129 // This approach does mean that we break out of the fast path code in cases 130 // where we don't strictly need to, but this situation will almost never 131 // occur in practice. 132 if (safeMin >= Character.MIN_HIGH_SURROGATE) { 133 // The safe range is empty or the all safe code points lie in or above the 134 // surrogate range. Either way the character range is empty. 135 this.safeMinChar = Character.MAX_VALUE; 136 this.safeMaxChar = 0; 137 } else { 138 // The safe range is non empty and contains values below the surrogate 139 // range but may extend above it. We may need to clip the maximum value. 140 this.safeMinChar = (char) safeMin; 141 this.safeMaxChar = (char) Math.min(safeMax, 142 Character.MIN_HIGH_SURROGATE - 1); 143 } 144 } 145 146 /* 147 * This is overridden to improve performance. Rough benchmarking shows that 148 * this almost doubles the speed when processing strings that do not require 149 * any escaping. 150 */ 151 @Override 152 public final String escape(String s) { 153 checkNotNull(s); // GWT specific check (do not optimize) 154 for (int i = 0; i < s.length(); i++) { 155 char c = s.charAt(i); 156 if ((c < replacementsLength && replacements[c] != null) || 157 c > safeMaxChar || c < safeMinChar) { 158 return escapeSlow(s, i); 159 } 160 } 161 return s; 162 } 163 164 /* Overridden for performance. */ 165 @Override 166 protected final int nextEscapeIndex(CharSequence csq, int index, int end) { 167 while (index < end) { 168 char c = csq.charAt(index); 169 if ((c < replacementsLength && replacements[c] != null) || 170 c > safeMaxChar || c < safeMinChar) { 171 break; 172 } 173 index++; 174 } 175 return index; 176 } 177 178 /** 179 * Escapes a single Unicode code point using the replacement array and safe 180 * range values. If the given character does not have an explicit replacement 181 * and lies outside the safe range then {@link #escapeUnsafe} is called. 182 */ 183 @Override 184 protected final char[] escape(int cp) { 185 if (cp < replacementsLength) { 186 char[] chars = replacements[cp]; 187 if (chars != null) { 188 return chars; 189 } 190 } 191 if (cp >= safeMin && cp <= safeMax) { 192 return null; 193 } 194 return escapeUnsafe(cp); 195 } 196 197 /** 198 * Escapes a code point that has no direct explicit value in the replacement 199 * array and lies outside the stated safe range. Subclasses should override 200 * this method to provide generalized escaping for code points if required. 201 * 202 * <p>Note that arrays returned by this method must not be modified once they 203 * have been returned. However it is acceptable to return the same array 204 * multiple times (even for different input characters). 205 * 206 * @param cp the Unicode code point to escape 207 * @return the replacement characters, or {@code null} if no escaping was 208 * required 209 */ 210 protected abstract char[] escapeUnsafe(int cp); 211 } 212