1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ********************************************************************** 5 * Copyright (c) 2001-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/19/2001 aliu Creation. 10 ********************************************************************** 11 */ 12 package com.ibm.icu.text; 13 import com.ibm.icu.impl.Utility; 14 import com.ibm.icu.lang.UCharacter; 15 16 /** 17 * A transliterator that converts Unicode escape forms to the 18 * characters they represent. Escape forms have a prefix, a suffix, a 19 * radix, and minimum and maximum digit counts. 20 * 21 * <p>This class is package private. It registers several standard 22 * variants with the system which are then accessed via their IDs. 23 * 24 * @author Alan Liu 25 */ 26 class UnescapeTransliterator extends Transliterator { 27 28 /** 29 * The encoded pattern specification. The pattern consists of 30 * zero or more forms. Each form consists of a prefix, suffix, 31 * radix, minimum digit count, and maximum digit count. These 32 * values are stored as a five character header. That is, their 33 * numeric values are cast to 16-bit characters and stored in the 34 * string. Following these five characters, the prefix 35 * characters, then suffix characters are stored. Each form thus 36 * takes n+5 characters, where n is the total length of the prefix 37 * and suffix. The end is marked by a header of length one 38 * consisting of the character END. 39 */ 40 private char spec[]; 41 42 /** 43 * Special character marking the end of the spec[] array. 44 */ 45 private static final char END = 0xFFFF; 46 47 /** 48 * Registers standard variants with the system. Called by 49 * Transliterator during initialization. 50 */ 51 static void register() { 52 // Unicode: "U+10FFFF" hex, min=4, max=6 53 Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() { 54 @Override 55 public Transliterator getInstance(String ID) { 56 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] { 57 2, 0, 16, 4, 6, 'U', '+', 58 END 59 }); 60 } 61 }); 62 63 // Java: "\\uFFFF" hex, min=4, max=4 64 Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() { 65 @Override 66 public Transliterator getInstance(String ID) { 67 return new UnescapeTransliterator("Hex-Any/Java", new char[] { 68 2, 0, 16, 4, 4, '\\', 'u', 69 END 70 }); 71 } 72 }); 73 74 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 75 Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() { 76 @Override 77 public Transliterator getInstance(String ID) { 78 return new UnescapeTransliterator("Hex-Any/C", new char[] { 79 2, 0, 16, 4, 4, '\\', 'u', 80 2, 0, 16, 8, 8, '\\', 'U', 81 END 82 }); 83 } 84 }); 85 86 // XML: "" hex, min=1, max=6 87 Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() { 88 @Override 89 public Transliterator getInstance(String ID) { 90 return new UnescapeTransliterator("Hex-Any/XML", new char[] { 91 3, 1, 16, 1, 6, '&', '#', 'x', ';', 92 END 93 }); 94 } 95 }); 96 97 // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any") 98 Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() { 99 @Override 100 public Transliterator getInstance(String ID) { 101 return new UnescapeTransliterator("Hex-Any/XML10", new char[] { 102 2, 1, 10, 1, 7, '&', '#', ';', 103 END 104 }); 105 } 106 }); 107 108 // Perl: "\\x{263A}" hex, min=1, max=6 109 Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() { 110 @Override 111 public Transliterator getInstance(String ID) { 112 return new UnescapeTransliterator("Hex-Any/Perl", new char[] { 113 3, 1, 16, 1, 6, '\\', 'x', '{', '}', 114 END 115 }); 116 } 117 }); 118 119 // All: Java, C, Perl, XML, XML10, Unicode 120 Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() { 121 @Override 122 public Transliterator getInstance(String ID) { 123 return new UnescapeTransliterator("Hex-Any", new char[] { 124 2, 0, 16, 4, 6, 'U', '+', // Unicode 125 2, 0, 16, 4, 4, '\\', 'u', // Java 126 2, 0, 16, 8, 8, '\\', 'U', // C (surrogates) 127 3, 1, 16, 1, 6, '&', '#', 'x', ';', // XML 128 2, 1, 10, 1, 7, '&', '#', ';', // XML10 129 3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl 130 END 131 }); 132 } 133 }); 134 } 135 136 /** 137 * Package private constructor. Takes the encoded spec array. 138 */ 139 UnescapeTransliterator(String ID, char spec[]) { 140 super(ID, null); 141 this.spec = spec; 142 } 143 144 /** 145 * Implements {@link Transliterator#handleTransliterate}. 146 */ 147 @Override 148 protected void handleTransliterate(Replaceable text, 149 Position pos, boolean isIncremental) { 150 int start = pos.start; 151 int limit = pos.limit; 152 int i, ipat; 153 154 loop: 155 while (start < limit) { 156 // Loop over the forms in spec[]. Exit this loop when we 157 // match one of the specs. Exit the outer loop if a 158 // partial match is detected and isIncremental is true. 159 for (ipat = 0; spec[ipat] != END;) { 160 161 // Read the header 162 int prefixLen = spec[ipat++]; 163 int suffixLen = spec[ipat++]; 164 int radix = spec[ipat++]; 165 int minDigits = spec[ipat++]; 166 int maxDigits = spec[ipat++]; 167 168 // s is a copy of start that is advanced over the 169 // characters as we parse them. 170 int s = start; 171 boolean match = true; 172 173 for (i=0; i<prefixLen; ++i) { 174 if (s >= limit) { 175 if (i > 0) { 176 // We've already matched a character. This is 177 // a partial match, so we return if in 178 // incremental mode. In non-incremental mode, 179 // go to the next spec. 180 if (isIncremental) { 181 break loop; 182 } 183 match = false; 184 break; 185 } 186 } 187 char c = text.charAt(s++); 188 if (c != spec[ipat + i]) { 189 match = false; 190 break; 191 } 192 } 193 194 if (match) { 195 int u = 0; 196 int digitCount = 0; 197 for (;;) { 198 if (s >= limit) { 199 // Check for partial match in incremental mode. 200 if (s > start && isIncremental) { 201 break loop; 202 } 203 break; 204 } 205 int ch = text.char32At(s); 206 int digit = UCharacter.digit(ch, radix); 207 if (digit < 0) { 208 break; 209 } 210 s += UTF16.getCharCount(ch); 211 u = (u * radix) + digit; 212 if (++digitCount == maxDigits) { 213 break; 214 } 215 } 216 217 match = (digitCount >= minDigits); 218 219 if (match) { 220 for (i=0; i<suffixLen; ++i) { 221 if (s >= limit) { 222 // Check for partial match in incremental mode. 223 if (s > start && isIncremental) { 224 break loop; 225 } 226 match = false; 227 break; 228 } 229 char c = text.charAt(s++); 230 if (c != spec[ipat + prefixLen + i]) { 231 match = false; 232 break; 233 } 234 } 235 236 if (match) { 237 // At this point, we have a match 238 String str = UTF16.valueOf(u); 239 text.replace(start, s, str); 240 limit -= s - start - str.length(); 241 // The following break statement leaves the 242 // loop that is traversing the forms in 243 // spec[]. We then parse the next input 244 // character. 245 break; 246 } 247 } 248 } 249 250 ipat += prefixLen + suffixLen; 251 } 252 253 if (start < limit) { 254 start += UTF16.getCharCount(text.char32At(start)); 255 } 256 } 257 258 pos.contextLimit += limit - pos.limit; 259 pos.limit = limit; 260 pos.start = start; 261 } 262 263 /* (non-Javadoc) 264 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) 265 */ 266 @Override 267 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { 268 // Each form consists of a prefix, suffix, 269 // * radix, minimum digit count, and maximum digit count. These 270 // * values are stored as a five character header. ... 271 UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); 272 UnicodeSet items = new UnicodeSet(); 273 StringBuilder buffer = new StringBuilder(); 274 for (int i = 0; spec[i] != END;) { 275 // first 5 items are header 276 int end = i + spec[i] + spec[i+1] + 5; 277 int radix = spec[i+2]; 278 for (int j = 0; j < radix; ++j) { 279 Utility.appendNumber(buffer, j, radix, 0); 280 } 281 // then add the characters 282 for (int j = i + 5; j < end; ++j) { 283 items.add(spec[j]); 284 } 285 // and go to next block 286 i = end; 287 } 288 items.addAll(buffer.toString()); 289 items.retainAll(myFilter); 290 291 if (items.size() > 0) { 292 sourceSet.addAll(items); 293 targetSet.addAll(0,0x10FFFF); // assume we can produce any character 294 } 295 } 296 } 297