1 /* 2 ********************************************************************** 3 * Copyright (c) 2001-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/19/2001 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/uchar.h" 16 #include "unicode/utf16.h" 17 #include "unesctrn.h" 18 #include "util.h" 19 20 #include "cmemory.h" 21 22 U_NAMESPACE_BEGIN 23 24 /** 25 * Special character marking the end of the spec[] array. 26 */ 27 static const UChar END = 0xFFFF; 28 29 // Unicode: "U+10FFFF" hex, min=4, max=6 30 static const UChar SPEC_Unicode[] = { 31 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, 32 END 33 }; 34 35 // Java: "\\uFFFF" hex, min=4, max=4 36 static const UChar SPEC_Java[] = { 37 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 38 END 39 }; 40 41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 42 static const UChar SPEC_C[] = { 43 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 44 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, 45 END 46 }; 47 48 // XML: "" hex, min=1, max=6 49 static const UChar SPEC_XML[] = { 50 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, 51 END 52 }; 53 54 // XML10: "" dec, min=1, max=7 (not really "Hex-Any") 55 static const UChar SPEC_XML10[] = { 56 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, 57 END 58 }; 59 60 // Perl: "\\x{263A}" hex, min=1, max=6 61 static const UChar SPEC_Perl[] = { 62 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, 63 END 64 }; 65 66 // All: Java, C, Perl, XML, XML10, Unicode 67 static const UChar SPEC_Any[] = { 68 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode 69 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java 70 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) 71 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML 72 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 73 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl 74 END 75 }; 76 77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) 78 79 static UChar* copySpec(const UChar* spec) { 80 int32_t len = 0; 81 while (spec[len] != END) { 82 ++len; 83 } 84 ++len; 85 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar)); 86 // Check for memory allocation error. 87 if (result != NULL) { 88 uprv_memcpy(result, spec, len*sizeof(result[0])); 89 } 90 return result; 91 } 92 93 /** 94 * Factory methods. Ignore the context. 95 */ 96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { 97 return new UnescapeTransliterator(ID, SPEC_Unicode); 98 } 99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { 100 return new UnescapeTransliterator(ID, SPEC_Java); 101 } 102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { 103 return new UnescapeTransliterator(ID, SPEC_C); 104 } 105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { 106 return new UnescapeTransliterator(ID, SPEC_XML); 107 } 108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { 109 return new UnescapeTransliterator(ID, SPEC_XML10); 110 } 111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { 112 return new UnescapeTransliterator(ID, SPEC_Perl); 113 } 114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { 115 return new UnescapeTransliterator(ID, SPEC_Any); 116 } 117 118 /** 119 * Registers standard variants with the system. Called by 120 * Transliterator during initialization. 121 */ 122 void UnescapeTransliterator::registerIDs() { 123 Token t = integerToken(0); 124 125 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); 126 127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); 128 129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); 130 131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); 132 133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); 134 135 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); 136 137 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); 138 } 139 140 /** 141 * Constructor. Takes the encoded spec array. 142 */ 143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, 144 const UChar *newSpec) : 145 Transliterator(newID, NULL) 146 { 147 this->spec = copySpec(newSpec); 148 } 149 150 /** 151 * Copy constructor. 152 */ 153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : 154 Transliterator(o) { 155 this->spec = copySpec(o.spec); 156 } 157 158 UnescapeTransliterator::~UnescapeTransliterator() { 159 uprv_free(spec); 160 } 161 162 /** 163 * Transliterator API. 164 */ 165 Transliterator* UnescapeTransliterator::clone() const { 166 return new UnescapeTransliterator(*this); 167 } 168 169 /** 170 * Implements {@link Transliterator#handleTransliterate}. 171 */ 172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 173 UBool isIncremental) const { 174 int32_t start = pos.start; 175 int32_t limit = pos.limit; 176 int32_t i, j, ipat; 177 178 while (start < limit) { 179 // Loop over the forms in spec[]. Exit this loop when we 180 // match one of the specs. Exit the outer loop if a 181 // partial match is detected and isIncremental is true. 182 for (j=0, ipat=0; spec[ipat] != END; ++j) { 183 184 // Read the header 185 int32_t prefixLen = spec[ipat++]; 186 int32_t suffixLen = spec[ipat++]; 187 int8_t radix = (int8_t) spec[ipat++]; 188 int32_t minDigits = spec[ipat++]; 189 int32_t maxDigits = spec[ipat++]; 190 191 // s is a copy of start that is advanced over the 192 // characters as we parse them. 193 int32_t s = start; 194 UBool match = TRUE; 195 196 for (i=0; i<prefixLen; ++i) { 197 if (s >= limit) { 198 if (i > 0) { 199 // We've already matched a character. This is 200 // a partial match, so we return if in 201 // incremental mode. In non-incremental mode, 202 // go to the next spec. 203 if (isIncremental) { 204 goto exit; 205 } 206 match = FALSE; 207 break; 208 } 209 } 210 UChar c = text.charAt(s++); 211 if (c != spec[ipat + i]) { 212 match = FALSE; 213 break; 214 } 215 } 216 217 if (match) { 218 UChar32 u = 0; 219 int32_t digitCount = 0; 220 for (;;) { 221 if (s >= limit) { 222 // Check for partial match in incremental mode. 223 if (s > start && isIncremental) { 224 goto exit; 225 } 226 break; 227 } 228 UChar32 ch = text.char32At(s); 229 int32_t digit = u_digit(ch, radix); 230 if (digit < 0) { 231 break; 232 } 233 s += U16_LENGTH(ch); 234 u = (u * radix) + digit; 235 if (++digitCount == maxDigits) { 236 break; 237 } 238 } 239 240 match = (digitCount >= minDigits); 241 242 if (match) { 243 for (i=0; i<suffixLen; ++i) { 244 if (s >= limit) { 245 // Check for partial match in incremental mode. 246 if (s > start && isIncremental) { 247 goto exit; 248 } 249 match = FALSE; 250 break; 251 } 252 UChar c = text.charAt(s++); 253 if (c != spec[ipat + prefixLen + i]) { 254 match = FALSE; 255 break; 256 } 257 } 258 259 if (match) { 260 // At this point, we have a match 261 UnicodeString str(u); 262 text.handleReplaceBetween(start, s, str); 263 limit -= s - start - str.length(); 264 // The following break statement leaves the 265 // loop that is traversing the forms in 266 // spec[]. We then parse the next input 267 // character. 268 break; 269 } 270 } 271 } 272 273 ipat += prefixLen + suffixLen; 274 } 275 276 if (start < limit) { 277 start += U16_LENGTH(text.char32At(start)); 278 } 279 } 280 281 exit: 282 pos.contextLimit += limit - pos.limit; 283 pos.limit = limit; 284 pos.start = start; 285 } 286 287 U_NAMESPACE_END 288 289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 290 291 //eof 292