1 /* 2 ********************************************************************** 3 * Copyright (c) 2001-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/19/2001 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/unimatch.h" 12 #include "unicode/utf16.h" 13 #include "patternprops.h" 14 #include "util.h" 15 16 // Define UChar constants using hex for EBCDIC compatibility 17 18 static const UChar BACKSLASH = 0x005C; /*\*/ 19 static const UChar UPPER_U = 0x0055; /*U*/ 20 static const UChar LOWER_U = 0x0075; /*u*/ 21 static const UChar APOSTROPHE = 0x0027; // '\'' 22 static const UChar SPACE = 0x0020; // ' ' 23 24 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 25 static const UChar DIGITS[] = { 26 48,49,50,51,52,53,54,55,56,57, 27 65,66,67,68,69,70,71,72,73,74, 28 75,76,77,78,79,80,81,82,83,84, 29 85,86,87,88,89,90 30 }; 31 32 U_NAMESPACE_BEGIN 33 34 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n, 35 int32_t radix, int32_t minDigits) { 36 if (radix < 2 || radix > 36) { 37 // Bogus radix 38 return result.append((UChar)63/*?*/); 39 } 40 // Handle negatives 41 if (n < 0) { 42 n = -n; 43 result.append((UChar)45/*-*/); 44 } 45 // First determine the number of digits 46 int32_t nn = n; 47 int32_t r = 1; 48 while (nn >= radix) { 49 nn /= radix; 50 r *= radix; 51 --minDigits; 52 } 53 // Now generate the digits 54 while (--minDigits > 0) { 55 result.append(DIGITS[0]); 56 } 57 while (r > 0) { 58 int32_t digit = n / r; 59 result.append(DIGITS[digit]); 60 n -= digit * r; 61 r /= radix; 62 } 63 return result; 64 } 65 66 /** 67 * Return true if the character is NOT printable ASCII. 68 */ 69 UBool ICU_Utility::isUnprintable(UChar32 c) { 70 return !(c >= 0x20 && c <= 0x7E); 71 } 72 73 /** 74 * Escape unprintable characters using \uxxxx notation for U+0000 to 75 * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is 76 * printable ASCII, then do nothing and return FALSE. Otherwise, 77 * append the escaped notation and return TRUE. 78 */ 79 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) { 80 if (isUnprintable(c)) { 81 result.append(BACKSLASH); 82 if (c & ~0xFFFF) { 83 result.append(UPPER_U); 84 result.append(DIGITS[0xF&(c>>28)]); 85 result.append(DIGITS[0xF&(c>>24)]); 86 result.append(DIGITS[0xF&(c>>20)]); 87 result.append(DIGITS[0xF&(c>>16)]); 88 } else { 89 result.append(LOWER_U); 90 } 91 result.append(DIGITS[0xF&(c>>12)]); 92 result.append(DIGITS[0xF&(c>>8)]); 93 result.append(DIGITS[0xF&(c>>4)]); 94 result.append(DIGITS[0xF&c]); 95 return TRUE; 96 } 97 return FALSE; 98 } 99 100 /** 101 * Returns the index of a character, ignoring quoted text. 102 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 103 * found by a search for 'h'. 104 */ 105 // FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 106 /* 107 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text, 108 int32_t start, int32_t limit, 109 UChar charToFind) { 110 for (int32_t i=start; i<limit; ++i) { 111 UChar c = text.charAt(i); 112 if (c == BACKSLASH) { 113 ++i; 114 } else if (c == APOSTROPHE) { 115 while (++i < limit 116 && text.charAt(i) != APOSTROPHE) {} 117 } else if (c == charToFind) { 118 return i; 119 } 120 } 121 return -1; 122 } 123 */ 124 125 /** 126 * Skip over a sequence of zero or more white space characters at pos. 127 * @param advance if true, advance pos to the first non-white-space 128 * character at or after pos, or str.length(), if there is none. 129 * Otherwise leave pos unchanged. 130 * @return the index of the first non-white-space character at or 131 * after pos, or str.length(), if there is none. 132 */ 133 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos, 134 UBool advance) { 135 int32_t p = pos; 136 const UChar* s = str.getBuffer(); 137 p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s); 138 if (advance) { 139 pos = p; 140 } 141 return p; 142 } 143 144 /** 145 * Skip over Pattern_White_Space in a Replaceable. 146 * Skipping may be done in the forward or 147 * reverse direction. In either case, the leftmost index will be 148 * inclusive, and the rightmost index will be exclusive. That is, 149 * given a range defined as [start, limit), the call 150 * skipWhitespace(text, start, limit) will advance start past leading 151 * whitespace, whereas the call skipWhitespace(text, limit, start), 152 * will back up limit past trailing whitespace. 153 * @param text the text to be analyzed 154 * @param pos either the start or limit of a range of 'text', to skip 155 * leading or trailing whitespace, respectively 156 * @param stop either the limit or start of a range of 'text', to skip 157 * leading or trailing whitespace, respectively 158 * @return the new start or limit, depending on what was passed in to 159 * 'pos' 160 */ 161 //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 162 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text, 163 //? int32_t pos, int32_t stop) { 164 //? UChar32 c; 165 //? UBool isForward = (stop >= pos); 166 //? 167 //? if (!isForward) { 168 //? --pos; // pos is a limit, so back up by one 169 //? } 170 //? 171 //? while (pos != stop && 172 //? PatternProps::isWhiteSpace(c = text.char32At(pos))) { 173 //? if (isForward) { 174 //? pos += U16_LENGTH(c); 175 //? } else { 176 //? pos -= U16_LENGTH(c); 177 //? } 178 //? } 179 //? 180 //? if (!isForward) { 181 //? ++pos; // make pos back into a limit 182 //? } 183 //? 184 //? return pos; 185 //?} 186 187 /** 188 * Parse a single non-whitespace character 'ch', optionally 189 * preceded by whitespace. 190 * @param id the string to be parsed 191 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 192 * offset of the first character to be parsed. On output, pos[0] 193 * is the index after the last parsed character. If the parse 194 * fails, pos[0] will be unchanged. 195 * @param ch the non-whitespace character to be parsed. 196 * @return true if 'ch' is seen preceded by zero or more 197 * whitespace characters. 198 */ 199 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) { 200 int32_t start = pos; 201 skipWhitespace(id, pos, TRUE); 202 if (pos == id.length() || 203 id.charAt(pos) != ch) { 204 pos = start; 205 return FALSE; 206 } 207 ++pos; 208 return TRUE; 209 } 210 211 /** 212 * Parse a pattern string within the given Replaceable and a parsing 213 * pattern. Characters are matched literally and case-sensitively 214 * except for the following special characters: 215 * 216 * ~ zero or more Pattern_White_Space chars 217 * 218 * If end of pattern is reached with all matches along the way, 219 * pos is advanced to the first unparsed index and returned. 220 * Otherwise -1 is returned. 221 * @param pat pattern that controls parsing 222 * @param text text to be parsed, starting at index 223 * @param index offset to first character to parse 224 * @param limit offset after last character to parse 225 * @return index after last parsed character, or -1 on parse failure. 226 */ 227 int32_t ICU_Utility::parsePattern(const UnicodeString& pat, 228 const Replaceable& text, 229 int32_t index, 230 int32_t limit) { 231 int32_t ipat = 0; 232 233 // empty pattern matches immediately 234 if (ipat == pat.length()) { 235 return index; 236 } 237 238 UChar32 cpat = pat.char32At(ipat); 239 240 while (index < limit) { 241 UChar32 c = text.char32At(index); 242 243 // parse \s* 244 if (cpat == 126 /*~*/) { 245 if (PatternProps::isWhiteSpace(c)) { 246 index += U16_LENGTH(c); 247 continue; 248 } else { 249 if (++ipat == pat.length()) { 250 return index; // success; c unparsed 251 } 252 // fall thru; process c again with next cpat 253 } 254 } 255 256 // parse literal 257 else if (c == cpat) { 258 index += U16_LENGTH(c); 259 ipat += U16_LENGTH(cpat); 260 if (ipat == pat.length()) { 261 return index; // success; c parsed 262 } 263 // fall thru; get next cpat 264 } 265 266 // match failure of literal 267 else { 268 return -1; 269 } 270 271 cpat = pat.char32At(ipat); 272 } 273 274 return -1; // text ended before end of pat 275 } 276 277 /** 278 * Append a character to a rule that is being built up. To flush 279 * the quoteBuf to rule, make one final call with isLiteral == TRUE. 280 * If there is no final character, pass in (UChar32)-1 as c. 281 * @param rule the string to append the character to 282 * @param c the character to append, or (UChar32)-1 if none. 283 * @param isLiteral if true, then the given character should not be 284 * quoted or escaped. Usually this means it is a syntactic element 285 * such as > or $ 286 * @param escapeUnprintable if true, then unprintable characters 287 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will 288 * appear outside of quotes. 289 * @param quoteBuf a buffer which is used to build up quoted 290 * substrings. The caller should initially supply an empty buffer, 291 * and thereafter should not modify the buffer. The buffer should be 292 * cleared out by, at the end, calling this method with a literal 293 * character. 294 */ 295 void ICU_Utility::appendToRule(UnicodeString& rule, 296 UChar32 c, 297 UBool isLiteral, 298 UBool escapeUnprintable, 299 UnicodeString& quoteBuf) { 300 // If we are escaping unprintables, then escape them outside 301 // quotes. \u and \U are not recognized within quotes. The same 302 // logic applies to literals, but literals are never escaped. 303 if (isLiteral || 304 (escapeUnprintable && ICU_Utility::isUnprintable(c))) { 305 if (quoteBuf.length() > 0) { 306 // We prefer backslash APOSTROPHE to double APOSTROPHE 307 // (more readable, less similar to ") so if there are 308 // double APOSTROPHEs at the ends, we pull them outside 309 // of the quote. 310 311 // If the first thing in the quoteBuf is APOSTROPHE 312 // (doubled) then pull it out. 313 while (quoteBuf.length() >= 2 && 314 quoteBuf.charAt(0) == APOSTROPHE && 315 quoteBuf.charAt(1) == APOSTROPHE) { 316 rule.append(BACKSLASH).append(APOSTROPHE); 317 quoteBuf.remove(0, 2); 318 } 319 // If the last thing in the quoteBuf is APOSTROPHE 320 // (doubled) then remove and count it and add it after. 321 int32_t trailingCount = 0; 322 while (quoteBuf.length() >= 2 && 323 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 324 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 325 quoteBuf.truncate(quoteBuf.length()-2); 326 ++trailingCount; 327 } 328 if (quoteBuf.length() > 0) { 329 rule.append(APOSTROPHE); 330 rule.append(quoteBuf); 331 rule.append(APOSTROPHE); 332 quoteBuf.truncate(0); 333 } 334 while (trailingCount-- > 0) { 335 rule.append(BACKSLASH).append(APOSTROPHE); 336 } 337 } 338 if (c != (UChar32)-1) { 339 /* Since spaces are ignored during parsing, they are 340 * emitted only for readability. We emit one here 341 * only if there isn't already one at the end of the 342 * rule. 343 */ 344 if (c == SPACE) { 345 int32_t len = rule.length(); 346 if (len > 0 && rule.charAt(len-1) != c) { 347 rule.append(c); 348 } 349 } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) { 350 rule.append(c); 351 } 352 } 353 } 354 355 // Escape ' and '\' and don't begin a quote just for them 356 else if (quoteBuf.length() == 0 && 357 (c == APOSTROPHE || c == BACKSLASH)) { 358 rule.append(BACKSLASH); 359 rule.append(c); 360 } 361 362 // Specials (printable ascii that isn't [0-9a-zA-Z]) and 363 // whitespace need quoting. Also append stuff to quotes if we are 364 // building up a quoted substring already. 365 else if (quoteBuf.length() > 0 || 366 (c >= 0x0021 && c <= 0x007E && 367 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 368 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 369 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 370 PatternProps::isWhiteSpace(c)) { 371 quoteBuf.append(c); 372 // Double ' within a quote 373 if (c == APOSTROPHE) { 374 quoteBuf.append(c); 375 } 376 } 377 378 // Otherwise just append 379 else { 380 rule.append(c); 381 } 382 } 383 384 void ICU_Utility::appendToRule(UnicodeString& rule, 385 const UnicodeString& text, 386 UBool isLiteral, 387 UBool escapeUnprintable, 388 UnicodeString& quoteBuf) { 389 for (int32_t i=0; i<text.length(); ++i) { 390 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf); 391 } 392 } 393 394 /** 395 * Given a matcher reference, which may be null, append its 396 * pattern as a literal to the given rule. 397 */ 398 void ICU_Utility::appendToRule(UnicodeString& rule, 399 const UnicodeMatcher* matcher, 400 UBool escapeUnprintable, 401 UnicodeString& quoteBuf) { 402 if (matcher != NULL) { 403 UnicodeString pat; 404 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable), 405 TRUE, escapeUnprintable, quoteBuf); 406 } 407 } 408 409 U_NAMESPACE_END 410