1 /* 2 ********************************************************************** 3 * Copyright (c) 2001-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/19/2001 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "util.h" 12 #include "unicode/unimatch.h" 13 #include "unicode/uniset.h" 14 15 // Define UChar constants using hex for EBCDIC compatibility 16 17 static const UChar BACKSLASH = 0x005C; /*\*/ 18 static const UChar UPPER_U = 0x0055; /*U*/ 19 static const UChar LOWER_U = 0x0075; /*u*/ 20 static const UChar APOSTROPHE = 0x0027; // '\'' 21 static const UChar SPACE = 0x0020; // ' ' 22 23 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 24 static const UChar DIGITS[] = { 25 48,49,50,51,52,53,54,55,56,57, 26 65,66,67,68,69,70,71,72,73,74, 27 75,76,77,78,79,80,81,82,83,84, 28 85,86,87,88,89,90 29 }; 30 31 U_NAMESPACE_BEGIN 32 33 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n, 34 int32_t radix, int32_t minDigits) { 35 if (radix < 2 || radix > 36) { 36 // Bogus radix 37 return result.append((UChar)63/*?*/); 38 } 39 // Handle negatives 40 if (n < 0) { 41 n = -n; 42 result.append((UChar)45/*-*/); 43 } 44 // First determine the number of digits 45 int32_t nn = n; 46 int32_t r = 1; 47 while (nn >= radix) { 48 nn /= radix; 49 r *= radix; 50 --minDigits; 51 } 52 // Now generate the digits 53 while (--minDigits > 0) { 54 result.append(DIGITS[0]); 55 } 56 while (r > 0) { 57 int32_t digit = n / r; 58 result.append(DIGITS[digit]); 59 n -= digit * r; 60 r /= radix; 61 } 62 return result; 63 } 64 65 /** 66 * Return true if the character is NOT printable ASCII. 67 */ 68 UBool ICU_Utility::isUnprintable(UChar32 c) { 69 return !(c >= 0x20 && c <= 0x7E); 70 } 71 72 /** 73 * Escape unprintable characters using \uxxxx notation for U+0000 to 74 * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is 75 * printable ASCII, then do nothing and return FALSE. Otherwise, 76 * append the escaped notation and return TRUE. 77 */ 78 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) { 79 if (isUnprintable(c)) { 80 result.append(BACKSLASH); 81 if (c & ~0xFFFF) { 82 result.append(UPPER_U); 83 result.append(DIGITS[0xF&(c>>28)]); 84 result.append(DIGITS[0xF&(c>>24)]); 85 result.append(DIGITS[0xF&(c>>20)]); 86 result.append(DIGITS[0xF&(c>>16)]); 87 } else { 88 result.append(LOWER_U); 89 } 90 result.append(DIGITS[0xF&(c>>12)]); 91 result.append(DIGITS[0xF&(c>>8)]); 92 result.append(DIGITS[0xF&(c>>4)]); 93 result.append(DIGITS[0xF&c]); 94 return TRUE; 95 } 96 return FALSE; 97 } 98 99 /** 100 * Returns the index of a character, ignoring quoted text. 101 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 102 * found by a search for 'h'. 103 */ 104 // FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 105 /* 106 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text, 107 int32_t start, int32_t limit, 108 UChar charToFind) { 109 for (int32_t i=start; i<limit; ++i) { 110 UChar c = text.charAt(i); 111 if (c == BACKSLASH) { 112 ++i; 113 } else if (c == APOSTROPHE) { 114 while (++i < limit 115 && text.charAt(i) != APOSTROPHE) {} 116 } else if (c == charToFind) { 117 return i; 118 } 119 } 120 return -1; 121 } 122 */ 123 124 /** 125 * Skip over a sequence of zero or more white space characters at pos. 126 * @param advance if true, advance pos to the first non-white-space 127 * character at or after pos, or str.length(), if there is none. 128 * Otherwise leave pos unchanged. 129 * @return the index of the first non-white-space character at or 130 * after pos, or str.length(), if there is none. 131 */ 132 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos, 133 UBool advance) { 134 int32_t p = pos; 135 while (p < str.length()) { 136 UChar32 c = str.char32At(p); 137 if (!uprv_isRuleWhiteSpace(c)) { 138 break; 139 } 140 p += UTF_CHAR_LENGTH(c); 141 } 142 if (advance) { 143 pos = p; 144 } 145 return p; 146 } 147 148 /** 149 * Skip over whitespace in a Replaceable. Whitespace is defined by 150 * uprv_isRuleWhiteSpace(). Skipping may be done in the forward or 151 * reverse direction. In either case, the leftmost index will be 152 * inclusive, and the rightmost index will be exclusive. That is, 153 * given a range defined as [start, limit), the call 154 * skipWhitespace(text, start, limit) will advance start past leading 155 * whitespace, whereas the call skipWhitespace(text, limit, start), 156 * will back up limit past trailing whitespace. 157 * @param text the text to be analyzed 158 * @param pos either the start or limit of a range of 'text', to skip 159 * leading or trailing whitespace, respectively 160 * @param stop either the limit or start of a range of 'text', to skip 161 * leading or trailing whitespace, respectively 162 * @return the new start or limit, depending on what was passed in to 163 * 'pos' 164 */ 165 //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 166 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text, 167 //? int32_t pos, int32_t stop) { 168 //? UChar32 c; 169 //? UBool isForward = (stop >= pos); 170 //? 171 //? if (!isForward) { 172 //? --pos; // pos is a limit, so back up by one 173 //? } 174 //? 175 //? while (pos != stop && 176 //? uprv_isRuleWhiteSpace(c = text.char32At(pos))) { 177 //? if (isForward) { 178 //? pos += UTF_CHAR_LENGTH(c); 179 //? } else { 180 //? pos -= UTF_CHAR_LENGTH(c); 181 //? } 182 //? } 183 //? 184 //? if (!isForward) { 185 //? ++pos; // make pos back into a limit 186 //? } 187 //? 188 //? return pos; 189 //?} 190 191 /** 192 * Parse a single non-whitespace character 'ch', optionally 193 * preceded by whitespace. 194 * @param id the string to be parsed 195 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 196 * offset of the first character to be parsed. On output, pos[0] 197 * is the index after the last parsed character. If the parse 198 * fails, pos[0] will be unchanged. 199 * @param ch the non-whitespace character to be parsed. 200 * @return true if 'ch' is seen preceded by zero or more 201 * whitespace characters. 202 */ 203 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) { 204 int32_t start = pos; 205 skipWhitespace(id, pos, TRUE); 206 if (pos == id.length() || 207 id.charAt(pos) != ch) { 208 pos = start; 209 return FALSE; 210 } 211 ++pos; 212 return TRUE; 213 } 214 215 /** 216 * Parse a pattern string within the given Replaceable and a parsing 217 * pattern. Characters are matched literally and case-sensitively 218 * except for the following special characters: 219 * 220 * ~ zero or more uprv_isRuleWhiteSpace chars 221 * 222 * If end of pattern is reached with all matches along the way, 223 * pos is advanced to the first unparsed index and returned. 224 * Otherwise -1 is returned. 225 * @param pat pattern that controls parsing 226 * @param text text to be parsed, starting at index 227 * @param index offset to first character to parse 228 * @param limit offset after last character to parse 229 * @return index after last parsed character, or -1 on parse failure. 230 */ 231 int32_t ICU_Utility::parsePattern(const UnicodeString& pat, 232 const Replaceable& text, 233 int32_t index, 234 int32_t limit) { 235 int32_t ipat = 0; 236 237 // empty pattern matches immediately 238 if (ipat == pat.length()) { 239 return index; 240 } 241 242 UChar32 cpat = pat.char32At(ipat); 243 244 while (index < limit) { 245 UChar32 c = text.char32At(index); 246 247 // parse \s* 248 if (cpat == 126 /*~*/) { 249 if (uprv_isRuleWhiteSpace(c)) { 250 index += UTF_CHAR_LENGTH(c); 251 continue; 252 } else { 253 if (++ipat == pat.length()) { 254 return index; // success; c unparsed 255 } 256 // fall thru; process c again with next cpat 257 } 258 } 259 260 // parse literal 261 else if (c == cpat) { 262 index += UTF_CHAR_LENGTH(c); 263 ipat += UTF_CHAR_LENGTH(cpat); 264 if (ipat == pat.length()) { 265 return index; // success; c parsed 266 } 267 // fall thru; get next cpat 268 } 269 270 // match failure of literal 271 else { 272 return -1; 273 } 274 275 cpat = pat.char32At(ipat); 276 } 277 278 return -1; // text ended before end of pat 279 } 280 281 /** 282 * Append a character to a rule that is being built up. To flush 283 * the quoteBuf to rule, make one final call with isLiteral == TRUE. 284 * If there is no final character, pass in (UChar32)-1 as c. 285 * @param rule the string to append the character to 286 * @param c the character to append, or (UChar32)-1 if none. 287 * @param isLiteral if true, then the given character should not be 288 * quoted or escaped. Usually this means it is a syntactic element 289 * such as > or $ 290 * @param escapeUnprintable if true, then unprintable characters 291 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will 292 * appear outside of quotes. 293 * @param quoteBuf a buffer which is used to build up quoted 294 * substrings. The caller should initially supply an empty buffer, 295 * and thereafter should not modify the buffer. The buffer should be 296 * cleared out by, at the end, calling this method with a literal 297 * character. 298 */ 299 void ICU_Utility::appendToRule(UnicodeString& rule, 300 UChar32 c, 301 UBool isLiteral, 302 UBool escapeUnprintable, 303 UnicodeString& quoteBuf) { 304 // If we are escaping unprintables, then escape them outside 305 // quotes. \u and \U are not recognized within quotes. The same 306 // logic applies to literals, but literals are never escaped. 307 if (isLiteral || 308 (escapeUnprintable && ICU_Utility::isUnprintable(c))) { 309 if (quoteBuf.length() > 0) { 310 // We prefer backslash APOSTROPHE to double APOSTROPHE 311 // (more readable, less similar to ") so if there are 312 // double APOSTROPHEs at the ends, we pull them outside 313 // of the quote. 314 315 // If the first thing in the quoteBuf is APOSTROPHE 316 // (doubled) then pull it out. 317 while (quoteBuf.length() >= 2 && 318 quoteBuf.charAt(0) == APOSTROPHE && 319 quoteBuf.charAt(1) == APOSTROPHE) { 320 rule.append(BACKSLASH).append(APOSTROPHE); 321 quoteBuf.remove(0, 2); 322 } 323 // If the last thing in the quoteBuf is APOSTROPHE 324 // (doubled) then remove and count it and add it after. 325 int32_t trailingCount = 0; 326 while (quoteBuf.length() >= 2 && 327 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 328 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 329 quoteBuf.truncate(quoteBuf.length()-2); 330 ++trailingCount; 331 } 332 if (quoteBuf.length() > 0) { 333 rule.append(APOSTROPHE); 334 rule.append(quoteBuf); 335 rule.append(APOSTROPHE); 336 quoteBuf.truncate(0); 337 } 338 while (trailingCount-- > 0) { 339 rule.append(BACKSLASH).append(APOSTROPHE); 340 } 341 } 342 if (c != (UChar32)-1) { 343 /* Since spaces are ignored during parsing, they are 344 * emitted only for readability. We emit one here 345 * only if there isn't already one at the end of the 346 * rule. 347 */ 348 if (c == SPACE) { 349 int32_t len = rule.length(); 350 if (len > 0 && rule.charAt(len-1) != c) { 351 rule.append(c); 352 } 353 } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) { 354 rule.append(c); 355 } 356 } 357 } 358 359 // Escape ' and '\' and don't begin a quote just for them 360 else if (quoteBuf.length() == 0 && 361 (c == APOSTROPHE || c == BACKSLASH)) { 362 rule.append(BACKSLASH); 363 rule.append(c); 364 } 365 366 // Specials (printable ascii that isn't [0-9a-zA-Z]) and 367 // whitespace need quoting. Also append stuff to quotes if we are 368 // building up a quoted substring already. 369 else if (quoteBuf.length() > 0 || 370 (c >= 0x0021 && c <= 0x007E && 371 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 372 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 373 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 374 uprv_isRuleWhiteSpace(c)) { 375 quoteBuf.append(c); 376 // Double ' within a quote 377 if (c == APOSTROPHE) { 378 quoteBuf.append(c); 379 } 380 } 381 382 // Otherwise just append 383 else { 384 rule.append(c); 385 } 386 } 387 388 void ICU_Utility::appendToRule(UnicodeString& rule, 389 const UnicodeString& text, 390 UBool isLiteral, 391 UBool escapeUnprintable, 392 UnicodeString& quoteBuf) { 393 for (int32_t i=0; i<text.length(); ++i) { 394 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf); 395 } 396 } 397 398 /** 399 * Given a matcher reference, which may be null, append its 400 * pattern as a literal to the given rule. 401 */ 402 void ICU_Utility::appendToRule(UnicodeString& rule, 403 const UnicodeMatcher* matcher, 404 UBool escapeUnprintable, 405 UnicodeString& quoteBuf) { 406 if (matcher != NULL) { 407 UnicodeString pat; 408 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable), 409 TRUE, escapeUnprintable, quoteBuf); 410 } 411 } 412 413 U_NAMESPACE_END 414 415 U_CAPI UBool U_EXPORT2 416 uprv_isRuleWhiteSpace(UChar32 c) { 417 /* "white space" in the sense of ICU rule parsers 418 This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. 419 See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ 420 U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 421 Equivalent to test for Pattern_White_Space Unicode property. 422 */ 423 return (c >= 0x0009 && c <= 0x2029 && 424 (c <= 0x000D || c == 0x0020 || c == 0x0085 || 425 c == 0x200E || c == 0x200F || c >= 0x2028)); 426 } 427 428 U_CAPI U_NAMESPACE_QUALIFIER UnicodeSet* U_EXPORT2 429 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) { 430 if(U_FAILURE(*ec)) { 431 return NULL; 432 } 433 // create a set with the Pattern_White_Space characters, 434 // without a pattern for fewer code dependencies 435 U_NAMESPACE_QUALIFIER UnicodeSet *set=new U_NAMESPACE_QUALIFIER UnicodeSet(9, 0xd); 436 // Check for new failure. 437 if (set == NULL) { 438 *ec = U_MEMORY_ALLOCATION_ERROR; 439 return NULL; 440 } 441 set->UnicodeSet::add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 442 return set; 443 } 444 445 //eof 446