1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uniset_props.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004aug25 14 * created by: Markus W. Scherer 15 * 16 * Character property dependent functions moved here from uniset.cpp 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uniset.h" 21 #include "unicode/parsepos.h" 22 #include "unicode/uchar.h" 23 #include "unicode/uscript.h" 24 #include "unicode/symtable.h" 25 #include "unicode/uset.h" 26 #include "unicode/locid.h" 27 #include "unicode/brkiter.h" 28 #include "uset_imp.h" 29 #include "ruleiter.h" 30 #include "cmemory.h" 31 #include "ucln_cmn.h" 32 #include "util.h" 33 #include "uvector.h" 34 #include "uprops.h" 35 #include "propname.h" 36 #include "normalizer2impl.h" 37 #include "ucase.h" 38 #include "ubidi_props.h" 39 #include "uinvchar.h" 40 #include "uprops.h" 41 #include "charstr.h" 42 #include "cstring.h" 43 #include "mutex.h" 44 #include "umutex.h" 45 #include "uassert.h" 46 #include "hash.h" 47 48 U_NAMESPACE_USE 49 50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 51 52 // initial storage. Must be >= 0 53 // *** same as in uniset.cpp ! *** 54 #define START_EXTRA 16 55 56 // Define UChar constants using hex for EBCDIC compatibility 57 // Used #define to reduce private static exports and memory access time. 
58 #define SET_OPEN ((UChar)0x005B) /*[*/ 59 #define SET_CLOSE ((UChar)0x005D) /*]*/ 60 #define HYPHEN ((UChar)0x002D) /*-*/ 61 #define COMPLEMENT ((UChar)0x005E) /*^*/ 62 #define COLON ((UChar)0x003A) /*:*/ 63 #define BACKSLASH ((UChar)0x005C) /*\*/ 64 #define INTERSECTION ((UChar)0x0026) /*&*/ 65 #define UPPER_U ((UChar)0x0055) /*U*/ 66 #define LOWER_U ((UChar)0x0075) /*u*/ 67 #define OPEN_BRACE ((UChar)123) /*{*/ 68 #define CLOSE_BRACE ((UChar)125) /*}*/ 69 #define UPPER_P ((UChar)0x0050) /*P*/ 70 #define LOWER_P ((UChar)0x0070) /*p*/ 71 #define UPPER_N ((UChar)78) /*N*/ 72 #define EQUALS ((UChar)0x003D) /*=*/ 73 74 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" 75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" 76 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" 77 static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" 78 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" 79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ 80 81 // Special property set IDs 82 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F] 84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:] 85 86 // Unicode name property alias 87 #define NAME_PROP "na" 88 #define NAME_PROP_LENGTH 2 89 90 /** 91 * Delimiter string used in patterns to close a category reference: 92 * ":]". Example: "[:Lu:]". 93 */ 94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 95 96 // Cached sets ------------------------------------------------------------- *** 97 98 U_CDECL_BEGIN 99 static UBool U_CALLCONV uset_cleanup(); 100 U_CDECL_END 101 102 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor 103 // can only fail with an out-of-memory error 104 // if we have a correct pattern and the properties data is hardcoded and always available. 
105 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> { 106 public: 107 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : 108 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {} 109 UnicodeSet *getInstance(UErrorCode &errorCode) { 110 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode); 111 } 112 private: 113 static void *createInstance(const void *context, UErrorCode &errorCode) { 114 UnicodeString pattern((const char *)context, -1, US_INV); 115 UnicodeSet *set=new UnicodeSet(pattern, errorCode); 116 if(set==NULL) { 117 errorCode=U_MEMORY_ALLOCATION_ERROR; 118 } 119 set->freeze(); 120 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 121 return set; 122 } 123 124 const char *fPattern; 125 }; 126 127 U_CDECL_BEGIN 128 129 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() 130 131 STATIC_SIMPLE_SINGLETON(uni32Singleton); 132 133 //---------------------------------------------------------------- 134 // Inclusions list 135 //---------------------------------------------------------------- 136 137 // USetAdder implementation 138 // Does not use uset.h to reduce code dependencies 139 static void U_CALLCONV 140 _set_add(USet *set, UChar32 c) { 141 ((UnicodeSet *)set)->add(c); 142 } 143 144 static void U_CALLCONV 145 _set_addRange(USet *set, UChar32 start, UChar32 end) { 146 ((UnicodeSet *)set)->add(start, end); 147 } 148 149 static void U_CALLCONV 150 _set_addString(USet *set, const UChar *str, int32_t length) { 151 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 152 } 153 154 /** 155 * Cleanup function for UnicodeSet 156 */ 157 static UBool U_CALLCONV uset_cleanup(void) { 158 int32_t i; 159 160 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 161 if (INCLUSIONS[i] != NULL) { 162 delete INCLUSIONS[i]; 163 INCLUSIONS[i] = NULL; 164 } 165 } 166 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance(); 167 return 
TRUE; 168 } 169 170 U_CDECL_END 171 172 U_NAMESPACE_BEGIN 173 174 /* 175 Reduce excessive reallocation, and make it easier to detect initialization 176 problems. 177 Usually you don't see smaller sets than this for Unicode 5.0. 178 */ 179 #define DEFAULT_INCLUSION_CAPACITY 3072 180 181 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 182 UBool needInit; 183 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); 184 if (needInit) { 185 UnicodeSet* incl = new UnicodeSet(); 186 USetAdder sa = { 187 (USet *)incl, 188 _set_add, 189 _set_addRange, 190 _set_addString, 191 NULL, // don't need remove() 192 NULL // don't need removeRange() 193 }; 194 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 195 if (incl != NULL) { 196 switch(src) { 197 case UPROPS_SRC_CHAR: 198 uchar_addPropertyStarts(&sa, &status); 199 break; 200 case UPROPS_SRC_PROPSVEC: 201 upropsvec_addPropertyStarts(&sa, &status); 202 break; 203 case UPROPS_SRC_CHAR_AND_PROPSVEC: 204 uchar_addPropertyStarts(&sa, &status); 205 upropsvec_addPropertyStarts(&sa, &status); 206 break; 207 #if !UCONFIG_NO_NORMALIZATION 208 case UPROPS_SRC_CASE_AND_NORM: { 209 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 210 if(U_SUCCESS(status)) { 211 impl->addPropertyStarts(&sa, status); 212 } 213 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 214 break; 215 } 216 case UPROPS_SRC_NFC: { 217 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 218 if(U_SUCCESS(status)) { 219 impl->addPropertyStarts(&sa, status); 220 } 221 break; 222 } 223 case UPROPS_SRC_NFKC: { 224 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); 225 if(U_SUCCESS(status)) { 226 impl->addPropertyStarts(&sa, status); 227 } 228 break; 229 } 230 case UPROPS_SRC_NFKC_CF: { 231 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); 232 if(U_SUCCESS(status)) { 233 impl->addPropertyStarts(&sa, status); 234 } 235 break; 236 } 237 case 
UPROPS_SRC_NFC_CANON_ITER: { 238 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 239 if(U_SUCCESS(status)) { 240 impl->addCanonIterPropertyStarts(&sa, status); 241 } 242 break; 243 } 244 #endif 245 case UPROPS_SRC_CASE: 246 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 247 break; 248 case UPROPS_SRC_BIDI: 249 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); 250 break; 251 default: 252 status = U_INTERNAL_PROGRAM_ERROR; 253 break; 254 } 255 if (U_SUCCESS(status)) { 256 // Compact for caching 257 incl->compact(); 258 umtx_lock(NULL); 259 if (INCLUSIONS[src] == NULL) { 260 INCLUSIONS[src] = incl; 261 incl = NULL; 262 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 263 } 264 umtx_unlock(NULL); 265 } 266 delete incl; 267 } else { 268 status = U_MEMORY_ALLOCATION_ERROR; 269 } 270 } 271 return INCLUSIONS[src]; 272 } 273 274 // Cache some sets for other services -------------------------------------- *** 275 276 U_CFUNC UnicodeSet * 277 uniset_getUnicode32Instance(UErrorCode &errorCode) { 278 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode); 279 } 280 281 // helper functions for matching of pattern syntax pieces ------------------ *** 282 // these functions are parallel to the PERL_OPEN etc. 
strings above 283 284 // using these functions is not only faster than UnicodeString::compare() and 285 // caseCompare(), but they also make UnicodeSet work for simple patterns when 286 // no Unicode properties data is available - when caseCompare() fails 287 288 static inline UBool 289 isPerlOpen(const UnicodeString &pattern, int32_t pos) { 290 UChar c; 291 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 292 } 293 294 /*static inline UBool 295 isPerlClose(const UnicodeString &pattern, int32_t pos) { 296 return pattern.charAt(pos)==CLOSE_BRACE; 297 }*/ 298 299 static inline UBool 300 isNameOpen(const UnicodeString &pattern, int32_t pos) { 301 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 302 } 303 304 static inline UBool 305 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 306 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 307 } 308 309 /*static inline UBool 310 isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 311 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 312 }*/ 313 314 // TODO memory debugging provided inside uniset.cpp 315 // could be made available here but probably obsolete with use of modern 316 // memory leak checker tools 317 #define _dbgct(me) 318 319 //---------------------------------------------------------------- 320 // Constructors &c 321 //---------------------------------------------------------------- 322 323 /** 324 * Constructs a set from the given pattern, optionally ignoring 325 * white space. See the class description for the syntax of the 326 * pattern language. 
327 * @param pattern a string specifying what characters are in the set 328 */ 329 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 330 UErrorCode& status) : 331 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 332 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 333 fFlags(0) 334 { 335 if(U_SUCCESS(status)){ 336 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 337 /* test for NULL */ 338 if(list == NULL) { 339 status = U_MEMORY_ALLOCATION_ERROR; 340 }else{ 341 allocateStrings(status); 342 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 343 } 344 } 345 _dbgct(this); 346 } 347 348 /** 349 * Constructs a set from the given pattern, optionally ignoring 350 * white space. See the class description for the syntax of the 351 * pattern language. 352 * @param pattern a string specifying what characters are in the set 353 * @param options bitmask for options to apply to the pattern. 354 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
355 */ 356 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 357 uint32_t options, 358 const SymbolTable* symbols, 359 UErrorCode& status) : 360 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 361 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 362 fFlags(0) 363 { 364 if(U_SUCCESS(status)){ 365 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 366 /* test for NULL */ 367 if(list == NULL) { 368 status = U_MEMORY_ALLOCATION_ERROR; 369 }else{ 370 allocateStrings(status); 371 applyPattern(pattern, options, symbols, status); 372 } 373 } 374 _dbgct(this); 375 } 376 377 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 378 uint32_t options, 379 const SymbolTable* symbols, 380 UErrorCode& status) : 381 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 382 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 383 fFlags(0) 384 { 385 if(U_SUCCESS(status)){ 386 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 387 /* test for NULL */ 388 if(list == NULL) { 389 status = U_MEMORY_ALLOCATION_ERROR; 390 }else{ 391 allocateStrings(status); 392 applyPattern(pattern, pos, options, symbols, status); 393 } 394 } 395 _dbgct(this); 396 } 397 398 //---------------------------------------------------------------- 399 // Public API 400 //---------------------------------------------------------------- 401 402 /** 403 * Modifies this set to represent the set specified by the given 404 * pattern, optionally ignoring white space. See the class 405 * description for the syntax of the pattern language. 406 * @param pattern a string specifying what characters are in the set 407 * @param ignoreSpaces if <code>true</code>, all spaces in the 408 * pattern are ignored. Spaces are those characters for which 409 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>. 410 * Characters preceded by '\\' are escaped, losing any special 411 * meaning they otherwise have. 
Spaces may be included by 412 * escaping them. 413 * @exception <code>IllegalArgumentException</code> if the pattern 414 * contains a syntax error. 415 */ 416 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 417 UErrorCode& status) { 418 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 419 } 420 421 422 /** 423 * Modifies this set to represent the set specified by the given 424 * pattern, optionally ignoring white space. See the class 425 * description for the syntax of the pattern language. 426 * @param pattern a string specifying what characters are in the set 427 * @param options bitmask for options to apply to the pattern. 428 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 429 */ 430 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 431 uint32_t options, 432 const SymbolTable* symbols, 433 UErrorCode& status) { 434 if (U_FAILURE(status) || isFrozen()) { 435 return *this; 436 } 437 438 ParsePosition pos(0); 439 applyPattern(pattern, pos, options, symbols, status); 440 if (U_FAILURE(status)) return *this; 441 442 int32_t i = pos.getIndex(); 443 444 if (options & USET_IGNORE_SPACE) { 445 // Skip over trailing whitespace 446 ICU_Utility::skipWhitespace(pattern, i, TRUE); 447 } 448 449 if (i != pattern.length()) { 450 status = U_ILLEGAL_ARGUMENT_ERROR; 451 } 452 return *this; 453 } 454 455 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 456 ParsePosition& pos, 457 uint32_t options, 458 const SymbolTable* symbols, 459 UErrorCode& status) { 460 if (U_FAILURE(status) || isFrozen()) { 461 return *this; 462 } 463 // Need to build the pattern in a temporary string because 464 // _applyPattern calls add() etc., which set pat to empty. 
465 UnicodeString rebuiltPat; 466 RuleCharacterIterator chars(pattern, symbols, pos); 467 applyPattern(chars, symbols, rebuiltPat, options, status); 468 if (U_FAILURE(status)) return *this; 469 if (chars.inVariable()) { 470 // syntaxError(chars, "Extra chars in variable value"); 471 status = U_MALFORMED_SET; 472 return *this; 473 } 474 setPattern(rebuiltPat); 475 return *this; 476 } 477 478 /** 479 * Return true if the given position, in the given pattern, appears 480 * to be the start of a UnicodeSet pattern. 481 */ 482 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 483 return ((pos+1) < pattern.length() && 484 pattern.charAt(pos) == (UChar)91/*[*/) || 485 resemblesPropertyPattern(pattern, pos); 486 } 487 488 //---------------------------------------------------------------- 489 // Implementation: Pattern parsing 490 //---------------------------------------------------------------- 491 492 /** 493 * A small all-inline class to manage a UnicodeSet pointer. Add 494 * operator->() etc. as needed. 495 */ 496 class UnicodeSetPointer { 497 UnicodeSet* p; 498 public: 499 inline UnicodeSetPointer() : p(0) {} 500 inline ~UnicodeSetPointer() { delete p; } 501 inline UnicodeSet* pointer() { return p; } 502 inline UBool allocate() { 503 if (p == 0) { 504 p = new UnicodeSet(); 505 } 506 return p != 0; 507 } 508 }; 509 510 /** 511 * Parse the pattern from the given RuleCharacterIterator. The 512 * iterator is advanced over the parsed pattern. 513 * @param chars iterator over the pattern characters. Upon return 514 * it will be advanced to the first character after the parsed 515 * pattern, or the end of the iteration if all characters are 516 * parsed. 517 * @param symbols symbol table to use to parse and dereference 518 * variables, or null if none. 519 * @param rebuiltPat the pattern that was parsed, rebuilt or 520 * copied from the input pattern, as appropriate. 
 * @param options a bit mask of zero or more of the following:
 * IGNORE_SPACE, CASE.
 * @param ec in/out error code.  Set to U_MALFORMED_SET on a syntax
 * error, and to U_MEMORY_ALLOCATION_ERROR if the scratch set cannot be
 * allocated or this set is bogus after parsing.
 */
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    // patLocal collects the pattern text as it is parsed; buf holds the
    // contents of a {multi-character string}.
    UnicodeString patLocal, buf;
    // usePat: TRUE once patLocal (rather than a generated pattern) must be
    // appended to rebuiltPat at the end.
    UBool usePat = FALSE;
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    // op: pending operator, 0 (none), HYPHEN or INTERSECTION
    UChar op = 0;

    UBool invert = FALSE;

    // The set is rebuilt from scratch from the pattern.
    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
                                    op == INTERSECTION /*'&'*/)));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == 0x5B /*'['*/ && !literal) {
                if (mode == 1) {
                    // A '[' inside the brackets starts a nested set; rewind so
                    // that the recursive call below sees the '[' itself.
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append((UChar) 0x5B /*'['*/);
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == 0x5E /*'^'*/ && !literal) {
                        invert = TRUE;
                        patLocal.append((UChar) 0x5E /*'^'*/);
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == HYPHEN /*'-'*/) {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    // Only a matcher that is a UnicodeSet can stand in for a set.
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == NULL) {
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                // Flush the pending single character before handling the set.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                nested->applyPattern(chars, symbols, patLocal, options, ec);
                break;
            case 2:
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            // Combine the nested set with this set according to the pending
            // operator (none means union).
            switch (op) {
            case HYPHEN: /*'-'*/
                removeAll(*nested);
                break;
            case INTERSECTION: /*'&'*/
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case 0x5D /*']'*/:
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == HYPHEN /*'-'*/) {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == INTERSECTION /*'&'*/) {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append((UChar) 0x5D /*']'*/);
                mode = 2;
                continue;
            case HYPHEN /*'-'*/:
                if (op == 0) {
                    if (lastItem != 0) {
                        op = (UChar) c;
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        // (reached with no pending item; the '-' is accepted
                        // only if the next character is the closing ']')
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x5D /*']'*/ && !literal) {
                            patLocal.append(HYPHEN_RIGHT_BRACE);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case INTERSECTION /*'&'*/:
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case 0x5E /*'^'*/:
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case 0x7B /*'{'*/:
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect everything up to the unescaped '}' into buf.
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x7D /*'}'*/ && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (buf.length() < 1 || !ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append((UChar) 0x7B /*'{'*/);
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append((UChar) 0x7D /*'}'*/);
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append((UChar) 0x5D /*']'*/);
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == HYPHEN /*'-'*/) {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                // Two characters in a row: emit the earlier one and keep the
                // new one pending, since it may still start a range.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        closeOver(USET_CASE_INSENSITIVE);
    }
    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        closeOver(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement();
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}

//----------------------------------------------------------------
// Property set implementation
//----------------------------------------------------------------

/**
 * Filter: TRUE if u_getNumericValue(ch) equals the double that
 * context points to.
 */
static UBool numericValueFilter(UChar32 ch, void* context) {
    return u_getNumericValue(ch) == *(double*)context;
}

/**
 * Filter: TRUE if the general category mask of ch has at least one bit
 * in common with the int32_t mask that context points to.
 */
static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    int32_t value = *(int32_t*)context;
    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
}

/**
 * Filter: TRUE if the age of ch, as returned by u_charAge(), compares
 * greater than the all-zero version and not greater than the
 * UVersionInfo that context points to (byte-wise comparison).
 */
static UBool versionFilter(UChar32 ch, void* context) {
    static const UVersionInfo none = { 0, 0, 0, 0 };
    UVersionInfo v;
    u_charAge(ch, v);
    UVersionInfo* version = (UVersionInfo*)context;
    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}

/** Context for intPropertyFilter: the property and the value to match. */
typedef struct {
    UProperty prop;
    int32_t value;
} IntPropertyContext;

/**
 * Filter: TRUE if the integer value of property c->prop for ch equals
 * c->value, where c is the IntPropertyContext that context points to.
 */
static UBool intPropertyFilter(UChar32 ch, void* context) {
    IntPropertyContext* c = (IntPropertyContext*)context;
    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
}

/**
 * Filter: TRUE if uscript_hasScript() reports that ch has the
 * UScriptCode that context points to.
 */
static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    return uscript_hasScript(ch, *(UScriptCode*)context);
}

/**
 * Generic filter-based scanning code for UCD property UnicodeSets.
 * Replaces the contents of this set with the code points accepted by
 * the filter.
 * @param filter function that decides whether a code point belongs
 * @param context opaque pointer passed to every filter call
 * @param src properties source used to select the inclusions set
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR
 * if this set is bogus afterwards
 */
void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
                             void* context,
                             int32_t src,
                             UErrorCode &status) {
    if (U_FAILURE(status)) return;

    // Logically, walk through all Unicode characters, noting the start
    // and end of each range for which filter.contain(c) is
    // true.  Add each range to a set.
    //
    // To improve performance, use an inclusions set which
    // encodes information about character ranges that are known
    // to have identical properties.
    // getInclusions(src) contains exactly the first characters of
    // same-value ranges for the given properties "source".
    const UnicodeSet* inclusions = getInclusions(src, status);
    if (U_FAILURE(status)) {
        return;
    }

    clear();

    // Start of the currently open run of accepted code points, or -1 if
    // no run is open.
    UChar32 startHasProperty = -1;
    int32_t limitRange = inclusions->getRangeCount();

    for (int j=0; j<limitRange; ++j) {
        // get current range
        UChar32 start = inclusions->getRangeStart(j);
        UChar32 end = inclusions->getRangeEnd(j);

        // for all the code points in the range, process
        for (UChar32 ch = start; ch <= end; ++ch) {
            // only add to this UnicodeSet on inflection points --
            // where the hasProperty value changes to false
            if ((*filter)(ch, context)) {
                if (startHasProperty < 0) {
                    startHasProperty = ch;
                }
            } else if (startHasProperty >= 0) {
                add(startHasProperty, ch-1);
                startHasProperty = -1;
            }
        }
    }
    // A run that is still open after the last inclusion is closed at the
    // last code point, U+10FFFF.
    if (startHasProperty >= 0) {
        add((UChar32)startHasProperty, (UChar32)0x10FFFF);
    }
    if (isBogus() && U_SUCCESS(status)) {
        // We likely ran out of memory. AHHH!
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

/**
 * Copies src to dst while dropping leading spaces, collapsing each run
 * of spaces into a single space and removing one trailing space; the
 * result is NUL-terminated.
 * @param dst destination buffer
 * @param src NUL-terminated input
 * @param dstCapacity size of dst in chars, including the terminator
 * @return FALSE if the result does not fit into dst, otherwise TRUE
 */
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
    /* Note: we use ' ' in compiler code page */
    int32_t j = 0;
    char ch;
    --dstCapacity; /* make room for term. zero */
    while ((ch = *src++) != 0) {
        // Skip a space at the very start or directly after another space.
        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
            continue;
        }
        if (j >= dstCapacity) return FALSE;
        dst[j++] = ch;
    }
    if (j > 0 && dst[j-1] == ' ') --j;
    dst[j] = 0;
    return TRUE;
}

//----------------------------------------------------------------
// Property set API
//----------------------------------------------------------------

// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
// enclosing member function.
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}

/**
 * Replaces the contents of this set with the code points selected by
 * the given property and value.  Does nothing if ec already indicates
 * a failure or if this set is frozen.
 * @param prop the property.  For UCHAR_GENERAL_CATEGORY_MASK, value is
 * a mask and any shared bit matches; for UCHAR_SCRIPT_EXTENSIONS, value
 * is used as a UScriptCode; otherwise the property value must equal
 * value.
 * @param value the property value (or mask) to select
 * @param ec in/out error code
 * @return *this
 */
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
        applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
    } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
        UScriptCode script = (UScriptCode)value;
        applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
    } else {
        IntPropertyContext c = {prop, value};
        applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
    }
    return *this;
}

UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    // prop and value used to be converted to char * using the default
    // converter instead of the invariant conversion.
    // This should not be necessary because all Unicode property and value
    // names use only invariant characters.
    // If there are any variant characters, then we won't find them anyway.
    // Checking first avoids assertion failures in the conversion.
1060 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 1061 !uprv_isInvariantUString(value.getBuffer(), value.length()) 1062 ) { 1063 FAIL(ec); 1064 } 1065 CharString pname, vname; 1066 pname.appendInvariantChars(prop, ec); 1067 vname.appendInvariantChars(value, ec); 1068 if (U_FAILURE(ec)) return *this; 1069 1070 UProperty p; 1071 int32_t v; 1072 UBool mustNotBeEmpty = FALSE, invert = FALSE; 1073 1074 if (value.length() > 0) { 1075 p = u_getPropertyEnum(pname.data()); 1076 if (p == UCHAR_INVALID_CODE) FAIL(ec); 1077 1078 // Treat gc as gcm 1079 if (p == UCHAR_GENERAL_CATEGORY) { 1080 p = UCHAR_GENERAL_CATEGORY_MASK; 1081 } 1082 1083 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 1084 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 1085 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 1086 v = u_getPropertyValueEnum(p, vname.data()); 1087 if (v == UCHAR_INVALID_CODE) { 1088 // Handle numeric CCC 1089 if (p == UCHAR_CANONICAL_COMBINING_CLASS || 1090 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 1091 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 1092 char* end; 1093 double value = uprv_strtod(vname.data(), &end); 1094 v = (int32_t) value; 1095 if (v != value || v < 0 || *end != 0) { 1096 // non-integral or negative value, or trailing junk 1097 FAIL(ec); 1098 } 1099 // If the resultant set is empty then the numeric value 1100 // was invalid. 1101 mustNotBeEmpty = TRUE; 1102 } else { 1103 FAIL(ec); 1104 } 1105 } 1106 } 1107 1108 else { 1109 1110 switch (p) { 1111 case UCHAR_NUMERIC_VALUE: 1112 { 1113 char* end; 1114 double value = uprv_strtod(vname.data(), &end); 1115 if (*end != 0) { 1116 FAIL(ec); 1117 } 1118 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); 1119 return *this; 1120 } 1121 break; 1122 case UCHAR_NAME: 1123 case UCHAR_UNICODE_1_NAME: 1124 { 1125 // Must munge name, since u_charFromName() does not do 1126 // 'loose' matching. 
1127 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 1128 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1129 UCharNameChoice choice = (p == UCHAR_NAME) ? 1130 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME; 1131 UChar32 ch = u_charFromName(choice, buf, &ec); 1132 if (U_SUCCESS(ec)) { 1133 clear(); 1134 add(ch); 1135 return *this; 1136 } else { 1137 FAIL(ec); 1138 } 1139 } 1140 break; 1141 case UCHAR_AGE: 1142 { 1143 // Must munge name, since u_versionFromString() does not do 1144 // 'loose' matching. 1145 char buf[128]; 1146 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1147 UVersionInfo version; 1148 u_versionFromString(version, buf); 1149 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); 1150 return *this; 1151 } 1152 break; 1153 case UCHAR_SCRIPT_EXTENSIONS: 1154 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); 1155 if (v == UCHAR_INVALID_CODE) { 1156 FAIL(ec); 1157 } 1158 // fall through to calling applyIntPropertyValue() 1159 break; 1160 default: 1161 // p is a non-binary, non-enumerated property that we 1162 // don't support (yet). 1163 FAIL(ec); 1164 } 1165 } 1166 } 1167 1168 else { 1169 // value is empty. Interpret as General Category, Script, or 1170 // Binary property. 
1171 p = UCHAR_GENERAL_CATEGORY_MASK; 1172 v = u_getPropertyValueEnum(p, pname.data()); 1173 if (v == UCHAR_INVALID_CODE) { 1174 p = UCHAR_SCRIPT; 1175 v = u_getPropertyValueEnum(p, pname.data()); 1176 if (v == UCHAR_INVALID_CODE) { 1177 p = u_getPropertyEnum(pname.data()); 1178 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 1179 v = 1; 1180 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { 1181 set(MIN_VALUE, MAX_VALUE); 1182 return *this; 1183 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { 1184 set(0, 0x7F); 1185 return *this; 1186 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { 1187 // [:Assigned:]=[:^Cn:] 1188 p = UCHAR_GENERAL_CATEGORY_MASK; 1189 v = U_GC_CN_MASK; 1190 invert = TRUE; 1191 } else { 1192 FAIL(ec); 1193 } 1194 } 1195 } 1196 } 1197 1198 applyIntPropertyValue(p, v, ec); 1199 if(invert) { 1200 complement(); 1201 } 1202 1203 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { 1204 // mustNotBeEmpty is set to true if an empty set indicates 1205 // invalid input. 1206 ec = U_ILLEGAL_ARGUMENT_ERROR; 1207 } 1208 1209 if (isBogus() && U_SUCCESS(ec)) { 1210 // We likely ran out of memory. AHHH! 1211 ec = U_MEMORY_ALLOCATION_ERROR; 1212 } 1213 return *this; 1214 } 1215 1216 //---------------------------------------------------------------- 1217 // Property set patterns 1218 //---------------------------------------------------------------- 1219 1220 /** 1221 * Return true if the given position, in the given pattern, appears 1222 * to be the start of a property set pattern. 
1223 */ 1224 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 1225 int32_t pos) { 1226 // Patterns are at least 5 characters long 1227 if ((pos+5) > pattern.length()) { 1228 return FALSE; 1229 } 1230 1231 // Look for an opening [:, [:^, \p, or \P 1232 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 1233 } 1234 1235 /** 1236 * Return true if the given iterator appears to point at a 1237 * property pattern. Regardless of the result, return with the 1238 * iterator unchanged. 1239 * @param chars iterator over the pattern characters. Upon return 1240 * it will be unchanged. 1241 * @param iterOpts RuleCharacterIterator options 1242 */ 1243 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 1244 int32_t iterOpts) { 1245 // NOTE: literal will always be FALSE, because we don't parse escapes. 1246 UBool result = FALSE, literal; 1247 UErrorCode ec = U_ZERO_ERROR; 1248 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 1249 RuleCharacterIterator::Pos pos; 1250 chars.getPos(pos); 1251 UChar32 c = chars.next(iterOpts, literal, ec); 1252 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { 1253 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 1254 literal, ec); 1255 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : 1256 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); 1257 } 1258 chars.setPos(pos); 1259 return result && U_SUCCESS(ec); 1260 } 1261 1262 /** 1263 * Parse the given property pattern at the given parse position. 1264 */ 1265 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1266 ParsePosition& ppos, 1267 UErrorCode &ec) { 1268 int32_t pos = ppos.getIndex(); 1269 1270 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1271 UBool isName = FALSE; // true for \N{pat}, o/w false 1272 UBool invert = FALSE; 1273 1274 if (U_FAILURE(ec)) return *this; 1275 1276 // Minimum length is 5 characters, e.g. 
\p{L} 1277 if ((pos+5) > pattern.length()) { 1278 FAIL(ec); 1279 } 1280 1281 // On entry, ppos should point to one of the following locations: 1282 // Look for an opening [:, [:^, \p, or \P 1283 if (isPOSIXOpen(pattern, pos)) { 1284 posix = TRUE; 1285 pos += 2; 1286 pos = ICU_Utility::skipWhitespace(pattern, pos); 1287 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1288 ++pos; 1289 invert = TRUE; 1290 } 1291 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1292 UChar c = pattern.charAt(pos+1); 1293 invert = (c == UPPER_P); 1294 isName = (c == UPPER_N); 1295 pos += 2; 1296 pos = ICU_Utility::skipWhitespace(pattern, pos); 1297 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1298 // Syntax error; "\p" or "\P" not followed by "{" 1299 FAIL(ec); 1300 } 1301 } else { 1302 // Open delimiter not seen 1303 FAIL(ec); 1304 } 1305 1306 // Look for the matching close delimiter, either :] or } 1307 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); 1308 if (close < 0) { 1309 // Syntax error; close delimiter missing 1310 FAIL(ec); 1311 } 1312 1313 // Look for an '=' sign. If this is present, we will parse a 1314 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1315 // pattern. 1316 int32_t equals = pattern.indexOf(EQUALS, pos); 1317 UnicodeString propName, valueName; 1318 if (equals >= 0 && equals < close && !isName) { 1319 // Equals seen; parse medium/long pattern 1320 pattern.extractBetween(pos, equals, propName); 1321 pattern.extractBetween(equals+1, close, valueName); 1322 } 1323 1324 else { 1325 // Handle case where no '=' is seen, and \N{} 1326 pattern.extractBetween(pos, close, propName); 1327 1328 // Handle \N{name} 1329 if (isName) { 1330 // This is a little inefficient since it means we have to 1331 // parse NAME_PROP back to UCHAR_NAME even though we already 1332 // know it's UCHAR_NAME. 
If we refactor the API to 1333 // support args of (UProperty, char*) then we can remove 1334 // NAME_PROP and make this a little more efficient. 1335 valueName = propName; 1336 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1337 } 1338 } 1339 1340 applyPropertyAlias(propName, valueName, ec); 1341 1342 if (U_SUCCESS(ec)) { 1343 if (invert) { 1344 complement(); 1345 } 1346 1347 // Move to the limit position after the close delimiter if the 1348 // parse succeeded. 1349 ppos.setIndex(close + (posix ? 2 : 1)); 1350 } 1351 1352 return *this; 1353 } 1354 1355 /** 1356 * Parse a property pattern. 1357 * @param chars iterator over the pattern characters. Upon return 1358 * it will be advanced to the first character after the parsed 1359 * pattern, or the end of the iteration if all characters are 1360 * parsed. 1361 * @param rebuiltPat the pattern that was parsed, rebuilt or 1362 * copied from the input pattern, as appropriate. 1363 */ 1364 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1365 UnicodeString& rebuiltPat, 1366 UErrorCode& ec) { 1367 if (U_FAILURE(ec)) return; 1368 UnicodeString pattern; 1369 chars.lookahead(pattern); 1370 ParsePosition pos(0); 1371 applyPropertyPattern(pattern, pos, ec); 1372 if (U_FAILURE(ec)) return; 1373 if (pos.getIndex() == 0) { 1374 // syntaxError(chars, "Invalid property pattern"); 1375 ec = U_MALFORMED_SET; 1376 return; 1377 } 1378 chars.jumpahead(pos.getIndex()); 1379 rebuiltPat.append(pattern, 0, pos.getIndex()); 1380 } 1381 1382 //---------------------------------------------------------------- 1383 // Case folding API 1384 //---------------------------------------------------------------- 1385 1386 // add the result of a full case mapping to the set 1387 // use str as a temporary string to avoid constructing one 1388 static inline void 1389 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { 1390 if(result >= 0) { 1391 if(result > UCASE_MAX_STRING_LENGTH) { 
1392 // add a single-code point case mapping 1393 set.add(result); 1394 } else { 1395 // add a string case mapping from full with length result 1396 str.setTo((UBool)FALSE, full, result); 1397 set.add(str); 1398 } 1399 } 1400 // result < 0: the code point mapped to itself, no need to add it 1401 // see ucase.h 1402 } 1403 1404 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1405 if (isFrozen() || isBogus()) { 1406 return *this; 1407 } 1408 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { 1409 const UCaseProps *csp = ucase_getSingleton(); 1410 { 1411 UnicodeSet foldSet(*this); 1412 UnicodeString str; 1413 USetAdder sa = { 1414 foldSet.toUSet(), 1415 _set_add, 1416 _set_addRange, 1417 _set_addString, 1418 NULL, // don't need remove() 1419 NULL // don't need removeRange() 1420 }; 1421 1422 // start with input set to guarantee inclusion 1423 // USET_CASE: remove strings because the strings will actually be reduced (folded); 1424 // therefore, start with no strings and add only those needed 1425 if (attribute & USET_CASE_INSENSITIVE) { 1426 foldSet.strings->removeAllElements(); 1427 } 1428 1429 int32_t n = getRangeCount(); 1430 UChar32 result; 1431 const UChar *full; 1432 int32_t locCache = 0; 1433 1434 for (int32_t i=0; i<n; ++i) { 1435 UChar32 start = getRangeStart(i); 1436 UChar32 end = getRangeEnd(i); 1437 1438 if (attribute & USET_CASE_INSENSITIVE) { 1439 // full case closure 1440 for (UChar32 cp=start; cp<=end; ++cp) { 1441 ucase_addCaseClosure(csp, cp, &sa); 1442 } 1443 } else { 1444 // add case mappings 1445 // (does not add long s for regular s, or Kelvin for k, for example) 1446 for (UChar32 cp=start; cp<=end; ++cp) { 1447 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); 1448 addCaseMapping(foldSet, result, full, str); 1449 1450 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); 1451 addCaseMapping(foldSet, result, full, str); 1452 1453 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, 
"", &locCache); 1454 addCaseMapping(foldSet, result, full, str); 1455 1456 result = ucase_toFullFolding(csp, cp, &full, 0); 1457 addCaseMapping(foldSet, result, full, str); 1458 } 1459 } 1460 } 1461 if (strings != NULL && strings->size() > 0) { 1462 if (attribute & USET_CASE_INSENSITIVE) { 1463 for (int32_t j=0; j<strings->size(); ++j) { 1464 str = *(const UnicodeString *) strings->elementAt(j); 1465 str.foldCase(); 1466 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { 1467 foldSet.add(str); // does not map to code points: add the folded string itself 1468 } 1469 } 1470 } else { 1471 Locale root(""); 1472 #if !UCONFIG_NO_BREAK_ITERATION 1473 UErrorCode status = U_ZERO_ERROR; 1474 BreakIterator *bi = BreakIterator::createWordInstance(root, status); 1475 if (U_SUCCESS(status)) { 1476 #endif 1477 const UnicodeString *pStr; 1478 1479 for (int32_t j=0; j<strings->size(); ++j) { 1480 pStr = (const UnicodeString *) strings->elementAt(j); 1481 (str = *pStr).toLower(root); 1482 foldSet.add(str); 1483 #if !UCONFIG_NO_BREAK_ITERATION 1484 (str = *pStr).toTitle(bi, root); 1485 foldSet.add(str); 1486 #endif 1487 (str = *pStr).toUpper(root); 1488 foldSet.add(str); 1489 (str = *pStr).foldCase(); 1490 foldSet.add(str); 1491 } 1492 #if !UCONFIG_NO_BREAK_ITERATION 1493 } 1494 delete bi; 1495 #endif 1496 } 1497 } 1498 *this = foldSet; 1499 } 1500 } 1501 return *this; 1502 } 1503 1504 U_NAMESPACE_END 1505