1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uniset_props.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004aug25 14 * created by: Markus W. Scherer 15 * 16 * Character property dependent functions moved here from uniset.cpp 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uniset.h" 21 #include "unicode/parsepos.h" 22 #include "unicode/uchar.h" 23 #include "unicode/uscript.h" 24 #include "unicode/symtable.h" 25 #include "unicode/uset.h" 26 #include "unicode/locid.h" 27 #include "unicode/brkiter.h" 28 #include "uset_imp.h" 29 #include "ruleiter.h" 30 #include "cmemory.h" 31 #include "ucln_cmn.h" 32 #include "util.h" 33 #include "uvector.h" 34 #include "uprops.h" 35 #include "propname.h" 36 #include "normalizer2impl.h" 37 #include "ucase.h" 38 #include "ubidi_props.h" 39 #include "uinvchar.h" 40 #include "uprops.h" 41 #include "charstr.h" 42 #include "cstring.h" 43 #include "mutex.h" 44 #include "umutex.h" 45 #include "uassert.h" 46 #include "hash.h" 47 48 U_NAMESPACE_USE 49 50 // initial storage. Must be >= 0 51 // *** same as in uniset.cpp ! *** 52 #define START_EXTRA 16 53 54 // Define UChar constants using hex for EBCDIC compatibility 55 // Used #define to reduce private static exports and memory access time. 
// Pattern syntax characters.  Each macro is a single UChar code unit given
// numerically (hex or decimal) rather than as a character literal, so the
// value does not depend on the compiler's code page.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)123)    /*{ (decimal 123 == 0x7B)*/
#define CLOSE_BRACE     ((UChar)125)    /*} (decimal 125 == 0x7D)*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)78)     /*N (decimal 78 == 0x4E)*/
#define EQUALS          ((UChar)0x003D) /*=*/

// NUL-terminated delimiter strings built from the macros above.  The
// commented-out ones are kept for reference only; the isPerlOpen() /
// isNameOpen() / isPOSIXOpen() helpers further down test the same prefixes.
//static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
//static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
//static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
//static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
// Note: despite the name, the second character is SET_CLOSE (']'), not '}'.
static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/

// Special property set IDs
// (matched with uprv_comparePropertyNames() in applyPropertyAlias())
static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Unicode name property alias
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2

/**
 * Delimiter string used in patterns to close a category reference:
 * ":]".  Example: "[:Lu:]".
91 */ 92 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 93 94 // Cached sets ------------------------------------------------------------- *** 95 96 U_CDECL_BEGIN 97 static UBool U_CALLCONV uset_cleanup(); 98 99 struct Inclusion { 100 UnicodeSet *fSet; 101 UInitOnce fInitOnce; 102 }; 103 static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() 104 105 static UnicodeSet *uni32Singleton; 106 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; 107 108 //---------------------------------------------------------------- 109 // Inclusions list 110 //---------------------------------------------------------------- 111 112 // USetAdder implementation 113 // Does not use uset.h to reduce code dependencies 114 static void U_CALLCONV 115 _set_add(USet *set, UChar32 c) { 116 ((UnicodeSet *)set)->add(c); 117 } 118 119 static void U_CALLCONV 120 _set_addRange(USet *set, UChar32 start, UChar32 end) { 121 ((UnicodeSet *)set)->add(start, end); 122 } 123 124 static void U_CALLCONV 125 _set_addString(USet *set, const UChar *str, int32_t length) { 126 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 127 } 128 129 /** 130 * Cleanup function for UnicodeSet 131 */ 132 static UBool U_CALLCONV uset_cleanup(void) { 133 for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 134 Inclusion &in = gInclusions[i]; 135 delete in.fSet; 136 in.fSet = NULL; 137 in.fInitOnce.reset(); 138 } 139 140 delete uni32Singleton; 141 uni32Singleton = NULL; 142 uni32InitOnce.reset(); 143 return TRUE; 144 } 145 146 U_CDECL_END 147 148 U_NAMESPACE_BEGIN 149 150 /* 151 Reduce excessive reallocation, and make it easier to detect initialization problems. 152 Usually you don't see smaller sets than this for Unicode 5.0. 153 */ 154 #define DEFAULT_INCLUSION_CAPACITY 3072 155 156 void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { 157 // This function is invoked only via umtx_initOnce(). 
158 // This function is a friend of class UnicodeSet. 159 160 U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); 161 UnicodeSet * &incl = gInclusions[src].fSet; 162 U_ASSERT(incl == NULL); 163 164 incl = new UnicodeSet(); 165 if (incl == NULL) { 166 status = U_MEMORY_ALLOCATION_ERROR; 167 return; 168 } 169 USetAdder sa = { 170 (USet *)incl, 171 _set_add, 172 _set_addRange, 173 _set_addString, 174 NULL, // don't need remove() 175 NULL // don't need removeRange() 176 }; 177 178 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 179 switch(src) { 180 case UPROPS_SRC_CHAR: 181 uchar_addPropertyStarts(&sa, &status); 182 break; 183 case UPROPS_SRC_PROPSVEC: 184 upropsvec_addPropertyStarts(&sa, &status); 185 break; 186 case UPROPS_SRC_CHAR_AND_PROPSVEC: 187 uchar_addPropertyStarts(&sa, &status); 188 upropsvec_addPropertyStarts(&sa, &status); 189 break; 190 #if !UCONFIG_NO_NORMALIZATION 191 case UPROPS_SRC_CASE_AND_NORM: { 192 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 193 if(U_SUCCESS(status)) { 194 impl->addPropertyStarts(&sa, status); 195 } 196 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 197 break; 198 } 199 case UPROPS_SRC_NFC: { 200 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 201 if(U_SUCCESS(status)) { 202 impl->addPropertyStarts(&sa, status); 203 } 204 break; 205 } 206 case UPROPS_SRC_NFKC: { 207 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); 208 if(U_SUCCESS(status)) { 209 impl->addPropertyStarts(&sa, status); 210 } 211 break; 212 } 213 case UPROPS_SRC_NFKC_CF: { 214 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); 215 if(U_SUCCESS(status)) { 216 impl->addPropertyStarts(&sa, status); 217 } 218 break; 219 } 220 case UPROPS_SRC_NFC_CANON_ITER: { 221 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 222 if(U_SUCCESS(status)) { 223 impl->addCanonIterPropertyStarts(&sa, status); 224 } 225 break; 226 } 227 #endif 228 case UPROPS_SRC_CASE: 229 
ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 230 break; 231 case UPROPS_SRC_BIDI: 232 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); 233 break; 234 default: 235 status = U_INTERNAL_PROGRAM_ERROR; 236 break; 237 } 238 239 if (U_FAILURE(status)) { 240 delete incl; 241 incl = NULL; 242 return; 243 } 244 // Compact for caching 245 incl->compact(); 246 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 247 } 248 249 250 251 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 252 U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); 253 Inclusion &i = gInclusions[src]; 254 umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status); 255 return i.fSet; 256 } 257 258 259 // Cache some sets for other services -------------------------------------- *** 260 void U_CALLCONV createUni32Set(UErrorCode &errorCode) { 261 U_ASSERT(uni32Singleton == NULL); 262 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); 263 if(uni32Singleton==NULL) { 264 errorCode=U_MEMORY_ALLOCATION_ERROR; 265 } else { 266 uni32Singleton->freeze(); 267 } 268 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 269 } 270 271 272 U_CFUNC UnicodeSet * 273 uniset_getUnicode32Instance(UErrorCode &errorCode) { 274 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); 275 return uni32Singleton; 276 } 277 278 // helper functions for matching of pattern syntax pieces ------------------ *** 279 // these functions are parallel to the PERL_OPEN etc. 
strings above 280 281 // using these functions is not only faster than UnicodeString::compare() and 282 // caseCompare(), but they also make UnicodeSet work for simple patterns when 283 // no Unicode properties data is available - when caseCompare() fails 284 285 static inline UBool 286 isPerlOpen(const UnicodeString &pattern, int32_t pos) { 287 UChar c; 288 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 289 } 290 291 /*static inline UBool 292 isPerlClose(const UnicodeString &pattern, int32_t pos) { 293 return pattern.charAt(pos)==CLOSE_BRACE; 294 }*/ 295 296 static inline UBool 297 isNameOpen(const UnicodeString &pattern, int32_t pos) { 298 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 299 } 300 301 static inline UBool 302 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 303 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 304 } 305 306 /*static inline UBool 307 isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 308 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 309 }*/ 310 311 // TODO memory debugging provided inside uniset.cpp 312 // could be made available here but probably obsolete with use of modern 313 // memory leak checker tools 314 #define _dbgct(me) 315 316 //---------------------------------------------------------------- 317 // Constructors &c 318 //---------------------------------------------------------------- 319 320 /** 321 * Constructs a set from the given pattern, optionally ignoring 322 * white space. See the class description for the syntax of the 323 * pattern language. 
324 * @param pattern a string specifying what characters are in the set 325 */ 326 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 327 UErrorCode& status) : 328 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 329 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 330 fFlags(0) 331 { 332 if(U_SUCCESS(status)){ 333 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 334 /* test for NULL */ 335 if(list == NULL) { 336 status = U_MEMORY_ALLOCATION_ERROR; 337 }else{ 338 allocateStrings(status); 339 applyPattern(pattern, status); 340 } 341 } 342 _dbgct(this); 343 } 344 345 //---------------------------------------------------------------- 346 // Public API 347 //---------------------------------------------------------------- 348 349 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 350 UErrorCode& status) { 351 // Equivalent to 352 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 353 // but without dependency on closeOver(). 354 ParsePosition pos(0); 355 applyPatternIgnoreSpace(pattern, pos, NULL, status); 356 if (U_FAILURE(status)) return *this; 357 358 int32_t i = pos.getIndex(); 359 // Skip over trailing whitespace 360 ICU_Utility::skipWhitespace(pattern, i, TRUE); 361 if (i != pattern.length()) { 362 status = U_ILLEGAL_ARGUMENT_ERROR; 363 } 364 return *this; 365 } 366 367 void 368 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, 369 ParsePosition& pos, 370 const SymbolTable* symbols, 371 UErrorCode& status) { 372 if (U_FAILURE(status)) { 373 return; 374 } 375 if (isFrozen()) { 376 status = U_NO_WRITE_PERMISSION; 377 return; 378 } 379 // Need to build the pattern in a temporary string because 380 // _applyPattern calls add() etc., which set pat to empty. 
381 UnicodeString rebuiltPat; 382 RuleCharacterIterator chars(pattern, symbols, pos); 383 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); 384 if (U_FAILURE(status)) return; 385 if (chars.inVariable()) { 386 // syntaxError(chars, "Extra chars in variable value"); 387 status = U_MALFORMED_SET; 388 return; 389 } 390 setPattern(rebuiltPat); 391 } 392 393 /** 394 * Return true if the given position, in the given pattern, appears 395 * to be the start of a UnicodeSet pattern. 396 */ 397 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 398 return ((pos+1) < pattern.length() && 399 pattern.charAt(pos) == (UChar)91/*[*/) || 400 resemblesPropertyPattern(pattern, pos); 401 } 402 403 //---------------------------------------------------------------- 404 // Implementation: Pattern parsing 405 //---------------------------------------------------------------- 406 407 /** 408 * A small all-inline class to manage a UnicodeSet pointer. Add 409 * operator->() etc. as needed. 410 */ 411 class UnicodeSetPointer { 412 UnicodeSet* p; 413 public: 414 inline UnicodeSetPointer() : p(0) {} 415 inline ~UnicodeSetPointer() { delete p; } 416 inline UnicodeSet* pointer() { return p; } 417 inline UBool allocate() { 418 if (p == 0) { 419 p = new UnicodeSet(); 420 } 421 return p != 0; 422 } 423 }; 424 425 /** 426 * Parse the pattern from the given RuleCharacterIterator. The 427 * iterator is advanced over the parsed pattern. 428 * @param chars iterator over the pattern characters. Upon return 429 * it will be advanced to the first character after the parsed 430 * pattern, or the end of the iteration if all characters are 431 * parsed. 432 * @param symbols symbol table to use to parse and dereference 433 * variables, or null if none. 434 * @param rebuiltPat the pattern that was parsed, rebuilt or 435 * copied from the input pattern, as appropriate. 436 * @param options a bit mask of zero or more of the following: 437 * IGNORE_SPACE, CASE. 
 */
// Additional notes on the parser below:
//  - caseClosure is invoked through a member-function pointer only when
//    options contains USET_CASE_INSENSITIVE or USET_ADD_CASE_MAPPINGS; there
//    is no NULL check on that path.  applyPatternIgnoreSpace() passes NULL
//    together with USET_IGNORE_SPACE only.
//  - Syntax errors are reported as U_MALFORMED_SET; a failed scratch-set
//    allocation or a bogus result set is reported as
//    U_MEMORY_ALLOCATION_ERROR.
//  - The set is clear()ed before parsing starts, so on a syntax error it
//    holds whatever had been added up to that point.
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    // patLocal accumulates the rebuilt pattern text; buf holds the contents
    // of a {multi-character string} while it is being read.
    UnicodeString patLocal, buf;
    // usePat: TRUE once patLocal (rather than a generated pattern) must be
    // handed back in rebuiltPat.
    UBool usePat = FALSE;
    // Lazily allocated set used for nested sets and property patterns.
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    // Pending binary operator: 0, HYPHEN or INTERSECTION.
    UChar op = 0;

    UBool invert = FALSE;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
                                    op == INTERSECTION /*'&'*/)));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == 0x5B /*'['*/ && !literal) {
                if (mode == 1) {
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append((UChar) 0x5B /*'['*/);
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == 0x5E /*'^'*/ && !literal) {
                        invert = TRUE;
                        patLocal.append((UChar) 0x5E /*'^'*/);
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == HYPHEN /*'-'*/) {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == NULL) {
                        // The stand-in resolves to a matcher that is not a UnicodeSet.
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            // Flush a pending single character before the set operand.
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                // NOTE(review): ec is not checked directly after this recursive
                // call; confirm that the checks later in the loop are sufficient.
                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
                break;
            case 2:
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            // Combine the nested set with what has been accumulated so far.
            switch (op) {
            case HYPHEN: /*'-'*/
                removeAll(*nested);
                break;
            case INTERSECTION: /*'&'*/
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case 0x5D /*']'*/:
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == HYPHEN /*'-'*/) {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == INTERSECTION /*'&'*/) {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append((UChar) 0x5D /*']'*/);
                mode = 2;
                continue;
            case HYPHEN /*'-'*/:
                if (op == 0) {
                    if (lastItem != 0) {
                        op = (UChar) c;
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x5D /*']'*/ && !literal) {
                            patLocal.append(HYPHEN_RIGHT_BRACE, 2);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case INTERSECTION /*'&'*/:
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case 0x5E /*'^'*/:
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case 0x7B /*'{'*/:
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect everything up to the unescaped closing '}'.
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x7D /*'}'*/ && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (buf.length() < 1 || !ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append((UChar) 0x7B /*'{'*/);
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append((UChar) 0x7D /*'}'*/);
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append((UChar) 0x5D /*']'*/);
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == HYPHEN /*'-'*/) {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        (this->*caseClosure)(USET_CASE_INSENSITIVE);
    }
    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement();
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}

//----------------------------------------------------------------
// Property set implementation
//----------------------------------------------------------------

// Filter callbacks for applyFilter().  Each receives a code point and an
// opaque context pointer and returns whether the code point belongs in the
// set being built.

// context points to a double; matches code points whose numeric value is
// exactly equal to it.
static UBool numericValueFilter(UChar32 ch, void* context) {
    return u_getNumericValue(ch) == *(double*)context;
}

// context points to an int32_t general-category bit mask; matches code
// points whose category mask shares at least one bit with it.
static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    int32_t value = *(int32_t*)context;
    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
}

// context points to a UVersionInfo; matches code points whose age compares
// (bytewise, via uprv_memcmp) greater than all-zero and not greater than
// the given version.
static UBool versionFilter(UChar32 ch, void* context) {
    static const UVersionInfo none = { 0, 0, 0, 0 };
    UVersionInfo v;
    u_charAge(ch, v);
    UVersionInfo* version = (UVersionInfo*)context;
    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}

// Context for intPropertyFilter(): the property to query and the value to match.
typedef struct {
    UProperty prop;
    int32_t value;
} IntPropertyContext;

// Matches code points whose value for c->prop equals c->value.
static UBool intPropertyFilter(UChar32 ch, void* context) {
    IntPropertyContext* c = (IntPropertyContext*)context;
    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
}

// context points to a UScriptCode; the result is that of uscript_hasScript().
static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    return uscript_hasScript(ch, *(UScriptCode*)context);
}

/**
 * Generic filter-based scanning code for UCD property UnicodeSets.
 */
void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
                             void* context,
                             int32_t src,
                             UErrorCode &status) {
    if (U_FAILURE(status)) return;

    // Logically, walk through all Unicode characters, noting the start
    // and end of each range for which filter.contain(c) is
    // true.  Add each range to a set.
    //
    // To improve performance, use an inclusions set which
    // encodes information about character ranges that are known
    // to have identical properties.
    // getInclusions(src) contains exactly the first characters of
    // same-value ranges for the given properties "source".
887 const UnicodeSet* inclusions = getInclusions(src, status); 888 if (U_FAILURE(status)) { 889 return; 890 } 891 892 clear(); 893 894 UChar32 startHasProperty = -1; 895 int32_t limitRange = inclusions->getRangeCount(); 896 897 for (int j=0; j<limitRange; ++j) { 898 // get current range 899 UChar32 start = inclusions->getRangeStart(j); 900 UChar32 end = inclusions->getRangeEnd(j); 901 902 // for all the code points in the range, process 903 for (UChar32 ch = start; ch <= end; ++ch) { 904 // only add to this UnicodeSet on inflection points -- 905 // where the hasProperty value changes to false 906 if ((*filter)(ch, context)) { 907 if (startHasProperty < 0) { 908 startHasProperty = ch; 909 } 910 } else if (startHasProperty >= 0) { 911 add(startHasProperty, ch-1); 912 startHasProperty = -1; 913 } 914 } 915 } 916 if (startHasProperty >= 0) { 917 add((UChar32)startHasProperty, (UChar32)0x10FFFF); 918 } 919 if (isBogus() && U_SUCCESS(status)) { 920 // We likely ran out of memory. AHHH! 921 status = U_MEMORY_ALLOCATION_ERROR; 922 } 923 } 924 925 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 926 /* Note: we use ' ' in compiler code page */ 927 int32_t j = 0; 928 char ch; 929 --dstCapacity; /* make room for term. 
zero */ 930 while ((ch = *src++) != 0) { 931 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { 932 continue; 933 } 934 if (j >= dstCapacity) return FALSE; 935 dst[j++] = ch; 936 } 937 if (j > 0 && dst[j-1] == ' ') --j; 938 dst[j] = 0; 939 return TRUE; 940 } 941 942 //---------------------------------------------------------------- 943 // Property set API 944 //---------------------------------------------------------------- 945 946 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} 947 948 UnicodeSet& 949 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 950 if (U_FAILURE(ec) || isFrozen()) return *this; 951 952 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 953 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); 954 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { 955 UScriptCode script = (UScriptCode)value; 956 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); 957 } else { 958 IntPropertyContext c = {prop, value}; 959 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); 960 } 961 return *this; 962 } 963 964 UnicodeSet& 965 UnicodeSet::applyPropertyAlias(const UnicodeString& prop, 966 const UnicodeString& value, 967 UErrorCode& ec) { 968 if (U_FAILURE(ec) || isFrozen()) return *this; 969 970 // prop and value used to be converted to char * using the default 971 // converter instead of the invariant conversion. 972 // This should not be necessary because all Unicode property and value 973 // names use only invariant characters. 974 // If there are any variant characters, then we won't find them anyway. 975 // Checking first avoids assertion failures in the conversion. 
976 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 977 !uprv_isInvariantUString(value.getBuffer(), value.length()) 978 ) { 979 FAIL(ec); 980 } 981 CharString pname, vname; 982 pname.appendInvariantChars(prop, ec); 983 vname.appendInvariantChars(value, ec); 984 if (U_FAILURE(ec)) return *this; 985 986 UProperty p; 987 int32_t v; 988 UBool mustNotBeEmpty = FALSE, invert = FALSE; 989 990 if (value.length() > 0) { 991 p = u_getPropertyEnum(pname.data()); 992 if (p == UCHAR_INVALID_CODE) FAIL(ec); 993 994 // Treat gc as gcm 995 if (p == UCHAR_GENERAL_CATEGORY) { 996 p = UCHAR_GENERAL_CATEGORY_MASK; 997 } 998 999 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 1000 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 1001 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 1002 v = u_getPropertyValueEnum(p, vname.data()); 1003 if (v == UCHAR_INVALID_CODE) { 1004 // Handle numeric CCC 1005 if (p == UCHAR_CANONICAL_COMBINING_CLASS || 1006 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 1007 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 1008 char* end; 1009 double value = uprv_strtod(vname.data(), &end); 1010 v = (int32_t) value; 1011 if (v != value || v < 0 || *end != 0) { 1012 // non-integral or negative value, or trailing junk 1013 FAIL(ec); 1014 } 1015 // If the resultant set is empty then the numeric value 1016 // was invalid. 1017 mustNotBeEmpty = TRUE; 1018 } else { 1019 FAIL(ec); 1020 } 1021 } 1022 } 1023 1024 else { 1025 1026 switch (p) { 1027 case UCHAR_NUMERIC_VALUE: 1028 { 1029 char* end; 1030 double value = uprv_strtod(vname.data(), &end); 1031 if (*end != 0) { 1032 FAIL(ec); 1033 } 1034 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); 1035 return *this; 1036 } 1037 case UCHAR_NAME: 1038 { 1039 // Must munge name, since u_charFromName() does not do 1040 // 'loose' matching. 
1041 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 1042 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1043 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); 1044 if (U_SUCCESS(ec)) { 1045 clear(); 1046 add(ch); 1047 return *this; 1048 } else { 1049 FAIL(ec); 1050 } 1051 } 1052 case UCHAR_UNICODE_1_NAME: 1053 // ICU 49 deprecates the Unicode_1_Name property APIs. 1054 FAIL(ec); 1055 case UCHAR_AGE: 1056 { 1057 // Must munge name, since u_versionFromString() does not do 1058 // 'loose' matching. 1059 char buf[128]; 1060 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1061 UVersionInfo version; 1062 u_versionFromString(version, buf); 1063 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); 1064 return *this; 1065 } 1066 case UCHAR_SCRIPT_EXTENSIONS: 1067 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); 1068 if (v == UCHAR_INVALID_CODE) { 1069 FAIL(ec); 1070 } 1071 // fall through to calling applyIntPropertyValue() 1072 break; 1073 default: 1074 // p is a non-binary, non-enumerated property that we 1075 // don't support (yet). 1076 FAIL(ec); 1077 } 1078 } 1079 } 1080 1081 else { 1082 // value is empty. Interpret as General Category, Script, or 1083 // Binary property. 
1084 p = UCHAR_GENERAL_CATEGORY_MASK; 1085 v = u_getPropertyValueEnum(p, pname.data()); 1086 if (v == UCHAR_INVALID_CODE) { 1087 p = UCHAR_SCRIPT; 1088 v = u_getPropertyValueEnum(p, pname.data()); 1089 if (v == UCHAR_INVALID_CODE) { 1090 p = u_getPropertyEnum(pname.data()); 1091 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 1092 v = 1; 1093 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { 1094 set(MIN_VALUE, MAX_VALUE); 1095 return *this; 1096 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { 1097 set(0, 0x7F); 1098 return *this; 1099 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { 1100 // [:Assigned:]=[:^Cn:] 1101 p = UCHAR_GENERAL_CATEGORY_MASK; 1102 v = U_GC_CN_MASK; 1103 invert = TRUE; 1104 } else { 1105 FAIL(ec); 1106 } 1107 } 1108 } 1109 } 1110 1111 applyIntPropertyValue(p, v, ec); 1112 if(invert) { 1113 complement(); 1114 } 1115 1116 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { 1117 // mustNotBeEmpty is set to true if an empty set indicates 1118 // invalid input. 1119 ec = U_ILLEGAL_ARGUMENT_ERROR; 1120 } 1121 1122 if (isBogus() && U_SUCCESS(ec)) { 1123 // We likely ran out of memory. AHHH! 1124 ec = U_MEMORY_ALLOCATION_ERROR; 1125 } 1126 return *this; 1127 } 1128 1129 //---------------------------------------------------------------- 1130 // Property set patterns 1131 //---------------------------------------------------------------- 1132 1133 /** 1134 * Return true if the given position, in the given pattern, appears 1135 * to be the start of a property set pattern. 
1136 */ 1137 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 1138 int32_t pos) { 1139 // Patterns are at least 5 characters long 1140 if ((pos+5) > pattern.length()) { 1141 return FALSE; 1142 } 1143 1144 // Look for an opening [:, [:^, \p, or \P 1145 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 1146 } 1147 1148 /** 1149 * Return true if the given iterator appears to point at a 1150 * property pattern. Regardless of the result, return with the 1151 * iterator unchanged. 1152 * @param chars iterator over the pattern characters. Upon return 1153 * it will be unchanged. 1154 * @param iterOpts RuleCharacterIterator options 1155 */ 1156 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 1157 int32_t iterOpts) { 1158 // NOTE: literal will always be FALSE, because we don't parse escapes. 1159 UBool result = FALSE, literal; 1160 UErrorCode ec = U_ZERO_ERROR; 1161 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 1162 RuleCharacterIterator::Pos pos; 1163 chars.getPos(pos); 1164 UChar32 c = chars.next(iterOpts, literal, ec); 1165 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { 1166 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 1167 literal, ec); 1168 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : 1169 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); 1170 } 1171 chars.setPos(pos); 1172 return result && U_SUCCESS(ec); 1173 } 1174 1175 /** 1176 * Parse the given property pattern at the given parse position. 1177 */ 1178 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1179 ParsePosition& ppos, 1180 UErrorCode &ec) { 1181 int32_t pos = ppos.getIndex(); 1182 1183 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1184 UBool isName = FALSE; // true for \N{pat}, o/w false 1185 UBool invert = FALSE; 1186 1187 if (U_FAILURE(ec)) return *this; 1188 1189 // Minimum length is 5 characters, e.g. 
\p{L} 1190 if ((pos+5) > pattern.length()) { 1191 FAIL(ec); 1192 } 1193 1194 // On entry, ppos should point to one of the following locations: 1195 // Look for an opening [:, [:^, \p, or \P 1196 if (isPOSIXOpen(pattern, pos)) { 1197 posix = TRUE; 1198 pos += 2; 1199 pos = ICU_Utility::skipWhitespace(pattern, pos); 1200 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1201 ++pos; 1202 invert = TRUE; 1203 } 1204 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1205 UChar c = pattern.charAt(pos+1); 1206 invert = (c == UPPER_P); 1207 isName = (c == UPPER_N); 1208 pos += 2; 1209 pos = ICU_Utility::skipWhitespace(pattern, pos); 1210 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1211 // Syntax error; "\p" or "\P" not followed by "{" 1212 FAIL(ec); 1213 } 1214 } else { 1215 // Open delimiter not seen 1216 FAIL(ec); 1217 } 1218 1219 // Look for the matching close delimiter, either :] or } 1220 int32_t close; 1221 if (posix) { 1222 close = pattern.indexOf(POSIX_CLOSE, 2, pos); 1223 } else { 1224 close = pattern.indexOf(CLOSE_BRACE, pos); 1225 } 1226 if (close < 0) { 1227 // Syntax error; close delimiter missing 1228 FAIL(ec); 1229 } 1230 1231 // Look for an '=' sign. If this is present, we will parse a 1232 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1233 // pattern. 1234 int32_t equals = pattern.indexOf(EQUALS, pos); 1235 UnicodeString propName, valueName; 1236 if (equals >= 0 && equals < close && !isName) { 1237 // Equals seen; parse medium/long pattern 1238 pattern.extractBetween(pos, equals, propName); 1239 pattern.extractBetween(equals+1, close, valueName); 1240 } 1241 1242 else { 1243 // Handle case where no '=' is seen, and \N{} 1244 pattern.extractBetween(pos, close, propName); 1245 1246 // Handle \N{name} 1247 if (isName) { 1248 // This is a little inefficient since it means we have to 1249 // parse NAME_PROP back to UCHAR_NAME even though we already 1250 // know it's UCHAR_NAME. 
If we refactor the API to 1251 // support args of (UProperty, char*) then we can remove 1252 // NAME_PROP and make this a little more efficient. 1253 valueName = propName; 1254 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1255 } 1256 } 1257 1258 applyPropertyAlias(propName, valueName, ec); 1259 1260 if (U_SUCCESS(ec)) { 1261 if (invert) { 1262 complement(); 1263 } 1264 1265 // Move to the limit position after the close delimiter if the 1266 // parse succeeded. 1267 ppos.setIndex(close + (posix ? 2 : 1)); 1268 } 1269 1270 return *this; 1271 } 1272 1273 /** 1274 * Parse a property pattern. 1275 * @param chars iterator over the pattern characters. Upon return 1276 * it will be advanced to the first character after the parsed 1277 * pattern, or the end of the iteration if all characters are 1278 * parsed. 1279 * @param rebuiltPat the pattern that was parsed, rebuilt or 1280 * copied from the input pattern, as appropriate. 1281 */ 1282 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1283 UnicodeString& rebuiltPat, 1284 UErrorCode& ec) { 1285 if (U_FAILURE(ec)) return; 1286 UnicodeString pattern; 1287 chars.lookahead(pattern); 1288 ParsePosition pos(0); 1289 applyPropertyPattern(pattern, pos, ec); 1290 if (U_FAILURE(ec)) return; 1291 if (pos.getIndex() == 0) { 1292 // syntaxError(chars, "Invalid property pattern"); 1293 ec = U_MALFORMED_SET; 1294 return; 1295 } 1296 chars.jumpahead(pos.getIndex()); 1297 rebuiltPat.append(pattern, 0, pos.getIndex()); 1298 } 1299 1300 U_NAMESPACE_END 1301