1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 10/20/99 alan Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 #include "unicode/parsepos.h" 13 #include "unicode/symtable.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utf8.h" 16 #include "unicode/utf16.h" 17 #include "ruleiter.h" 18 #include "cmemory.h" 19 #include "cstring.h" 20 #include "patternprops.h" 21 #include "uelement.h" 22 #include "util.h" 23 #include "uvector.h" 24 #include "charstr.h" 25 #include "ustrfmt.h" 26 #include "uassert.h" 27 #include "bmpset.h" 28 #include "unisetspan.h" 29 30 // Define UChar constants using hex for EBCDIC compatibility 31 // Used #define to reduce private static exports and memory access time. 32 #define SET_OPEN ((UChar)0x005B) /*[*/ 33 #define SET_CLOSE ((UChar)0x005D) /*]*/ 34 #define HYPHEN ((UChar)0x002D) /*-*/ 35 #define COMPLEMENT ((UChar)0x005E) /*^*/ 36 #define COLON ((UChar)0x003A) /*:*/ 37 #define BACKSLASH ((UChar)0x005C) /*\*/ 38 #define INTERSECTION ((UChar)0x0026) /*&*/ 39 #define UPPER_U ((UChar)0x0055) /*U*/ 40 #define LOWER_U ((UChar)0x0075) /*u*/ 41 #define OPEN_BRACE ((UChar)123) /*{*/ 42 #define CLOSE_BRACE ((UChar)125) /*}*/ 43 #define UPPER_P ((UChar)0x0050) /*P*/ 44 #define LOWER_P ((UChar)0x0070) /*p*/ 45 #define UPPER_N ((UChar)78) /*N*/ 46 #define EQUALS ((UChar)0x003D) /*=*/ 47 48 // HIGH_VALUE > all valid values. 110000 for codepoints 49 #define UNICODESET_HIGH 0x0110000 50 51 // LOW <= all valid values. ZERO for codepoints 52 #define UNICODESET_LOW 0x000000 53 54 // initial storage. Must be >= 0 55 #define START_EXTRA 16 56 57 // extra amount for growth. Must be >= 0 58 #define GROW_EXTRA START_EXTRA 59 60 U_NAMESPACE_BEGIN 61 62 SymbolTable::~SymbolTable() {} 63 64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) 65 66 /** 67 * Modify the given UChar32 variable so that it is in range, by 68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and 69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. 70 * It modifies its argument in-place and also returns it. 71 */ 72 static inline UChar32 pinCodePoint(UChar32& c) { 73 if (c < UNICODESET_LOW) { 74 c = UNICODESET_LOW; 75 } else if (c > (UNICODESET_HIGH-1)) { 76 c = (UNICODESET_HIGH-1); 77 } 78 return c; 79 } 80 81 //---------------------------------------------------------------- 82 // Debugging 83 //---------------------------------------------------------------- 84 85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks. 86 // To enable the debugging, define the symbol DEBUG_MEM in the line 87 // below. This will result in text being sent to stdout that looks 88 // like this: 89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- 90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- 91 // Each line lists a construction (ct) or destruction (dt) event, the 92 // object address, the number of outstanding objects after the event, 93 // and the pattern of the object in question. 94 95 // #define DEBUG_MEM 96 97 #ifdef DEBUG_MEM 98 #include <stdio.h> 99 static int32_t _dbgCount = 0; 100 101 static inline void _dbgct(UnicodeSet* set) { 102 UnicodeString str; 103 set->toPattern(str, TRUE); 104 char buf[40]; 105 str.extract(0, 39, buf, ""); 106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); 107 } 108 109 static inline void _dbgdt(UnicodeSet* set) { 110 UnicodeString str; 111 set->toPattern(str, TRUE); 112 char buf[40]; 113 str.extract(0, 39, buf, ""); 114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); 115 } 116 117 #else 118 119 #define _dbgct(set) 120 #define _dbgdt(set) 121 122 #endif 123 124 //---------------------------------------------------------------- 125 // UnicodeString in UVector support 126 //---------------------------------------------------------------- 127 128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { 129 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); 130 } 131 132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 133 const UnicodeString &a = *(const UnicodeString*)t1.pointer; 134 const UnicodeString &b = *(const UnicodeString*)t2.pointer; 135 return a.compare(b); 136 } 137 138 //---------------------------------------------------------------- 139 // Constructors &c 140 //---------------------------------------------------------------- 141 142 /** 143 * Constructs an empty set. 144 */ 145 UnicodeSet::UnicodeSet() : 146 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 147 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 148 fFlags(0) 149 { 150 UErrorCode status = U_ZERO_ERROR; 151 allocateStrings(status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 156 if(list!=NULL){ 157 list[0] = UNICODESET_HIGH; 158 } else { // If memory allocation failed, set to bogus state. 159 setToBogus(); 160 return; 161 } 162 _dbgct(this); 163 } 164 165 /** 166 * Constructs a set containing the given range. If <code>end > 167 * start</code> then an empty set is created. 168 * 169 * @param start first character, inclusive, of range 170 * @param end last character, inclusive, of range 171 */ 172 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : 173 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 174 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 175 fFlags(0) 176 { 177 UErrorCode status = U_ZERO_ERROR; 178 allocateStrings(status); 179 if (U_FAILURE(status)) { 180 return; 181 } 182 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 183 if(list!=NULL){ 184 list[0] = UNICODESET_HIGH; 185 complement(start, end); 186 } else { // If memory allocation failed, set to bogus state. 187 setToBogus(); 188 return; 189 } 190 _dbgct(this); 191 } 192 193 /** 194 * Constructs a set that is identical to the given UnicodeSet. 195 */ 196 UnicodeSet::UnicodeSet(const UnicodeSet& o) : 197 UnicodeFilter(o), 198 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), 199 bmpSet(0), 200 buffer(0), bufferCapacity(0), 201 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 202 fFlags(0) 203 { 204 UErrorCode status = U_ZERO_ERROR; 205 allocateStrings(status); 206 if (U_FAILURE(status)) { 207 return; 208 } 209 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 210 if(list!=NULL){ 211 *this = o; 212 } else { // If memory allocation failed, set to bogus state. 213 setToBogus(); 214 return; 215 } 216 _dbgct(this); 217 } 218 219 // Copy-construct as thawed. 220 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : 221 UnicodeFilter(o), 222 len(0), capacity(o.len + GROW_EXTRA), list(0), 223 bmpSet(0), 224 buffer(0), bufferCapacity(0), 225 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 226 fFlags(0) 227 { 228 UErrorCode status = U_ZERO_ERROR; 229 allocateStrings(status); 230 if (U_FAILURE(status)) { 231 return; 232 } 233 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 234 if(list!=NULL){ 235 // *this = o except for bmpSet and stringSpan 236 len = o.len; 237 uprv_memcpy(list, o.list, len*sizeof(UChar32)); 238 if (strings != NULL && o.strings != NULL) { 239 strings->assign(*o.strings, cloneUnicodeString, status); 240 } else { // Invalid strings. 241 setToBogus(); 242 return; 243 } 244 if (o.pat) { 245 setPattern(UnicodeString(o.pat, o.patLen)); 246 } 247 } else { // If memory allocation failed, set to bogus state. 248 setToBogus(); 249 return; 250 } 251 _dbgct(this); 252 } 253 254 /** 255 * Destructs the set. 256 */ 257 UnicodeSet::~UnicodeSet() { 258 _dbgdt(this); // first! 259 uprv_free(list); 260 delete bmpSet; 261 if (buffer) { 262 uprv_free(buffer); 263 } 264 delete strings; 265 delete stringSpan; 266 releasePattern(); 267 } 268 269 /** 270 * Assigns this object to be a copy of another. 271 */ 272 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { 273 if (this == &o) { 274 return *this; 275 } 276 if (isFrozen()) { 277 return *this; 278 } 279 if (o.isBogus()) { 280 setToBogus(); 281 return *this; 282 } 283 UErrorCode ec = U_ZERO_ERROR; 284 ensureCapacity(o.len, ec); 285 if (U_FAILURE(ec)) { 286 return *this; // There is no way to report this error :-( 287 } 288 len = o.len; 289 uprv_memcpy(list, o.list, len*sizeof(UChar32)); 290 if (o.bmpSet == NULL) { 291 bmpSet = NULL; 292 } else { 293 bmpSet = new BMPSet(*o.bmpSet, list, len); 294 if (bmpSet == NULL) { // Check for memory allocation error. 295 setToBogus(); 296 return *this; 297 } 298 } 299 if (strings != NULL && o.strings != NULL) { 300 strings->assign(*o.strings, cloneUnicodeString, ec); 301 } else { // Invalid strings. 302 setToBogus(); 303 return *this; 304 } 305 if (o.stringSpan == NULL) { 306 stringSpan = NULL; 307 } else { 308 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); 309 if (stringSpan == NULL) { // Check for memory allocation error. 310 setToBogus(); 311 return *this; 312 } 313 } 314 releasePattern(); 315 if (o.pat) { 316 setPattern(UnicodeString(o.pat, o.patLen)); 317 } 318 return *this; 319 } 320 321 /** 322 * Returns a copy of this object. All UnicodeMatcher objects have 323 * to support cloning in order to allow classes using 324 * UnicodeMatchers, such as Transliterator, to implement cloning. 325 */ 326 UnicodeFunctor* UnicodeSet::clone() const { 327 return new UnicodeSet(*this); 328 } 329 330 UnicodeFunctor *UnicodeSet::cloneAsThawed() const { 331 return new UnicodeSet(*this, TRUE); 332 } 333 334 /** 335 * Compares the specified object with this set for equality. Returns 336 * <tt>true</tt> if the two sets 337 * have the same size, and every member of the specified set is 338 * contained in this set (or equivalently, every member of this set is 339 * contained in the specified set). 340 * 341 * @param o set to be compared for equality with this set. 342 * @return <tt>true</tt> if the specified set is equal to this set. 343 */ 344 UBool UnicodeSet::operator==(const UnicodeSet& o) const { 345 if (len != o.len) return FALSE; 346 for (int32_t i = 0; i < len; ++i) { 347 if (list[i] != o.list[i]) return FALSE; 348 } 349 if (*strings != *o.strings) return FALSE; 350 return TRUE; 351 } 352 353 /** 354 * Returns the hash code value for this set. 355 * 356 * @return the hash code value for this set. 357 * @see Object#hashCode() 358 */ 359 int32_t UnicodeSet::hashCode(void) const { 360 int32_t result = len; 361 for (int32_t i = 0; i < len; ++i) { 362 result *= 1000003; 363 result += list[i]; 364 } 365 return result; 366 } 367 368 //---------------------------------------------------------------- 369 // Public API 370 //---------------------------------------------------------------- 371 372 /** 373 * Returns the number of elements in this set (its cardinality), 374 * Note than the elements of a set may include both individual 375 * codepoints and strings. 376 * 377 * @return the number of elements in this set (its cardinality). 378 */ 379 int32_t UnicodeSet::size(void) const { 380 int32_t n = 0; 381 int32_t count = getRangeCount(); 382 for (int32_t i = 0; i < count; ++i) { 383 n += getRangeEnd(i) - getRangeStart(i) + 1; 384 } 385 return n + strings->size(); 386 } 387 388 /** 389 * Returns <tt>true</tt> if this set contains no elements. 390 * 391 * @return <tt>true</tt> if this set contains no elements. 392 */ 393 UBool UnicodeSet::isEmpty(void) const { 394 return len == 1 && strings->size() == 0; 395 } 396 397 /** 398 * Returns true if this set contains the given character. 399 * @param c character to be checked for containment 400 * @return true if the test condition is met 401 */ 402 UBool UnicodeSet::contains(UChar32 c) const { 403 // Set i to the index of the start item greater than ch 404 // We know we will terminate without length test! 405 // LATER: for large sets, add binary search 406 //int32_t i = -1; 407 //for (;;) { 408 // if (c < list[++i]) break; 409 //} 410 if (bmpSet != NULL) { 411 return bmpSet->contains(c); 412 } 413 if (stringSpan != NULL) { 414 return stringSpan->contains(c); 415 } 416 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound 417 return FALSE; 418 } 419 int32_t i = findCodePoint(c); 420 return (UBool)(i & 1); // return true if odd 421 } 422 423 /** 424 * Returns the smallest value i such that c < list[i]. Caller 425 * must ensure that c is a legal value or this method will enter 426 * an infinite loop. This method performs a binary search. 427 * @param c a character in the range MIN_VALUE..MAX_VALUE 428 * inclusive 429 * @return the smallest integer i in the range 0..len-1, 430 * inclusive, such that c < list[i] 431 */ 432 int32_t UnicodeSet::findCodePoint(UChar32 c) const { 433 /* Examples: 434 findCodePoint(c) 435 set list[] c=0 1 3 4 7 8 436 === ============== =========== 437 [] [110000] 0 0 0 0 0 0 438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 440 [:Any:] [0, 110000] 1 1 1 1 1 1 441 */ 442 443 // Return the smallest i such that c < list[i]. Assume 444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 445 if (c < list[0]) 446 return 0; 447 // High runner test. c is often after the last range, so an 448 // initial check for this condition pays off. 449 int32_t lo = 0; 450 int32_t hi = len - 1; 451 if (lo >= hi || c >= list[hi-1]) 452 return hi; 453 // invariant: c >= list[lo] 454 // invariant: c < list[hi] 455 for (;;) { 456 int32_t i = (lo + hi) >> 1; 457 if (i == lo) { 458 break; // Found! 459 } else if (c < list[i]) { 460 hi = i; 461 } else { 462 lo = i; 463 } 464 } 465 return hi; 466 } 467 468 /** 469 * Returns true if this set contains every character 470 * of the given range. 471 * @param start first character, inclusive, of the range 472 * @param end last character, inclusive, of the range 473 * @return true if the test condition is met 474 */ 475 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { 476 //int32_t i = -1; 477 //for (;;) { 478 // if (start < list[++i]) break; 479 //} 480 int32_t i = findCodePoint(start); 481 return ((i & 1) != 0 && end < list[i]); 482 } 483 484 /** 485 * Returns <tt>true</tt> if this set contains the given 486 * multicharacter string. 487 * @param s string to be checked for containment 488 * @return <tt>true</tt> if this set contains the specified string 489 */ 490 UBool UnicodeSet::contains(const UnicodeString& s) const { 491 if (s.length() == 0) return FALSE; 492 int32_t cp = getSingleCP(s); 493 if (cp < 0) { 494 return strings->contains((void*) &s); 495 } else { 496 return contains((UChar32) cp); 497 } 498 } 499 500 /** 501 * Returns true if this set contains all the characters and strings 502 * of the given set. 503 * @param c set to be checked for containment 504 * @return true if the test condition is met 505 */ 506 UBool UnicodeSet::containsAll(const UnicodeSet& c) const { 507 // The specified set is a subset if all of its pairs are contained in 508 // this set. It's possible to code this more efficiently in terms of 509 // direct manipulation of the inversion lists if the need arises. 510 int32_t n = c.getRangeCount(); 511 for (int i=0; i<n; ++i) { 512 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 513 return FALSE; 514 } 515 } 516 if (!strings->containsAll(*c.strings)) return FALSE; 517 return TRUE; 518 } 519 520 /** 521 * Returns true if this set contains all the characters 522 * of the given string. 523 * @param s string containing characters to be checked for containment 524 * @return true if the test condition is met 525 */ 526 UBool UnicodeSet::containsAll(const UnicodeString& s) const { 527 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == 528 s.length()); 529 } 530 531 /** 532 * Returns true if this set contains none of the characters 533 * of the given range. 534 * @param start first character, inclusive, of the range 535 * @param end last character, inclusive, of the range 536 * @return true if the test condition is met 537 */ 538 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { 539 //int32_t i = -1; 540 //for (;;) { 541 // if (start < list[++i]) break; 542 //} 543 int32_t i = findCodePoint(start); 544 return ((i & 1) == 0 && end < list[i]); 545 } 546 547 /** 548 * Returns true if this set contains none of the characters and strings 549 * of the given set. 550 * @param c set to be checked for containment 551 * @return true if the test condition is met 552 */ 553 UBool UnicodeSet::containsNone(const UnicodeSet& c) const { 554 // The specified set is a subset if all of its pairs are contained in 555 // this set. It's possible to code this more efficiently in terms of 556 // direct manipulation of the inversion lists if the need arises. 557 int32_t n = c.getRangeCount(); 558 for (int32_t i=0; i<n; ++i) { 559 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 560 return FALSE; 561 } 562 } 563 if (!strings->containsNone(*c.strings)) return FALSE; 564 return TRUE; 565 } 566 567 /** 568 * Returns true if this set contains none of the characters 569 * of the given string. 570 * @param s string containing characters to be checked for containment 571 * @return true if the test condition is met 572 */ 573 UBool UnicodeSet::containsNone(const UnicodeString& s) const { 574 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == 575 s.length()); 576 } 577 578 /** 579 * Returns <tt>true</tt> if this set contains any character whose low byte 580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 581 * indexing. 582 */ 583 UBool UnicodeSet::matchesIndexValue(uint8_t v) const { 584 /* The index value v, in the range [0,255], is contained in this set if 585 * it is contained in any pair of this set. Pairs either have the high 586 * bytes equal, or unequal. If the high bytes are equal, then we have 587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 589 * Then v is contained if xx <= v || v <= yy. (This is identical to the 590 * time zone month containment logic.) 591 */ 592 int32_t i; 593 int32_t rangeCount=getRangeCount(); 594 for (i=0; i<rangeCount; ++i) { 595 UChar32 low = getRangeStart(i); 596 UChar32 high = getRangeEnd(i); 597 if ((low & ~0xFF) == (high & ~0xFF)) { 598 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 599 return TRUE; 600 } 601 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 602 return TRUE; 603 } 604 } 605 if (strings->size() != 0) { 606 for (i=0; i<strings->size(); ++i) { 607 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); 608 //if (s.length() == 0) { 609 // // Empty strings match everything 610 // return TRUE; 611 //} 612 // assert(s.length() != 0); // We enforce this elsewhere 613 UChar32 c = s.char32At(0); 614 if ((c & 0xFF) == v) { 615 return TRUE; 616 } 617 } 618 } 619 return FALSE; 620 } 621 622 /** 623 * Implementation of UnicodeMatcher::matches(). Always matches the 624 * longest possible multichar string. 625 */ 626 UMatchDegree UnicodeSet::matches(const Replaceable& text, 627 int32_t& offset, 628 int32_t limit, 629 UBool incremental) { 630 if (offset == limit) { 631 // Strings, if any, have length != 0, so we don't worry 632 // about them here. If we ever allow zero-length strings 633 // we much check for them here. 634 if (contains(U_ETHER)) { 635 return incremental ? U_PARTIAL_MATCH : U_MATCH; 636 } else { 637 return U_MISMATCH; 638 } 639 } else { 640 if (strings->size() != 0) { // try strings first 641 642 // might separate forward and backward loops later 643 // for now they are combined 644 645 // TODO Improve efficiency of this, at least in the forward 646 // direction, if not in both. In the forward direction we 647 // can assume the strings are sorted. 648 649 int32_t i; 650 UBool forward = offset < limit; 651 652 // firstChar is the leftmost char to match in the 653 // forward direction or the rightmost char to match in 654 // the reverse direction. 655 UChar firstChar = text.charAt(offset); 656 657 // If there are multiple strings that can match we 658 // return the longest match. 659 int32_t highWaterLength = 0; 660 661 for (i=0; i<strings->size(); ++i) { 662 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); 663 664 //if (trial.length() == 0) { 665 // return U_MATCH; // null-string always matches 666 //} 667 // assert(trial.length() != 0); // We ensure this elsewhere 668 669 UChar c = trial.charAt(forward ? 0 : trial.length() - 1); 670 671 // Strings are sorted, so we can optimize in the 672 // forward direction. 673 if (forward && c > firstChar) break; 674 if (c != firstChar) continue; 675 676 int32_t matchLen = matchRest(text, offset, limit, trial); 677 678 if (incremental) { 679 int32_t maxLen = forward ? limit-offset : offset-limit; 680 if (matchLen == maxLen) { 681 // We have successfully matched but only up to limit. 682 return U_PARTIAL_MATCH; 683 } 684 } 685 686 if (matchLen == trial.length()) { 687 // We have successfully matched the whole string. 688 if (matchLen > highWaterLength) { 689 highWaterLength = matchLen; 690 } 691 // In the forward direction we know strings 692 // are sorted so we can bail early. 693 if (forward && matchLen < highWaterLength) { 694 break; 695 } 696 continue; 697 } 698 } 699 700 // We've checked all strings without a partial match. 701 // If we have full matches, return the longest one. 702 if (highWaterLength != 0) { 703 offset += forward ? highWaterLength : -highWaterLength; 704 return U_MATCH; 705 } 706 } 707 return UnicodeFilter::matches(text, offset, limit, incremental); 708 } 709 } 710 711 /** 712 * Returns the longest match for s in text at the given position. 713 * If limit > start then match forward from start+1 to limit 714 * matching all characters except s.charAt(0). If limit < start, 715 * go backward starting from start-1 matching all characters 716 * except s.charAt(s.length()-1). This method assumes that the 717 * first character, text.charAt(start), matches s, so it does not 718 * check it. 719 * @param text the text to match 720 * @param start the first character to match. In the forward 721 * direction, text.charAt(start) is matched against s.charAt(0). 722 * In the reverse direction, it is matched against 723 * s.charAt(s.length()-1). 724 * @param limit the limit offset for matching, either last+1 in 725 * the forward direction, or last-1 in the reverse direction, 726 * where last is the index of the last character to match. 727 * @return If part of s matches up to the limit, return |limit - 728 * start|. If all of s matches before reaching the limit, return 729 * s.length(). If there is a mismatch between s and text, return 730 * 0 731 */ 732 int32_t UnicodeSet::matchRest(const Replaceable& text, 733 int32_t start, int32_t limit, 734 const UnicodeString& s) { 735 int32_t i; 736 int32_t maxLen; 737 int32_t slen = s.length(); 738 if (start < limit) { 739 maxLen = limit - start; 740 if (maxLen > slen) maxLen = slen; 741 for (i = 1; i < maxLen; ++i) { 742 if (text.charAt(start + i) != s.charAt(i)) return 0; 743 } 744 } else { 745 maxLen = start - limit; 746 if (maxLen > slen) maxLen = slen; 747 --slen; // <=> slen = s.length() - 1; 748 for (i = 1; i < maxLen; ++i) { 749 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 750 } 751 } 752 return maxLen; 753 } 754 755 /** 756 * Implement of UnicodeMatcher 757 */ 758 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { 759 toUnionTo.addAll(*this); 760 } 761 762 /** 763 * Returns the index of the given character within this set, where 764 * the set is ordered by ascending code point. If the character 765 * is not in this set, return -1. The inverse of this method is 766 * <code>charAt()</code>. 767 * @return an index from 0..size()-1, or -1 768 */ 769 int32_t UnicodeSet::indexOf(UChar32 c) const { 770 if (c < MIN_VALUE || c > MAX_VALUE) { 771 return -1; 772 } 773 int32_t i = 0; 774 int32_t n = 0; 775 for (;;) { 776 UChar32 start = list[i++]; 777 if (c < start) { 778 return -1; 779 } 780 UChar32 limit = list[i++]; 781 if (c < limit) { 782 return n + c - start; 783 } 784 n += limit - start; 785 } 786 } 787 788 /** 789 * Returns the character at the given index within this set, where 790 * the set is ordered by ascending code point. If the index is 791 * out of range, return (UChar32)-1. The inverse of this method is 792 * <code>indexOf()</code>. 793 * @param index an index from 0..size()-1 794 * @return the character at the given index, or (UChar32)-1. 795 */ 796 UChar32 UnicodeSet::charAt(int32_t index) const { 797 if (index >= 0) { 798 // len2 is the largest even integer <= len, that is, it is len 799 // for even values and len-1 for odd values. With odd values 800 // the last entry is UNICODESET_HIGH. 801 int32_t len2 = len & ~1; 802 for (int32_t i=0; i < len2;) { 803 UChar32 start = list[i++]; 804 int32_t count = list[i++] - start; 805 if (index < count) { 806 return (UChar32)(start + index); 807 } 808 index -= count; 809 } 810 } 811 return (UChar32)-1; 812 } 813 814 /** 815 * Make this object represent the range <code>start - end</code>. 816 * If <code>end > start</code> then this object is set to an 817 * an empty range. 818 * 819 * @param start first character in the set, inclusive 820 * @rparam end last character in the set, inclusive 821 */ 822 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { 823 clear(); 824 complement(start, end); 825 return *this; 826 } 827 828 /** 829 * Adds the specified range to this set if it is not already 830 * present. If this set already contains the specified range, 831 * the call leaves this set unchanged. If <code>end > start</code> 832 * then an empty range is added, leaving the set unchanged. 833 * 834 * @param start first character, inclusive, of range to be added 835 * to this set. 836 * @param end last character, inclusive, of range to be added 837 * to this set. 838 */ 839 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { 840 if (pinCodePoint(start) < pinCodePoint(end)) { 841 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 842 add(range, 2, 0); 843 } else if (start == end) { 844 add(start); 845 } 846 return *this; 847 } 848 849 // #define DEBUG_US_ADD 850 851 #ifdef DEBUG_US_ADD 852 #include <stdio.h> 853 void dump(UChar32 c) { 854 if (c <= 0xFF) { 855 printf("%c", (char)c); 856 } else { 857 printf("U+%04X", c); 858 } 859 } 860 void dump(const UChar32* list, int32_t len) { 861 printf("["); 862 for (int32_t i=0; i<len; ++i) { 863 if (i != 0) printf(", "); 864 dump(list[i]); 865 } 866 printf("]"); 867 } 868 #endif 869 870 /** 871 * Adds the specified character to this set if it is not already 872 * present. If this set already contains the specified character, 873 * the call leaves this set unchanged. 874 */ 875 UnicodeSet& UnicodeSet::add(UChar32 c) { 876 // find smallest i such that c < list[i] 877 // if odd, then it is IN the set 878 // if even, then it is OUT of the set 879 int32_t i = findCodePoint(pinCodePoint(c)); 880 881 // already in set? 882 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; 883 884 // HIGH is 0x110000 885 // assert(list[len-1] == HIGH); 886 887 // empty = [HIGH] 888 // [start_0, limit_0, start_1, limit_1, HIGH] 889 890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 891 // ^ 892 // list[i] 893 894 // i == 0 means c is before the first range 895 896 #ifdef DEBUG_US_ADD 897 printf("Add of "); 898 dump(c); 899 printf(" found at %d", i); 900 printf(": "); 901 dump(list, len); 902 printf(" => "); 903 #endif 904 905 if (c == list[i]-1) { 906 // c is before start of next range 907 list[i] = c; 908 // if we touched the HIGH mark, then add a new one 909 if (c == (UNICODESET_HIGH - 1)) { 910 UErrorCode status = U_ZERO_ERROR; 911 ensureCapacity(len+1, status); 912 if (U_FAILURE(status)) { 913 return *this; // There is no way to report this error :-( 914 } 915 list[len++] = UNICODESET_HIGH; 916 } 917 if (i > 0 && c == list[i-1]) { 918 // collapse adjacent ranges 919 920 // [..., start_k-1, c, c, limit_k, ..., HIGH] 921 // ^ 922 // list[i] 923 924 //for (int32_t k=i-1; k<len-2; ++k) { 925 // list[k] = list[k+2]; 926 //} 927 UChar32* dst = list + i - 1; 928 UChar32* src = dst + 2; 929 UChar32* srclimit = list + len; 930 while (src < srclimit) *(dst++) = *(src++); 931 932 len -= 2; 933 } 934 } 935 936 else if (i > 0 && c == list[i-1]) { 937 // c is after end of prior range 938 list[i-1]++; 939 // no need to check for collapse here 940 } 941 942 else { 943 // At this point we know the new char is not adjacent to 944 // any existing ranges, and it is not 10FFFF. 945 946 947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 948 // ^ 949 // list[i] 950 951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 952 // ^ 953 // list[i] 954 955 UErrorCode status = U_ZERO_ERROR; 956 ensureCapacity(len+2, status); 957 if (U_FAILURE(status)) { 958 return *this; // There is no way to report this error :-( 959 } 960 961 //for (int32_t k=len-1; k>=i; --k) { 962 // list[k+2] = list[k]; 963 //} 964 UChar32* src = list + len; 965 UChar32* dst = src + 2; 966 UChar32* srclimit = list + i; 967 while (src > srclimit) *(--dst) = *(--src); 968 969 list[i] = c; 970 list[i+1] = c+1; 971 len += 2; 972 } 973 974 #ifdef DEBUG_US_ADD 975 dump(list, len); 976 printf("\n"); 977 978 for (i=1; i<len; ++i) { 979 if (list[i] <= list[i-1]) { 980 // Corrupt array! 981 printf("ERROR: list has been corrupted\n"); 982 exit(1); 983 } 984 } 985 #endif 986 987 releasePattern(); 988 return *this; 989 } 990 991 /** 992 * Adds the specified multicharacter to this set if it is not already 993 * present. If this set already contains the multicharacter, 994 * the call leaves this set unchanged. 995 * Thus "ch" => {"ch"} 996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 997 * @param s the source string 998 * @return the modified set, for chaining 999 */ 1000 UnicodeSet& UnicodeSet::add(const UnicodeString& s) { 1001 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1002 int32_t cp = getSingleCP(s); 1003 if (cp < 0) { 1004 if (!strings->contains((void*) &s)) { 1005 _add(s); 1006 releasePattern(); 1007 } 1008 } else { 1009 add((UChar32)cp); 1010 } 1011 return *this; 1012 } 1013 1014 /** 1015 * Adds the given string, in order, to 'strings'. The given string 1016 * must have been checked by the caller to not be empty and to not 1017 * already be in 'strings'. 1018 */ 1019 void UnicodeSet::_add(const UnicodeString& s) { 1020 if (isFrozen() || isBogus()) { 1021 return; 1022 } 1023 UnicodeString* t = new UnicodeString(s); 1024 if (t == NULL) { // Check for memory allocation error. 1025 setToBogus(); 1026 return; 1027 } 1028 UErrorCode ec = U_ZERO_ERROR; 1029 strings->sortedInsert(t, compareUnicodeString, ec); 1030 if (U_FAILURE(ec)) { 1031 setToBogus(); 1032 delete t; 1033 } 1034 } 1035 1036 /** 1037 * @return a code point IF the string consists of a single one. 1038 * otherwise returns -1. 1039 * @param string to test 1040 */ 1041 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { 1042 //if (s.length() < 1) { 1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1044 //} 1045 if (s.length() > 2) return -1; 1046 if (s.length() == 1) return s.charAt(0); 1047 1048 // at this point, len = 2 1049 UChar32 cp = s.char32At(0); 1050 if (cp > 0xFFFF) { // is surrogate pair 1051 return cp; 1052 } 1053 return -1; 1054 } 1055 1056 /** 1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1058 * If this set already any particular character, it has no effect on that character. 1059 * @param the source string 1060 * @return the modified set, for chaining 1061 */ 1062 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { 1063 UChar32 cp; 1064 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1065 cp = s.char32At(i); 1066 add(cp); 1067 } 1068 return *this; 1069 } 1070 1071 /** 1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1073 * If this set already any particular character, it has no effect on that character. 1074 * @param the source string 1075 * @return the modified set, for chaining 1076 */ 1077 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { 1078 UnicodeSet set; 1079 set.addAll(s); 1080 retainAll(set); 1081 return *this; 1082 } 1083 1084 /** 1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1086 * If this set already any particular character, it has no effect on that character. 1087 * @param the source string 1088 * @return the modified set, for chaining 1089 */ 1090 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { 1091 UnicodeSet set; 1092 set.addAll(s); 1093 complementAll(set); 1094 return *this; 1095 } 1096 1097 /** 1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1099 * If this set already any particular character, it has no effect on that character. 1100 * @param the source string 1101 * @return the modified set, for chaining 1102 */ 1103 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { 1104 UnicodeSet set; 1105 set.addAll(s); 1106 removeAll(set); 1107 return *this; 1108 } 1109 1110 UnicodeSet& UnicodeSet::removeAllStrings() { 1111 strings->removeAllElements(); 1112 return *this; 1113 } 1114 1115 1116 /** 1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1119 * @param the source string 1120 * @return a newly created set containing the given string 1121 */ 1122 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { 1123 UnicodeSet *set = new UnicodeSet(); 1124 if (set != NULL) { // Check for memory allocation error. 1125 set->add(s); 1126 } 1127 return set; 1128 } 1129 1130 1131 /** 1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1133 * @param the source string 1134 * @return a newly created set containing the given characters 1135 */ 1136 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { 1137 UnicodeSet *set = new UnicodeSet(); 1138 if (set != NULL) { // Check for memory allocation error. 1139 set->addAll(s); 1140 } 1141 return set; 1142 } 1143 1144 /** 1145 * Retain only the elements in this set that are contained in the 1146 * specified range. If <code>end > start</code> then an empty range is 1147 * retained, leaving the set empty. 1148 * 1149 * @param start first character, inclusive, of range to be retained 1150 * to this set. 1151 * @param end last character, inclusive, of range to be retained 1152 * to this set. 1153 */ 1154 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { 1155 if (pinCodePoint(start) <= pinCodePoint(end)) { 1156 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1157 retain(range, 2, 0); 1158 } else { 1159 clear(); 1160 } 1161 return *this; 1162 } 1163 1164 UnicodeSet& UnicodeSet::retain(UChar32 c) { 1165 return retain(c, c); 1166 } 1167 1168 /** 1169 * Removes the specified range from this set if it is present. 1170 * The set will not contain the specified range once the call 1171 * returns. If <code>end > start</code> then an empty range is 1172 * removed, leaving the set unchanged. 1173 * 1174 * @param start first character, inclusive, of range to be removed 1175 * from this set. 1176 * @param end last character, inclusive, of range to be removed 1177 * from this set. 1178 */ 1179 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { 1180 if (pinCodePoint(start) <= pinCodePoint(end)) { 1181 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1182 retain(range, 2, 2); 1183 } 1184 return *this; 1185 } 1186 1187 /** 1188 * Removes the specified character from this set if it is present. 1189 * The set will not contain the specified range once the call 1190 * returns. 1191 */ 1192 UnicodeSet& UnicodeSet::remove(UChar32 c) { 1193 return remove(c, c); 1194 } 1195 1196 /** 1197 * Removes the specified string from this set if it is present. 1198 * The set will not contain the specified character once the call 1199 * returns. 1200 * @param the source string 1201 * @return the modified set, for chaining 1202 */ 1203 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { 1204 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1205 int32_t cp = getSingleCP(s); 1206 if (cp < 0) { 1207 strings->removeElement((void*) &s); 1208 releasePattern(); 1209 } else { 1210 remove((UChar32)cp, (UChar32)cp); 1211 } 1212 return *this; 1213 } 1214 1215 /** 1216 * Complements the specified range in this set. Any character in 1217 * the range will be removed if it is in this set, or will be 1218 * added if it is not in this set. If <code>end > start</code> 1219 * then an empty range is xor'ed, leaving the set unchanged. 1220 * 1221 * @param start first character, inclusive, of range to be removed 1222 * from this set. 1223 * @param end last character, inclusive, of range to be removed 1224 * from this set. 1225 */ 1226 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { 1227 if (isFrozen() || isBogus()) { 1228 return *this; 1229 } 1230 if (pinCodePoint(start) <= pinCodePoint(end)) { 1231 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1232 exclusiveOr(range, 2, 0); 1233 } 1234 releasePattern(); 1235 return *this; 1236 } 1237 1238 UnicodeSet& UnicodeSet::complement(UChar32 c) { 1239 return complement(c, c); 1240 } 1241 1242 /** 1243 * This is equivalent to 1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1245 */ 1246 UnicodeSet& UnicodeSet::complement(void) { 1247 if (isFrozen() || isBogus()) { 1248 return *this; 1249 } 1250 UErrorCode status = U_ZERO_ERROR; 1251 if (list[0] == UNICODESET_LOW) { 1252 ensureBufferCapacity(len-1, status); 1253 if (U_FAILURE(status)) { 1254 return *this; 1255 } 1256 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); 1257 --len; 1258 } else { 1259 ensureBufferCapacity(len+1, status); 1260 if (U_FAILURE(status)) { 1261 return *this; 1262 } 1263 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); 1264 buffer[0] = UNICODESET_LOW; 1265 ++len; 1266 } 1267 swapBuffers(); 1268 releasePattern(); 1269 return *this; 1270 } 1271 1272 /** 1273 * Complement the specified string in this set. 1274 * The set will not contain the specified string once the call 1275 * returns. 1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1277 * @param s the string to complement 1278 * @return this object, for chaining 1279 */ 1280 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { 1281 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1282 int32_t cp = getSingleCP(s); 1283 if (cp < 0) { 1284 if (strings->contains((void*) &s)) { 1285 strings->removeElement((void*) &s); 1286 } else { 1287 _add(s); 1288 } 1289 releasePattern(); 1290 } else { 1291 complement((UChar32)cp, (UChar32)cp); 1292 } 1293 return *this; 1294 } 1295 1296 /** 1297 * Adds all of the elements in the specified set to this set if 1298 * they're not already present. This operation effectively 1299 * modifies this set so that its value is the <i>union</i> of the two 1300 * sets. The behavior of this operation is unspecified if the specified 1301 * collection is modified while the operation is in progress. 1302 * 1303 * @param c set whose elements are to be added to this set. 1304 * @see #add(char, char) 1305 */ 1306 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { 1307 if ( c.len>0 && c.list!=NULL ) { 1308 add(c.list, c.len, 0); 1309 } 1310 1311 // Add strings in order 1312 if ( c.strings!=NULL ) { 1313 for (int32_t i=0; i<c.strings->size(); ++i) { 1314 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); 1315 if (!strings->contains((void*) s)) { 1316 _add(*s); 1317 } 1318 } 1319 } 1320 return *this; 1321 } 1322 1323 /** 1324 * Retains only the elements in this set that are contained in the 1325 * specified set. In other words, removes from this set all of 1326 * its elements that are not contained in the specified set. This 1327 * operation effectively modifies this set so that its value is 1328 * the <i>intersection</i> of the two sets. 1329 * 1330 * @param c set that defines which elements this set will retain. 1331 */ 1332 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { 1333 if (isFrozen() || isBogus()) { 1334 return *this; 1335 } 1336 retain(c.list, c.len, 0); 1337 strings->retainAll(*c.strings); 1338 return *this; 1339 } 1340 1341 /** 1342 * Removes from this set all of its elements that are contained in the 1343 * specified set. This operation effectively modifies this 1344 * set so that its value is the <i>asymmetric set difference</i> of 1345 * the two sets. 1346 * 1347 * @param c set that defines which elements will be removed from 1348 * this set. 1349 */ 1350 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { 1351 if (isFrozen() || isBogus()) { 1352 return *this; 1353 } 1354 retain(c.list, c.len, 2); 1355 strings->removeAll(*c.strings); 1356 return *this; 1357 } 1358 1359 /** 1360 * Complements in this set all elements contained in the specified 1361 * set. Any character in the other set will be removed if it is 1362 * in this set, or will be added if it is not in this set. 1363 * 1364 * @param c set that defines which elements will be xor'ed from 1365 * this set. 1366 */ 1367 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { 1368 if (isFrozen() || isBogus()) { 1369 return *this; 1370 } 1371 exclusiveOr(c.list, c.len, 0); 1372 1373 for (int32_t i=0; i<c.strings->size(); ++i) { 1374 void* e = c.strings->elementAt(i); 1375 if (!strings->removeElement(e)) { 1376 _add(*(const UnicodeString*)e); 1377 } 1378 } 1379 return *this; 1380 } 1381 1382 /** 1383 * Removes all of the elements from this set. This set will be 1384 * empty after this call returns. 1385 */ 1386 UnicodeSet& UnicodeSet::clear(void) { 1387 if (isFrozen()) { 1388 return *this; 1389 } 1390 if (list != NULL) { 1391 list[0] = UNICODESET_HIGH; 1392 } 1393 len = 1; 1394 releasePattern(); 1395 if (strings != NULL) { 1396 strings->removeAllElements(); 1397 } 1398 if (list != NULL && strings != NULL) { 1399 // Remove bogus 1400 fFlags = 0; 1401 } 1402 return *this; 1403 } 1404 1405 /** 1406 * Iteration method that returns the number of ranges contained in 1407 * this set. 1408 * @see #getRangeStart 1409 * @see #getRangeEnd 1410 */ 1411 int32_t UnicodeSet::getRangeCount() const { 1412 return len/2; 1413 } 1414 1415 /** 1416 * Iteration method that returns the first character in the 1417 * specified range of this set. 1418 * @see #getRangeCount 1419 * @see #getRangeEnd 1420 */ 1421 UChar32 UnicodeSet::getRangeStart(int32_t index) const { 1422 return list[index*2]; 1423 } 1424 1425 /** 1426 * Iteration method that returns the last character in the 1427 * specified range of this set. 1428 * @see #getRangeStart 1429 * @see #getRangeEnd 1430 */ 1431 UChar32 UnicodeSet::getRangeEnd(int32_t index) const { 1432 return list[index*2 + 1] - 1; 1433 } 1434 1435 int32_t UnicodeSet::getStringCount() const { 1436 return strings->size(); 1437 } 1438 1439 const UnicodeString* UnicodeSet::getString(int32_t index) const { 1440 return (const UnicodeString*) strings->elementAt(index); 1441 } 1442 1443 /** 1444 * Reallocate this objects internal structures to take up the least 1445 * possible space, without changing this object's value. 1446 */ 1447 UnicodeSet& UnicodeSet::compact() { 1448 if (isFrozen() || isBogus()) { 1449 return *this; 1450 } 1451 // Delete buffer first to defragment memory less. 1452 if (buffer != NULL) { 1453 uprv_free(buffer); 1454 buffer = NULL; 1455 } 1456 if (len < capacity) { 1457 // Make the capacity equal to len or 1. 1458 // We don't want to realloc of 0 size. 1459 int32_t newCapacity = len + (len == 0); 1460 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); 1461 if (temp) { 1462 list = temp; 1463 capacity = newCapacity; 1464 } 1465 // else what the heck happened?! We allocated less memory! 1466 // Oh well. We'll keep our original array. 1467 } 1468 return *this; 1469 } 1470 1471 #ifdef DEBUG_SERIALIZE 1472 #include <stdio.h> 1473 #endif 1474 1475 /** 1476 * Deserialize constructor. 1477 */ 1478 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) 1479 : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), 1480 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1481 fFlags(0) { 1482 1483 if(U_FAILURE(ec)) { 1484 setToBogus(); 1485 return; 1486 } 1487 1488 if( (serialization != kSerialized) 1489 || (data==NULL) 1490 || (dataLen < 1)) { 1491 ec = U_ILLEGAL_ARGUMENT_ERROR; 1492 setToBogus(); 1493 return; 1494 } 1495 1496 allocateStrings(ec); 1497 if (U_FAILURE(ec)) { 1498 setToBogus(); 1499 return; 1500 } 1501 1502 // bmp? 1503 int32_t headerSize = ((data[0]&0x8000)) ?2:1; 1504 int32_t bmpLength = (headerSize==1)?data[0]:data[1]; 1505 1506 len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; 1507 #ifdef DEBUG_SERIALIZE 1508 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); 1509 #endif 1510 capacity = len+1; 1511 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1512 if(!list || U_FAILURE(ec)) { 1513 setToBogus(); 1514 return; 1515 } 1516 // copy bmp 1517 int32_t i; 1518 for(i = 0; i< bmpLength;i++) { 1519 list[i] = data[i+headerSize]; 1520 #ifdef DEBUG_SERIALIZE 1521 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); 1522 #endif 1523 } 1524 // copy smp 1525 for(i=bmpLength;i<len;i++) { 1526 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) + 1527 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]); 1528 #ifdef DEBUG_SERIALIZE 1529 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]); 1530 #endif 1531 } 1532 // terminator 1533 list[len++]=UNICODESET_HIGH; 1534 } 1535 1536 1537 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { 1538 int32_t bmpLength, length, destLength; 1539 1540 if (U_FAILURE(ec)) { 1541 return 0; 1542 } 1543 1544 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1545 ec=U_ILLEGAL_ARGUMENT_ERROR; 1546 return 0; 1547 } 1548 1549 /* count necessary 16-bit units */ 1550 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH 1551 // assert(length>=0); 1552 if (length==0) { 1553 /* empty set */ 1554 if (destCapacity>0) { 1555 *dest=0; 1556 } else { 1557 ec=U_BUFFER_OVERFLOW_ERROR; 1558 } 1559 return 1; 1560 } 1561 /* now length>0 */ 1562 1563 if (this->list[length-1]<=0xffff) { 1564 /* all BMP */ 1565 bmpLength=length; 1566 } else if (this->list[0]>=0x10000) { 1567 /* all supplementary */ 1568 bmpLength=0; 1569 length*=2; 1570 } else { 1571 /* some BMP, some supplementary */ 1572 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} 1573 length=bmpLength+2*(length-bmpLength); 1574 } 1575 #ifdef DEBUG_SERIALIZE 1576 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); 1577 #endif 1578 /* length: number of 16-bit array units */ 1579 if (length>0x7fff) { 1580 /* there are only 15 bits for the length in the first serialized word */ 1581 ec=U_INDEX_OUTOFBOUNDS_ERROR; 1582 return 0; 1583 } 1584 1585 /* 1586 * total serialized length: 1587 * number of 16-bit array units (length) + 1588 * 1 length unit (always) + 1589 * 1 bmpLength unit (if there are supplementary values) 1590 */ 1591 destLength=length+((length>bmpLength)?2:1); 1592 if (destLength<=destCapacity) { 1593 const UChar32 *p; 1594 int32_t i; 1595 1596 #ifdef DEBUG_SERIALIZE 1597 printf("writeHdr\n"); 1598 #endif 1599 *dest=(uint16_t)length; 1600 if (length>bmpLength) { 1601 *dest|=0x8000; 1602 *++dest=(uint16_t)bmpLength; 1603 } 1604 ++dest; 1605 1606 /* write the BMP part of the array */ 1607 p=this->list; 1608 for (i=0; i<bmpLength; ++i) { 1609 #ifdef DEBUG_SERIALIZE 1610 printf("writebmp: %x\n", (int)*p); 1611 #endif 1612 *dest++=(uint16_t)*p++; 1613 } 1614 1615 /* write the supplementary part of the array */ 1616 for (; i<length; i+=2) { 1617 #ifdef DEBUG_SERIALIZE 1618 printf("write32: %x\n", (int)*p); 1619 #endif 1620 *dest++=(uint16_t)(*p>>16); 1621 *dest++=(uint16_t)*p++; 1622 } 1623 } else { 1624 ec=U_BUFFER_OVERFLOW_ERROR; 1625 } 1626 return destLength; 1627 } 1628 1629 //---------------------------------------------------------------- 1630 // Implementation: Utility methods 1631 //---------------------------------------------------------------- 1632 1633 /** 1634 * Allocate our strings vector and return TRUE if successful. 1635 */ 1636 UBool UnicodeSet::allocateStrings(UErrorCode &status) { 1637 if (U_FAILURE(status)) { 1638 return FALSE; 1639 } 1640 strings = new UVector(uprv_deleteUObject, 1641 uhash_compareUnicodeString, 1, status); 1642 if (strings == NULL) { // Check for memory allocation error. 1643 status = U_MEMORY_ALLOCATION_ERROR; 1644 return FALSE; 1645 } 1646 if (U_FAILURE(status)) { 1647 delete strings; 1648 strings = NULL; 1649 return FALSE; 1650 } 1651 return TRUE; 1652 } 1653 1654 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { 1655 if (newLen <= capacity) 1656 return; 1657 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1658 if (temp == NULL) { 1659 ec = U_MEMORY_ALLOCATION_ERROR; 1660 setToBogus(); 1661 return; 1662 } 1663 list = temp; 1664 capacity = newLen + GROW_EXTRA; 1665 // else we keep the original contents on the memory failure. 1666 } 1667 1668 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { 1669 if (buffer != NULL && newLen <= bufferCapacity) 1670 return; 1671 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1672 if (temp == NULL) { 1673 ec = U_MEMORY_ALLOCATION_ERROR; 1674 setToBogus(); 1675 return; 1676 } 1677 buffer = temp; 1678 bufferCapacity = newLen + GROW_EXTRA; 1679 // else we keep the original contents on the memory failure. 1680 } 1681 1682 /** 1683 * Swap list and buffer. 1684 */ 1685 void UnicodeSet::swapBuffers(void) { 1686 // swap list and buffer 1687 UChar32* temp = list; 1688 list = buffer; 1689 buffer = temp; 1690 1691 int32_t c = capacity; 1692 capacity = bufferCapacity; 1693 bufferCapacity = c; 1694 } 1695 1696 void UnicodeSet::setToBogus() { 1697 clear(); // Remove everything in the set. 1698 fFlags = kIsBogus; 1699 } 1700 1701 //---------------------------------------------------------------- 1702 // Implementation: Fundamental operators 1703 //---------------------------------------------------------------- 1704 1705 static inline UChar32 max(UChar32 a, UChar32 b) { 1706 return (a > b) ? a : b; 1707 } 1708 1709 // polarity = 0, 3 is normal: x xor y 1710 // polarity = 1, 2: x xor ~y == x === y 1711 1712 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { 1713 if (isFrozen() || isBogus()) { 1714 return; 1715 } 1716 UErrorCode status = U_ZERO_ERROR; 1717 ensureBufferCapacity(len + otherLen, status); 1718 if (U_FAILURE(status)) { 1719 return; 1720 } 1721 1722 int32_t i = 0, j = 0, k = 0; 1723 UChar32 a = list[i++]; 1724 UChar32 b; 1725 if (polarity == 1 || polarity == 2) { 1726 b = UNICODESET_LOW; 1727 if (other[j] == UNICODESET_LOW) { // skip base if already LOW 1728 ++j; 1729 b = other[j]; 1730 } 1731 } else { 1732 b = other[j++]; 1733 } 1734 // simplest of all the routines 1735 // sort the values, discarding identicals! 1736 for (;;) { 1737 if (a < b) { 1738 buffer[k++] = a; 1739 a = list[i++]; 1740 } else if (b < a) { 1741 buffer[k++] = b; 1742 b = other[j++]; 1743 } else if (a != UNICODESET_HIGH) { // at this point, a == b 1744 // discard both values! 1745 a = list[i++]; 1746 b = other[j++]; 1747 } else { // DONE! 1748 buffer[k++] = UNICODESET_HIGH; 1749 len = k; 1750 break; 1751 } 1752 } 1753 swapBuffers(); 1754 releasePattern(); 1755 } 1756 1757 // polarity = 0 is normal: x union y 1758 // polarity = 2: x union ~y 1759 // polarity = 1: ~x union y 1760 // polarity = 3: ~x union ~y 1761 1762 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { 1763 if (isFrozen() || isBogus() || other==NULL) { 1764 return; 1765 } 1766 UErrorCode status = U_ZERO_ERROR; 1767 ensureBufferCapacity(len + otherLen, status); 1768 if (U_FAILURE(status)) { 1769 return; 1770 } 1771 1772 int32_t i = 0, j = 0, k = 0; 1773 UChar32 a = list[i++]; 1774 UChar32 b = other[j++]; 1775 // change from xor is that we have to check overlapping pairs 1776 // polarity bit 1 means a is second, bit 2 means b is. 1777 for (;;) { 1778 switch (polarity) { 1779 case 0: // both first; take lower if unequal 1780 if (a < b) { // take a 1781 // Back up over overlapping ranges in buffer[] 1782 if (k > 0 && a <= buffer[k-1]) { 1783 // Pick latter end value in buffer[] vs. list[] 1784 a = max(list[i], buffer[--k]); 1785 } else { 1786 // No overlap 1787 buffer[k++] = a; 1788 a = list[i]; 1789 } 1790 i++; // Common if/else code factored out 1791 polarity ^= 1; 1792 } else if (b < a) { // take b 1793 if (k > 0 && b <= buffer[k-1]) { 1794 b = max(other[j], buffer[--k]); 1795 } else { 1796 buffer[k++] = b; 1797 b = other[j]; 1798 } 1799 j++; 1800 polarity ^= 2; 1801 } else { // a == b, take a, drop b 1802 if (a == UNICODESET_HIGH) goto loop_end; 1803 // This is symmetrical; it doesn't matter if 1804 // we backtrack with a or b. - liu 1805 if (k > 0 && a <= buffer[k-1]) { 1806 a = max(list[i], buffer[--k]); 1807 } else { 1808 // No overlap 1809 buffer[k++] = a; 1810 a = list[i]; 1811 } 1812 i++; 1813 polarity ^= 1; 1814 b = other[j++]; 1815 polarity ^= 2; 1816 } 1817 break; 1818 case 3: // both second; take higher if unequal, and drop other 1819 if (b <= a) { // take a 1820 if (a == UNICODESET_HIGH) goto loop_end; 1821 buffer[k++] = a; 1822 } else { // take b 1823 if (b == UNICODESET_HIGH) goto loop_end; 1824 buffer[k++] = b; 1825 } 1826 a = list[i++]; 1827 polarity ^= 1; // factored common code 1828 b = other[j++]; 1829 polarity ^= 2; 1830 break; 1831 case 1: // a second, b first; if b < a, overlap 1832 if (a < b) { // no overlap, take a 1833 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1834 } else if (b < a) { // OVERLAP, drop b 1835 b = other[j++]; 1836 polarity ^= 2; 1837 } else { // a == b, drop both! 1838 if (a == UNICODESET_HIGH) goto loop_end; 1839 a = list[i++]; 1840 polarity ^= 1; 1841 b = other[j++]; 1842 polarity ^= 2; 1843 } 1844 break; 1845 case 2: // a first, b second; if a < b, overlap 1846 if (b < a) { // no overlap, take b 1847 buffer[k++] = b; 1848 b = other[j++]; 1849 polarity ^= 2; 1850 } else if (a < b) { // OVERLAP, drop a 1851 a = list[i++]; 1852 polarity ^= 1; 1853 } else { // a == b, drop both! 1854 if (a == UNICODESET_HIGH) goto loop_end; 1855 a = list[i++]; 1856 polarity ^= 1; 1857 b = other[j++]; 1858 polarity ^= 2; 1859 } 1860 break; 1861 } 1862 } 1863 loop_end: 1864 buffer[k++] = UNICODESET_HIGH; // terminate 1865 len = k; 1866 swapBuffers(); 1867 releasePattern(); 1868 } 1869 1870 // polarity = 0 is normal: x intersect y 1871 // polarity = 2: x intersect ~y == set-minus 1872 // polarity = 1: ~x intersect y 1873 // polarity = 3: ~x intersect ~y 1874 1875 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { 1876 if (isFrozen() || isBogus()) { 1877 return; 1878 } 1879 UErrorCode status = U_ZERO_ERROR; 1880 ensureBufferCapacity(len + otherLen, status); 1881 if (U_FAILURE(status)) { 1882 return; 1883 } 1884 1885 int32_t i = 0, j = 0, k = 0; 1886 UChar32 a = list[i++]; 1887 UChar32 b = other[j++]; 1888 // change from xor is that we have to check overlapping pairs 1889 // polarity bit 1 means a is second, bit 2 means b is. 1890 for (;;) { 1891 switch (polarity) { 1892 case 0: // both first; drop the smaller 1893 if (a < b) { // drop a 1894 a = list[i++]; 1895 polarity ^= 1; 1896 } else if (b < a) { // drop b 1897 b = other[j++]; 1898 polarity ^= 2; 1899 } else { // a == b, take one, drop other 1900 if (a == UNICODESET_HIGH) goto loop_end; 1901 buffer[k++] = a; 1902 a = list[i++]; 1903 polarity ^= 1; 1904 b = other[j++]; 1905 polarity ^= 2; 1906 } 1907 break; 1908 case 3: // both second; take lower if unequal 1909 if (a < b) { // take a 1910 buffer[k++] = a; 1911 a = list[i++]; 1912 polarity ^= 1; 1913 } else if (b < a) { // take b 1914 buffer[k++] = b; 1915 b = other[j++]; 1916 polarity ^= 2; 1917 } else { // a == b, take one, drop other 1918 if (a == UNICODESET_HIGH) goto loop_end; 1919 buffer[k++] = a; 1920 a = list[i++]; 1921 polarity ^= 1; 1922 b = other[j++]; 1923 polarity ^= 2; 1924 } 1925 break; 1926 case 1: // a second, b first; 1927 if (a < b) { // NO OVERLAP, drop a 1928 a = list[i++]; 1929 polarity ^= 1; 1930 } else if (b < a) { // OVERLAP, take b 1931 buffer[k++] = b; 1932 b = other[j++]; 1933 polarity ^= 2; 1934 } else { // a == b, drop both! 1935 if (a == UNICODESET_HIGH) goto loop_end; 1936 a = list[i++]; 1937 polarity ^= 1; 1938 b = other[j++]; 1939 polarity ^= 2; 1940 } 1941 break; 1942 case 2: // a first, b second; if a < b, overlap 1943 if (b < a) { // no overlap, drop b 1944 b = other[j++]; 1945 polarity ^= 2; 1946 } else if (a < b) { // OVERLAP, take a 1947 buffer[k++] = a; 1948 a = list[i++]; 1949 polarity ^= 1; 1950 } else { // a == b, drop both! 1951 if (a == UNICODESET_HIGH) goto loop_end; 1952 a = list[i++]; 1953 polarity ^= 1; 1954 b = other[j++]; 1955 polarity ^= 2; 1956 } 1957 break; 1958 } 1959 } 1960 loop_end: 1961 buffer[k++] = UNICODESET_HIGH; // terminate 1962 len = k; 1963 swapBuffers(); 1964 releasePattern(); 1965 } 1966 1967 /** 1968 * Append the <code>toPattern()</code> representation of a 1969 * string to the given <code>StringBuffer</code>. 1970 */ 1971 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool 1972 escapeUnprintable) { 1973 UChar32 cp; 1974 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1975 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); 1976 } 1977 } 1978 1979 /** 1980 * Append the <code>toPattern()</code> representation of a 1981 * character to the given <code>StringBuffer</code>. 1982 */ 1983 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool 1984 escapeUnprintable) { 1985 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1986 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything 1987 // unprintable 1988 if (ICU_Utility::escapeUnprintable(buf, c)) { 1989 return; 1990 } 1991 } 1992 // Okay to let ':' pass through 1993 switch (c) { 1994 case SET_OPEN: 1995 case SET_CLOSE: 1996 case HYPHEN: 1997 case COMPLEMENT: 1998 case INTERSECTION: 1999 case BACKSLASH: 2000 case OPEN_BRACE: 2001 case CLOSE_BRACE: 2002 case COLON: 2003 case SymbolTable::SYMBOL_REF: 2004 buf.append(BACKSLASH); 2005 break; 2006 default: 2007 // Escape whitespace 2008 if (PatternProps::isWhiteSpace(c)) { 2009 buf.append(BACKSLASH); 2010 } 2011 break; 2012 } 2013 buf.append(c); 2014 } 2015 2016 /** 2017 * Append a string representation of this set to result. This will be 2018 * a cleaned version of the string passed to applyPattern(), if there 2019 * is one. Otherwise it will be generated. 2020 */ 2021 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, 2022 UBool escapeUnprintable) const 2023 { 2024 if (pat != NULL) { 2025 int32_t i; 2026 int32_t backslashCount = 0; 2027 for (i=0; i<patLen; ) { 2028 UChar32 c; 2029 U16_NEXT(pat, i, patLen, c); 2030 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 2031 // If the unprintable character is preceded by an odd 2032 // number of backslashes, then it has been escaped. 2033 // Before unescaping it, we delete the final 2034 // backslash. 2035 if ((backslashCount % 2) == 1) { 2036 result.truncate(result.length() - 1); 2037 } 2038 ICU_Utility::escapeUnprintable(result, c); 2039 backslashCount = 0; 2040 } else { 2041 result.append(c); 2042 if (c == BACKSLASH) { 2043 ++backslashCount; 2044 } else { 2045 backslashCount = 0; 2046 } 2047 } 2048 } 2049 return result; 2050 } 2051 2052 return _generatePattern(result, escapeUnprintable); 2053 } 2054 2055 /** 2056 * Returns a string representation of this set. If the result of 2057 * calling this function is passed to a UnicodeSet constructor, it 2058 * will produce another set that is equal to this one. 2059 */ 2060 UnicodeString& UnicodeSet::toPattern(UnicodeString& result, 2061 UBool escapeUnprintable) const 2062 { 2063 result.truncate(0); 2064 return _toPattern(result, escapeUnprintable); 2065 } 2066 2067 /** 2068 * Generate and append a string representation of this set to result. 2069 * This does not use this.pat, the cleaned up copy of the string 2070 * passed to applyPattern(). 2071 */ 2072 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, 2073 UBool escapeUnprintable) const 2074 { 2075 result.append(SET_OPEN); 2076 2077 // // Check against the predefined categories. We implicitly build 2078 // // up ALL category sets the first time toPattern() is called. 2079 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { 2080 // if (*this == getCategorySet(cat)) { 2081 // result.append(COLON); 2082 // result.append(CATEGORY_NAMES, cat*2, 2); 2083 // return result.append(CATEGORY_CLOSE); 2084 // } 2085 // } 2086 2087 int32_t count = getRangeCount(); 2088 2089 // If the set contains at least 2 intervals and includes both 2090 // MIN_VALUE and MAX_VALUE, then the inverse representation will 2091 // be more economical. 2092 if (count > 1 && 2093 getRangeStart(0) == MIN_VALUE && 2094 getRangeEnd(count-1) == MAX_VALUE) { 2095 2096 // Emit the inverse 2097 result.append(COMPLEMENT); 2098 2099 for (int32_t i = 1; i < count; ++i) { 2100 UChar32 start = getRangeEnd(i-1)+1; 2101 UChar32 end = getRangeStart(i)-1; 2102 _appendToPat(result, start, escapeUnprintable); 2103 if (start != end) { 2104 if ((start+1) != end) { 2105 result.append(HYPHEN); 2106 } 2107 _appendToPat(result, end, escapeUnprintable); 2108 } 2109 } 2110 } 2111 2112 // Default; emit the ranges as pairs 2113 else { 2114 for (int32_t i = 0; i < count; ++i) { 2115 UChar32 start = getRangeStart(i); 2116 UChar32 end = getRangeEnd(i); 2117 _appendToPat(result, start, escapeUnprintable); 2118 if (start != end) { 2119 if ((start+1) != end) { 2120 result.append(HYPHEN); 2121 } 2122 _appendToPat(result, end, escapeUnprintable); 2123 } 2124 } 2125 } 2126 2127 for (int32_t i = 0; i<strings->size(); ++i) { 2128 result.append(OPEN_BRACE); 2129 _appendToPat(result, 2130 *(const UnicodeString*) strings->elementAt(i), 2131 escapeUnprintable); 2132 result.append(CLOSE_BRACE); 2133 } 2134 return result.append(SET_CLOSE); 2135 } 2136 2137 /** 2138 * Release existing cached pattern 2139 */ 2140 void UnicodeSet::releasePattern() { 2141 if (pat) { 2142 uprv_free(pat); 2143 pat = NULL; 2144 patLen = 0; 2145 } 2146 } 2147 2148 /** 2149 * Set the new pattern to cache. 2150 */ 2151 void UnicodeSet::setPattern(const UnicodeString& newPat) { 2152 releasePattern(); 2153 int32_t newPatLen = newPat.length(); 2154 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); 2155 if (pat) { 2156 patLen = newPatLen; 2157 newPat.extractBetween(0, patLen, pat); 2158 pat[patLen] = 0; 2159 } 2160 // else we don't care if malloc failed. This was just a nice cache. 2161 // We can regenerate an equivalent pattern later when requested. 2162 } 2163 2164 UnicodeFunctor *UnicodeSet::freeze() { 2165 if(!isFrozen() && !isBogus()) { 2166 // Do most of what compact() does before freezing because 2167 // compact() will not work when the set is frozen. 2168 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). 2169 2170 // Delete buffer first to defragment memory less. 2171 if (buffer != NULL) { 2172 uprv_free(buffer); 2173 buffer = NULL; 2174 } 2175 if (capacity > (len + GROW_EXTRA)) { 2176 // Make the capacity equal to len or 1. 2177 // We don't want to realloc of 0 size. 2178 capacity = len + (len == 0); 2179 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); 2180 if (list == NULL) { // Check for memory allocation error. 2181 setToBogus(); 2182 return this; 2183 } 2184 } 2185 2186 // Optimize contains() and span() and similar functions. 2187 if (!strings->isEmpty()) { 2188 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); 2189 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { 2190 // All strings are irrelevant for span() etc. because 2191 // all of each string's code points are contained in this set. 2192 // Do not check needsStringSpanUTF8() because UTF-8 has at most as 2193 // many relevant strings as UTF-16. 2194 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) 2195 delete stringSpan; 2196 stringSpan = NULL; 2197 } 2198 } 2199 if (stringSpan == NULL) { 2200 // No span-relevant strings: Optimize for code point spans. 2201 bmpSet=new BMPSet(list, len); 2202 if (bmpSet == NULL) { // Check for memory allocation error. 2203 setToBogus(); 2204 } 2205 } 2206 } 2207 return this; 2208 } 2209 2210 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 2211 if(length>0 && bmpSet!=NULL) { 2212 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s); 2213 } 2214 if(length<0) { 2215 length=u_strlen(s); 2216 } 2217 if(length==0) { 2218 return 0; 2219 } 2220 if(stringSpan!=NULL) { 2221 return stringSpan->span(s, length, spanCondition); 2222 } else if(!strings->isEmpty()) { 2223 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2224 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : 2225 UnicodeSetStringSpan::FWD_UTF16_CONTAINED; 2226 UnicodeSetStringSpan strSpan(*this, *strings, which); 2227 if(strSpan.needsStringSpanUTF16()) { 2228 return strSpan.span(s, length, spanCondition); 2229 } 2230 } 2231 2232 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2233 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2234 } 2235 2236 UChar32 c; 2237 int32_t start=0, prev=0; 2238 do { 2239 U16_NEXT(s, start, length, c); 2240 if(spanCondition!=contains(c)) { 2241 break; 2242 } 2243 } while((prev=start)<length); 2244 return prev; 2245 } 2246 2247 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 2248 if(length>0 && bmpSet!=NULL) { 2249 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s); 2250 } 2251 if(length<0) { 2252 length=u_strlen(s); 2253 } 2254 if(length==0) { 2255 return 0; 2256 } 2257 if(stringSpan!=NULL) { 2258 return stringSpan->spanBack(s, length, spanCondition); 2259 } else if(!strings->isEmpty()) { 2260 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2261 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : 2262 UnicodeSetStringSpan::BACK_UTF16_CONTAINED; 2263 UnicodeSetStringSpan strSpan(*this, *strings, which); 2264 if(strSpan.needsStringSpanUTF16()) { 2265 return strSpan.spanBack(s, length, spanCondition); 2266 } 2267 } 2268 2269 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2270 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2271 } 2272 2273 UChar32 c; 2274 int32_t prev=length; 2275 do { 2276 U16_PREV(s, 0, length, c); 2277 if(spanCondition!=contains(c)) { 2278 break; 2279 } 2280 } while((prev=length)>0); 2281 return prev; 2282 } 2283 2284 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2285 if(length>0 && bmpSet!=NULL) { 2286 const uint8_t *s0=(const uint8_t *)s; 2287 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0); 2288 } 2289 if(length<0) { 2290 length=(int32_t)uprv_strlen(s); 2291 } 2292 if(length==0) { 2293 return 0; 2294 } 2295 if(stringSpan!=NULL) { 2296 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); 2297 } else if(!strings->isEmpty()) { 2298 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2299 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : 2300 UnicodeSetStringSpan::FWD_UTF8_CONTAINED; 2301 UnicodeSetStringSpan strSpan(*this, *strings, which); 2302 if(strSpan.needsStringSpanUTF8()) { 2303 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); 2304 } 2305 } 2306 2307 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2308 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2309 } 2310 2311 UChar32 c; 2312 int32_t start=0, prev=0; 2313 do { 2314 U8_NEXT_OR_FFFD(s, start, length, c); 2315 if(spanCondition!=contains(c)) { 2316 break; 2317 } 2318 } while((prev=start)<length); 2319 return prev; 2320 } 2321 2322 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2323 if(length>0 && bmpSet!=NULL) { 2324 const uint8_t *s0=(const uint8_t *)s; 2325 return bmpSet->spanBackUTF8(s0, length, spanCondition); 2326 } 2327 if(length<0) { 2328 length=(int32_t)uprv_strlen(s); 2329 } 2330 if(length==0) { 2331 return 0; 2332 } 2333 if(stringSpan!=NULL) { 2334 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); 2335 } else if(!strings->isEmpty()) { 2336 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2337 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : 2338 UnicodeSetStringSpan::BACK_UTF8_CONTAINED; 2339 UnicodeSetStringSpan strSpan(*this, *strings, which); 2340 if(strSpan.needsStringSpanUTF8()) { 2341 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); 2342 } 2343 } 2344 2345 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2346 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2347 } 2348 2349 UChar32 c; 2350 int32_t prev=length; 2351 do { 2352 U8_PREV_OR_FFFD(s, 0, length, c); 2353 if(spanCondition!=contains(c)) { 2354 break; 2355 } 2356 } while((prev=length)>0); 2357 return prev; 2358 } 2359 2360 U_NAMESPACE_END 2361