1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 10/20/99 alan Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 #include "unicode/parsepos.h" 13 #include "unicode/symtable.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utf8.h" 16 #include "unicode/utf16.h" 17 #include "ruleiter.h" 18 #include "cmemory.h" 19 #include "cstring.h" 20 #include "patternprops.h" 21 #include "uelement.h" 22 #include "util.h" 23 #include "uvector.h" 24 #include "charstr.h" 25 #include "ustrfmt.h" 26 #include "uassert.h" 27 #include "bmpset.h" 28 #include "unisetspan.h" 29 30 // Define UChar constants using hex for EBCDIC compatibility 31 // Used #define to reduce private static exports and memory access time. 32 #define SET_OPEN ((UChar)0x005B) /*[*/ 33 #define SET_CLOSE ((UChar)0x005D) /*]*/ 34 #define HYPHEN ((UChar)0x002D) /*-*/ 35 #define COMPLEMENT ((UChar)0x005E) /*^*/ 36 #define COLON ((UChar)0x003A) /*:*/ 37 #define BACKSLASH ((UChar)0x005C) /*\*/ 38 #define INTERSECTION ((UChar)0x0026) /*&*/ 39 #define UPPER_U ((UChar)0x0055) /*U*/ 40 #define LOWER_U ((UChar)0x0075) /*u*/ 41 #define OPEN_BRACE ((UChar)123) /*{*/ 42 #define CLOSE_BRACE ((UChar)125) /*}*/ 43 #define UPPER_P ((UChar)0x0050) /*P*/ 44 #define LOWER_P ((UChar)0x0070) /*p*/ 45 #define UPPER_N ((UChar)78) /*N*/ 46 #define EQUALS ((UChar)0x003D) /*=*/ 47 48 // HIGH_VALUE > all valid values. 110000 for codepoints 49 #define UNICODESET_HIGH 0x0110000 50 51 // LOW <= all valid values. ZERO for codepoints 52 #define UNICODESET_LOW 0x000000 53 54 // initial storage. Must be >= 0 55 #define START_EXTRA 16 56 57 // extra amount for growth. Must be >= 0 58 #define GROW_EXTRA START_EXTRA 59 60 U_NAMESPACE_BEGIN 61 62 SymbolTable::~SymbolTable() {} 63 64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) 65 66 /** 67 * Modify the given UChar32 variable so that it is in range, by 68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and 69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. 70 * It modifies its argument in-place and also returns it. 71 */ 72 static inline UChar32 pinCodePoint(UChar32& c) { 73 if (c < UNICODESET_LOW) { 74 c = UNICODESET_LOW; 75 } else if (c > (UNICODESET_HIGH-1)) { 76 c = (UNICODESET_HIGH-1); 77 } 78 return c; 79 } 80 81 //---------------------------------------------------------------- 82 // Debugging 83 //---------------------------------------------------------------- 84 85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks. 86 // To enable the debugging, define the symbol DEBUG_MEM in the line 87 // below. This will result in text being sent to stdout that looks 88 // like this: 89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- 90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- 91 // Each line lists a construction (ct) or destruction (dt) event, the 92 // object address, the number of outstanding objects after the event, 93 // and the pattern of the object in question. 94 95 // #define DEBUG_MEM 96 97 #ifdef DEBUG_MEM 98 #include <stdio.h> 99 static int32_t _dbgCount = 0; 100 101 static inline void _dbgct(UnicodeSet* set) { 102 UnicodeString str; 103 set->toPattern(str, TRUE); 104 char buf[40]; 105 str.extract(0, 39, buf, ""); 106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); 107 } 108 109 static inline void _dbgdt(UnicodeSet* set) { 110 UnicodeString str; 111 set->toPattern(str, TRUE); 112 char buf[40]; 113 str.extract(0, 39, buf, ""); 114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); 115 } 116 117 #else 118 119 #define _dbgct(set) 120 #define _dbgdt(set) 121 122 #endif 123 124 //---------------------------------------------------------------- 125 // UnicodeString in UVector support 126 //---------------------------------------------------------------- 127 128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { 129 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); 130 } 131 132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 133 const UnicodeString &a = *(const UnicodeString*)t1.pointer; 134 const UnicodeString &b = *(const UnicodeString*)t2.pointer; 135 return a.compare(b); 136 } 137 138 //---------------------------------------------------------------- 139 // Constructors &c 140 //---------------------------------------------------------------- 141 142 /** 143 * Constructs an empty set. 144 */ 145 UnicodeSet::UnicodeSet() : 146 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 147 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 148 fFlags(0) 149 { 150 UErrorCode status = U_ZERO_ERROR; 151 allocateStrings(status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 156 if(list!=NULL){ 157 list[0] = UNICODESET_HIGH; 158 } else { // If memory allocation failed, set to bogus state. 159 setToBogus(); 160 return; 161 } 162 _dbgct(this); 163 } 164 165 /** 166 * Constructs a set containing the given range. If <code>end > 167 * start</code> then an empty set is created. 168 * 169 * @param start first character, inclusive, of range 170 * @param end last character, inclusive, of range 171 */ 172 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : 173 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 174 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 175 fFlags(0) 176 { 177 UErrorCode status = U_ZERO_ERROR; 178 allocateStrings(status); 179 if (U_FAILURE(status)) { 180 return; 181 } 182 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 183 if(list!=NULL){ 184 list[0] = UNICODESET_HIGH; 185 complement(start, end); 186 } else { // If memory allocation failed, set to bogus state. 187 setToBogus(); 188 return; 189 } 190 _dbgct(this); 191 } 192 193 /** 194 * Constructs a set that is identical to the given UnicodeSet. 195 */ 196 UnicodeSet::UnicodeSet(const UnicodeSet& o) : 197 UnicodeFilter(o), 198 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), 199 bmpSet(0), 200 buffer(0), bufferCapacity(0), 201 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 202 fFlags(0) 203 { 204 UErrorCode status = U_ZERO_ERROR; 205 allocateStrings(status); 206 if (U_FAILURE(status)) { 207 return; 208 } 209 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 210 if(list!=NULL){ 211 *this = o; 212 } else { // If memory allocation failed, set to bogus state. 213 setToBogus(); 214 return; 215 } 216 _dbgct(this); 217 } 218 219 // Copy-construct as thawed. 220 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : 221 UnicodeFilter(o), 222 len(0), capacity(o.len + GROW_EXTRA), list(0), 223 bmpSet(0), 224 buffer(0), bufferCapacity(0), 225 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 226 fFlags(0) 227 { 228 UErrorCode status = U_ZERO_ERROR; 229 allocateStrings(status); 230 if (U_FAILURE(status)) { 231 return; 232 } 233 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 234 if(list!=NULL){ 235 // *this = o except for bmpSet and stringSpan 236 len = o.len; 237 uprv_memcpy(list, o.list, len*sizeof(UChar32)); 238 if (strings != NULL && o.strings != NULL) { 239 strings->assign(*o.strings, cloneUnicodeString, status); 240 } else { // Invalid strings. 241 setToBogus(); 242 return; 243 } 244 if (o.pat) { 245 setPattern(UnicodeString(o.pat, o.patLen)); 246 } 247 } else { // If memory allocation failed, set to bogus state. 248 setToBogus(); 249 return; 250 } 251 _dbgct(this); 252 } 253 254 /** 255 * Destructs the set. 256 */ 257 UnicodeSet::~UnicodeSet() { 258 _dbgdt(this); // first! 259 uprv_free(list); 260 delete bmpSet; 261 if (buffer) { 262 uprv_free(buffer); 263 } 264 delete strings; 265 delete stringSpan; 266 releasePattern(); 267 } 268 269 /** 270 * Assigns this object to be a copy of another. 271 */ 272 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { 273 if (this == &o) { 274 return *this; 275 } 276 if (isFrozen()) { 277 return *this; 278 } 279 if (o.isBogus()) { 280 setToBogus(); 281 return *this; 282 } 283 UErrorCode ec = U_ZERO_ERROR; 284 ensureCapacity(o.len, ec); 285 if (U_FAILURE(ec)) { 286 return *this; // There is no way to report this error :-( 287 } 288 len = o.len; 289 uprv_memcpy(list, o.list, len*sizeof(UChar32)); 290 if (o.bmpSet == NULL) { 291 bmpSet = NULL; 292 } else { 293 bmpSet = new BMPSet(*o.bmpSet, list, len); 294 if (bmpSet == NULL) { // Check for memory allocation error. 295 setToBogus(); 296 return *this; 297 } 298 } 299 if (strings != NULL && o.strings != NULL) { 300 strings->assign(*o.strings, cloneUnicodeString, ec); 301 } else { // Invalid strings. 302 setToBogus(); 303 return *this; 304 } 305 if (o.stringSpan == NULL) { 306 stringSpan = NULL; 307 } else { 308 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); 309 if (stringSpan == NULL) { // Check for memory allocation error. 310 setToBogus(); 311 return *this; 312 } 313 } 314 releasePattern(); 315 if (o.pat) { 316 setPattern(UnicodeString(o.pat, o.patLen)); 317 } 318 return *this; 319 } 320 321 /** 322 * Returns a copy of this object. All UnicodeMatcher objects have 323 * to support cloning in order to allow classes using 324 * UnicodeMatchers, such as Transliterator, to implement cloning. 325 */ 326 UnicodeFunctor* UnicodeSet::clone() const { 327 return new UnicodeSet(*this); 328 } 329 330 UnicodeFunctor *UnicodeSet::cloneAsThawed() const { 331 return new UnicodeSet(*this, TRUE); 332 } 333 334 /** 335 * Compares the specified object with this set for equality. Returns 336 * <tt>true</tt> if the two sets 337 * have the same size, and every member of the specified set is 338 * contained in this set (or equivalently, every member of this set is 339 * contained in the specified set). 340 * 341 * @param o set to be compared for equality with this set. 342 * @return <tt>true</tt> if the specified set is equal to this set. 343 */ 344 UBool UnicodeSet::operator==(const UnicodeSet& o) const { 345 if (len != o.len) return FALSE; 346 for (int32_t i = 0; i < len; ++i) { 347 if (list[i] != o.list[i]) return FALSE; 348 } 349 if (*strings != *o.strings) return FALSE; 350 return TRUE; 351 } 352 353 /** 354 * Returns the hash code value for this set. 355 * 356 * @return the hash code value for this set. 357 * @see Object#hashCode() 358 */ 359 int32_t UnicodeSet::hashCode(void) const { 360 int32_t result = len; 361 for (int32_t i = 0; i < len; ++i) { 362 result *= 1000003; 363 result += list[i]; 364 } 365 return result; 366 } 367 368 //---------------------------------------------------------------- 369 // Public API 370 //---------------------------------------------------------------- 371 372 /** 373 * Returns the number of elements in this set (its cardinality), 374 * Note than the elements of a set may include both individual 375 * codepoints and strings. 376 * 377 * @return the number of elements in this set (its cardinality). 378 */ 379 int32_t UnicodeSet::size(void) const { 380 int32_t n = 0; 381 int32_t count = getRangeCount(); 382 for (int32_t i = 0; i < count; ++i) { 383 n += getRangeEnd(i) - getRangeStart(i) + 1; 384 } 385 return n + strings->size(); 386 } 387 388 /** 389 * Returns <tt>true</tt> if this set contains no elements. 390 * 391 * @return <tt>true</tt> if this set contains no elements. 392 */ 393 UBool UnicodeSet::isEmpty(void) const { 394 return len == 1 && strings->size() == 0; 395 } 396 397 /** 398 * Returns true if this set contains the given character. 399 * @param c character to be checked for containment 400 * @return true if the test condition is met 401 */ 402 UBool UnicodeSet::contains(UChar32 c) const { 403 // Set i to the index of the start item greater than ch 404 // We know we will terminate without length test! 405 // LATER: for large sets, add binary search 406 //int32_t i = -1; 407 //for (;;) { 408 // if (c < list[++i]) break; 409 //} 410 if (bmpSet != NULL) { 411 return bmpSet->contains(c); 412 } 413 if (stringSpan != NULL) { 414 return stringSpan->contains(c); 415 } 416 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound 417 return FALSE; 418 } 419 int32_t i = findCodePoint(c); 420 return (UBool)(i & 1); // return true if odd 421 } 422 423 /** 424 * Returns the smallest value i such that c < list[i]. Caller 425 * must ensure that c is a legal value or this method will enter 426 * an infinite loop. This method performs a binary search. 427 * @param c a character in the range MIN_VALUE..MAX_VALUE 428 * inclusive 429 * @return the smallest integer i in the range 0..len-1, 430 * inclusive, such that c < list[i] 431 */ 432 int32_t UnicodeSet::findCodePoint(UChar32 c) const { 433 /* Examples: 434 findCodePoint(c) 435 set list[] c=0 1 3 4 7 8 436 === ============== =========== 437 [] [110000] 0 0 0 0 0 0 438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 440 [:Any:] [0, 110000] 1 1 1 1 1 1 441 */ 442 443 // Return the smallest i such that c < list[i]. Assume 444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 445 if (c < list[0]) 446 return 0; 447 // High runner test. c is often after the last range, so an 448 // initial check for this condition pays off. 449 int32_t lo = 0; 450 int32_t hi = len - 1; 451 if (lo >= hi || c >= list[hi-1]) 452 return hi; 453 // invariant: c >= list[lo] 454 // invariant: c < list[hi] 455 for (;;) { 456 int32_t i = (lo + hi) >> 1; 457 if (i == lo) { 458 break; // Found! 459 } else if (c < list[i]) { 460 hi = i; 461 } else { 462 lo = i; 463 } 464 } 465 return hi; 466 } 467 468 /** 469 * Returns true if this set contains every character 470 * of the given range. 471 * @param start first character, inclusive, of the range 472 * @param end last character, inclusive, of the range 473 * @return true if the test condition is met 474 */ 475 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { 476 //int32_t i = -1; 477 //for (;;) { 478 // if (start < list[++i]) break; 479 //} 480 int32_t i = findCodePoint(start); 481 return ((i & 1) != 0 && end < list[i]); 482 } 483 484 /** 485 * Returns <tt>true</tt> if this set contains the given 486 * multicharacter string. 487 * @param s string to be checked for containment 488 * @return <tt>true</tt> if this set contains the specified string 489 */ 490 UBool UnicodeSet::contains(const UnicodeString& s) const { 491 if (s.length() == 0) return FALSE; 492 int32_t cp = getSingleCP(s); 493 if (cp < 0) { 494 return strings->contains((void*) &s); 495 } else { 496 return contains((UChar32) cp); 497 } 498 } 499 500 /** 501 * Returns true if this set contains all the characters and strings 502 * of the given set. 503 * @param c set to be checked for containment 504 * @return true if the test condition is met 505 */ 506 UBool UnicodeSet::containsAll(const UnicodeSet& c) const { 507 // The specified set is a subset if all of its pairs are contained in 508 // this set. It's possible to code this more efficiently in terms of 509 // direct manipulation of the inversion lists if the need arises. 510 int32_t n = c.getRangeCount(); 511 for (int i=0; i<n; ++i) { 512 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 513 return FALSE; 514 } 515 } 516 if (!strings->containsAll(*c.strings)) return FALSE; 517 return TRUE; 518 } 519 520 /** 521 * Returns true if this set contains all the characters 522 * of the given string. 523 * @param s string containing characters to be checked for containment 524 * @return true if the test condition is met 525 */ 526 UBool UnicodeSet::containsAll(const UnicodeString& s) const { 527 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == 528 s.length()); 529 } 530 531 /** 532 * Returns true if this set contains none of the characters 533 * of the given range. 534 * @param start first character, inclusive, of the range 535 * @param end last character, inclusive, of the range 536 * @return true if the test condition is met 537 */ 538 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { 539 //int32_t i = -1; 540 //for (;;) { 541 // if (start < list[++i]) break; 542 //} 543 int32_t i = findCodePoint(start); 544 return ((i & 1) == 0 && end < list[i]); 545 } 546 547 /** 548 * Returns true if this set contains none of the characters and strings 549 * of the given set. 550 * @param c set to be checked for containment 551 * @return true if the test condition is met 552 */ 553 UBool UnicodeSet::containsNone(const UnicodeSet& c) const { 554 // The specified set is a subset if all of its pairs are contained in 555 // this set. It's possible to code this more efficiently in terms of 556 // direct manipulation of the inversion lists if the need arises. 557 int32_t n = c.getRangeCount(); 558 for (int32_t i=0; i<n; ++i) { 559 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 560 return FALSE; 561 } 562 } 563 if (!strings->containsNone(*c.strings)) return FALSE; 564 return TRUE; 565 } 566 567 /** 568 * Returns true if this set contains none of the characters 569 * of the given string. 570 * @param s string containing characters to be checked for containment 571 * @return true if the test condition is met 572 */ 573 UBool UnicodeSet::containsNone(const UnicodeString& s) const { 574 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == 575 s.length()); 576 } 577 578 /** 579 * Returns <tt>true</tt> if this set contains any character whose low byte 580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 581 * indexing. 582 */ 583 UBool UnicodeSet::matchesIndexValue(uint8_t v) const { 584 /* The index value v, in the range [0,255], is contained in this set if 585 * it is contained in any pair of this set. Pairs either have the high 586 * bytes equal, or unequal. If the high bytes are equal, then we have 587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 589 * Then v is contained if xx <= v || v <= yy. (This is identical to the 590 * time zone month containment logic.) 591 */ 592 int32_t i; 593 int32_t rangeCount=getRangeCount(); 594 for (i=0; i<rangeCount; ++i) { 595 UChar32 low = getRangeStart(i); 596 UChar32 high = getRangeEnd(i); 597 if ((low & ~0xFF) == (high & ~0xFF)) { 598 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 599 return TRUE; 600 } 601 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 602 return TRUE; 603 } 604 } 605 if (strings->size() != 0) { 606 for (i=0; i<strings->size(); ++i) { 607 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); 608 //if (s.length() == 0) { 609 // // Empty strings match everything 610 // return TRUE; 611 //} 612 // assert(s.length() != 0); // We enforce this elsewhere 613 UChar32 c = s.char32At(0); 614 if ((c & 0xFF) == v) { 615 return TRUE; 616 } 617 } 618 } 619 return FALSE; 620 } 621 622 /** 623 * Implementation of UnicodeMatcher::matches(). Always matches the 624 * longest possible multichar string. 625 */ 626 UMatchDegree UnicodeSet::matches(const Replaceable& text, 627 int32_t& offset, 628 int32_t limit, 629 UBool incremental) { 630 if (offset == limit) { 631 // Strings, if any, have length != 0, so we don't worry 632 // about them here. If we ever allow zero-length strings 633 // we much check for them here. 634 if (contains(U_ETHER)) { 635 return incremental ? U_PARTIAL_MATCH : U_MATCH; 636 } else { 637 return U_MISMATCH; 638 } 639 } else { 640 if (strings->size() != 0) { // try strings first 641 642 // might separate forward and backward loops later 643 // for now they are combined 644 645 // TODO Improve efficiency of this, at least in the forward 646 // direction, if not in both. In the forward direction we 647 // can assume the strings are sorted. 648 649 int32_t i; 650 UBool forward = offset < limit; 651 652 // firstChar is the leftmost char to match in the 653 // forward direction or the rightmost char to match in 654 // the reverse direction. 655 UChar firstChar = text.charAt(offset); 656 657 // If there are multiple strings that can match we 658 // return the longest match. 659 int32_t highWaterLength = 0; 660 661 for (i=0; i<strings->size(); ++i) { 662 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); 663 664 //if (trial.length() == 0) { 665 // return U_MATCH; // null-string always matches 666 //} 667 // assert(trial.length() != 0); // We ensure this elsewhere 668 669 UChar c = trial.charAt(forward ? 0 : trial.length() - 1); 670 671 // Strings are sorted, so we can optimize in the 672 // forward direction. 673 if (forward && c > firstChar) break; 674 if (c != firstChar) continue; 675 676 int32_t matchLen = matchRest(text, offset, limit, trial); 677 678 if (incremental) { 679 int32_t maxLen = forward ? limit-offset : offset-limit; 680 if (matchLen == maxLen) { 681 // We have successfully matched but only up to limit. 682 return U_PARTIAL_MATCH; 683 } 684 } 685 686 if (matchLen == trial.length()) { 687 // We have successfully matched the whole string. 688 if (matchLen > highWaterLength) { 689 highWaterLength = matchLen; 690 } 691 // In the forward direction we know strings 692 // are sorted so we can bail early. 693 if (forward && matchLen < highWaterLength) { 694 break; 695 } 696 continue; 697 } 698 } 699 700 // We've checked all strings without a partial match. 701 // If we have full matches, return the longest one. 702 if (highWaterLength != 0) { 703 offset += forward ? highWaterLength : -highWaterLength; 704 return U_MATCH; 705 } 706 } 707 return UnicodeFilter::matches(text, offset, limit, incremental); 708 } 709 } 710 711 /** 712 * Returns the longest match for s in text at the given position. 713 * If limit > start then match forward from start+1 to limit 714 * matching all characters except s.charAt(0). If limit < start, 715 * go backward starting from start-1 matching all characters 716 * except s.charAt(s.length()-1). This method assumes that the 717 * first character, text.charAt(start), matches s, so it does not 718 * check it. 719 * @param text the text to match 720 * @param start the first character to match. In the forward 721 * direction, text.charAt(start) is matched against s.charAt(0). 722 * In the reverse direction, it is matched against 723 * s.charAt(s.length()-1). 724 * @param limit the limit offset for matching, either last+1 in 725 * the forward direction, or last-1 in the reverse direction, 726 * where last is the index of the last character to match. 727 * @return If part of s matches up to the limit, return |limit - 728 * start|. If all of s matches before reaching the limit, return 729 * s.length(). If there is a mismatch between s and text, return 730 * 0 731 */ 732 int32_t UnicodeSet::matchRest(const Replaceable& text, 733 int32_t start, int32_t limit, 734 const UnicodeString& s) { 735 int32_t i; 736 int32_t maxLen; 737 int32_t slen = s.length(); 738 if (start < limit) { 739 maxLen = limit - start; 740 if (maxLen > slen) maxLen = slen; 741 for (i = 1; i < maxLen; ++i) { 742 if (text.charAt(start + i) != s.charAt(i)) return 0; 743 } 744 } else { 745 maxLen = start - limit; 746 if (maxLen > slen) maxLen = slen; 747 --slen; // <=> slen = s.length() - 1; 748 for (i = 1; i < maxLen; ++i) { 749 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 750 } 751 } 752 return maxLen; 753 } 754 755 /** 756 * Implement of UnicodeMatcher 757 */ 758 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { 759 toUnionTo.addAll(*this); 760 } 761 762 /** 763 * Returns the index of the given character within this set, where 764 * the set is ordered by ascending code point. If the character 765 * is not in this set, return -1. The inverse of this method is 766 * <code>charAt()</code>. 767 * @return an index from 0..size()-1, or -1 768 */ 769 int32_t UnicodeSet::indexOf(UChar32 c) const { 770 if (c < MIN_VALUE || c > MAX_VALUE) { 771 return -1; 772 } 773 int32_t i = 0; 774 int32_t n = 0; 775 for (;;) { 776 UChar32 start = list[i++]; 777 if (c < start) { 778 return -1; 779 } 780 UChar32 limit = list[i++]; 781 if (c < limit) { 782 return n + c - start; 783 } 784 n += limit - start; 785 } 786 } 787 788 /** 789 * Returns the character at the given index within this set, where 790 * the set is ordered by ascending code point. If the index is 791 * out of range, return (UChar32)-1. The inverse of this method is 792 * <code>indexOf()</code>. 793 * @param index an index from 0..size()-1 794 * @return the character at the given index, or (UChar32)-1. 795 */ 796 UChar32 UnicodeSet::charAt(int32_t index) const { 797 if (index >= 0) { 798 // len2 is the largest even integer <= len, that is, it is len 799 // for even values and len-1 for odd values. With odd values 800 // the last entry is UNICODESET_HIGH. 801 int32_t len2 = len & ~1; 802 for (int32_t i=0; i < len2;) { 803 UChar32 start = list[i++]; 804 int32_t count = list[i++] - start; 805 if (index < count) { 806 return (UChar32)(start + index); 807 } 808 index -= count; 809 } 810 } 811 return (UChar32)-1; 812 } 813 814 /** 815 * Make this object represent the range <code>start - end</code>. 816 * If <code>end > start</code> then this object is set to an 817 * an empty range. 818 * 819 * @param start first character in the set, inclusive 820 * @rparam end last character in the set, inclusive 821 */ 822 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { 823 clear(); 824 complement(start, end); 825 return *this; 826 } 827 828 /** 829 * Adds the specified range to this set if it is not already 830 * present. If this set already contains the specified range, 831 * the call leaves this set unchanged. If <code>end > start</code> 832 * then an empty range is added, leaving the set unchanged. 833 * 834 * @param start first character, inclusive, of range to be added 835 * to this set. 836 * @param end last character, inclusive, of range to be added 837 * to this set. 838 */ 839 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { 840 if (pinCodePoint(start) < pinCodePoint(end)) { 841 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 842 add(range, 2, 0); 843 } else if (start == end) { 844 add(start); 845 } 846 return *this; 847 } 848 849 // #define DEBUG_US_ADD 850 851 #ifdef DEBUG_US_ADD 852 #include <stdio.h> 853 void dump(UChar32 c) { 854 if (c <= 0xFF) { 855 printf("%c", (char)c); 856 } else { 857 printf("U+%04X", c); 858 } 859 } 860 void dump(const UChar32* list, int32_t len) { 861 printf("["); 862 for (int32_t i=0; i<len; ++i) { 863 if (i != 0) printf(", "); 864 dump(list[i]); 865 } 866 printf("]"); 867 } 868 #endif 869 870 /** 871 * Adds the specified character to this set if it is not already 872 * present. If this set already contains the specified character, 873 * the call leaves this set unchanged. 874 */ 875 UnicodeSet& UnicodeSet::add(UChar32 c) { 876 // find smallest i such that c < list[i] 877 // if odd, then it is IN the set 878 // if even, then it is OUT of the set 879 int32_t i = findCodePoint(pinCodePoint(c)); 880 881 // already in set? 882 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; 883 884 // HIGH is 0x110000 885 // assert(list[len-1] == HIGH); 886 887 // empty = [HIGH] 888 // [start_0, limit_0, start_1, limit_1, HIGH] 889 890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 891 // ^ 892 // list[i] 893 894 // i == 0 means c is before the first range 895 896 #ifdef DEBUG_US_ADD 897 printf("Add of "); 898 dump(c); 899 printf(" found at %d", i); 900 printf(": "); 901 dump(list, len); 902 printf(" => "); 903 #endif 904 905 if (c == list[i]-1) { 906 // c is before start of next range 907 list[i] = c; 908 // if we touched the HIGH mark, then add a new one 909 if (c == (UNICODESET_HIGH - 1)) { 910 UErrorCode status = U_ZERO_ERROR; 911 ensureCapacity(len+1, status); 912 if (U_FAILURE(status)) { 913 return *this; // There is no way to report this error :-( 914 } 915 list[len++] = UNICODESET_HIGH; 916 } 917 if (i > 0 && c == list[i-1]) { 918 // collapse adjacent ranges 919 920 // [..., start_k-1, c, c, limit_k, ..., HIGH] 921 // ^ 922 // list[i] 923 924 //for (int32_t k=i-1; k<len-2; ++k) { 925 // list[k] = list[k+2]; 926 //} 927 UChar32* dst = list + i - 1; 928 UChar32* src = dst + 2; 929 UChar32* srclimit = list + len; 930 while (src < srclimit) *(dst++) = *(src++); 931 932 len -= 2; 933 } 934 } 935 936 else if (i > 0 && c == list[i-1]) { 937 // c is after end of prior range 938 list[i-1]++; 939 // no need to check for collapse here 940 } 941 942 else { 943 // At this point we know the new char is not adjacent to 944 // any existing ranges, and it is not 10FFFF. 945 946 947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 948 // ^ 949 // list[i] 950 951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 952 // ^ 953 // list[i] 954 955 UErrorCode status = U_ZERO_ERROR; 956 ensureCapacity(len+2, status); 957 if (U_FAILURE(status)) { 958 return *this; // There is no way to report this error :-( 959 } 960 961 //for (int32_t k=len-1; k>=i; --k) { 962 // list[k+2] = list[k]; 963 //} 964 UChar32* src = list + len; 965 UChar32* dst = src + 2; 966 UChar32* srclimit = list + i; 967 while (src > srclimit) *(--dst) = *(--src); 968 969 list[i] = c; 970 list[i+1] = c+1; 971 len += 2; 972 } 973 974 #ifdef DEBUG_US_ADD 975 dump(list, len); 976 printf("\n"); 977 978 for (i=1; i<len; ++i) { 979 if (list[i] <= list[i-1]) { 980 // Corrupt array! 981 printf("ERROR: list has been corrupted\n"); 982 exit(1); 983 } 984 } 985 #endif 986 987 releasePattern(); 988 return *this; 989 } 990 991 /** 992 * Adds the specified multicharacter to this set if it is not already 993 * present. If this set already contains the multicharacter, 994 * the call leaves this set unchanged. 995 * Thus "ch" => {"ch"} 996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 997 * @param s the source string 998 * @return the modified set, for chaining 999 */ 1000 UnicodeSet& UnicodeSet::add(const UnicodeString& s) { 1001 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1002 int32_t cp = getSingleCP(s); 1003 if (cp < 0) { 1004 if (!strings->contains((void*) &s)) { 1005 _add(s); 1006 releasePattern(); 1007 } 1008 } else { 1009 add((UChar32)cp); 1010 } 1011 return *this; 1012 } 1013 1014 /** 1015 * Adds the given string, in order, to 'strings'. The given string 1016 * must have been checked by the caller to not be empty and to not 1017 * already be in 'strings'. 1018 */ 1019 void UnicodeSet::_add(const UnicodeString& s) { 1020 if (isFrozen() || isBogus()) { 1021 return; 1022 } 1023 UnicodeString* t = new UnicodeString(s); 1024 if (t == NULL) { // Check for memory allocation error. 1025 setToBogus(); 1026 return; 1027 } 1028 UErrorCode ec = U_ZERO_ERROR; 1029 strings->sortedInsert(t, compareUnicodeString, ec); 1030 if (U_FAILURE(ec)) { 1031 setToBogus(); 1032 delete t; 1033 } 1034 } 1035 1036 /** 1037 * @return a code point IF the string consists of a single one. 1038 * otherwise returns -1. 1039 * @param string to test 1040 */ 1041 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { 1042 //if (s.length() < 1) { 1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1044 //} 1045 if (s.length() > 2) return -1; 1046 if (s.length() == 1) return s.charAt(0); 1047 1048 // at this point, len = 2 1049 UChar32 cp = s.char32At(0); 1050 if (cp > 0xFFFF) { // is surrogate pair 1051 return cp; 1052 } 1053 return -1; 1054 } 1055 1056 /** 1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1058 * If this set already any particular character, it has no effect on that character. 1059 * @param the source string 1060 * @return the modified set, for chaining 1061 */ 1062 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { 1063 UChar32 cp; 1064 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1065 cp = s.char32At(i); 1066 add(cp); 1067 } 1068 return *this; 1069 } 1070 1071 /** 1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1073 * If this set already any particular character, it has no effect on that character. 1074 * @param the source string 1075 * @return the modified set, for chaining 1076 */ 1077 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { 1078 UnicodeSet set; 1079 set.addAll(s); 1080 retainAll(set); 1081 return *this; 1082 } 1083 1084 /** 1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1086 * If this set already any particular character, it has no effect on that character. 1087 * @param the source string 1088 * @return the modified set, for chaining 1089 */ 1090 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { 1091 UnicodeSet set; 1092 set.addAll(s); 1093 complementAll(set); 1094 return *this; 1095 } 1096 1097 /** 1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1099 * If this set already any particular character, it has no effect on that character. 1100 * @param the source string 1101 * @return the modified set, for chaining 1102 */ 1103 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { 1104 UnicodeSet set; 1105 set.addAll(s); 1106 removeAll(set); 1107 return *this; 1108 } 1109 1110 UnicodeSet& UnicodeSet::removeAllStrings() { 1111 strings->removeAllElements(); 1112 return *this; 1113 } 1114 1115 1116 /** 1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1119 * @param the source string 1120 * @return a newly created set containing the given string 1121 */ 1122 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { 1123 UnicodeSet *set = new UnicodeSet(); 1124 if (set != NULL) { // Check for memory allocation error. 1125 set->add(s); 1126 } 1127 return set; 1128 } 1129 1130 1131 /** 1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1133 * @param the source string 1134 * @return a newly created set containing the given characters 1135 */ 1136 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { 1137 UnicodeSet *set = new UnicodeSet(); 1138 if (set != NULL) { // Check for memory allocation error. 1139 set->addAll(s); 1140 } 1141 return set; 1142 } 1143 1144 /** 1145 * Retain only the elements in this set that are contained in the 1146 * specified range. If <code>end > start</code> then an empty range is 1147 * retained, leaving the set empty. 1148 * 1149 * @param start first character, inclusive, of range to be retained 1150 * to this set. 1151 * @param end last character, inclusive, of range to be retained 1152 * to this set. 1153 */ 1154 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { 1155 if (pinCodePoint(start) <= pinCodePoint(end)) { 1156 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1157 retain(range, 2, 0); 1158 } else { 1159 clear(); 1160 } 1161 return *this; 1162 } 1163 1164 UnicodeSet& UnicodeSet::retain(UChar32 c) { 1165 return retain(c, c); 1166 } 1167 1168 /** 1169 * Removes the specified range from this set if it is present. 1170 * The set will not contain the specified range once the call 1171 * returns. If <code>end > start</code> then an empty range is 1172 * removed, leaving the set unchanged. 1173 * 1174 * @param start first character, inclusive, of range to be removed 1175 * from this set. 1176 * @param end last character, inclusive, of range to be removed 1177 * from this set. 1178 */ 1179 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { 1180 if (pinCodePoint(start) <= pinCodePoint(end)) { 1181 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1182 retain(range, 2, 2); 1183 } 1184 return *this; 1185 } 1186 1187 /** 1188 * Removes the specified character from this set if it is present. 1189 * The set will not contain the specified range once the call 1190 * returns. 1191 */ 1192 UnicodeSet& UnicodeSet::remove(UChar32 c) { 1193 return remove(c, c); 1194 } 1195 1196 /** 1197 * Removes the specified string from this set if it is present. 1198 * The set will not contain the specified character once the call 1199 * returns. 1200 * @param the source string 1201 * @return the modified set, for chaining 1202 */ 1203 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { 1204 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1205 int32_t cp = getSingleCP(s); 1206 if (cp < 0) { 1207 strings->removeElement((void*) &s); 1208 releasePattern(); 1209 } else { 1210 remove((UChar32)cp, (UChar32)cp); 1211 } 1212 return *this; 1213 } 1214 1215 /** 1216 * Complements the specified range in this set. Any character in 1217 * the range will be removed if it is in this set, or will be 1218 * added if it is not in this set. If <code>end > start</code> 1219 * then an empty range is xor'ed, leaving the set unchanged. 1220 * 1221 * @param start first character, inclusive, of range to be removed 1222 * from this set. 1223 * @param end last character, inclusive, of range to be removed 1224 * from this set. 1225 */ 1226 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { 1227 if (isFrozen() || isBogus()) { 1228 return *this; 1229 } 1230 if (pinCodePoint(start) <= pinCodePoint(end)) { 1231 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1232 exclusiveOr(range, 2, 0); 1233 } 1234 releasePattern(); 1235 return *this; 1236 } 1237 1238 UnicodeSet& UnicodeSet::complement(UChar32 c) { 1239 return complement(c, c); 1240 } 1241 1242 /** 1243 * This is equivalent to 1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1245 */ 1246 UnicodeSet& UnicodeSet::complement(void) { 1247 if (isFrozen() || isBogus()) { 1248 return *this; 1249 } 1250 UErrorCode status = U_ZERO_ERROR; 1251 if (list[0] == UNICODESET_LOW) { 1252 ensureBufferCapacity(len-1, status); 1253 if (U_FAILURE(status)) { 1254 return *this; 1255 } 1256 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); 1257 --len; 1258 } else { 1259 ensureBufferCapacity(len+1, status); 1260 if (U_FAILURE(status)) { 1261 return *this; 1262 } 1263 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); 1264 buffer[0] = UNICODESET_LOW; 1265 ++len; 1266 } 1267 swapBuffers(); 1268 releasePattern(); 1269 return *this; 1270 } 1271 1272 /** 1273 * Complement the specified string in this set. 1274 * The set will not contain the specified string once the call 1275 * returns. 1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1277 * @param s the string to complement 1278 * @return this object, for chaining 1279 */ 1280 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { 1281 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1282 int32_t cp = getSingleCP(s); 1283 if (cp < 0) { 1284 if (strings->contains((void*) &s)) { 1285 strings->removeElement((void*) &s); 1286 } else { 1287 _add(s); 1288 } 1289 releasePattern(); 1290 } else { 1291 complement((UChar32)cp, (UChar32)cp); 1292 } 1293 return *this; 1294 } 1295 1296 /** 1297 * Adds all of the elements in the specified set to this set if 1298 * they're not already present. This operation effectively 1299 * modifies this set so that its value is the <i>union</i> of the two 1300 * sets. The behavior of this operation is unspecified if the specified 1301 * collection is modified while the operation is in progress. 1302 * 1303 * @param c set whose elements are to be added to this set. 1304 * @see #add(char, char) 1305 */ 1306 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { 1307 if ( c.len>0 && c.list!=NULL ) { 1308 add(c.list, c.len, 0); 1309 } 1310 1311 // Add strings in order 1312 if ( c.strings!=NULL ) { 1313 for (int32_t i=0; i<c.strings->size(); ++i) { 1314 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); 1315 if (!strings->contains((void*) s)) { 1316 _add(*s); 1317 } 1318 } 1319 } 1320 return *this; 1321 } 1322 1323 /** 1324 * Retains only the elements in this set that are contained in the 1325 * specified set. In other words, removes from this set all of 1326 * its elements that are not contained in the specified set. This 1327 * operation effectively modifies this set so that its value is 1328 * the <i>intersection</i> of the two sets. 1329 * 1330 * @param c set that defines which elements this set will retain. 1331 */ 1332 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { 1333 if (isFrozen() || isBogus()) { 1334 return *this; 1335 } 1336 retain(c.list, c.len, 0); 1337 strings->retainAll(*c.strings); 1338 return *this; 1339 } 1340 1341 /** 1342 * Removes from this set all of its elements that are contained in the 1343 * specified set. This operation effectively modifies this 1344 * set so that its value is the <i>asymmetric set difference</i> of 1345 * the two sets. 1346 * 1347 * @param c set that defines which elements will be removed from 1348 * this set. 1349 */ 1350 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { 1351 if (isFrozen() || isBogus()) { 1352 return *this; 1353 } 1354 retain(c.list, c.len, 2); 1355 strings->removeAll(*c.strings); 1356 return *this; 1357 } 1358 1359 /** 1360 * Complements in this set all elements contained in the specified 1361 * set. Any character in the other set will be removed if it is 1362 * in this set, or will be added if it is not in this set. 1363 * 1364 * @param c set that defines which elements will be xor'ed from 1365 * this set. 1366 */ 1367 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { 1368 if (isFrozen() || isBogus()) { 1369 return *this; 1370 } 1371 exclusiveOr(c.list, c.len, 0); 1372 1373 for (int32_t i=0; i<c.strings->size(); ++i) { 1374 void* e = c.strings->elementAt(i); 1375 if (!strings->removeElement(e)) { 1376 _add(*(const UnicodeString*)e); 1377 } 1378 } 1379 return *this; 1380 } 1381 1382 /** 1383 * Removes all of the elements from this set. This set will be 1384 * empty after this call returns. 1385 */ 1386 UnicodeSet& UnicodeSet::clear(void) { 1387 if (isFrozen()) { 1388 return *this; 1389 } 1390 if (list != NULL) { 1391 list[0] = UNICODESET_HIGH; 1392 } 1393 len = 1; 1394 releasePattern(); 1395 if (strings != NULL) { 1396 strings->removeAllElements(); 1397 } 1398 if (list != NULL && strings != NULL) { 1399 // Remove bogus 1400 fFlags = 0; 1401 } 1402 return *this; 1403 } 1404 1405 /** 1406 * Iteration method that returns the number of ranges contained in 1407 * this set. 1408 * @see #getRangeStart 1409 * @see #getRangeEnd 1410 */ 1411 int32_t UnicodeSet::getRangeCount() const { 1412 return len/2; 1413 } 1414 1415 /** 1416 * Iteration method that returns the first character in the 1417 * specified range of this set. 1418 * @see #getRangeCount 1419 * @see #getRangeEnd 1420 */ 1421 UChar32 UnicodeSet::getRangeStart(int32_t index) const { 1422 return list[index*2]; 1423 } 1424 1425 /** 1426 * Iteration method that returns the last character in the 1427 * specified range of this set. 1428 * @see #getRangeStart 1429 * @see #getRangeEnd 1430 */ 1431 UChar32 UnicodeSet::getRangeEnd(int32_t index) const { 1432 return list[index*2 + 1] - 1; 1433 } 1434 1435 int32_t UnicodeSet::getStringCount() const { 1436 return strings->size(); 1437 } 1438 1439 const UnicodeString* UnicodeSet::getString(int32_t index) const { 1440 return (const UnicodeString*) strings->elementAt(index); 1441 } 1442 1443 /** 1444 * Reallocate this objects internal structures to take up the least 1445 * possible space, without changing this object's value. 1446 */ 1447 UnicodeSet& UnicodeSet::compact() { 1448 if (isFrozen() || isBogus()) { 1449 return *this; 1450 } 1451 // Delete buffer first to defragment memory less. 1452 if (buffer != NULL) { 1453 uprv_free(buffer); 1454 buffer = NULL; 1455 } 1456 if (len < capacity) { 1457 // Make the capacity equal to len or 1. 1458 // We don't want to realloc of 0 size. 1459 int32_t newCapacity = len + (len == 0); 1460 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); 1461 if (temp) { 1462 list = temp; 1463 capacity = newCapacity; 1464 } 1465 // else what the heck happened?! We allocated less memory! 1466 // Oh well. We'll keep our original array. 1467 } 1468 return *this; 1469 } 1470 1471 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { 1472 int32_t bmpLength, length, destLength; 1473 1474 if (U_FAILURE(ec)) { 1475 return 0; 1476 } 1477 1478 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1479 ec=U_ILLEGAL_ARGUMENT_ERROR; 1480 return 0; 1481 } 1482 1483 /* count necessary 16-bit units */ 1484 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH 1485 // assert(length>=0); 1486 if (length==0) { 1487 /* empty set */ 1488 if (destCapacity>0) { 1489 *dest=0; 1490 } else { 1491 ec=U_BUFFER_OVERFLOW_ERROR; 1492 } 1493 return 1; 1494 } 1495 /* now length>0 */ 1496 1497 if (this->list[length-1]<=0xffff) { 1498 /* all BMP */ 1499 bmpLength=length; 1500 } else if (this->list[0]>=0x10000) { 1501 /* all supplementary */ 1502 bmpLength=0; 1503 length*=2; 1504 } else { 1505 /* some BMP, some supplementary */ 1506 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} 1507 length=bmpLength+2*(length-bmpLength); 1508 } 1509 1510 /* length: number of 16-bit array units */ 1511 if (length>0x7fff) { 1512 /* there are only 15 bits for the length in the first serialized word */ 1513 ec=U_INDEX_OUTOFBOUNDS_ERROR; 1514 return 0; 1515 } 1516 1517 /* 1518 * total serialized length: 1519 * number of 16-bit array units (length) + 1520 * 1 length unit (always) + 1521 * 1 bmpLength unit (if there are supplementary values) 1522 */ 1523 destLength=length+((length>bmpLength)?2:1); 1524 if (destLength<=destCapacity) { 1525 const UChar32 *p; 1526 int32_t i; 1527 1528 *dest=(uint16_t)length; 1529 if (length>bmpLength) { 1530 *dest|=0x8000; 1531 *++dest=(uint16_t)bmpLength; 1532 } 1533 ++dest; 1534 1535 /* write the BMP part of the array */ 1536 p=this->list; 1537 for (i=0; i<bmpLength; ++i) { 1538 *dest++=(uint16_t)*p++; 1539 } 1540 1541 /* write the supplementary part of the array */ 1542 for (; i<length; i+=2) { 1543 *dest++=(uint16_t)(*p>>16); 1544 *dest++=(uint16_t)*p++; 1545 } 1546 } else { 1547 ec=U_BUFFER_OVERFLOW_ERROR; 1548 } 1549 return destLength; 1550 } 1551 1552 //---------------------------------------------------------------- 1553 // Implementation: Utility methods 1554 //---------------------------------------------------------------- 1555 1556 /** 1557 * Allocate our strings vector and return TRUE if successful. 1558 */ 1559 UBool UnicodeSet::allocateStrings(UErrorCode &status) { 1560 if (U_FAILURE(status)) { 1561 return FALSE; 1562 } 1563 strings = new UVector(uprv_deleteUObject, 1564 uhash_compareUnicodeString, 1, status); 1565 if (strings == NULL) { // Check for memory allocation error. 1566 status = U_MEMORY_ALLOCATION_ERROR; 1567 return FALSE; 1568 } 1569 if (U_FAILURE(status)) { 1570 delete strings; 1571 strings = NULL; 1572 return FALSE; 1573 } 1574 return TRUE; 1575 } 1576 1577 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { 1578 if (newLen <= capacity) 1579 return; 1580 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1581 if (temp == NULL) { 1582 ec = U_MEMORY_ALLOCATION_ERROR; 1583 setToBogus(); 1584 return; 1585 } 1586 list = temp; 1587 capacity = newLen + GROW_EXTRA; 1588 // else we keep the original contents on the memory failure. 1589 } 1590 1591 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { 1592 if (buffer != NULL && newLen <= bufferCapacity) 1593 return; 1594 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1595 if (temp == NULL) { 1596 ec = U_MEMORY_ALLOCATION_ERROR; 1597 setToBogus(); 1598 return; 1599 } 1600 buffer = temp; 1601 bufferCapacity = newLen + GROW_EXTRA; 1602 // else we keep the original contents on the memory failure. 1603 } 1604 1605 /** 1606 * Swap list and buffer. 1607 */ 1608 void UnicodeSet::swapBuffers(void) { 1609 // swap list and buffer 1610 UChar32* temp = list; 1611 list = buffer; 1612 buffer = temp; 1613 1614 int32_t c = capacity; 1615 capacity = bufferCapacity; 1616 bufferCapacity = c; 1617 } 1618 1619 void UnicodeSet::setToBogus() { 1620 clear(); // Remove everything in the set. 1621 fFlags = kIsBogus; 1622 } 1623 1624 //---------------------------------------------------------------- 1625 // Implementation: Fundamental operators 1626 //---------------------------------------------------------------- 1627 1628 static inline UChar32 max(UChar32 a, UChar32 b) { 1629 return (a > b) ? a : b; 1630 } 1631 1632 // polarity = 0, 3 is normal: x xor y 1633 // polarity = 1, 2: x xor ~y == x === y 1634 1635 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { 1636 if (isFrozen() || isBogus()) { 1637 return; 1638 } 1639 UErrorCode status = U_ZERO_ERROR; 1640 ensureBufferCapacity(len + otherLen, status); 1641 if (U_FAILURE(status)) { 1642 return; 1643 } 1644 1645 int32_t i = 0, j = 0, k = 0; 1646 UChar32 a = list[i++]; 1647 UChar32 b; 1648 if (polarity == 1 || polarity == 2) { 1649 b = UNICODESET_LOW; 1650 if (other[j] == UNICODESET_LOW) { // skip base if already LOW 1651 ++j; 1652 b = other[j]; 1653 } 1654 } else { 1655 b = other[j++]; 1656 } 1657 // simplest of all the routines 1658 // sort the values, discarding identicals! 1659 for (;;) { 1660 if (a < b) { 1661 buffer[k++] = a; 1662 a = list[i++]; 1663 } else if (b < a) { 1664 buffer[k++] = b; 1665 b = other[j++]; 1666 } else if (a != UNICODESET_HIGH) { // at this point, a == b 1667 // discard both values! 1668 a = list[i++]; 1669 b = other[j++]; 1670 } else { // DONE! 1671 buffer[k++] = UNICODESET_HIGH; 1672 len = k; 1673 break; 1674 } 1675 } 1676 swapBuffers(); 1677 releasePattern(); 1678 } 1679 1680 // polarity = 0 is normal: x union y 1681 // polarity = 2: x union ~y 1682 // polarity = 1: ~x union y 1683 // polarity = 3: ~x union ~y 1684 1685 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { 1686 if (isFrozen() || isBogus() || other==NULL) { 1687 return; 1688 } 1689 UErrorCode status = U_ZERO_ERROR; 1690 ensureBufferCapacity(len + otherLen, status); 1691 if (U_FAILURE(status)) { 1692 return; 1693 } 1694 1695 int32_t i = 0, j = 0, k = 0; 1696 UChar32 a = list[i++]; 1697 UChar32 b = other[j++]; 1698 // change from xor is that we have to check overlapping pairs 1699 // polarity bit 1 means a is second, bit 2 means b is. 1700 for (;;) { 1701 switch (polarity) { 1702 case 0: // both first; take lower if unequal 1703 if (a < b) { // take a 1704 // Back up over overlapping ranges in buffer[] 1705 if (k > 0 && a <= buffer[k-1]) { 1706 // Pick latter end value in buffer[] vs. list[] 1707 a = max(list[i], buffer[--k]); 1708 } else { 1709 // No overlap 1710 buffer[k++] = a; 1711 a = list[i]; 1712 } 1713 i++; // Common if/else code factored out 1714 polarity ^= 1; 1715 } else if (b < a) { // take b 1716 if (k > 0 && b <= buffer[k-1]) { 1717 b = max(other[j], buffer[--k]); 1718 } else { 1719 buffer[k++] = b; 1720 b = other[j]; 1721 } 1722 j++; 1723 polarity ^= 2; 1724 } else { // a == b, take a, drop b 1725 if (a == UNICODESET_HIGH) goto loop_end; 1726 // This is symmetrical; it doesn't matter if 1727 // we backtrack with a or b. - liu 1728 if (k > 0 && a <= buffer[k-1]) { 1729 a = max(list[i], buffer[--k]); 1730 } else { 1731 // No overlap 1732 buffer[k++] = a; 1733 a = list[i]; 1734 } 1735 i++; 1736 polarity ^= 1; 1737 b = other[j++]; 1738 polarity ^= 2; 1739 } 1740 break; 1741 case 3: // both second; take higher if unequal, and drop other 1742 if (b <= a) { // take a 1743 if (a == UNICODESET_HIGH) goto loop_end; 1744 buffer[k++] = a; 1745 } else { // take b 1746 if (b == UNICODESET_HIGH) goto loop_end; 1747 buffer[k++] = b; 1748 } 1749 a = list[i++]; 1750 polarity ^= 1; // factored common code 1751 b = other[j++]; 1752 polarity ^= 2; 1753 break; 1754 case 1: // a second, b first; if b < a, overlap 1755 if (a < b) { // no overlap, take a 1756 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1757 } else if (b < a) { // OVERLAP, drop b 1758 b = other[j++]; 1759 polarity ^= 2; 1760 } else { // a == b, drop both! 1761 if (a == UNICODESET_HIGH) goto loop_end; 1762 a = list[i++]; 1763 polarity ^= 1; 1764 b = other[j++]; 1765 polarity ^= 2; 1766 } 1767 break; 1768 case 2: // a first, b second; if a < b, overlap 1769 if (b < a) { // no overlap, take b 1770 buffer[k++] = b; 1771 b = other[j++]; 1772 polarity ^= 2; 1773 } else if (a < b) { // OVERLAP, drop a 1774 a = list[i++]; 1775 polarity ^= 1; 1776 } else { // a == b, drop both! 1777 if (a == UNICODESET_HIGH) goto loop_end; 1778 a = list[i++]; 1779 polarity ^= 1; 1780 b = other[j++]; 1781 polarity ^= 2; 1782 } 1783 break; 1784 } 1785 } 1786 loop_end: 1787 buffer[k++] = UNICODESET_HIGH; // terminate 1788 len = k; 1789 swapBuffers(); 1790 releasePattern(); 1791 } 1792 1793 // polarity = 0 is normal: x intersect y 1794 // polarity = 2: x intersect ~y == set-minus 1795 // polarity = 1: ~x intersect y 1796 // polarity = 3: ~x intersect ~y 1797 1798 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { 1799 if (isFrozen() || isBogus()) { 1800 return; 1801 } 1802 UErrorCode status = U_ZERO_ERROR; 1803 ensureBufferCapacity(len + otherLen, status); 1804 if (U_FAILURE(status)) { 1805 return; 1806 } 1807 1808 int32_t i = 0, j = 0, k = 0; 1809 UChar32 a = list[i++]; 1810 UChar32 b = other[j++]; 1811 // change from xor is that we have to check overlapping pairs 1812 // polarity bit 1 means a is second, bit 2 means b is. 1813 for (;;) { 1814 switch (polarity) { 1815 case 0: // both first; drop the smaller 1816 if (a < b) { // drop a 1817 a = list[i++]; 1818 polarity ^= 1; 1819 } else if (b < a) { // drop b 1820 b = other[j++]; 1821 polarity ^= 2; 1822 } else { // a == b, take one, drop other 1823 if (a == UNICODESET_HIGH) goto loop_end; 1824 buffer[k++] = a; 1825 a = list[i++]; 1826 polarity ^= 1; 1827 b = other[j++]; 1828 polarity ^= 2; 1829 } 1830 break; 1831 case 3: // both second; take lower if unequal 1832 if (a < b) { // take a 1833 buffer[k++] = a; 1834 a = list[i++]; 1835 polarity ^= 1; 1836 } else if (b < a) { // take b 1837 buffer[k++] = b; 1838 b = other[j++]; 1839 polarity ^= 2; 1840 } else { // a == b, take one, drop other 1841 if (a == UNICODESET_HIGH) goto loop_end; 1842 buffer[k++] = a; 1843 a = list[i++]; 1844 polarity ^= 1; 1845 b = other[j++]; 1846 polarity ^= 2; 1847 } 1848 break; 1849 case 1: // a second, b first; 1850 if (a < b) { // NO OVERLAP, drop a 1851 a = list[i++]; 1852 polarity ^= 1; 1853 } else if (b < a) { // OVERLAP, take b 1854 buffer[k++] = b; 1855 b = other[j++]; 1856 polarity ^= 2; 1857 } else { // a == b, drop both! 1858 if (a == UNICODESET_HIGH) goto loop_end; 1859 a = list[i++]; 1860 polarity ^= 1; 1861 b = other[j++]; 1862 polarity ^= 2; 1863 } 1864 break; 1865 case 2: // a first, b second; if a < b, overlap 1866 if (b < a) { // no overlap, drop b 1867 b = other[j++]; 1868 polarity ^= 2; 1869 } else if (a < b) { // OVERLAP, take a 1870 buffer[k++] = a; 1871 a = list[i++]; 1872 polarity ^= 1; 1873 } else { // a == b, drop both! 1874 if (a == UNICODESET_HIGH) goto loop_end; 1875 a = list[i++]; 1876 polarity ^= 1; 1877 b = other[j++]; 1878 polarity ^= 2; 1879 } 1880 break; 1881 } 1882 } 1883 loop_end: 1884 buffer[k++] = UNICODESET_HIGH; // terminate 1885 len = k; 1886 swapBuffers(); 1887 releasePattern(); 1888 } 1889 1890 /** 1891 * Append the <code>toPattern()</code> representation of a 1892 * string to the given <code>StringBuffer</code>. 1893 */ 1894 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool 1895 escapeUnprintable) { 1896 UChar32 cp; 1897 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1898 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); 1899 } 1900 } 1901 1902 /** 1903 * Append the <code>toPattern()</code> representation of a 1904 * character to the given <code>StringBuffer</code>. 1905 */ 1906 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool 1907 escapeUnprintable) { 1908 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1909 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything 1910 // unprintable 1911 if (ICU_Utility::escapeUnprintable(buf, c)) { 1912 return; 1913 } 1914 } 1915 // Okay to let ':' pass through 1916 switch (c) { 1917 case SET_OPEN: 1918 case SET_CLOSE: 1919 case HYPHEN: 1920 case COMPLEMENT: 1921 case INTERSECTION: 1922 case BACKSLASH: 1923 case OPEN_BRACE: 1924 case CLOSE_BRACE: 1925 case COLON: 1926 case SymbolTable::SYMBOL_REF: 1927 buf.append(BACKSLASH); 1928 break; 1929 default: 1930 // Escape whitespace 1931 if (PatternProps::isWhiteSpace(c)) { 1932 buf.append(BACKSLASH); 1933 } 1934 break; 1935 } 1936 buf.append(c); 1937 } 1938 1939 /** 1940 * Append a string representation of this set to result. This will be 1941 * a cleaned version of the string passed to applyPattern(), if there 1942 * is one. Otherwise it will be generated. 1943 */ 1944 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, 1945 UBool escapeUnprintable) const 1946 { 1947 if (pat != NULL) { 1948 int32_t i; 1949 int32_t backslashCount = 0; 1950 for (i=0; i<patLen; ) { 1951 UChar32 c; 1952 U16_NEXT(pat, i, patLen, c); 1953 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1954 // If the unprintable character is preceded by an odd 1955 // number of backslashes, then it has been escaped. 1956 // Before unescaping it, we delete the final 1957 // backslash. 1958 if ((backslashCount % 2) == 1) { 1959 result.truncate(result.length() - 1); 1960 } 1961 ICU_Utility::escapeUnprintable(result, c); 1962 backslashCount = 0; 1963 } else { 1964 result.append(c); 1965 if (c == BACKSLASH) { 1966 ++backslashCount; 1967 } else { 1968 backslashCount = 0; 1969 } 1970 } 1971 } 1972 return result; 1973 } 1974 1975 return _generatePattern(result, escapeUnprintable); 1976 } 1977 1978 /** 1979 * Returns a string representation of this set. If the result of 1980 * calling this function is passed to a UnicodeSet constructor, it 1981 * will produce another set that is equal to this one. 1982 */ 1983 UnicodeString& UnicodeSet::toPattern(UnicodeString& result, 1984 UBool escapeUnprintable) const 1985 { 1986 result.truncate(0); 1987 return _toPattern(result, escapeUnprintable); 1988 } 1989 1990 /** 1991 * Generate and append a string representation of this set to result. 1992 * This does not use this.pat, the cleaned up copy of the string 1993 * passed to applyPattern(). 1994 */ 1995 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, 1996 UBool escapeUnprintable) const 1997 { 1998 result.append(SET_OPEN); 1999 2000 // // Check against the predefined categories. We implicitly build 2001 // // up ALL category sets the first time toPattern() is called. 2002 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { 2003 // if (*this == getCategorySet(cat)) { 2004 // result.append(COLON); 2005 // result.append(CATEGORY_NAMES, cat*2, 2); 2006 // return result.append(CATEGORY_CLOSE); 2007 // } 2008 // } 2009 2010 int32_t count = getRangeCount(); 2011 2012 // If the set contains at least 2 intervals and includes both 2013 // MIN_VALUE and MAX_VALUE, then the inverse representation will 2014 // be more economical. 2015 if (count > 1 && 2016 getRangeStart(0) == MIN_VALUE && 2017 getRangeEnd(count-1) == MAX_VALUE) { 2018 2019 // Emit the inverse 2020 result.append(COMPLEMENT); 2021 2022 for (int32_t i = 1; i < count; ++i) { 2023 UChar32 start = getRangeEnd(i-1)+1; 2024 UChar32 end = getRangeStart(i)-1; 2025 _appendToPat(result, start, escapeUnprintable); 2026 if (start != end) { 2027 if ((start+1) != end) { 2028 result.append(HYPHEN); 2029 } 2030 _appendToPat(result, end, escapeUnprintable); 2031 } 2032 } 2033 } 2034 2035 // Default; emit the ranges as pairs 2036 else { 2037 for (int32_t i = 0; i < count; ++i) { 2038 UChar32 start = getRangeStart(i); 2039 UChar32 end = getRangeEnd(i); 2040 _appendToPat(result, start, escapeUnprintable); 2041 if (start != end) { 2042 if ((start+1) != end) { 2043 result.append(HYPHEN); 2044 } 2045 _appendToPat(result, end, escapeUnprintable); 2046 } 2047 } 2048 } 2049 2050 for (int32_t i = 0; i<strings->size(); ++i) { 2051 result.append(OPEN_BRACE); 2052 _appendToPat(result, 2053 *(const UnicodeString*) strings->elementAt(i), 2054 escapeUnprintable); 2055 result.append(CLOSE_BRACE); 2056 } 2057 return result.append(SET_CLOSE); 2058 } 2059 2060 /** 2061 * Release existing cached pattern 2062 */ 2063 void UnicodeSet::releasePattern() { 2064 if (pat) { 2065 uprv_free(pat); 2066 pat = NULL; 2067 patLen = 0; 2068 } 2069 } 2070 2071 /** 2072 * Set the new pattern to cache. 2073 */ 2074 void UnicodeSet::setPattern(const UnicodeString& newPat) { 2075 releasePattern(); 2076 int32_t newPatLen = newPat.length(); 2077 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); 2078 if (pat) { 2079 patLen = newPatLen; 2080 newPat.extractBetween(0, patLen, pat); 2081 pat[patLen] = 0; 2082 } 2083 // else we don't care if malloc failed. This was just a nice cache. 2084 // We can regenerate an equivalent pattern later when requested. 2085 } 2086 2087 UnicodeFunctor *UnicodeSet::freeze() { 2088 if(!isFrozen() && !isBogus()) { 2089 // Do most of what compact() does before freezing because 2090 // compact() will not work when the set is frozen. 2091 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). 2092 2093 // Delete buffer first to defragment memory less. 2094 if (buffer != NULL) { 2095 uprv_free(buffer); 2096 buffer = NULL; 2097 } 2098 if (capacity > (len + GROW_EXTRA)) { 2099 // Make the capacity equal to len or 1. 2100 // We don't want to realloc of 0 size. 2101 capacity = len + (len == 0); 2102 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); 2103 if (list == NULL) { // Check for memory allocation error. 2104 setToBogus(); 2105 return this; 2106 } 2107 } 2108 2109 // Optimize contains() and span() and similar functions. 2110 if (!strings->isEmpty()) { 2111 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); 2112 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { 2113 // All strings are irrelevant for span() etc. because 2114 // all of each string's code points are contained in this set. 2115 // Do not check needsStringSpanUTF8() because UTF-8 has at most as 2116 // many relevant strings as UTF-16. 2117 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) 2118 delete stringSpan; 2119 stringSpan = NULL; 2120 } 2121 } 2122 if (stringSpan == NULL) { 2123 // No span-relevant strings: Optimize for code point spans. 2124 bmpSet=new BMPSet(list, len); 2125 if (bmpSet == NULL) { // Check for memory allocation error. 2126 setToBogus(); 2127 } 2128 } 2129 } 2130 return this; 2131 } 2132 2133 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 2134 if(length>0 && bmpSet!=NULL) { 2135 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s); 2136 } 2137 if(length<0) { 2138 length=u_strlen(s); 2139 } 2140 if(length==0) { 2141 return 0; 2142 } 2143 if(stringSpan!=NULL) { 2144 return stringSpan->span(s, length, spanCondition); 2145 } else if(!strings->isEmpty()) { 2146 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2147 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : 2148 UnicodeSetStringSpan::FWD_UTF16_CONTAINED; 2149 UnicodeSetStringSpan strSpan(*this, *strings, which); 2150 if(strSpan.needsStringSpanUTF16()) { 2151 return strSpan.span(s, length, spanCondition); 2152 } 2153 } 2154 2155 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2156 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2157 } 2158 2159 UChar32 c; 2160 int32_t start=0, prev=0; 2161 do { 2162 U16_NEXT(s, start, length, c); 2163 if(spanCondition!=contains(c)) { 2164 break; 2165 } 2166 } while((prev=start)<length); 2167 return prev; 2168 } 2169 2170 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 2171 if(length>0 && bmpSet!=NULL) { 2172 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s); 2173 } 2174 if(length<0) { 2175 length=u_strlen(s); 2176 } 2177 if(length==0) { 2178 return 0; 2179 } 2180 if(stringSpan!=NULL) { 2181 return stringSpan->spanBack(s, length, spanCondition); 2182 } else if(!strings->isEmpty()) { 2183 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2184 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : 2185 UnicodeSetStringSpan::BACK_UTF16_CONTAINED; 2186 UnicodeSetStringSpan strSpan(*this, *strings, which); 2187 if(strSpan.needsStringSpanUTF16()) { 2188 return strSpan.spanBack(s, length, spanCondition); 2189 } 2190 } 2191 2192 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2193 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2194 } 2195 2196 UChar32 c; 2197 int32_t prev=length; 2198 do { 2199 U16_PREV(s, 0, length, c); 2200 if(spanCondition!=contains(c)) { 2201 break; 2202 } 2203 } while((prev=length)>0); 2204 return prev; 2205 } 2206 2207 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2208 if(length>0 && bmpSet!=NULL) { 2209 const uint8_t *s0=(const uint8_t *)s; 2210 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0); 2211 } 2212 if(length<0) { 2213 length=(int32_t)uprv_strlen(s); 2214 } 2215 if(length==0) { 2216 return 0; 2217 } 2218 if(stringSpan!=NULL) { 2219 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); 2220 } else if(!strings->isEmpty()) { 2221 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2222 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : 2223 UnicodeSetStringSpan::FWD_UTF8_CONTAINED; 2224 UnicodeSetStringSpan strSpan(*this, *strings, which); 2225 if(strSpan.needsStringSpanUTF8()) { 2226 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); 2227 } 2228 } 2229 2230 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2231 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2232 } 2233 2234 UChar32 c; 2235 int32_t start=0, prev=0; 2236 do { 2237 U8_NEXT(s, start, length, c); 2238 if(c<0) { 2239 c=0xfffd; 2240 } 2241 if(spanCondition!=contains(c)) { 2242 break; 2243 } 2244 } while((prev=start)<length); 2245 return prev; 2246 } 2247 2248 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2249 if(length>0 && bmpSet!=NULL) { 2250 const uint8_t *s0=(const uint8_t *)s; 2251 return bmpSet->spanBackUTF8(s0, length, spanCondition); 2252 } 2253 if(length<0) { 2254 length=(int32_t)uprv_strlen(s); 2255 } 2256 if(length==0) { 2257 return 0; 2258 } 2259 if(stringSpan!=NULL) { 2260 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); 2261 } else if(!strings->isEmpty()) { 2262 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2263 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : 2264 UnicodeSetStringSpan::BACK_UTF8_CONTAINED; 2265 UnicodeSetStringSpan strSpan(*this, *strings, which); 2266 if(strSpan.needsStringSpanUTF8()) { 2267 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); 2268 } 2269 } 2270 2271 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2272 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2273 } 2274 2275 UChar32 c; 2276 int32_t prev=length; 2277 do { 2278 U8_PREV(s, 0, length, c); 2279 if(c<0) { 2280 c=0xfffd; 2281 } 2282 if(spanCondition!=contains(c)) { 2283 break; 2284 } 2285 } while((prev=length)>0); 2286 return prev; 2287 } 2288 2289 U_NAMESPACE_END 2290