1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2007, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: unisetspan.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2007mar01 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/uniset.h" 19 #include "unicode/ustring.h" 20 #include "cmemory.h" 21 #include "uvector.h" 22 #include "unisetspan.h" 23 24 U_NAMESPACE_BEGIN 25 26 /* 27 * List of offsets from the current position from where to try matching 28 * a code point or a string. 29 * Store offsets rather than indexes to simplify the code and use the same list 30 * for both increments (in span()) and decrements (in spanBack()). 31 * 32 * Assumption: The maximum offset is limited, and the offsets that are stored 33 * at any one time are relatively dense, that is, there are normally no gaps of 34 * hundreds or thousands of offset values. 35 * 36 * The implementation uses a circular buffer of byte flags, 37 * each indicating whether the corresponding offset is in the list. 38 * This avoids inserting into a sorted list of offsets (or absolute indexes) and 39 * physically moving part of the list. 40 * 41 * Note: In principle, the caller should setMaxLength() to the maximum of the 42 * max string length and U16_LENGTH/U8_LENGTH to account for 43 * "long" single code points. 44 * However, this implementation uses at least a staticList with more than 45 * U8_LENGTH entries anyway. 46 * 47 * Note: If maxLength were guaranteed to be no more than 32 or 64, 48 * the list could be stored as bit flags in a single integer. 49 * Rather than handling a circular buffer with a start list index, 50 * the integer would simply be shifted when lower offsets are removed. 51 * UnicodeSet does not have a limit on the lengths of strings. 52 */ 53 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 54 public: 55 OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 56 57 ~OffsetList() { 58 if(list!=staticList) { 59 uprv_free(list); 60 } 61 } 62 63 // Call exactly once if the list is to be used. 64 void setMaxLength(int32_t maxLength) { 65 if(maxLength<=(int32_t)sizeof(staticList)) { 66 capacity=(int32_t)sizeof(staticList); 67 } else { 68 UBool *l=(UBool *)uprv_malloc(maxLength); 69 if(l!=NULL) { 70 list=l; 71 capacity=maxLength; 72 } 73 } 74 uprv_memset(list, 0, capacity); 75 } 76 77 void clear() { 78 uprv_memset(list, 0, capacity); 79 start=length=0; 80 } 81 82 UBool isEmpty() const { 83 return (UBool)(length==0); 84 } 85 86 // Reduce all stored offsets by delta, used when the current position 87 // moves by delta. 88 // There must not be any offsets lower than delta. 89 // If there is an offset equal to delta, it is removed. 90 // delta=[1..maxLength] 91 void shift(int32_t delta) { 92 int32_t i=start+delta; 93 if(i>=capacity) { 94 i-=capacity; 95 } 96 if(list[i]) { 97 list[i]=FALSE; 98 --length; 99 } 100 start=i; 101 } 102 103 // Add an offset. The list must not contain it yet. 104 // offset=[1..maxLength] 105 void addOffset(int32_t offset) { 106 int32_t i=start+offset; 107 if(i>=capacity) { 108 i-=capacity; 109 } 110 list[i]=TRUE; 111 ++length; 112 } 113 114 // offset=[1..maxLength] 115 UBool containsOffset(int32_t offset) const { 116 int32_t i=start+offset; 117 if(i>=capacity) { 118 i-=capacity; 119 } 120 return list[i]; 121 } 122 123 // Find the lowest stored offset from a non-empty list, remove it, 124 // and reduce all other offsets by this minimum. 125 // Returns [1..maxLength]. 126 int32_t popMinimum() { 127 // Look for the next offset in list[start+1..capacity-1]. 128 int32_t i=start, result; 129 while(++i<capacity) { 130 if(list[i]) { 131 list[i]=FALSE; 132 --length; 133 result=i-start; 134 start=i; 135 return result; 136 } 137 } 138 // i==capacity 139 140 // Wrap around and look for the next offset in list[0..start]. 141 // Since the list is not empty, there will be one. 142 result=capacity-start; 143 i=0; 144 while(!list[i]) { 145 ++i; 146 } 147 list[i]=FALSE; 148 --length; 149 start=i; 150 return result+=i; 151 } 152 153 private: 154 UBool *list; 155 int32_t capacity; 156 int32_t length; 157 int32_t start; 158 159 UBool staticList[16]; 160 }; 161 162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string. 163 static int32_t 164 getUTF8Length(const UChar *s, int32_t length) { 165 UErrorCode errorCode=U_ZERO_ERROR; 166 int32_t length8=0; 167 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 168 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 169 return length8; 170 } else { 171 // The string contains an unpaired surrogate. 172 // Ignore this string. 173 return 0; 174 } 175 } 176 177 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 178 static int32_t 179 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 180 UErrorCode errorCode=U_ZERO_ERROR; 181 int32_t length8=0; 182 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 183 if(U_SUCCESS(errorCode)) { 184 return length8; 185 } else { 186 // The string contains an unpaired surrogate. 187 // Ignore this string. 188 return 0; 189 } 190 } 191 192 static inline uint8_t 193 makeSpanLengthByte(int32_t spanLength) { 194 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 195 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 196 } 197 198 // Construct for all variants of span(), or only for any one variant. 199 // Initialize as little as possible, for single use. 200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 201 const UVector &setStrings, 202 uint32_t which) 203 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 204 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 205 utf8Length(0), 206 maxLength16(0), maxLength8(0), 207 all((UBool)(which==ALL)) { 208 spanSet.retainAll(set); 209 if(which&NOT_CONTAINED) { 210 // Default to the same sets. 211 // addToSpanNotSet() will create a separate set if necessary. 212 pSpanNotSet=&spanSet; 213 } 214 215 // Determine if the strings even need to be taken into account at all for span() etc. 216 // If any string is relevant, then all strings need to be used for 217 // span(longest match) but only the relevant ones for span(while contained). 218 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 219 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 220 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 221 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 222 int32_t stringsLength=strings.size(); 223 224 int32_t i, spanLength; 225 UBool someRelevant=FALSE; 226 for(i=0; i<stringsLength; ++i) { 227 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 228 const UChar *s16=string.getBuffer(); 229 int32_t length16=string.length(); 230 UBool thisRelevant; 231 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 232 if(spanLength<length16) { // Relevant string. 233 someRelevant=thisRelevant=TRUE; 234 } else { 235 thisRelevant=FALSE; 236 } 237 if((which&UTF16) && length16>maxLength16) { 238 maxLength16=length16; 239 } 240 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 241 int32_t length8=getUTF8Length(s16, length16); 242 utf8Length+=length8; 243 if(length8>maxLength8) { 244 maxLength8=length8; 245 } 246 } 247 } 248 if(!someRelevant) { 249 maxLength16=maxLength8=0; 250 return; 251 } 252 253 // Freeze after checking for the need to use strings at all because freezing 254 // a set takes some time and memory which are wasted if there are no relevant strings. 255 if(all) { 256 spanSet.freeze(); 257 } 258 259 uint8_t *spanBackLengths; 260 uint8_t *spanUTF8Lengths; 261 uint8_t *spanBackUTF8Lengths; 262 263 // Allocate a block of meta data. 264 int32_t allocSize; 265 if(all) { 266 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 267 allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 268 } else { 269 allocSize=stringsLength; // One set of span lengths. 270 if(which&UTF8) { 271 // UTF-8 lengths and UTF-8 strings. 272 allocSize+=stringsLength*4+utf8Length; 273 } 274 } 275 if(allocSize<=(int32_t)sizeof(staticLengths)) { 276 utf8Lengths=staticLengths; 277 } else { 278 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 279 if(utf8Lengths==NULL) { 280 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 281 return; // Out of memory. 282 } 283 } 284 285 if(all) { 286 // Store span lengths for all span() variants. 287 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 288 spanBackLengths=spanLengths+stringsLength; 289 spanUTF8Lengths=spanBackLengths+stringsLength; 290 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 291 utf8=spanBackUTF8Lengths+stringsLength; 292 } else { 293 // Store span lengths for only one span() variant. 294 if(which&UTF8) { 295 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 296 utf8=spanLengths+stringsLength; 297 } else { 298 spanLengths=(uint8_t *)utf8Lengths; 299 } 300 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 301 } 302 303 // Set the meta data and pSpanNotSet and write the UTF-8 strings. 304 int32_t utf8Count=0; // Count UTF-8 bytes written so far. 305 306 for(i=0; i<stringsLength; ++i) { 307 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 308 const UChar *s16=string.getBuffer(); 309 int32_t length16=string.length(); 310 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 311 if(spanLength<length16) { // Relevant string. 312 if(which&UTF16) { 313 if(which&CONTAINED) { 314 if(which&FWD) { 315 spanLengths[i]=makeSpanLengthByte(spanLength); 316 } 317 if(which&BACK) { 318 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 319 spanBackLengths[i]=makeSpanLengthByte(spanLength); 320 } 321 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 322 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 323 } 324 } 325 if(which&UTF8) { 326 uint8_t *s8=utf8+utf8Count; 327 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 328 utf8Count+=utf8Lengths[i]=length8; 329 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 330 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 331 } else { // Relevant for UTF-8. 332 if(which&CONTAINED) { 333 if(which&FWD) { 334 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 335 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 336 } 337 if(which&BACK) { 338 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 339 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 340 } 341 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 342 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 343 } 344 } 345 } 346 if(which&NOT_CONTAINED) { 347 // Add string start and end code points to the spanNotSet so that 348 // a span(while not contained) stops before any string. 349 UChar32 c; 350 if(which&FWD) { 351 int32_t len=0; 352 U16_NEXT(s16, len, length16, c); 353 addToSpanNotSet(c); 354 } 355 if(which&BACK) { 356 int32_t len=length16; 357 U16_PREV(s16, 0, len, c); 358 addToSpanNotSet(c); 359 } 360 } 361 } else { // Irrelevant string. 362 if(which&UTF8) { 363 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 364 uint8_t *s8=utf8+utf8Count; 365 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 366 utf8Count+=utf8Lengths[i]=length8; 367 } else { 368 utf8Lengths[i]=0; 369 } 370 } 371 if(all) { 372 spanLengths[i]=spanBackLengths[i]= 373 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 374 (uint8_t)ALL_CP_CONTAINED; 375 } else { 376 // All spanXYZLengths pointers contain the same address. 377 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 378 } 379 } 380 } 381 382 // Finish. 383 if(all) { 384 pSpanNotSet->freeze(); 385 } 386 } 387 388 // Copy constructor. Assumes which==ALL for a frozen set. 389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 390 const UVector &newParentSetStrings) 391 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 392 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 393 utf8Length(otherStringSpan.utf8Length), 394 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 395 all(TRUE) { 396 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 397 pSpanNotSet=&spanSet; 398 } else { 399 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 400 } 401 402 // Allocate a block of meta data. 403 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 404 int32_t stringsLength=strings.size(); 405 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 406 if(allocSize<=(int32_t)sizeof(staticLengths)) { 407 utf8Lengths=staticLengths; 408 } else { 409 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 410 if(utf8Lengths==NULL) { 411 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 412 return; // Out of memory. 413 } 414 } 415 416 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 417 utf8=spanLengths+stringsLength*4; 418 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 419 } 420 421 UnicodeSetStringSpan::~UnicodeSetStringSpan() { 422 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 423 delete pSpanNotSet; 424 } 425 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 426 uprv_free(utf8Lengths); 427 } 428 } 429 430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 431 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 432 if(spanSet.contains(c)) { 433 return; // Nothing to do. 434 } 435 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 436 if(newSet==NULL) { 437 return; // Out of memory. 438 } else { 439 pSpanNotSet=newSet; 440 } 441 } 442 pSpanNotSet->add(c); 443 } 444 445 // Compare strings without any argument checks. Requires length>0. 446 static inline UBool 447 matches16(const UChar *s, const UChar *t, int32_t length) { 448 do { 449 if(*s++!=*t++) { 450 return FALSE; 451 } 452 } while(--length>0); 453 return TRUE; 454 } 455 456 static inline UBool 457 matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 458 do { 459 if(*s++!=*t++) { 460 return FALSE; 461 } 462 } while(--length>0); 463 return TRUE; 464 } 465 466 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 467 // at code point boundaries. 468 // That is, each edge of a match must not be in the middle of a surrogate pair. 469 static inline UBool 470 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 471 s+=start; 472 limit-=start; 473 return matches16(s, t, length) && 474 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 475 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 476 } 477 478 // Does the set contain the next code point? 479 // If so, return its length; otherwise return its negative length. 480 static inline int32_t 481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 482 UChar c=*s, c2; 483 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 484 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 485 } 486 return set.contains(c) ? 1 : -1; 487 } 488 489 static inline int32_t 490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 491 UChar c=s[length-1], c2; 492 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 493 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 494 } 495 return set.contains(c) ? 1 : -1; 496 } 497 498 static inline int32_t 499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 500 UChar32 c=*s; 501 if((int8_t)c>=0) { 502 return set.contains(c) ? 1 : -1; 503 } 504 // Take advantage of non-ASCII fastpaths in U8_NEXT(). 505 int32_t i=0; 506 U8_NEXT(s, i, length, c); 507 return set.contains(c) ? i : -i; 508 } 509 510 static inline int32_t 511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 512 UChar32 c=s[length-1]; 513 if((int8_t)c>=0) { 514 return set.contains(c) ? 1 : -1; 515 } 516 int32_t i=length-1; 517 c=utf8_prevCharSafeBody(s, 0, &i, c, -1); 518 length-=i; 519 return set.contains(c) ? length : -length; 520 } 521 522 /* 523 * Note: In span() when spanLength==0 (after a string match, or at the beginning 524 * after an empty code point span) and in spanNot() and spanNotUTF8(), 525 * string matching could use a binary search 526 * because all string matches are done from the same start index. 527 * 528 * For UTF-8, this would require a comparison function that returns UTF-16 order. 529 * 530 * This optimization should not be necessary for normal UnicodeSets because 531 * most sets have no strings, and most sets with strings have 532 * very few very short strings. 533 * For cases with many strings, it might be better to use a different API 534 * and implementation with a DFA (state machine). 535 */ 536 537 /* 538 * Algorithm for span(USET_SPAN_CONTAINED) 539 * 540 * Theoretical algorithm: 541 * - Iterate through the string, and at each code point boundary: 542 * + If the code point there is in the set, then remember to continue after it. 543 * + If a set string matches at the current position, then remember to continue after it. 544 * + Either recursively span for each code point or string match, 545 * or recursively span for all but the shortest one and 546 * iteratively continue the span with the shortest local match. 547 * + Remember the longest recursive span (the farthest end point). 548 * + If there is no match at the current position, neither for the code point there 549 * nor for any set string, then stop and return the longest recursive span length. 550 * 551 * Optimized implementation: 552 * 553 * (We assume that most sets will have very few very short strings. 554 * A span using a string-less set is extremely fast.) 555 * 556 * Create and cache a spanSet which contains all of the single code points 557 * of the original set but none of its strings. 558 * 559 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 560 * - Loop: 561 * + Try to match each set string at the end of the spanLength. 562 * ~ Set strings that start with set-contained code points must be matched 563 * with a partial overlap because the recursive algorithm would have tried 564 * to match them at every position. 565 * ~ Set strings that entirely consist of set-contained code points 566 * are irrelevant for span(USET_SPAN_CONTAINED) because the 567 * recursive algorithm would continue after them anyway 568 * and find the longest recursive match from their end. 569 * ~ Rather than recursing, note each end point of a set string match. 570 * + If no set string matched after spanSet.span(), then return 571 * with where the spanSet.span() ended. 572 * + If at least one set string matched after spanSet.span(), then 573 * pop the shortest string match end point and continue 574 * the loop, trying to match all set strings from there. 575 * + If at least one more set string matched after a previous string match, 576 * then test if the code point after the previous string match is also 577 * contained in the set. 578 * Continue the loop with the shortest end point of either this code point 579 * or a matching set string. 580 * + If no more set string matched after a previous string match, 581 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 582 * Stop if spanLength==0, otherwise continue the loop. 583 * 584 * By noting each end point of a set string match, 585 * the function visits each string position at most once and finishes 586 * in linear time. 587 * 588 * The recursive algorithm may visit the same string position many times 589 * if multiple paths lead to it and finishes in exponential time. 590 */ 591 592 /* 593 * Algorithm for span(USET_SPAN_SIMPLE) 594 * 595 * Theoretical algorithm: 596 * - Iterate through the string, and at each code point boundary: 597 * + If the code point there is in the set, then remember to continue after it. 598 * + If a set string matches at the current position, then remember to continue after it. 599 * + Continue from the farthest match position and ignore all others. 600 * + If there is no match at the current position, 601 * then stop and return the current position. 602 * 603 * Optimized implementation: 604 * 605 * (Same assumption and spanSet as above.) 606 * 607 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 608 * - Loop: 609 * + Try to match each set string at the end of the spanLength. 610 * ~ Set strings that start with set-contained code points must be matched 611 * with a partial overlap because the standard algorithm would have tried 612 * to match them earlier. 613 * ~ Set strings that entirely consist of set-contained code points 614 * must be matched with a full overlap because the longest-match algorithm 615 * would hide set string matches that end earlier. 616 * Such set strings need not be matched earlier inside the code point span 617 * because the standard algorithm would then have continued after 618 * the set string match anyway. 619 * ~ Remember the longest set string match (farthest end point) from the earliest 620 * starting point. 621 * + If no set string matched after spanSet.span(), then return 622 * with where the spanSet.span() ended. 623 * + If at least one set string matched, then continue the loop after the 624 * longest match from the earliest position. 625 * + If no more set string matched after a previous string match, 626 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 627 * Stop if spanLength==0, otherwise continue the loop. 628 */ 629 630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 631 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 632 return spanNot(s, length); 633 } 634 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 635 if(spanLength==length) { 636 return length; 637 } 638 639 // Consider strings; they may overlap with the span. 640 OffsetList offsets; 641 if(spanCondition==USET_SPAN_CONTAINED) { 642 // Use offset list to try all possibilities. 643 offsets.setMaxLength(maxLength16); 644 } 645 int32_t pos=spanLength, rest=length-pos; 646 int32_t i, stringsLength=strings.size(); 647 for(;;) { 648 if(spanCondition==USET_SPAN_CONTAINED) { 649 for(i=0; i<stringsLength; ++i) { 650 int32_t overlap=spanLengths[i]; 651 if(overlap==ALL_CP_CONTAINED) { 652 continue; // Irrelevant string. 653 } 654 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 655 const UChar *s16=string.getBuffer(); 656 int32_t length16=string.length(); 657 658 // Try to match this string at pos-overlap..pos. 659 if(overlap>=LONG_SPAN) { 660 overlap=length16; 661 // While contained: No point matching fully inside the code point span. 662 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 663 } 664 if(overlap>spanLength) { 665 overlap=spanLength; 666 } 667 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 668 for(;;) { 669 if(inc>rest) { 670 break; 671 } 672 // Try to match if the increment is not listed already. 673 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 674 if(inc==rest) { 675 return length; // Reached the end of the string. 676 } 677 offsets.addOffset(inc); 678 } 679 if(overlap==0) { 680 break; 681 } 682 --overlap; 683 ++inc; 684 } 685 } 686 } else /* USET_SPAN_SIMPLE */ { 687 int32_t maxInc=0, maxOverlap=0; 688 for(i=0; i<stringsLength; ++i) { 689 int32_t overlap=spanLengths[i]; 690 // For longest match, we do need to try to match even an all-contained string 691 // to find the match from the earliest start. 692 693 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 694 const UChar *s16=string.getBuffer(); 695 int32_t length16=string.length(); 696 697 // Try to match this string at pos-overlap..pos. 698 if(overlap>=LONG_SPAN) { 699 overlap=length16; 700 // Longest match: Need to match fully inside the code point span 701 // to find the match from the earliest start. 702 } 703 if(overlap>spanLength) { 704 overlap=spanLength; 705 } 706 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 707 for(;;) { 708 if(inc>rest || overlap<maxOverlap) { 709 break; 710 } 711 // Try to match if the string is longer or starts earlier. 712 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 713 matches16CPB(s, pos-overlap, length, s16, length16) 714 ) { 715 maxInc=inc; // Longest match from earliest start. 716 maxOverlap=overlap; 717 break; 718 } 719 --overlap; 720 ++inc; 721 } 722 } 723 724 if(maxInc!=0 || maxOverlap!=0) { 725 // Longest-match algorithm, and there was a string match. 726 // Simply continue after it. 727 pos+=maxInc; 728 rest-=maxInc; 729 if(rest==0) { 730 return length; // Reached the end of the string. 731 } 732 spanLength=0; // Match strings from after a string match. 733 continue; 734 } 735 } 736 // Finished trying to match all strings at pos. 737 738 if(spanLength!=0 || pos==0) { 739 // The position is after an unlimited code point span (spanLength!=0), 740 // not after a string match. 741 // The only position where spanLength==0 after a span is pos==0. 742 // Otherwise, an unlimited code point span is only tried again when no 743 // strings match, and if such a non-initial span fails we stop. 744 if(offsets.isEmpty()) { 745 return pos; // No strings matched after a span. 746 } 747 // Match strings from after the next string match. 748 } else { 749 // The position is after a string match (or a single code point). 750 if(offsets.isEmpty()) { 751 // No more strings matched after a previous string match. 752 // Try another code point span from after the last string match. 753 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 754 if( spanLength==rest || // Reached the end of the string, or 755 spanLength==0 // neither strings nor span progressed. 756 ) { 757 return pos+spanLength; 758 } 759 pos+=spanLength; 760 rest-=spanLength; 761 continue; // spanLength>0: Match strings from after a span. 762 } else { 763 // Try to match only one code point from after a string match if some 764 // string matched beyond it, so that we try all possible positions 765 // and don't overshoot. 766 spanLength=spanOne(spanSet, s+pos, rest); 767 if(spanLength>0) { 768 if(spanLength==rest) { 769 return length; // Reached the end of the string. 770 } 771 // Match strings after this code point. 772 // There cannot be any increments below it because UnicodeSet strings 773 // contain multiple code points. 774 pos+=spanLength; 775 rest-=spanLength; 776 offsets.shift(spanLength); 777 spanLength=0; 778 continue; // Match strings from after a single code point. 779 } 780 // Match strings from after the next string match. 781 } 782 } 783 int32_t minOffset=offsets.popMinimum(); 784 pos+=minOffset; 785 rest-=minOffset; 786 spanLength=0; // Match strings from after a string match. 787 } 788 } 789 790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 791 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 792 return spanNotBack(s, length); 793 } 794 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 795 if(pos==0) { 796 return 0; 797 } 798 int32_t spanLength=length-pos; 799 800 // Consider strings; they may overlap with the span. 801 OffsetList offsets; 802 if(spanCondition==USET_SPAN_CONTAINED) { 803 // Use offset list to try all possibilities. 804 offsets.setMaxLength(maxLength16); 805 } 806 int32_t i, stringsLength=strings.size(); 807 uint8_t *spanBackLengths=spanLengths; 808 if(all) { 809 spanBackLengths+=stringsLength; 810 } 811 for(;;) { 812 if(spanCondition==USET_SPAN_CONTAINED) { 813 for(i=0; i<stringsLength; ++i) { 814 int32_t overlap=spanBackLengths[i]; 815 if(overlap==ALL_CP_CONTAINED) { 816 continue; // Irrelevant string. 817 } 818 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 819 const UChar *s16=string.getBuffer(); 820 int32_t length16=string.length(); 821 822 // Try to match this string at pos-(length16-overlap)..pos-length16. 823 if(overlap>=LONG_SPAN) { 824 overlap=length16; 825 // While contained: No point matching fully inside the code point span. 826 int32_t len1=0; 827 U16_FWD_1(s16, len1, overlap); 828 overlap-=len1; // Length of the string minus the first code point. 829 } 830 if(overlap>spanLength) { 831 overlap=spanLength; 832 } 833 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 834 for(;;) { 835 if(dec>pos) { 836 break; 837 } 838 // Try to match if the decrement is not listed already. 839 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 840 if(dec==pos) { 841 return 0; // Reached the start of the string. 842 } 843 offsets.addOffset(dec); 844 } 845 if(overlap==0) { 846 break; 847 } 848 --overlap; 849 ++dec; 850 } 851 } 852 } else /* USET_SPAN_SIMPLE */ { 853 int32_t maxDec=0, maxOverlap=0; 854 for(i=0; i<stringsLength; ++i) { 855 int32_t overlap=spanBackLengths[i]; 856 // For longest match, we do need to try to match even an all-contained string 857 // to find the match from the latest end. 858 859 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 860 const UChar *s16=string.getBuffer(); 861 int32_t length16=string.length(); 862 863 // Try to match this string at pos-(length16-overlap)..pos-length16. 864 if(overlap>=LONG_SPAN) { 865 overlap=length16; 866 // Longest match: Need to match fully inside the code point span 867 // to find the match from the latest end. 868 } 869 if(overlap>spanLength) { 870 overlap=spanLength; 871 } 872 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 873 for(;;) { 874 if(dec>pos || overlap<maxOverlap) { 875 break; 876 } 877 // Try to match if the string is longer or ends later. 878 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 879 matches16CPB(s, pos-dec, length, s16, length16) 880 ) { 881 maxDec=dec; // Longest match from latest end. 882 maxOverlap=overlap; 883 break; 884 } 885 --overlap; 886 ++dec; 887 } 888 } 889 890 if(maxDec!=0 || maxOverlap!=0) { 891 // Longest-match algorithm, and there was a string match. 892 // Simply continue before it. 893 pos-=maxDec; 894 if(pos==0) { 895 return 0; // Reached the start of the string. 896 } 897 spanLength=0; // Match strings from before a string match. 898 continue; 899 } 900 } 901 // Finished trying to match all strings at pos. 902 903 if(spanLength!=0 || pos==length) { 904 // The position is before an unlimited code point span (spanLength!=0), 905 // not before a string match. 906 // The only position where spanLength==0 before a span is pos==length. 907 // Otherwise, an unlimited code point span is only tried again when no 908 // strings match, and if such a non-initial span fails we stop. 909 if(offsets.isEmpty()) { 910 return pos; // No strings matched before a span. 911 } 912 // Match strings from before the next string match. 913 } else { 914 // The position is before a string match (or a single code point). 915 if(offsets.isEmpty()) { 916 // No more strings matched before a previous string match. 917 // Try another code point span from before the last string match. 918 int32_t oldPos=pos; 919 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 920 spanLength=oldPos-pos; 921 if( pos==0 || // Reached the start of the string, or 922 spanLength==0 // neither strings nor span progressed. 923 ) { 924 return pos; 925 } 926 continue; // spanLength>0: Match strings from before a span. 927 } else { 928 // Try to match only one code point from before a string match if some 929 // string matched beyond it, so that we try all possible positions 930 // and don't overshoot. 931 spanLength=spanOneBack(spanSet, s, pos); 932 if(spanLength>0) { 933 if(spanLength==pos) { 934 return 0; // Reached the start of the string. 935 } 936 // Match strings before this code point. 937 // There cannot be any decrements below it because UnicodeSet strings 938 // contain multiple code points. 939 pos-=spanLength; 940 offsets.shift(spanLength); 941 spanLength=0; 942 continue; // Match strings from before a single code point. 943 } 944 // Match strings from before the next string match. 945 } 946 } 947 pos-=offsets.popMinimum(); 948 spanLength=0; // Match strings from before a string match. 949 } 950 } 951 952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 953 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 954 return spanNotUTF8(s, length); 955 } 956 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 957 if(spanLength==length) { 958 return length; 959 } 960 961 // Consider strings; they may overlap with the span. 962 OffsetList offsets; 963 if(spanCondition==USET_SPAN_CONTAINED) { 964 // Use offset list to try all possibilities. 965 offsets.setMaxLength(maxLength8); 966 } 967 int32_t pos=spanLength, rest=length-pos; 968 int32_t i, stringsLength=strings.size(); 969 uint8_t *spanUTF8Lengths=spanLengths; 970 if(all) { 971 spanUTF8Lengths+=2*stringsLength; 972 } 973 for(;;) { 974 const uint8_t *s8=utf8; 975 int32_t length8; 976 if(spanCondition==USET_SPAN_CONTAINED) { 977 for(i=0; i<stringsLength; ++i) { 978 length8=utf8Lengths[i]; 979 if(length8==0) { 980 continue; // String not representable in UTF-8. 981 } 982 int32_t overlap=spanUTF8Lengths[i]; 983 if(overlap==ALL_CP_CONTAINED) { 984 s8+=length8; 985 continue; // Irrelevant string. 986 } 987 988 // Try to match this string at pos-overlap..pos. 989 if(overlap>=LONG_SPAN) { 990 overlap=length8; 991 // While contained: No point matching fully inside the code point span. 992 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 993 } 994 if(overlap>spanLength) { 995 overlap=spanLength; 996 } 997 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 998 for(;;) { 999 if(inc>rest) { 1000 break; 1001 } 1002 // Try to match if the increment is not listed already. 1003 // Match at code point boundaries. (The UTF-8 strings were converted 1004 // from UTF-16 and are guaranteed to be well-formed.) 1005 if( !U8_IS_TRAIL(s[pos-overlap]) && 1006 !offsets.containsOffset(inc) && 1007 matches8(s+pos-overlap, s8, length8) 1008 1009 ) { 1010 if(inc==rest) { 1011 return length; // Reached the end of the string. 1012 } 1013 offsets.addOffset(inc); 1014 } 1015 if(overlap==0) { 1016 break; 1017 } 1018 --overlap; 1019 ++inc; 1020 } 1021 s8+=length8; 1022 } 1023 } else /* USET_SPAN_SIMPLE */ { 1024 int32_t maxInc=0, maxOverlap=0; 1025 for(i=0; i<stringsLength; ++i) { 1026 length8=utf8Lengths[i]; 1027 if(length8==0) { 1028 continue; // String not representable in UTF-8. 1029 } 1030 int32_t overlap=spanUTF8Lengths[i]; 1031 // For longest match, we do need to try to match even an all-contained string 1032 // to find the match from the earliest start. 1033 1034 // Try to match this string at pos-overlap..pos. 1035 if(overlap>=LONG_SPAN) { 1036 overlap=length8; 1037 // Longest match: Need to match fully inside the code point span 1038 // to find the match from the earliest start. 1039 } 1040 if(overlap>spanLength) { 1041 overlap=spanLength; 1042 } 1043 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1044 for(;;) { 1045 if(inc>rest || overlap<maxOverlap) { 1046 break; 1047 } 1048 // Try to match if the string is longer or starts earlier. 1049 // Match at code point boundaries. (The UTF-8 strings were converted 1050 // from UTF-16 and are guaranteed to be well-formed.) 1051 if( !U8_IS_TRAIL(s[pos-overlap]) && 1052 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1053 matches8(s+pos-overlap, s8, length8) 1054 1055 ) { 1056 maxInc=inc; // Longest match from earliest start. 1057 maxOverlap=overlap; 1058 break; 1059 } 1060 --overlap; 1061 ++inc; 1062 } 1063 s8+=length8; 1064 } 1065 1066 if(maxInc!=0 || maxOverlap!=0) { 1067 // Longest-match algorithm, and there was a string match. 1068 // Simply continue after it. 1069 pos+=maxInc; 1070 rest-=maxInc; 1071 if(rest==0) { 1072 return length; // Reached the end of the string. 1073 } 1074 spanLength=0; // Match strings from after a string match. 1075 continue; 1076 } 1077 } 1078 // Finished trying to match all strings at pos. 1079 1080 if(spanLength!=0 || pos==0) { 1081 // The position is after an unlimited code point span (spanLength!=0), 1082 // not after a string match. 1083 // The only position where spanLength==0 after a span is pos==0. 1084 // Otherwise, an unlimited code point span is only tried again when no 1085 // strings match, and if such a non-initial span fails we stop. 1086 if(offsets.isEmpty()) { 1087 return pos; // No strings matched after a span. 1088 } 1089 // Match strings from after the next string match. 1090 } else { 1091 // The position is after a string match (or a single code point). 1092 if(offsets.isEmpty()) { 1093 // No more strings matched after a previous string match. 1094 // Try another code point span from after the last string match. 1095 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1096 if( spanLength==rest || // Reached the end of the string, or 1097 spanLength==0 // neither strings nor span progressed. 1098 ) { 1099 return pos+spanLength; 1100 } 1101 pos+=spanLength; 1102 rest-=spanLength; 1103 continue; // spanLength>0: Match strings from after a span. 1104 } else { 1105 // Try to match only one code point from after a string match if some 1106 // string matched beyond it, so that we try all possible positions 1107 // and don't overshoot. 1108 spanLength=spanOneUTF8(spanSet, s+pos, rest); 1109 if(spanLength>0) { 1110 if(spanLength==rest) { 1111 return length; // Reached the end of the string. 1112 } 1113 // Match strings after this code point. 1114 // There cannot be any increments below it because UnicodeSet strings 1115 // contain multiple code points. 1116 pos+=spanLength; 1117 rest-=spanLength; 1118 offsets.shift(spanLength); 1119 spanLength=0; 1120 continue; // Match strings from after a single code point. 1121 } 1122 // Match strings from after the next string match. 1123 } 1124 } 1125 int32_t minOffset=offsets.popMinimum(); 1126 pos+=minOffset; 1127 rest-=minOffset; 1128 spanLength=0; // Match strings from after a string match. 1129 } 1130 } 1131 1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1133 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1134 return spanNotBackUTF8(s, length); 1135 } 1136 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1137 if(pos==0) { 1138 return 0; 1139 } 1140 int32_t spanLength=length-pos; 1141 1142 // Consider strings; they may overlap with the span. 1143 OffsetList offsets; 1144 if(spanCondition==USET_SPAN_CONTAINED) { 1145 // Use offset list to try all possibilities. 1146 offsets.setMaxLength(maxLength8); 1147 } 1148 int32_t i, stringsLength=strings.size(); 1149 uint8_t *spanBackUTF8Lengths=spanLengths; 1150 if(all) { 1151 spanBackUTF8Lengths+=3*stringsLength; 1152 } 1153 for(;;) { 1154 const uint8_t *s8=utf8; 1155 int32_t length8; 1156 if(spanCondition==USET_SPAN_CONTAINED) { 1157 for(i=0; i<stringsLength; ++i) { 1158 length8=utf8Lengths[i]; 1159 if(length8==0) { 1160 continue; // String not representable in UTF-8. 1161 } 1162 int32_t overlap=spanBackUTF8Lengths[i]; 1163 if(overlap==ALL_CP_CONTAINED) { 1164 s8+=length8; 1165 continue; // Irrelevant string. 1166 } 1167 1168 // Try to match this string at pos-(length8-overlap)..pos-length8. 1169 if(overlap>=LONG_SPAN) { 1170 overlap=length8; 1171 // While contained: No point matching fully inside the code point span. 1172 int32_t len1=0; 1173 U8_FWD_1(s8, len1, overlap); 1174 overlap-=len1; // Length of the string minus the first code point. 1175 } 1176 if(overlap>spanLength) { 1177 overlap=spanLength; 1178 } 1179 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1180 for(;;) { 1181 if(dec>pos) { 1182 break; 1183 } 1184 // Try to match if the decrement is not listed already. 1185 // Match at code point boundaries. (The UTF-8 strings were converted 1186 // from UTF-16 and are guaranteed to be well-formed.) 1187 if( !U8_IS_TRAIL(s[pos-dec]) && 1188 !offsets.containsOffset(dec) && 1189 matches8(s+pos-dec, s8, length8) 1190 ) { 1191 if(dec==pos) { 1192 return 0; // Reached the start of the string. 1193 } 1194 offsets.addOffset(dec); 1195 } 1196 if(overlap==0) { 1197 break; 1198 } 1199 --overlap; 1200 ++dec; 1201 } 1202 s8+=length8; 1203 } 1204 } else /* USET_SPAN_SIMPLE */ { 1205 int32_t maxDec=0, maxOverlap=0; 1206 for(i=0; i<stringsLength; ++i) { 1207 length8=utf8Lengths[i]; 1208 if(length8==0) { 1209 continue; // String not representable in UTF-8. 1210 } 1211 int32_t overlap=spanBackUTF8Lengths[i]; 1212 // For longest match, we do need to try to match even an all-contained string 1213 // to find the match from the latest end. 1214 1215 // Try to match this string at pos-(length8-overlap)..pos-length8. 1216 if(overlap>=LONG_SPAN) { 1217 overlap=length8; 1218 // Longest match: Need to match fully inside the code point span 1219 // to find the match from the latest end. 1220 } 1221 if(overlap>spanLength) { 1222 overlap=spanLength; 1223 } 1224 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1225 for(;;) { 1226 if(dec>pos || overlap<maxOverlap) { 1227 break; 1228 } 1229 // Try to match if the string is longer or ends later. 1230 // Match at code point boundaries. (The UTF-8 strings were converted 1231 // from UTF-16 and are guaranteed to be well-formed.) 1232 if( !U8_IS_TRAIL(s[pos-dec]) && 1233 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1234 matches8(s+pos-dec, s8, length8) 1235 ) { 1236 maxDec=dec; // Longest match from latest end. 1237 maxOverlap=overlap; 1238 break; 1239 } 1240 --overlap; 1241 ++dec; 1242 } 1243 s8+=length8; 1244 } 1245 1246 if(maxDec!=0 || maxOverlap!=0) { 1247 // Longest-match algorithm, and there was a string match. 1248 // Simply continue before it. 1249 pos-=maxDec; 1250 if(pos==0) { 1251 return 0; // Reached the start of the string. 1252 } 1253 spanLength=0; // Match strings from before a string match. 1254 continue; 1255 } 1256 } 1257 // Finished trying to match all strings at pos. 1258 1259 if(spanLength!=0 || pos==length) { 1260 // The position is before an unlimited code point span (spanLength!=0), 1261 // not before a string match. 1262 // The only position where spanLength==0 before a span is pos==length. 1263 // Otherwise, an unlimited code point span is only tried again when no 1264 // strings match, and if such a non-initial span fails we stop. 1265 if(offsets.isEmpty()) { 1266 return pos; // No strings matched before a span. 1267 } 1268 // Match strings from before the next string match. 1269 } else { 1270 // The position is before a string match (or a single code point). 1271 if(offsets.isEmpty()) { 1272 // No more strings matched before a previous string match. 1273 // Try another code point span from before the last string match. 1274 int32_t oldPos=pos; 1275 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1276 spanLength=oldPos-pos; 1277 if( pos==0 || // Reached the start of the string, or 1278 spanLength==0 // neither strings nor span progressed. 1279 ) { 1280 return pos; 1281 } 1282 continue; // spanLength>0: Match strings from before a span. 1283 } else { 1284 // Try to match only one code point from before a string match if some 1285 // string matched beyond it, so that we try all possible positions 1286 // and don't overshoot. 1287 spanLength=spanOneBackUTF8(spanSet, s, pos); 1288 if(spanLength>0) { 1289 if(spanLength==pos) { 1290 return 0; // Reached the start of the string. 1291 } 1292 // Match strings before this code point. 1293 // There cannot be any decrements below it because UnicodeSet strings 1294 // contain multiple code points. 1295 pos-=spanLength; 1296 offsets.shift(spanLength); 1297 spanLength=0; 1298 continue; // Match strings from before a single code point. 1299 } 1300 // Match strings from before the next string match. 1301 } 1302 } 1303 pos-=offsets.popMinimum(); 1304 spanLength=0; // Match strings from before a string match. 1305 } 1306 } 1307 1308 /* 1309 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1310 * 1311 * Theoretical algorithm: 1312 * - Iterate through the string, and at each code point boundary: 1313 * + If the code point there is in the set, then return with the current position. 1314 * + If a set string matches at the current position, then return with the current position. 1315 * 1316 * Optimized implementation: 1317 * 1318 * (Same assumption as for span() above.) 1319 * 1320 * Create and cache a spanNotSet which contains all of the single code points 1321 * of the original set but none of its strings. 1322 * For each set string add its initial code point to the spanNotSet. 1323 * (Also add its final code point for spanNotBack().) 1324 * 1325 * - Loop: 1326 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1327 * + If the current code point is in the original set, then 1328 * return the current position. 1329 * + If any set string matches at the current position, then 1330 * return the current position. 1331 * + If there is no match at the current position, neither for the code point there 1332 * nor for any set string, then skip this code point and continue the loop. 1333 * This happens for set-string-initial code points that were added to spanNotSet 1334 * when there is not actually a match for such a set string. 1335 */ 1336 1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1338 int32_t pos=0, rest=length; 1339 int32_t i, stringsLength=strings.size(); 1340 do { 1341 // Span until we find a code point from the set, 1342 // or a code point that starts or ends some string. 1343 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1344 if(i==rest) { 1345 return length; // Reached the end of the string. 1346 } 1347 pos+=i; 1348 rest-=i; 1349 1350 // Check whether the current code point is in the original set, 1351 // without the string starts and ends. 1352 int32_t cpLength=spanOne(spanSet, s+pos, rest); 1353 if(cpLength>0) { 1354 return pos; // There is a set element at pos. 1355 } 1356 1357 // Try to match the strings at pos. 1358 for(i=0; i<stringsLength; ++i) { 1359 if(spanLengths[i]==ALL_CP_CONTAINED) { 1360 continue; // Irrelevant string. 1361 } 1362 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1363 const UChar *s16=string.getBuffer(); 1364 int32_t length16=string.length(); 1365 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1366 return pos; // There is a set element at pos. 1367 } 1368 } 1369 1370 // The span(while not contained) ended on a string start/end which is 1371 // not in the original set. Skip this code point and continue. 1372 // cpLength<0 1373 pos-=cpLength; 1374 rest+=cpLength; 1375 } while(rest!=0); 1376 return length; // Reached the end of the string. 1377 } 1378 1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1380 int32_t pos=length; 1381 int32_t i, stringsLength=strings.size(); 1382 do { 1383 // Span until we find a code point from the set, 1384 // or a code point that starts or ends some string. 1385 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1386 if(pos==0) { 1387 return 0; // Reached the start of the string. 1388 } 1389 1390 // Check whether the current code point is in the original set, 1391 // without the string starts and ends. 1392 int32_t cpLength=spanOneBack(spanSet, s, pos); 1393 if(cpLength>0) { 1394 return pos; // There is a set element at pos. 1395 } 1396 1397 // Try to match the strings at pos. 1398 for(i=0; i<stringsLength; ++i) { 1399 // Use spanLengths rather than a spanBackLengths pointer because 1400 // it is easier and we only need to know whether the string is irrelevant 1401 // which is the same in either array. 1402 if(spanLengths[i]==ALL_CP_CONTAINED) { 1403 continue; // Irrelevant string. 1404 } 1405 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1406 const UChar *s16=string.getBuffer(); 1407 int32_t length16=string.length(); 1408 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1409 return pos; // There is a set element at pos. 1410 } 1411 } 1412 1413 // The span(while not contained) ended on a string start/end which is 1414 // not in the original set. Skip this code point and continue. 1415 // cpLength<0 1416 pos+=cpLength; 1417 } while(pos!=0); 1418 return 0; // Reached the start of the string. 1419 } 1420 1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1422 int32_t pos=0, rest=length; 1423 int32_t i, stringsLength=strings.size(); 1424 uint8_t *spanUTF8Lengths=spanLengths; 1425 if(all) { 1426 spanUTF8Lengths+=2*stringsLength; 1427 } 1428 do { 1429 // Span until we find a code point from the set, 1430 // or a code point that starts or ends some string. 1431 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1432 if(i==rest) { 1433 return length; // Reached the end of the string. 1434 } 1435 pos+=i; 1436 rest-=i; 1437 1438 // Check whether the current code point is in the original set, 1439 // without the string starts and ends. 1440 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1441 if(cpLength>0) { 1442 return pos; // There is a set element at pos. 1443 } 1444 1445 // Try to match the strings at pos. 1446 const uint8_t *s8=utf8; 1447 int32_t length8; 1448 for(i=0; i<stringsLength; ++i) { 1449 length8=utf8Lengths[i]; 1450 // ALL_CP_CONTAINED: Irrelevant string. 1451 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1452 return pos; // There is a set element at pos. 1453 } 1454 s8+=length8; 1455 } 1456 1457 // The span(while not contained) ended on a string start/end which is 1458 // not in the original set. Skip this code point and continue. 1459 // cpLength<0 1460 pos-=cpLength; 1461 rest+=cpLength; 1462 } while(rest!=0); 1463 return length; // Reached the end of the string. 1464 } 1465 1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1467 int32_t pos=length; 1468 int32_t i, stringsLength=strings.size(); 1469 uint8_t *spanBackUTF8Lengths=spanLengths; 1470 if(all) { 1471 spanBackUTF8Lengths+=3*stringsLength; 1472 } 1473 do { 1474 // Span until we find a code point from the set, 1475 // or a code point that starts or ends some string. 1476 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1477 if(pos==0) { 1478 return 0; // Reached the start of the string. 1479 } 1480 1481 // Check whether the current code point is in the original set, 1482 // without the string starts and ends. 1483 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1484 if(cpLength>0) { 1485 return pos; // There is a set element at pos. 1486 } 1487 1488 // Try to match the strings at pos. 1489 const uint8_t *s8=utf8; 1490 int32_t length8; 1491 for(i=0; i<stringsLength; ++i) { 1492 length8=utf8Lengths[i]; 1493 // ALL_CP_CONTAINED: Irrelevant string. 1494 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1495 return pos; // There is a set element at pos. 1496 } 1497 s8+=length8; 1498 } 1499 1500 // The span(while not contained) ended on a string start/end which is 1501 // not in the original set. Skip this code point and continue. 1502 // cpLength<0 1503 pos+=cpLength; 1504 } while(pos!=0); 1505 return 0; // Reached the start of the string. 1506 } 1507 1508 U_NAMESPACE_END 1509