1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2007-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: unisetspan.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2007mar01 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/uniset.h" 19 #include "unicode/ustring.h" 20 #include "unicode/utf8.h" 21 #include "unicode/utf16.h" 22 #include "cmemory.h" 23 #include "uvector.h" 24 #include "unisetspan.h" 25 26 U_NAMESPACE_BEGIN 27 28 /* 29 * List of offsets from the current position from where to try matching 30 * a code point or a string. 31 * Store offsets rather than indexes to simplify the code and use the same list 32 * for both increments (in span()) and decrements (in spanBack()). 33 * 34 * Assumption: The maximum offset is limited, and the offsets that are stored 35 * at any one time are relatively dense, that is, there are normally no gaps of 36 * hundreds or thousands of offset values. 37 * 38 * The implementation uses a circular buffer of byte flags, 39 * each indicating whether the corresponding offset is in the list. 40 * This avoids inserting into a sorted list of offsets (or absolute indexes) and 41 * physically moving part of the list. 42 * 43 * Note: In principle, the caller should setMaxLength() to the maximum of the 44 * max string length and U16_LENGTH/U8_LENGTH to account for 45 * "long" single code points. 46 * However, this implementation uses at least a staticList with more than 47 * U8_LENGTH entries anyway. 48 * 49 * Note: If maxLength were guaranteed to be no more than 32 or 64, 50 * the list could be stored as bit flags in a single integer. 51 * Rather than handling a circular buffer with a start list index, 52 * the integer would simply be shifted when lower offsets are removed. 53 * UnicodeSet does not have a limit on the lengths of strings. 54 */ 55 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 56 public: 57 OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 58 59 ~OffsetList() { 60 if(list!=staticList) { 61 uprv_free(list); 62 } 63 } 64 65 // Call exactly once if the list is to be used. 66 void setMaxLength(int32_t maxLength) { 67 if(maxLength<=(int32_t)sizeof(staticList)) { 68 capacity=(int32_t)sizeof(staticList); 69 } else { 70 UBool *l=(UBool *)uprv_malloc(maxLength); 71 if(l!=NULL) { 72 list=l; 73 capacity=maxLength; 74 } 75 } 76 uprv_memset(list, 0, capacity); 77 } 78 79 void clear() { 80 uprv_memset(list, 0, capacity); 81 start=length=0; 82 } 83 84 UBool isEmpty() const { 85 return (UBool)(length==0); 86 } 87 88 // Reduce all stored offsets by delta, used when the current position 89 // moves by delta. 90 // There must not be any offsets lower than delta. 91 // If there is an offset equal to delta, it is removed. 92 // delta=[1..maxLength] 93 void shift(int32_t delta) { 94 int32_t i=start+delta; 95 if(i>=capacity) { 96 i-=capacity; 97 } 98 if(list[i]) { 99 list[i]=FALSE; 100 --length; 101 } 102 start=i; 103 } 104 105 // Add an offset. The list must not contain it yet. 106 // offset=[1..maxLength] 107 void addOffset(int32_t offset) { 108 int32_t i=start+offset; 109 if(i>=capacity) { 110 i-=capacity; 111 } 112 list[i]=TRUE; 113 ++length; 114 } 115 116 // offset=[1..maxLength] 117 UBool containsOffset(int32_t offset) const { 118 int32_t i=start+offset; 119 if(i>=capacity) { 120 i-=capacity; 121 } 122 return list[i]; 123 } 124 125 // Find the lowest stored offset from a non-empty list, remove it, 126 // and reduce all other offsets by this minimum. 127 // Returns [1..maxLength]. 128 int32_t popMinimum() { 129 // Look for the next offset in list[start+1..capacity-1]. 130 int32_t i=start, result; 131 while(++i<capacity) { 132 if(list[i]) { 133 list[i]=FALSE; 134 --length; 135 result=i-start; 136 start=i; 137 return result; 138 } 139 } 140 // i==capacity 141 142 // Wrap around and look for the next offset in list[0..start]. 143 // Since the list is not empty, there will be one. 144 result=capacity-start; 145 i=0; 146 while(!list[i]) { 147 ++i; 148 } 149 list[i]=FALSE; 150 --length; 151 start=i; 152 return result+=i; 153 } 154 155 private: 156 UBool *list; 157 int32_t capacity; 158 int32_t length; 159 int32_t start; 160 161 UBool staticList[16]; 162 }; 163 164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string. 165 static int32_t 166 getUTF8Length(const UChar *s, int32_t length) { 167 UErrorCode errorCode=U_ZERO_ERROR; 168 int32_t length8=0; 169 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 170 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 171 return length8; 172 } else { 173 // The string contains an unpaired surrogate. 174 // Ignore this string. 175 return 0; 176 } 177 } 178 179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 180 static int32_t 181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 182 UErrorCode errorCode=U_ZERO_ERROR; 183 int32_t length8=0; 184 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 185 if(U_SUCCESS(errorCode)) { 186 return length8; 187 } else { 188 // The string contains an unpaired surrogate. 189 // Ignore this string. 190 return 0; 191 } 192 } 193 194 static inline uint8_t 195 makeSpanLengthByte(int32_t spanLength) { 196 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 197 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 198 } 199 200 // Construct for all variants of span(), or only for any one variant. 201 // Initialize as little as possible, for single use. 202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 203 const UVector &setStrings, 204 uint32_t which) 205 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 206 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 207 utf8Length(0), 208 maxLength16(0), maxLength8(0), 209 all((UBool)(which==ALL)) { 210 spanSet.retainAll(set); 211 if(which&NOT_CONTAINED) { 212 // Default to the same sets. 213 // addToSpanNotSet() will create a separate set if necessary. 214 pSpanNotSet=&spanSet; 215 } 216 217 // Determine if the strings even need to be taken into account at all for span() etc. 218 // If any string is relevant, then all strings need to be used for 219 // span(longest match) but only the relevant ones for span(while contained). 220 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 221 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 222 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 223 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 224 int32_t stringsLength=strings.size(); 225 226 int32_t i, spanLength; 227 UBool someRelevant=FALSE; 228 for(i=0; i<stringsLength; ++i) { 229 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 230 const UChar *s16=string.getBuffer(); 231 int32_t length16=string.length(); 232 UBool thisRelevant; 233 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 234 if(spanLength<length16) { // Relevant string. 235 someRelevant=thisRelevant=TRUE; 236 } else { 237 thisRelevant=FALSE; 238 } 239 if((which&UTF16) && length16>maxLength16) { 240 maxLength16=length16; 241 } 242 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 243 int32_t length8=getUTF8Length(s16, length16); 244 utf8Length+=length8; 245 if(length8>maxLength8) { 246 maxLength8=length8; 247 } 248 } 249 } 250 if(!someRelevant) { 251 maxLength16=maxLength8=0; 252 return; 253 } 254 255 // Freeze after checking for the need to use strings at all because freezing 256 // a set takes some time and memory which are wasted if there are no relevant strings. 257 if(all) { 258 spanSet.freeze(); 259 } 260 261 uint8_t *spanBackLengths; 262 uint8_t *spanUTF8Lengths; 263 uint8_t *spanBackUTF8Lengths; 264 265 // Allocate a block of meta data. 266 int32_t allocSize; 267 if(all) { 268 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 269 allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 270 } else { 271 allocSize=stringsLength; // One set of span lengths. 272 if(which&UTF8) { 273 // UTF-8 lengths and UTF-8 strings. 274 allocSize+=stringsLength*4+utf8Length; 275 } 276 } 277 if(allocSize<=(int32_t)sizeof(staticLengths)) { 278 utf8Lengths=staticLengths; 279 } else { 280 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 281 if(utf8Lengths==NULL) { 282 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 283 return; // Out of memory. 284 } 285 } 286 287 if(all) { 288 // Store span lengths for all span() variants. 289 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 290 spanBackLengths=spanLengths+stringsLength; 291 spanUTF8Lengths=spanBackLengths+stringsLength; 292 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 293 utf8=spanBackUTF8Lengths+stringsLength; 294 } else { 295 // Store span lengths for only one span() variant. 296 if(which&UTF8) { 297 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 298 utf8=spanLengths+stringsLength; 299 } else { 300 spanLengths=(uint8_t *)utf8Lengths; 301 } 302 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 303 } 304 305 // Set the meta data and pSpanNotSet and write the UTF-8 strings. 306 int32_t utf8Count=0; // Count UTF-8 bytes written so far. 307 308 for(i=0; i<stringsLength; ++i) { 309 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 310 const UChar *s16=string.getBuffer(); 311 int32_t length16=string.length(); 312 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 313 if(spanLength<length16) { // Relevant string. 314 if(which&UTF16) { 315 if(which&CONTAINED) { 316 if(which&FWD) { 317 spanLengths[i]=makeSpanLengthByte(spanLength); 318 } 319 if(which&BACK) { 320 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 321 spanBackLengths[i]=makeSpanLengthByte(spanLength); 322 } 323 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 324 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 325 } 326 } 327 if(which&UTF8) { 328 uint8_t *s8=utf8+utf8Count; 329 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 330 utf8Count+=utf8Lengths[i]=length8; 331 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 332 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 333 } else { // Relevant for UTF-8. 334 if(which&CONTAINED) { 335 if(which&FWD) { 336 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 337 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 338 } 339 if(which&BACK) { 340 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 341 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 342 } 343 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 344 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 345 } 346 } 347 } 348 if(which&NOT_CONTAINED) { 349 // Add string start and end code points to the spanNotSet so that 350 // a span(while not contained) stops before any string. 351 UChar32 c; 352 if(which&FWD) { 353 int32_t len=0; 354 U16_NEXT(s16, len, length16, c); 355 addToSpanNotSet(c); 356 } 357 if(which&BACK) { 358 int32_t len=length16; 359 U16_PREV(s16, 0, len, c); 360 addToSpanNotSet(c); 361 } 362 } 363 } else { // Irrelevant string. 364 if(which&UTF8) { 365 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 366 uint8_t *s8=utf8+utf8Count; 367 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 368 utf8Count+=utf8Lengths[i]=length8; 369 } else { 370 utf8Lengths[i]=0; 371 } 372 } 373 if(all) { 374 spanLengths[i]=spanBackLengths[i]= 375 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 376 (uint8_t)ALL_CP_CONTAINED; 377 } else { 378 // All spanXYZLengths pointers contain the same address. 379 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 380 } 381 } 382 } 383 384 // Finish. 385 if(all) { 386 pSpanNotSet->freeze(); 387 } 388 } 389 390 // Copy constructor. Assumes which==ALL for a frozen set. 391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 392 const UVector &newParentSetStrings) 393 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 394 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 395 utf8Length(otherStringSpan.utf8Length), 396 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 397 all(TRUE) { 398 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 399 pSpanNotSet=&spanSet; 400 } else { 401 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 402 } 403 404 // Allocate a block of meta data. 405 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 406 int32_t stringsLength=strings.size(); 407 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 408 if(allocSize<=(int32_t)sizeof(staticLengths)) { 409 utf8Lengths=staticLengths; 410 } else { 411 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 412 if(utf8Lengths==NULL) { 413 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 414 return; // Out of memory. 415 } 416 } 417 418 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 419 utf8=spanLengths+stringsLength*4; 420 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 421 } 422 423 UnicodeSetStringSpan::~UnicodeSetStringSpan() { 424 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 425 delete pSpanNotSet; 426 } 427 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 428 uprv_free(utf8Lengths); 429 } 430 } 431 432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 433 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 434 if(spanSet.contains(c)) { 435 return; // Nothing to do. 436 } 437 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 438 if(newSet==NULL) { 439 return; // Out of memory. 440 } else { 441 pSpanNotSet=newSet; 442 } 443 } 444 pSpanNotSet->add(c); 445 } 446 447 // Compare strings without any argument checks. Requires length>0. 448 static inline UBool 449 matches16(const UChar *s, const UChar *t, int32_t length) { 450 do { 451 if(*s++!=*t++) { 452 return FALSE; 453 } 454 } while(--length>0); 455 return TRUE; 456 } 457 458 static inline UBool 459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 460 do { 461 if(*s++!=*t++) { 462 return FALSE; 463 } 464 } while(--length>0); 465 return TRUE; 466 } 467 468 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 469 // at code point boundaries. 470 // That is, each edge of a match must not be in the middle of a surrogate pair. 471 static inline UBool 472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 473 s+=start; 474 limit-=start; 475 return matches16(s, t, length) && 476 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 477 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 478 } 479 480 // Does the set contain the next code point? 481 // If so, return its length; otherwise return its negative length. 482 static inline int32_t 483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 484 UChar c=*s, c2; 485 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 486 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 487 } 488 return set.contains(c) ? 1 : -1; 489 } 490 491 static inline int32_t 492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 493 UChar c=s[length-1], c2; 494 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 495 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 496 } 497 return set.contains(c) ? 1 : -1; 498 } 499 500 static inline int32_t 501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 502 UChar32 c=*s; 503 if((int8_t)c>=0) { 504 return set.contains(c) ? 1 : -1; 505 } 506 // Take advantage of non-ASCII fastpaths in U8_NEXT(). 507 int32_t i=0; 508 U8_NEXT(s, i, length, c); 509 return set.contains(c) ? i : -i; 510 } 511 512 static inline int32_t 513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 514 UChar32 c=s[length-1]; 515 if((int8_t)c>=0) { 516 return set.contains(c) ? 1 : -1; 517 } 518 int32_t i=length-1; 519 c=utf8_prevCharSafeBody(s, 0, &i, c, -1); 520 length-=i; 521 return set.contains(c) ? length : -length; 522 } 523 524 /* 525 * Note: In span() when spanLength==0 (after a string match, or at the beginning 526 * after an empty code point span) and in spanNot() and spanNotUTF8(), 527 * string matching could use a binary search 528 * because all string matches are done from the same start index. 529 * 530 * For UTF-8, this would require a comparison function that returns UTF-16 order. 531 * 532 * This optimization should not be necessary for normal UnicodeSets because 533 * most sets have no strings, and most sets with strings have 534 * very few very short strings. 535 * For cases with many strings, it might be better to use a different API 536 * and implementation with a DFA (state machine). 537 */ 538 539 /* 540 * Algorithm for span(USET_SPAN_CONTAINED) 541 * 542 * Theoretical algorithm: 543 * - Iterate through the string, and at each code point boundary: 544 * + If the code point there is in the set, then remember to continue after it. 545 * + If a set string matches at the current position, then remember to continue after it. 546 * + Either recursively span for each code point or string match, 547 * or recursively span for all but the shortest one and 548 * iteratively continue the span with the shortest local match. 549 * + Remember the longest recursive span (the farthest end point). 550 * + If there is no match at the current position, neither for the code point there 551 * nor for any set string, then stop and return the longest recursive span length. 552 * 553 * Optimized implementation: 554 * 555 * (We assume that most sets will have very few very short strings. 556 * A span using a string-less set is extremely fast.) 557 * 558 * Create and cache a spanSet which contains all of the single code points 559 * of the original set but none of its strings. 560 * 561 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 562 * - Loop: 563 * + Try to match each set string at the end of the spanLength. 564 * ~ Set strings that start with set-contained code points must be matched 565 * with a partial overlap because the recursive algorithm would have tried 566 * to match them at every position. 567 * ~ Set strings that entirely consist of set-contained code points 568 * are irrelevant for span(USET_SPAN_CONTAINED) because the 569 * recursive algorithm would continue after them anyway 570 * and find the longest recursive match from their end. 571 * ~ Rather than recursing, note each end point of a set string match. 572 * + If no set string matched after spanSet.span(), then return 573 * with where the spanSet.span() ended. 574 * + If at least one set string matched after spanSet.span(), then 575 * pop the shortest string match end point and continue 576 * the loop, trying to match all set strings from there. 577 * + If at least one more set string matched after a previous string match, 578 * then test if the code point after the previous string match is also 579 * contained in the set. 580 * Continue the loop with the shortest end point of either this code point 581 * or a matching set string. 582 * + If no more set string matched after a previous string match, 583 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 584 * Stop if spanLength==0, otherwise continue the loop. 585 * 586 * By noting each end point of a set string match, 587 * the function visits each string position at most once and finishes 588 * in linear time. 589 * 590 * The recursive algorithm may visit the same string position many times 591 * if multiple paths lead to it and finishes in exponential time. 592 */ 593 594 /* 595 * Algorithm for span(USET_SPAN_SIMPLE) 596 * 597 * Theoretical algorithm: 598 * - Iterate through the string, and at each code point boundary: 599 * + If the code point there is in the set, then remember to continue after it. 600 * + If a set string matches at the current position, then remember to continue after it. 601 * + Continue from the farthest match position and ignore all others. 602 * + If there is no match at the current position, 603 * then stop and return the current position. 604 * 605 * Optimized implementation: 606 * 607 * (Same assumption and spanSet as above.) 608 * 609 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 610 * - Loop: 611 * + Try to match each set string at the end of the spanLength. 612 * ~ Set strings that start with set-contained code points must be matched 613 * with a partial overlap because the standard algorithm would have tried 614 * to match them earlier. 615 * ~ Set strings that entirely consist of set-contained code points 616 * must be matched with a full overlap because the longest-match algorithm 617 * would hide set string matches that end earlier. 618 * Such set strings need not be matched earlier inside the code point span 619 * because the standard algorithm would then have continued after 620 * the set string match anyway. 621 * ~ Remember the longest set string match (farthest end point) from the earliest 622 * starting point. 623 * + If no set string matched after spanSet.span(), then return 624 * with where the spanSet.span() ended. 625 * + If at least one set string matched, then continue the loop after the 626 * longest match from the earliest position. 627 * + If no more set string matched after a previous string match, 628 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 629 * Stop if spanLength==0, otherwise continue the loop. 630 */ 631 632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 633 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 634 return spanNot(s, length); 635 } 636 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 637 if(spanLength==length) { 638 return length; 639 } 640 641 // Consider strings; they may overlap with the span. 642 OffsetList offsets; 643 if(spanCondition==USET_SPAN_CONTAINED) { 644 // Use offset list to try all possibilities. 645 offsets.setMaxLength(maxLength16); 646 } 647 int32_t pos=spanLength, rest=length-pos; 648 int32_t i, stringsLength=strings.size(); 649 for(;;) { 650 if(spanCondition==USET_SPAN_CONTAINED) { 651 for(i=0; i<stringsLength; ++i) { 652 int32_t overlap=spanLengths[i]; 653 if(overlap==ALL_CP_CONTAINED) { 654 continue; // Irrelevant string. 655 } 656 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 657 const UChar *s16=string.getBuffer(); 658 int32_t length16=string.length(); 659 660 // Try to match this string at pos-overlap..pos. 661 if(overlap>=LONG_SPAN) { 662 overlap=length16; 663 // While contained: No point matching fully inside the code point span. 664 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 665 } 666 if(overlap>spanLength) { 667 overlap=spanLength; 668 } 669 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 670 for(;;) { 671 if(inc>rest) { 672 break; 673 } 674 // Try to match if the increment is not listed already. 675 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 676 if(inc==rest) { 677 return length; // Reached the end of the string. 678 } 679 offsets.addOffset(inc); 680 } 681 if(overlap==0) { 682 break; 683 } 684 --overlap; 685 ++inc; 686 } 687 } 688 } else /* USET_SPAN_SIMPLE */ { 689 int32_t maxInc=0, maxOverlap=0; 690 for(i=0; i<stringsLength; ++i) { 691 int32_t overlap=spanLengths[i]; 692 // For longest match, we do need to try to match even an all-contained string 693 // to find the match from the earliest start. 694 695 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 696 const UChar *s16=string.getBuffer(); 697 int32_t length16=string.length(); 698 699 // Try to match this string at pos-overlap..pos. 700 if(overlap>=LONG_SPAN) { 701 overlap=length16; 702 // Longest match: Need to match fully inside the code point span 703 // to find the match from the earliest start. 704 } 705 if(overlap>spanLength) { 706 overlap=spanLength; 707 } 708 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 709 for(;;) { 710 if(inc>rest || overlap<maxOverlap) { 711 break; 712 } 713 // Try to match if the string is longer or starts earlier. 714 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 715 matches16CPB(s, pos-overlap, length, s16, length16) 716 ) { 717 maxInc=inc; // Longest match from earliest start. 718 maxOverlap=overlap; 719 break; 720 } 721 --overlap; 722 ++inc; 723 } 724 } 725 726 if(maxInc!=0 || maxOverlap!=0) { 727 // Longest-match algorithm, and there was a string match. 728 // Simply continue after it. 729 pos+=maxInc; 730 rest-=maxInc; 731 if(rest==0) { 732 return length; // Reached the end of the string. 733 } 734 spanLength=0; // Match strings from after a string match. 735 continue; 736 } 737 } 738 // Finished trying to match all strings at pos. 739 740 if(spanLength!=0 || pos==0) { 741 // The position is after an unlimited code point span (spanLength!=0), 742 // not after a string match. 743 // The only position where spanLength==0 after a span is pos==0. 744 // Otherwise, an unlimited code point span is only tried again when no 745 // strings match, and if such a non-initial span fails we stop. 746 if(offsets.isEmpty()) { 747 return pos; // No strings matched after a span. 748 } 749 // Match strings from after the next string match. 750 } else { 751 // The position is after a string match (or a single code point). 752 if(offsets.isEmpty()) { 753 // No more strings matched after a previous string match. 754 // Try another code point span from after the last string match. 755 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 756 if( spanLength==rest || // Reached the end of the string, or 757 spanLength==0 // neither strings nor span progressed. 758 ) { 759 return pos+spanLength; 760 } 761 pos+=spanLength; 762 rest-=spanLength; 763 continue; // spanLength>0: Match strings from after a span. 764 } else { 765 // Try to match only one code point from after a string match if some 766 // string matched beyond it, so that we try all possible positions 767 // and don't overshoot. 768 spanLength=spanOne(spanSet, s+pos, rest); 769 if(spanLength>0) { 770 if(spanLength==rest) { 771 return length; // Reached the end of the string. 772 } 773 // Match strings after this code point. 774 // There cannot be any increments below it because UnicodeSet strings 775 // contain multiple code points. 776 pos+=spanLength; 777 rest-=spanLength; 778 offsets.shift(spanLength); 779 spanLength=0; 780 continue; // Match strings from after a single code point. 781 } 782 // Match strings from after the next string match. 783 } 784 } 785 int32_t minOffset=offsets.popMinimum(); 786 pos+=minOffset; 787 rest-=minOffset; 788 spanLength=0; // Match strings from after a string match. 789 } 790 } 791 792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 793 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 794 return spanNotBack(s, length); 795 } 796 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 797 if(pos==0) { 798 return 0; 799 } 800 int32_t spanLength=length-pos; 801 802 // Consider strings; they may overlap with the span. 803 OffsetList offsets; 804 if(spanCondition==USET_SPAN_CONTAINED) { 805 // Use offset list to try all possibilities. 806 offsets.setMaxLength(maxLength16); 807 } 808 int32_t i, stringsLength=strings.size(); 809 uint8_t *spanBackLengths=spanLengths; 810 if(all) { 811 spanBackLengths+=stringsLength; 812 } 813 for(;;) { 814 if(spanCondition==USET_SPAN_CONTAINED) { 815 for(i=0; i<stringsLength; ++i) { 816 int32_t overlap=spanBackLengths[i]; 817 if(overlap==ALL_CP_CONTAINED) { 818 continue; // Irrelevant string. 819 } 820 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 821 const UChar *s16=string.getBuffer(); 822 int32_t length16=string.length(); 823 824 // Try to match this string at pos-(length16-overlap)..pos-length16. 825 if(overlap>=LONG_SPAN) { 826 overlap=length16; 827 // While contained: No point matching fully inside the code point span. 828 int32_t len1=0; 829 U16_FWD_1(s16, len1, overlap); 830 overlap-=len1; // Length of the string minus the first code point. 831 } 832 if(overlap>spanLength) { 833 overlap=spanLength; 834 } 835 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 836 for(;;) { 837 if(dec>pos) { 838 break; 839 } 840 // Try to match if the decrement is not listed already. 841 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 842 if(dec==pos) { 843 return 0; // Reached the start of the string. 844 } 845 offsets.addOffset(dec); 846 } 847 if(overlap==0) { 848 break; 849 } 850 --overlap; 851 ++dec; 852 } 853 } 854 } else /* USET_SPAN_SIMPLE */ { 855 int32_t maxDec=0, maxOverlap=0; 856 for(i=0; i<stringsLength; ++i) { 857 int32_t overlap=spanBackLengths[i]; 858 // For longest match, we do need to try to match even an all-contained string 859 // to find the match from the latest end. 860 861 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 862 const UChar *s16=string.getBuffer(); 863 int32_t length16=string.length(); 864 865 // Try to match this string at pos-(length16-overlap)..pos-length16. 866 if(overlap>=LONG_SPAN) { 867 overlap=length16; 868 // Longest match: Need to match fully inside the code point span 869 // to find the match from the latest end. 870 } 871 if(overlap>spanLength) { 872 overlap=spanLength; 873 } 874 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 875 for(;;) { 876 if(dec>pos || overlap<maxOverlap) { 877 break; 878 } 879 // Try to match if the string is longer or ends later. 880 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 881 matches16CPB(s, pos-dec, length, s16, length16) 882 ) { 883 maxDec=dec; // Longest match from latest end. 884 maxOverlap=overlap; 885 break; 886 } 887 --overlap; 888 ++dec; 889 } 890 } 891 892 if(maxDec!=0 || maxOverlap!=0) { 893 // Longest-match algorithm, and there was a string match. 894 // Simply continue before it. 895 pos-=maxDec; 896 if(pos==0) { 897 return 0; // Reached the start of the string. 898 } 899 spanLength=0; // Match strings from before a string match. 900 continue; 901 } 902 } 903 // Finished trying to match all strings at pos. 904 905 if(spanLength!=0 || pos==length) { 906 // The position is before an unlimited code point span (spanLength!=0), 907 // not before a string match. 908 // The only position where spanLength==0 before a span is pos==length. 909 // Otherwise, an unlimited code point span is only tried again when no 910 // strings match, and if such a non-initial span fails we stop. 911 if(offsets.isEmpty()) { 912 return pos; // No strings matched before a span. 913 } 914 // Match strings from before the next string match. 915 } else { 916 // The position is before a string match (or a single code point). 917 if(offsets.isEmpty()) { 918 // No more strings matched before a previous string match. 919 // Try another code point span from before the last string match. 920 int32_t oldPos=pos; 921 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 922 spanLength=oldPos-pos; 923 if( pos==0 || // Reached the start of the string, or 924 spanLength==0 // neither strings nor span progressed. 925 ) { 926 return pos; 927 } 928 continue; // spanLength>0: Match strings from before a span. 929 } else { 930 // Try to match only one code point from before a string match if some 931 // string matched beyond it, so that we try all possible positions 932 // and don't overshoot. 933 spanLength=spanOneBack(spanSet, s, pos); 934 if(spanLength>0) { 935 if(spanLength==pos) { 936 return 0; // Reached the start of the string. 937 } 938 // Match strings before this code point. 939 // There cannot be any decrements below it because UnicodeSet strings 940 // contain multiple code points. 941 pos-=spanLength; 942 offsets.shift(spanLength); 943 spanLength=0; 944 continue; // Match strings from before a single code point. 945 } 946 // Match strings from before the next string match. 947 } 948 } 949 pos-=offsets.popMinimum(); 950 spanLength=0; // Match strings from before a string match. 951 } 952 } 953 954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 955 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 956 return spanNotUTF8(s, length); 957 } 958 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 959 if(spanLength==length) { 960 return length; 961 } 962 963 // Consider strings; they may overlap with the span. 964 OffsetList offsets; 965 if(spanCondition==USET_SPAN_CONTAINED) { 966 // Use offset list to try all possibilities. 967 offsets.setMaxLength(maxLength8); 968 } 969 int32_t pos=spanLength, rest=length-pos; 970 int32_t i, stringsLength=strings.size(); 971 uint8_t *spanUTF8Lengths=spanLengths; 972 if(all) { 973 spanUTF8Lengths+=2*stringsLength; 974 } 975 for(;;) { 976 const uint8_t *s8=utf8; 977 int32_t length8; 978 if(spanCondition==USET_SPAN_CONTAINED) { 979 for(i=0; i<stringsLength; ++i) { 980 length8=utf8Lengths[i]; 981 if(length8==0) { 982 continue; // String not representable in UTF-8. 983 } 984 int32_t overlap=spanUTF8Lengths[i]; 985 if(overlap==ALL_CP_CONTAINED) { 986 s8+=length8; 987 continue; // Irrelevant string. 988 } 989 990 // Try to match this string at pos-overlap..pos. 991 if(overlap>=LONG_SPAN) { 992 overlap=length8; 993 // While contained: No point matching fully inside the code point span. 994 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 995 } 996 if(overlap>spanLength) { 997 overlap=spanLength; 998 } 999 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1000 for(;;) { 1001 if(inc>rest) { 1002 break; 1003 } 1004 // Try to match if the increment is not listed already. 1005 // Match at code point boundaries. (The UTF-8 strings were converted 1006 // from UTF-16 and are guaranteed to be well-formed.) 1007 if( !U8_IS_TRAIL(s[pos-overlap]) && 1008 !offsets.containsOffset(inc) && 1009 matches8(s+pos-overlap, s8, length8) 1010 1011 ) { 1012 if(inc==rest) { 1013 return length; // Reached the end of the string. 1014 } 1015 offsets.addOffset(inc); 1016 } 1017 if(overlap==0) { 1018 break; 1019 } 1020 --overlap; 1021 ++inc; 1022 } 1023 s8+=length8; 1024 } 1025 } else /* USET_SPAN_SIMPLE */ { 1026 int32_t maxInc=0, maxOverlap=0; 1027 for(i=0; i<stringsLength; ++i) { 1028 length8=utf8Lengths[i]; 1029 if(length8==0) { 1030 continue; // String not representable in UTF-8. 1031 } 1032 int32_t overlap=spanUTF8Lengths[i]; 1033 // For longest match, we do need to try to match even an all-contained string 1034 // to find the match from the earliest start. 1035 1036 // Try to match this string at pos-overlap..pos. 1037 if(overlap>=LONG_SPAN) { 1038 overlap=length8; 1039 // Longest match: Need to match fully inside the code point span 1040 // to find the match from the earliest start. 1041 } 1042 if(overlap>spanLength) { 1043 overlap=spanLength; 1044 } 1045 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1046 for(;;) { 1047 if(inc>rest || overlap<maxOverlap) { 1048 break; 1049 } 1050 // Try to match if the string is longer or starts earlier. 1051 // Match at code point boundaries. (The UTF-8 strings were converted 1052 // from UTF-16 and are guaranteed to be well-formed.) 1053 if( !U8_IS_TRAIL(s[pos-overlap]) && 1054 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1055 matches8(s+pos-overlap, s8, length8) 1056 1057 ) { 1058 maxInc=inc; // Longest match from earliest start. 1059 maxOverlap=overlap; 1060 break; 1061 } 1062 --overlap; 1063 ++inc; 1064 } 1065 s8+=length8; 1066 } 1067 1068 if(maxInc!=0 || maxOverlap!=0) { 1069 // Longest-match algorithm, and there was a string match. 1070 // Simply continue after it. 1071 pos+=maxInc; 1072 rest-=maxInc; 1073 if(rest==0) { 1074 return length; // Reached the end of the string. 1075 } 1076 spanLength=0; // Match strings from after a string match. 1077 continue; 1078 } 1079 } 1080 // Finished trying to match all strings at pos. 1081 1082 if(spanLength!=0 || pos==0) { 1083 // The position is after an unlimited code point span (spanLength!=0), 1084 // not after a string match. 1085 // The only position where spanLength==0 after a span is pos==0. 1086 // Otherwise, an unlimited code point span is only tried again when no 1087 // strings match, and if such a non-initial span fails we stop. 1088 if(offsets.isEmpty()) { 1089 return pos; // No strings matched after a span. 1090 } 1091 // Match strings from after the next string match. 1092 } else { 1093 // The position is after a string match (or a single code point). 1094 if(offsets.isEmpty()) { 1095 // No more strings matched after a previous string match. 1096 // Try another code point span from after the last string match. 1097 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1098 if( spanLength==rest || // Reached the end of the string, or 1099 spanLength==0 // neither strings nor span progressed. 1100 ) { 1101 return pos+spanLength; 1102 } 1103 pos+=spanLength; 1104 rest-=spanLength; 1105 continue; // spanLength>0: Match strings from after a span. 1106 } else { 1107 // Try to match only one code point from after a string match if some 1108 // string matched beyond it, so that we try all possible positions 1109 // and don't overshoot. 1110 spanLength=spanOneUTF8(spanSet, s+pos, rest); 1111 if(spanLength>0) { 1112 if(spanLength==rest) { 1113 return length; // Reached the end of the string. 1114 } 1115 // Match strings after this code point. 1116 // There cannot be any increments below it because UnicodeSet strings 1117 // contain multiple code points. 1118 pos+=spanLength; 1119 rest-=spanLength; 1120 offsets.shift(spanLength); 1121 spanLength=0; 1122 continue; // Match strings from after a single code point. 1123 } 1124 // Match strings from after the next string match. 1125 } 1126 } 1127 int32_t minOffset=offsets.popMinimum(); 1128 pos+=minOffset; 1129 rest-=minOffset; 1130 spanLength=0; // Match strings from after a string match. 1131 } 1132 } 1133 1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1135 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1136 return spanNotBackUTF8(s, length); 1137 } 1138 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1139 if(pos==0) { 1140 return 0; 1141 } 1142 int32_t spanLength=length-pos; 1143 1144 // Consider strings; they may overlap with the span. 1145 OffsetList offsets; 1146 if(spanCondition==USET_SPAN_CONTAINED) { 1147 // Use offset list to try all possibilities. 1148 offsets.setMaxLength(maxLength8); 1149 } 1150 int32_t i, stringsLength=strings.size(); 1151 uint8_t *spanBackUTF8Lengths=spanLengths; 1152 if(all) { 1153 spanBackUTF8Lengths+=3*stringsLength; 1154 } 1155 for(;;) { 1156 const uint8_t *s8=utf8; 1157 int32_t length8; 1158 if(spanCondition==USET_SPAN_CONTAINED) { 1159 for(i=0; i<stringsLength; ++i) { 1160 length8=utf8Lengths[i]; 1161 if(length8==0) { 1162 continue; // String not representable in UTF-8. 1163 } 1164 int32_t overlap=spanBackUTF8Lengths[i]; 1165 if(overlap==ALL_CP_CONTAINED) { 1166 s8+=length8; 1167 continue; // Irrelevant string. 1168 } 1169 1170 // Try to match this string at pos-(length8-overlap)..pos-length8. 1171 if(overlap>=LONG_SPAN) { 1172 overlap=length8; 1173 // While contained: No point matching fully inside the code point span. 1174 int32_t len1=0; 1175 U8_FWD_1(s8, len1, overlap); 1176 overlap-=len1; // Length of the string minus the first code point. 1177 } 1178 if(overlap>spanLength) { 1179 overlap=spanLength; 1180 } 1181 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1182 for(;;) { 1183 if(dec>pos) { 1184 break; 1185 } 1186 // Try to match if the decrement is not listed already. 1187 // Match at code point boundaries. (The UTF-8 strings were converted 1188 // from UTF-16 and are guaranteed to be well-formed.) 1189 if( !U8_IS_TRAIL(s[pos-dec]) && 1190 !offsets.containsOffset(dec) && 1191 matches8(s+pos-dec, s8, length8) 1192 ) { 1193 if(dec==pos) { 1194 return 0; // Reached the start of the string. 1195 } 1196 offsets.addOffset(dec); 1197 } 1198 if(overlap==0) { 1199 break; 1200 } 1201 --overlap; 1202 ++dec; 1203 } 1204 s8+=length8; 1205 } 1206 } else /* USET_SPAN_SIMPLE */ { 1207 int32_t maxDec=0, maxOverlap=0; 1208 for(i=0; i<stringsLength; ++i) { 1209 length8=utf8Lengths[i]; 1210 if(length8==0) { 1211 continue; // String not representable in UTF-8. 1212 } 1213 int32_t overlap=spanBackUTF8Lengths[i]; 1214 // For longest match, we do need to try to match even an all-contained string 1215 // to find the match from the latest end. 1216 1217 // Try to match this string at pos-(length8-overlap)..pos-length8. 1218 if(overlap>=LONG_SPAN) { 1219 overlap=length8; 1220 // Longest match: Need to match fully inside the code point span 1221 // to find the match from the latest end. 1222 } 1223 if(overlap>spanLength) { 1224 overlap=spanLength; 1225 } 1226 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1227 for(;;) { 1228 if(dec>pos || overlap<maxOverlap) { 1229 break; 1230 } 1231 // Try to match if the string is longer or ends later. 1232 // Match at code point boundaries. (The UTF-8 strings were converted 1233 // from UTF-16 and are guaranteed to be well-formed.) 1234 if( !U8_IS_TRAIL(s[pos-dec]) && 1235 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1236 matches8(s+pos-dec, s8, length8) 1237 ) { 1238 maxDec=dec; // Longest match from latest end. 1239 maxOverlap=overlap; 1240 break; 1241 } 1242 --overlap; 1243 ++dec; 1244 } 1245 s8+=length8; 1246 } 1247 1248 if(maxDec!=0 || maxOverlap!=0) { 1249 // Longest-match algorithm, and there was a string match. 1250 // Simply continue before it. 1251 pos-=maxDec; 1252 if(pos==0) { 1253 return 0; // Reached the start of the string. 1254 } 1255 spanLength=0; // Match strings from before a string match. 1256 continue; 1257 } 1258 } 1259 // Finished trying to match all strings at pos. 1260 1261 if(spanLength!=0 || pos==length) { 1262 // The position is before an unlimited code point span (spanLength!=0), 1263 // not before a string match. 1264 // The only position where spanLength==0 before a span is pos==length. 1265 // Otherwise, an unlimited code point span is only tried again when no 1266 // strings match, and if such a non-initial span fails we stop. 1267 if(offsets.isEmpty()) { 1268 return pos; // No strings matched before a span. 1269 } 1270 // Match strings from before the next string match. 1271 } else { 1272 // The position is before a string match (or a single code point). 1273 if(offsets.isEmpty()) { 1274 // No more strings matched before a previous string match. 1275 // Try another code point span from before the last string match. 1276 int32_t oldPos=pos; 1277 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1278 spanLength=oldPos-pos; 1279 if( pos==0 || // Reached the start of the string, or 1280 spanLength==0 // neither strings nor span progressed. 1281 ) { 1282 return pos; 1283 } 1284 continue; // spanLength>0: Match strings from before a span. 1285 } else { 1286 // Try to match only one code point from before a string match if some 1287 // string matched beyond it, so that we try all possible positions 1288 // and don't overshoot. 1289 spanLength=spanOneBackUTF8(spanSet, s, pos); 1290 if(spanLength>0) { 1291 if(spanLength==pos) { 1292 return 0; // Reached the start of the string. 1293 } 1294 // Match strings before this code point. 1295 // There cannot be any decrements below it because UnicodeSet strings 1296 // contain multiple code points. 1297 pos-=spanLength; 1298 offsets.shift(spanLength); 1299 spanLength=0; 1300 continue; // Match strings from before a single code point. 1301 } 1302 // Match strings from before the next string match. 1303 } 1304 } 1305 pos-=offsets.popMinimum(); 1306 spanLength=0; // Match strings from before a string match. 1307 } 1308 } 1309 1310 /* 1311 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1312 * 1313 * Theoretical algorithm: 1314 * - Iterate through the string, and at each code point boundary: 1315 * + If the code point there is in the set, then return with the current position. 1316 * + If a set string matches at the current position, then return with the current position. 1317 * 1318 * Optimized implementation: 1319 * 1320 * (Same assumption as for span() above.) 1321 * 1322 * Create and cache a spanNotSet which contains all of the single code points 1323 * of the original set but none of its strings. 1324 * For each set string add its initial code point to the spanNotSet. 1325 * (Also add its final code point for spanNotBack().) 1326 * 1327 * - Loop: 1328 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1329 * + If the current code point is in the original set, then 1330 * return the current position. 1331 * + If any set string matches at the current position, then 1332 * return the current position. 1333 * + If there is no match at the current position, neither for the code point there 1334 * nor for any set string, then skip this code point and continue the loop. 1335 * This happens for set-string-initial code points that were added to spanNotSet 1336 * when there is not actually a match for such a set string. 1337 */ 1338 1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1340 int32_t pos=0, rest=length; 1341 int32_t i, stringsLength=strings.size(); 1342 do { 1343 // Span until we find a code point from the set, 1344 // or a code point that starts or ends some string. 1345 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1346 if(i==rest) { 1347 return length; // Reached the end of the string. 1348 } 1349 pos+=i; 1350 rest-=i; 1351 1352 // Check whether the current code point is in the original set, 1353 // without the string starts and ends. 1354 int32_t cpLength=spanOne(spanSet, s+pos, rest); 1355 if(cpLength>0) { 1356 return pos; // There is a set element at pos. 1357 } 1358 1359 // Try to match the strings at pos. 1360 for(i=0; i<stringsLength; ++i) { 1361 if(spanLengths[i]==ALL_CP_CONTAINED) { 1362 continue; // Irrelevant string. 1363 } 1364 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1365 const UChar *s16=string.getBuffer(); 1366 int32_t length16=string.length(); 1367 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1368 return pos; // There is a set element at pos. 1369 } 1370 } 1371 1372 // The span(while not contained) ended on a string start/end which is 1373 // not in the original set. Skip this code point and continue. 1374 // cpLength<0 1375 pos-=cpLength; 1376 rest+=cpLength; 1377 } while(rest!=0); 1378 return length; // Reached the end of the string. 1379 } 1380 1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1382 int32_t pos=length; 1383 int32_t i, stringsLength=strings.size(); 1384 do { 1385 // Span until we find a code point from the set, 1386 // or a code point that starts or ends some string. 1387 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1388 if(pos==0) { 1389 return 0; // Reached the start of the string. 1390 } 1391 1392 // Check whether the current code point is in the original set, 1393 // without the string starts and ends. 1394 int32_t cpLength=spanOneBack(spanSet, s, pos); 1395 if(cpLength>0) { 1396 return pos; // There is a set element at pos. 1397 } 1398 1399 // Try to match the strings at pos. 1400 for(i=0; i<stringsLength; ++i) { 1401 // Use spanLengths rather than a spanBackLengths pointer because 1402 // it is easier and we only need to know whether the string is irrelevant 1403 // which is the same in either array. 1404 if(spanLengths[i]==ALL_CP_CONTAINED) { 1405 continue; // Irrelevant string. 1406 } 1407 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1408 const UChar *s16=string.getBuffer(); 1409 int32_t length16=string.length(); 1410 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1411 return pos; // There is a set element at pos. 1412 } 1413 } 1414 1415 // The span(while not contained) ended on a string start/end which is 1416 // not in the original set. Skip this code point and continue. 1417 // cpLength<0 1418 pos+=cpLength; 1419 } while(pos!=0); 1420 return 0; // Reached the start of the string. 1421 } 1422 1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1424 int32_t pos=0, rest=length; 1425 int32_t i, stringsLength=strings.size(); 1426 uint8_t *spanUTF8Lengths=spanLengths; 1427 if(all) { 1428 spanUTF8Lengths+=2*stringsLength; 1429 } 1430 do { 1431 // Span until we find a code point from the set, 1432 // or a code point that starts or ends some string. 1433 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1434 if(i==rest) { 1435 return length; // Reached the end of the string. 1436 } 1437 pos+=i; 1438 rest-=i; 1439 1440 // Check whether the current code point is in the original set, 1441 // without the string starts and ends. 1442 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1443 if(cpLength>0) { 1444 return pos; // There is a set element at pos. 1445 } 1446 1447 // Try to match the strings at pos. 1448 const uint8_t *s8=utf8; 1449 int32_t length8; 1450 for(i=0; i<stringsLength; ++i) { 1451 length8=utf8Lengths[i]; 1452 // ALL_CP_CONTAINED: Irrelevant string. 1453 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1454 return pos; // There is a set element at pos. 1455 } 1456 s8+=length8; 1457 } 1458 1459 // The span(while not contained) ended on a string start/end which is 1460 // not in the original set. Skip this code point and continue. 1461 // cpLength<0 1462 pos-=cpLength; 1463 rest+=cpLength; 1464 } while(rest!=0); 1465 return length; // Reached the end of the string. 1466 } 1467 1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1469 int32_t pos=length; 1470 int32_t i, stringsLength=strings.size(); 1471 uint8_t *spanBackUTF8Lengths=spanLengths; 1472 if(all) { 1473 spanBackUTF8Lengths+=3*stringsLength; 1474 } 1475 do { 1476 // Span until we find a code point from the set, 1477 // or a code point that starts or ends some string. 1478 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1479 if(pos==0) { 1480 return 0; // Reached the start of the string. 1481 } 1482 1483 // Check whether the current code point is in the original set, 1484 // without the string starts and ends. 1485 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1486 if(cpLength>0) { 1487 return pos; // There is a set element at pos. 1488 } 1489 1490 // Try to match the strings at pos. 1491 const uint8_t *s8=utf8; 1492 int32_t length8; 1493 for(i=0; i<stringsLength; ++i) { 1494 length8=utf8Lengths[i]; 1495 // ALL_CP_CONTAINED: Irrelevant string. 1496 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1497 return pos; // There is a set element at pos. 1498 } 1499 s8+=length8; 1500 } 1501 1502 // The span(while not contained) ended on a string start/end which is 1503 // not in the original set. Skip this code point and continue. 1504 // cpLength<0 1505 pos+=cpLength; 1506 } while(pos!=0); 1507 return 0; // Reached the start of the string. 1508 } 1509 1510 U_NAMESPACE_END 1511