1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2007-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: unisetspan.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007mar01 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uniset.h" 21 #include "unicode/ustring.h" 22 #include "unicode/utf8.h" 23 #include "unicode/utf16.h" 24 #include "cmemory.h" 25 #include "uvector.h" 26 #include "unisetspan.h" 27 28 U_NAMESPACE_BEGIN 29 30 /* 31 * List of offsets from the current position from where to try matching 32 * a code point or a string. 33 * Store offsets rather than indexes to simplify the code and use the same list 34 * for both increments (in span()) and decrements (in spanBack()). 35 * 36 * Assumption: The maximum offset is limited, and the offsets that are stored 37 * at any one time are relatively dense, that is, there are normally no gaps of 38 * hundreds or thousands of offset values. 39 * 40 * The implementation uses a circular buffer of byte flags, 41 * each indicating whether the corresponding offset is in the list. 42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and 43 * physically moving part of the list. 44 * 45 * Note: In principle, the caller should setMaxLength() to the maximum of the 46 * max string length and U16_LENGTH/U8_LENGTH to account for 47 * "long" single code points. 48 * However, this implementation uses at least a staticList with more than 49 * U8_LENGTH entries anyway. 50 * 51 * Note: If maxLength were guaranteed to be no more than 32 or 64, 52 * the list could be stored as bit flags in a single integer. 53 * Rather than handling a circular buffer with a start list index, 54 * the integer would simply be shifted when lower offsets are removed. 55 * UnicodeSet does not have a limit on the lengths of strings. 56 */ 57 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 58 public: 59 OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 60 61 ~OffsetList() { 62 if(list!=staticList) { 63 uprv_free(list); 64 } 65 } 66 67 // Call exactly once if the list is to be used. 68 void setMaxLength(int32_t maxLength) { 69 if(maxLength<=(int32_t)sizeof(staticList)) { 70 capacity=(int32_t)sizeof(staticList); 71 } else { 72 UBool *l=(UBool *)uprv_malloc(maxLength); 73 if(l!=NULL) { 74 list=l; 75 capacity=maxLength; 76 } 77 } 78 uprv_memset(list, 0, capacity); 79 } 80 81 void clear() { 82 uprv_memset(list, 0, capacity); 83 start=length=0; 84 } 85 86 UBool isEmpty() const { 87 return (UBool)(length==0); 88 } 89 90 // Reduce all stored offsets by delta, used when the current position 91 // moves by delta. 92 // There must not be any offsets lower than delta. 93 // If there is an offset equal to delta, it is removed. 94 // delta=[1..maxLength] 95 void shift(int32_t delta) { 96 int32_t i=start+delta; 97 if(i>=capacity) { 98 i-=capacity; 99 } 100 if(list[i]) { 101 list[i]=FALSE; 102 --length; 103 } 104 start=i; 105 } 106 107 // Add an offset. The list must not contain it yet. 108 // offset=[1..maxLength] 109 void addOffset(int32_t offset) { 110 int32_t i=start+offset; 111 if(i>=capacity) { 112 i-=capacity; 113 } 114 list[i]=TRUE; 115 ++length; 116 } 117 118 // offset=[1..maxLength] 119 UBool containsOffset(int32_t offset) const { 120 int32_t i=start+offset; 121 if(i>=capacity) { 122 i-=capacity; 123 } 124 return list[i]; 125 } 126 127 // Find the lowest stored offset from a non-empty list, remove it, 128 // and reduce all other offsets by this minimum. 129 // Returns [1..maxLength]. 130 int32_t popMinimum() { 131 // Look for the next offset in list[start+1..capacity-1]. 132 int32_t i=start, result; 133 while(++i<capacity) { 134 if(list[i]) { 135 list[i]=FALSE; 136 --length; 137 result=i-start; 138 start=i; 139 return result; 140 } 141 } 142 // i==capacity 143 144 // Wrap around and look for the next offset in list[0..start]. 145 // Since the list is not empty, there will be one. 146 result=capacity-start; 147 i=0; 148 while(!list[i]) { 149 ++i; 150 } 151 list[i]=FALSE; 152 --length; 153 start=i; 154 return result+=i; 155 } 156 157 private: 158 UBool *list; 159 int32_t capacity; 160 int32_t length; 161 int32_t start; 162 163 UBool staticList[16]; 164 }; 165 166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string. 167 static int32_t 168 getUTF8Length(const UChar *s, int32_t length) { 169 UErrorCode errorCode=U_ZERO_ERROR; 170 int32_t length8=0; 171 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 172 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 173 return length8; 174 } else { 175 // The string contains an unpaired surrogate. 176 // Ignore this string. 177 return 0; 178 } 179 } 180 181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 182 static int32_t 183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 184 UErrorCode errorCode=U_ZERO_ERROR; 185 int32_t length8=0; 186 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 187 if(U_SUCCESS(errorCode)) { 188 return length8; 189 } else { 190 // The string contains an unpaired surrogate. 191 // Ignore this string. 192 return 0; 193 } 194 } 195 196 static inline uint8_t 197 makeSpanLengthByte(int32_t spanLength) { 198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 199 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 200 } 201 202 // Construct for all variants of span(), or only for any one variant. 203 // Initialize as little as possible, for single use. 204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 205 const UVector &setStrings, 206 uint32_t which) 207 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 208 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 209 utf8Length(0), 210 maxLength16(0), maxLength8(0), 211 all((UBool)(which==ALL)) { 212 spanSet.retainAll(set); 213 if(which&NOT_CONTAINED) { 214 // Default to the same sets. 215 // addToSpanNotSet() will create a separate set if necessary. 216 pSpanNotSet=&spanSet; 217 } 218 219 // Determine if the strings even need to be taken into account at all for span() etc. 220 // If any string is relevant, then all strings need to be used for 221 // span(longest match) but only the relevant ones for span(while contained). 222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 226 int32_t stringsLength=strings.size(); 227 228 int32_t i, spanLength; 229 UBool someRelevant=FALSE; 230 for(i=0; i<stringsLength; ++i) { 231 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 232 const UChar *s16=string.getBuffer(); 233 int32_t length16=string.length(); 234 UBool thisRelevant; 235 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 236 if(spanLength<length16) { // Relevant string. 237 someRelevant=thisRelevant=TRUE; 238 } else { 239 thisRelevant=FALSE; 240 } 241 if((which&UTF16) && length16>maxLength16) { 242 maxLength16=length16; 243 } 244 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 245 int32_t length8=getUTF8Length(s16, length16); 246 utf8Length+=length8; 247 if(length8>maxLength8) { 248 maxLength8=length8; 249 } 250 } 251 } 252 if(!someRelevant) { 253 maxLength16=maxLength8=0; 254 return; 255 } 256 257 // Freeze after checking for the need to use strings at all because freezing 258 // a set takes some time and memory which are wasted if there are no relevant strings. 259 if(all) { 260 spanSet.freeze(); 261 } 262 263 uint8_t *spanBackLengths; 264 uint8_t *spanUTF8Lengths; 265 uint8_t *spanBackUTF8Lengths; 266 267 // Allocate a block of meta data. 268 int32_t allocSize; 269 if(all) { 270 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 271 allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 272 } else { 273 allocSize=stringsLength; // One set of span lengths. 274 if(which&UTF8) { 275 // UTF-8 lengths and UTF-8 strings. 276 allocSize+=stringsLength*4+utf8Length; 277 } 278 } 279 if(allocSize<=(int32_t)sizeof(staticLengths)) { 280 utf8Lengths=staticLengths; 281 } else { 282 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 283 if(utf8Lengths==NULL) { 284 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 285 return; // Out of memory. 286 } 287 } 288 289 if(all) { 290 // Store span lengths for all span() variants. 291 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 292 spanBackLengths=spanLengths+stringsLength; 293 spanUTF8Lengths=spanBackLengths+stringsLength; 294 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 295 utf8=spanBackUTF8Lengths+stringsLength; 296 } else { 297 // Store span lengths for only one span() variant. 298 if(which&UTF8) { 299 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 300 utf8=spanLengths+stringsLength; 301 } else { 302 spanLengths=(uint8_t *)utf8Lengths; 303 } 304 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 305 } 306 307 // Set the meta data and pSpanNotSet and write the UTF-8 strings. 308 int32_t utf8Count=0; // Count UTF-8 bytes written so far. 309 310 for(i=0; i<stringsLength; ++i) { 311 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 312 const UChar *s16=string.getBuffer(); 313 int32_t length16=string.length(); 314 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 315 if(spanLength<length16) { // Relevant string. 316 if(which&UTF16) { 317 if(which&CONTAINED) { 318 if(which&FWD) { 319 spanLengths[i]=makeSpanLengthByte(spanLength); 320 } 321 if(which&BACK) { 322 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 323 spanBackLengths[i]=makeSpanLengthByte(spanLength); 324 } 325 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 326 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 327 } 328 } 329 if(which&UTF8) { 330 uint8_t *s8=utf8+utf8Count; 331 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 332 utf8Count+=utf8Lengths[i]=length8; 333 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 334 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 335 } else { // Relevant for UTF-8. 336 if(which&CONTAINED) { 337 if(which&FWD) { 338 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 339 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 340 } 341 if(which&BACK) { 342 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 343 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 344 } 345 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 346 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 347 } 348 } 349 } 350 if(which&NOT_CONTAINED) { 351 // Add string start and end code points to the spanNotSet so that 352 // a span(while not contained) stops before any string. 353 UChar32 c; 354 if(which&FWD) { 355 int32_t len=0; 356 U16_NEXT(s16, len, length16, c); 357 addToSpanNotSet(c); 358 } 359 if(which&BACK) { 360 int32_t len=length16; 361 U16_PREV(s16, 0, len, c); 362 addToSpanNotSet(c); 363 } 364 } 365 } else { // Irrelevant string. 366 if(which&UTF8) { 367 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 368 uint8_t *s8=utf8+utf8Count; 369 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 370 utf8Count+=utf8Lengths[i]=length8; 371 } else { 372 utf8Lengths[i]=0; 373 } 374 } 375 if(all) { 376 spanLengths[i]=spanBackLengths[i]= 377 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 378 (uint8_t)ALL_CP_CONTAINED; 379 } else { 380 // All spanXYZLengths pointers contain the same address. 381 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 382 } 383 } 384 } 385 386 // Finish. 387 if(all) { 388 pSpanNotSet->freeze(); 389 } 390 } 391 392 // Copy constructor. Assumes which==ALL for a frozen set. 393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 394 const UVector &newParentSetStrings) 395 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 396 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 397 utf8Length(otherStringSpan.utf8Length), 398 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 399 all(TRUE) { 400 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 401 pSpanNotSet=&spanSet; 402 } else { 403 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 404 } 405 406 // Allocate a block of meta data. 407 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 408 int32_t stringsLength=strings.size(); 409 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 410 if(allocSize<=(int32_t)sizeof(staticLengths)) { 411 utf8Lengths=staticLengths; 412 } else { 413 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 414 if(utf8Lengths==NULL) { 415 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 416 return; // Out of memory. 417 } 418 } 419 420 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 421 utf8=spanLengths+stringsLength*4; 422 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 423 } 424 425 UnicodeSetStringSpan::~UnicodeSetStringSpan() { 426 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 427 delete pSpanNotSet; 428 } 429 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 430 uprv_free(utf8Lengths); 431 } 432 } 433 434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 435 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 436 if(spanSet.contains(c)) { 437 return; // Nothing to do. 438 } 439 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 440 if(newSet==NULL) { 441 return; // Out of memory. 442 } else { 443 pSpanNotSet=newSet; 444 } 445 } 446 pSpanNotSet->add(c); 447 } 448 449 // Compare strings without any argument checks. Requires length>0. 450 static inline UBool 451 matches16(const UChar *s, const UChar *t, int32_t length) { 452 do { 453 if(*s++!=*t++) { 454 return FALSE; 455 } 456 } while(--length>0); 457 return TRUE; 458 } 459 460 static inline UBool 461 matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 462 do { 463 if(*s++!=*t++) { 464 return FALSE; 465 } 466 } while(--length>0); 467 return TRUE; 468 } 469 470 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 471 // at code point boundaries. 472 // That is, each edge of a match must not be in the middle of a surrogate pair. 473 static inline UBool 474 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 475 s+=start; 476 limit-=start; 477 return matches16(s, t, length) && 478 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 479 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 480 } 481 482 // Does the set contain the next code point? 483 // If so, return its length; otherwise return its negative length. 484 static inline int32_t 485 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 486 UChar c=*s, c2; 487 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 488 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 489 } 490 return set.contains(c) ? 1 : -1; 491 } 492 493 static inline int32_t 494 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 495 UChar c=s[length-1], c2; 496 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 497 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 498 } 499 return set.contains(c) ? 1 : -1; 500 } 501 502 static inline int32_t 503 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 504 UChar32 c=*s; 505 if((int8_t)c>=0) { 506 return set.contains(c) ? 1 : -1; 507 } 508 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 509 int32_t i=0; 510 U8_NEXT_OR_FFFD(s, i, length, c); 511 return set.contains(c) ? i : -i; 512 } 513 514 static inline int32_t 515 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 516 UChar32 c=s[length-1]; 517 if((int8_t)c>=0) { 518 return set.contains(c) ? 1 : -1; 519 } 520 int32_t i=length-1; 521 c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 522 length-=i; 523 return set.contains(c) ? length : -length; 524 } 525 526 /* 527 * Note: In span() when spanLength==0 (after a string match, or at the beginning 528 * after an empty code point span) and in spanNot() and spanNotUTF8(), 529 * string matching could use a binary search 530 * because all string matches are done from the same start index. 531 * 532 * For UTF-8, this would require a comparison function that returns UTF-16 order. 533 * 534 * This optimization should not be necessary for normal UnicodeSets because 535 * most sets have no strings, and most sets with strings have 536 * very few very short strings. 537 * For cases with many strings, it might be better to use a different API 538 * and implementation with a DFA (state machine). 539 */ 540 541 /* 542 * Algorithm for span(USET_SPAN_CONTAINED) 543 * 544 * Theoretical algorithm: 545 * - Iterate through the string, and at each code point boundary: 546 * + If the code point there is in the set, then remember to continue after it. 547 * + If a set string matches at the current position, then remember to continue after it. 548 * + Either recursively span for each code point or string match, 549 * or recursively span for all but the shortest one and 550 * iteratively continue the span with the shortest local match. 551 * + Remember the longest recursive span (the farthest end point). 552 * + If there is no match at the current position, neither for the code point there 553 * nor for any set string, then stop and return the longest recursive span length. 554 * 555 * Optimized implementation: 556 * 557 * (We assume that most sets will have very few very short strings. 558 * A span using a string-less set is extremely fast.) 559 * 560 * Create and cache a spanSet which contains all of the single code points 561 * of the original set but none of its strings. 562 * 563 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 564 * - Loop: 565 * + Try to match each set string at the end of the spanLength. 566 * ~ Set strings that start with set-contained code points must be matched 567 * with a partial overlap because the recursive algorithm would have tried 568 * to match them at every position. 569 * ~ Set strings that entirely consist of set-contained code points 570 * are irrelevant for span(USET_SPAN_CONTAINED) because the 571 * recursive algorithm would continue after them anyway 572 * and find the longest recursive match from their end. 573 * ~ Rather than recursing, note each end point of a set string match. 574 * + If no set string matched after spanSet.span(), then return 575 * with where the spanSet.span() ended. 576 * + If at least one set string matched after spanSet.span(), then 577 * pop the shortest string match end point and continue 578 * the loop, trying to match all set strings from there. 579 * + If at least one more set string matched after a previous string match, 580 * then test if the code point after the previous string match is also 581 * contained in the set. 582 * Continue the loop with the shortest end point of either this code point 583 * or a matching set string. 584 * + If no more set string matched after a previous string match, 585 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 586 * Stop if spanLength==0, otherwise continue the loop. 587 * 588 * By noting each end point of a set string match, 589 * the function visits each string position at most once and finishes 590 * in linear time. 591 * 592 * The recursive algorithm may visit the same string position many times 593 * if multiple paths lead to it and finishes in exponential time. 594 */ 595 596 /* 597 * Algorithm for span(USET_SPAN_SIMPLE) 598 * 599 * Theoretical algorithm: 600 * - Iterate through the string, and at each code point boundary: 601 * + If the code point there is in the set, then remember to continue after it. 602 * + If a set string matches at the current position, then remember to continue after it. 603 * + Continue from the farthest match position and ignore all others. 604 * + If there is no match at the current position, 605 * then stop and return the current position. 606 * 607 * Optimized implementation: 608 * 609 * (Same assumption and spanSet as above.) 610 * 611 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 612 * - Loop: 613 * + Try to match each set string at the end of the spanLength. 614 * ~ Set strings that start with set-contained code points must be matched 615 * with a partial overlap because the standard algorithm would have tried 616 * to match them earlier. 617 * ~ Set strings that entirely consist of set-contained code points 618 * must be matched with a full overlap because the longest-match algorithm 619 * would hide set string matches that end earlier. 620 * Such set strings need not be matched earlier inside the code point span 621 * because the standard algorithm would then have continued after 622 * the set string match anyway. 623 * ~ Remember the longest set string match (farthest end point) from the earliest 624 * starting point. 625 * + If no set string matched after spanSet.span(), then return 626 * with where the spanSet.span() ended. 627 * + If at least one set string matched, then continue the loop after the 628 * longest match from the earliest position. 629 * + If no more set string matched after a previous string match, 630 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 631 * Stop if spanLength==0, otherwise continue the loop. 632 */ 633 634 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 635 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 636 return spanNot(s, length); 637 } 638 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 639 if(spanLength==length) { 640 return length; 641 } 642 643 // Consider strings; they may overlap with the span. 644 OffsetList offsets; 645 if(spanCondition==USET_SPAN_CONTAINED) { 646 // Use offset list to try all possibilities. 647 offsets.setMaxLength(maxLength16); 648 } 649 int32_t pos=spanLength, rest=length-pos; 650 int32_t i, stringsLength=strings.size(); 651 for(;;) { 652 if(spanCondition==USET_SPAN_CONTAINED) { 653 for(i=0; i<stringsLength; ++i) { 654 int32_t overlap=spanLengths[i]; 655 if(overlap==ALL_CP_CONTAINED) { 656 continue; // Irrelevant string. 657 } 658 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 659 const UChar *s16=string.getBuffer(); 660 int32_t length16=string.length(); 661 662 // Try to match this string at pos-overlap..pos. 663 if(overlap>=LONG_SPAN) { 664 overlap=length16; 665 // While contained: No point matching fully inside the code point span. 666 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 667 } 668 if(overlap>spanLength) { 669 overlap=spanLength; 670 } 671 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 672 for(;;) { 673 if(inc>rest) { 674 break; 675 } 676 // Try to match if the increment is not listed already. 677 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 678 if(inc==rest) { 679 return length; // Reached the end of the string. 680 } 681 offsets.addOffset(inc); 682 } 683 if(overlap==0) { 684 break; 685 } 686 --overlap; 687 ++inc; 688 } 689 } 690 } else /* USET_SPAN_SIMPLE */ { 691 int32_t maxInc=0, maxOverlap=0; 692 for(i=0; i<stringsLength; ++i) { 693 int32_t overlap=spanLengths[i]; 694 // For longest match, we do need to try to match even an all-contained string 695 // to find the match from the earliest start. 696 697 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 698 const UChar *s16=string.getBuffer(); 699 int32_t length16=string.length(); 700 701 // Try to match this string at pos-overlap..pos. 702 if(overlap>=LONG_SPAN) { 703 overlap=length16; 704 // Longest match: Need to match fully inside the code point span 705 // to find the match from the earliest start. 706 } 707 if(overlap>spanLength) { 708 overlap=spanLength; 709 } 710 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 711 for(;;) { 712 if(inc>rest || overlap<maxOverlap) { 713 break; 714 } 715 // Try to match if the string is longer or starts earlier. 716 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 717 matches16CPB(s, pos-overlap, length, s16, length16) 718 ) { 719 maxInc=inc; // Longest match from earliest start. 720 maxOverlap=overlap; 721 break; 722 } 723 --overlap; 724 ++inc; 725 } 726 } 727 728 if(maxInc!=0 || maxOverlap!=0) { 729 // Longest-match algorithm, and there was a string match. 730 // Simply continue after it. 731 pos+=maxInc; 732 rest-=maxInc; 733 if(rest==0) { 734 return length; // Reached the end of the string. 735 } 736 spanLength=0; // Match strings from after a string match. 737 continue; 738 } 739 } 740 // Finished trying to match all strings at pos. 741 742 if(spanLength!=0 || pos==0) { 743 // The position is after an unlimited code point span (spanLength!=0), 744 // not after a string match. 745 // The only position where spanLength==0 after a span is pos==0. 746 // Otherwise, an unlimited code point span is only tried again when no 747 // strings match, and if such a non-initial span fails we stop. 748 if(offsets.isEmpty()) { 749 return pos; // No strings matched after a span. 750 } 751 // Match strings from after the next string match. 752 } else { 753 // The position is after a string match (or a single code point). 754 if(offsets.isEmpty()) { 755 // No more strings matched after a previous string match. 756 // Try another code point span from after the last string match. 757 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 758 if( spanLength==rest || // Reached the end of the string, or 759 spanLength==0 // neither strings nor span progressed. 760 ) { 761 return pos+spanLength; 762 } 763 pos+=spanLength; 764 rest-=spanLength; 765 continue; // spanLength>0: Match strings from after a span. 766 } else { 767 // Try to match only one code point from after a string match if some 768 // string matched beyond it, so that we try all possible positions 769 // and don't overshoot. 770 spanLength=spanOne(spanSet, s+pos, rest); 771 if(spanLength>0) { 772 if(spanLength==rest) { 773 return length; // Reached the end of the string. 774 } 775 // Match strings after this code point. 776 // There cannot be any increments below it because UnicodeSet strings 777 // contain multiple code points. 778 pos+=spanLength; 779 rest-=spanLength; 780 offsets.shift(spanLength); 781 spanLength=0; 782 continue; // Match strings from after a single code point. 783 } 784 // Match strings from after the next string match. 785 } 786 } 787 int32_t minOffset=offsets.popMinimum(); 788 pos+=minOffset; 789 rest-=minOffset; 790 spanLength=0; // Match strings from after a string match. 791 } 792 } 793 794 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 795 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 796 return spanNotBack(s, length); 797 } 798 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 799 if(pos==0) { 800 return 0; 801 } 802 int32_t spanLength=length-pos; 803 804 // Consider strings; they may overlap with the span. 805 OffsetList offsets; 806 if(spanCondition==USET_SPAN_CONTAINED) { 807 // Use offset list to try all possibilities. 808 offsets.setMaxLength(maxLength16); 809 } 810 int32_t i, stringsLength=strings.size(); 811 uint8_t *spanBackLengths=spanLengths; 812 if(all) { 813 spanBackLengths+=stringsLength; 814 } 815 for(;;) { 816 if(spanCondition==USET_SPAN_CONTAINED) { 817 for(i=0; i<stringsLength; ++i) { 818 int32_t overlap=spanBackLengths[i]; 819 if(overlap==ALL_CP_CONTAINED) { 820 continue; // Irrelevant string. 821 } 822 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 823 const UChar *s16=string.getBuffer(); 824 int32_t length16=string.length(); 825 826 // Try to match this string at pos-(length16-overlap)..pos-length16. 827 if(overlap>=LONG_SPAN) { 828 overlap=length16; 829 // While contained: No point matching fully inside the code point span. 830 int32_t len1=0; 831 U16_FWD_1(s16, len1, overlap); 832 overlap-=len1; // Length of the string minus the first code point. 833 } 834 if(overlap>spanLength) { 835 overlap=spanLength; 836 } 837 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 838 for(;;) { 839 if(dec>pos) { 840 break; 841 } 842 // Try to match if the decrement is not listed already. 843 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 844 if(dec==pos) { 845 return 0; // Reached the start of the string. 846 } 847 offsets.addOffset(dec); 848 } 849 if(overlap==0) { 850 break; 851 } 852 --overlap; 853 ++dec; 854 } 855 } 856 } else /* USET_SPAN_SIMPLE */ { 857 int32_t maxDec=0, maxOverlap=0; 858 for(i=0; i<stringsLength; ++i) { 859 int32_t overlap=spanBackLengths[i]; 860 // For longest match, we do need to try to match even an all-contained string 861 // to find the match from the latest end. 862 863 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 864 const UChar *s16=string.getBuffer(); 865 int32_t length16=string.length(); 866 867 // Try to match this string at pos-(length16-overlap)..pos-length16. 868 if(overlap>=LONG_SPAN) { 869 overlap=length16; 870 // Longest match: Need to match fully inside the code point span 871 // to find the match from the latest end. 872 } 873 if(overlap>spanLength) { 874 overlap=spanLength; 875 } 876 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 877 for(;;) { 878 if(dec>pos || overlap<maxOverlap) { 879 break; 880 } 881 // Try to match if the string is longer or ends later. 882 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 883 matches16CPB(s, pos-dec, length, s16, length16) 884 ) { 885 maxDec=dec; // Longest match from latest end. 886 maxOverlap=overlap; 887 break; 888 } 889 --overlap; 890 ++dec; 891 } 892 } 893 894 if(maxDec!=0 || maxOverlap!=0) { 895 // Longest-match algorithm, and there was a string match. 896 // Simply continue before it. 897 pos-=maxDec; 898 if(pos==0) { 899 return 0; // Reached the start of the string. 900 } 901 spanLength=0; // Match strings from before a string match. 902 continue; 903 } 904 } 905 // Finished trying to match all strings at pos. 906 907 if(spanLength!=0 || pos==length) { 908 // The position is before an unlimited code point span (spanLength!=0), 909 // not before a string match. 910 // The only position where spanLength==0 before a span is pos==length. 911 // Otherwise, an unlimited code point span is only tried again when no 912 // strings match, and if such a non-initial span fails we stop. 913 if(offsets.isEmpty()) { 914 return pos; // No strings matched before a span. 915 } 916 // Match strings from before the next string match. 917 } else { 918 // The position is before a string match (or a single code point). 919 if(offsets.isEmpty()) { 920 // No more strings matched before a previous string match. 921 // Try another code point span from before the last string match. 922 int32_t oldPos=pos; 923 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 924 spanLength=oldPos-pos; 925 if( pos==0 || // Reached the start of the string, or 926 spanLength==0 // neither strings nor span progressed. 927 ) { 928 return pos; 929 } 930 continue; // spanLength>0: Match strings from before a span. 931 } else { 932 // Try to match only one code point from before a string match if some 933 // string matched beyond it, so that we try all possible positions 934 // and don't overshoot. 935 spanLength=spanOneBack(spanSet, s, pos); 936 if(spanLength>0) { 937 if(spanLength==pos) { 938 return 0; // Reached the start of the string. 939 } 940 // Match strings before this code point. 941 // There cannot be any decrements below it because UnicodeSet strings 942 // contain multiple code points. 943 pos-=spanLength; 944 offsets.shift(spanLength); 945 spanLength=0; 946 continue; // Match strings from before a single code point. 947 } 948 // Match strings from before the next string match. 949 } 950 } 951 pos-=offsets.popMinimum(); 952 spanLength=0; // Match strings from before a string match. 953 } 954 } 955 956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 957 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 958 return spanNotUTF8(s, length); 959 } 960 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 961 if(spanLength==length) { 962 return length; 963 } 964 965 // Consider strings; they may overlap with the span. 966 OffsetList offsets; 967 if(spanCondition==USET_SPAN_CONTAINED) { 968 // Use offset list to try all possibilities. 969 offsets.setMaxLength(maxLength8); 970 } 971 int32_t pos=spanLength, rest=length-pos; 972 int32_t i, stringsLength=strings.size(); 973 uint8_t *spanUTF8Lengths=spanLengths; 974 if(all) { 975 spanUTF8Lengths+=2*stringsLength; 976 } 977 for(;;) { 978 const uint8_t *s8=utf8; 979 int32_t length8; 980 if(spanCondition==USET_SPAN_CONTAINED) { 981 for(i=0; i<stringsLength; ++i) { 982 length8=utf8Lengths[i]; 983 if(length8==0) { 984 continue; // String not representable in UTF-8. 985 } 986 int32_t overlap=spanUTF8Lengths[i]; 987 if(overlap==ALL_CP_CONTAINED) { 988 s8+=length8; 989 continue; // Irrelevant string. 990 } 991 992 // Try to match this string at pos-overlap..pos. 993 if(overlap>=LONG_SPAN) { 994 overlap=length8; 995 // While contained: No point matching fully inside the code point span. 996 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 997 } 998 if(overlap>spanLength) { 999 overlap=spanLength; 1000 } 1001 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1002 for(;;) { 1003 if(inc>rest) { 1004 break; 1005 } 1006 // Try to match if the increment is not listed already. 1007 // Match at code point boundaries. (The UTF-8 strings were converted 1008 // from UTF-16 and are guaranteed to be well-formed.) 1009 if( !U8_IS_TRAIL(s[pos-overlap]) && 1010 !offsets.containsOffset(inc) && 1011 matches8(s+pos-overlap, s8, length8) 1012 1013 ) { 1014 if(inc==rest) { 1015 return length; // Reached the end of the string. 1016 } 1017 offsets.addOffset(inc); 1018 } 1019 if(overlap==0) { 1020 break; 1021 } 1022 --overlap; 1023 ++inc; 1024 } 1025 s8+=length8; 1026 } 1027 } else /* USET_SPAN_SIMPLE */ { 1028 int32_t maxInc=0, maxOverlap=0; 1029 for(i=0; i<stringsLength; ++i) { 1030 length8=utf8Lengths[i]; 1031 if(length8==0) { 1032 continue; // String not representable in UTF-8. 1033 } 1034 int32_t overlap=spanUTF8Lengths[i]; 1035 // For longest match, we do need to try to match even an all-contained string 1036 // to find the match from the earliest start. 1037 1038 // Try to match this string at pos-overlap..pos. 1039 if(overlap>=LONG_SPAN) { 1040 overlap=length8; 1041 // Longest match: Need to match fully inside the code point span 1042 // to find the match from the earliest start. 1043 } 1044 if(overlap>spanLength) { 1045 overlap=spanLength; 1046 } 1047 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1048 for(;;) { 1049 if(inc>rest || overlap<maxOverlap) { 1050 break; 1051 } 1052 // Try to match if the string is longer or starts earlier. 1053 // Match at code point boundaries. (The UTF-8 strings were converted 1054 // from UTF-16 and are guaranteed to be well-formed.) 1055 if( !U8_IS_TRAIL(s[pos-overlap]) && 1056 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1057 matches8(s+pos-overlap, s8, length8) 1058 1059 ) { 1060 maxInc=inc; // Longest match from earliest start. 1061 maxOverlap=overlap; 1062 break; 1063 } 1064 --overlap; 1065 ++inc; 1066 } 1067 s8+=length8; 1068 } 1069 1070 if(maxInc!=0 || maxOverlap!=0) { 1071 // Longest-match algorithm, and there was a string match. 1072 // Simply continue after it. 1073 pos+=maxInc; 1074 rest-=maxInc; 1075 if(rest==0) { 1076 return length; // Reached the end of the string. 1077 } 1078 spanLength=0; // Match strings from after a string match. 1079 continue; 1080 } 1081 } 1082 // Finished trying to match all strings at pos. 1083 1084 if(spanLength!=0 || pos==0) { 1085 // The position is after an unlimited code point span (spanLength!=0), 1086 // not after a string match. 1087 // The only position where spanLength==0 after a span is pos==0. 1088 // Otherwise, an unlimited code point span is only tried again when no 1089 // strings match, and if such a non-initial span fails we stop. 1090 if(offsets.isEmpty()) { 1091 return pos; // No strings matched after a span. 1092 } 1093 // Match strings from after the next string match. 1094 } else { 1095 // The position is after a string match (or a single code point). 1096 if(offsets.isEmpty()) { 1097 // No more strings matched after a previous string match. 1098 // Try another code point span from after the last string match. 1099 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1100 if( spanLength==rest || // Reached the end of the string, or 1101 spanLength==0 // neither strings nor span progressed. 1102 ) { 1103 return pos+spanLength; 1104 } 1105 pos+=spanLength; 1106 rest-=spanLength; 1107 continue; // spanLength>0: Match strings from after a span. 1108 } else { 1109 // Try to match only one code point from after a string match if some 1110 // string matched beyond it, so that we try all possible positions 1111 // and don't overshoot. 1112 spanLength=spanOneUTF8(spanSet, s+pos, rest); 1113 if(spanLength>0) { 1114 if(spanLength==rest) { 1115 return length; // Reached the end of the string. 1116 } 1117 // Match strings after this code point. 1118 // There cannot be any increments below it because UnicodeSet strings 1119 // contain multiple code points. 1120 pos+=spanLength; 1121 rest-=spanLength; 1122 offsets.shift(spanLength); 1123 spanLength=0; 1124 continue; // Match strings from after a single code point. 1125 } 1126 // Match strings from after the next string match. 1127 } 1128 } 1129 int32_t minOffset=offsets.popMinimum(); 1130 pos+=minOffset; 1131 rest-=minOffset; 1132 spanLength=0; // Match strings from after a string match. 1133 } 1134 } 1135 1136 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1137 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1138 return spanNotBackUTF8(s, length); 1139 } 1140 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1141 if(pos==0) { 1142 return 0; 1143 } 1144 int32_t spanLength=length-pos; 1145 1146 // Consider strings; they may overlap with the span. 1147 OffsetList offsets; 1148 if(spanCondition==USET_SPAN_CONTAINED) { 1149 // Use offset list to try all possibilities. 1150 offsets.setMaxLength(maxLength8); 1151 } 1152 int32_t i, stringsLength=strings.size(); 1153 uint8_t *spanBackUTF8Lengths=spanLengths; 1154 if(all) { 1155 spanBackUTF8Lengths+=3*stringsLength; 1156 } 1157 for(;;) { 1158 const uint8_t *s8=utf8; 1159 int32_t length8; 1160 if(spanCondition==USET_SPAN_CONTAINED) { 1161 for(i=0; i<stringsLength; ++i) { 1162 length8=utf8Lengths[i]; 1163 if(length8==0) { 1164 continue; // String not representable in UTF-8. 1165 } 1166 int32_t overlap=spanBackUTF8Lengths[i]; 1167 if(overlap==ALL_CP_CONTAINED) { 1168 s8+=length8; 1169 continue; // Irrelevant string. 1170 } 1171 1172 // Try to match this string at pos-(length8-overlap)..pos-length8. 1173 if(overlap>=LONG_SPAN) { 1174 overlap=length8; 1175 // While contained: No point matching fully inside the code point span. 1176 int32_t len1=0; 1177 U8_FWD_1(s8, len1, overlap); 1178 overlap-=len1; // Length of the string minus the first code point. 1179 } 1180 if(overlap>spanLength) { 1181 overlap=spanLength; 1182 } 1183 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1184 for(;;) { 1185 if(dec>pos) { 1186 break; 1187 } 1188 // Try to match if the decrement is not listed already. 1189 // Match at code point boundaries. (The UTF-8 strings were converted 1190 // from UTF-16 and are guaranteed to be well-formed.) 1191 if( !U8_IS_TRAIL(s[pos-dec]) && 1192 !offsets.containsOffset(dec) && 1193 matches8(s+pos-dec, s8, length8) 1194 ) { 1195 if(dec==pos) { 1196 return 0; // Reached the start of the string. 1197 } 1198 offsets.addOffset(dec); 1199 } 1200 if(overlap==0) { 1201 break; 1202 } 1203 --overlap; 1204 ++dec; 1205 } 1206 s8+=length8; 1207 } 1208 } else /* USET_SPAN_SIMPLE */ { 1209 int32_t maxDec=0, maxOverlap=0; 1210 for(i=0; i<stringsLength; ++i) { 1211 length8=utf8Lengths[i]; 1212 if(length8==0) { 1213 continue; // String not representable in UTF-8. 1214 } 1215 int32_t overlap=spanBackUTF8Lengths[i]; 1216 // For longest match, we do need to try to match even an all-contained string 1217 // to find the match from the latest end. 1218 1219 // Try to match this string at pos-(length8-overlap)..pos-length8. 1220 if(overlap>=LONG_SPAN) { 1221 overlap=length8; 1222 // Longest match: Need to match fully inside the code point span 1223 // to find the match from the latest end. 1224 } 1225 if(overlap>spanLength) { 1226 overlap=spanLength; 1227 } 1228 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1229 for(;;) { 1230 if(dec>pos || overlap<maxOverlap) { 1231 break; 1232 } 1233 // Try to match if the string is longer or ends later. 1234 // Match at code point boundaries. (The UTF-8 strings were converted 1235 // from UTF-16 and are guaranteed to be well-formed.) 1236 if( !U8_IS_TRAIL(s[pos-dec]) && 1237 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1238 matches8(s+pos-dec, s8, length8) 1239 ) { 1240 maxDec=dec; // Longest match from latest end. 1241 maxOverlap=overlap; 1242 break; 1243 } 1244 --overlap; 1245 ++dec; 1246 } 1247 s8+=length8; 1248 } 1249 1250 if(maxDec!=0 || maxOverlap!=0) { 1251 // Longest-match algorithm, and there was a string match. 1252 // Simply continue before it. 1253 pos-=maxDec; 1254 if(pos==0) { 1255 return 0; // Reached the start of the string. 1256 } 1257 spanLength=0; // Match strings from before a string match. 1258 continue; 1259 } 1260 } 1261 // Finished trying to match all strings at pos. 1262 1263 if(spanLength!=0 || pos==length) { 1264 // The position is before an unlimited code point span (spanLength!=0), 1265 // not before a string match. 1266 // The only position where spanLength==0 before a span is pos==length. 1267 // Otherwise, an unlimited code point span is only tried again when no 1268 // strings match, and if such a non-initial span fails we stop. 1269 if(offsets.isEmpty()) { 1270 return pos; // No strings matched before a span. 1271 } 1272 // Match strings from before the next string match. 1273 } else { 1274 // The position is before a string match (or a single code point). 1275 if(offsets.isEmpty()) { 1276 // No more strings matched before a previous string match. 1277 // Try another code point span from before the last string match. 1278 int32_t oldPos=pos; 1279 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1280 spanLength=oldPos-pos; 1281 if( pos==0 || // Reached the start of the string, or 1282 spanLength==0 // neither strings nor span progressed. 1283 ) { 1284 return pos; 1285 } 1286 continue; // spanLength>0: Match strings from before a span. 1287 } else { 1288 // Try to match only one code point from before a string match if some 1289 // string matched beyond it, so that we try all possible positions 1290 // and don't overshoot. 1291 spanLength=spanOneBackUTF8(spanSet, s, pos); 1292 if(spanLength>0) { 1293 if(spanLength==pos) { 1294 return 0; // Reached the start of the string. 1295 } 1296 // Match strings before this code point. 1297 // There cannot be any decrements below it because UnicodeSet strings 1298 // contain multiple code points. 1299 pos-=spanLength; 1300 offsets.shift(spanLength); 1301 spanLength=0; 1302 continue; // Match strings from before a single code point. 1303 } 1304 // Match strings from before the next string match. 1305 } 1306 } 1307 pos-=offsets.popMinimum(); 1308 spanLength=0; // Match strings from before a string match. 1309 } 1310 } 1311 1312 /* 1313 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1314 * 1315 * Theoretical algorithm: 1316 * - Iterate through the string, and at each code point boundary: 1317 * + If the code point there is in the set, then return with the current position. 1318 * + If a set string matches at the current position, then return with the current position. 1319 * 1320 * Optimized implementation: 1321 * 1322 * (Same assumption as for span() above.) 1323 * 1324 * Create and cache a spanNotSet which contains all of the single code points 1325 * of the original set but none of its strings. 1326 * For each set string add its initial code point to the spanNotSet. 1327 * (Also add its final code point for spanNotBack().) 1328 * 1329 * - Loop: 1330 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1331 * + If the current code point is in the original set, then 1332 * return the current position. 1333 * + If any set string matches at the current position, then 1334 * return the current position. 1335 * + If there is no match at the current position, neither for the code point there 1336 * nor for any set string, then skip this code point and continue the loop. 1337 * This happens for set-string-initial code points that were added to spanNotSet 1338 * when there is not actually a match for such a set string. 1339 */ 1340 1341 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1342 int32_t pos=0, rest=length; 1343 int32_t i, stringsLength=strings.size(); 1344 do { 1345 // Span until we find a code point from the set, 1346 // or a code point that starts or ends some string. 1347 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1348 if(i==rest) { 1349 return length; // Reached the end of the string. 1350 } 1351 pos+=i; 1352 rest-=i; 1353 1354 // Check whether the current code point is in the original set, 1355 // without the string starts and ends. 1356 int32_t cpLength=spanOne(spanSet, s+pos, rest); 1357 if(cpLength>0) { 1358 return pos; // There is a set element at pos. 1359 } 1360 1361 // Try to match the strings at pos. 1362 for(i=0; i<stringsLength; ++i) { 1363 if(spanLengths[i]==ALL_CP_CONTAINED) { 1364 continue; // Irrelevant string. 1365 } 1366 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1367 const UChar *s16=string.getBuffer(); 1368 int32_t length16=string.length(); 1369 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1370 return pos; // There is a set element at pos. 1371 } 1372 } 1373 1374 // The span(while not contained) ended on a string start/end which is 1375 // not in the original set. Skip this code point and continue. 1376 // cpLength<0 1377 pos-=cpLength; 1378 rest+=cpLength; 1379 } while(rest!=0); 1380 return length; // Reached the end of the string. 1381 } 1382 1383 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1384 int32_t pos=length; 1385 int32_t i, stringsLength=strings.size(); 1386 do { 1387 // Span until we find a code point from the set, 1388 // or a code point that starts or ends some string. 1389 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1390 if(pos==0) { 1391 return 0; // Reached the start of the string. 1392 } 1393 1394 // Check whether the current code point is in the original set, 1395 // without the string starts and ends. 1396 int32_t cpLength=spanOneBack(spanSet, s, pos); 1397 if(cpLength>0) { 1398 return pos; // There is a set element at pos. 1399 } 1400 1401 // Try to match the strings at pos. 1402 for(i=0; i<stringsLength; ++i) { 1403 // Use spanLengths rather than a spanBackLengths pointer because 1404 // it is easier and we only need to know whether the string is irrelevant 1405 // which is the same in either array. 1406 if(spanLengths[i]==ALL_CP_CONTAINED) { 1407 continue; // Irrelevant string. 1408 } 1409 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1410 const UChar *s16=string.getBuffer(); 1411 int32_t length16=string.length(); 1412 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1413 return pos; // There is a set element at pos. 1414 } 1415 } 1416 1417 // The span(while not contained) ended on a string start/end which is 1418 // not in the original set. Skip this code point and continue. 1419 // cpLength<0 1420 pos+=cpLength; 1421 } while(pos!=0); 1422 return 0; // Reached the start of the string. 1423 } 1424 1425 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1426 int32_t pos=0, rest=length; 1427 int32_t i, stringsLength=strings.size(); 1428 uint8_t *spanUTF8Lengths=spanLengths; 1429 if(all) { 1430 spanUTF8Lengths+=2*stringsLength; 1431 } 1432 do { 1433 // Span until we find a code point from the set, 1434 // or a code point that starts or ends some string. 1435 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1436 if(i==rest) { 1437 return length; // Reached the end of the string. 1438 } 1439 pos+=i; 1440 rest-=i; 1441 1442 // Check whether the current code point is in the original set, 1443 // without the string starts and ends. 1444 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1445 if(cpLength>0) { 1446 return pos; // There is a set element at pos. 1447 } 1448 1449 // Try to match the strings at pos. 1450 const uint8_t *s8=utf8; 1451 int32_t length8; 1452 for(i=0; i<stringsLength; ++i) { 1453 length8=utf8Lengths[i]; 1454 // ALL_CP_CONTAINED: Irrelevant string. 1455 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1456 return pos; // There is a set element at pos. 1457 } 1458 s8+=length8; 1459 } 1460 1461 // The span(while not contained) ended on a string start/end which is 1462 // not in the original set. Skip this code point and continue. 1463 // cpLength<0 1464 pos-=cpLength; 1465 rest+=cpLength; 1466 } while(rest!=0); 1467 return length; // Reached the end of the string. 1468 } 1469 1470 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1471 int32_t pos=length; 1472 int32_t i, stringsLength=strings.size(); 1473 uint8_t *spanBackUTF8Lengths=spanLengths; 1474 if(all) { 1475 spanBackUTF8Lengths+=3*stringsLength; 1476 } 1477 do { 1478 // Span until we find a code point from the set, 1479 // or a code point that starts or ends some string. 1480 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1481 if(pos==0) { 1482 return 0; // Reached the start of the string. 1483 } 1484 1485 // Check whether the current code point is in the original set, 1486 // without the string starts and ends. 1487 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1488 if(cpLength>0) { 1489 return pos; // There is a set element at pos. 1490 } 1491 1492 // Try to match the strings at pos. 1493 const uint8_t *s8=utf8; 1494 int32_t length8; 1495 for(i=0; i<stringsLength; ++i) { 1496 length8=utf8Lengths[i]; 1497 // ALL_CP_CONTAINED: Irrelevant string. 1498 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1499 return pos; // There is a set element at pos. 1500 } 1501 s8+=length8; 1502 } 1503 1504 // The span(while not contained) ended on a string start/end which is 1505 // not in the original set. Skip this code point and continue. 1506 // cpLength<0 1507 pos+=cpLength; 1508 } while(pos!=0); 1509 return 0; // Reached the start of the string. 1510 } 1511 1512 U_NAMESPACE_END 1513