1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2007-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: unisetspan.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007mar01 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uniset.h" 21 #include "unicode/ustring.h" 22 #include "unicode/utf8.h" 23 #include "unicode/utf16.h" 24 #include "cmemory.h" 25 #include "uvector.h" 26 #include "unisetspan.h" 27 28 U_NAMESPACE_BEGIN 29 30 /* 31 * List of offsets from the current position from where to try matching 32 * a code point or a string. 33 * Store offsets rather than indexes to simplify the code and use the same list 34 * for both increments (in span()) and decrements (in spanBack()). 35 * 36 * Assumption: The maximum offset is limited, and the offsets that are stored 37 * at any one time are relatively dense, that is, there are normally no gaps of 38 * hundreds or thousands of offset values. 39 * 40 * The implementation uses a circular buffer of byte flags, 41 * each indicating whether the corresponding offset is in the list. 42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and 43 * physically moving part of the list. 44 * 45 * Note: In principle, the caller should setMaxLength() to the maximum of the 46 * max string length and U16_LENGTH/U8_LENGTH to account for 47 * "long" single code points. 48 * However, this implementation uses at least a staticList with more than 49 * U8_LENGTH entries anyway. 50 * 51 * Note: If maxLength were guaranteed to be no more than 32 or 64, 52 * the list could be stored as bit flags in a single integer. 53 * Rather than handling a circular buffer with a start list index, 54 * the integer would simply be shifted when lower offsets are removed. 55 * UnicodeSet does not have a limit on the lengths of strings. 56 */ 57 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 58 public: 59 OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 60 61 ~OffsetList() { 62 if(list!=staticList) { 63 uprv_free(list); 64 } 65 } 66 67 // Call exactly once if the list is to be used. 68 void setMaxLength(int32_t maxLength) { 69 if(maxLength<=(int32_t)sizeof(staticList)) { 70 capacity=(int32_t)sizeof(staticList); 71 } else { 72 UBool *l=(UBool *)uprv_malloc(maxLength); 73 if(l!=NULL) { 74 list=l; 75 capacity=maxLength; 76 } 77 } 78 uprv_memset(list, 0, capacity); 79 } 80 81 void clear() { 82 uprv_memset(list, 0, capacity); 83 start=length=0; 84 } 85 86 UBool isEmpty() const { 87 return (UBool)(length==0); 88 } 89 90 // Reduce all stored offsets by delta, used when the current position 91 // moves by delta. 92 // There must not be any offsets lower than delta. 93 // If there is an offset equal to delta, it is removed. 94 // delta=[1..maxLength] 95 void shift(int32_t delta) { 96 int32_t i=start+delta; 97 if(i>=capacity) { 98 i-=capacity; 99 } 100 if(list[i]) { 101 list[i]=FALSE; 102 --length; 103 } 104 start=i; 105 } 106 107 // Add an offset. The list must not contain it yet. 108 // offset=[1..maxLength] 109 void addOffset(int32_t offset) { 110 int32_t i=start+offset; 111 if(i>=capacity) { 112 i-=capacity; 113 } 114 list[i]=TRUE; 115 ++length; 116 } 117 118 // offset=[1..maxLength] 119 UBool containsOffset(int32_t offset) const { 120 int32_t i=start+offset; 121 if(i>=capacity) { 122 i-=capacity; 123 } 124 return list[i]; 125 } 126 127 // Find the lowest stored offset from a non-empty list, remove it, 128 // and reduce all other offsets by this minimum. 129 // Returns [1..maxLength]. 130 int32_t popMinimum() { 131 // Look for the next offset in list[start+1..capacity-1]. 132 int32_t i=start, result; 133 while(++i<capacity) { 134 if(list[i]) { 135 list[i]=FALSE; 136 --length; 137 result=i-start; 138 start=i; 139 return result; 140 } 141 } 142 // i==capacity 143 144 // Wrap around and look for the next offset in list[0..start]. 145 // Since the list is not empty, there will be one. 146 result=capacity-start; 147 i=0; 148 while(!list[i]) { 149 ++i; 150 } 151 list[i]=FALSE; 152 --length; 153 start=i; 154 return result+=i; 155 } 156 157 private: 158 UBool *list; 159 int32_t capacity; 160 int32_t length; 161 int32_t start; 162 163 UBool staticList[16]; 164 }; 165 166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string. 167 static int32_t 168 getUTF8Length(const UChar *s, int32_t length) { 169 UErrorCode errorCode=U_ZERO_ERROR; 170 int32_t length8=0; 171 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 172 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 173 return length8; 174 } else { 175 // The string contains an unpaired surrogate. 176 // Ignore this string. 177 return 0; 178 } 179 } 180 181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 182 static int32_t 183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 184 UErrorCode errorCode=U_ZERO_ERROR; 185 int32_t length8=0; 186 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 187 if(U_SUCCESS(errorCode)) { 188 return length8; 189 } else { 190 // The string contains an unpaired surrogate. 191 // Ignore this string. 192 return 0; 193 } 194 } 195 196 static inline uint8_t 197 makeSpanLengthByte(int32_t spanLength) { 198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 199 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 200 } 201 202 // Construct for all variants of span(), or only for any one variant. 203 // Initialize as little as possible, for single use. 204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 205 const UVector &setStrings, 206 uint32_t which) 207 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 208 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 209 utf8Length(0), 210 maxLength16(0), maxLength8(0), 211 all((UBool)(which==ALL)) { 212 spanSet.retainAll(set); 213 if(which&NOT_CONTAINED) { 214 // Default to the same sets. 215 // addToSpanNotSet() will create a separate set if necessary. 216 pSpanNotSet=&spanSet; 217 } 218 219 // Determine if the strings even need to be taken into account at all for span() etc. 220 // If any string is relevant, then all strings need to be used for 221 // span(longest match) but only the relevant ones for span(while contained). 222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 226 int32_t stringsLength=strings.size(); 227 228 int32_t i, spanLength; 229 UBool someRelevant=FALSE; 230 for(i=0; i<stringsLength; ++i) { 231 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 232 const UChar *s16=string.getBuffer(); 233 int32_t length16=string.length(); 234 UBool thisRelevant; 235 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 236 if(spanLength<length16) { // Relevant string. 237 someRelevant=thisRelevant=TRUE; 238 } else { 239 thisRelevant=FALSE; 240 } 241 if((which&UTF16) && length16>maxLength16) { 242 maxLength16=length16; 243 } 244 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 245 int32_t length8=getUTF8Length(s16, length16); 246 utf8Length+=length8; 247 if(length8>maxLength8) { 248 maxLength8=length8; 249 } 250 } 251 } 252 if(!someRelevant) { 253 maxLength16=maxLength8=0; 254 return; 255 } 256 257 // Freeze after checking for the need to use strings at all because freezing 258 // a set takes some time and memory which are wasted if there are no relevant strings. 259 if(all) { 260 spanSet.freeze(); 261 } 262 263 uint8_t *spanBackLengths; 264 uint8_t *spanUTF8Lengths; 265 uint8_t *spanBackUTF8Lengths; 266 267 // Allocate a block of meta data. 268 int32_t allocSize; 269 if(all) { 270 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 271 allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 272 } else { 273 allocSize=stringsLength; // One set of span lengths. 274 if(which&UTF8) { 275 // UTF-8 lengths and UTF-8 strings. 276 allocSize+=stringsLength*4+utf8Length; 277 } 278 } 279 if(allocSize<=(int32_t)sizeof(staticLengths)) { 280 utf8Lengths=staticLengths; 281 } else { 282 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 283 if(utf8Lengths==NULL) { 284 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 285 return; // Out of memory. 286 } 287 } 288 289 if(all) { 290 // Store span lengths for all span() variants. 291 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 292 spanBackLengths=spanLengths+stringsLength; 293 spanUTF8Lengths=spanBackLengths+stringsLength; 294 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 295 utf8=spanBackUTF8Lengths+stringsLength; 296 } else { 297 // Store span lengths for only one span() variant. 298 if(which&UTF8) { 299 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 300 utf8=spanLengths+stringsLength; 301 } else { 302 spanLengths=(uint8_t *)utf8Lengths; 303 } 304 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 305 } 306 307 // Set the meta data and pSpanNotSet and write the UTF-8 strings. 308 int32_t utf8Count=0; // Count UTF-8 bytes written so far. 309 310 for(i=0; i<stringsLength; ++i) { 311 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 312 const UChar *s16=string.getBuffer(); 313 int32_t length16=string.length(); 314 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 315 if(spanLength<length16) { // Relevant string. 316 if(which&UTF16) { 317 if(which&CONTAINED) { 318 if(which&FWD) { 319 spanLengths[i]=makeSpanLengthByte(spanLength); 320 } 321 if(which&BACK) { 322 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 323 spanBackLengths[i]=makeSpanLengthByte(spanLength); 324 } 325 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 326 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 327 } 328 } 329 if(which&UTF8) { 330 uint8_t *s8=utf8+utf8Count; 331 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 332 utf8Count+=utf8Lengths[i]=length8; 333 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 334 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 335 } else { // Relevant for UTF-8. 336 if(which&CONTAINED) { 337 if(which&FWD) { 338 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 339 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 340 } 341 if(which&BACK) { 342 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 343 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 344 } 345 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 346 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 347 } 348 } 349 } 350 if(which&NOT_CONTAINED) { 351 // Add string start and end code points to the spanNotSet so that 352 // a span(while not contained) stops before any string. 353 UChar32 c; 354 if(which&FWD) { 355 int32_t len=0; 356 U16_NEXT(s16, len, length16, c); 357 addToSpanNotSet(c); 358 } 359 if(which&BACK) { 360 int32_t len=length16; 361 U16_PREV(s16, 0, len, c); 362 addToSpanNotSet(c); 363 } 364 } 365 } else { // Irrelevant string. 366 if(which&UTF8) { 367 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 368 uint8_t *s8=utf8+utf8Count; 369 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 370 utf8Count+=utf8Lengths[i]=length8; 371 } else { 372 utf8Lengths[i]=0; 373 } 374 } 375 if(all) { 376 spanLengths[i]=spanBackLengths[i]= 377 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 378 (uint8_t)ALL_CP_CONTAINED; 379 } else { 380 // All spanXYZLengths pointers contain the same address. 381 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 382 } 383 } 384 } 385 386 // Finish. 387 if(all) { 388 pSpanNotSet->freeze(); 389 } 390 } 391 392 // Copy constructor. Assumes which==ALL for a frozen set. 393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 394 const UVector &newParentSetStrings) 395 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 396 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 397 utf8Length(otherStringSpan.utf8Length), 398 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 399 all(TRUE) { 400 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 401 pSpanNotSet=&spanSet; 402 } else { 403 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 404 } 405 406 // Allocate a block of meta data. 407 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 408 int32_t stringsLength=strings.size(); 409 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 410 if(allocSize<=(int32_t)sizeof(staticLengths)) { 411 utf8Lengths=staticLengths; 412 } else { 413 utf8Lengths=(int32_t *)uprv_malloc(allocSize); 414 if(utf8Lengths==NULL) { 415 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 416 return; // Out of memory. 417 } 418 } 419 420 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 421 utf8=spanLengths+stringsLength*4; 422 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 423 } 424 425 UnicodeSetStringSpan::~UnicodeSetStringSpan() { 426 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 427 delete pSpanNotSet; 428 } 429 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 430 uprv_free(utf8Lengths); 431 } 432 } 433 434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 435 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 436 if(spanSet.contains(c)) { 437 return; // Nothing to do. 438 } 439 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 440 if(newSet==NULL) { 441 return; // Out of memory. 442 } else { 443 pSpanNotSet=newSet; 444 } 445 } 446 pSpanNotSet->add(c); 447 } 448 449 // Compare strings without any argument checks. Requires length>0. 450 static inline UBool 451 matches16(const UChar *s, const UChar *t, int32_t length) { 452 do { 453 if(*s++!=*t++) { 454 return FALSE; 455 } 456 } while(--length>0); 457 return TRUE; 458 } 459 460 static inline UBool 461 matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 462 do { 463 if(*s++!=*t++) { 464 return FALSE; 465 } 466 } while(--length>0); 467 return TRUE; 468 } 469 470 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 471 // at code point boundaries. 472 // That is, each edge of a match must not be in the middle of a surrogate pair. 473 static inline UBool 474 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 475 s+=start; 476 limit-=start; 477 return matches16(s, t, length) && 478 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 479 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 480 } 481 482 // Does the set contain the next code point? 483 // If so, return its length; otherwise return its negative length. 484 static inline int32_t 485 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 486 UChar c=*s, c2; 487 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 488 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 489 } 490 return set.contains(c) ? 1 : -1; 491 } 492 493 static inline int32_t 494 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 495 UChar c=s[length-1], c2; 496 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 497 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 498 } 499 return set.contains(c) ? 1 : -1; 500 } 501 502 static inline int32_t 503 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 504 UChar32 c=*s; 505 if(U8_IS_SINGLE(c)) { 506 return set.contains(c) ? 1 : -1; 507 } 508 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 509 int32_t i=0; 510 U8_NEXT_OR_FFFD(s, i, length, c); 511 return set.contains(c) ? i : -i; 512 } 513 514 static inline int32_t 515 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 516 UChar32 c=s[length-1]; 517 if(U8_IS_SINGLE(c)) { 518 return set.contains(c) ? 1 : -1; 519 } 520 int32_t i=length-1; 521 c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 522 length-=i; 523 return set.contains(c) ? length : -length; 524 } 525 526 /* 527 * Note: In span() when spanLength==0 (after a string match, or at the beginning 528 * after an empty code point span) and in spanNot() and spanNotUTF8(), 529 * string matching could use a binary search 530 * because all string matches are done from the same start index. 531 * 532 * For UTF-8, this would require a comparison function that returns UTF-16 order. 533 * 534 * This optimization should not be necessary for normal UnicodeSets because 535 * most sets have no strings, and most sets with strings have 536 * very few very short strings. 537 * For cases with many strings, it might be better to use a different API 538 * and implementation with a DFA (state machine). 539 */ 540 541 /* 542 * Algorithm for span(USET_SPAN_CONTAINED) 543 * 544 * Theoretical algorithm: 545 * - Iterate through the string, and at each code point boundary: 546 * + If the code point there is in the set, then remember to continue after it. 547 * + If a set string matches at the current position, then remember to continue after it. 548 * + Either recursively span for each code point or string match, 549 * or recursively span for all but the shortest one and 550 * iteratively continue the span with the shortest local match. 551 * + Remember the longest recursive span (the farthest end point). 552 * + If there is no match at the current position, neither for the code point there 553 * nor for any set string, then stop and return the longest recursive span length. 554 * 555 * Optimized implementation: 556 * 557 * (We assume that most sets will have very few very short strings. 558 * A span using a string-less set is extremely fast.) 559 * 560 * Create and cache a spanSet which contains all of the single code points 561 * of the original set but none of its strings. 562 * 563 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 564 * - Loop: 565 * + Try to match each set string at the end of the spanLength. 566 * ~ Set strings that start with set-contained code points must be matched 567 * with a partial overlap because the recursive algorithm would have tried 568 * to match them at every position. 569 * ~ Set strings that entirely consist of set-contained code points 570 * are irrelevant for span(USET_SPAN_CONTAINED) because the 571 * recursive algorithm would continue after them anyway 572 * and find the longest recursive match from their end. 573 * ~ Rather than recursing, note each end point of a set string match. 574 * + If no set string matched after spanSet.span(), then return 575 * with where the spanSet.span() ended. 576 * + If at least one set string matched after spanSet.span(), then 577 * pop the shortest string match end point and continue 578 * the loop, trying to match all set strings from there. 579 * + If at least one more set string matched after a previous string match, 580 * then test if the code point after the previous string match is also 581 * contained in the set. 582 * Continue the loop with the shortest end point of either this code point 583 * or a matching set string. 584 * + If no more set string matched after a previous string match, 585 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 586 * Stop if spanLength==0, otherwise continue the loop. 587 * 588 * By noting each end point of a set string match, 589 * the function visits each string position at most once and finishes 590 * in linear time. 591 * 592 * The recursive algorithm may visit the same string position many times 593 * if multiple paths lead to it and finishes in exponential time. 594 */ 595 596 /* 597 * Algorithm for span(USET_SPAN_SIMPLE) 598 * 599 * Theoretical algorithm: 600 * - Iterate through the string, and at each code point boundary: 601 * + If the code point there is in the set, then remember to continue after it. 602 * + If a set string matches at the current position, then remember to continue after it. 603 * + Continue from the farthest match position and ignore all others. 604 * + If there is no match at the current position, 605 * then stop and return the current position. 606 * 607 * Optimized implementation: 608 * 609 * (Same assumption and spanSet as above.) 610 * 611 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 612 * - Loop: 613 * + Try to match each set string at the end of the spanLength. 614 * ~ Set strings that start with set-contained code points must be matched 615 * with a partial overlap because the standard algorithm would have tried 616 * to match them earlier. 617 * ~ Set strings that entirely consist of set-contained code points 618 * must be matched with a full overlap because the longest-match algorithm 619 * would hide set string matches that end earlier. 620 * Such set strings need not be matched earlier inside the code point span 621 * because the standard algorithm would then have continued after 622 * the set string match anyway. 623 * ~ Remember the longest set string match (farthest end point) from the earliest 624 * starting point. 625 * + If no set string matched after spanSet.span(), then return 626 * with where the spanSet.span() ended. 627 * + If at least one set string matched, then continue the loop after the 628 * longest match from the earliest position. 629 * + If no more set string matched after a previous string match, 630 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 631 * Stop if spanLength==0, otherwise continue the loop. 632 */ 633 634 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 635 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 636 return spanNot(s, length); 637 } 638 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 639 if(spanLength==length) { 640 return length; 641 } 642 643 // Consider strings; they may overlap with the span. 644 OffsetList offsets; 645 if(spanCondition==USET_SPAN_CONTAINED) { 646 // Use offset list to try all possibilities. 647 offsets.setMaxLength(maxLength16); 648 } 649 int32_t pos=spanLength, rest=length-pos; 650 int32_t i, stringsLength=strings.size(); 651 for(;;) { 652 if(spanCondition==USET_SPAN_CONTAINED) { 653 for(i=0; i<stringsLength; ++i) { 654 int32_t overlap=spanLengths[i]; 655 if(overlap==ALL_CP_CONTAINED) { 656 continue; // Irrelevant string. 657 } 658 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 659 const UChar *s16=string.getBuffer(); 660 int32_t length16=string.length(); 661 662 // Try to match this string at pos-overlap..pos. 663 if(overlap>=LONG_SPAN) { 664 overlap=length16; 665 // While contained: No point matching fully inside the code point span. 666 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 667 } 668 if(overlap>spanLength) { 669 overlap=spanLength; 670 } 671 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 672 for(;;) { 673 if(inc>rest) { 674 break; 675 } 676 // Try to match if the increment is not listed already. 677 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 678 if(inc==rest) { 679 return length; // Reached the end of the string. 680 } 681 offsets.addOffset(inc); 682 } 683 if(overlap==0) { 684 break; 685 } 686 --overlap; 687 ++inc; 688 } 689 } 690 } else /* USET_SPAN_SIMPLE */ { 691 int32_t maxInc=0, maxOverlap=0; 692 for(i=0; i<stringsLength; ++i) { 693 int32_t overlap=spanLengths[i]; 694 // For longest match, we do need to try to match even an all-contained string 695 // to find the match from the earliest start. 696 697 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 698 const UChar *s16=string.getBuffer(); 699 int32_t length16=string.length(); 700 701 // Try to match this string at pos-overlap..pos. 702 if(overlap>=LONG_SPAN) { 703 overlap=length16; 704 // Longest match: Need to match fully inside the code point span 705 // to find the match from the earliest start. 706 } 707 if(overlap>spanLength) { 708 overlap=spanLength; 709 } 710 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 711 for(;;) { 712 if(inc>rest || overlap<maxOverlap) { 713 break; 714 } 715 // Try to match if the string is longer or starts earlier. 716 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 717 matches16CPB(s, pos-overlap, length, s16, length16) 718 ) { 719 maxInc=inc; // Longest match from earliest start. 720 maxOverlap=overlap; 721 break; 722 } 723 --overlap; 724 ++inc; 725 } 726 } 727 728 if(maxInc!=0 || maxOverlap!=0) { 729 // Longest-match algorithm, and there was a string match. 730 // Simply continue after it. 731 pos+=maxInc; 732 rest-=maxInc; 733 if(rest==0) { 734 return length; // Reached the end of the string. 735 } 736 spanLength=0; // Match strings from after a string match. 737 continue; 738 } 739 } 740 // Finished trying to match all strings at pos. 741 742 if(spanLength!=0 || pos==0) { 743 // The position is after an unlimited code point span (spanLength!=0), 744 // not after a string match. 745 // The only position where spanLength==0 after a span is pos==0. 746 // Otherwise, an unlimited code point span is only tried again when no 747 // strings match, and if such a non-initial span fails we stop. 748 if(offsets.isEmpty()) { 749 return pos; // No strings matched after a span. 750 } 751 // Match strings from after the next string match. 752 } else { 753 // The position is after a string match (or a single code point). 754 if(offsets.isEmpty()) { 755 // No more strings matched after a previous string match. 756 // Try another code point span from after the last string match. 757 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 758 if( spanLength==rest || // Reached the end of the string, or 759 spanLength==0 // neither strings nor span progressed. 760 ) { 761 return pos+spanLength; 762 } 763 pos+=spanLength; 764 rest-=spanLength; 765 continue; // spanLength>0: Match strings from after a span. 766 } else { 767 // Try to match only one code point from after a string match if some 768 // string matched beyond it, so that we try all possible positions 769 // and don't overshoot. 770 spanLength=spanOne(spanSet, s+pos, rest); 771 if(spanLength>0) { 772 if(spanLength==rest) { 773 return length; // Reached the end of the string. 774 } 775 // Match strings after this code point. 776 // There cannot be any increments below it because UnicodeSet strings 777 // contain multiple code points. 778 pos+=spanLength; 779 rest-=spanLength; 780 offsets.shift(spanLength); 781 spanLength=0; 782 continue; // Match strings from after a single code point. 783 } 784 // Match strings from after the next string match. 785 } 786 } 787 int32_t minOffset=offsets.popMinimum(); 788 pos+=minOffset; 789 rest-=minOffset; 790 spanLength=0; // Match strings from after a string match. 791 } 792 } 793 794 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 795 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 796 return spanNotBack(s, length); 797 } 798 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 799 if(pos==0) { 800 return 0; 801 } 802 int32_t spanLength=length-pos; 803 804 // Consider strings; they may overlap with the span. 805 OffsetList offsets; 806 if(spanCondition==USET_SPAN_CONTAINED) { 807 // Use offset list to try all possibilities. 808 offsets.setMaxLength(maxLength16); 809 } 810 int32_t i, stringsLength=strings.size(); 811 uint8_t *spanBackLengths=spanLengths; 812 if(all) { 813 spanBackLengths+=stringsLength; 814 } 815 for(;;) { 816 if(spanCondition==USET_SPAN_CONTAINED) { 817 for(i=0; i<stringsLength; ++i) { 818 int32_t overlap=spanBackLengths[i]; 819 if(overlap==ALL_CP_CONTAINED) { 820 continue; // Irrelevant string. 821 } 822 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 823 const UChar *s16=string.getBuffer(); 824 int32_t length16=string.length(); 825 826 // Try to match this string at pos-(length16-overlap)..pos-length16. 827 if(overlap>=LONG_SPAN) { 828 overlap=length16; 829 // While contained: No point matching fully inside the code point span. 830 int32_t len1=0; 831 U16_FWD_1(s16, len1, overlap); 832 overlap-=len1; // Length of the string minus the first code point. 833 } 834 if(overlap>spanLength) { 835 overlap=spanLength; 836 } 837 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 838 for(;;) { 839 if(dec>pos) { 840 break; 841 } 842 // Try to match if the decrement is not listed already. 843 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 844 if(dec==pos) { 845 return 0; // Reached the start of the string. 846 } 847 offsets.addOffset(dec); 848 } 849 if(overlap==0) { 850 break; 851 } 852 --overlap; 853 ++dec; 854 } 855 } 856 } else /* USET_SPAN_SIMPLE */ { 857 int32_t maxDec=0, maxOverlap=0; 858 for(i=0; i<stringsLength; ++i) { 859 int32_t overlap=spanBackLengths[i]; 860 // For longest match, we do need to try to match even an all-contained string 861 // to find the match from the latest end. 862 863 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 864 const UChar *s16=string.getBuffer(); 865 int32_t length16=string.length(); 866 867 // Try to match this string at pos-(length16-overlap)..pos-length16. 868 if(overlap>=LONG_SPAN) { 869 overlap=length16; 870 // Longest match: Need to match fully inside the code point span 871 // to find the match from the latest end. 872 } 873 if(overlap>spanLength) { 874 overlap=spanLength; 875 } 876 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 877 for(;;) { 878 if(dec>pos || overlap<maxOverlap) { 879 break; 880 } 881 // Try to match if the string is longer or ends later. 882 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 883 matches16CPB(s, pos-dec, length, s16, length16) 884 ) { 885 maxDec=dec; // Longest match from latest end. 886 maxOverlap=overlap; 887 break; 888 } 889 --overlap; 890 ++dec; 891 } 892 } 893 894 if(maxDec!=0 || maxOverlap!=0) { 895 // Longest-match algorithm, and there was a string match. 896 // Simply continue before it. 897 pos-=maxDec; 898 if(pos==0) { 899 return 0; // Reached the start of the string. 900 } 901 spanLength=0; // Match strings from before a string match. 902 continue; 903 } 904 } 905 // Finished trying to match all strings at pos. 906 907 if(spanLength!=0 || pos==length) { 908 // The position is before an unlimited code point span (spanLength!=0), 909 // not before a string match. 910 // The only position where spanLength==0 before a span is pos==length. 911 // Otherwise, an unlimited code point span is only tried again when no 912 // strings match, and if such a non-initial span fails we stop. 913 if(offsets.isEmpty()) { 914 return pos; // No strings matched before a span. 915 } 916 // Match strings from before the next string match. 917 } else { 918 // The position is before a string match (or a single code point). 919 if(offsets.isEmpty()) { 920 // No more strings matched before a previous string match. 921 // Try another code point span from before the last string match. 922 int32_t oldPos=pos; 923 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 924 spanLength=oldPos-pos; 925 if( pos==0 || // Reached the start of the string, or 926 spanLength==0 // neither strings nor span progressed. 927 ) { 928 return pos; 929 } 930 continue; // spanLength>0: Match strings from before a span. 931 } else { 932 // Try to match only one code point from before a string match if some 933 // string matched beyond it, so that we try all possible positions 934 // and don't overshoot. 935 spanLength=spanOneBack(spanSet, s, pos); 936 if(spanLength>0) { 937 if(spanLength==pos) { 938 return 0; // Reached the start of the string. 939 } 940 // Match strings before this code point. 941 // There cannot be any decrements below it because UnicodeSet strings 942 // contain multiple code points. 943 pos-=spanLength; 944 offsets.shift(spanLength); 945 spanLength=0; 946 continue; // Match strings from before a single code point. 947 } 948 // Match strings from before the next string match. 949 } 950 } 951 pos-=offsets.popMinimum(); 952 spanLength=0; // Match strings from before a string match. 953 } 954 } 955 956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 957 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 958 return spanNotUTF8(s, length); 959 } 960 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 961 if(spanLength==length) { 962 return length; 963 } 964 965 // Consider strings; they may overlap with the span. 966 OffsetList offsets; 967 if(spanCondition==USET_SPAN_CONTAINED) { 968 // Use offset list to try all possibilities. 969 offsets.setMaxLength(maxLength8); 970 } 971 int32_t pos=spanLength, rest=length-pos; 972 int32_t i, stringsLength=strings.size(); 973 uint8_t *spanUTF8Lengths=spanLengths; 974 if(all) { 975 spanUTF8Lengths+=2*stringsLength; 976 } 977 for(;;) { 978 const uint8_t *s8=utf8; 979 int32_t length8; 980 if(spanCondition==USET_SPAN_CONTAINED) { 981 for(i=0; i<stringsLength; ++i) { 982 length8=utf8Lengths[i]; 983 if(length8==0) { 984 continue; // String not representable in UTF-8. 985 } 986 int32_t overlap=spanUTF8Lengths[i]; 987 if(overlap==ALL_CP_CONTAINED) { 988 s8+=length8; 989 continue; // Irrelevant string. 990 } 991 992 // Try to match this string at pos-overlap..pos. 993 if(overlap>=LONG_SPAN) { 994 overlap=length8; 995 // While contained: No point matching fully inside the code point span. 996 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 997 } 998 if(overlap>spanLength) { 999 overlap=spanLength; 1000 } 1001 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1002 for(;;) { 1003 if(inc>rest) { 1004 break; 1005 } 1006 // Try to match if the increment is not listed already. 1007 // Match at code point boundaries. (The UTF-8 strings were converted 1008 // from UTF-16 and are guaranteed to be well-formed.) 1009 if(!U8_IS_TRAIL(s[pos-overlap]) && 1010 !offsets.containsOffset(inc) && 1011 matches8(s+pos-overlap, s8, length8)) { 1012 if(inc==rest) { 1013 return length; // Reached the end of the string. 1014 } 1015 offsets.addOffset(inc); 1016 } 1017 if(overlap==0) { 1018 break; 1019 } 1020 --overlap; 1021 ++inc; 1022 } 1023 s8+=length8; 1024 } 1025 } else /* USET_SPAN_SIMPLE */ { 1026 int32_t maxInc=0, maxOverlap=0; 1027 for(i=0; i<stringsLength; ++i) { 1028 length8=utf8Lengths[i]; 1029 if(length8==0) { 1030 continue; // String not representable in UTF-8. 1031 } 1032 int32_t overlap=spanUTF8Lengths[i]; 1033 // For longest match, we do need to try to match even an all-contained string 1034 // to find the match from the earliest start. 1035 1036 // Try to match this string at pos-overlap..pos. 1037 if(overlap>=LONG_SPAN) { 1038 overlap=length8; 1039 // Longest match: Need to match fully inside the code point span 1040 // to find the match from the earliest start. 1041 } 1042 if(overlap>spanLength) { 1043 overlap=spanLength; 1044 } 1045 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1046 for(;;) { 1047 if(inc>rest || overlap<maxOverlap) { 1048 break; 1049 } 1050 // Try to match if the string is longer or starts earlier. 1051 // Match at code point boundaries. (The UTF-8 strings were converted 1052 // from UTF-16 and are guaranteed to be well-formed.) 1053 if(!U8_IS_TRAIL(s[pos-overlap]) && 1054 (overlap>maxOverlap || 1055 /* redundant overlap==maxOverlap && */ inc>maxInc) && 1056 matches8(s+pos-overlap, s8, length8)) { 1057 maxInc=inc; // Longest match from earliest start. 1058 maxOverlap=overlap; 1059 break; 1060 } 1061 --overlap; 1062 ++inc; 1063 } 1064 s8+=length8; 1065 } 1066 1067 if(maxInc!=0 || maxOverlap!=0) { 1068 // Longest-match algorithm, and there was a string match. 1069 // Simply continue after it. 1070 pos+=maxInc; 1071 rest-=maxInc; 1072 if(rest==0) { 1073 return length; // Reached the end of the string. 1074 } 1075 spanLength=0; // Match strings from after a string match. 1076 continue; 1077 } 1078 } 1079 // Finished trying to match all strings at pos. 1080 1081 if(spanLength!=0 || pos==0) { 1082 // The position is after an unlimited code point span (spanLength!=0), 1083 // not after a string match. 1084 // The only position where spanLength==0 after a span is pos==0. 1085 // Otherwise, an unlimited code point span is only tried again when no 1086 // strings match, and if such a non-initial span fails we stop. 1087 if(offsets.isEmpty()) { 1088 return pos; // No strings matched after a span. 1089 } 1090 // Match strings from after the next string match. 1091 } else { 1092 // The position is after a string match (or a single code point). 1093 if(offsets.isEmpty()) { 1094 // No more strings matched after a previous string match. 1095 // Try another code point span from after the last string match. 1096 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1097 if( spanLength==rest || // Reached the end of the string, or 1098 spanLength==0 // neither strings nor span progressed. 1099 ) { 1100 return pos+spanLength; 1101 } 1102 pos+=spanLength; 1103 rest-=spanLength; 1104 continue; // spanLength>0: Match strings from after a span. 1105 } else { 1106 // Try to match only one code point from after a string match if some 1107 // string matched beyond it, so that we try all possible positions 1108 // and don't overshoot. 1109 spanLength=spanOneUTF8(spanSet, s+pos, rest); 1110 if(spanLength>0) { 1111 if(spanLength==rest) { 1112 return length; // Reached the end of the string. 1113 } 1114 // Match strings after this code point. 1115 // There cannot be any increments below it because UnicodeSet strings 1116 // contain multiple code points. 1117 pos+=spanLength; 1118 rest-=spanLength; 1119 offsets.shift(spanLength); 1120 spanLength=0; 1121 continue; // Match strings from after a single code point. 1122 } 1123 // Match strings from after the next string match. 1124 } 1125 } 1126 int32_t minOffset=offsets.popMinimum(); 1127 pos+=minOffset; 1128 rest-=minOffset; 1129 spanLength=0; // Match strings from after a string match. 1130 } 1131 } 1132 1133 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1134 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1135 return spanNotBackUTF8(s, length); 1136 } 1137 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1138 if(pos==0) { 1139 return 0; 1140 } 1141 int32_t spanLength=length-pos; 1142 1143 // Consider strings; they may overlap with the span. 1144 OffsetList offsets; 1145 if(spanCondition==USET_SPAN_CONTAINED) { 1146 // Use offset list to try all possibilities. 1147 offsets.setMaxLength(maxLength8); 1148 } 1149 int32_t i, stringsLength=strings.size(); 1150 uint8_t *spanBackUTF8Lengths=spanLengths; 1151 if(all) { 1152 spanBackUTF8Lengths+=3*stringsLength; 1153 } 1154 for(;;) { 1155 const uint8_t *s8=utf8; 1156 int32_t length8; 1157 if(spanCondition==USET_SPAN_CONTAINED) { 1158 for(i=0; i<stringsLength; ++i) { 1159 length8=utf8Lengths[i]; 1160 if(length8==0) { 1161 continue; // String not representable in UTF-8. 1162 } 1163 int32_t overlap=spanBackUTF8Lengths[i]; 1164 if(overlap==ALL_CP_CONTAINED) { 1165 s8+=length8; 1166 continue; // Irrelevant string. 1167 } 1168 1169 // Try to match this string at pos-(length8-overlap)..pos-length8. 1170 if(overlap>=LONG_SPAN) { 1171 overlap=length8; 1172 // While contained: No point matching fully inside the code point span. 1173 int32_t len1=0; 1174 U8_FWD_1(s8, len1, overlap); 1175 overlap-=len1; // Length of the string minus the first code point. 1176 } 1177 if(overlap>spanLength) { 1178 overlap=spanLength; 1179 } 1180 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1181 for(;;) { 1182 if(dec>pos) { 1183 break; 1184 } 1185 // Try to match if the decrement is not listed already. 1186 // Match at code point boundaries. (The UTF-8 strings were converted 1187 // from UTF-16 and are guaranteed to be well-formed.) 1188 if( !U8_IS_TRAIL(s[pos-dec]) && 1189 !offsets.containsOffset(dec) && 1190 matches8(s+pos-dec, s8, length8) 1191 ) { 1192 if(dec==pos) { 1193 return 0; // Reached the start of the string. 1194 } 1195 offsets.addOffset(dec); 1196 } 1197 if(overlap==0) { 1198 break; 1199 } 1200 --overlap; 1201 ++dec; 1202 } 1203 s8+=length8; 1204 } 1205 } else /* USET_SPAN_SIMPLE */ { 1206 int32_t maxDec=0, maxOverlap=0; 1207 for(i=0; i<stringsLength; ++i) { 1208 length8=utf8Lengths[i]; 1209 if(length8==0) { 1210 continue; // String not representable in UTF-8. 1211 } 1212 int32_t overlap=spanBackUTF8Lengths[i]; 1213 // For longest match, we do need to try to match even an all-contained string 1214 // to find the match from the latest end. 1215 1216 // Try to match this string at pos-(length8-overlap)..pos-length8. 1217 if(overlap>=LONG_SPAN) { 1218 overlap=length8; 1219 // Longest match: Need to match fully inside the code point span 1220 // to find the match from the latest end. 1221 } 1222 if(overlap>spanLength) { 1223 overlap=spanLength; 1224 } 1225 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1226 for(;;) { 1227 if(dec>pos || overlap<maxOverlap) { 1228 break; 1229 } 1230 // Try to match if the string is longer or ends later. 1231 // Match at code point boundaries. (The UTF-8 strings were converted 1232 // from UTF-16 and are guaranteed to be well-formed.) 1233 if( !U8_IS_TRAIL(s[pos-dec]) && 1234 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1235 matches8(s+pos-dec, s8, length8) 1236 ) { 1237 maxDec=dec; // Longest match from latest end. 1238 maxOverlap=overlap; 1239 break; 1240 } 1241 --overlap; 1242 ++dec; 1243 } 1244 s8+=length8; 1245 } 1246 1247 if(maxDec!=0 || maxOverlap!=0) { 1248 // Longest-match algorithm, and there was a string match. 1249 // Simply continue before it. 1250 pos-=maxDec; 1251 if(pos==0) { 1252 return 0; // Reached the start of the string. 1253 } 1254 spanLength=0; // Match strings from before a string match. 1255 continue; 1256 } 1257 } 1258 // Finished trying to match all strings at pos. 1259 1260 if(spanLength!=0 || pos==length) { 1261 // The position is before an unlimited code point span (spanLength!=0), 1262 // not before a string match. 1263 // The only position where spanLength==0 before a span is pos==length. 1264 // Otherwise, an unlimited code point span is only tried again when no 1265 // strings match, and if such a non-initial span fails we stop. 1266 if(offsets.isEmpty()) { 1267 return pos; // No strings matched before a span. 1268 } 1269 // Match strings from before the next string match. 1270 } else { 1271 // The position is before a string match (or a single code point). 1272 if(offsets.isEmpty()) { 1273 // No more strings matched before a previous string match. 1274 // Try another code point span from before the last string match. 1275 int32_t oldPos=pos; 1276 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1277 spanLength=oldPos-pos; 1278 if( pos==0 || // Reached the start of the string, or 1279 spanLength==0 // neither strings nor span progressed. 1280 ) { 1281 return pos; 1282 } 1283 continue; // spanLength>0: Match strings from before a span. 1284 } else { 1285 // Try to match only one code point from before a string match if some 1286 // string matched beyond it, so that we try all possible positions 1287 // and don't overshoot. 1288 spanLength=spanOneBackUTF8(spanSet, s, pos); 1289 if(spanLength>0) { 1290 if(spanLength==pos) { 1291 return 0; // Reached the start of the string. 1292 } 1293 // Match strings before this code point. 1294 // There cannot be any decrements below it because UnicodeSet strings 1295 // contain multiple code points. 1296 pos-=spanLength; 1297 offsets.shift(spanLength); 1298 spanLength=0; 1299 continue; // Match strings from before a single code point. 1300 } 1301 // Match strings from before the next string match. 1302 } 1303 } 1304 pos-=offsets.popMinimum(); 1305 spanLength=0; // Match strings from before a string match. 1306 } 1307 } 1308 1309 /* 1310 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1311 * 1312 * Theoretical algorithm: 1313 * - Iterate through the string, and at each code point boundary: 1314 * + If the code point there is in the set, then return with the current position. 1315 * + If a set string matches at the current position, then return with the current position. 1316 * 1317 * Optimized implementation: 1318 * 1319 * (Same assumption as for span() above.) 1320 * 1321 * Create and cache a spanNotSet which contains all of the single code points 1322 * of the original set but none of its strings. 1323 * For each set string add its initial code point to the spanNotSet. 1324 * (Also add its final code point for spanNotBack().) 1325 * 1326 * - Loop: 1327 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1328 * + If the current code point is in the original set, then 1329 * return the current position. 1330 * + If any set string matches at the current position, then 1331 * return the current position. 1332 * + If there is no match at the current position, neither for the code point there 1333 * nor for any set string, then skip this code point and continue the loop. 1334 * This happens for set-string-initial code points that were added to spanNotSet 1335 * when there is not actually a match for such a set string. 1336 */ 1337 1338 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1339 int32_t pos=0, rest=length; 1340 int32_t i, stringsLength=strings.size(); 1341 do { 1342 // Span until we find a code point from the set, 1343 // or a code point that starts or ends some string. 1344 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1345 if(i==rest) { 1346 return length; // Reached the end of the string. 1347 } 1348 pos+=i; 1349 rest-=i; 1350 1351 // Check whether the current code point is in the original set, 1352 // without the string starts and ends. 1353 int32_t cpLength=spanOne(spanSet, s+pos, rest); 1354 if(cpLength>0) { 1355 return pos; // There is a set element at pos. 1356 } 1357 1358 // Try to match the strings at pos. 1359 for(i=0; i<stringsLength; ++i) { 1360 if(spanLengths[i]==ALL_CP_CONTAINED) { 1361 continue; // Irrelevant string. 1362 } 1363 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1364 const UChar *s16=string.getBuffer(); 1365 int32_t length16=string.length(); 1366 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1367 return pos; // There is a set element at pos. 1368 } 1369 } 1370 1371 // The span(while not contained) ended on a string start/end which is 1372 // not in the original set. Skip this code point and continue. 1373 // cpLength<0 1374 pos-=cpLength; 1375 rest+=cpLength; 1376 } while(rest!=0); 1377 return length; // Reached the end of the string. 1378 } 1379 1380 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1381 int32_t pos=length; 1382 int32_t i, stringsLength=strings.size(); 1383 do { 1384 // Span until we find a code point from the set, 1385 // or a code point that starts or ends some string. 1386 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1387 if(pos==0) { 1388 return 0; // Reached the start of the string. 1389 } 1390 1391 // Check whether the current code point is in the original set, 1392 // without the string starts and ends. 1393 int32_t cpLength=spanOneBack(spanSet, s, pos); 1394 if(cpLength>0) { 1395 return pos; // There is a set element at pos. 1396 } 1397 1398 // Try to match the strings at pos. 1399 for(i=0; i<stringsLength; ++i) { 1400 // Use spanLengths rather than a spanBackLengths pointer because 1401 // it is easier and we only need to know whether the string is irrelevant 1402 // which is the same in either array. 1403 if(spanLengths[i]==ALL_CP_CONTAINED) { 1404 continue; // Irrelevant string. 1405 } 1406 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1407 const UChar *s16=string.getBuffer(); 1408 int32_t length16=string.length(); 1409 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1410 return pos; // There is a set element at pos. 1411 } 1412 } 1413 1414 // The span(while not contained) ended on a string start/end which is 1415 // not in the original set. Skip this code point and continue. 1416 // cpLength<0 1417 pos+=cpLength; 1418 } while(pos!=0); 1419 return 0; // Reached the start of the string. 1420 } 1421 1422 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1423 int32_t pos=0, rest=length; 1424 int32_t i, stringsLength=strings.size(); 1425 uint8_t *spanUTF8Lengths=spanLengths; 1426 if(all) { 1427 spanUTF8Lengths+=2*stringsLength; 1428 } 1429 do { 1430 // Span until we find a code point from the set, 1431 // or a code point that starts or ends some string. 1432 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1433 if(i==rest) { 1434 return length; // Reached the end of the string. 1435 } 1436 pos+=i; 1437 rest-=i; 1438 1439 // Check whether the current code point is in the original set, 1440 // without the string starts and ends. 1441 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1442 if(cpLength>0) { 1443 return pos; // There is a set element at pos. 1444 } 1445 1446 // Try to match the strings at pos. 1447 const uint8_t *s8=utf8; 1448 int32_t length8; 1449 for(i=0; i<stringsLength; ++i) { 1450 length8=utf8Lengths[i]; 1451 // ALL_CP_CONTAINED: Irrelevant string. 1452 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1453 return pos; // There is a set element at pos. 1454 } 1455 s8+=length8; 1456 } 1457 1458 // The span(while not contained) ended on a string start/end which is 1459 // not in the original set. Skip this code point and continue. 1460 // cpLength<0 1461 pos-=cpLength; 1462 rest+=cpLength; 1463 } while(rest!=0); 1464 return length; // Reached the end of the string. 1465 } 1466 1467 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1468 int32_t pos=length; 1469 int32_t i, stringsLength=strings.size(); 1470 uint8_t *spanBackUTF8Lengths=spanLengths; 1471 if(all) { 1472 spanBackUTF8Lengths+=3*stringsLength; 1473 } 1474 do { 1475 // Span until we find a code point from the set, 1476 // or a code point that starts or ends some string. 1477 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1478 if(pos==0) { 1479 return 0; // Reached the start of the string. 1480 } 1481 1482 // Check whether the current code point is in the original set, 1483 // without the string starts and ends. 1484 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1485 if(cpLength>0) { 1486 return pos; // There is a set element at pos. 1487 } 1488 1489 // Try to match the strings at pos. 1490 const uint8_t *s8=utf8; 1491 int32_t length8; 1492 for(i=0; i<stringsLength; ++i) { 1493 length8=utf8Lengths[i]; 1494 // ALL_CP_CONTAINED: Irrelevant string. 1495 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1496 return pos; // There is a set element at pos. 1497 } 1498 s8+=length8; 1499 } 1500 1501 // The span(while not contained) ended on a string start/end which is 1502 // not in the original set. Skip this code point and continue. 1503 // cpLength<0 1504 pos+=cpLength; 1505 } while(pos!=0); 1506 return 0; // Reached the start of the string. 1507 } 1508 1509 U_NAMESPACE_END 1510