Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2007-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  unisetspan.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2007mar01
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/utf8.h"
     23 #include "unicode/utf16.h"
     24 #include "cmemory.h"
     25 #include "uvector.h"
     26 #include "unisetspan.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 /*
     31  * List of offsets from the current position from where to try matching
     32  * a code point or a string.
     33  * Store offsets rather than indexes to simplify the code and use the same list
     34  * for both increments (in span()) and decrements (in spanBack()).
     35  *
     36  * Assumption: The maximum offset is limited, and the offsets that are stored
     37  * at any one time are relatively dense, that is, there are normally no gaps of
     38  * hundreds or thousands of offset values.
     39  *
     40  * The implementation uses a circular buffer of byte flags,
     41  * each indicating whether the corresponding offset is in the list.
     42  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
     43  * physically moving part of the list.
     44  *
     45  * Note: In principle, the caller should setMaxLength() to the maximum of the
     46  * max string length and U16_LENGTH/U8_LENGTH to account for
     47  * "long" single code points.
     48  * However, this implementation uses at least a staticList with more than
     49  * U8_LENGTH entries anyway.
     50  *
     51  * Note: If maxLength were guaranteed to be no more than 32 or 64,
     52  * the list could be stored as bit flags in a single integer.
     53  * Rather than handling a circular buffer with a start list index,
     54  * the integer would simply be shifted when lower offsets are removed.
     55  * UnicodeSet does not have a limit on the lengths of strings.
     56  */
     57 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
     58 public:
     59     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
     60 
     61     ~OffsetList() {
     62         if(list!=staticList) {
     63             uprv_free(list);
     64         }
     65     }
     66 
     67     // Call exactly once if the list is to be used.
     68     void setMaxLength(int32_t maxLength) {
     69         if(maxLength<=(int32_t)sizeof(staticList)) {
     70             capacity=(int32_t)sizeof(staticList);
     71         } else {
     72             UBool *l=(UBool *)uprv_malloc(maxLength);
     73             if(l!=NULL) {
     74                 list=l;
     75                 capacity=maxLength;
     76             }
     77         }
     78         uprv_memset(list, 0, capacity);
     79     }
     80 
     81     void clear() {
     82         uprv_memset(list, 0, capacity);
     83         start=length=0;
     84     }
     85 
     86     UBool isEmpty() const {
     87         return (UBool)(length==0);
     88     }
     89 
     90     // Reduce all stored offsets by delta, used when the current position
     91     // moves by delta.
     92     // There must not be any offsets lower than delta.
     93     // If there is an offset equal to delta, it is removed.
     94     // delta=[1..maxLength]
     95     void shift(int32_t delta) {
     96         int32_t i=start+delta;
     97         if(i>=capacity) {
     98             i-=capacity;
     99         }
    100         if(list[i]) {
    101             list[i]=FALSE;
    102             --length;
    103         }
    104         start=i;
    105     }
    106 
    107     // Add an offset. The list must not contain it yet.
    108     // offset=[1..maxLength]
    109     void addOffset(int32_t offset) {
    110         int32_t i=start+offset;
    111         if(i>=capacity) {
    112             i-=capacity;
    113         }
    114         list[i]=TRUE;
    115         ++length;
    116     }
    117 
    118     // offset=[1..maxLength]
    119     UBool containsOffset(int32_t offset) const {
    120         int32_t i=start+offset;
    121         if(i>=capacity) {
    122             i-=capacity;
    123         }
    124         return list[i];
    125     }
    126 
    127     // Find the lowest stored offset from a non-empty list, remove it,
    128     // and reduce all other offsets by this minimum.
    129     // Returns [1..maxLength].
    130     int32_t popMinimum() {
    131         // Look for the next offset in list[start+1..capacity-1].
    132         int32_t i=start, result;
    133         while(++i<capacity) {
    134             if(list[i]) {
    135                 list[i]=FALSE;
    136                 --length;
    137                 result=i-start;
    138                 start=i;
    139                 return result;
    140             }
    141         }
    142         // i==capacity
    143 
    144         // Wrap around and look for the next offset in list[0..start].
    145         // Since the list is not empty, there will be one.
    146         result=capacity-start;
    147         i=0;
    148         while(!list[i]) {
    149             ++i;
    150         }
    151         list[i]=FALSE;
    152         --length;
    153         start=i;
    154         return result+=i;
    155     }
    156 
    157 private:
    158     UBool *list;
    159     int32_t capacity;
    160     int32_t length;
    161     int32_t start;
    162 
    163     UBool staticList[16];
    164 };
    165 
    166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
    167 static int32_t
    168 getUTF8Length(const UChar *s, int32_t length) {
    169     UErrorCode errorCode=U_ZERO_ERROR;
    170     int32_t length8=0;
    171     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
    172     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
    173         return length8;
    174     } else {
    175         // The string contains an unpaired surrogate.
    176         // Ignore this string.
    177         return 0;
    178     }
    179 }
    180 
    181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
    182 static int32_t
    183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
    184     UErrorCode errorCode=U_ZERO_ERROR;
    185     int32_t length8=0;
    186     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
    187     if(U_SUCCESS(errorCode)) {
    188         return length8;
    189     } else {
    190         // The string contains an unpaired surrogate.
    191         // Ignore this string.
    192         return 0;
    193     }
    194 }
    195 
    196 static inline uint8_t
    197 makeSpanLengthByte(int32_t spanLength) {
    198     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
    199     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
    200 }
    201 
    202 // Construct for all variants of span(), or only for any one variant.
    203 // Initialize as little as possible, for single use.
    204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
    205                                            const UVector &setStrings,
    206                                            uint32_t which)
    207         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
    208           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
    209           utf8Length(0),
    210           maxLength16(0), maxLength8(0),
    211           all((UBool)(which==ALL)) {
    212     spanSet.retainAll(set);
    213     if(which&NOT_CONTAINED) {
    214         // Default to the same sets.
    215         // addToSpanNotSet() will create a separate set if necessary.
    216         pSpanNotSet=&spanSet;
    217     }
    218 
    219     // Determine if the strings even need to be taken into account at all for span() etc.
    220     // If any string is relevant, then all strings need to be used for
    221     // span(longest match) but only the relevant ones for span(while contained).
    222     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
    223     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
    224     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
    225     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
    226     int32_t stringsLength=strings.size();
    227 
    228     int32_t i, spanLength;
    229     UBool someRelevant=FALSE;
    230     for(i=0; i<stringsLength; ++i) {
    231         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    232         const UChar *s16=string.getBuffer();
    233         int32_t length16=string.length();
    234         UBool thisRelevant;
    235         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    236         if(spanLength<length16) {  // Relevant string.
    237             someRelevant=thisRelevant=TRUE;
    238         } else {
    239             thisRelevant=FALSE;
    240         }
    241         if((which&UTF16) && length16>maxLength16) {
    242             maxLength16=length16;
    243         }
    244         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
    245             int32_t length8=getUTF8Length(s16, length16);
    246             utf8Length+=length8;
    247             if(length8>maxLength8) {
    248                 maxLength8=length8;
    249             }
    250         }
    251     }
    252     if(!someRelevant) {
    253         maxLength16=maxLength8=0;
    254         return;
    255     }
    256 
    257     // Freeze after checking for the need to use strings at all because freezing
    258     // a set takes some time and memory which are wasted if there are no relevant strings.
    259     if(all) {
    260         spanSet.freeze();
    261     }
    262 
    263     uint8_t *spanBackLengths;
    264     uint8_t *spanUTF8Lengths;
    265     uint8_t *spanBackUTF8Lengths;
    266 
    267     // Allocate a block of meta data.
    268     int32_t allocSize;
    269     if(all) {
    270         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    271         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    272     } else {
    273         allocSize=stringsLength;  // One set of span lengths.
    274         if(which&UTF8) {
    275             // UTF-8 lengths and UTF-8 strings.
    276             allocSize+=stringsLength*4+utf8Length;
    277         }
    278     }
    279     if(allocSize<=(int32_t)sizeof(staticLengths)) {
    280         utf8Lengths=staticLengths;
    281     } else {
    282         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
    283         if(utf8Lengths==NULL) {
    284             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
    285             return;  // Out of memory.
    286         }
    287     }
    288 
    289     if(all) {
    290         // Store span lengths for all span() variants.
    291         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    292         spanBackLengths=spanLengths+stringsLength;
    293         spanUTF8Lengths=spanBackLengths+stringsLength;
    294         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
    295         utf8=spanBackUTF8Lengths+stringsLength;
    296     } else {
    297         // Store span lengths for only one span() variant.
    298         if(which&UTF8) {
    299             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    300             utf8=spanLengths+stringsLength;
    301         } else {
    302             spanLengths=(uint8_t *)utf8Lengths;
    303         }
    304         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
    305     }
    306 
    307     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
    308     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
    309 
    310     for(i=0; i<stringsLength; ++i) {
    311         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    312         const UChar *s16=string.getBuffer();
    313         int32_t length16=string.length();
    314         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    315         if(spanLength<length16) {  // Relevant string.
    316             if(which&UTF16) {
    317                 if(which&CONTAINED) {
    318                     if(which&FWD) {
    319                         spanLengths[i]=makeSpanLengthByte(spanLength);
    320                     }
    321                     if(which&BACK) {
    322                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
    323                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
    324                     }
    325                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    326                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
    327                 }
    328             }
    329             if(which&UTF8) {
    330                 uint8_t *s8=utf8+utf8Count;
    331                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    332                 utf8Count+=utf8Lengths[i]=length8;
    333                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
    334                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
    335                 } else {  // Relevant for UTF-8.
    336                     if(which&CONTAINED) {
    337                         if(which&FWD) {
    338                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
    339                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    340                         }
    341                         if(which&BACK) {
    342                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
    343                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    344                         }
    345                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    346                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
    347                     }
    348                 }
    349             }
    350             if(which&NOT_CONTAINED) {
    351                 // Add string start and end code points to the spanNotSet so that
    352                 // a span(while not contained) stops before any string.
    353                 UChar32 c;
    354                 if(which&FWD) {
    355                     int32_t len=0;
    356                     U16_NEXT(s16, len, length16, c);
    357                     addToSpanNotSet(c);
    358                 }
    359                 if(which&BACK) {
    360                     int32_t len=length16;
    361                     U16_PREV(s16, 0, len, c);
    362                     addToSpanNotSet(c);
    363                 }
    364             }
    365         } else {  // Irrelevant string.
    366             if(which&UTF8) {
    367                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
    368                     uint8_t *s8=utf8+utf8Count;
    369                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    370                     utf8Count+=utf8Lengths[i]=length8;
    371                 } else {
    372                     utf8Lengths[i]=0;
    373                 }
    374             }
    375             if(all) {
    376                 spanLengths[i]=spanBackLengths[i]=
    377                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
    378                         (uint8_t)ALL_CP_CONTAINED;
    379             } else {
    380                 // All spanXYZLengths pointers contain the same address.
    381                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
    382             }
    383         }
    384     }
    385 
    386     // Finish.
    387     if(all) {
    388         pSpanNotSet->freeze();
    389     }
    390 }
    391 
    392 // Copy constructor. Assumes which==ALL for a frozen set.
    393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
    394                                            const UVector &newParentSetStrings)
    395         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
    396           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
    397           utf8Length(otherStringSpan.utf8Length),
    398           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
    399           all(TRUE) {
    400     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
    401         pSpanNotSet=&spanSet;
    402     } else {
    403         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
    404     }
    405 
    406     // Allocate a block of meta data.
    407     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    408     int32_t stringsLength=strings.size();
    409     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    410     if(allocSize<=(int32_t)sizeof(staticLengths)) {
    411         utf8Lengths=staticLengths;
    412     } else {
    413         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
    414         if(utf8Lengths==NULL) {
    415             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
    416             return;  // Out of memory.
    417         }
    418     }
    419 
    420     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    421     utf8=spanLengths+stringsLength*4;
    422     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
    423 }
    424 
    425 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
    426     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
    427         delete pSpanNotSet;
    428     }
    429     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
    430         uprv_free(utf8Lengths);
    431     }
    432 }
    433 
    434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
    435     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
    436         if(spanSet.contains(c)) {
    437             return;  // Nothing to do.
    438         }
    439         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
    440         if(newSet==NULL) {
    441             return;  // Out of memory.
    442         } else {
    443             pSpanNotSet=newSet;
    444         }
    445     }
    446     pSpanNotSet->add(c);
    447 }
    448 
    449 // Compare strings without any argument checks. Requires length>0.
    450 static inline UBool
    451 matches16(const UChar *s, const UChar *t, int32_t length) {
    452     do {
    453         if(*s++!=*t++) {
    454             return FALSE;
    455         }
    456     } while(--length>0);
    457     return TRUE;
    458 }
    459 
    460 static inline UBool
    461 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
    462     do {
    463         if(*s++!=*t++) {
    464             return FALSE;
    465         }
    466     } while(--length>0);
    467     return TRUE;
    468 }
    469 
    470 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
    471 // at code point boundaries.
    472 // That is, each edge of a match must not be in the middle of a surrogate pair.
    473 static inline UBool
    474 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
    475     s+=start;
    476     limit-=start;
    477     return matches16(s, t, length) &&
    478            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
    479            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
    480 }
    481 
    482 // Does the set contain the next code point?
    483 // If so, return its length; otherwise return its negative length.
    484 static inline int32_t
    485 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
    486     UChar c=*s, c2;
    487     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
    488         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
    489     }
    490     return set.contains(c) ? 1 : -1;
    491 }
    492 
    493 static inline int32_t
    494 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
    495     UChar c=s[length-1], c2;
    496     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
    497         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
    498     }
    499     return set.contains(c) ? 1 : -1;
    500 }
    501 
    502 static inline int32_t
    503 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    504     UChar32 c=*s;
    505     if((int8_t)c>=0) {
    506         return set.contains(c) ? 1 : -1;
    507     }
    508     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
    509     int32_t i=0;
    510     U8_NEXT_OR_FFFD(s, i, length, c);
    511     return set.contains(c) ? i : -i;
    512 }
    513 
    514 static inline int32_t
    515 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    516     UChar32 c=s[length-1];
    517     if((int8_t)c>=0) {
    518         return set.contains(c) ? 1 : -1;
    519     }
    520     int32_t i=length-1;
    521     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
    522     length-=i;
    523     return set.contains(c) ? length : -length;
    524 }
    525 
    526 /*
    527  * Note: In span() when spanLength==0 (after a string match, or at the beginning
    528  * after an empty code point span) and in spanNot() and spanNotUTF8(),
    529  * string matching could use a binary search
    530  * because all string matches are done from the same start index.
    531  *
    532  * For UTF-8, this would require a comparison function that returns UTF-16 order.
    533  *
    534  * This optimization should not be necessary for normal UnicodeSets because
    535  * most sets have no strings, and most sets with strings have
    536  * very few very short strings.
    537  * For cases with many strings, it might be better to use a different API
    538  * and implementation with a DFA (state machine).
    539  */
    540 
    541 /*
    542  * Algorithm for span(USET_SPAN_CONTAINED)
    543  *
    544  * Theoretical algorithm:
    545  * - Iterate through the string, and at each code point boundary:
    546  *   + If the code point there is in the set, then remember to continue after it.
    547  *   + If a set string matches at the current position, then remember to continue after it.
    548  *   + Either recursively span for each code point or string match,
    549  *     or recursively span for all but the shortest one and
    550  *     iteratively continue the span with the shortest local match.
    551  *   + Remember the longest recursive span (the farthest end point).
    552  *   + If there is no match at the current position, neither for the code point there
    553  *     nor for any set string, then stop and return the longest recursive span length.
    554  *
    555  * Optimized implementation:
    556  *
    557  * (We assume that most sets will have very few very short strings.
    558  * A span using a string-less set is extremely fast.)
    559  *
    560  * Create and cache a spanSet which contains all of the single code points
    561  * of the original set but none of its strings.
    562  *
    563  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    564  * - Loop:
    565  *   + Try to match each set string at the end of the spanLength.
    566  *     ~ Set strings that start with set-contained code points must be matched
    567  *       with a partial overlap because the recursive algorithm would have tried
    568  *       to match them at every position.
    569  *     ~ Set strings that entirely consist of set-contained code points
    570  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
    571  *       recursive algorithm would continue after them anyway
    572  *       and find the longest recursive match from their end.
    573  *     ~ Rather than recursing, note each end point of a set string match.
    574  *   + If no set string matched after spanSet.span(), then return
    575  *     with where the spanSet.span() ended.
    576  *   + If at least one set string matched after spanSet.span(), then
    577  *     pop the shortest string match end point and continue
    578  *     the loop, trying to match all set strings from there.
    579  *   + If at least one more set string matched after a previous string match,
    580  *     then test if the code point after the previous string match is also
    581  *     contained in the set.
    582  *     Continue the loop with the shortest end point of either this code point
    583  *     or a matching set string.
    584  *   + If no more set string matched after a previous string match,
    585  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    586  *     Stop if spanLength==0, otherwise continue the loop.
    587  *
    588  * By noting each end point of a set string match,
    589  * the function visits each string position at most once and finishes
    590  * in linear time.
    591  *
    592  * The recursive algorithm may visit the same string position many times
    593  * if multiple paths lead to it and finishes in exponential time.
    594  */
    595 
    596 /*
    597  * Algorithm for span(USET_SPAN_SIMPLE)
    598  *
    599  * Theoretical algorithm:
    600  * - Iterate through the string, and at each code point boundary:
    601  *   + If the code point there is in the set, then remember to continue after it.
    602  *   + If a set string matches at the current position, then remember to continue after it.
    603  *   + Continue from the farthest match position and ignore all others.
    604  *   + If there is no match at the current position,
    605  *     then stop and return the current position.
    606  *
    607  * Optimized implementation:
    608  *
    609  * (Same assumption and spanSet as above.)
    610  *
    611  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    612  * - Loop:
    613  *   + Try to match each set string at the end of the spanLength.
    614  *     ~ Set strings that start with set-contained code points must be matched
    615  *       with a partial overlap because the standard algorithm would have tried
    616  *       to match them earlier.
    617  *     ~ Set strings that entirely consist of set-contained code points
    618  *       must be matched with a full overlap because the longest-match algorithm
    619  *       would hide set string matches that end earlier.
    620  *       Such set strings need not be matched earlier inside the code point span
    621  *       because the standard algorithm would then have continued after
    622  *       the set string match anyway.
    623  *     ~ Remember the longest set string match (farthest end point) from the earliest
    624  *       starting point.
    625  *   + If no set string matched after spanSet.span(), then return
    626  *     with where the spanSet.span() ended.
    627  *   + If at least one set string matched, then continue the loop after the
    628  *     longest match from the earliest position.
    629  *   + If no more set string matched after a previous string match,
    630  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    631  *     Stop if spanLength==0, otherwise continue the loop.
    632  */
    633 
    634 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    635     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    636         return spanNot(s, length);
    637     }
    638     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
    639     if(spanLength==length) {
    640         return length;
    641     }
    642 
    643     // Consider strings; they may overlap with the span.
    644     OffsetList offsets;
    645     if(spanCondition==USET_SPAN_CONTAINED) {
    646         // Use offset list to try all possibilities.
    647         offsets.setMaxLength(maxLength16);
    648     }
    649     int32_t pos=spanLength, rest=length-pos;
    650     int32_t i, stringsLength=strings.size();
    651     for(;;) {
    652         if(spanCondition==USET_SPAN_CONTAINED) {
    653             for(i=0; i<stringsLength; ++i) {
    654                 int32_t overlap=spanLengths[i];
    655                 if(overlap==ALL_CP_CONTAINED) {
    656                     continue;  // Irrelevant string.
    657                 }
    658                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    659                 const UChar *s16=string.getBuffer();
    660                 int32_t length16=string.length();
    661 
    662                 // Try to match this string at pos-overlap..pos.
    663                 if(overlap>=LONG_SPAN) {
    664                     overlap=length16;
    665                     // While contained: No point matching fully inside the code point span.
    666                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
    667                 }
    668                 if(overlap>spanLength) {
    669                     overlap=spanLength;
    670                 }
    671                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    672                 for(;;) {
    673                     if(inc>rest) {
    674                         break;
    675                     }
    676                     // Try to match if the increment is not listed already.
    677                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
    678                         if(inc==rest) {
    679                             return length;  // Reached the end of the string.
    680                         }
    681                         offsets.addOffset(inc);
    682                     }
    683                     if(overlap==0) {
    684                         break;
    685                     }
    686                     --overlap;
    687                     ++inc;
    688                 }
    689             }
    690         } else /* USET_SPAN_SIMPLE */ {
    691             int32_t maxInc=0, maxOverlap=0;
    692             for(i=0; i<stringsLength; ++i) {
    693                 int32_t overlap=spanLengths[i];
    694                 // For longest match, we do need to try to match even an all-contained string
    695                 // to find the match from the earliest start.
    696 
    697                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    698                 const UChar *s16=string.getBuffer();
    699                 int32_t length16=string.length();
    700 
    701                 // Try to match this string at pos-overlap..pos.
    702                 if(overlap>=LONG_SPAN) {
    703                     overlap=length16;
    704                     // Longest match: Need to match fully inside the code point span
    705                     // to find the match from the earliest start.
    706                 }
    707                 if(overlap>spanLength) {
    708                     overlap=spanLength;
    709                 }
    710                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    711                 for(;;) {
    712                     if(inc>rest || overlap<maxOverlap) {
    713                         break;
    714                     }
    715                     // Try to match if the string is longer or starts earlier.
    716                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
    717                         matches16CPB(s, pos-overlap, length, s16, length16)
    718                     ) {
    719                         maxInc=inc;  // Longest match from earliest start.
    720                         maxOverlap=overlap;
    721                         break;
    722                     }
    723                     --overlap;
    724                     ++inc;
    725                 }
    726             }
    727 
    728             if(maxInc!=0 || maxOverlap!=0) {
    729                 // Longest-match algorithm, and there was a string match.
    730                 // Simply continue after it.
    731                 pos+=maxInc;
    732                 rest-=maxInc;
    733                 if(rest==0) {
    734                     return length;  // Reached the end of the string.
    735                 }
    736                 spanLength=0;  // Match strings from after a string match.
    737                 continue;
    738             }
    739         }
    740         // Finished trying to match all strings at pos.
    741 
    742         if(spanLength!=0 || pos==0) {
    743             // The position is after an unlimited code point span (spanLength!=0),
    744             // not after a string match.
    745             // The only position where spanLength==0 after a span is pos==0.
    746             // Otherwise, an unlimited code point span is only tried again when no
    747             // strings match, and if such a non-initial span fails we stop.
    748             if(offsets.isEmpty()) {
    749                 return pos;  // No strings matched after a span.
    750             }
    751             // Match strings from after the next string match.
    752         } else {
    753             // The position is after a string match (or a single code point).
    754             if(offsets.isEmpty()) {
    755                 // No more strings matched after a previous string match.
    756                 // Try another code point span from after the last string match.
    757                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
    758                 if( spanLength==rest || // Reached the end of the string, or
    759                     spanLength==0       // neither strings nor span progressed.
    760                 ) {
    761                     return pos+spanLength;
    762                 }
    763                 pos+=spanLength;
    764                 rest-=spanLength;
    765                 continue;  // spanLength>0: Match strings from after a span.
    766             } else {
    767                 // Try to match only one code point from after a string match if some
    768                 // string matched beyond it, so that we try all possible positions
    769                 // and don't overshoot.
    770                 spanLength=spanOne(spanSet, s+pos, rest);
    771                 if(spanLength>0) {
    772                     if(spanLength==rest) {
    773                         return length;  // Reached the end of the string.
    774                     }
    775                     // Match strings after this code point.
    776                     // There cannot be any increments below it because UnicodeSet strings
    777                     // contain multiple code points.
    778                     pos+=spanLength;
    779                     rest-=spanLength;
    780                     offsets.shift(spanLength);
    781                     spanLength=0;
    782                     continue;  // Match strings from after a single code point.
    783                 }
    784                 // Match strings from after the next string match.
    785             }
    786         }
    787         int32_t minOffset=offsets.popMinimum();
    788         pos+=minOffset;
    789         rest-=minOffset;
    790         spanLength=0;  // Match strings from after a string match.
    791     }
    792 }
    793 
    794 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    795     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    796         return spanNotBack(s, length);
    797     }
    798     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
    799     if(pos==0) {
    800         return 0;
    801     }
    802     int32_t spanLength=length-pos;
    803 
    804     // Consider strings; they may overlap with the span.
    805     OffsetList offsets;
    806     if(spanCondition==USET_SPAN_CONTAINED) {
    807         // Use offset list to try all possibilities.
    808         offsets.setMaxLength(maxLength16);
    809     }
    810     int32_t i, stringsLength=strings.size();
    811     uint8_t *spanBackLengths=spanLengths;
    812     if(all) {
    813         spanBackLengths+=stringsLength;
    814     }
    815     for(;;) {
    816         if(spanCondition==USET_SPAN_CONTAINED) {
    817             for(i=0; i<stringsLength; ++i) {
    818                 int32_t overlap=spanBackLengths[i];
    819                 if(overlap==ALL_CP_CONTAINED) {
    820                     continue;  // Irrelevant string.
    821                 }
    822                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    823                 const UChar *s16=string.getBuffer();
    824                 int32_t length16=string.length();
    825 
    826                 // Try to match this string at pos-(length16-overlap)..pos-length16.
    827                 if(overlap>=LONG_SPAN) {
    828                     overlap=length16;
    829                     // While contained: No point matching fully inside the code point span.
    830                     int32_t len1=0;
    831                     U16_FWD_1(s16, len1, overlap);
    832                     overlap-=len1;  // Length of the string minus the first code point.
    833                 }
    834                 if(overlap>spanLength) {
    835                     overlap=spanLength;
    836                 }
    837                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    838                 for(;;) {
    839                     if(dec>pos) {
    840                         break;
    841                     }
    842                     // Try to match if the decrement is not listed already.
    843                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
    844                         if(dec==pos) {
    845                             return 0;  // Reached the start of the string.
    846                         }
    847                         offsets.addOffset(dec);
    848                     }
    849                     if(overlap==0) {
    850                         break;
    851                     }
    852                     --overlap;
    853                     ++dec;
    854                 }
    855             }
    856         } else /* USET_SPAN_SIMPLE */ {
    857             int32_t maxDec=0, maxOverlap=0;
    858             for(i=0; i<stringsLength; ++i) {
    859                 int32_t overlap=spanBackLengths[i];
    860                 // For longest match, we do need to try to match even an all-contained string
    861                 // to find the match from the latest end.
    862 
    863                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    864                 const UChar *s16=string.getBuffer();
    865                 int32_t length16=string.length();
    866 
    867                 // Try to match this string at pos-(length16-overlap)..pos-length16.
    868                 if(overlap>=LONG_SPAN) {
    869                     overlap=length16;
    870                     // Longest match: Need to match fully inside the code point span
    871                     // to find the match from the latest end.
    872                 }
    873                 if(overlap>spanLength) {
    874                     overlap=spanLength;
    875                 }
    876                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    877                 for(;;) {
    878                     if(dec>pos || overlap<maxOverlap) {
    879                         break;
    880                     }
    881                     // Try to match if the string is longer or ends later.
    882                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
    883                         matches16CPB(s, pos-dec, length, s16, length16)
    884                     ) {
    885                         maxDec=dec;  // Longest match from latest end.
    886                         maxOverlap=overlap;
    887                         break;
    888                     }
    889                     --overlap;
    890                     ++dec;
    891                 }
    892             }
    893 
    894             if(maxDec!=0 || maxOverlap!=0) {
    895                 // Longest-match algorithm, and there was a string match.
    896                 // Simply continue before it.
    897                 pos-=maxDec;
    898                 if(pos==0) {
    899                     return 0;  // Reached the start of the string.
    900                 }
    901                 spanLength=0;  // Match strings from before a string match.
    902                 continue;
    903             }
    904         }
    905         // Finished trying to match all strings at pos.
    906 
    907         if(spanLength!=0 || pos==length) {
    908             // The position is before an unlimited code point span (spanLength!=0),
    909             // not before a string match.
    910             // The only position where spanLength==0 before a span is pos==length.
    911             // Otherwise, an unlimited code point span is only tried again when no
    912             // strings match, and if such a non-initial span fails we stop.
    913             if(offsets.isEmpty()) {
    914                 return pos;  // No strings matched before a span.
    915             }
    916             // Match strings from before the next string match.
    917         } else {
    918             // The position is before a string match (or a single code point).
    919             if(offsets.isEmpty()) {
    920                 // No more strings matched before a previous string match.
    921                 // Try another code point span from before the last string match.
    922                 int32_t oldPos=pos;
    923                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
    924                 spanLength=oldPos-pos;
    925                 if( pos==0 ||           // Reached the start of the string, or
    926                     spanLength==0       // neither strings nor span progressed.
    927                 ) {
    928                     return pos;
    929                 }
    930                 continue;  // spanLength>0: Match strings from before a span.
    931             } else {
    932                 // Try to match only one code point from before a string match if some
    933                 // string matched beyond it, so that we try all possible positions
    934                 // and don't overshoot.
    935                 spanLength=spanOneBack(spanSet, s, pos);
    936                 if(spanLength>0) {
    937                     if(spanLength==pos) {
    938                         return 0;  // Reached the start of the string.
    939                     }
    940                     // Match strings before this code point.
    941                     // There cannot be any decrements below it because UnicodeSet strings
    942                     // contain multiple code points.
    943                     pos-=spanLength;
    944                     offsets.shift(spanLength);
    945                     spanLength=0;
    946                     continue;  // Match strings from before a single code point.
    947                 }
    948                 // Match strings from before the next string match.
    949             }
    950         }
    951         pos-=offsets.popMinimum();
    952         spanLength=0;  // Match strings from before a string match.
    953     }
    954 }
    955 
    956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    957     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    958         return spanNotUTF8(s, length);
    959     }
    960     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
    961     if(spanLength==length) {
    962         return length;
    963     }
    964 
    965     // Consider strings; they may overlap with the span.
    966     OffsetList offsets;
    967     if(spanCondition==USET_SPAN_CONTAINED) {
    968         // Use offset list to try all possibilities.
    969         offsets.setMaxLength(maxLength8);
    970     }
    971     int32_t pos=spanLength, rest=length-pos;
    972     int32_t i, stringsLength=strings.size();
    973     uint8_t *spanUTF8Lengths=spanLengths;
    974     if(all) {
    975         spanUTF8Lengths+=2*stringsLength;
    976     }
    977     for(;;) {
    978         const uint8_t *s8=utf8;
    979         int32_t length8;
    980         if(spanCondition==USET_SPAN_CONTAINED) {
    981             for(i=0; i<stringsLength; ++i) {
    982                 length8=utf8Lengths[i];
    983                 if(length8==0) {
    984                     continue;  // String not representable in UTF-8.
    985                 }
    986                 int32_t overlap=spanUTF8Lengths[i];
    987                 if(overlap==ALL_CP_CONTAINED) {
    988                     s8+=length8;
    989                     continue;  // Irrelevant string.
    990                 }
    991 
    992                 // Try to match this string at pos-overlap..pos.
    993                 if(overlap>=LONG_SPAN) {
    994                     overlap=length8;
    995                     // While contained: No point matching fully inside the code point span.
    996                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
    997                 }
    998                 if(overlap>spanLength) {
    999                     overlap=spanLength;
   1000                 }
   1001                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1002                 for(;;) {
   1003                     if(inc>rest) {
   1004                         break;
   1005                     }
   1006                     // Try to match if the increment is not listed already.
   1007                     // Match at code point boundaries. (The UTF-8 strings were converted
   1008                     // from UTF-16 and are guaranteed to be well-formed.)
   1009                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
   1010                         !offsets.containsOffset(inc) &&
   1011                         matches8(s+pos-overlap, s8, length8)
   1012 
   1013                     ) {
   1014                         if(inc==rest) {
   1015                             return length;  // Reached the end of the string.
   1016                         }
   1017                         offsets.addOffset(inc);
   1018                     }
   1019                     if(overlap==0) {
   1020                         break;
   1021                     }
   1022                     --overlap;
   1023                     ++inc;
   1024                 }
   1025                 s8+=length8;
   1026             }
   1027         } else /* USET_SPAN_SIMPLE */ {
   1028             int32_t maxInc=0, maxOverlap=0;
   1029             for(i=0; i<stringsLength; ++i) {
   1030                 length8=utf8Lengths[i];
   1031                 if(length8==0) {
   1032                     continue;  // String not representable in UTF-8.
   1033                 }
   1034                 int32_t overlap=spanUTF8Lengths[i];
   1035                 // For longest match, we do need to try to match even an all-contained string
   1036                 // to find the match from the earliest start.
   1037 
   1038                 // Try to match this string at pos-overlap..pos.
   1039                 if(overlap>=LONG_SPAN) {
   1040                     overlap=length8;
   1041                     // Longest match: Need to match fully inside the code point span
   1042                     // to find the match from the earliest start.
   1043                 }
   1044                 if(overlap>spanLength) {
   1045                     overlap=spanLength;
   1046                 }
   1047                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1048                 for(;;) {
   1049                     if(inc>rest || overlap<maxOverlap) {
   1050                         break;
   1051                     }
   1052                     // Try to match if the string is longer or starts earlier.
   1053                     // Match at code point boundaries. (The UTF-8 strings were converted
   1054                     // from UTF-16 and are guaranteed to be well-formed.)
   1055                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
   1056                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
   1057                         matches8(s+pos-overlap, s8, length8)
   1058 
   1059                     ) {
   1060                         maxInc=inc;  // Longest match from earliest start.
   1061                         maxOverlap=overlap;
   1062                         break;
   1063                     }
   1064                     --overlap;
   1065                     ++inc;
   1066                 }
   1067                 s8+=length8;
   1068             }
   1069 
   1070             if(maxInc!=0 || maxOverlap!=0) {
   1071                 // Longest-match algorithm, and there was a string match.
   1072                 // Simply continue after it.
   1073                 pos+=maxInc;
   1074                 rest-=maxInc;
   1075                 if(rest==0) {
   1076                     return length;  // Reached the end of the string.
   1077                 }
   1078                 spanLength=0;  // Match strings from after a string match.
   1079                 continue;
   1080             }
   1081         }
   1082         // Finished trying to match all strings at pos.
   1083 
   1084         if(spanLength!=0 || pos==0) {
   1085             // The position is after an unlimited code point span (spanLength!=0),
   1086             // not after a string match.
   1087             // The only position where spanLength==0 after a span is pos==0.
   1088             // Otherwise, an unlimited code point span is only tried again when no
   1089             // strings match, and if such a non-initial span fails we stop.
   1090             if(offsets.isEmpty()) {
   1091                 return pos;  // No strings matched after a span.
   1092             }
   1093             // Match strings from after the next string match.
   1094         } else {
   1095             // The position is after a string match (or a single code point).
   1096             if(offsets.isEmpty()) {
   1097                 // No more strings matched after a previous string match.
   1098                 // Try another code point span from after the last string match.
   1099                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
   1100                 if( spanLength==rest || // Reached the end of the string, or
   1101                     spanLength==0       // neither strings nor span progressed.
   1102                 ) {
   1103                     return pos+spanLength;
   1104                 }
   1105                 pos+=spanLength;
   1106                 rest-=spanLength;
   1107                 continue;  // spanLength>0: Match strings from after a span.
   1108             } else {
   1109                 // Try to match only one code point from after a string match if some
   1110                 // string matched beyond it, so that we try all possible positions
   1111                 // and don't overshoot.
   1112                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
   1113                 if(spanLength>0) {
   1114                     if(spanLength==rest) {
   1115                         return length;  // Reached the end of the string.
   1116                     }
   1117                     // Match strings after this code point.
   1118                     // There cannot be any increments below it because UnicodeSet strings
   1119                     // contain multiple code points.
   1120                     pos+=spanLength;
   1121                     rest-=spanLength;
   1122                     offsets.shift(spanLength);
   1123                     spanLength=0;
   1124                     continue;  // Match strings from after a single code point.
   1125                 }
   1126                 // Match strings from after the next string match.
   1127             }
   1128         }
   1129         int32_t minOffset=offsets.popMinimum();
   1130         pos+=minOffset;
   1131         rest-=minOffset;
   1132         spanLength=0;  // Match strings from after a string match.
   1133     }
   1134 }
   1135 
   1136 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   1137     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   1138         return spanNotBackUTF8(s, length);
   1139     }
   1140     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
   1141     if(pos==0) {
   1142         return 0;
   1143     }
   1144     int32_t spanLength=length-pos;
   1145 
   1146     // Consider strings; they may overlap with the span.
   1147     OffsetList offsets;
   1148     if(spanCondition==USET_SPAN_CONTAINED) {
   1149         // Use offset list to try all possibilities.
   1150         offsets.setMaxLength(maxLength8);
   1151     }
   1152     int32_t i, stringsLength=strings.size();
   1153     uint8_t *spanBackUTF8Lengths=spanLengths;
   1154     if(all) {
   1155         spanBackUTF8Lengths+=3*stringsLength;
   1156     }
   1157     for(;;) {
   1158         const uint8_t *s8=utf8;
   1159         int32_t length8;
   1160         if(spanCondition==USET_SPAN_CONTAINED) {
   1161             for(i=0; i<stringsLength; ++i) {
   1162                 length8=utf8Lengths[i];
   1163                 if(length8==0) {
   1164                     continue;  // String not representable in UTF-8.
   1165                 }
   1166                 int32_t overlap=spanBackUTF8Lengths[i];
   1167                 if(overlap==ALL_CP_CONTAINED) {
   1168                     s8+=length8;
   1169                     continue;  // Irrelevant string.
   1170                 }
   1171 
   1172                 // Try to match this string at pos-(length8-overlap)..pos-length8.
   1173                 if(overlap>=LONG_SPAN) {
   1174                     overlap=length8;
   1175                     // While contained: No point matching fully inside the code point span.
   1176                     int32_t len1=0;
   1177                     U8_FWD_1(s8, len1, overlap);
   1178                     overlap-=len1;  // Length of the string minus the first code point.
   1179                 }
   1180                 if(overlap>spanLength) {
   1181                     overlap=spanLength;
   1182                 }
   1183                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1184                 for(;;) {
   1185                     if(dec>pos) {
   1186                         break;
   1187                     }
   1188                     // Try to match if the decrement is not listed already.
   1189                     // Match at code point boundaries. (The UTF-8 strings were converted
   1190                     // from UTF-16 and are guaranteed to be well-formed.)
   1191                     if( !U8_IS_TRAIL(s[pos-dec]) &&
   1192                         !offsets.containsOffset(dec) &&
   1193                         matches8(s+pos-dec, s8, length8)
   1194                     ) {
   1195                         if(dec==pos) {
   1196                             return 0;  // Reached the start of the string.
   1197                         }
   1198                         offsets.addOffset(dec);
   1199                     }
   1200                     if(overlap==0) {
   1201                         break;
   1202                     }
   1203                     --overlap;
   1204                     ++dec;
   1205                 }
   1206                 s8+=length8;
   1207             }
   1208         } else /* USET_SPAN_SIMPLE */ {
   1209             int32_t maxDec=0, maxOverlap=0;
   1210             for(i=0; i<stringsLength; ++i) {
   1211                 length8=utf8Lengths[i];
   1212                 if(length8==0) {
   1213                     continue;  // String not representable in UTF-8.
   1214                 }
   1215                 int32_t overlap=spanBackUTF8Lengths[i];
   1216                 // For longest match, we do need to try to match even an all-contained string
   1217                 // to find the match from the latest end.
   1218 
   1219                 // Try to match this string at pos-(length8-overlap)..pos-length8.
   1220                 if(overlap>=LONG_SPAN) {
   1221                     overlap=length8;
   1222                     // Longest match: Need to match fully inside the code point span
   1223                     // to find the match from the latest end.
   1224                 }
   1225                 if(overlap>spanLength) {
   1226                     overlap=spanLength;
   1227                 }
   1228                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1229                 for(;;) {
   1230                     if(dec>pos || overlap<maxOverlap) {
   1231                         break;
   1232                     }
   1233                     // Try to match if the string is longer or ends later.
   1234                     // Match at code point boundaries. (The UTF-8 strings were converted
   1235                     // from UTF-16 and are guaranteed to be well-formed.)
   1236                     if( !U8_IS_TRAIL(s[pos-dec]) &&
   1237                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
   1238                         matches8(s+pos-dec, s8, length8)
   1239                     ) {
   1240                         maxDec=dec;  // Longest match from latest end.
   1241                         maxOverlap=overlap;
   1242                         break;
   1243                     }
   1244                     --overlap;
   1245                     ++dec;
   1246                 }
   1247                 s8+=length8;
   1248             }
   1249 
   1250             if(maxDec!=0 || maxOverlap!=0) {
   1251                 // Longest-match algorithm, and there was a string match.
   1252                 // Simply continue before it.
   1253                 pos-=maxDec;
   1254                 if(pos==0) {
   1255                     return 0;  // Reached the start of the string.
   1256                 }
   1257                 spanLength=0;  // Match strings from before a string match.
   1258                 continue;
   1259             }
   1260         }
   1261         // Finished trying to match all strings at pos.
   1262 
   1263         if(spanLength!=0 || pos==length) {
   1264             // The position is before an unlimited code point span (spanLength!=0),
   1265             // not before a string match.
   1266             // The only position where spanLength==0 before a span is pos==length.
   1267             // Otherwise, an unlimited code point span is only tried again when no
   1268             // strings match, and if such a non-initial span fails we stop.
   1269             if(offsets.isEmpty()) {
   1270                 return pos;  // No strings matched before a span.
   1271             }
   1272             // Match strings from before the next string match.
   1273         } else {
   1274             // The position is before a string match (or a single code point).
   1275             if(offsets.isEmpty()) {
   1276                 // No more strings matched before a previous string match.
   1277                 // Try another code point span from before the last string match.
   1278                 int32_t oldPos=pos;
   1279                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
   1280                 spanLength=oldPos-pos;
   1281                 if( pos==0 ||           // Reached the start of the string, or
   1282                     spanLength==0       // neither strings nor span progressed.
   1283                 ) {
   1284                     return pos;
   1285                 }
   1286                 continue;  // spanLength>0: Match strings from before a span.
   1287             } else {
   1288                 // Try to match only one code point from before a string match if some
   1289                 // string matched beyond it, so that we try all possible positions
   1290                 // and don't overshoot.
   1291                 spanLength=spanOneBackUTF8(spanSet, s, pos);
   1292                 if(spanLength>0) {
   1293                     if(spanLength==pos) {
   1294                         return 0;  // Reached the start of the string.
   1295                     }
   1296                     // Match strings before this code point.
   1297                     // There cannot be any decrements below it because UnicodeSet strings
   1298                     // contain multiple code points.
   1299                     pos-=spanLength;
   1300                     offsets.shift(spanLength);
   1301                     spanLength=0;
   1302                     continue;  // Match strings from before a single code point.
   1303                 }
   1304                 // Match strings from before the next string match.
   1305             }
   1306         }
   1307         pos-=offsets.popMinimum();
   1308         spanLength=0;  // Match strings from before a string match.
   1309     }
   1310 }
   1311 
   1312 /*
   1313  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
   1314  *
   1315  * Theoretical algorithm:
   1316  * - Iterate through the string, and at each code point boundary:
   1317  *   + If the code point there is in the set, then return with the current position.
   1318  *   + If a set string matches at the current position, then return with the current position.
   1319  *
   1320  * Optimized implementation:
   1321  *
   1322  * (Same assumption as for span() above.)
   1323  *
   1324  * Create and cache a spanNotSet which contains all of the single code points
   1325  * of the original set but none of its strings.
   1326  * For each set string add its initial code point to the spanNotSet.
   1327  * (Also add its final code point for spanNotBack().)
   1328  *
   1329  * - Loop:
   1330  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
   1331  *   + If the current code point is in the original set, then
   1332  *     return the current position.
   1333  *   + If any set string matches at the current position, then
   1334  *     return the current position.
   1335  *   + If there is no match at the current position, neither for the code point there
   1336  *     nor for any set string, then skip this code point and continue the loop.
   1337  *     This happens for set-string-initial code points that were added to spanNotSet
   1338  *     when there is not actually a match for such a set string.
   1339  */
   1340 
   1341 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
   1342     int32_t pos=0, rest=length;
   1343     int32_t i, stringsLength=strings.size();
   1344     do {
   1345         // Span until we find a code point from the set,
   1346         // or a code point that starts or ends some string.
   1347         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1348         if(i==rest) {
   1349             return length;  // Reached the end of the string.
   1350         }
   1351         pos+=i;
   1352         rest-=i;
   1353 
   1354         // Check whether the current code point is in the original set,
   1355         // without the string starts and ends.
   1356         int32_t cpLength=spanOne(spanSet, s+pos, rest);
   1357         if(cpLength>0) {
   1358             return pos;  // There is a set element at pos.
   1359         }
   1360 
   1361         // Try to match the strings at pos.
   1362         for(i=0; i<stringsLength; ++i) {
   1363             if(spanLengths[i]==ALL_CP_CONTAINED) {
   1364                 continue;  // Irrelevant string.
   1365             }
   1366             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   1367             const UChar *s16=string.getBuffer();
   1368             int32_t length16=string.length();
   1369             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
   1370                 return pos;  // There is a set element at pos.
   1371             }
   1372         }
   1373 
   1374         // The span(while not contained) ended on a string start/end which is
   1375         // not in the original set. Skip this code point and continue.
   1376         // cpLength<0
   1377         pos-=cpLength;
   1378         rest+=cpLength;
   1379     } while(rest!=0);
   1380     return length;  // Reached the end of the string.
   1381 }
   1382 
   1383 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
   1384     int32_t pos=length;
   1385     int32_t i, stringsLength=strings.size();
   1386     do {
   1387         // Span until we find a code point from the set,
   1388         // or a code point that starts or ends some string.
   1389         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
   1390         if(pos==0) {
   1391             return 0;  // Reached the start of the string.
   1392         }
   1393 
   1394         // Check whether the current code point is in the original set,
   1395         // without the string starts and ends.
   1396         int32_t cpLength=spanOneBack(spanSet, s, pos);
   1397         if(cpLength>0) {
   1398             return pos;  // There is a set element at pos.
   1399         }
   1400 
   1401         // Try to match the strings at pos.
   1402         for(i=0; i<stringsLength; ++i) {
   1403             // Use spanLengths rather than a spanBackLengths pointer because
   1404             // it is easier and we only need to know whether the string is irrelevant
   1405             // which is the same in either array.
   1406             if(spanLengths[i]==ALL_CP_CONTAINED) {
   1407                 continue;  // Irrelevant string.
   1408             }
   1409             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   1410             const UChar *s16=string.getBuffer();
   1411             int32_t length16=string.length();
   1412             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
   1413                 return pos;  // There is a set element at pos.
   1414             }
   1415         }
   1416 
   1417         // The span(while not contained) ended on a string start/end which is
   1418         // not in the original set. Skip this code point and continue.
   1419         // cpLength<0
   1420         pos+=cpLength;
   1421     } while(pos!=0);
   1422     return 0;  // Reached the start of the string.
   1423 }
   1424 
   1425 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
   1426     int32_t pos=0, rest=length;
   1427     int32_t i, stringsLength=strings.size();
   1428     uint8_t *spanUTF8Lengths=spanLengths;
   1429     if(all) {
   1430         spanUTF8Lengths+=2*stringsLength;
   1431     }
   1432     do {
   1433         // Span until we find a code point from the set,
   1434         // or a code point that starts or ends some string.
   1435         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1436         if(i==rest) {
   1437             return length;  // Reached the end of the string.
   1438         }
   1439         pos+=i;
   1440         rest-=i;
   1441 
   1442         // Check whether the current code point is in the original set,
   1443         // without the string starts and ends.
   1444         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
   1445         if(cpLength>0) {
   1446             return pos;  // There is a set element at pos.
   1447         }
   1448 
   1449         // Try to match the strings at pos.
   1450         const uint8_t *s8=utf8;
   1451         int32_t length8;
   1452         for(i=0; i<stringsLength; ++i) {
   1453             length8=utf8Lengths[i];
   1454             // ALL_CP_CONTAINED: Irrelevant string.
   1455             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
   1456                 return pos;  // There is a set element at pos.
   1457             }
   1458             s8+=length8;
   1459         }
   1460 
   1461         // The span(while not contained) ended on a string start/end which is
   1462         // not in the original set. Skip this code point and continue.
   1463         // cpLength<0
   1464         pos-=cpLength;
   1465         rest+=cpLength;
   1466     } while(rest!=0);
   1467     return length;  // Reached the end of the string.
   1468 }
   1469 
   1470 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
   1471     int32_t pos=length;
   1472     int32_t i, stringsLength=strings.size();
   1473     uint8_t *spanBackUTF8Lengths=spanLengths;
   1474     if(all) {
   1475         spanBackUTF8Lengths+=3*stringsLength;
   1476     }
   1477     do {
   1478         // Span until we find a code point from the set,
   1479         // or a code point that starts or ends some string.
   1480         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
   1481         if(pos==0) {
   1482             return 0;  // Reached the start of the string.
   1483         }
   1484 
   1485         // Check whether the current code point is in the original set,
   1486         // without the string starts and ends.
   1487         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
   1488         if(cpLength>0) {
   1489             return pos;  // There is a set element at pos.
   1490         }
   1491 
   1492         // Try to match the strings at pos.
   1493         const uint8_t *s8=utf8;
   1494         int32_t length8;
   1495         for(i=0; i<stringsLength; ++i) {
   1496             length8=utf8Lengths[i];
   1497             // ALL_CP_CONTAINED: Irrelevant string.
   1498             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
   1499                 return pos;  // There is a set element at pos.
   1500             }
   1501             s8+=length8;
   1502         }
   1503 
   1504         // The span(while not contained) ended on a string start/end which is
   1505         // not in the original set. Skip this code point and continue.
   1506         // cpLength<0
   1507         pos+=cpLength;
   1508     } while(pos!=0);
   1509     return 0;  // Reached the start of the string.
   1510 }
   1511 
   1512 U_NAMESPACE_END
   1513