Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2007, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  unisetspan.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2007mar01
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/ustring.h"
     20 #include "cmemory.h"
     21 #include "uvector.h"
     22 #include "unisetspan.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 /*
     27  * List of offsets from the current position from where to try matching
     28  * a code point or a string.
     29  * Store offsets rather than indexes to simplify the code and use the same list
     30  * for both increments (in span()) and decrements (in spanBack()).
     31  *
     32  * Assumption: The maximum offset is limited, and the offsets that are stored
     33  * at any one time are relatively dense, that is, there are normally no gaps of
     34  * hundreds or thousands of offset values.
     35  *
     36  * The implementation uses a circular buffer of byte flags,
     37  * each indicating whether the corresponding offset is in the list.
     38  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
     39  * physically moving part of the list.
     40  *
     41  * Note: In principle, the caller should setMaxLength() to the maximum of the
     42  * max string length and U16_LENGTH/U8_LENGTH to account for
     43  * "long" single code points.
     44  * However, this implementation uses at least a staticList with more than
     45  * U8_LENGTH entries anyway.
     46  *
     47  * Note: If maxLength were guaranteed to be no more than 32 or 64,
     48  * the list could be stored as bit flags in a single integer.
     49  * Rather than handling a circular buffer with a start list index,
     50  * the integer would simply be shifted when lower offsets are removed.
     51  * UnicodeSet does not have a limit on the lengths of strings.
     52  */
     53 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
     54 public:
     55     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
     56 
     57     ~OffsetList() {
     58         if(list!=staticList) {
     59             uprv_free(list);
     60         }
     61     }
     62 
     63     // Call exactly once if the list is to be used.
     64     void setMaxLength(int32_t maxLength) {
     65         if(maxLength<=(int32_t)sizeof(staticList)) {
     66             capacity=(int32_t)sizeof(staticList);
     67         } else {
     68             UBool *l=(UBool *)uprv_malloc(maxLength);
     69             if(l!=NULL) {
     70                 list=l;
     71                 capacity=maxLength;
     72             }
     73         }
     74         uprv_memset(list, 0, capacity);
     75     }
     76 
     77     void clear() {
     78         uprv_memset(list, 0, capacity);
     79         start=length=0;
     80     }
     81 
     82     UBool isEmpty() const {
     83         return (UBool)(length==0);
     84     }
     85 
     86     // Reduce all stored offsets by delta, used when the current position
     87     // moves by delta.
     88     // There must not be any offsets lower than delta.
     89     // If there is an offset equal to delta, it is removed.
     90     // delta=[1..maxLength]
     91     void shift(int32_t delta) {
     92         int32_t i=start+delta;
     93         if(i>=capacity) {
     94             i-=capacity;
     95         }
     96         if(list[i]) {
     97             list[i]=FALSE;
     98             --length;
     99         }
    100         start=i;
    101     }
    102 
    103     // Add an offset. The list must not contain it yet.
    104     // offset=[1..maxLength]
    105     void addOffset(int32_t offset) {
    106         int32_t i=start+offset;
    107         if(i>=capacity) {
    108             i-=capacity;
    109         }
    110         list[i]=TRUE;
    111         ++length;
    112     }
    113 
    114     // offset=[1..maxLength]
    115     UBool containsOffset(int32_t offset) const {
    116         int32_t i=start+offset;
    117         if(i>=capacity) {
    118             i-=capacity;
    119         }
    120         return list[i];
    121     }
    122 
    123     // Find the lowest stored offset from a non-empty list, remove it,
    124     // and reduce all other offsets by this minimum.
    125     // Returns [1..maxLength].
    126     int32_t popMinimum() {
    127         // Look for the next offset in list[start+1..capacity-1].
    128         int32_t i=start, result;
    129         while(++i<capacity) {
    130             if(list[i]) {
    131                 list[i]=FALSE;
    132                 --length;
    133                 result=i-start;
    134                 start=i;
    135                 return result;
    136             }
    137         }
    138         // i==capacity
    139 
    140         // Wrap around and look for the next offset in list[0..start].
    141         // Since the list is not empty, there will be one.
    142         result=capacity-start;
    143         i=0;
    144         while(!list[i]) {
    145             ++i;
    146         }
    147         list[i]=FALSE;
    148         --length;
    149         start=i;
    150         return result+=i;
    151     }
    152 
    153 private:
    154     UBool *list;
    155     int32_t capacity;
    156     int32_t length;
    157     int32_t start;
    158 
    159     UBool staticList[16];
    160 };
    161 
    162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
    163 static int32_t
    164 getUTF8Length(const UChar *s, int32_t length) {
    165     UErrorCode errorCode=U_ZERO_ERROR;
    166     int32_t length8=0;
    167     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
    168     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
    169         return length8;
    170     } else {
    171         // The string contains an unpaired surrogate.
    172         // Ignore this string.
    173         return 0;
    174     }
    175 }
    176 
    177 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
    178 static int32_t
    179 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
    180     UErrorCode errorCode=U_ZERO_ERROR;
    181     int32_t length8=0;
    182     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
    183     if(U_SUCCESS(errorCode)) {
    184         return length8;
    185     } else {
    186         // The string contains an unpaired surrogate.
    187         // Ignore this string.
    188         return 0;
    189     }
    190 }
    191 
    192 static inline uint8_t
    193 makeSpanLengthByte(int32_t spanLength) {
    194     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
    195     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
    196 }
    197 
    198 // Construct for all variants of span(), or only for any one variant.
    199 // Initialize as little as possible, for single use.
    200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
    201                                            const UVector &setStrings,
    202                                            uint32_t which)
    203         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
    204           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
    205           utf8Length(0),
    206           maxLength16(0), maxLength8(0),
    207           all((UBool)(which==ALL)) {
    208     spanSet.retainAll(set);
    209     if(which&NOT_CONTAINED) {
    210         // Default to the same sets.
    211         // addToSpanNotSet() will create a separate set if necessary.
    212         pSpanNotSet=&spanSet;
    213     }
    214 
    215     // Determine if the strings even need to be taken into account at all for span() etc.
    216     // If any string is relevant, then all strings need to be used for
    217     // span(longest match) but only the relevant ones for span(while contained).
    218     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
    219     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
    220     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
    221     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
    222     int32_t stringsLength=strings.size();
    223 
    224     int32_t i, spanLength;
    225     UBool someRelevant=FALSE;
    226     for(i=0; i<stringsLength; ++i) {
    227         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    228         const UChar *s16=string.getBuffer();
    229         int32_t length16=string.length();
    230         UBool thisRelevant;
    231         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    232         if(spanLength<length16) {  // Relevant string.
    233             someRelevant=thisRelevant=TRUE;
    234         } else {
    235             thisRelevant=FALSE;
    236         }
    237         if((which&UTF16) && length16>maxLength16) {
    238             maxLength16=length16;
    239         }
    240         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
    241             int32_t length8=getUTF8Length(s16, length16);
    242             utf8Length+=length8;
    243             if(length8>maxLength8) {
    244                 maxLength8=length8;
    245             }
    246         }
    247     }
    248     if(!someRelevant) {
    249         maxLength16=maxLength8=0;
    250         return;
    251     }
    252 
    253     // Freeze after checking for the need to use strings at all because freezing
    254     // a set takes some time and memory which are wasted if there are no relevant strings.
    255     if(all) {
    256         spanSet.freeze();
    257     }
    258 
    259     uint8_t *spanBackLengths;
    260     uint8_t *spanUTF8Lengths;
    261     uint8_t *spanBackUTF8Lengths;
    262 
    263     // Allocate a block of meta data.
    264     int32_t allocSize;
    265     if(all) {
    266         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    267         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    268     } else {
    269         allocSize=stringsLength;  // One set of span lengths.
    270         if(which&UTF8) {
    271             // UTF-8 lengths and UTF-8 strings.
    272             allocSize+=stringsLength*4+utf8Length;
    273         }
    274     }
    275     if(allocSize<=(int32_t)sizeof(staticLengths)) {
    276         utf8Lengths=staticLengths;
    277     } else {
    278         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
    279         if(utf8Lengths==NULL) {
    280             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
    281             return;  // Out of memory.
    282         }
    283     }
    284 
    285     if(all) {
    286         // Store span lengths for all span() variants.
    287         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    288         spanBackLengths=spanLengths+stringsLength;
    289         spanUTF8Lengths=spanBackLengths+stringsLength;
    290         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
    291         utf8=spanBackUTF8Lengths+stringsLength;
    292     } else {
    293         // Store span lengths for only one span() variant.
    294         if(which&UTF8) {
    295             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    296             utf8=spanLengths+stringsLength;
    297         } else {
    298             spanLengths=(uint8_t *)utf8Lengths;
    299         }
    300         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
    301     }
    302 
    303     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
    304     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
    305 
    306     for(i=0; i<stringsLength; ++i) {
    307         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    308         const UChar *s16=string.getBuffer();
    309         int32_t length16=string.length();
    310         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    311         if(spanLength<length16) {  // Relevant string.
    312             if(which&UTF16) {
    313                 if(which&CONTAINED) {
    314                     if(which&FWD) {
    315                         spanLengths[i]=makeSpanLengthByte(spanLength);
    316                     }
    317                     if(which&BACK) {
    318                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
    319                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
    320                     }
    321                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    322                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
    323                 }
    324             }
    325             if(which&UTF8) {
    326                 uint8_t *s8=utf8+utf8Count;
    327                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    328                 utf8Count+=utf8Lengths[i]=length8;
    329                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
    330                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
    331                 } else {  // Relevant for UTF-8.
    332                     if(which&CONTAINED) {
    333                         if(which&FWD) {
    334                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
    335                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    336                         }
    337                         if(which&BACK) {
    338                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
    339                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    340                         }
    341                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    342                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
    343                     }
    344                 }
    345             }
    346             if(which&NOT_CONTAINED) {
    347                 // Add string start and end code points to the spanNotSet so that
    348                 // a span(while not contained) stops before any string.
    349                 UChar32 c;
    350                 if(which&FWD) {
    351                     int32_t len=0;
    352                     U16_NEXT(s16, len, length16, c);
    353                     addToSpanNotSet(c);
    354                 }
    355                 if(which&BACK) {
    356                     int32_t len=length16;
    357                     U16_PREV(s16, 0, len, c);
    358                     addToSpanNotSet(c);
    359                 }
    360             }
    361         } else {  // Irrelevant string.
    362             if(which&UTF8) {
    363                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
    364                     uint8_t *s8=utf8+utf8Count;
    365                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    366                     utf8Count+=utf8Lengths[i]=length8;
    367                 } else {
    368                     utf8Lengths[i]=0;
    369                 }
    370             }
    371             if(all) {
    372                 spanLengths[i]=spanBackLengths[i]=
    373                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
    374                         (uint8_t)ALL_CP_CONTAINED;
    375             } else {
    376                 // All spanXYZLengths pointers contain the same address.
    377                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
    378             }
    379         }
    380     }
    381 
    382     // Finish.
    383     if(all) {
    384         pSpanNotSet->freeze();
    385     }
    386 }
    387 
    388 // Copy constructor. Assumes which==ALL for a frozen set.
    389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
    390                                            const UVector &newParentSetStrings)
    391         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
    392           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
    393           utf8Length(otherStringSpan.utf8Length),
    394           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
    395           all(TRUE) {
    396     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
    397         pSpanNotSet=&spanSet;
    398     } else {
    399         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
    400     }
    401 
    402     // Allocate a block of meta data.
    403     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    404     int32_t stringsLength=strings.size();
    405     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    406     if(allocSize<=(int32_t)sizeof(staticLengths)) {
    407         utf8Lengths=staticLengths;
    408     } else {
    409         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
    410         if(utf8Lengths==NULL) {
    411             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
    412             return;  // Out of memory.
    413         }
    414     }
    415 
    416     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    417     utf8=spanLengths+stringsLength*4;
    418     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
    419 }
    420 
    421 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
    422     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
    423         delete pSpanNotSet;
    424     }
    425     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
    426         uprv_free(utf8Lengths);
    427     }
    428 }
    429 
    430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
    431     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
    432         if(spanSet.contains(c)) {
    433             return;  // Nothing to do.
    434         }
    435         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
    436         if(newSet==NULL) {
    437             return;  // Out of memory.
    438         } else {
    439             pSpanNotSet=newSet;
    440         }
    441     }
    442     pSpanNotSet->add(c);
    443 }
    444 
    445 // Compare strings without any argument checks. Requires length>0.
    446 static inline UBool
    447 matches16(const UChar *s, const UChar *t, int32_t length) {
    448     do {
    449         if(*s++!=*t++) {
    450             return FALSE;
    451         }
    452     } while(--length>0);
    453     return TRUE;
    454 }
    455 
    456 static inline UBool
    457 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
    458     do {
    459         if(*s++!=*t++) {
    460             return FALSE;
    461         }
    462     } while(--length>0);
    463     return TRUE;
    464 }
    465 
    466 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
    467 // at code point boundaries.
    468 // That is, each edge of a match must not be in the middle of a surrogate pair.
    469 static inline UBool
    470 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
    471     s+=start;
    472     limit-=start;
    473     return matches16(s, t, length) &&
    474            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
    475            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
    476 }
    477 
    478 // Does the set contain the next code point?
    479 // If so, return its length; otherwise return its negative length.
    480 static inline int32_t
    481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
    482     UChar c=*s, c2;
    483     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
    484         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
    485     }
    486     return set.contains(c) ? 1 : -1;
    487 }
    488 
    489 static inline int32_t
    490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
    491     UChar c=s[length-1], c2;
    492     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
    493         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
    494     }
    495     return set.contains(c) ? 1 : -1;
    496 }
    497 
    498 static inline int32_t
    499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    500     UChar32 c=*s;
    501     if((int8_t)c>=0) {
    502         return set.contains(c) ? 1 : -1;
    503     }
    504     // Take advantage of non-ASCII fastpaths in U8_NEXT().
    505     int32_t i=0;
    506     U8_NEXT(s, i, length, c);
    507     return set.contains(c) ? i : -i;
    508 }
    509 
    510 static inline int32_t
    511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    512     UChar32 c=s[length-1];
    513     if((int8_t)c>=0) {
    514         return set.contains(c) ? 1 : -1;
    515     }
    516     int32_t i=length-1;
    517     c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
    518     length-=i;
    519     return set.contains(c) ? length : -length;
    520 }
    521 
    522 /*
    523  * Note: In span() when spanLength==0 (after a string match, or at the beginning
    524  * after an empty code point span) and in spanNot() and spanNotUTF8(),
    525  * string matching could use a binary search
    526  * because all string matches are done from the same start index.
    527  *
    528  * For UTF-8, this would require a comparison function that returns UTF-16 order.
    529  *
    530  * This optimization should not be necessary for normal UnicodeSets because
    531  * most sets have no strings, and most sets with strings have
    532  * very few very short strings.
    533  * For cases with many strings, it might be better to use a different API
    534  * and implementation with a DFA (state machine).
    535  */
    536 
    537 /*
    538  * Algorithm for span(USET_SPAN_CONTAINED)
    539  *
    540  * Theoretical algorithm:
    541  * - Iterate through the string, and at each code point boundary:
    542  *   + If the code point there is in the set, then remember to continue after it.
    543  *   + If a set string matches at the current position, then remember to continue after it.
    544  *   + Either recursively span for each code point or string match,
    545  *     or recursively span for all but the shortest one and
    546  *     iteratively continue the span with the shortest local match.
    547  *   + Remember the longest recursive span (the farthest end point).
    548  *   + If there is no match at the current position, neither for the code point there
    549  *     nor for any set string, then stop and return the longest recursive span length.
    550  *
    551  * Optimized implementation:
    552  *
    553  * (We assume that most sets will have very few very short strings.
    554  * A span using a string-less set is extremely fast.)
    555  *
    556  * Create and cache a spanSet which contains all of the single code points
    557  * of the original set but none of its strings.
    558  *
    559  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    560  * - Loop:
    561  *   + Try to match each set string at the end of the spanLength.
    562  *     ~ Set strings that start with set-contained code points must be matched
    563  *       with a partial overlap because the recursive algorithm would have tried
    564  *       to match them at every position.
    565  *     ~ Set strings that entirely consist of set-contained code points
    566  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
    567  *       recursive algorithm would continue after them anyway
    568  *       and find the longest recursive match from their end.
    569  *     ~ Rather than recursing, note each end point of a set string match.
    570  *   + If no set string matched after spanSet.span(), then return
    571  *     with where the spanSet.span() ended.
    572  *   + If at least one set string matched after spanSet.span(), then
    573  *     pop the shortest string match end point and continue
    574  *     the loop, trying to match all set strings from there.
    575  *   + If at least one more set string matched after a previous string match,
    576  *     then test if the code point after the previous string match is also
    577  *     contained in the set.
    578  *     Continue the loop with the shortest end point of either this code point
    579  *     or a matching set string.
    580  *   + If no more set string matched after a previous string match,
    581  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    582  *     Stop if spanLength==0, otherwise continue the loop.
    583  *
    584  * By noting each end point of a set string match,
    585  * the function visits each string position at most once and finishes
    586  * in linear time.
    587  *
    588  * The recursive algorithm may visit the same string position many times
    589  * if multiple paths lead to it and finishes in exponential time.
    590  */
    591 
    592 /*
    593  * Algorithm for span(USET_SPAN_SIMPLE)
    594  *
    595  * Theoretical algorithm:
    596  * - Iterate through the string, and at each code point boundary:
    597  *   + If the code point there is in the set, then remember to continue after it.
    598  *   + If a set string matches at the current position, then remember to continue after it.
    599  *   + Continue from the farthest match position and ignore all others.
    600  *   + If there is no match at the current position,
    601  *     then stop and return the current position.
    602  *
    603  * Optimized implementation:
    604  *
    605  * (Same assumption and spanSet as above.)
    606  *
    607  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    608  * - Loop:
    609  *   + Try to match each set string at the end of the spanLength.
    610  *     ~ Set strings that start with set-contained code points must be matched
    611  *       with a partial overlap because the standard algorithm would have tried
    612  *       to match them earlier.
    613  *     ~ Set strings that entirely consist of set-contained code points
    614  *       must be matched with a full overlap because the longest-match algorithm
    615  *       would hide set string matches that end earlier.
    616  *       Such set strings need not be matched earlier inside the code point span
    617  *       because the standard algorithm would then have continued after
    618  *       the set string match anyway.
    619  *     ~ Remember the longest set string match (farthest end point) from the earliest
    620  *       starting point.
    621  *   + If no set string matched after spanSet.span(), then return
    622  *     with where the spanSet.span() ended.
    623  *   + If at least one set string matched, then continue the loop after the
    624  *     longest match from the earliest position.
    625  *   + If no more set string matched after a previous string match,
    626  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    627  *     Stop if spanLength==0, otherwise continue the loop.
    628  */
    629 
    630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    631     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    632         return spanNot(s, length);
    633     }
    634     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
    635     if(spanLength==length) {
    636         return length;
    637     }
    638 
    639     // Consider strings; they may overlap with the span.
    640     OffsetList offsets;
    641     if(spanCondition==USET_SPAN_CONTAINED) {
    642         // Use offset list to try all possibilities.
    643         offsets.setMaxLength(maxLength16);
    644     }
    645     int32_t pos=spanLength, rest=length-pos;
    646     int32_t i, stringsLength=strings.size();
    647     for(;;) {
    648         if(spanCondition==USET_SPAN_CONTAINED) {
    649             for(i=0; i<stringsLength; ++i) {
    650                 int32_t overlap=spanLengths[i];
    651                 if(overlap==ALL_CP_CONTAINED) {
    652                     continue;  // Irrelevant string.
    653                 }
    654                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    655                 const UChar *s16=string.getBuffer();
    656                 int32_t length16=string.length();
    657 
    658                 // Try to match this string at pos-overlap..pos.
    659                 if(overlap>=LONG_SPAN) {
    660                     overlap=length16;
    661                     // While contained: No point matching fully inside the code point span.
    662                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
    663                 }
    664                 if(overlap>spanLength) {
    665                     overlap=spanLength;
    666                 }
    667                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    668                 for(;;) {
    669                     if(inc>rest) {
    670                         break;
    671                     }
    672                     // Try to match if the increment is not listed already.
    673                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
    674                         if(inc==rest) {
    675                             return length;  // Reached the end of the string.
    676                         }
    677                         offsets.addOffset(inc);
    678                     }
    679                     if(overlap==0) {
    680                         break;
    681                     }
    682                     --overlap;
    683                     ++inc;
    684                 }
    685             }
    686         } else /* USET_SPAN_SIMPLE */ {
    687             int32_t maxInc=0, maxOverlap=0;
    688             for(i=0; i<stringsLength; ++i) {
    689                 int32_t overlap=spanLengths[i];
    690                 // For longest match, we do need to try to match even an all-contained string
    691                 // to find the match from the earliest start.
    692 
    693                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    694                 const UChar *s16=string.getBuffer();
    695                 int32_t length16=string.length();
    696 
    697                 // Try to match this string at pos-overlap..pos.
    698                 if(overlap>=LONG_SPAN) {
    699                     overlap=length16;
    700                     // Longest match: Need to match fully inside the code point span
    701                     // to find the match from the earliest start.
    702                 }
    703                 if(overlap>spanLength) {
    704                     overlap=spanLength;
    705                 }
    706                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    707                 for(;;) {
    708                     if(inc>rest || overlap<maxOverlap) {
    709                         break;
    710                     }
    711                     // Try to match if the string is longer or starts earlier.
    712                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
    713                         matches16CPB(s, pos-overlap, length, s16, length16)
    714                     ) {
    715                         maxInc=inc;  // Longest match from earliest start.
    716                         maxOverlap=overlap;
    717                         break;
    718                     }
    719                     --overlap;
    720                     ++inc;
    721                 }
    722             }
    723 
    724             if(maxInc!=0 || maxOverlap!=0) {
    725                 // Longest-match algorithm, and there was a string match.
    726                 // Simply continue after it.
    727                 pos+=maxInc;
    728                 rest-=maxInc;
    729                 if(rest==0) {
    730                     return length;  // Reached the end of the string.
    731                 }
    732                 spanLength=0;  // Match strings from after a string match.
    733                 continue;
    734             }
    735         }
    736         // Finished trying to match all strings at pos.
    737 
    738         if(spanLength!=0 || pos==0) {
    739             // The position is after an unlimited code point span (spanLength!=0),
    740             // not after a string match.
    741             // The only position where spanLength==0 after a span is pos==0.
    742             // Otherwise, an unlimited code point span is only tried again when no
    743             // strings match, and if such a non-initial span fails we stop.
    744             if(offsets.isEmpty()) {
    745                 return pos;  // No strings matched after a span.
    746             }
    747             // Match strings from after the next string match.
    748         } else {
    749             // The position is after a string match (or a single code point).
    750             if(offsets.isEmpty()) {
    751                 // No more strings matched after a previous string match.
    752                 // Try another code point span from after the last string match.
    753                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
    754                 if( spanLength==rest || // Reached the end of the string, or
    755                     spanLength==0       // neither strings nor span progressed.
    756                 ) {
    757                     return pos+spanLength;
    758                 }
    759                 pos+=spanLength;
    760                 rest-=spanLength;
    761                 continue;  // spanLength>0: Match strings from after a span.
    762             } else {
    763                 // Try to match only one code point from after a string match if some
    764                 // string matched beyond it, so that we try all possible positions
    765                 // and don't overshoot.
    766                 spanLength=spanOne(spanSet, s+pos, rest);
    767                 if(spanLength>0) {
    768                     if(spanLength==rest) {
    769                         return length;  // Reached the end of the string.
    770                     }
    771                     // Match strings after this code point.
    772                     // There cannot be any increments below it because UnicodeSet strings
    773                     // contain multiple code points.
    774                     pos+=spanLength;
    775                     rest-=spanLength;
    776                     offsets.shift(spanLength);
    777                     spanLength=0;
    778                     continue;  // Match strings from after a single code point.
    779                 }
    780                 // Match strings from after the next string match.
    781             }
    782         }
    783         int32_t minOffset=offsets.popMinimum();
    784         pos+=minOffset;
    785         rest-=minOffset;
    786         spanLength=0;  // Match strings from after a string match.
    787     }
    788 }
    789 
    790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    791     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    792         return spanNotBack(s, length);
    793     }
    794     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
    795     if(pos==0) {
    796         return 0;
    797     }
    798     int32_t spanLength=length-pos;
    799 
    800     // Consider strings; they may overlap with the span.
    801     OffsetList offsets;
    802     if(spanCondition==USET_SPAN_CONTAINED) {
    803         // Use offset list to try all possibilities.
    804         offsets.setMaxLength(maxLength16);
    805     }
    806     int32_t i, stringsLength=strings.size();
    807     uint8_t *spanBackLengths=spanLengths;
    808     if(all) {
    809         spanBackLengths+=stringsLength;
    810     }
    811     for(;;) {
    812         if(spanCondition==USET_SPAN_CONTAINED) {
    813             for(i=0; i<stringsLength; ++i) {
    814                 int32_t overlap=spanBackLengths[i];
    815                 if(overlap==ALL_CP_CONTAINED) {
    816                     continue;  // Irrelevant string.
    817                 }
    818                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    819                 const UChar *s16=string.getBuffer();
    820                 int32_t length16=string.length();
    821 
    822                 // Try to match this string at pos-(length16-overlap)..pos-length16.
    823                 if(overlap>=LONG_SPAN) {
    824                     overlap=length16;
    825                     // While contained: No point matching fully inside the code point span.
    826                     int32_t len1=0;
    827                     U16_FWD_1(s16, len1, overlap);
    828                     overlap-=len1;  // Length of the string minus the first code point.
    829                 }
    830                 if(overlap>spanLength) {
    831                     overlap=spanLength;
    832                 }
    833                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    834                 for(;;) {
    835                     if(dec>pos) {
    836                         break;
    837                     }
    838                     // Try to match if the decrement is not listed already.
    839                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
    840                         if(dec==pos) {
    841                             return 0;  // Reached the start of the string.
    842                         }
    843                         offsets.addOffset(dec);
    844                     }
    845                     if(overlap==0) {
    846                         break;
    847                     }
    848                     --overlap;
    849                     ++dec;
    850                 }
    851             }
    852         } else /* USET_SPAN_SIMPLE */ {
    853             int32_t maxDec=0, maxOverlap=0;
    854             for(i=0; i<stringsLength; ++i) {
    855                 int32_t overlap=spanBackLengths[i];
    856                 // For longest match, we do need to try to match even an all-contained string
    857                 // to find the match from the latest end.
    858 
    859                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    860                 const UChar *s16=string.getBuffer();
    861                 int32_t length16=string.length();
    862 
    863                 // Try to match this string at pos-(length16-overlap)..pos-length16.
    864                 if(overlap>=LONG_SPAN) {
    865                     overlap=length16;
    866                     // Longest match: Need to match fully inside the code point span
    867                     // to find the match from the latest end.
    868                 }
    869                 if(overlap>spanLength) {
    870                     overlap=spanLength;
    871                 }
    872                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    873                 for(;;) {
    874                     if(dec>pos || overlap<maxOverlap) {
    875                         break;
    876                     }
    877                     // Try to match if the string is longer or ends later.
    878                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
    879                         matches16CPB(s, pos-dec, length, s16, length16)
    880                     ) {
    881                         maxDec=dec;  // Longest match from latest end.
    882                         maxOverlap=overlap;
    883                         break;
    884                     }
    885                     --overlap;
    886                     ++dec;
    887                 }
    888             }
    889 
    890             if(maxDec!=0 || maxOverlap!=0) {
    891                 // Longest-match algorithm, and there was a string match.
    892                 // Simply continue before it.
    893                 pos-=maxDec;
    894                 if(pos==0) {
    895                     return 0;  // Reached the start of the string.
    896                 }
    897                 spanLength=0;  // Match strings from before a string match.
    898                 continue;
    899             }
    900         }
    901         // Finished trying to match all strings at pos.
    902 
    903         if(spanLength!=0 || pos==length) {
    904             // The position is before an unlimited code point span (spanLength!=0),
    905             // not before a string match.
    906             // The only position where spanLength==0 before a span is pos==length.
    907             // Otherwise, an unlimited code point span is only tried again when no
    908             // strings match, and if such a non-initial span fails we stop.
    909             if(offsets.isEmpty()) {
    910                 return pos;  // No strings matched before a span.
    911             }
    912             // Match strings from before the next string match.
    913         } else {
    914             // The position is before a string match (or a single code point).
    915             if(offsets.isEmpty()) {
    916                 // No more strings matched before a previous string match.
    917                 // Try another code point span from before the last string match.
    918                 int32_t oldPos=pos;
    919                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
    920                 spanLength=oldPos-pos;
    921                 if( pos==0 ||           // Reached the start of the string, or
    922                     spanLength==0       // neither strings nor span progressed.
    923                 ) {
    924                     return pos;
    925                 }
    926                 continue;  // spanLength>0: Match strings from before a span.
    927             } else {
    928                 // Try to match only one code point from before a string match if some
    929                 // string matched beyond it, so that we try all possible positions
    930                 // and don't overshoot.
    931                 spanLength=spanOneBack(spanSet, s, pos);
    932                 if(spanLength>0) {
    933                     if(spanLength==pos) {
    934                         return 0;  // Reached the start of the string.
    935                     }
    936                     // Match strings before this code point.
    937                     // There cannot be any decrements below it because UnicodeSet strings
    938                     // contain multiple code points.
    939                     pos-=spanLength;
    940                     offsets.shift(spanLength);
    941                     spanLength=0;
    942                     continue;  // Match strings from before a single code point.
    943                 }
    944                 // Match strings from before the next string match.
    945             }
    946         }
    947         pos-=offsets.popMinimum();
    948         spanLength=0;  // Match strings from before a string match.
    949     }
    950 }
    951 
    952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    953     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    954         return spanNotUTF8(s, length);
    955     }
    956     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
    957     if(spanLength==length) {
    958         return length;
    959     }
    960 
    961     // Consider strings; they may overlap with the span.
    962     OffsetList offsets;
    963     if(spanCondition==USET_SPAN_CONTAINED) {
    964         // Use offset list to try all possibilities.
    965         offsets.setMaxLength(maxLength8);
    966     }
    967     int32_t pos=spanLength, rest=length-pos;
    968     int32_t i, stringsLength=strings.size();
    969     uint8_t *spanUTF8Lengths=spanLengths;
    970     if(all) {
    971         spanUTF8Lengths+=2*stringsLength;
    972     }
    973     for(;;) {
    974         const uint8_t *s8=utf8;
    975         int32_t length8;
    976         if(spanCondition==USET_SPAN_CONTAINED) {
    977             for(i=0; i<stringsLength; ++i) {
    978                 length8=utf8Lengths[i];
    979                 if(length8==0) {
    980                     continue;  // String not representable in UTF-8.
    981                 }
    982                 int32_t overlap=spanUTF8Lengths[i];
    983                 if(overlap==ALL_CP_CONTAINED) {
    984                     s8+=length8;
    985                     continue;  // Irrelevant string.
    986                 }
    987 
    988                 // Try to match this string at pos-overlap..pos.
    989                 if(overlap>=LONG_SPAN) {
    990                     overlap=length8;
    991                     // While contained: No point matching fully inside the code point span.
    992                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
    993                 }
    994                 if(overlap>spanLength) {
    995                     overlap=spanLength;
    996                 }
    997                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
    998                 for(;;) {
    999                     if(inc>rest) {
   1000                         break;
   1001                     }
   1002                     // Try to match if the increment is not listed already.
   1003                     // Match at code point boundaries. (The UTF-8 strings were converted
   1004                     // from UTF-16 and are guaranteed to be well-formed.)
   1005                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
   1006                         !offsets.containsOffset(inc) &&
   1007                         matches8(s+pos-overlap, s8, length8)
   1008 
   1009                     ) {
   1010                         if(inc==rest) {
   1011                             return length;  // Reached the end of the string.
   1012                         }
   1013                         offsets.addOffset(inc);
   1014                     }
   1015                     if(overlap==0) {
   1016                         break;
   1017                     }
   1018                     --overlap;
   1019                     ++inc;
   1020                 }
   1021                 s8+=length8;
   1022             }
   1023         } else /* USET_SPAN_SIMPLE */ {
   1024             int32_t maxInc=0, maxOverlap=0;
   1025             for(i=0; i<stringsLength; ++i) {
   1026                 length8=utf8Lengths[i];
   1027                 if(length8==0) {
   1028                     continue;  // String not representable in UTF-8.
   1029                 }
   1030                 int32_t overlap=spanUTF8Lengths[i];
   1031                 // For longest match, we do need to try to match even an all-contained string
   1032                 // to find the match from the earliest start.
   1033 
   1034                 // Try to match this string at pos-overlap..pos.
   1035                 if(overlap>=LONG_SPAN) {
   1036                     overlap=length8;
   1037                     // Longest match: Need to match fully inside the code point span
   1038                     // to find the match from the earliest start.
   1039                 }
   1040                 if(overlap>spanLength) {
   1041                     overlap=spanLength;
   1042                 }
   1043                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1044                 for(;;) {
   1045                     if(inc>rest || overlap<maxOverlap) {
   1046                         break;
   1047                     }
   1048                     // Try to match if the string is longer or starts earlier.
   1049                     // Match at code point boundaries. (The UTF-8 strings were converted
   1050                     // from UTF-16 and are guaranteed to be well-formed.)
   1051                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
   1052                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
   1053                         matches8(s+pos-overlap, s8, length8)
   1054 
   1055                     ) {
   1056                         maxInc=inc;  // Longest match from earliest start.
   1057                         maxOverlap=overlap;
   1058                         break;
   1059                     }
   1060                     --overlap;
   1061                     ++inc;
   1062                 }
   1063                 s8+=length8;
   1064             }
   1065 
   1066             if(maxInc!=0 || maxOverlap!=0) {
   1067                 // Longest-match algorithm, and there was a string match.
   1068                 // Simply continue after it.
   1069                 pos+=maxInc;
   1070                 rest-=maxInc;
   1071                 if(rest==0) {
   1072                     return length;  // Reached the end of the string.
   1073                 }
   1074                 spanLength=0;  // Match strings from after a string match.
   1075                 continue;
   1076             }
   1077         }
   1078         // Finished trying to match all strings at pos.
   1079 
   1080         if(spanLength!=0 || pos==0) {
   1081             // The position is after an unlimited code point span (spanLength!=0),
   1082             // not after a string match.
   1083             // The only position where spanLength==0 after a span is pos==0.
   1084             // Otherwise, an unlimited code point span is only tried again when no
   1085             // strings match, and if such a non-initial span fails we stop.
   1086             if(offsets.isEmpty()) {
   1087                 return pos;  // No strings matched after a span.
   1088             }
   1089             // Match strings from after the next string match.
   1090         } else {
   1091             // The position is after a string match (or a single code point).
   1092             if(offsets.isEmpty()) {
   1093                 // No more strings matched after a previous string match.
   1094                 // Try another code point span from after the last string match.
   1095                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
   1096                 if( spanLength==rest || // Reached the end of the string, or
   1097                     spanLength==0       // neither strings nor span progressed.
   1098                 ) {
   1099                     return pos+spanLength;
   1100                 }
   1101                 pos+=spanLength;
   1102                 rest-=spanLength;
   1103                 continue;  // spanLength>0: Match strings from after a span.
   1104             } else {
   1105                 // Try to match only one code point from after a string match if some
   1106                 // string matched beyond it, so that we try all possible positions
   1107                 // and don't overshoot.
   1108                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
   1109                 if(spanLength>0) {
   1110                     if(spanLength==rest) {
   1111                         return length;  // Reached the end of the string.
   1112                     }
   1113                     // Match strings after this code point.
   1114                     // There cannot be any increments below it because UnicodeSet strings
   1115                     // contain multiple code points.
   1116                     pos+=spanLength;
   1117                     rest-=spanLength;
   1118                     offsets.shift(spanLength);
   1119                     spanLength=0;
   1120                     continue;  // Match strings from after a single code point.
   1121                 }
   1122                 // Match strings from after the next string match.
   1123             }
   1124         }
   1125         int32_t minOffset=offsets.popMinimum();
   1126         pos+=minOffset;
   1127         rest-=minOffset;
   1128         spanLength=0;  // Match strings from after a string match.
   1129     }
   1130 }
   1131 
   1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   1133     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   1134         return spanNotBackUTF8(s, length);
   1135     }
   1136     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
   1137     if(pos==0) {
   1138         return 0;
   1139     }
   1140     int32_t spanLength=length-pos;
   1141 
   1142     // Consider strings; they may overlap with the span.
   1143     OffsetList offsets;
   1144     if(spanCondition==USET_SPAN_CONTAINED) {
   1145         // Use offset list to try all possibilities.
   1146         offsets.setMaxLength(maxLength8);
   1147     }
   1148     int32_t i, stringsLength=strings.size();
   1149     uint8_t *spanBackUTF8Lengths=spanLengths;
   1150     if(all) {
   1151         spanBackUTF8Lengths+=3*stringsLength;
   1152     }
   1153     for(;;) {
   1154         const uint8_t *s8=utf8;
   1155         int32_t length8;
   1156         if(spanCondition==USET_SPAN_CONTAINED) {
   1157             for(i=0; i<stringsLength; ++i) {
   1158                 length8=utf8Lengths[i];
   1159                 if(length8==0) {
   1160                     continue;  // String not representable in UTF-8.
   1161                 }
   1162                 int32_t overlap=spanBackUTF8Lengths[i];
   1163                 if(overlap==ALL_CP_CONTAINED) {
   1164                     s8+=length8;
   1165                     continue;  // Irrelevant string.
   1166                 }
   1167 
   1168                 // Try to match this string at pos-(length8-overlap)..pos-length8.
   1169                 if(overlap>=LONG_SPAN) {
   1170                     overlap=length8;
   1171                     // While contained: No point matching fully inside the code point span.
   1172                     int32_t len1=0;
   1173                     U8_FWD_1(s8, len1, overlap);
   1174                     overlap-=len1;  // Length of the string minus the first code point.
   1175                 }
   1176                 if(overlap>spanLength) {
   1177                     overlap=spanLength;
   1178                 }
   1179                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1180                 for(;;) {
   1181                     if(dec>pos) {
   1182                         break;
   1183                     }
   1184                     // Try to match if the decrement is not listed already.
   1185                     // Match at code point boundaries. (The UTF-8 strings were converted
   1186                     // from UTF-16 and are guaranteed to be well-formed.)
   1187                     if( !U8_IS_TRAIL(s[pos-dec]) &&
   1188                         !offsets.containsOffset(dec) &&
   1189                         matches8(s+pos-dec, s8, length8)
   1190                     ) {
   1191                         if(dec==pos) {
   1192                             return 0;  // Reached the start of the string.
   1193                         }
   1194                         offsets.addOffset(dec);
   1195                     }
   1196                     if(overlap==0) {
   1197                         break;
   1198                     }
   1199                     --overlap;
   1200                     ++dec;
   1201                 }
   1202                 s8+=length8;
   1203             }
   1204         } else /* USET_SPAN_SIMPLE */ {
   1205             int32_t maxDec=0, maxOverlap=0;
   1206             for(i=0; i<stringsLength; ++i) {
   1207                 length8=utf8Lengths[i];
   1208                 if(length8==0) {
   1209                     continue;  // String not representable in UTF-8.
   1210                 }
   1211                 int32_t overlap=spanBackUTF8Lengths[i];
   1212                 // For longest match, we do need to try to match even an all-contained string
   1213                 // to find the match from the latest end.
   1214 
   1215                 // Try to match this string at pos-(length8-overlap)..pos-length8.
   1216                 if(overlap>=LONG_SPAN) {
   1217                     overlap=length8;
   1218                     // Longest match: Need to match fully inside the code point span
   1219                     // to find the match from the latest end.
   1220                 }
   1221                 if(overlap>spanLength) {
   1222                     overlap=spanLength;
   1223                 }
   1224                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1225                 for(;;) {
   1226                     if(dec>pos || overlap<maxOverlap) {
   1227                         break;
   1228                     }
   1229                     // Try to match if the string is longer or ends later.
   1230                     // Match at code point boundaries. (The UTF-8 strings were converted
   1231                     // from UTF-16 and are guaranteed to be well-formed.)
   1232                     if( !U8_IS_TRAIL(s[pos-dec]) &&
   1233                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
   1234                         matches8(s+pos-dec, s8, length8)
   1235                     ) {
   1236                         maxDec=dec;  // Longest match from latest end.
   1237                         maxOverlap=overlap;
   1238                         break;
   1239                     }
   1240                     --overlap;
   1241                     ++dec;
   1242                 }
   1243                 s8+=length8;
   1244             }
   1245 
   1246             if(maxDec!=0 || maxOverlap!=0) {
   1247                 // Longest-match algorithm, and there was a string match.
   1248                 // Simply continue before it.
   1249                 pos-=maxDec;
   1250                 if(pos==0) {
   1251                     return 0;  // Reached the start of the string.
   1252                 }
   1253                 spanLength=0;  // Match strings from before a string match.
   1254                 continue;
   1255             }
   1256         }
   1257         // Finished trying to match all strings at pos.
   1258 
   1259         if(spanLength!=0 || pos==length) {
   1260             // The position is before an unlimited code point span (spanLength!=0),
   1261             // not before a string match.
   1262             // The only position where spanLength==0 before a span is pos==length.
   1263             // Otherwise, an unlimited code point span is only tried again when no
   1264             // strings match, and if such a non-initial span fails we stop.
   1265             if(offsets.isEmpty()) {
   1266                 return pos;  // No strings matched before a span.
   1267             }
   1268             // Match strings from before the next string match.
   1269         } else {
   1270             // The position is before a string match (or a single code point).
   1271             if(offsets.isEmpty()) {
   1272                 // No more strings matched before a previous string match.
   1273                 // Try another code point span from before the last string match.
   1274                 int32_t oldPos=pos;
   1275                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
   1276                 spanLength=oldPos-pos;
   1277                 if( pos==0 ||           // Reached the start of the string, or
   1278                     spanLength==0       // neither strings nor span progressed.
   1279                 ) {
   1280                     return pos;
   1281                 }
   1282                 continue;  // spanLength>0: Match strings from before a span.
   1283             } else {
   1284                 // Try to match only one code point from before a string match if some
   1285                 // string matched beyond it, so that we try all possible positions
   1286                 // and don't overshoot.
   1287                 spanLength=spanOneBackUTF8(spanSet, s, pos);
   1288                 if(spanLength>0) {
   1289                     if(spanLength==pos) {
   1290                         return 0;  // Reached the start of the string.
   1291                     }
   1292                     // Match strings before this code point.
   1293                     // There cannot be any decrements below it because UnicodeSet strings
   1294                     // contain multiple code points.
   1295                     pos-=spanLength;
   1296                     offsets.shift(spanLength);
   1297                     spanLength=0;
   1298                     continue;  // Match strings from before a single code point.
   1299                 }
   1300                 // Match strings from before the next string match.
   1301             }
   1302         }
   1303         pos-=offsets.popMinimum();
   1304         spanLength=0;  // Match strings from before a string match.
   1305     }
   1306 }
   1307 
   1308 /*
   1309  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
   1310  *
   1311  * Theoretical algorithm:
   1312  * - Iterate through the string, and at each code point boundary:
   1313  *   + If the code point there is in the set, then return with the current position.
   1314  *   + If a set string matches at the current position, then return with the current position.
   1315  *
   1316  * Optimized implementation:
   1317  *
   1318  * (Same assumption as for span() above.)
   1319  *
   1320  * Create and cache a spanNotSet which contains all of the single code points
   1321  * of the original set but none of its strings.
   1322  * For each set string add its initial code point to the spanNotSet.
   1323  * (Also add its final code point for spanNotBack().)
   1324  *
   1325  * - Loop:
   1326  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
   1327  *   + If the current code point is in the original set, then
   1328  *     return the current position.
   1329  *   + If any set string matches at the current position, then
   1330  *     return the current position.
   1331  *   + If there is no match at the current position, neither for the code point there
   1332  *     nor for any set string, then skip this code point and continue the loop.
   1333  *     This happens for set-string-initial code points that were added to spanNotSet
   1334  *     when there is not actually a match for such a set string.
   1335  */
   1336 
   1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
   1338     int32_t pos=0, rest=length;
   1339     int32_t i, stringsLength=strings.size();
   1340     do {
   1341         // Span until we find a code point from the set,
   1342         // or a code point that starts or ends some string.
   1343         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1344         if(i==rest) {
   1345             return length;  // Reached the end of the string.
   1346         }
   1347         pos+=i;
   1348         rest-=i;
   1349 
   1350         // Check whether the current code point is in the original set,
   1351         // without the string starts and ends.
   1352         int32_t cpLength=spanOne(spanSet, s+pos, rest);
   1353         if(cpLength>0) {
   1354             return pos;  // There is a set element at pos.
   1355         }
   1356 
   1357         // Try to match the strings at pos.
   1358         for(i=0; i<stringsLength; ++i) {
   1359             if(spanLengths[i]==ALL_CP_CONTAINED) {
   1360                 continue;  // Irrelevant string.
   1361             }
   1362             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   1363             const UChar *s16=string.getBuffer();
   1364             int32_t length16=string.length();
   1365             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
   1366                 return pos;  // There is a set element at pos.
   1367             }
   1368         }
   1369 
   1370         // The span(while not contained) ended on a string start/end which is
   1371         // not in the original set. Skip this code point and continue.
   1372         // cpLength<0
   1373         pos-=cpLength;
   1374         rest+=cpLength;
   1375     } while(rest!=0);
   1376     return length;  // Reached the end of the string.
   1377 }
   1378 
   1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
   1380     int32_t pos=length;
   1381     int32_t i, stringsLength=strings.size();
   1382     do {
   1383         // Span until we find a code point from the set,
   1384         // or a code point that starts or ends some string.
   1385         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
   1386         if(pos==0) {
   1387             return 0;  // Reached the start of the string.
   1388         }
   1389 
   1390         // Check whether the current code point is in the original set,
   1391         // without the string starts and ends.
   1392         int32_t cpLength=spanOneBack(spanSet, s, pos);
   1393         if(cpLength>0) {
   1394             return pos;  // There is a set element at pos.
   1395         }
   1396 
   1397         // Try to match the strings at pos.
   1398         for(i=0; i<stringsLength; ++i) {
   1399             // Use spanLengths rather than a spanBackLengths pointer because
   1400             // it is easier and we only need to know whether the string is irrelevant
   1401             // which is the same in either array.
   1402             if(spanLengths[i]==ALL_CP_CONTAINED) {
   1403                 continue;  // Irrelevant string.
   1404             }
   1405             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   1406             const UChar *s16=string.getBuffer();
   1407             int32_t length16=string.length();
   1408             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
   1409                 return pos;  // There is a set element at pos.
   1410             }
   1411         }
   1412 
   1413         // The span(while not contained) ended on a string start/end which is
   1414         // not in the original set. Skip this code point and continue.
   1415         // cpLength<0
   1416         pos+=cpLength;
   1417     } while(pos!=0);
   1418     return 0;  // Reached the start of the string.
   1419 }
   1420 
   1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
   1422     int32_t pos=0, rest=length;
   1423     int32_t i, stringsLength=strings.size();
   1424     uint8_t *spanUTF8Lengths=spanLengths;
   1425     if(all) {
   1426         spanUTF8Lengths+=2*stringsLength;
   1427     }
   1428     do {
   1429         // Span until we find a code point from the set,
   1430         // or a code point that starts or ends some string.
   1431         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1432         if(i==rest) {
   1433             return length;  // Reached the end of the string.
   1434         }
   1435         pos+=i;
   1436         rest-=i;
   1437 
   1438         // Check whether the current code point is in the original set,
   1439         // without the string starts and ends.
   1440         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
   1441         if(cpLength>0) {
   1442             return pos;  // There is a set element at pos.
   1443         }
   1444 
   1445         // Try to match the strings at pos.
   1446         const uint8_t *s8=utf8;
   1447         int32_t length8;
   1448         for(i=0; i<stringsLength; ++i) {
   1449             length8=utf8Lengths[i];
   1450             // ALL_CP_CONTAINED: Irrelevant string.
   1451             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
   1452                 return pos;  // There is a set element at pos.
   1453             }
   1454             s8+=length8;
   1455         }
   1456 
   1457         // The span(while not contained) ended on a string start/end which is
   1458         // not in the original set. Skip this code point and continue.
   1459         // cpLength<0
   1460         pos-=cpLength;
   1461         rest+=cpLength;
   1462     } while(rest!=0);
   1463     return length;  // Reached the end of the string.
   1464 }
   1465 
   1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
   1467     int32_t pos=length;
   1468     int32_t i, stringsLength=strings.size();
   1469     uint8_t *spanBackUTF8Lengths=spanLengths;
   1470     if(all) {
   1471         spanBackUTF8Lengths+=3*stringsLength;
   1472     }
   1473     do {
   1474         // Span until we find a code point from the set,
   1475         // or a code point that starts or ends some string.
   1476         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
   1477         if(pos==0) {
   1478             return 0;  // Reached the start of the string.
   1479         }
   1480 
   1481         // Check whether the current code point is in the original set,
   1482         // without the string starts and ends.
   1483         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
   1484         if(cpLength>0) {
   1485             return pos;  // There is a set element at pos.
   1486         }
   1487 
   1488         // Try to match the strings at pos.
   1489         const uint8_t *s8=utf8;
   1490         int32_t length8;
   1491         for(i=0; i<stringsLength; ++i) {
   1492             length8=utf8Lengths[i];
   1493             // ALL_CP_CONTAINED: Irrelevant string.
   1494             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
   1495                 return pos;  // There is a set element at pos.
   1496             }
   1497             s8+=length8;
   1498         }
   1499 
   1500         // The span(while not contained) ended on a string start/end which is
   1501         // not in the original set. Skip this code point and continue.
   1502         // cpLength<0
   1503         pos+=cpLength;
   1504     } while(pos!=0);
   1505     return 0;  // Reached the start of the string.
   1506 }
   1507 
   1508 U_NAMESPACE_END
   1509