Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2007-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  unisetspan.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2007mar01
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/ustring.h"
     20 #include "unicode/utf8.h"
     21 #include "unicode/utf16.h"
     22 #include "cmemory.h"
     23 #include "uvector.h"
     24 #include "unisetspan.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 /*
     29  * List of offsets from the current position from where to try matching
     30  * a code point or a string.
     31  * Store offsets rather than indexes to simplify the code and use the same list
     32  * for both increments (in span()) and decrements (in spanBack()).
     33  *
     34  * Assumption: The maximum offset is limited, and the offsets that are stored
     35  * at any one time are relatively dense, that is, there are normally no gaps of
     36  * hundreds or thousands of offset values.
     37  *
     38  * The implementation uses a circular buffer of byte flags,
     39  * each indicating whether the corresponding offset is in the list.
     40  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
     41  * physically moving part of the list.
     42  *
     43  * Note: In principle, the caller should setMaxLength() to the maximum of the
     44  * max string length and U16_LENGTH/U8_LENGTH to account for
     45  * "long" single code points.
     46  * However, this implementation uses at least a staticList with more than
     47  * U8_LENGTH entries anyway.
     48  *
     49  * Note: If maxLength were guaranteed to be no more than 32 or 64,
     50  * the list could be stored as bit flags in a single integer.
     51  * Rather than handling a circular buffer with a start list index,
     52  * the integer would simply be shifted when lower offsets are removed.
     53  * UnicodeSet does not have a limit on the lengths of strings.
     54  */
     55 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
     56 public:
     57     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
     58 
     59     ~OffsetList() {
     60         if(list!=staticList) {
     61             uprv_free(list);
     62         }
     63     }
     64 
     65     // Call exactly once if the list is to be used.
     66     void setMaxLength(int32_t maxLength) {
     67         if(maxLength<=(int32_t)sizeof(staticList)) {
     68             capacity=(int32_t)sizeof(staticList);
     69         } else {
     70             UBool *l=(UBool *)uprv_malloc(maxLength);
     71             if(l!=NULL) {
     72                 list=l;
     73                 capacity=maxLength;
     74             }
     75         }
     76         uprv_memset(list, 0, capacity);
     77     }
     78 
     79     void clear() {
     80         uprv_memset(list, 0, capacity);
     81         start=length=0;
     82     }
     83 
     84     UBool isEmpty() const {
     85         return (UBool)(length==0);
     86     }
     87 
     88     // Reduce all stored offsets by delta, used when the current position
     89     // moves by delta.
     90     // There must not be any offsets lower than delta.
     91     // If there is an offset equal to delta, it is removed.
     92     // delta=[1..maxLength]
     93     void shift(int32_t delta) {
     94         int32_t i=start+delta;
     95         if(i>=capacity) {
     96             i-=capacity;
     97         }
     98         if(list[i]) {
     99             list[i]=FALSE;
    100             --length;
    101         }
    102         start=i;
    103     }
    104 
    105     // Add an offset. The list must not contain it yet.
    106     // offset=[1..maxLength]
    107     void addOffset(int32_t offset) {
    108         int32_t i=start+offset;
    109         if(i>=capacity) {
    110             i-=capacity;
    111         }
    112         list[i]=TRUE;
    113         ++length;
    114     }
    115 
    116     // offset=[1..maxLength]
    117     UBool containsOffset(int32_t offset) const {
    118         int32_t i=start+offset;
    119         if(i>=capacity) {
    120             i-=capacity;
    121         }
    122         return list[i];
    123     }
    124 
    125     // Find the lowest stored offset from a non-empty list, remove it,
    126     // and reduce all other offsets by this minimum.
    127     // Returns [1..maxLength].
    128     int32_t popMinimum() {
    129         // Look for the next offset in list[start+1..capacity-1].
    130         int32_t i=start, result;
    131         while(++i<capacity) {
    132             if(list[i]) {
    133                 list[i]=FALSE;
    134                 --length;
    135                 result=i-start;
    136                 start=i;
    137                 return result;
    138             }
    139         }
    140         // i==capacity
    141 
    142         // Wrap around and look for the next offset in list[0..start].
    143         // Since the list is not empty, there will be one.
    144         result=capacity-start;
    145         i=0;
    146         while(!list[i]) {
    147             ++i;
    148         }
    149         list[i]=FALSE;
    150         --length;
    151         start=i;
    152         return result+=i;
    153     }
    154 
    155 private:
    156     UBool *list;
    157     int32_t capacity;
    158     int32_t length;
    159     int32_t start;
    160 
    161     UBool staticList[16];
    162 };
    163 
    164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
    165 static int32_t
    166 getUTF8Length(const UChar *s, int32_t length) {
    167     UErrorCode errorCode=U_ZERO_ERROR;
    168     int32_t length8=0;
    169     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
    170     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
    171         return length8;
    172     } else {
    173         // The string contains an unpaired surrogate.
    174         // Ignore this string.
    175         return 0;
    176     }
    177 }
    178 
    179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
    180 static int32_t
    181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
    182     UErrorCode errorCode=U_ZERO_ERROR;
    183     int32_t length8=0;
    184     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
    185     if(U_SUCCESS(errorCode)) {
    186         return length8;
    187     } else {
    188         // The string contains an unpaired surrogate.
    189         // Ignore this string.
    190         return 0;
    191     }
    192 }
    193 
    194 static inline uint8_t
    195 makeSpanLengthByte(int32_t spanLength) {
    196     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
    197     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
    198 }
    199 
    200 // Construct for all variants of span(), or only for any one variant.
    201 // Initialize as little as possible, for single use.
    202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
    203                                            const UVector &setStrings,
    204                                            uint32_t which)
    205         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
    206           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
    207           utf8Length(0),
    208           maxLength16(0), maxLength8(0),
    209           all((UBool)(which==ALL)) {
    210     spanSet.retainAll(set);
    211     if(which&NOT_CONTAINED) {
    212         // Default to the same sets.
    213         // addToSpanNotSet() will create a separate set if necessary.
    214         pSpanNotSet=&spanSet;
    215     }
    216 
    217     // Determine if the strings even need to be taken into account at all for span() etc.
    218     // If any string is relevant, then all strings need to be used for
    219     // span(longest match) but only the relevant ones for span(while contained).
    220     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
    221     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
    222     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
    223     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
    224     int32_t stringsLength=strings.size();
    225 
    226     int32_t i, spanLength;
    227     UBool someRelevant=FALSE;
    228     for(i=0; i<stringsLength; ++i) {
    229         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    230         const UChar *s16=string.getBuffer();
    231         int32_t length16=string.length();
    232         UBool thisRelevant;
    233         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    234         if(spanLength<length16) {  // Relevant string.
    235             someRelevant=thisRelevant=TRUE;
    236         } else {
    237             thisRelevant=FALSE;
    238         }
    239         if((which&UTF16) && length16>maxLength16) {
    240             maxLength16=length16;
    241         }
    242         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
    243             int32_t length8=getUTF8Length(s16, length16);
    244             utf8Length+=length8;
    245             if(length8>maxLength8) {
    246                 maxLength8=length8;
    247             }
    248         }
    249     }
    250     if(!someRelevant) {
    251         maxLength16=maxLength8=0;
    252         return;
    253     }
    254 
    255     // Freeze after checking for the need to use strings at all because freezing
    256     // a set takes some time and memory which are wasted if there are no relevant strings.
    257     if(all) {
    258         spanSet.freeze();
    259     }
    260 
    261     uint8_t *spanBackLengths;
    262     uint8_t *spanUTF8Lengths;
    263     uint8_t *spanBackUTF8Lengths;
    264 
    265     // Allocate a block of meta data.
    266     int32_t allocSize;
    267     if(all) {
    268         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    269         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    270     } else {
    271         allocSize=stringsLength;  // One set of span lengths.
    272         if(which&UTF8) {
    273             // UTF-8 lengths and UTF-8 strings.
    274             allocSize+=stringsLength*4+utf8Length;
    275         }
    276     }
    277     if(allocSize<=(int32_t)sizeof(staticLengths)) {
    278         utf8Lengths=staticLengths;
    279     } else {
    280         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
    281         if(utf8Lengths==NULL) {
    282             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
    283             return;  // Out of memory.
    284         }
    285     }
    286 
    287     if(all) {
    288         // Store span lengths for all span() variants.
    289         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    290         spanBackLengths=spanLengths+stringsLength;
    291         spanUTF8Lengths=spanBackLengths+stringsLength;
    292         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
    293         utf8=spanBackUTF8Lengths+stringsLength;
    294     } else {
    295         // Store span lengths for only one span() variant.
    296         if(which&UTF8) {
    297             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    298             utf8=spanLengths+stringsLength;
    299         } else {
    300             spanLengths=(uint8_t *)utf8Lengths;
    301         }
    302         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
    303     }
    304 
    305     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
    306     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
    307 
    308     for(i=0; i<stringsLength; ++i) {
    309         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    310         const UChar *s16=string.getBuffer();
    311         int32_t length16=string.length();
    312         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    313         if(spanLength<length16) {  // Relevant string.
    314             if(which&UTF16) {
    315                 if(which&CONTAINED) {
    316                     if(which&FWD) {
    317                         spanLengths[i]=makeSpanLengthByte(spanLength);
    318                     }
    319                     if(which&BACK) {
    320                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
    321                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
    322                     }
    323                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    324                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
    325                 }
    326             }
    327             if(which&UTF8) {
    328                 uint8_t *s8=utf8+utf8Count;
    329                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    330                 utf8Count+=utf8Lengths[i]=length8;
    331                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
    332                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
    333                 } else {  // Relevant for UTF-8.
    334                     if(which&CONTAINED) {
    335                         if(which&FWD) {
    336                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
    337                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    338                         }
    339                         if(which&BACK) {
    340                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
    341                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    342                         }
    343                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    344                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
    345                     }
    346                 }
    347             }
    348             if(which&NOT_CONTAINED) {
    349                 // Add string start and end code points to the spanNotSet so that
    350                 // a span(while not contained) stops before any string.
    351                 UChar32 c;
    352                 if(which&FWD) {
    353                     int32_t len=0;
    354                     U16_NEXT(s16, len, length16, c);
    355                     addToSpanNotSet(c);
    356                 }
    357                 if(which&BACK) {
    358                     int32_t len=length16;
    359                     U16_PREV(s16, 0, len, c);
    360                     addToSpanNotSet(c);
    361                 }
    362             }
    363         } else {  // Irrelevant string.
    364             if(which&UTF8) {
    365                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
    366                     uint8_t *s8=utf8+utf8Count;
    367                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    368                     utf8Count+=utf8Lengths[i]=length8;
    369                 } else {
    370                     utf8Lengths[i]=0;
    371                 }
    372             }
    373             if(all) {
    374                 spanLengths[i]=spanBackLengths[i]=
    375                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
    376                         (uint8_t)ALL_CP_CONTAINED;
    377             } else {
    378                 // All spanXYZLengths pointers contain the same address.
    379                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
    380             }
    381         }
    382     }
    383 
    384     // Finish.
    385     if(all) {
    386         pSpanNotSet->freeze();
    387     }
    388 }
    389 
    390 // Copy constructor. Assumes which==ALL for a frozen set.
    391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
    392                                            const UVector &newParentSetStrings)
    393         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
    394           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
    395           utf8Length(otherStringSpan.utf8Length),
    396           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
    397           all(TRUE) {
    398     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
    399         pSpanNotSet=&spanSet;
    400     } else {
    401         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
    402     }
    403 
    404     // Allocate a block of meta data.
    405     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    406     int32_t stringsLength=strings.size();
    407     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    408     if(allocSize<=(int32_t)sizeof(staticLengths)) {
    409         utf8Lengths=staticLengths;
    410     } else {
    411         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
    412         if(utf8Lengths==NULL) {
    413             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
    414             return;  // Out of memory.
    415         }
    416     }
    417 
    418     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
    419     utf8=spanLengths+stringsLength*4;
    420     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
    421 }
    422 
    423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
    424     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
    425         delete pSpanNotSet;
    426     }
    427     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
    428         uprv_free(utf8Lengths);
    429     }
    430 }
    431 
    432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
    433     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
    434         if(spanSet.contains(c)) {
    435             return;  // Nothing to do.
    436         }
    437         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
    438         if(newSet==NULL) {
    439             return;  // Out of memory.
    440         } else {
    441             pSpanNotSet=newSet;
    442         }
    443     }
    444     pSpanNotSet->add(c);
    445 }
    446 
    447 // Compare strings without any argument checks. Requires length>0.
    448 static inline UBool
    449 matches16(const UChar *s, const UChar *t, int32_t length) {
    450     do {
    451         if(*s++!=*t++) {
    452             return FALSE;
    453         }
    454     } while(--length>0);
    455     return TRUE;
    456 }
    457 
    458 static inline UBool
    459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
    460     do {
    461         if(*s++!=*t++) {
    462             return FALSE;
    463         }
    464     } while(--length>0);
    465     return TRUE;
    466 }
    467 
    468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
    469 // at code point boundaries.
    470 // That is, each edge of a match must not be in the middle of a surrogate pair.
    471 static inline UBool
    472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
    473     s+=start;
    474     limit-=start;
    475     return matches16(s, t, length) &&
    476            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
    477            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
    478 }
    479 
    480 // Does the set contain the next code point?
    481 // If so, return its length; otherwise return its negative length.
    482 static inline int32_t
    483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
    484     UChar c=*s, c2;
    485     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
    486         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
    487     }
    488     return set.contains(c) ? 1 : -1;
    489 }
    490 
    491 static inline int32_t
    492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
    493     UChar c=s[length-1], c2;
    494     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
    495         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
    496     }
    497     return set.contains(c) ? 1 : -1;
    498 }
    499 
    500 static inline int32_t
    501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    502     UChar32 c=*s;
    503     if((int8_t)c>=0) {
    504         return set.contains(c) ? 1 : -1;
    505     }
    506     // Take advantage of non-ASCII fastpaths in U8_NEXT().
    507     int32_t i=0;
    508     U8_NEXT(s, i, length, c);
    509     return set.contains(c) ? i : -i;
    510 }
    511 
    512 static inline int32_t
    513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    514     UChar32 c=s[length-1];
    515     if((int8_t)c>=0) {
    516         return set.contains(c) ? 1 : -1;
    517     }
    518     int32_t i=length-1;
    519     c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
    520     length-=i;
    521     return set.contains(c) ? length : -length;
    522 }
    523 
    524 /*
    525  * Note: In span() when spanLength==0 (after a string match, or at the beginning
    526  * after an empty code point span) and in spanNot() and spanNotUTF8(),
    527  * string matching could use a binary search
    528  * because all string matches are done from the same start index.
    529  *
    530  * For UTF-8, this would require a comparison function that returns UTF-16 order.
    531  *
    532  * This optimization should not be necessary for normal UnicodeSets because
    533  * most sets have no strings, and most sets with strings have
    534  * very few very short strings.
    535  * For cases with many strings, it might be better to use a different API
    536  * and implementation with a DFA (state machine).
    537  */
    538 
    539 /*
    540  * Algorithm for span(USET_SPAN_CONTAINED)
    541  *
    542  * Theoretical algorithm:
    543  * - Iterate through the string, and at each code point boundary:
    544  *   + If the code point there is in the set, then remember to continue after it.
    545  *   + If a set string matches at the current position, then remember to continue after it.
    546  *   + Either recursively span for each code point or string match,
    547  *     or recursively span for all but the shortest one and
    548  *     iteratively continue the span with the shortest local match.
    549  *   + Remember the longest recursive span (the farthest end point).
    550  *   + If there is no match at the current position, neither for the code point there
    551  *     nor for any set string, then stop and return the longest recursive span length.
    552  *
    553  * Optimized implementation:
    554  *
    555  * (We assume that most sets will have very few very short strings.
    556  * A span using a string-less set is extremely fast.)
    557  *
    558  * Create and cache a spanSet which contains all of the single code points
    559  * of the original set but none of its strings.
    560  *
    561  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    562  * - Loop:
    563  *   + Try to match each set string at the end of the spanLength.
    564  *     ~ Set strings that start with set-contained code points must be matched
    565  *       with a partial overlap because the recursive algorithm would have tried
    566  *       to match them at every position.
    567  *     ~ Set strings that entirely consist of set-contained code points
    568  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
    569  *       recursive algorithm would continue after them anyway
    570  *       and find the longest recursive match from their end.
    571  *     ~ Rather than recursing, note each end point of a set string match.
    572  *   + If no set string matched after spanSet.span(), then return
    573  *     with where the spanSet.span() ended.
    574  *   + If at least one set string matched after spanSet.span(), then
    575  *     pop the shortest string match end point and continue
    576  *     the loop, trying to match all set strings from there.
    577  *   + If at least one more set string matched after a previous string match,
    578  *     then test if the code point after the previous string match is also
    579  *     contained in the set.
    580  *     Continue the loop with the shortest end point of either this code point
    581  *     or a matching set string.
    582  *   + If no more set string matched after a previous string match,
    583  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    584  *     Stop if spanLength==0, otherwise continue the loop.
    585  *
    586  * By noting each end point of a set string match,
    587  * the function visits each string position at most once and finishes
    588  * in linear time.
    589  *
    590  * The recursive algorithm may visit the same string position many times
    591  * if multiple paths lead to it and finishes in exponential time.
    592  */
    593 
    594 /*
    595  * Algorithm for span(USET_SPAN_SIMPLE)
    596  *
    597  * Theoretical algorithm:
    598  * - Iterate through the string, and at each code point boundary:
    599  *   + If the code point there is in the set, then remember to continue after it.
    600  *   + If a set string matches at the current position, then remember to continue after it.
    601  *   + Continue from the farthest match position and ignore all others.
    602  *   + If there is no match at the current position,
    603  *     then stop and return the current position.
    604  *
    605  * Optimized implementation:
    606  *
    607  * (Same assumption and spanSet as above.)
    608  *
    609  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    610  * - Loop:
    611  *   + Try to match each set string at the end of the spanLength.
    612  *     ~ Set strings that start with set-contained code points must be matched
    613  *       with a partial overlap because the standard algorithm would have tried
    614  *       to match them earlier.
    615  *     ~ Set strings that entirely consist of set-contained code points
    616  *       must be matched with a full overlap because the longest-match algorithm
    617  *       would hide set string matches that end earlier.
    618  *       Such set strings need not be matched earlier inside the code point span
    619  *       because the standard algorithm would then have continued after
    620  *       the set string match anyway.
    621  *     ~ Remember the longest set string match (farthest end point) from the earliest
    622  *       starting point.
    623  *   + If no set string matched after spanSet.span(), then return
    624  *     with where the spanSet.span() ended.
    625  *   + If at least one set string matched, then continue the loop after the
    626  *     longest match from the earliest position.
    627  *   + If no more set string matched after a previous string match,
    628  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    629  *     Stop if spanLength==0, otherwise continue the loop.
    630  */
    631 
    632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    633     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    634         return spanNot(s, length);
    635     }
    636     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
    637     if(spanLength==length) {
    638         return length;
    639     }
    640 
    641     // Consider strings; they may overlap with the span.
    642     OffsetList offsets;
    643     if(spanCondition==USET_SPAN_CONTAINED) {
    644         // Use offset list to try all possibilities.
    645         offsets.setMaxLength(maxLength16);
    646     }
    647     int32_t pos=spanLength, rest=length-pos;
    648     int32_t i, stringsLength=strings.size();
    649     for(;;) {
    650         if(spanCondition==USET_SPAN_CONTAINED) {
    651             for(i=0; i<stringsLength; ++i) {
    652                 int32_t overlap=spanLengths[i];
    653                 if(overlap==ALL_CP_CONTAINED) {
    654                     continue;  // Irrelevant string.
    655                 }
    656                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    657                 const UChar *s16=string.getBuffer();
    658                 int32_t length16=string.length();
    659 
    660                 // Try to match this string at pos-overlap..pos.
    661                 if(overlap>=LONG_SPAN) {
    662                     overlap=length16;
    663                     // While contained: No point matching fully inside the code point span.
    664                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
    665                 }
    666                 if(overlap>spanLength) {
    667                     overlap=spanLength;
    668                 }
    669                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    670                 for(;;) {
    671                     if(inc>rest) {
    672                         break;
    673                     }
    674                     // Try to match if the increment is not listed already.
    675                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
    676                         if(inc==rest) {
    677                             return length;  // Reached the end of the string.
    678                         }
    679                         offsets.addOffset(inc);
    680                     }
    681                     if(overlap==0) {
    682                         break;
    683                     }
    684                     --overlap;
    685                     ++inc;
    686                 }
    687             }
    688         } else /* USET_SPAN_SIMPLE */ {
    689             int32_t maxInc=0, maxOverlap=0;
    690             for(i=0; i<stringsLength; ++i) {
    691                 int32_t overlap=spanLengths[i];
    692                 // For longest match, we do need to try to match even an all-contained string
    693                 // to find the match from the earliest start.
    694 
    695                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    696                 const UChar *s16=string.getBuffer();
    697                 int32_t length16=string.length();
    698 
    699                 // Try to match this string at pos-overlap..pos.
    700                 if(overlap>=LONG_SPAN) {
    701                     overlap=length16;
    702                     // Longest match: Need to match fully inside the code point span
    703                     // to find the match from the earliest start.
    704                 }
    705                 if(overlap>spanLength) {
    706                     overlap=spanLength;
    707                 }
    708                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    709                 for(;;) {
    710                     if(inc>rest || overlap<maxOverlap) {
    711                         break;
    712                     }
    713                     // Try to match if the string is longer or starts earlier.
    714                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
    715                         matches16CPB(s, pos-overlap, length, s16, length16)
    716                     ) {
    717                         maxInc=inc;  // Longest match from earliest start.
    718                         maxOverlap=overlap;
    719                         break;
    720                     }
    721                     --overlap;
    722                     ++inc;
    723                 }
    724             }
    725 
    726             if(maxInc!=0 || maxOverlap!=0) {
    727                 // Longest-match algorithm, and there was a string match.
    728                 // Simply continue after it.
    729                 pos+=maxInc;
    730                 rest-=maxInc;
    731                 if(rest==0) {
    732                     return length;  // Reached the end of the string.
    733                 }
    734                 spanLength=0;  // Match strings from after a string match.
    735                 continue;
    736             }
    737         }
    738         // Finished trying to match all strings at pos.
    739 
    740         if(spanLength!=0 || pos==0) {
    741             // The position is after an unlimited code point span (spanLength!=0),
    742             // not after a string match.
    743             // The only position where spanLength==0 after a span is pos==0.
    744             // Otherwise, an unlimited code point span is only tried again when no
    745             // strings match, and if such a non-initial span fails we stop.
    746             if(offsets.isEmpty()) {
    747                 return pos;  // No strings matched after a span.
    748             }
    749             // Match strings from after the next string match.
    750         } else {
    751             // The position is after a string match (or a single code point).
    752             if(offsets.isEmpty()) {
    753                 // No more strings matched after a previous string match.
    754                 // Try another code point span from after the last string match.
    755                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
    756                 if( spanLength==rest || // Reached the end of the string, or
    757                     spanLength==0       // neither strings nor span progressed.
    758                 ) {
    759                     return pos+spanLength;
    760                 }
    761                 pos+=spanLength;
    762                 rest-=spanLength;
    763                 continue;  // spanLength>0: Match strings from after a span.
    764             } else {
    765                 // Try to match only one code point from after a string match if some
    766                 // string matched beyond it, so that we try all possible positions
    767                 // and don't overshoot.
    768                 spanLength=spanOne(spanSet, s+pos, rest);
    769                 if(spanLength>0) {
    770                     if(spanLength==rest) {
    771                         return length;  // Reached the end of the string.
    772                     }
    773                     // Match strings after this code point.
    774                     // There cannot be any increments below it because UnicodeSet strings
    775                     // contain multiple code points.
    776                     pos+=spanLength;
    777                     rest-=spanLength;
    778                     offsets.shift(spanLength);
    779                     spanLength=0;
    780                     continue;  // Match strings from after a single code point.
    781                 }
    782                 // Match strings from after the next string match.
    783             }
    784         }
    785         int32_t minOffset=offsets.popMinimum();
    786         pos+=minOffset;
    787         rest-=minOffset;
    788         spanLength=0;  // Match strings from after a string match.
    789     }
    790 }
    791 
    792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    793     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    794         return spanNotBack(s, length);
    795     }
    796     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
    797     if(pos==0) {
    798         return 0;
    799     }
    800     int32_t spanLength=length-pos;
    801 
    802     // Consider strings; they may overlap with the span.
    803     OffsetList offsets;
    804     if(spanCondition==USET_SPAN_CONTAINED) {
    805         // Use offset list to try all possibilities.
    806         offsets.setMaxLength(maxLength16);
    807     }
    808     int32_t i, stringsLength=strings.size();
    809     uint8_t *spanBackLengths=spanLengths;
    810     if(all) {
    811         spanBackLengths+=stringsLength;
    812     }
    813     for(;;) {
    814         if(spanCondition==USET_SPAN_CONTAINED) {
    815             for(i=0; i<stringsLength; ++i) {
    816                 int32_t overlap=spanBackLengths[i];
    817                 if(overlap==ALL_CP_CONTAINED) {
    818                     continue;  // Irrelevant string.
    819                 }
    820                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    821                 const UChar *s16=string.getBuffer();
    822                 int32_t length16=string.length();
    823 
    824                 // Try to match this string at pos-(length16-overlap)..pos-length16.
    825                 if(overlap>=LONG_SPAN) {
    826                     overlap=length16;
    827                     // While contained: No point matching fully inside the code point span.
    828                     int32_t len1=0;
    829                     U16_FWD_1(s16, len1, overlap);
    830                     overlap-=len1;  // Length of the string minus the first code point.
    831                 }
    832                 if(overlap>spanLength) {
    833                     overlap=spanLength;
    834                 }
    835                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    836                 for(;;) {
    837                     if(dec>pos) {
    838                         break;
    839                     }
    840                     // Try to match if the decrement is not listed already.
    841                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
    842                         if(dec==pos) {
    843                             return 0;  // Reached the start of the string.
    844                         }
    845                         offsets.addOffset(dec);
    846                     }
    847                     if(overlap==0) {
    848                         break;
    849                     }
    850                     --overlap;
    851                     ++dec;
    852                 }
    853             }
    854         } else /* USET_SPAN_SIMPLE */ {
    855             int32_t maxDec=0, maxOverlap=0;
    856             for(i=0; i<stringsLength; ++i) {
    857                 int32_t overlap=spanBackLengths[i];
    858                 // For longest match, we do need to try to match even an all-contained string
    859                 // to find the match from the latest end.
    860 
    861                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    862                 const UChar *s16=string.getBuffer();
    863                 int32_t length16=string.length();
    864 
    865                 // Try to match this string at pos-(length16-overlap)..pos-length16.
    866                 if(overlap>=LONG_SPAN) {
    867                     overlap=length16;
    868                     // Longest match: Need to match fully inside the code point span
    869                     // to find the match from the latest end.
    870                 }
    871                 if(overlap>spanLength) {
    872                     overlap=spanLength;
    873                 }
    874                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    875                 for(;;) {
    876                     if(dec>pos || overlap<maxOverlap) {
    877                         break;
    878                     }
    879                     // Try to match if the string is longer or ends later.
    880                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
    881                         matches16CPB(s, pos-dec, length, s16, length16)
    882                     ) {
    883                         maxDec=dec;  // Longest match from latest end.
    884                         maxOverlap=overlap;
    885                         break;
    886                     }
    887                     --overlap;
    888                     ++dec;
    889                 }
    890             }
    891 
    892             if(maxDec!=0 || maxOverlap!=0) {
    893                 // Longest-match algorithm, and there was a string match.
    894                 // Simply continue before it.
    895                 pos-=maxDec;
    896                 if(pos==0) {
    897                     return 0;  // Reached the start of the string.
    898                 }
    899                 spanLength=0;  // Match strings from before a string match.
    900                 continue;
    901             }
    902         }
    903         // Finished trying to match all strings at pos.
    904 
    905         if(spanLength!=0 || pos==length) {
    906             // The position is before an unlimited code point span (spanLength!=0),
    907             // not before a string match.
    908             // The only position where spanLength==0 before a span is pos==length.
    909             // Otherwise, an unlimited code point span is only tried again when no
    910             // strings match, and if such a non-initial span fails we stop.
    911             if(offsets.isEmpty()) {
    912                 return pos;  // No strings matched before a span.
    913             }
    914             // Match strings from before the next string match.
    915         } else {
    916             // The position is before a string match (or a single code point).
    917             if(offsets.isEmpty()) {
    918                 // No more strings matched before a previous string match.
    919                 // Try another code point span from before the last string match.
    920                 int32_t oldPos=pos;
    921                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
    922                 spanLength=oldPos-pos;
    923                 if( pos==0 ||           // Reached the start of the string, or
    924                     spanLength==0       // neither strings nor span progressed.
    925                 ) {
    926                     return pos;
    927                 }
    928                 continue;  // spanLength>0: Match strings from before a span.
    929             } else {
    930                 // Try to match only one code point from before a string match if some
    931                 // string matched beyond it, so that we try all possible positions
    932                 // and don't overshoot.
    933                 spanLength=spanOneBack(spanSet, s, pos);
    934                 if(spanLength>0) {
    935                     if(spanLength==pos) {
    936                         return 0;  // Reached the start of the string.
    937                     }
    938                     // Match strings before this code point.
    939                     // There cannot be any decrements below it because UnicodeSet strings
    940                     // contain multiple code points.
    941                     pos-=spanLength;
    942                     offsets.shift(spanLength);
    943                     spanLength=0;
    944                     continue;  // Match strings from before a single code point.
    945                 }
    946                 // Match strings from before the next string match.
    947             }
    948         }
    949         pos-=offsets.popMinimum();
    950         spanLength=0;  // Match strings from before a string match.
    951     }
    952 }
    953 
    954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    955     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    956         return spanNotUTF8(s, length);
    957     }
    958     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
    959     if(spanLength==length) {
    960         return length;
    961     }
    962 
    963     // Consider strings; they may overlap with the span.
    964     OffsetList offsets;
    965     if(spanCondition==USET_SPAN_CONTAINED) {
    966         // Use offset list to try all possibilities.
    967         offsets.setMaxLength(maxLength8);
    968     }
    969     int32_t pos=spanLength, rest=length-pos;
    970     int32_t i, stringsLength=strings.size();
    971     uint8_t *spanUTF8Lengths=spanLengths;
    972     if(all) {
    973         spanUTF8Lengths+=2*stringsLength;
    974     }
    975     for(;;) {
    976         const uint8_t *s8=utf8;
    977         int32_t length8;
    978         if(spanCondition==USET_SPAN_CONTAINED) {
    979             for(i=0; i<stringsLength; ++i) {
    980                 length8=utf8Lengths[i];
    981                 if(length8==0) {
    982                     continue;  // String not representable in UTF-8.
    983                 }
    984                 int32_t overlap=spanUTF8Lengths[i];
    985                 if(overlap==ALL_CP_CONTAINED) {
    986                     s8+=length8;
    987                     continue;  // Irrelevant string.
    988                 }
    989 
    990                 // Try to match this string at pos-overlap..pos.
    991                 if(overlap>=LONG_SPAN) {
    992                     overlap=length8;
    993                     // While contained: No point matching fully inside the code point span.
    994                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
    995                 }
    996                 if(overlap>spanLength) {
    997                     overlap=spanLength;
    998                 }
    999                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1000                 for(;;) {
   1001                     if(inc>rest) {
   1002                         break;
   1003                     }
   1004                     // Try to match if the increment is not listed already.
   1005                     // Match at code point boundaries. (The UTF-8 strings were converted
   1006                     // from UTF-16 and are guaranteed to be well-formed.)
   1007                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
   1008                         !offsets.containsOffset(inc) &&
   1009                         matches8(s+pos-overlap, s8, length8)
   1010 
   1011                     ) {
   1012                         if(inc==rest) {
   1013                             return length;  // Reached the end of the string.
   1014                         }
   1015                         offsets.addOffset(inc);
   1016                     }
   1017                     if(overlap==0) {
   1018                         break;
   1019                     }
   1020                     --overlap;
   1021                     ++inc;
   1022                 }
   1023                 s8+=length8;
   1024             }
   1025         } else /* USET_SPAN_SIMPLE */ {
   1026             int32_t maxInc=0, maxOverlap=0;
   1027             for(i=0; i<stringsLength; ++i) {
   1028                 length8=utf8Lengths[i];
   1029                 if(length8==0) {
   1030                     continue;  // String not representable in UTF-8.
   1031                 }
   1032                 int32_t overlap=spanUTF8Lengths[i];
   1033                 // For longest match, we do need to try to match even an all-contained string
   1034                 // to find the match from the earliest start.
   1035 
   1036                 // Try to match this string at pos-overlap..pos.
   1037                 if(overlap>=LONG_SPAN) {
   1038                     overlap=length8;
   1039                     // Longest match: Need to match fully inside the code point span
   1040                     // to find the match from the earliest start.
   1041                 }
   1042                 if(overlap>spanLength) {
   1043                     overlap=spanLength;
   1044                 }
   1045                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1046                 for(;;) {
   1047                     if(inc>rest || overlap<maxOverlap) {
   1048                         break;
   1049                     }
   1050                     // Try to match if the string is longer or starts earlier.
   1051                     // Match at code point boundaries. (The UTF-8 strings were converted
   1052                     // from UTF-16 and are guaranteed to be well-formed.)
   1053                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
   1054                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
   1055                         matches8(s+pos-overlap, s8, length8)
   1056 
   1057                     ) {
   1058                         maxInc=inc;  // Longest match from earliest start.
   1059                         maxOverlap=overlap;
   1060                         break;
   1061                     }
   1062                     --overlap;
   1063                     ++inc;
   1064                 }
   1065                 s8+=length8;
   1066             }
   1067 
   1068             if(maxInc!=0 || maxOverlap!=0) {
   1069                 // Longest-match algorithm, and there was a string match.
   1070                 // Simply continue after it.
   1071                 pos+=maxInc;
   1072                 rest-=maxInc;
   1073                 if(rest==0) {
   1074                     return length;  // Reached the end of the string.
   1075                 }
   1076                 spanLength=0;  // Match strings from after a string match.
   1077                 continue;
   1078             }
   1079         }
   1080         // Finished trying to match all strings at pos.
   1081 
   1082         if(spanLength!=0 || pos==0) {
   1083             // The position is after an unlimited code point span (spanLength!=0),
   1084             // not after a string match.
   1085             // The only position where spanLength==0 after a span is pos==0.
   1086             // Otherwise, an unlimited code point span is only tried again when no
   1087             // strings match, and if such a non-initial span fails we stop.
   1088             if(offsets.isEmpty()) {
   1089                 return pos;  // No strings matched after a span.
   1090             }
   1091             // Match strings from after the next string match.
   1092         } else {
   1093             // The position is after a string match (or a single code point).
   1094             if(offsets.isEmpty()) {
   1095                 // No more strings matched after a previous string match.
   1096                 // Try another code point span from after the last string match.
   1097                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
   1098                 if( spanLength==rest || // Reached the end of the string, or
   1099                     spanLength==0       // neither strings nor span progressed.
   1100                 ) {
   1101                     return pos+spanLength;
   1102                 }
   1103                 pos+=spanLength;
   1104                 rest-=spanLength;
   1105                 continue;  // spanLength>0: Match strings from after a span.
   1106             } else {
   1107                 // Try to match only one code point from after a string match if some
   1108                 // string matched beyond it, so that we try all possible positions
   1109                 // and don't overshoot.
   1110                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
   1111                 if(spanLength>0) {
   1112                     if(spanLength==rest) {
   1113                         return length;  // Reached the end of the string.
   1114                     }
   1115                     // Match strings after this code point.
   1116                     // There cannot be any increments below it because UnicodeSet strings
   1117                     // contain multiple code points.
   1118                     pos+=spanLength;
   1119                     rest-=spanLength;
   1120                     offsets.shift(spanLength);
   1121                     spanLength=0;
   1122                     continue;  // Match strings from after a single code point.
   1123                 }
   1124                 // Match strings from after the next string match.
   1125             }
   1126         }
   1127         int32_t minOffset=offsets.popMinimum();
   1128         pos+=minOffset;
   1129         rest-=minOffset;
   1130         spanLength=0;  // Match strings from after a string match.
   1131     }
   1132 }
   1133 
   1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   1135     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   1136         return spanNotBackUTF8(s, length);
   1137     }
   1138     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
   1139     if(pos==0) {
   1140         return 0;
   1141     }
   1142     int32_t spanLength=length-pos;
   1143 
   1144     // Consider strings; they may overlap with the span.
   1145     OffsetList offsets;
   1146     if(spanCondition==USET_SPAN_CONTAINED) {
   1147         // Use offset list to try all possibilities.
   1148         offsets.setMaxLength(maxLength8);
   1149     }
   1150     int32_t i, stringsLength=strings.size();
   1151     uint8_t *spanBackUTF8Lengths=spanLengths;
   1152     if(all) {
   1153         spanBackUTF8Lengths+=3*stringsLength;
   1154     }
   1155     for(;;) {
   1156         const uint8_t *s8=utf8;
   1157         int32_t length8;
   1158         if(spanCondition==USET_SPAN_CONTAINED) {
   1159             for(i=0; i<stringsLength; ++i) {
   1160                 length8=utf8Lengths[i];
   1161                 if(length8==0) {
   1162                     continue;  // String not representable in UTF-8.
   1163                 }
   1164                 int32_t overlap=spanBackUTF8Lengths[i];
   1165                 if(overlap==ALL_CP_CONTAINED) {
   1166                     s8+=length8;
   1167                     continue;  // Irrelevant string.
   1168                 }
   1169 
   1170                 // Try to match this string at pos-(length8-overlap)..pos-length8.
   1171                 if(overlap>=LONG_SPAN) {
   1172                     overlap=length8;
   1173                     // While contained: No point matching fully inside the code point span.
   1174                     int32_t len1=0;
   1175                     U8_FWD_1(s8, len1, overlap);
   1176                     overlap-=len1;  // Length of the string minus the first code point.
   1177                 }
   1178                 if(overlap>spanLength) {
   1179                     overlap=spanLength;
   1180                 }
   1181                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1182                 for(;;) {
   1183                     if(dec>pos) {
   1184                         break;
   1185                     }
   1186                     // Try to match if the decrement is not listed already.
   1187                     // Match at code point boundaries. (The UTF-8 strings were converted
   1188                     // from UTF-16 and are guaranteed to be well-formed.)
   1189                     if( !U8_IS_TRAIL(s[pos-dec]) &&
   1190                         !offsets.containsOffset(dec) &&
   1191                         matches8(s+pos-dec, s8, length8)
   1192                     ) {
   1193                         if(dec==pos) {
   1194                             return 0;  // Reached the start of the string.
   1195                         }
   1196                         offsets.addOffset(dec);
   1197                     }
   1198                     if(overlap==0) {
   1199                         break;
   1200                     }
   1201                     --overlap;
   1202                     ++dec;
   1203                 }
   1204                 s8+=length8;
   1205             }
   1206         } else /* USET_SPAN_SIMPLE */ {
   1207             int32_t maxDec=0, maxOverlap=0;
   1208             for(i=0; i<stringsLength; ++i) {
   1209                 length8=utf8Lengths[i];
   1210                 if(length8==0) {
   1211                     continue;  // String not representable in UTF-8.
   1212                 }
   1213                 int32_t overlap=spanBackUTF8Lengths[i];
   1214                 // For longest match, we do need to try to match even an all-contained string
   1215                 // to find the match from the latest end.
   1216 
   1217                 // Try to match this string at pos-(length8-overlap)..pos-length8.
   1218                 if(overlap>=LONG_SPAN) {
   1219                     overlap=length8;
   1220                     // Longest match: Need to match fully inside the code point span
   1221                     // to find the match from the latest end.
   1222                 }
   1223                 if(overlap>spanLength) {
   1224                     overlap=spanLength;
   1225                 }
   1226                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1227                 for(;;) {
   1228                     if(dec>pos || overlap<maxOverlap) {
   1229                         break;
   1230                     }
   1231                     // Try to match if the string is longer or ends later.
   1232                     // Match at code point boundaries. (The UTF-8 strings were converted
   1233                     // from UTF-16 and are guaranteed to be well-formed.)
   1234                     if( !U8_IS_TRAIL(s[pos-dec]) &&
   1235                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
   1236                         matches8(s+pos-dec, s8, length8)
   1237                     ) {
   1238                         maxDec=dec;  // Longest match from latest end.
   1239                         maxOverlap=overlap;
   1240                         break;
   1241                     }
   1242                     --overlap;
   1243                     ++dec;
   1244                 }
   1245                 s8+=length8;
   1246             }
   1247 
   1248             if(maxDec!=0 || maxOverlap!=0) {
   1249                 // Longest-match algorithm, and there was a string match.
   1250                 // Simply continue before it.
   1251                 pos-=maxDec;
   1252                 if(pos==0) {
   1253                     return 0;  // Reached the start of the string.
   1254                 }
   1255                 spanLength=0;  // Match strings from before a string match.
   1256                 continue;
   1257             }
   1258         }
   1259         // Finished trying to match all strings at pos.
   1260 
   1261         if(spanLength!=0 || pos==length) {
   1262             // The position is before an unlimited code point span (spanLength!=0),
   1263             // not before a string match.
   1264             // The only position where spanLength==0 before a span is pos==length.
   1265             // Otherwise, an unlimited code point span is only tried again when no
   1266             // strings match, and if such a non-initial span fails we stop.
   1267             if(offsets.isEmpty()) {
   1268                 return pos;  // No strings matched before a span.
   1269             }
   1270             // Match strings from before the next string match.
   1271         } else {
   1272             // The position is before a string match (or a single code point).
   1273             if(offsets.isEmpty()) {
   1274                 // No more strings matched before a previous string match.
   1275                 // Try another code point span from before the last string match.
   1276                 int32_t oldPos=pos;
   1277                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
   1278                 spanLength=oldPos-pos;
   1279                 if( pos==0 ||           // Reached the start of the string, or
   1280                     spanLength==0       // neither strings nor span progressed.
   1281                 ) {
   1282                     return pos;
   1283                 }
   1284                 continue;  // spanLength>0: Match strings from before a span.
   1285             } else {
   1286                 // Try to match only one code point from before a string match if some
   1287                 // string matched beyond it, so that we try all possible positions
   1288                 // and don't overshoot.
   1289                 spanLength=spanOneBackUTF8(spanSet, s, pos);
   1290                 if(spanLength>0) {
   1291                     if(spanLength==pos) {
   1292                         return 0;  // Reached the start of the string.
   1293                     }
   1294                     // Match strings before this code point.
   1295                     // There cannot be any decrements below it because UnicodeSet strings
   1296                     // contain multiple code points.
   1297                     pos-=spanLength;
   1298                     offsets.shift(spanLength);
   1299                     spanLength=0;
   1300                     continue;  // Match strings from before a single code point.
   1301                 }
   1302                 // Match strings from before the next string match.
   1303             }
   1304         }
   1305         pos-=offsets.popMinimum();
   1306         spanLength=0;  // Match strings from before a string match.
   1307     }
   1308 }
   1309 
   1310 /*
   1311  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
   1312  *
   1313  * Theoretical algorithm:
   1314  * - Iterate through the string, and at each code point boundary:
   1315  *   + If the code point there is in the set, then return with the current position.
   1316  *   + If a set string matches at the current position, then return with the current position.
   1317  *
   1318  * Optimized implementation:
   1319  *
   1320  * (Same assumption as for span() above.)
   1321  *
   1322  * Create and cache a spanNotSet which contains all of the single code points
   1323  * of the original set but none of its strings.
   1324  * For each set string add its initial code point to the spanNotSet.
   1325  * (Also add its final code point for spanNotBack().)
   1326  *
   1327  * - Loop:
   1328  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
   1329  *   + If the current code point is in the original set, then
   1330  *     return the current position.
   1331  *   + If any set string matches at the current position, then
   1332  *     return the current position.
   1333  *   + If there is no match at the current position, neither for the code point there
   1334  *     nor for any set string, then skip this code point and continue the loop.
   1335  *     This happens for set-string-initial code points that were added to spanNotSet
   1336  *     when there is not actually a match for such a set string.
   1337  */
   1338 
   1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
   1340     int32_t pos=0, rest=length;
   1341     int32_t i, stringsLength=strings.size();
   1342     do {
   1343         // Span until we find a code point from the set,
   1344         // or a code point that starts or ends some string.
   1345         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1346         if(i==rest) {
   1347             return length;  // Reached the end of the string.
   1348         }
   1349         pos+=i;
   1350         rest-=i;
   1351 
   1352         // Check whether the current code point is in the original set,
   1353         // without the string starts and ends.
   1354         int32_t cpLength=spanOne(spanSet, s+pos, rest);
   1355         if(cpLength>0) {
   1356             return pos;  // There is a set element at pos.
   1357         }
   1358 
   1359         // Try to match the strings at pos.
   1360         for(i=0; i<stringsLength; ++i) {
   1361             if(spanLengths[i]==ALL_CP_CONTAINED) {
   1362                 continue;  // Irrelevant string.
   1363             }
   1364             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   1365             const UChar *s16=string.getBuffer();
   1366             int32_t length16=string.length();
   1367             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
   1368                 return pos;  // There is a set element at pos.
   1369             }
   1370         }
   1371 
   1372         // The span(while not contained) ended on a string start/end which is
   1373         // not in the original set. Skip this code point and continue.
   1374         // cpLength<0
   1375         pos-=cpLength;
   1376         rest+=cpLength;
   1377     } while(rest!=0);
   1378     return length;  // Reached the end of the string.
   1379 }
   1380 
   1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
   1382     int32_t pos=length;
   1383     int32_t i, stringsLength=strings.size();
   1384     do {
   1385         // Span until we find a code point from the set,
   1386         // or a code point that starts or ends some string.
   1387         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
   1388         if(pos==0) {
   1389             return 0;  // Reached the start of the string.
   1390         }
   1391 
   1392         // Check whether the current code point is in the original set,
   1393         // without the string starts and ends.
   1394         int32_t cpLength=spanOneBack(spanSet, s, pos);
   1395         if(cpLength>0) {
   1396             return pos;  // There is a set element at pos.
   1397         }
   1398 
   1399         // Try to match the strings at pos.
   1400         for(i=0; i<stringsLength; ++i) {
   1401             // Use spanLengths rather than a spanBackLengths pointer because
   1402             // it is easier and we only need to know whether the string is irrelevant
   1403             // which is the same in either array.
   1404             if(spanLengths[i]==ALL_CP_CONTAINED) {
   1405                 continue;  // Irrelevant string.
   1406             }
   1407             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   1408             const UChar *s16=string.getBuffer();
   1409             int32_t length16=string.length();
   1410             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
   1411                 return pos;  // There is a set element at pos.
   1412             }
   1413         }
   1414 
   1415         // The span(while not contained) ended on a string start/end which is
   1416         // not in the original set. Skip this code point and continue.
   1417         // cpLength<0
   1418         pos+=cpLength;
   1419     } while(pos!=0);
   1420     return 0;  // Reached the start of the string.
   1421 }
   1422 
   1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
   1424     int32_t pos=0, rest=length;
   1425     int32_t i, stringsLength=strings.size();
   1426     uint8_t *spanUTF8Lengths=spanLengths;
   1427     if(all) {
   1428         spanUTF8Lengths+=2*stringsLength;
   1429     }
   1430     do {
   1431         // Span until we find a code point from the set,
   1432         // or a code point that starts or ends some string.
   1433         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1434         if(i==rest) {
   1435             return length;  // Reached the end of the string.
   1436         }
   1437         pos+=i;
   1438         rest-=i;
   1439 
   1440         // Check whether the current code point is in the original set,
   1441         // without the string starts and ends.
   1442         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
   1443         if(cpLength>0) {
   1444             return pos;  // There is a set element at pos.
   1445         }
   1446 
   1447         // Try to match the strings at pos.
   1448         const uint8_t *s8=utf8;
   1449         int32_t length8;
   1450         for(i=0; i<stringsLength; ++i) {
   1451             length8=utf8Lengths[i];
   1452             // ALL_CP_CONTAINED: Irrelevant string.
   1453             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
   1454                 return pos;  // There is a set element at pos.
   1455             }
   1456             s8+=length8;
   1457         }
   1458 
   1459         // The span(while not contained) ended on a string start/end which is
   1460         // not in the original set. Skip this code point and continue.
   1461         // cpLength<0
   1462         pos-=cpLength;
   1463         rest+=cpLength;
   1464     } while(rest!=0);
   1465     return length;  // Reached the end of the string.
   1466 }
   1467 
   1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
   1469     int32_t pos=length;
   1470     int32_t i, stringsLength=strings.size();
   1471     uint8_t *spanBackUTF8Lengths=spanLengths;
   1472     if(all) {
   1473         spanBackUTF8Lengths+=3*stringsLength;
   1474     }
   1475     do {
   1476         // Span until we find a code point from the set,
   1477         // or a code point that starts or ends some string.
   1478         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
   1479         if(pos==0) {
   1480             return 0;  // Reached the start of the string.
   1481         }
   1482 
   1483         // Check whether the current code point is in the original set,
   1484         // without the string starts and ends.
   1485         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
   1486         if(cpLength>0) {
   1487             return pos;  // There is a set element at pos.
   1488         }
   1489 
   1490         // Try to match the strings at pos.
   1491         const uint8_t *s8=utf8;
   1492         int32_t length8;
   1493         for(i=0; i<stringsLength; ++i) {
   1494             length8=utf8Lengths[i];
   1495             // ALL_CP_CONTAINED: Irrelevant string.
   1496             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
   1497                 return pos;  // There is a set element at pos.
   1498             }
   1499             s8+=length8;
   1500         }
   1501 
   1502         // The span(while not contained) ended on a string start/end which is
   1503         // not in the original set. Skip this code point and continue.
   1504         // cpLength<0
   1505         pos+=cpLength;
   1506     } while(pos!=0);
   1507     return 0;  // Reached the start of the string.
   1508 }
   1509 
   1510 U_NAMESPACE_END
   1511