Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  ******************************************************************************
      6  *
      7  *   Copyright (C) 2009-2015, International Business Machines
      8  *   Corporation and others.  All Rights Reserved.
      9  *
     10  ******************************************************************************
     11  */
     12 
     13 package android.icu.impl;
     14 
     15 import java.util.ArrayList;
     16 
     17 import android.icu.text.UnicodeSet;
     18 import android.icu.text.UnicodeSet.SpanCondition;
     19 import android.icu.util.OutputInt;
     20 
     21 /*
     22  * Implement span() etc. for a set with strings.
     23  * Avoid recursion because of its exponential complexity.
     24  * Instead, try multiple paths at once and track them with an IndexList.
     25  */
     26 /**
     27  * @hide Only a subset of ICU is exposed in Android
     28  */
     29 public class UnicodeSetStringSpan {
     30 
     31     /*
     32      * Which span() variant will be used? The object is either built for one variant and used once,
     33      * or built for all and may be used many times.
     34      */
     35     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
     36     public static final int FWD           = 0x20;
     37     public static final int BACK          = 0x10;
     38     // public static final int UTF16      = 8;
     39     public static final int CONTAINED     = 2;
     40     public static final int NOT_CONTAINED = 1;
     41 
     42     public static final int ALL = 0x7f;
     43 
     44     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
     45     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
     46     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
     47     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
     48 
     49     /**
     50      * Special spanLength short values. (since Java has not unsigned byte type)
     51      * All code points in the string are contained in the parent set.
     52      */
     53     static final short ALL_CP_CONTAINED = 0xff;
     54     /** The spanLength is >=0xfe. */
     55     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
     56 
     57     /** Set for span(). Same as parent but without strings. */
     58     private UnicodeSet spanSet;
     59 
     60     /**
     61      * Set for span(not contained).
     62      * Same as spanSet, plus characters that start or end strings.
     63      */
     64     private UnicodeSet spanNotSet;
     65 
     66     /** The strings of the parent set. */
     67     private ArrayList<String> strings;
     68 
     69     /** The lengths of span(), spanBack() etc. for each string. */
     70     private short[] spanLengths;
     71 
     72     /** Maximum lengths of relevant strings. */
     73     private final int maxLength16;
     74 
     75     /** Are there strings that are not fully contained in the code point set? */
     76     private boolean someRelevant;
     77 
     78     /** Set up for all variants of span()? */
     79     private boolean all;
     80 
     81     /** Span helper */
     82     private OffsetList offsets;
     83 
     84     /**
     85      * Constructs for all variants of span(), or only for any one variant.
     86      * Initializes as little as possible, for single use.
     87      */
     88     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
     89         spanSet = new UnicodeSet(0, 0x10ffff);
     90         // TODO: With Java 6, just take the parent set's strings as is,
     91         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
     92         // Then iterate via the first() and higher() methods.
     93         // (We do not want to create multiple Iterator objects in each span().)
     94         // See ICU ticket #7454.
     95         strings = setStrings;
     96         all = (which == ALL);
     97         spanSet.retainAll(set);
     98         if (0 != (which & NOT_CONTAINED)) {
     99             // Default to the same sets.
    100             // addToSpanNotSet() will create a separate set if necessary.
    101             spanNotSet = spanSet;
    102         }
    103         offsets = new OffsetList();
    104 
    105         // Determine if the strings even need to be taken into account at all for span() etc.
    106         // If any string is relevant, then all strings need to be used for
    107         // span(longest match) but only the relevant ones for span(while contained).
    108         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
    109         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
    110         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
    111         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
    112         int stringsLength = strings.size();
    113 
    114         int i, spanLength;
    115         int maxLength16 = 0;
    116         someRelevant = false;
    117         for (i = 0; i < stringsLength; ++i) {
    118             String string = strings.get(i);
    119             int length16 = string.length();
    120             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
    121             if (spanLength < length16) { // Relevant string.
    122                 someRelevant = true;
    123             }
    124             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
    125                 maxLength16 = length16;
    126             }
    127         }
    128         this.maxLength16 = maxLength16;
    129         if (!someRelevant && (which & WITH_COUNT) == 0) {
    130             return;
    131         }
    132 
    133         // Freeze after checking for the need to use strings at all because freezing
    134         // a set takes some time and memory which are wasted if there are no relevant strings.
    135         if (all) {
    136             spanSet.freeze();
    137         }
    138 
    139         int spanBackLengthsOffset;
    140 
    141         // Allocate a block of meta data.
    142         int allocSize;
    143         if (all) {
    144             // 2 sets of span lengths
    145             allocSize = stringsLength * (2);
    146         } else {
    147             allocSize = stringsLength; // One set of span lengths.
    148         }
    149         spanLengths = new short[allocSize];
    150 
    151         if (all) {
    152             // Store span lengths for all span() variants.
    153             spanBackLengthsOffset = stringsLength;
    154         } else {
    155             // Store span lengths for only one span() variant.
    156             spanBackLengthsOffset = 0;
    157         }
    158 
    159         // Set the meta data and spanNotSet and write the UTF-8 strings.
    160 
    161         for (i = 0; i < stringsLength; ++i) {
    162             String string = strings.get(i);
    163             int length16 = string.length();
    164             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
    165             if (spanLength < length16) { // Relevant string.
    166                 if (true /* 0 != (which & UTF16) */) {
    167                     if (0 != (which & CONTAINED)) {
    168                         if (0 != (which & FWD)) {
    169                             spanLengths[i] = makeSpanLengthByte(spanLength);
    170                         }
    171                         if (0 != (which & BACK)) {
    172                             spanLength = length16
    173                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
    174                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
    175                         }
    176                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
    177                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
    178                                                                                      // flag.
    179                     }
    180                 }
    181                 if (0 != (which & NOT_CONTAINED)) {
    182                     // Add string start and end code points to the spanNotSet so that
    183                     // a span(while not contained) stops before any string.
    184                     int c;
    185                     if (0 != (which & FWD)) {
    186                         c = string.codePointAt(0);
    187                         addToSpanNotSet(c);
    188                     }
    189                     if (0 != (which & BACK)) {
    190                         c = string.codePointBefore(length16);
    191                         addToSpanNotSet(c);
    192                     }
    193                 }
    194             } else { // Irrelevant string.
    195                 if (all) {
    196                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
    197                 } else {
    198                     // All spanXYZLengths pointers contain the same address.
    199                     spanLengths[i] = ALL_CP_CONTAINED;
    200                 }
    201             }
    202         }
    203 
    204         // Finish.
    205         if (all) {
    206             spanNotSet.freeze();
    207         }
    208     }
    209 
    210     /**
    211      * Constructs a copy of an existing UnicodeSetStringSpan.
    212      * Assumes which==ALL for a frozen set.
    213      */
    214     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
    215             final ArrayList<String> newParentSetStrings) {
    216         spanSet = otherStringSpan.spanSet;
    217         strings = newParentSetStrings;
    218         maxLength16 = otherStringSpan.maxLength16;
    219         someRelevant = otherStringSpan.someRelevant;
    220         all = true;
    221         if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) {
    222             spanNotSet = spanSet;
    223         } else {
    224             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
    225         }
    226         offsets = new OffsetList();
    227 
    228         spanLengths = otherStringSpan.spanLengths.clone();
    229     }
    230 
    231     /**
    232      * Do the strings need to be checked in span() etc.?
    233      *
    234      * @return true if strings need to be checked (call span() here),
    235      *         false if not (use a BMPSet for best performance).
    236      */
    237     public boolean needsStringSpanUTF16() {
    238         return someRelevant;
    239     }
    240 
    241     /** For fast UnicodeSet::contains(c). */
    242     public boolean contains(int c) {
    243         return spanSet.contains(c);
    244     }
    245 
    246     /**
    247      * Adds a starting or ending string character to the spanNotSet
    248      * so that a character span ends before any string.
    249      */
    250     private void addToSpanNotSet(int c) {
    251         if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) {
    252             if (spanSet.contains(c)) {
    253                 return; // Nothing to do.
    254             }
    255             spanNotSet = spanSet.cloneAsThawed();
    256         }
    257         spanNotSet.add(c);
    258     }
    259 
    260     /*
    261      * Note: In span() when spanLength==0
    262      * (after a string match, or at the beginning after an empty code point span)
    263      * and in spanNot() and spanNotUTF8(),
    264      * string matching could use a binary search because all string matches are done
    265      * from the same start index.
    266      *
    267      * For UTF-8, this would require a comparison function that returns UTF-16 order.
    268      *
    269      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
    270      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
    271      * API and implementation with a DFA (state machine).
    272      */
    273 
    274     /*
    275      * Algorithm for span(SpanCondition.CONTAINED)
    276      *
    277      * Theoretical algorithm:
    278      * - Iterate through the string, and at each code point boundary:
    279      *   + If the code point there is in the set, then remember to continue after it.
    280      *   + If a set string matches at the current position, then remember to continue after it.
    281      *   + Either recursively span for each code point or string match, or recursively span
    282      *     for all but the shortest one and iteratively continue the span with the shortest local match.
    283      *   + Remember the longest recursive span (the farthest end point).
    284      *   + If there is no match at the current position,
    285      *     neither for the code point there nor for any set string,
    286      *     then stop and return the longest recursive span length.
    287      *
    288      * Optimized implementation:
    289      *
    290      * (We assume that most sets will have very few very short strings.
    291      * A span using a string-less set is extremely fast.)
    292      *
    293      * Create and cache a spanSet which contains all of the single code points of the original set
    294      * but none of its strings.
    295      *
    296      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
    297      * - Loop:
    298      *   + Try to match each set string at the end of the spanLength.
    299      *     ~ Set strings that start with set-contained code points
    300      *       must be matched with a partial overlap
    301      *       because the recursive algorithm would have tried to match them at every position.
    302      *     ~ Set strings that entirely consist of set-contained code points
    303      *       are irrelevant for span(SpanCondition.CONTAINED)
    304      *       because the recursive algorithm would continue after them anyway and
    305      *       find the longest recursive match from their end.
    306      *     ~ Rather than recursing, note each end point of a set string match.
    307      *   + If no set string matched after spanSet.span(),
    308      *     then return with where the spanSet.span() ended.
    309      *   + If at least one set string matched after spanSet.span(),
    310      *     then pop the shortest string match end point and continue the loop,
    311      *     trying to match all set strings from there.
    312      *   + If at least one more set string matched after a previous string match, then test if the
    313      *     code point after the previous string match is also contained in the set.
    314      *     Continue the loop with the shortest end point of
    315      *     either this code point or a matching set string.
    316      *   + If no more set string matched after a previous string match,
    317      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
    318      *     Stop if spanLength==0, otherwise continue the loop.
    319      *
    320      * By noting each end point of a set string match, the function visits each string position at most once and
    321      * finishes in linear time.
    322      *
    323      * The recursive algorithm may visit the same string position many times
    324      * if multiple paths lead to it and finishes in exponential time.
    325      */
    326 
    327     /*
    328      * Algorithm for span(SIMPLE)
    329      *
    330      * Theoretical algorithm:
    331      * - Iterate through the string, and at each code point boundary:
    332      *   + If the code point there is in the set, then remember to continue after it.
    333      *   + If a set string matches at the current position, then remember to continue after it.
    334      *   + Continue from the farthest match position and ignore all others.
    335      *   + If there is no match at the current position, then stop and return the current position.
    336      *
    337      * Optimized implementation:
    338      *
    339      * (Same assumption and spanSet as above.)
    340      *
    341      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
    342      * - Loop:
    343      *   + Try to match each set string at the end of the spanLength.
    344      *     ~ Set strings that start with set-contained code points
    345      *       must be matched with a partial overlap
    346      *       because the standard algorithm would have tried to match them earlier.
    347      *     ~ Set strings that entirely consist of set-contained code points
    348      *       must be matched with a full overlap because the longest-match algorithm
    349      *       would hide set string matches that end earlier.
    350      *       Such set strings need not be matched earlier inside the code point span
    351      *       because the standard algorithm would then have
    352      *       continued after the set string match anyway.
    353      *     ~ Remember the longest set string match (farthest end point)
    354      *       from the earliest starting point.
    355      *   + If no set string matched after spanSet.span(),
    356      *     then return with where the spanSet.span() ended.
    357      *   + If at least one set string matched,
    358      *     then continue the loop after the longest match from the earliest position.
    359      *   + If no more set string matched after a previous string match,
    360      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
    361      *     Stop if spanLength==0, otherwise continue the loop.
    362      */
    363     /**
    364      * Spans a string.
    365      *
    366      * @param s The string to be spanned
    367      * @param start The start index that the span begins
    368      * @param spanCondition The span condition
    369      * @return the limit (exclusive end) of the span
    370      */
    371     public int span(CharSequence s, int start, SpanCondition spanCondition) {
    372         if (spanCondition == SpanCondition.NOT_CONTAINED) {
    373             return spanNot(s, start, null);
    374         }
    375         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
    376         if (spanLimit == s.length()) {
    377             return spanLimit;
    378         }
    379         return spanWithStrings(s, start, spanLimit, spanCondition);
    380     }
    381 
    382     /**
    383      * Synchronized method for complicated spans using the offsets.
    384      * Avoids synchronization for simple cases.
    385      *
    386      * @param spanLimit = spanSet.span(s, start, CONTAINED)
    387      */
    388     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
    389             SpanCondition spanCondition) {
    390         // Consider strings; they may overlap with the span.
    391         int initSize = 0;
    392         if (spanCondition == SpanCondition.CONTAINED) {
    393             // Use offset list to try all possibilities.
    394             initSize = maxLength16;
    395         }
    396         offsets.setMaxLength(initSize);
    397         int length = s.length();
    398         int pos = spanLimit, rest = length - spanLimit;
    399         int spanLength = spanLimit - start;
    400         int i, stringsLength = strings.size();
    401         for (;;) {
    402             if (spanCondition == SpanCondition.CONTAINED) {
    403                 for (i = 0; i < stringsLength; ++i) {
    404                     int overlap = spanLengths[i];
    405                     if (overlap == ALL_CP_CONTAINED) {
    406                         continue; // Irrelevant string.
    407                     }
    408                     String string = strings.get(i);
    409 
    410                     int length16 = string.length();
    411 
    412                     // Try to match this string at pos-overlap..pos.
    413                     if (overlap >= LONG_SPAN) {
    414                         overlap = length16;
    415                         // While contained: No point matching fully inside the code point span.
    416                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
    417                                                                           // point.
    418                     }
    419                     if (overlap > spanLength) {
    420                         overlap = spanLength;
    421                     }
    422                     int inc = length16 - overlap; // Keep overlap+inc==length16.
    423                     for (;;) {
    424                         if (inc > rest) {
    425                             break;
    426                         }
    427                         // Try to match if the increment is not listed already.
    428                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
    429                             if (inc == rest) {
    430                                 return length; // Reached the end of the string.
    431                             }
    432                             offsets.addOffset(inc);
    433                         }
    434                         if (overlap == 0) {
    435                             break;
    436                         }
    437                         --overlap;
    438                         ++inc;
    439                     }
    440                 }
    441             } else /* SIMPLE */{
    442                 int maxInc = 0, maxOverlap = 0;
    443                 for (i = 0; i < stringsLength; ++i) {
    444                     int overlap = spanLengths[i];
    445                     // For longest match, we do need to try to match even an all-contained string
    446                     // to find the match from the earliest start.
    447 
    448                     String string = strings.get(i);
    449 
    450                     int length16 = string.length();
    451 
    452                     // Try to match this string at pos-overlap..pos.
    453                     if (overlap >= LONG_SPAN) {
    454                         overlap = length16;
    455                         // Longest match: Need to match fully inside the code point span
    456                         // to find the match from the earliest start.
    457                     }
    458                     if (overlap > spanLength) {
    459                         overlap = spanLength;
    460                     }
    461                     int inc = length16 - overlap; // Keep overlap+inc==length16.
    462                     for (;;) {
    463                         if (inc > rest || overlap < maxOverlap) {
    464                             break;
    465                         }
    466                         // Try to match if the string is longer or starts earlier.
    467                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
    468                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
    469                             maxInc = inc; // Longest match from earliest start.
    470                             maxOverlap = overlap;
    471                             break;
    472                         }
    473                         --overlap;
    474                         ++inc;
    475                     }
    476                 }
    477 
    478                 if (maxInc != 0 || maxOverlap != 0) {
    479                     // Longest-match algorithm, and there was a string match.
    480                     // Simply continue after it.
    481                     pos += maxInc;
    482                     rest -= maxInc;
    483                     if (rest == 0) {
    484                         return length; // Reached the end of the string.
    485                     }
    486                     spanLength = 0; // Match strings from after a string match.
    487                     continue;
    488                 }
    489             }
    490             // Finished trying to match all strings at pos.
    491 
    492             if (spanLength != 0 || pos == 0) {
    493                 // The position is after an unlimited code point span (spanLength!=0),
    494                 // not after a string match.
    495                 // The only position where spanLength==0 after a span is pos==0.
    496                 // Otherwise, an unlimited code point span is only tried again when no
    497                 // strings match, and if such a non-initial span fails we stop.
    498                 if (offsets.isEmpty()) {
    499                     return pos; // No strings matched after a span.
    500                 }
    501                 // Match strings from after the next string match.
    502             } else {
    503                 // The position is after a string match (or a single code point).
    504                 if (offsets.isEmpty()) {
    505                     // No more strings matched after a previous string match.
    506                     // Try another code point span from after the last string match.
    507                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
    508                     spanLength = spanLimit - pos;
    509                     if (spanLength == rest || // Reached the end of the string, or
    510                             spanLength == 0 // neither strings nor span progressed.
    511                     ) {
    512                         return spanLimit;
    513                     }
    514                     pos += spanLength;
    515                     rest -= spanLength;
    516                     continue; // spanLength>0: Match strings from after a span.
    517                 } else {
    518                     // Try to match only one code point from after a string match if some
    519                     // string matched beyond it, so that we try all possible positions
    520                     // and don't overshoot.
    521                     spanLength = spanOne(spanSet, s, pos, rest);
    522                     if (spanLength > 0) {
    523                         if (spanLength == rest) {
    524                             return length; // Reached the end of the string.
    525                         }
    526                         // Match strings after this code point.
    527                         // There cannot be any increments below it because UnicodeSet strings
    528                         // contain multiple code points.
    529                         pos += spanLength;
    530                         rest -= spanLength;
    531                         offsets.shift(spanLength);
    532                         spanLength = 0;
    533                         continue; // Match strings from after a single code point.
    534                     }
    535                     // Match strings from after the next string match.
    536                 }
    537             }
    538             int minOffset = offsets.popMinimum(null);
    539             pos += minOffset;
    540             rest -= minOffset;
    541             spanLength = 0; // Match strings from after a string match.
    542         }
    543     }
    544 
    545     /**
    546      * Spans a string and counts the smallest number of set elements on any path across the span.
    547      *
    548      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
    549      *
    550      * <p>If the set does not have any fully-contained strings, then we could optimize this
    551      * like span(), but such sets are likely rare, and this is at least still linear.
    552      *
    553      * @param s The string to be spanned
    554      * @param start The start index that the span begins
    555      * @param spanCondition The span condition
    556      * @param outCount The count
    557      * @return the limit (exclusive end) of the span
    558      */
    559     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
    560             OutputInt outCount) {
    561         if (spanCondition == SpanCondition.NOT_CONTAINED) {
    562             return spanNot(s, start, outCount);
    563         }
    564         // Consider strings; they may overlap with the span,
    565         // and they may result in a smaller count that with just code points.
    566         if (spanCondition == SpanCondition.CONTAINED) {
    567             return spanContainedAndCount(s, start, outCount);
    568         }
    569         // SIMPLE (not synchronized, does not use offsets)
    570         int stringsLength = strings.size();
    571         int length = s.length();
    572         int pos = start;
    573         int rest = length - start;
    574         int count = 0;
    575         while (rest != 0) {
    576             // Try to match the next code point.
    577             int cpLength = spanOne(spanSet, s, pos, rest);
    578             int maxInc = (cpLength > 0) ? cpLength : 0;
    579             // Try to match all of the strings.
    580             for (int i = 0; i < stringsLength; ++i) {
    581                 String string = strings.get(i);
    582                 int length16 = string.length();
    583                 if (maxInc < length16 && length16 <= rest &&
    584                         matches16CPB(s, pos, length, string, length16)) {
    585                     maxInc = length16;
    586                 }
    587             }
    588             // We are done if there is no match beyond pos.
    589             if (maxInc == 0) {
    590                 outCount.value = count;
    591                 return pos;
    592             }
    593             // Continue from the longest match.
    594             ++count;
    595             pos += maxInc;
    596             rest -= maxInc;
    597         }
    598         outCount.value = count;
    599         return pos;
    600     }
    601 
    602     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
    603         // Use offset list to try all possibilities.
    604         offsets.setMaxLength(maxLength16);
    605         int stringsLength = strings.size();
    606         int length = s.length();
    607         int pos = start;
    608         int rest = length - start;
    609         int count = 0;
    610         while (rest != 0) {
    611             // Try to match the next code point.
    612             int cpLength = spanOne(spanSet, s, pos, rest);
    613             if (cpLength > 0) {
    614                 offsets.addOffsetAndCount(cpLength, count + 1);
    615             }
    616             // Try to match all of the strings.
    617             for (int i = 0; i < stringsLength; ++i) {
    618                 String string = strings.get(i);
    619                 int length16 = string.length();
    620                 // Note: If the strings were sorted by length, then we could also
    621                 // avoid trying to match if there is already a match of the same length.
    622                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
    623                         matches16CPB(s, pos, length, string, length16)) {
    624                     offsets.addOffsetAndCount(length16, count + 1);
    625                 }
    626             }
    627             // We are done if there is no match beyond pos.
    628             if (offsets.isEmpty()) {
    629                 outCount.value = count;
    630                 return pos;
    631             }
    632             // Continue from the nearest match.
    633             int minOffset = offsets.popMinimum(outCount);
    634             count = outCount.value;
    635             pos += minOffset;
    636             rest -= minOffset;
    637         }
    638         outCount.value = count;
    639         return pos;
    640     }
    641 
    642     /**
    643      * Span a string backwards.
    644      *
    645      * @param s The string to be spanned
    646      * @param spanCondition The span condition
    647      * @return The string index which starts the span (i.e. inclusive).
    648      */
    649     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
    650         if (spanCondition == SpanCondition.NOT_CONTAINED) {
    651             return spanNotBack(s, length);
    652         }
    653         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
    654         if (pos == 0) {
    655             return 0;
    656         }
    657         int spanLength = length - pos;
    658 
    659         // Consider strings; they may overlap with the span.
    660         int initSize = 0;
    661         if (spanCondition == SpanCondition.CONTAINED) {
    662             // Use offset list to try all possibilities.
    663             initSize = maxLength16;
    664         }
    665         offsets.setMaxLength(initSize);
    666         int i, stringsLength = strings.size();
    667         int spanBackLengthsOffset = 0;
    668         if (all) {
    669             spanBackLengthsOffset = stringsLength;
    670         }
    671         for (;;) {
    672             if (spanCondition == SpanCondition.CONTAINED) {
    673                 for (i = 0; i < stringsLength; ++i) {
    674                     int overlap = spanLengths[spanBackLengthsOffset + i];
    675                     if (overlap == ALL_CP_CONTAINED) {
    676                         continue; // Irrelevant string.
    677                     }
    678                     String string = strings.get(i);
    679 
    680                     int length16 = string.length();
    681 
    682                     // Try to match this string at pos-(length16-overlap)..pos-length16.
    683                     if (overlap >= LONG_SPAN) {
    684                         overlap = length16;
    685                         // While contained: No point matching fully inside the code point span.
    686                         int len1 = 0;
    687                         len1 = string.offsetByCodePoints(0, 1);
    688                         overlap -= len1; // Length of the string minus the first code point.
    689                     }
    690                     if (overlap > spanLength) {
    691                         overlap = spanLength;
    692                     }
    693                     int dec = length16 - overlap; // Keep dec+overlap==length16.
    694                     for (;;) {
    695                         if (dec > pos) {
    696                             break;
    697                         }
    698                         // Try to match if the decrement is not listed already.
    699                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
    700                             if (dec == pos) {
    701                                 return 0; // Reached the start of the string.
    702                             }
    703                             offsets.addOffset(dec);
    704                         }
    705                         if (overlap == 0) {
    706                             break;
    707                         }
    708                         --overlap;
    709                         ++dec;
    710                     }
    711                 }
    712             } else /* SIMPLE */{
    713                 int maxDec = 0, maxOverlap = 0;
    714                 for (i = 0; i < stringsLength; ++i) {
    715                     int overlap = spanLengths[spanBackLengthsOffset + i];
    716                     // For longest match, we do need to try to match even an all-contained string
    717                     // to find the match from the latest end.
    718 
    719                     String string = strings.get(i);
    720 
    721                     int length16 = string.length();
    722 
    723                     // Try to match this string at pos-(length16-overlap)..pos-length16.
    724                     if (overlap >= LONG_SPAN) {
    725                         overlap = length16;
    726                         // Longest match: Need to match fully inside the code point span
    727                         // to find the match from the latest end.
    728                     }
    729                     if (overlap > spanLength) {
    730                         overlap = spanLength;
    731                     }
    732                     int dec = length16 - overlap; // Keep dec+overlap==length16.
    733                     for (;;) {
    734                         if (dec > pos || overlap < maxOverlap) {
    735                             break;
    736                         }
    737                         // Try to match if the string is longer or ends later.
    738                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
    739                                 && matches16CPB(s, pos - dec, length, string, length16)) {
    740                             maxDec = dec; // Longest match from latest end.
    741                             maxOverlap = overlap;
    742                             break;
    743                         }
    744                         --overlap;
    745                         ++dec;
    746                     }
    747                 }
    748 
    749                 if (maxDec != 0 || maxOverlap != 0) {
    750                     // Longest-match algorithm, and there was a string match.
    751                     // Simply continue before it.
    752                     pos -= maxDec;
    753                     if (pos == 0) {
    754                         return 0; // Reached the start of the string.
    755                     }
    756                     spanLength = 0; // Match strings from before a string match.
    757                     continue;
    758                 }
    759             }
    760             // Finished trying to match all strings at pos.
    761 
    762             if (spanLength != 0 || pos == length) {
    763                 // The position is before an unlimited code point span (spanLength!=0),
    764                 // not before a string match.
    765                 // The only position where spanLength==0 before a span is pos==length.
    766                 // Otherwise, an unlimited code point span is only tried again when no
    767                 // strings match, and if such a non-initial span fails we stop.
    768                 if (offsets.isEmpty()) {
    769                     return pos; // No strings matched before a span.
    770                 }
    771                 // Match strings from before the next string match.
    772             } else {
    773                 // The position is before a string match (or a single code point).
    774                 if (offsets.isEmpty()) {
    775                     // No more strings matched before a previous string match.
    776                     // Try another code point span from before the last string match.
    777                     int oldPos = pos;
    778                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
    779                     spanLength = oldPos - pos;
    780                     if (pos == 0 || // Reached the start of the string, or
    781                             spanLength == 0 // neither strings nor span progressed.
    782                     ) {
    783                         return pos;
    784                     }
    785                     continue; // spanLength>0: Match strings from before a span.
    786                 } else {
    787                     // Try to match only one code point from before a string match if some
    788                     // string matched beyond it, so that we try all possible positions
    789                     // and don't overshoot.
    790                     spanLength = spanOneBack(spanSet, s, pos);
    791                     if (spanLength > 0) {
    792                         if (spanLength == pos) {
    793                             return 0; // Reached the start of the string.
    794                         }
    795                         // Match strings before this code point.
    796                         // There cannot be any decrements below it because UnicodeSet strings
    797                         // contain multiple code points.
    798                         pos -= spanLength;
    799                         offsets.shift(spanLength);
    800                         spanLength = 0;
    801                         continue; // Match strings from before a single code point.
    802                     }
    803                     // Match strings from before the next string match.
    804                 }
    805             }
    806             pos -= offsets.popMinimum(null);
    807             spanLength = 0; // Match strings from before a string match.
    808         }
    809     }
    810 
    811     /**
    812      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
    813      *
    814      * Theoretical algorithm:
    815      * - Iterate through the string, and at each code point boundary:
    816      *   + If the code point there is in the set, then return with the current position.
    817      *   + If a set string matches at the current position, then return with the current position.
    818      *
    819      * Optimized implementation:
    820      *
    821      * (Same assumption as for span() above.)
    822      *
    823      * Create and cache a spanNotSet which contains
    824      * all of the single code points of the original set but none of its strings.
    825      * For each set string add its initial code point to the spanNotSet.
    826      * (Also add its final code point for spanNotBack().)
    827      *
    828      * - Loop:
    829      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
    830      *   + If the current code point is in the original set, then return the current position.
    831      *   + If any set string matches at the current position, then return the current position.
    832      *   + If there is no match at the current position, neither for the code point
    833      *     there nor for any set string, then skip this code point and continue the loop.
    834      *     This happens for set-string-initial code points that were added to spanNotSet
    835      *     when there is not actually a match for such a set string.
    836      *
    837      * @param s The string to be spanned
    838      * @param start The start index that the span begins
    839      * @param outCount If not null: Receives the number of code points across the span.
    840      * @return the limit (exclusive end) of the span
    841      */
    842     private int spanNot(CharSequence s, int start, OutputInt outCount) {
    843         int length = s.length();
    844         int pos = start, rest = length - start;
    845         int stringsLength = strings.size();
    846         int count = 0;
    847         do {
    848             // Span until we find a code point from the set,
    849             // or a code point that starts or ends some string.
    850             int spanLimit;
    851             if (outCount == null) {
    852                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
    853             } else {
    854                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
    855                 outCount.value = count = count + outCount.value;
    856             }
    857             if (spanLimit == length) {
    858                 return length; // Reached the end of the string.
    859             }
    860             pos = spanLimit;
    861             rest = length - spanLimit;
    862 
    863             // Check whether the current code point is in the original set,
    864             // without the string starts and ends.
    865             int cpLength = spanOne(spanSet, s, pos, rest);
    866             if (cpLength > 0) {
    867                 return pos; // There is a set element at pos.
    868             }
    869 
    870             // Try to match the strings at pos.
    871             for (int i = 0; i < stringsLength; ++i) {
    872                 if (spanLengths[i] == ALL_CP_CONTAINED) {
    873                     continue; // Irrelevant string.
    874                 }
    875                 String string = strings.get(i);
    876 
    877                 int length16 = string.length();
    878                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
    879                     return pos; // There is a set element at pos.
    880                 }
    881             }
    882 
    883             // The span(while not contained) ended on a string start/end which is
    884             // not in the original set. Skip this code point and continue.
    885             // cpLength<0
    886             pos -= cpLength;
    887             rest += cpLength;
    888             ++count;
    889         } while (rest != 0);
    890         if (outCount != null) {
    891             outCount.value = count;
    892         }
    893         return length; // Reached the end of the string.
    894     }
    895 
    896     private int spanNotBack(CharSequence s, int length) {
    897         int pos = length;
    898         int i, stringsLength = strings.size();
    899         do {
    900             // Span until we find a code point from the set,
    901             // or a code point that starts or ends some string.
    902             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
    903             if (pos == 0) {
    904                 return 0; // Reached the start of the string.
    905             }
    906 
    907             // Check whether the current code point is in the original set,
    908             // without the string starts and ends.
    909             int cpLength = spanOneBack(spanSet, s, pos);
    910             if (cpLength > 0) {
    911                 return pos; // There is a set element at pos.
    912             }
    913 
    914             // Try to match the strings at pos.
    915             for (i = 0; i < stringsLength; ++i) {
    916                 // Use spanLengths rather than a spanLengths pointer because
    917                 // it is easier and we only need to know whether the string is irrelevant
    918                 // which is the same in either array.
    919                 if (spanLengths[i] == ALL_CP_CONTAINED) {
    920                     continue; // Irrelevant string.
    921                 }
    922                 String string = strings.get(i);
    923 
    924                 int length16 = string.length();
    925                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
    926                     return pos; // There is a set element at pos.
    927                 }
    928             }
    929 
    930             // The span(while not contained) ended on a string start/end which is
    931             // not in the original set. Skip this code point and continue.
    932             // cpLength<0
    933             pos += cpLength;
    934         } while (pos != 0);
    935         return 0; // Reached the start of the string.
    936     }
    937 
    938     static short makeSpanLengthByte(int spanLength) {
    939         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
    940         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
    941     }
    942 
    943     // Compare strings without any argument checks. Requires length>0.
    944     private static boolean matches16(CharSequence s, int start, final String t, int length) {
    945         int end = start + length;
    946         while (length-- > 0) {
    947             if (s.charAt(--end) != t.charAt(length)) {
    948                 return false;
    949             }
    950         }
    951         return true;
    952     }
    953 
    954     /**
    955      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
    956      * at code point boundaries.
    957      * That is, each edge of a match must not be in the middle of a surrogate pair.
    958      * @param s       The string to match in.
    959      * @param start   The start index of s.
    960      * @param limit   The limit of the subsequence of s being spanned.
    961      * @param t       The substring to be matched in s.
    962      * @param tlength The length of t.
    963      */
    964     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
    965         return matches16(s, start, t, tlength)
    966                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
    967                         Character.isLowSurrogate(s.charAt(start)))
    968                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
    969                         Character.isLowSurrogate(s.charAt(start + tlength)));
    970     }
    971 
    972     /**
    973      * Does the set contain the next code point?
    974      * If so, return its length; otherwise return its negative length.
    975      */
    976     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
    977         char c = s.charAt(start);
    978         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
    979             char c2 = s.charAt(start + 1);
    980             if (android.icu.text.UTF16.isTrailSurrogate(c2)) {
    981                 int supplementary = Character.toCodePoint(c, c2);
    982                 return set.contains(supplementary) ? 2 : -2;
    983             }
    984         }
    985         return set.contains(c) ? 1 : -1;
    986     }
    987 
    988     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
    989         char c = s.charAt(length - 1);
    990         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
    991             char c2 = s.charAt(length - 2);
    992             if (android.icu.text.UTF16.isLeadSurrogate(c2)) {
    993                 int supplementary = Character.toCodePoint(c2, c);
    994                 return set.contains(supplementary) ? 2 : -2;
    995             }
    996         }
    997         return set.contains(c) ? 1 : -1;
    998     }
    999 
   1000     /**
   1001      * Helper class for UnicodeSetStringSpan.
   1002      *
   1003      * <p>List of offsets from the current position from where to try matching
   1004      * a code point or a string.
   1005      * Stores offsets rather than indexes to simplify the code and use the same list
   1006      * for both increments (in span()) and decrements (in spanBack()).
   1007      *
   1008      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
   1009      * are relatively dense, that is,
   1010      * there are normally no gaps of hundreds or thousands of offset values.
   1011      *
   1012      * <p>This class optionally also tracks the minimum non-negative count for each position,
   1013      * intended to count the smallest number of elements of any path leading to that position.
   1014      *
   1015      * <p>The implementation uses a circular buffer of count integers,
   1016      * each indicating whether the corresponding offset is in the list,
   1017      * and its path element count.
   1018      * This avoids inserting into a sorted list of offsets (or absolute indexes)
   1019      * and physically moving part of the list.
   1020      *
   1021      * <p>Note: In principle, the caller should setMaxLength() to
   1022      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
   1023      * to account for "long" single code points.
   1024      *
   1025      * <p>Note: An earlier version did not track counts and stored only byte flags.
   1026      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
   1027      * the list could be stored as bit flags in a single integer.
   1028      * Rather than handling a circular buffer with a start list index,
   1029      * the integer would simply be shifted when lower offsets are removed.
   1030      * UnicodeSet does not have a limit on the lengths of strings.
   1031      */
   1032     private static final class OffsetList {
   1033         private int[] list;
   1034         private int length;
   1035         private int start;
   1036 
   1037         public OffsetList() {
   1038             list = new int[16];  // default size
   1039         }
   1040 
   1041         public void setMaxLength(int maxLength) {
   1042             if (maxLength > list.length) {
   1043                 list = new int[maxLength];
   1044             }
   1045             clear();
   1046         }
   1047 
   1048         public void clear() {
   1049             for (int i = list.length; i-- > 0;) {
   1050                 list[i] = 0;
   1051             }
   1052             start = length = 0;
   1053         }
   1054 
   1055         public boolean isEmpty() {
   1056             return (length == 0);
   1057         }
   1058 
   1059         /**
   1060          * Reduces all stored offsets by delta, used when the current position moves by delta.
   1061          * There must not be any offsets lower than delta.
   1062          * If there is an offset equal to delta, it is removed.
   1063          *
   1064          * @param delta [1..maxLength]
   1065          */
   1066         public void shift(int delta) {
   1067             int i = start + delta;
   1068             if (i >= list.length) {
   1069                 i -= list.length;
   1070             }
   1071             if (list[i] != 0) {
   1072                 list[i] = 0;
   1073                 --length;
   1074             }
   1075             start = i;
   1076         }
   1077 
   1078         /**
   1079          * Adds an offset. The list must not contain it yet.
   1080          * @param offset [1..maxLength]
   1081          */
   1082         public void addOffset(int offset) {
   1083             int i = start + offset;
   1084             if (i >= list.length) {
   1085                 i -= list.length;
   1086             }
   1087             assert list[i] == 0;
   1088             list[i] = 1;
   1089             ++length;
   1090         }
   1091 
   1092         /**
   1093          * Adds an offset and updates its count.
   1094          * The list may already contain the offset.
   1095          * @param offset [1..maxLength]
   1096          */
   1097         public void addOffsetAndCount(int offset, int count) {
   1098             assert count > 0;
   1099             int i = start + offset;
   1100             if (i >= list.length) {
   1101                 i -= list.length;
   1102             }
   1103             if (list[i] == 0) {
   1104                 list[i] = count;
   1105                 ++length;
   1106             } else if (count < list[i]) {
   1107                 list[i] = count;
   1108             }
   1109         }
   1110 
   1111         /**
   1112          * @param offset [1..maxLength]
   1113          */
   1114         public boolean containsOffset(int offset) {
   1115             int i = start + offset;
   1116             if (i >= list.length) {
   1117                 i -= list.length;
   1118             }
   1119             return list[i] != 0;
   1120         }
   1121 
   1122         /**
   1123          * @param offset [1..maxLength]
   1124          */
   1125         public boolean hasCountAtOffset(int offset, int count) {
   1126             int i = start + offset;
   1127             if (i >= list.length) {
   1128                 i -= list.length;
   1129             }
   1130             int oldCount = list[i];
   1131             return oldCount != 0 && oldCount <= count;
   1132         }
   1133 
   1134         /**
   1135          * Finds the lowest stored offset from a non-empty list, removes it,
   1136          * and reduces all other offsets by this minimum.
   1137          * @return min=[1..maxLength]
   1138          */
   1139         public int popMinimum(OutputInt outCount) {
   1140             // Look for the next offset in list[start+1..list.length-1].
   1141             int i = start, result;
   1142             while (++i < list.length) {
   1143                 int count = list[i];
   1144                 if (count != 0) {
   1145                     list[i] = 0;
   1146                     --length;
   1147                     result = i - start;
   1148                     start = i;
   1149                     if (outCount != null) { outCount.value = count; }
   1150                     return result;
   1151                 }
   1152             }
   1153             // i==list.length
   1154 
   1155             // Wrap around and look for the next offset in list[0..start].
   1156             // Since the list is not empty, there will be one.
   1157             result = list.length - start;
   1158             i = 0;
   1159             int count;
   1160             while ((count = list[i]) == 0) {
   1161                 ++i;
   1162             }
   1163             list[i] = 0;
   1164             --length;
   1165             start = i;
   1166             if (outCount != null) { outCount.value = count; }
   1167             return result + i;
   1168         }
   1169     }
   1170 }
   1171