Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ******************************************************************************
      5  *
      6  *   Copyright (C) 2009-2015, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *
      9  ******************************************************************************
     10  */
     11 
     12 package com.ibm.icu.impl;
     13 
     14 import java.util.ArrayList;
     15 
     16 import com.ibm.icu.text.UnicodeSet;
     17 import com.ibm.icu.text.UnicodeSet.SpanCondition;
     18 import com.ibm.icu.util.OutputInt;
     19 
     20 /*
     21  * Implement span() etc. for a set with strings.
     22  * Avoid recursion because of its exponential complexity.
     23  * Instead, try multiple paths at once and track them with an IndexList.
     24  */
     25 public class UnicodeSetStringSpan {
     26 
     27     /*
     28      * Which span() variant will be used? The object is either built for one variant and used once,
     29      * or built for all and may be used many times.
     30      */
     31     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
     32     public static final int FWD           = 0x20;
     33     public static final int BACK          = 0x10;
     34     // public static final int UTF16      = 8;
     35     public static final int CONTAINED     = 2;
     36     public static final int NOT_CONTAINED = 1;
     37 
     38     public static final int ALL = 0x7f;
     39 
     40     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
     41     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
     42     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
     43     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
     44 
     45     /**
     46      * Special spanLength short values. (since Java has not unsigned byte type)
     47      * All code points in the string are contained in the parent set.
     48      */
     49     static final short ALL_CP_CONTAINED = 0xff;
     50     /** The spanLength is >=0xfe. */
     51     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
     52 
     53     /** Set for span(). Same as parent but without strings. */
     54     private UnicodeSet spanSet;
     55 
     56     /**
     57      * Set for span(not contained).
     58      * Same as spanSet, plus characters that start or end strings.
     59      */
     60     private UnicodeSet spanNotSet;
     61 
     62     /** The strings of the parent set. */
     63     private ArrayList<String> strings;
     64 
     65     /** The lengths of span(), spanBack() etc. for each string. */
     66     private short[] spanLengths;
     67 
     68     /** Maximum lengths of relevant strings. */
     69     private final int maxLength16;
     70 
     71     /** Are there strings that are not fully contained in the code point set? */
     72     private boolean someRelevant;
     73 
     74     /** Set up for all variants of span()? */
     75     private boolean all;
     76 
     77     /** Span helper */
     78     private OffsetList offsets;
     79 
     80     /**
     81      * Constructs for all variants of span(), or only for any one variant.
     82      * Initializes as little as possible, for single use.
     83      */
     84     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
     85         spanSet = new UnicodeSet(0, 0x10ffff);
     86         // TODO: With Java 6, just take the parent set's strings as is,
     87         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
     88         // Then iterate via the first() and higher() methods.
     89         // (We do not want to create multiple Iterator objects in each span().)
     90         // See ICU ticket #7454.
     91         strings = setStrings;
     92         all = (which == ALL);
     93         spanSet.retainAll(set);
     94         if (0 != (which & NOT_CONTAINED)) {
     95             // Default to the same sets.
     96             // addToSpanNotSet() will create a separate set if necessary.
     97             spanNotSet = spanSet;
     98         }
     99         offsets = new OffsetList();
    100 
    101         // Determine if the strings even need to be taken into account at all for span() etc.
    102         // If any string is relevant, then all strings need to be used for
    103         // span(longest match) but only the relevant ones for span(while contained).
    104         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
    105         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
    106         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
    107         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
    108         int stringsLength = strings.size();
    109 
    110         int i, spanLength;
    111         int maxLength16 = 0;
    112         someRelevant = false;
    113         for (i = 0; i < stringsLength; ++i) {
    114             String string = strings.get(i);
    115             int length16 = string.length();
    116             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
    117             if (spanLength < length16) { // Relevant string.
    118                 someRelevant = true;
    119             }
    120             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
    121                 maxLength16 = length16;
    122             }
    123         }
    124         this.maxLength16 = maxLength16;
    125         if (!someRelevant && (which & WITH_COUNT) == 0) {
    126             return;
    127         }
    128 
    129         // Freeze after checking for the need to use strings at all because freezing
    130         // a set takes some time and memory which are wasted if there are no relevant strings.
    131         if (all) {
    132             spanSet.freeze();
    133         }
    134 
    135         int spanBackLengthsOffset;
    136 
    137         // Allocate a block of meta data.
    138         int allocSize;
    139         if (all) {
    140             // 2 sets of span lengths
    141             allocSize = stringsLength * (2);
    142         } else {
    143             allocSize = stringsLength; // One set of span lengths.
    144         }
    145         spanLengths = new short[allocSize];
    146 
    147         if (all) {
    148             // Store span lengths for all span() variants.
    149             spanBackLengthsOffset = stringsLength;
    150         } else {
    151             // Store span lengths for only one span() variant.
    152             spanBackLengthsOffset = 0;
    153         }
    154 
    155         // Set the meta data and spanNotSet and write the UTF-8 strings.
    156 
    157         for (i = 0; i < stringsLength; ++i) {
    158             String string = strings.get(i);
    159             int length16 = string.length();
    160             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
    161             if (spanLength < length16) { // Relevant string.
    162                 if (true /* 0 != (which & UTF16) */) {
    163                     if (0 != (which & CONTAINED)) {
    164                         if (0 != (which & FWD)) {
    165                             spanLengths[i] = makeSpanLengthByte(spanLength);
    166                         }
    167                         if (0 != (which & BACK)) {
    168                             spanLength = length16
    169                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
    170                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
    171                         }
    172                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
    173                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
    174                                                                                      // flag.
    175                     }
    176                 }
    177                 if (0 != (which & NOT_CONTAINED)) {
    178                     // Add string start and end code points to the spanNotSet so that
    179                     // a span(while not contained) stops before any string.
    180                     int c;
    181                     if (0 != (which & FWD)) {
    182                         c = string.codePointAt(0);
    183                         addToSpanNotSet(c);
    184                     }
    185                     if (0 != (which & BACK)) {
    186                         c = string.codePointBefore(length16);
    187                         addToSpanNotSet(c);
    188                     }
    189                 }
    190             } else { // Irrelevant string.
    191                 if (all) {
    192                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
    193                 } else {
    194                     // All spanXYZLengths pointers contain the same address.
    195                     spanLengths[i] = ALL_CP_CONTAINED;
    196                 }
    197             }
    198         }
    199 
    200         // Finish.
    201         if (all) {
    202             spanNotSet.freeze();
    203         }
    204     }
    205 
    206     /**
    207      * Constructs a copy of an existing UnicodeSetStringSpan.
    208      * Assumes which==ALL for a frozen set.
    209      */
    210     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
    211             final ArrayList<String> newParentSetStrings) {
    212         spanSet = otherStringSpan.spanSet;
    213         strings = newParentSetStrings;
    214         maxLength16 = otherStringSpan.maxLength16;
    215         someRelevant = otherStringSpan.someRelevant;
    216         all = true;
    217         if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) {
    218             spanNotSet = spanSet;
    219         } else {
    220             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
    221         }
    222         offsets = new OffsetList();
    223 
    224         spanLengths = otherStringSpan.spanLengths.clone();
    225     }
    226 
    227     /**
    228      * Do the strings need to be checked in span() etc.?
    229      *
    230      * @return true if strings need to be checked (call span() here),
    231      *         false if not (use a BMPSet for best performance).
    232      */
    233     public boolean needsStringSpanUTF16() {
    234         return someRelevant;
    235     }
    236 
    237     /** For fast UnicodeSet::contains(c). */
    238     public boolean contains(int c) {
    239         return spanSet.contains(c);
    240     }
    241 
    242     /**
    243      * Adds a starting or ending string character to the spanNotSet
    244      * so that a character span ends before any string.
    245      */
    246     private void addToSpanNotSet(int c) {
    247         if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) {
    248             if (spanSet.contains(c)) {
    249                 return; // Nothing to do.
    250             }
    251             spanNotSet = spanSet.cloneAsThawed();
    252         }
    253         spanNotSet.add(c);
    254     }
    255 
    256     /*
    257      * Note: In span() when spanLength==0
    258      * (after a string match, or at the beginning after an empty code point span)
    259      * and in spanNot() and spanNotUTF8(),
    260      * string matching could use a binary search because all string matches are done
    261      * from the same start index.
    262      *
    263      * For UTF-8, this would require a comparison function that returns UTF-16 order.
    264      *
    265      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
    266      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
    267      * API and implementation with a DFA (state machine).
    268      */
    269 
    270     /*
    271      * Algorithm for span(SpanCondition.CONTAINED)
    272      *
    273      * Theoretical algorithm:
    274      * - Iterate through the string, and at each code point boundary:
    275      *   + If the code point there is in the set, then remember to continue after it.
    276      *   + If a set string matches at the current position, then remember to continue after it.
    277      *   + Either recursively span for each code point or string match, or recursively span
    278      *     for all but the shortest one and iteratively continue the span with the shortest local match.
    279      *   + Remember the longest recursive span (the farthest end point).
    280      *   + If there is no match at the current position,
    281      *     neither for the code point there nor for any set string,
    282      *     then stop and return the longest recursive span length.
    283      *
    284      * Optimized implementation:
    285      *
    286      * (We assume that most sets will have very few very short strings.
    287      * A span using a string-less set is extremely fast.)
    288      *
    289      * Create and cache a spanSet which contains all of the single code points of the original set
    290      * but none of its strings.
    291      *
    292      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
    293      * - Loop:
    294      *   + Try to match each set string at the end of the spanLength.
    295      *     ~ Set strings that start with set-contained code points
    296      *       must be matched with a partial overlap
    297      *       because the recursive algorithm would have tried to match them at every position.
    298      *     ~ Set strings that entirely consist of set-contained code points
    299      *       are irrelevant for span(SpanCondition.CONTAINED)
    300      *       because the recursive algorithm would continue after them anyway and
    301      *       find the longest recursive match from their end.
    302      *     ~ Rather than recursing, note each end point of a set string match.
    303      *   + If no set string matched after spanSet.span(),
    304      *     then return with where the spanSet.span() ended.
    305      *   + If at least one set string matched after spanSet.span(),
    306      *     then pop the shortest string match end point and continue the loop,
    307      *     trying to match all set strings from there.
    308      *   + If at least one more set string matched after a previous string match, then test if the
    309      *     code point after the previous string match is also contained in the set.
    310      *     Continue the loop with the shortest end point of
    311      *     either this code point or a matching set string.
    312      *   + If no more set string matched after a previous string match,
    313      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
    314      *     Stop if spanLength==0, otherwise continue the loop.
    315      *
    316      * By noting each end point of a set string match, the function visits each string position at most once and
    317      * finishes in linear time.
    318      *
    319      * The recursive algorithm may visit the same string position many times
    320      * if multiple paths lead to it and finishes in exponential time.
    321      */
    322 
    323     /*
    324      * Algorithm for span(SIMPLE)
    325      *
    326      * Theoretical algorithm:
    327      * - Iterate through the string, and at each code point boundary:
    328      *   + If the code point there is in the set, then remember to continue after it.
    329      *   + If a set string matches at the current position, then remember to continue after it.
    330      *   + Continue from the farthest match position and ignore all others.
    331      *   + If there is no match at the current position, then stop and return the current position.
    332      *
    333      * Optimized implementation:
    334      *
    335      * (Same assumption and spanSet as above.)
    336      *
    337      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
    338      * - Loop:
    339      *   + Try to match each set string at the end of the spanLength.
    340      *     ~ Set strings that start with set-contained code points
    341      *       must be matched with a partial overlap
    342      *       because the standard algorithm would have tried to match them earlier.
    343      *     ~ Set strings that entirely consist of set-contained code points
    344      *       must be matched with a full overlap because the longest-match algorithm
    345      *       would hide set string matches that end earlier.
    346      *       Such set strings need not be matched earlier inside the code point span
    347      *       because the standard algorithm would then have
    348      *       continued after the set string match anyway.
    349      *     ~ Remember the longest set string match (farthest end point)
    350      *       from the earliest starting point.
    351      *   + If no set string matched after spanSet.span(),
    352      *     then return with where the spanSet.span() ended.
    353      *   + If at least one set string matched,
    354      *     then continue the loop after the longest match from the earliest position.
    355      *   + If no more set string matched after a previous string match,
    356      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
    357      *     Stop if spanLength==0, otherwise continue the loop.
    358      */
    359     /**
    360      * Spans a string.
    361      *
    362      * @param s The string to be spanned
    363      * @param start The start index that the span begins
    364      * @param spanCondition The span condition
    365      * @return the limit (exclusive end) of the span
    366      */
    367     public int span(CharSequence s, int start, SpanCondition spanCondition) {
    368         if (spanCondition == SpanCondition.NOT_CONTAINED) {
    369             return spanNot(s, start, null);
    370         }
    371         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
    372         if (spanLimit == s.length()) {
    373             return spanLimit;
    374         }
    375         return spanWithStrings(s, start, spanLimit, spanCondition);
    376     }
    377 
    378     /**
    379      * Synchronized method for complicated spans using the offsets.
    380      * Avoids synchronization for simple cases.
    381      *
    382      * @param spanLimit = spanSet.span(s, start, CONTAINED)
    383      */
    384     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
    385             SpanCondition spanCondition) {
    386         // Consider strings; they may overlap with the span.
    387         int initSize = 0;
    388         if (spanCondition == SpanCondition.CONTAINED) {
    389             // Use offset list to try all possibilities.
    390             initSize = maxLength16;
    391         }
    392         offsets.setMaxLength(initSize);
    393         int length = s.length();
    394         int pos = spanLimit, rest = length - spanLimit;
    395         int spanLength = spanLimit - start;
    396         int i, stringsLength = strings.size();
    397         for (;;) {
    398             if (spanCondition == SpanCondition.CONTAINED) {
    399                 for (i = 0; i < stringsLength; ++i) {
    400                     int overlap = spanLengths[i];
    401                     if (overlap == ALL_CP_CONTAINED) {
    402                         continue; // Irrelevant string.
    403                     }
    404                     String string = strings.get(i);
    405 
    406                     int length16 = string.length();
    407 
    408                     // Try to match this string at pos-overlap..pos.
    409                     if (overlap >= LONG_SPAN) {
    410                         overlap = length16;
    411                         // While contained: No point matching fully inside the code point span.
    412                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
    413                                                                           // point.
    414                     }
    415                     if (overlap > spanLength) {
    416                         overlap = spanLength;
    417                     }
    418                     int inc = length16 - overlap; // Keep overlap+inc==length16.
    419                     for (;;) {
    420                         if (inc > rest) {
    421                             break;
    422                         }
    423                         // Try to match if the increment is not listed already.
    424                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
    425                             if (inc == rest) {
    426                                 return length; // Reached the end of the string.
    427                             }
    428                             offsets.addOffset(inc);
    429                         }
    430                         if (overlap == 0) {
    431                             break;
    432                         }
    433                         --overlap;
    434                         ++inc;
    435                     }
    436                 }
    437             } else /* SIMPLE */{
    438                 int maxInc = 0, maxOverlap = 0;
    439                 for (i = 0; i < stringsLength; ++i) {
    440                     int overlap = spanLengths[i];
    441                     // For longest match, we do need to try to match even an all-contained string
    442                     // to find the match from the earliest start.
    443 
    444                     String string = strings.get(i);
    445 
    446                     int length16 = string.length();
    447 
    448                     // Try to match this string at pos-overlap..pos.
    449                     if (overlap >= LONG_SPAN) {
    450                         overlap = length16;
    451                         // Longest match: Need to match fully inside the code point span
    452                         // to find the match from the earliest start.
    453                     }
    454                     if (overlap > spanLength) {
    455                         overlap = spanLength;
    456                     }
    457                     int inc = length16 - overlap; // Keep overlap+inc==length16.
    458                     for (;;) {
    459                         if (inc > rest || overlap < maxOverlap) {
    460                             break;
    461                         }
    462                         // Try to match if the string is longer or starts earlier.
    463                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
    464                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
    465                             maxInc = inc; // Longest match from earliest start.
    466                             maxOverlap = overlap;
    467                             break;
    468                         }
    469                         --overlap;
    470                         ++inc;
    471                     }
    472                 }
    473 
    474                 if (maxInc != 0 || maxOverlap != 0) {
    475                     // Longest-match algorithm, and there was a string match.
    476                     // Simply continue after it.
    477                     pos += maxInc;
    478                     rest -= maxInc;
    479                     if (rest == 0) {
    480                         return length; // Reached the end of the string.
    481                     }
    482                     spanLength = 0; // Match strings from after a string match.
    483                     continue;
    484                 }
    485             }
    486             // Finished trying to match all strings at pos.
    487 
    488             if (spanLength != 0 || pos == 0) {
    489                 // The position is after an unlimited code point span (spanLength!=0),
    490                 // not after a string match.
    491                 // The only position where spanLength==0 after a span is pos==0.
    492                 // Otherwise, an unlimited code point span is only tried again when no
    493                 // strings match, and if such a non-initial span fails we stop.
    494                 if (offsets.isEmpty()) {
    495                     return pos; // No strings matched after a span.
    496                 }
    497                 // Match strings from after the next string match.
    498             } else {
    499                 // The position is after a string match (or a single code point).
    500                 if (offsets.isEmpty()) {
    501                     // No more strings matched after a previous string match.
    502                     // Try another code point span from after the last string match.
    503                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
    504                     spanLength = spanLimit - pos;
    505                     if (spanLength == rest || // Reached the end of the string, or
    506                             spanLength == 0 // neither strings nor span progressed.
    507                     ) {
    508                         return spanLimit;
    509                     }
    510                     pos += spanLength;
    511                     rest -= spanLength;
    512                     continue; // spanLength>0: Match strings from after a span.
    513                 } else {
    514                     // Try to match only one code point from after a string match if some
    515                     // string matched beyond it, so that we try all possible positions
    516                     // and don't overshoot.
    517                     spanLength = spanOne(spanSet, s, pos, rest);
    518                     if (spanLength > 0) {
    519                         if (spanLength == rest) {
    520                             return length; // Reached the end of the string.
    521                         }
    522                         // Match strings after this code point.
    523                         // There cannot be any increments below it because UnicodeSet strings
    524                         // contain multiple code points.
    525                         pos += spanLength;
    526                         rest -= spanLength;
    527                         offsets.shift(spanLength);
    528                         spanLength = 0;
    529                         continue; // Match strings from after a single code point.
    530                     }
    531                     // Match strings from after the next string match.
    532                 }
    533             }
    534             int minOffset = offsets.popMinimum(null);
    535             pos += minOffset;
    536             rest -= minOffset;
    537             spanLength = 0; // Match strings from after a string match.
    538         }
    539     }
    540 
    541     /**
    542      * Spans a string and counts the smallest number of set elements on any path across the span.
    543      *
    544      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
    545      *
    546      * <p>If the set does not have any fully-contained strings, then we could optimize this
    547      * like span(), but such sets are likely rare, and this is at least still linear.
    548      *
    549      * @param s The string to be spanned
    550      * @param start The start index that the span begins
    551      * @param spanCondition The span condition
    552      * @param outCount The count
    553      * @return the limit (exclusive end) of the span
    554      */
    555     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
    556             OutputInt outCount) {
    557         if (spanCondition == SpanCondition.NOT_CONTAINED) {
    558             return spanNot(s, start, outCount);
    559         }
    560         // Consider strings; they may overlap with the span,
    561         // and they may result in a smaller count that with just code points.
    562         if (spanCondition == SpanCondition.CONTAINED) {
    563             return spanContainedAndCount(s, start, outCount);
    564         }
    565         // SIMPLE (not synchronized, does not use offsets)
    566         int stringsLength = strings.size();
    567         int length = s.length();
    568         int pos = start;
    569         int rest = length - start;
    570         int count = 0;
    571         while (rest != 0) {
    572             // Try to match the next code point.
    573             int cpLength = spanOne(spanSet, s, pos, rest);
    574             int maxInc = (cpLength > 0) ? cpLength : 0;
    575             // Try to match all of the strings.
    576             for (int i = 0; i < stringsLength; ++i) {
    577                 String string = strings.get(i);
    578                 int length16 = string.length();
    579                 if (maxInc < length16 && length16 <= rest &&
    580                         matches16CPB(s, pos, length, string, length16)) {
    581                     maxInc = length16;
    582                 }
    583             }
    584             // We are done if there is no match beyond pos.
    585             if (maxInc == 0) {
    586                 outCount.value = count;
    587                 return pos;
    588             }
    589             // Continue from the longest match.
    590             ++count;
    591             pos += maxInc;
    592             rest -= maxInc;
    593         }
    594         outCount.value = count;
    595         return pos;
    596     }
    597 
    598     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
    599         // Use offset list to try all possibilities.
    600         offsets.setMaxLength(maxLength16);
    601         int stringsLength = strings.size();
    602         int length = s.length();
    603         int pos = start;
    604         int rest = length - start;
    605         int count = 0;
    606         while (rest != 0) {
    607             // Try to match the next code point.
    608             int cpLength = spanOne(spanSet, s, pos, rest);
    609             if (cpLength > 0) {
    610                 offsets.addOffsetAndCount(cpLength, count + 1);
    611             }
    612             // Try to match all of the strings.
    613             for (int i = 0; i < stringsLength; ++i) {
    614                 String string = strings.get(i);
    615                 int length16 = string.length();
    616                 // Note: If the strings were sorted by length, then we could also
    617                 // avoid trying to match if there is already a match of the same length.
    618                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
    619                         matches16CPB(s, pos, length, string, length16)) {
    620                     offsets.addOffsetAndCount(length16, count + 1);
    621                 }
    622             }
    623             // We are done if there is no match beyond pos.
    624             if (offsets.isEmpty()) {
    625                 outCount.value = count;
    626                 return pos;
    627             }
    628             // Continue from the nearest match.
    629             int minOffset = offsets.popMinimum(outCount);
    630             count = outCount.value;
    631             pos += minOffset;
    632             rest -= minOffset;
    633         }
    634         outCount.value = count;
    635         return pos;
    636     }
    637 
    638     /**
    639      * Span a string backwards.
    640      *
    641      * @param s The string to be spanned
    642      * @param spanCondition The span condition
    643      * @return The string index which starts the span (i.e. inclusive).
    644      */
    645     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
    646         if (spanCondition == SpanCondition.NOT_CONTAINED) {
    647             return spanNotBack(s, length);
    648         }
    649         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
    650         if (pos == 0) {
    651             return 0;
    652         }
    653         int spanLength = length - pos;
    654 
    655         // Consider strings; they may overlap with the span.
    656         int initSize = 0;
    657         if (spanCondition == SpanCondition.CONTAINED) {
    658             // Use offset list to try all possibilities.
    659             initSize = maxLength16;
    660         }
    661         offsets.setMaxLength(initSize);
    662         int i, stringsLength = strings.size();
    663         int spanBackLengthsOffset = 0;
    664         if (all) {
    665             spanBackLengthsOffset = stringsLength;
    666         }
    667         for (;;) {
    668             if (spanCondition == SpanCondition.CONTAINED) {
    669                 for (i = 0; i < stringsLength; ++i) {
    670                     int overlap = spanLengths[spanBackLengthsOffset + i];
    671                     if (overlap == ALL_CP_CONTAINED) {
    672                         continue; // Irrelevant string.
    673                     }
    674                     String string = strings.get(i);
    675 
    676                     int length16 = string.length();
    677 
    678                     // Try to match this string at pos-(length16-overlap)..pos-length16.
    679                     if (overlap >= LONG_SPAN) {
    680                         overlap = length16;
    681                         // While contained: No point matching fully inside the code point span.
    682                         int len1 = 0;
    683                         len1 = string.offsetByCodePoints(0, 1);
    684                         overlap -= len1; // Length of the string minus the first code point.
    685                     }
    686                     if (overlap > spanLength) {
    687                         overlap = spanLength;
    688                     }
    689                     int dec = length16 - overlap; // Keep dec+overlap==length16.
    690                     for (;;) {
    691                         if (dec > pos) {
    692                             break;
    693                         }
    694                         // Try to match if the decrement is not listed already.
    695                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
    696                             if (dec == pos) {
    697                                 return 0; // Reached the start of the string.
    698                             }
    699                             offsets.addOffset(dec);
    700                         }
    701                         if (overlap == 0) {
    702                             break;
    703                         }
    704                         --overlap;
    705                         ++dec;
    706                     }
    707                 }
    708             } else /* SIMPLE */{
    709                 int maxDec = 0, maxOverlap = 0;
    710                 for (i = 0; i < stringsLength; ++i) {
    711                     int overlap = spanLengths[spanBackLengthsOffset + i];
    712                     // For longest match, we do need to try to match even an all-contained string
    713                     // to find the match from the latest end.
    714 
    715                     String string = strings.get(i);
    716 
    717                     int length16 = string.length();
    718 
    719                     // Try to match this string at pos-(length16-overlap)..pos-length16.
    720                     if (overlap >= LONG_SPAN) {
    721                         overlap = length16;
    722                         // Longest match: Need to match fully inside the code point span
    723                         // to find the match from the latest end.
    724                     }
    725                     if (overlap > spanLength) {
    726                         overlap = spanLength;
    727                     }
    728                     int dec = length16 - overlap; // Keep dec+overlap==length16.
    729                     for (;;) {
    730                         if (dec > pos || overlap < maxOverlap) {
    731                             break;
    732                         }
    733                         // Try to match if the string is longer or ends later.
    734                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
    735                                 && matches16CPB(s, pos - dec, length, string, length16)) {
    736                             maxDec = dec; // Longest match from latest end.
    737                             maxOverlap = overlap;
    738                             break;
    739                         }
    740                         --overlap;
    741                         ++dec;
    742                     }
    743                 }
    744 
    745                 if (maxDec != 0 || maxOverlap != 0) {
    746                     // Longest-match algorithm, and there was a string match.
    747                     // Simply continue before it.
    748                     pos -= maxDec;
    749                     if (pos == 0) {
    750                         return 0; // Reached the start of the string.
    751                     }
    752                     spanLength = 0; // Match strings from before a string match.
    753                     continue;
    754                 }
    755             }
    756             // Finished trying to match all strings at pos.
    757 
    758             if (spanLength != 0 || pos == length) {
    759                 // The position is before an unlimited code point span (spanLength!=0),
    760                 // not before a string match.
    761                 // The only position where spanLength==0 before a span is pos==length.
    762                 // Otherwise, an unlimited code point span is only tried again when no
    763                 // strings match, and if such a non-initial span fails we stop.
    764                 if (offsets.isEmpty()) {
    765                     return pos; // No strings matched before a span.
    766                 }
    767                 // Match strings from before the next string match.
    768             } else {
    769                 // The position is before a string match (or a single code point).
    770                 if (offsets.isEmpty()) {
    771                     // No more strings matched before a previous string match.
    772                     // Try another code point span from before the last string match.
    773                     int oldPos = pos;
    774                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
    775                     spanLength = oldPos - pos;
    776                     if (pos == 0 || // Reached the start of the string, or
    777                             spanLength == 0 // neither strings nor span progressed.
    778                     ) {
    779                         return pos;
    780                     }
    781                     continue; // spanLength>0: Match strings from before a span.
    782                 } else {
    783                     // Try to match only one code point from before a string match if some
    784                     // string matched beyond it, so that we try all possible positions
    785                     // and don't overshoot.
    786                     spanLength = spanOneBack(spanSet, s, pos);
    787                     if (spanLength > 0) {
    788                         if (spanLength == pos) {
    789                             return 0; // Reached the start of the string.
    790                         }
    791                         // Match strings before this code point.
    792                         // There cannot be any decrements below it because UnicodeSet strings
    793                         // contain multiple code points.
    794                         pos -= spanLength;
    795                         offsets.shift(spanLength);
    796                         spanLength = 0;
    797                         continue; // Match strings from before a single code point.
    798                     }
    799                     // Match strings from before the next string match.
    800                 }
    801             }
    802             pos -= offsets.popMinimum(null);
    803             spanLength = 0; // Match strings from before a string match.
    804         }
    805     }
    806 
    807     /**
    808      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
    809      *
    810      * Theoretical algorithm:
    811      * - Iterate through the string, and at each code point boundary:
    812      *   + If the code point there is in the set, then return with the current position.
    813      *   + If a set string matches at the current position, then return with the current position.
    814      *
    815      * Optimized implementation:
    816      *
    817      * (Same assumption as for span() above.)
    818      *
    819      * Create and cache a spanNotSet which contains
    820      * all of the single code points of the original set but none of its strings.
    821      * For each set string add its initial code point to the spanNotSet.
    822      * (Also add its final code point for spanNotBack().)
    823      *
    824      * - Loop:
    825      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
    826      *   + If the current code point is in the original set, then return the current position.
    827      *   + If any set string matches at the current position, then return the current position.
    828      *   + If there is no match at the current position, neither for the code point
    829      *     there nor for any set string, then skip this code point and continue the loop.
    830      *     This happens for set-string-initial code points that were added to spanNotSet
    831      *     when there is not actually a match for such a set string.
    832      *
    833      * @param s The string to be spanned
    834      * @param start The start index that the span begins
    835      * @param outCount If not null: Receives the number of code points across the span.
    836      * @return the limit (exclusive end) of the span
    837      */
    838     private int spanNot(CharSequence s, int start, OutputInt outCount) {
    839         int length = s.length();
    840         int pos = start, rest = length - start;
    841         int stringsLength = strings.size();
    842         int count = 0;
    843         do {
    844             // Span until we find a code point from the set,
    845             // or a code point that starts or ends some string.
    846             int spanLimit;
    847             if (outCount == null) {
    848                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
    849             } else {
    850                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
    851                 outCount.value = count = count + outCount.value;
    852             }
    853             if (spanLimit == length) {
    854                 return length; // Reached the end of the string.
    855             }
    856             pos = spanLimit;
    857             rest = length - spanLimit;
    858 
    859             // Check whether the current code point is in the original set,
    860             // without the string starts and ends.
    861             int cpLength = spanOne(spanSet, s, pos, rest);
    862             if (cpLength > 0) {
    863                 return pos; // There is a set element at pos.
    864             }
    865 
    866             // Try to match the strings at pos.
    867             for (int i = 0; i < stringsLength; ++i) {
    868                 if (spanLengths[i] == ALL_CP_CONTAINED) {
    869                     continue; // Irrelevant string.
    870                 }
    871                 String string = strings.get(i);
    872 
    873                 int length16 = string.length();
    874                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
    875                     return pos; // There is a set element at pos.
    876                 }
    877             }
    878 
    879             // The span(while not contained) ended on a string start/end which is
    880             // not in the original set. Skip this code point and continue.
    881             // cpLength<0
    882             pos -= cpLength;
    883             rest += cpLength;
    884             ++count;
    885         } while (rest != 0);
    886         if (outCount != null) {
    887             outCount.value = count;
    888         }
    889         return length; // Reached the end of the string.
    890     }
    891 
    892     private int spanNotBack(CharSequence s, int length) {
    893         int pos = length;
    894         int i, stringsLength = strings.size();
    895         do {
    896             // Span until we find a code point from the set,
    897             // or a code point that starts or ends some string.
    898             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
    899             if (pos == 0) {
    900                 return 0; // Reached the start of the string.
    901             }
    902 
    903             // Check whether the current code point is in the original set,
    904             // without the string starts and ends.
    905             int cpLength = spanOneBack(spanSet, s, pos);
    906             if (cpLength > 0) {
    907                 return pos; // There is a set element at pos.
    908             }
    909 
    910             // Try to match the strings at pos.
    911             for (i = 0; i < stringsLength; ++i) {
    912                 // Use spanLengths rather than a spanLengths pointer because
    913                 // it is easier and we only need to know whether the string is irrelevant
    914                 // which is the same in either array.
    915                 if (spanLengths[i] == ALL_CP_CONTAINED) {
    916                     continue; // Irrelevant string.
    917                 }
    918                 String string = strings.get(i);
    919 
    920                 int length16 = string.length();
    921                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
    922                     return pos; // There is a set element at pos.
    923                 }
    924             }
    925 
    926             // The span(while not contained) ended on a string start/end which is
    927             // not in the original set. Skip this code point and continue.
    928             // cpLength<0
    929             pos += cpLength;
    930         } while (pos != 0);
    931         return 0; // Reached the start of the string.
    932     }
    933 
    934     static short makeSpanLengthByte(int spanLength) {
    935         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
    936         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
    937     }
    938 
    939     // Compare strings without any argument checks. Requires length>0.
    940     private static boolean matches16(CharSequence s, int start, final String t, int length) {
    941         int end = start + length;
    942         while (length-- > 0) {
    943             if (s.charAt(--end) != t.charAt(length)) {
    944                 return false;
    945             }
    946         }
    947         return true;
    948     }
    949 
    950     /**
    951      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
    952      * at code point boundaries.
    953      * That is, each edge of a match must not be in the middle of a surrogate pair.
    954      * @param s       The string to match in.
    955      * @param start   The start index of s.
    956      * @param limit   The limit of the subsequence of s being spanned.
    957      * @param t       The substring to be matched in s.
    958      * @param tlength The length of t.
    959      */
    960     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
    961         return matches16(s, start, t, tlength)
    962                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
    963                         Character.isLowSurrogate(s.charAt(start)))
    964                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
    965                         Character.isLowSurrogate(s.charAt(start + tlength)));
    966     }
    967 
    968     /**
    969      * Does the set contain the next code point?
    970      * If so, return its length; otherwise return its negative length.
    971      */
    972     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
    973         char c = s.charAt(start);
    974         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
    975             char c2 = s.charAt(start + 1);
    976             if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
    977                 int supplementary = Character.toCodePoint(c, c2);
    978                 return set.contains(supplementary) ? 2 : -2;
    979             }
    980         }
    981         return set.contains(c) ? 1 : -1;
    982     }
    983 
    984     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
    985         char c = s.charAt(length - 1);
    986         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
    987             char c2 = s.charAt(length - 2);
    988             if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
    989                 int supplementary = Character.toCodePoint(c2, c);
    990                 return set.contains(supplementary) ? 2 : -2;
    991             }
    992         }
    993         return set.contains(c) ? 1 : -1;
    994     }
    995 
    996     /**
    997      * Helper class for UnicodeSetStringSpan.
    998      *
    999      * <p>List of offsets from the current position from where to try matching
   1000      * a code point or a string.
   1001      * Stores offsets rather than indexes to simplify the code and use the same list
   1002      * for both increments (in span()) and decrements (in spanBack()).
   1003      *
   1004      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
   1005      * are relatively dense, that is,
   1006      * there are normally no gaps of hundreds or thousands of offset values.
   1007      *
   1008      * <p>This class optionally also tracks the minimum non-negative count for each position,
   1009      * intended to count the smallest number of elements of any path leading to that position.
   1010      *
   1011      * <p>The implementation uses a circular buffer of count integers,
   1012      * each indicating whether the corresponding offset is in the list,
   1013      * and its path element count.
   1014      * This avoids inserting into a sorted list of offsets (or absolute indexes)
   1015      * and physically moving part of the list.
   1016      *
   1017      * <p>Note: In principle, the caller should setMaxLength() to
   1018      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
   1019      * to account for "long" single code points.
   1020      *
   1021      * <p>Note: An earlier version did not track counts and stored only byte flags.
   1022      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
   1023      * the list could be stored as bit flags in a single integer.
   1024      * Rather than handling a circular buffer with a start list index,
   1025      * the integer would simply be shifted when lower offsets are removed.
   1026      * UnicodeSet does not have a limit on the lengths of strings.
   1027      */
   1028     private static final class OffsetList {
   1029         private int[] list;
   1030         private int length;
   1031         private int start;
   1032 
   1033         public OffsetList() {
   1034             list = new int[16];  // default size
   1035         }
   1036 
   1037         public void setMaxLength(int maxLength) {
   1038             if (maxLength > list.length) {
   1039                 list = new int[maxLength];
   1040             }
   1041             clear();
   1042         }
   1043 
   1044         public void clear() {
   1045             for (int i = list.length; i-- > 0;) {
   1046                 list[i] = 0;
   1047             }
   1048             start = length = 0;
   1049         }
   1050 
   1051         public boolean isEmpty() {
   1052             return (length == 0);
   1053         }
   1054 
   1055         /**
   1056          * Reduces all stored offsets by delta, used when the current position moves by delta.
   1057          * There must not be any offsets lower than delta.
   1058          * If there is an offset equal to delta, it is removed.
   1059          *
   1060          * @param delta [1..maxLength]
   1061          */
   1062         public void shift(int delta) {
   1063             int i = start + delta;
   1064             if (i >= list.length) {
   1065                 i -= list.length;
   1066             }
   1067             if (list[i] != 0) {
   1068                 list[i] = 0;
   1069                 --length;
   1070             }
   1071             start = i;
   1072         }
   1073 
   1074         /**
   1075          * Adds an offset. The list must not contain it yet.
   1076          * @param offset [1..maxLength]
   1077          */
   1078         public void addOffset(int offset) {
   1079             int i = start + offset;
   1080             if (i >= list.length) {
   1081                 i -= list.length;
   1082             }
   1083             assert list[i] == 0;
   1084             list[i] = 1;
   1085             ++length;
   1086         }
   1087 
   1088         /**
   1089          * Adds an offset and updates its count.
   1090          * The list may already contain the offset.
   1091          * @param offset [1..maxLength]
   1092          */
   1093         public void addOffsetAndCount(int offset, int count) {
   1094             assert count > 0;
   1095             int i = start + offset;
   1096             if (i >= list.length) {
   1097                 i -= list.length;
   1098             }
   1099             if (list[i] == 0) {
   1100                 list[i] = count;
   1101                 ++length;
   1102             } else if (count < list[i]) {
   1103                 list[i] = count;
   1104             }
   1105         }
   1106 
   1107         /**
   1108          * @param offset [1..maxLength]
   1109          */
   1110         public boolean containsOffset(int offset) {
   1111             int i = start + offset;
   1112             if (i >= list.length) {
   1113                 i -= list.length;
   1114             }
   1115             return list[i] != 0;
   1116         }
   1117 
   1118         /**
   1119          * @param offset [1..maxLength]
   1120          */
   1121         public boolean hasCountAtOffset(int offset, int count) {
   1122             int i = start + offset;
   1123             if (i >= list.length) {
   1124                 i -= list.length;
   1125             }
   1126             int oldCount = list[i];
   1127             return oldCount != 0 && oldCount <= count;
   1128         }
   1129 
   1130         /**
   1131          * Finds the lowest stored offset from a non-empty list, removes it,
   1132          * and reduces all other offsets by this minimum.
   1133          * @return min=[1..maxLength]
   1134          */
   1135         public int popMinimum(OutputInt outCount) {
   1136             // Look for the next offset in list[start+1..list.length-1].
   1137             int i = start, result;
   1138             while (++i < list.length) {
   1139                 int count = list[i];
   1140                 if (count != 0) {
   1141                     list[i] = 0;
   1142                     --length;
   1143                     result = i - start;
   1144                     start = i;
   1145                     if (outCount != null) { outCount.value = count; }
   1146                     return result;
   1147                 }
   1148             }
   1149             // i==list.length
   1150 
   1151             // Wrap around and look for the next offset in list[0..start].
   1152             // Since the list is not empty, there will be one.
   1153             result = list.length - start;
   1154             i = 0;
   1155             int count;
   1156             while ((count = list[i]) == 0) {
   1157                 ++i;
   1158             }
   1159             list[i] = 0;
   1160             --length;
   1161             start = i;
   1162             if (outCount != null) { outCount.value = count; }
   1163             return result + i;
   1164         }
   1165     }
   1166 }
   1167