Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 1996-2016, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 
     12 import java.text.CharacterIterator;
     13 import java.text.StringCharacterIterator;
     14 import java.util.Locale;
     15 
     16 import android.icu.util.ICUException;
     17 import android.icu.util.ULocale;
     18 
     19 // Java porting note:
     20 //
     21 //        The ICU4C implementation contains dead code in many places.
     22 //      While porting the ICU4C linear search implementation, this dead code
     23 //      was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***"
     24 //      are those dead code blocks, still available in ICU4C.
     25 
     26 //        The ICU4C implementation does not seem to handle UCharacterIterator pointing
     27 //      to a fragment of text properly. ICU4J uses CharacterIterator to navigate through
     28 //      the input text. We need to carefully review the code ported from ICU4C
     29 //      assuming the start index is 0.
     30 
     31 //        ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like
     32 //      CE is no longer used, except in a few places checking CELength. It looks like this
     33 //      is a leftover from already-disabled Boyer-Moore search code. This Java implementation
     34 //      preserves the code, but we should clean this up later.
     35 
     36 /**
     37  *
     38  * <tt>StringSearch</tt> is a {@link SearchIterator} that provides
     39  * language-sensitive text searching based on the comparison rules defined
     40  * in a {@link RuleBasedCollator} object.
     41  * StringSearch ensures that language eccentricity can be
     42  * handled, e.g. for the German collator, characters &szlig; and SS will be matched
     43  * if case is chosen to be ignored.
     44  * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
     45  * "ICU Collation Design Document"</a> for more information.
     46  * <p>
     47  * There are 2 match options for selection:<br>
     48  * Let S' be the sub-string of a text string S between the offsets start and
     49  * end [start, end].
     50  * <br>
     51  * A pattern string P matches a text string S at the offsets [start, end]
     52  * if
     53  * <pre>
     54  * option 1. Some canonical equivalent of P matches some canonical equivalent
     55  *           of S'
     56  * option 2. P matches S' and if P starts or ends with a combining mark,
     57  *           there exists no non-ignorable combining mark before or after S?
     58  *           in S respectively.
     59  * </pre>
     60  * Option 2. is the default.
     61  * <p>
     62  * This search has APIs similar to that of other text iteration mechanisms
     63  * such as the break iterators in {@link BreakIterator}. Using these
     64  * APIs, it is easy to scan through text looking for all occurrences of
     65  * a given pattern. This search iterator allows changing of direction by
     66  * calling a {@link #reset} followed by a {@link #next} or {@link #previous}.
     67  * Though a direction change can occur without calling {@link #reset} first,
     68  * this operation comes with some speed penalty.
     69  * Match results in the forward direction will match the result matches in
     70  * the backwards direction in the reverse order
     71  * <p>
     72  * {@link SearchIterator} provides APIs to specify the starting position
     73  * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex},
     74  * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}.
     75  * Since the starting position will be set as it is specified, please take note that
     76  * there are some danger points at which the search may render incorrect
     77  * results:
     78  * <ul>
     79  * <li> In the midst of a substring that requires normalization.
     80  * <li> If the following match is to be found, the position should not be the
     81  *      second character which requires swapping with the preceding
     82  *      character. Vice versa, if the preceding match is to be found, the
     83  *      position to search from should not be the first character which
     84  *      requires swapping with the next character. E.g certain Thai and
     85  *      Lao characters require swapping.
     86  * <li> If a following pattern match is to be found, any position within a
     87  *      contracting sequence except the first will fail. Vice versa if a
     88  *      preceding pattern match is to be found, an invalid starting point
     89  *      would be any character within a contracting sequence except the last.
     90  * </ul>
     91  * <p>
     92  * A {@link BreakIterator} can be used if only matches at logical breaks are desired.
     93  * Using a {@link BreakIterator} will only give you results that exactly matches the
     94  * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will
     95  * not be found in the string "\u00e9" if a character break iterator is used.
     96  * <p>
     97  * Options are provided to handle overlapping matches.
     98  * E.g. In English, overlapping matches produces the result 0 and 2
     99  * for the pattern "abab" in the text "ababab", where mutually
    100  * exclusive matches only produces the result of 0.
    101  * <p>
    102  * Options are also provided to implement "asymmetric search" as described in
    103  * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
    104  * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType
    105  * values.
    106  * <p>
    107  * Though collator attributes will be taken into consideration while
    108  * performing matches, there are no APIs here for setting and getting the
    109  * attributes. These attributes can be set by getting the collator
    110  * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}.
    111  * Lastly to update <tt>StringSearch</tt> to the new collator attributes,
    112  * {@link #reset} has to be called.
    113  * <p>
    114  * Restriction: <br>
    115  * Currently there are no composite characters that consists of a
    116  * character with combining class &gt; 0 before a character with combining
    117  * class == 0. However, if such a character exists in the future,
    118  * <tt>StringSearch</tt> does not guarantee the results for option 1.
    119  * <p>
    120  * Consult the {@link SearchIterator} documentation for information on
    121  * and examples of how to use instances of this class to implement text
    122  * searching.
    123  * <p>
    124  * Note, <tt>StringSearch</tt> is not to be subclassed.
    125  * </p>
    126  * @see SearchIterator
    127  * @see RuleBasedCollator
    128  * @author Laura Werner, synwee
    129  */
    130 // internal notes: all methods do not guarantee the correct status of the
    131 // characteriterator. the caller has to maintain the original index position
    132 // if necessary. methods could change the index position as it deems fit
    133 public final class StringSearch extends SearchIterator {
    134 
    135     private Pattern pattern_;
    136     private RuleBasedCollator collator_;
    137 
    138     // positions within the collation element iterator is used to determine
    139     // if we are at the start of the text.
    140     private CollationElementIterator textIter_;
    141     private CollationPCE textProcessedIter_;
    142 
    143     // utility collation element, used throughout program for temporary
    144     // iteration.
    145     private CollationElementIterator utilIter_;
    146 
    147     private Normalizer2 nfd_;
    148 
    149     private int strength_;
    150     int ceMask_;
    151     int variableTop_;
    152 
    153     private boolean toShift_;
    154 
    155     // *** Boyer-Moore ***
    156     // private char[] canonicalPrefixAccents_;
    157     // private char[] canonicalSuffixAccents_;
    158 
    159     /**
    160      * Initializes the iterator to use the language-specific rules defined in
    161      * the argument collator to search for argument pattern in the argument
    162      * target text. The argument <code>breakiter</code> is used to define logical matches.
    163      * See super class documentation for more details on the use of the target
    164      * text and {@link BreakIterator}.
    165      * @param pattern text to look for.
    166      * @param target target text to search for pattern.
    167      * @param collator {@link RuleBasedCollator} that defines the language rules
    168      * @param breakiter A {@link BreakIterator} that is used to determine the
    169      *                boundaries of a logical match. This argument can be null.
    170      * @throws IllegalArgumentException thrown when argument target is null,
    171      *            or of length 0
    172      * @see BreakIterator
    173      * @see RuleBasedCollator
    174      */
    175     public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator,
    176             BreakIterator breakiter) {
    177 
    178         // This implementation is ported from ICU4C usearch_open()
    179 
    180         super(target, breakiter);
    181 
    182         // string search does not really work when numeric collation is turned on
    183         if (collator.getNumericCollation()) {
    184             throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch");
    185         }
    186 
    187         collator_ = collator;
    188         strength_ = collator.getStrength();
    189         ceMask_ = getMask(strength_);
    190         toShift_ = collator.isAlternateHandlingShifted();
    191         variableTop_ = collator.getVariableTop();
    192 
    193         nfd_ = Normalizer2.getNFDInstance();
    194 
    195         pattern_ = new Pattern(pattern);
    196 
    197         search_.setMatchedLength(0);
    198         search_.matchedIndex_ = DONE;
    199 
    200         utilIter_ = null;
    201         textIter_ = new CollationElementIterator(target, collator);
    202 
    203         textProcessedIter_ = null;
    204 
    205         // This is done by super class constructor
    206         /*
    207         search_.isOverlap_ = false;
    208         search_.isCanonicalMatch_ = false;
    209         search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
    210         search_.isForwardSearching_ = true;
    211         search_.reset_ = true;
    212          */
    213         ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
    214         search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
    215         search_.internalBreakIter_.setText((CharacterIterator)target.clone());  // We need to create a clone
    216 
    217         initialize();
    218     }
    219 
    220     /**
    221      * Initializes the iterator to use the language-specific rules defined in
    222      * the argument collator to search for argument pattern in the argument
    223      * target text. No {@link BreakIterator}s are set to test for logical matches.
    224      * @param pattern text to look for.
    225      * @param target target text to search for pattern.
    226      * @param collator {@link RuleBasedCollator} that defines the language rules
    227      * @throws IllegalArgumentException thrown when argument target is null,
    228      *            or of length 0
    229      * @see RuleBasedCollator
    230      */
    231     public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) {
    232         this(pattern, target, collator, null);
    233     }
    234 
    235     /**
    236      * Initializes the iterator to use the language-specific rules and
    237      * break iterator rules defined in the argument locale to search for
    238      * argument pattern in the argument target text.
    239      * @param pattern text to look for.
    240      * @param target target text to search for pattern.
    241      * @param locale locale to use for language and break iterator rules
    242      * @throws IllegalArgumentException thrown when argument target is null,
    243      *            or of length 0. ClassCastException thrown if the collator for
    244      *            the specified locale is not a RuleBasedCollator.
    245      */
    246     public StringSearch(String pattern, CharacterIterator target, Locale locale) {
    247         this(pattern, target, ULocale.forLocale(locale));
    248     }
    249 
    250     /**
    251      * Initializes the iterator to use the language-specific rules and
    252      * break iterator rules defined in the argument locale to search for
    253      * argument pattern in the argument target text.
    254      * See super class documentation for more details on the use of the target
    255      * text and {@link BreakIterator}.
    256      * @param pattern text to look for.
    257      * @param target target text to search for pattern.
    258      * @param locale locale to use for language and break iterator rules
    259      * @throws IllegalArgumentException thrown when argument target is null,
    260      *            or of length 0. ClassCastException thrown if the collator for
    261      *            the specified locale is not a RuleBasedCollator.
    262      * @see BreakIterator
    263      * @see RuleBasedCollator
    264      * @see SearchIterator
    265      */
    266     public StringSearch(String pattern, CharacterIterator target, ULocale locale) {
    267         this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null);
    268     }
    269 
    270     /**
    271      * Initializes the iterator to use the language-specific rules and
    272      * break iterator rules defined in the default locale to search for
    273      * argument pattern in the argument target text.
    274      * @param pattern text to look for.
    275      * @param target target text to search for pattern.
    276      * @throws IllegalArgumentException thrown when argument target is null,
    277      *            or of length 0. ClassCastException thrown if the collator for
    278      *            the default locale is not a RuleBasedCollator.
    279      */
    280     public StringSearch(String pattern, String target) {
    281         this(pattern, new StringCharacterIterator(target),
    282                 (RuleBasedCollator) Collator.getInstance(), null);
    283     }
    284 
    285     /**
    286      * Gets the {@link RuleBasedCollator} used for the language rules.
    287      * <p>
    288      * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any
    289      * changes to the {@link RuleBasedCollator} result should follow with a call to
    290      * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct
    291      * search behavior.
    292      * </p>
    293      * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt>
    294      * @see RuleBasedCollator
    295      * @see #setCollator
    296      */
    297     public RuleBasedCollator getCollator() {
    298         return collator_;
    299     }
    300 
    301     /**
    302      * Sets the {@link RuleBasedCollator} to be used for language-specific searching.
    303      * <p>
    304      * The iterator's position will not be changed by this method.
    305      * @param collator to use for this <tt>StringSearch</tt>
    306      * @throws IllegalArgumentException thrown when collator is null
    307      * @see #getCollator
    308      */
    309     public void setCollator(RuleBasedCollator collator) {
    310         if (collator == null) {
    311             throw new IllegalArgumentException("Collator can not be null");
    312         }
    313         collator_ = collator;
    314         ceMask_ = getMask(collator_.getStrength());
    315 
    316         ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
    317         search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
    318         search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone());  // We need to create a clone
    319 
    320         toShift_ = collator.isAlternateHandlingShifted();
    321         variableTop_ = collator.getVariableTop();
    322         textIter_ = new CollationElementIterator(pattern_.text_, collator);
    323         utilIter_ = new CollationElementIterator(pattern_.text_, collator);
    324 
    325         // initialize() _after_ setting the iterators for the new collator.
    326         initialize();
    327     }
    328 
    329     /**
    330      * Returns the pattern for which <tt>StringSearch</tt> is searching for.
    331      * @return the pattern searched for
    332      */
    333     public String getPattern() {
    334         return pattern_.text_;
    335     }
    336 
    337     /**
    338      * Set the pattern to search for.
    339      * The iterator's position will not be changed by this method.
    340      * @param pattern for searching
    341      * @see #getPattern
    342      * @exception IllegalArgumentException thrown if pattern is null or of
    343      *               length 0
    344      */
    345     public void setPattern(String pattern) {
    346         if (pattern == null || pattern.length() <= 0) {
    347             throw new IllegalArgumentException(
    348                     "Pattern to search for can not be null or of length 0");
    349         }
    350         pattern_.text_ = pattern;
    351         initialize();
    352     }
    353 
    354     /**
    355      * Determines whether canonical matches (option 1, as described in the
    356      * class documentation) is set.
    357      * See setCanonical(boolean) for more information.
    358      * @see #setCanonical
    359      * @return true if canonical matches is set, false otherwise
    360      */
    361     //TODO: hoist this to SearchIterator
    362     public boolean isCanonical() {
    363         return search_.isCanonicalMatch_;
    364     }
    365 
    366     /**
    367      * Set the canonical match mode. See class documentation for details.
    368      * The default setting for this property is false.
    369      * @param allowCanonical flag indicator if canonical matches are allowed
    370      * @see #isCanonical
    371      */
    372     //TODO: hoist this to SearchIterator
    373     public void setCanonical(boolean allowCanonical) {
    374         search_.isCanonicalMatch_ = allowCanonical;
    375     }
    376 
    377     /**
    378      * {@inheritDoc}
    379      */
    380     @Override
    381     public void setTarget(CharacterIterator text) {
    382         super.setTarget(text);
    383         textIter_.setText(text);
    384     }
    385 
    386     /**
    387      * {@inheritDoc}
    388      */
    389     @Override
    390     public int getIndex() {
    391         int result = textIter_.getOffset();
    392         if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) {
    393             return DONE;
    394         }
    395         return result;
    396     }
    397 
    398     /**
    399      * {@inheritDoc}
    400      */
    401     @Override
    402     public void setIndex(int position) {
    403         // Java porting note: This method is equivalent to setOffset() in ICU4C.
    404         // ICU4C SearchIterator::setOffset() is a pure virtual method, while
    405         // ICU4J SearchIterator.setIndex() is not abstract method.
    406 
    407         super.setIndex(position);
    408         textIter_.setOffset(position);
    409     }
    410 
    411     /**
    412      * {@inheritDoc}
    413      */
    414     @Override
    415     public void reset() {
    416         // reset is setting the attributes that are already in
    417         // string search, hence all attributes in the collator should
    418         // be retrieved without any problems
    419 
    420         boolean sameCollAttribute = true;
    421         int ceMask;
    422         boolean shift;
    423         int varTop;
    424 
    425         // **** hack to deal w/ how processed CEs encode quaternary ****
    426         int newStrength = collator_.getStrength();
    427         if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY)
    428                 || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) {
    429             sameCollAttribute = false;
    430         }
    431 
    432         strength_ = collator_.getStrength();
    433         ceMask = getMask(strength_);
    434         if (ceMask_ != ceMask) {
    435             ceMask_ = ceMask;
    436             sameCollAttribute = false;
    437         }
    438 
    439         shift = collator_.isAlternateHandlingShifted();
    440         if (toShift_ != shift) {
    441             toShift_ = shift;
    442             sameCollAttribute = false;
    443         }
    444 
    445         varTop = collator_.getVariableTop();
    446         if (variableTop_ != varTop) {
    447             variableTop_ = varTop;
    448             sameCollAttribute = false;
    449         }
    450 
    451         if (!sameCollAttribute) {
    452             initialize();
    453         }
    454 
    455         textIter_.setText(search_.text());
    456 
    457         search_.setMatchedLength(0);
    458         search_.matchedIndex_ = DONE;
    459         search_.isOverlap_ = false;
    460         search_.isCanonicalMatch_ = false;
    461         search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
    462         search_.isForwardSearching_ = true;
    463         search_.reset_ = true;
    464     }
    465 
    466     /**
    467      * {@inheritDoc}
    468      */
    469     @Override
    470     protected int handleNext(int position) {
    471         if (pattern_.CELength_ == 0) {
    472             search_.matchedIndex_ = search_.matchedIndex_ == DONE ?
    473                                     getIndex() : search_.matchedIndex_ + 1;
    474             search_.setMatchedLength(0);
    475             textIter_.setOffset(search_.matchedIndex_);
    476             if (search_.matchedIndex_ == search_.endIndex()) {
    477                 search_.matchedIndex_ = DONE;
    478             }
    479         } else {
    480             if (search_.matchedLength() <= 0) {
    481                 // the flipping direction issue has already been handled
    482                 // in next()
    483                 // for boundary check purposes. this will ensure that the
    484                 // next match will not preceed the current offset
    485                 // note search_.matchedIndex_ will always be set to something
    486                 // in the code
    487                 search_.matchedIndex_ = position - 1;
    488             }
    489 
    490             textIter_.setOffset(position);
    491 
    492             // ICU4C comment:
    493             // if strsrch_->breakIter is always the same as m_breakiterator_
    494             // then we don't need to check the match boundaries here because
    495             // usearch_handleNextXXX will already have done it.
    496             if (search_.isCanonicalMatch_) {
    497                 // *could* actually use exact here 'cause no extra accents allowed...
    498                 handleNextCanonical();
    499             } else {
    500                 handleNextExact();
    501             }
    502 
    503             if (search_.matchedIndex_ == DONE) {
    504                 textIter_.setOffset(search_.endIndex());
    505             } else {
    506                 textIter_.setOffset(search_.matchedIndex_);
    507             }
    508 
    509             return search_.matchedIndex_;
    510         }
    511 
    512         return DONE;
    513     }
    514 
    515     /**
    516      * {@inheritDoc}
    517      */
    518     @Override
    519     protected int handlePrevious(int position) {
    520         if (pattern_.CELength_ == 0) {
    521             search_.matchedIndex_ =
    522                     search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_;
    523             if (search_.matchedIndex_ == search_.beginIndex()) {
    524                 setMatchNotFound();
    525             } else {
    526                 search_.matchedIndex_--;
    527                 textIter_.setOffset(search_.matchedIndex_);
    528                 search_.setMatchedLength(0);
    529             }
    530         } else {
    531             textIter_.setOffset(position);
    532 
    533             if (search_.isCanonicalMatch_) {
    534                 // *could* use exact match here since extra accents *not* allowed!
    535                 handlePreviousCanonical();
    536             } else {
    537                 handlePreviousExact();
    538             }
    539         }
    540 
    541         return search_.matchedIndex_;
    542     }
    543 
    544     // ------------------ Internal implementation code ---------------------------
    545 
    546     private static final int INITIAL_ARRAY_SIZE_ = 256;
    547 
    548     // *** Boyer-Moore ***
    549     // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
    550     // private static final int LAST_BYTE_MASK_ = 0xff;
    551     // private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
    552 
    553     private static final int PRIMARYORDERMASK = 0xffff0000;
    554     private static final int SECONDARYORDERMASK = 0x0000ff00;
    555     private static final int TERTIARYORDERMASK = 0x000000ff;
    556 
    557     /**
    558      * Getting the mask for collation strength
    559      * @param strength collation strength
    560      * @return collation element mask
    561      */
    562     private static int getMask(int strength) {
    563         switch (strength) {
    564         case Collator.PRIMARY:
    565             return PRIMARYORDERMASK;
    566         case Collator.SECONDARY:
    567             return SECONDARYORDERMASK | PRIMARYORDERMASK;
    568         default:
    569             return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK;
    570         }
    571     }
    572 
    573 
    574     // *** Boyer-Moore ***
    575     /*
    576     private final char getFCD(String str, int offset) {
    577         char ch = str.charAt(offset);
    578         if (ch < 0x180) {
    579             return (char) nfcImpl_.getFCD16FromBelow180(ch);
    580         } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
    581             if (!Character.isHighSurrogate(ch)) {
    582                 return (char) nfcImpl_.getFCD16FromNormData(ch);
    583             } else {
    584                 char c2;
    585                 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
    586                     return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
    587                 }
    588             }
    589         }
    590         return 0;
    591     }
    592 
    593     private final char getFCD(int c) {
    594         return (char)nfcImpl_.getFCD16(c);
    595     }
    596     */
    597 
    598     /**
    599      * Getting the modified collation elements taking into account the collation
    600      * attributes.
    601      *
    602      * @param sourcece
    603      * @return the modified collation element
    604      */
    605     private int getCE(int sourcece) {
    606         // note for tertiary we can't use the collator->tertiaryMask, that
    607         // is a preprocessed mask that takes into account case options. since
    608         // we are only concerned with exact matches, we don't need that.
    609         sourcece &= ceMask_;
    610 
    611         if (toShift_) {
    612             // alternate handling here, since only the 16 most significant digits
    613             // is only used, we can safely do a compare without masking
    614             // if the ce is a variable, we mask and get only the primary values
    615             // no shifting to quartenary is required since all primary values
    616             // less than variabletop will need to be masked off anyway.
    617             if (variableTop_ > sourcece) {
    618                 if (strength_ >= Collator.QUATERNARY) {
    619                     sourcece &= PRIMARYORDERMASK;
    620                 } else {
    621                     sourcece = CollationElementIterator.IGNORABLE;
    622                 }
    623             }
    624         } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) {
    625             sourcece = 0xFFFF;
    626         }
    627 
    628         return sourcece;
    629     }
    630 
    631     /**
    632      * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp
    633      * (except not taking destination buffer size and status param).
    634      * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
    635      * implement this in Pattern class.
    636      *
    637      * @param destination target array
    638      * @param offset destination offset to add value
    639      * @param value to be added
    640      * @param increments incremental size expected
    641      * @return new destination array, destination if there was no new allocation
    642      */
    643     private static int[] addToIntArray(int[] destination, int offset, int value, int increments) {
    644         int newlength = destination.length;
    645         if (offset + 1 == newlength) {
    646             newlength += increments;
    647             int temp[] = new int[newlength];
    648             System.arraycopy(destination, 0, temp, 0, offset);
    649             destination = temp;
    650         }
    651         destination[offset] = value;
    652         return destination;
    653     }
    654 
    655     /**
    656      * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp.
    657      * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
    658      * implement this in Pattern class.
    659      *
    660      * @param destination target array
    661      * @param offset destination offset to add value
    662      * @param destinationlength target array size
    663      * @param value to be added
    664      * @param increments incremental size expected
    665      * @return new destination array, destination if there was no new allocation
    666      */
    667     private static long[] addToLongArray(long[] destination, int offset, int destinationlength,
    668             long value, int increments) {
    669         int newlength = destinationlength;
    670         if (offset + 1 == newlength) {
    671             newlength += increments;
    672             long temp[] = new long[newlength];
    673             System.arraycopy(destination, 0, temp, 0, offset);
    674             destination = temp;
    675         }
    676         destination[offset] = value;
    677         return destination;
    678     }
    679 
    680     /**
    681      * Initializing the ce table for a pattern.
    682      * Stores non-ignorable collation keys.
    683      * Table size will be estimated by the size of the pattern text. Table
    684      * expansion will be perform as we go along. Adding 1 to ensure that the table
    685      * size definitely increases.
    686      * @return total number of expansions
    687      */
    688     // TODO: We probably do not need Pattern CE table.
    689     private int initializePatternCETable() {
    690         int[] cetable = new int[INITIAL_ARRAY_SIZE_];
    691         int patternlength = pattern_.text_.length();
    692         CollationElementIterator coleiter = utilIter_;
    693 
    694         if (coleiter == null) {
    695             coleiter = new CollationElementIterator(pattern_.text_, collator_);
    696             utilIter_ = coleiter;
    697         } else {
    698             coleiter.setText(pattern_.text_);
    699         }
    700 
    701         int offset = 0;
    702         int result = 0;
    703         int ce;
    704 
    705         while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) {
    706             int newce = getCE(ce);
    707             if (newce != CollationElementIterator.IGNORABLE /* 0 */) {
    708                 int[] temp = addToIntArray(cetable, offset, newce,
    709                         patternlength - coleiter.getOffset() + 1);
    710                 offset++;
    711                 cetable = temp;
    712             }
    713             result += (coleiter.getMaxExpansion(ce) - 1);
    714         }
    715 
    716         cetable[offset] = 0;
    717         pattern_.CE_ = cetable;
    718         pattern_.CELength_ = offset;
    719 
    720         return result;
    721     }
    722 
    723     /**
    724      * Initializing the pce table for a pattern.
    725      * Stores non-ignorable collation keys.
    726      * Table size will be estimated by the size of the pattern text. Table
    727      * expansion will be perform as we go along. Adding 1 to ensure that the table
    728      * size definitely increases.
    729      * @return total number of expansions
    730      */
    731     private int initializePatternPCETable() {
    732         long[] pcetable = new long[INITIAL_ARRAY_SIZE_];
    733         int pcetablesize = pcetable.length;
    734         int patternlength = pattern_.text_.length();
    735         CollationElementIterator coleiter = utilIter_;
    736 
    737         if (coleiter == null) {
    738             coleiter = new CollationElementIterator(pattern_.text_, collator_);
    739             utilIter_ = coleiter;
    740         } else {
    741             coleiter.setText(pattern_.text_);
    742         }
    743 
    744         int offset = 0;
    745         int result = 0;
    746         long pce;
    747 
    748         CollationPCE iter = new CollationPCE(coleiter);
    749 
    750         // ** Should processed CEs be signed or unsigned?
    751         // ** (the rest of the code in this file seems to play fast-and-loose with
    752         // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
    753         while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) {
    754             long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1);
    755             offset++;
    756             pcetable = temp;
    757         }
    758 
    759         pcetable[offset] = 0;
    760         pattern_.PCE_ = pcetable;
    761         pattern_.PCELength_ = offset;
    762 
    763         return result;
    764     }
    765 
    766     // TODO: This method only triggers initializePatternCETable(), which is probably no
    767     //      longer needed.
    768     private int initializePattern() {
    769         // Since the strength is primary, accents are ignored in the pattern.
    770 
    771         // *** Boyer-Moore ***
    772         /*
    773         if (strength_ == Collator.PRIMARY) {
    774             pattern_.hasPrefixAccents_ = false;
    775             pattern_.hasSuffixAccents_ = false;
    776         } else {
    777             pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0;
    778             pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0;
    779         }
    780         */
    781 
    782         pattern_.PCE_ = null;
    783 
    784         // since intializePattern is an internal method status is a success.
    785         return initializePatternCETable();
    786     }
    787 
    788     // *** Boyer-Moore ***
    789     /*
    790      private final void setShiftTable(char shift[],
    791                                          char backshift[],
    792                                          int cetable[], int cesize,
    793                                          int expansionsize,
    794                                          int defaultforward,
    795                                          int defaultbackward) {
    796          // No implementation
    797      }
    798      */
    799 
    800     // TODO: This method only triggers initializePattern(), which is probably no
    801     //      longer needed.
    802     private void initialize() {
    803         /* int expandlength = */ initializePattern();
    804 
    805         // *** Boyer-Moore ***
    806         /*
    807         if (pattern_.CELength_ > 0) {
    808             int cesize = pattern_.CELength_;
    809             int minlength = cesize > expandlength ? cesize - expandlength : 1;
    810             pattern_.defaultShiftSize_ = minlength;
    811             setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize,
    812                     expandlength, minlength, minlength);
    813             return;
    814         }
    815         return pattern_.defaultShiftSize_;
    816         */
    817     }
    818 
    819     /**
    820      * @deprecated This API is ICU internal only.
    821      * @hide original deprecated declaration
    822      * @hide draft / provisional / internal are hidden on Android
    823      */
    824     @Override
    825     @Deprecated
    826     protected void setMatchNotFound() {
    827         super.setMatchNotFound();
    828         // SearchIterator#setMatchNotFound() does following:
    829         //      search_.matchedIndex_ = DONE;
    830         //      search_.setMatchedLength(0);
    831         if (search_.isForwardSearching_) {
    832             textIter_.setOffset(search_.text().getEndIndex());
    833         } else {
    834             textIter_.setOffset(0);
    835         }
    836     }
    837 
    838     /**
    839      * Checks if the offset runs out of the text string range
    840      * @param textstart offset of the first character in the range
    841      * @param textlimit limit offset of the text string range
    842      * @param offset to test
    843      * @return true if offset is out of bounds, false otherwise
    844      */
    845     private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) {
    846         return offset < textstart || offset > textlimit;
    847     }
    848 
    849     /**
    850      * Checks for identical match
    851      * @param start offset of possible match
    852      * @param end offset of possible match
    853      * @return TRUE if identical match is found
    854      */
    855     private boolean checkIdentical(int start, int end) {
    856         if (strength_ != Collator.IDENTICAL) {
    857             return true;
    858         }
    859         // Note: We could use Normalizer::compare() or similar, but for short strings
    860         // which may not be in FCD it might be faster to just NFD them.
    861         String textstr = getString(targetText, start, end - start);
    862         if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
    863             textstr = Normalizer.decompose(textstr, false);
    864         }
    865         String patternstr = pattern_.text_;
    866         if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) {
    867             patternstr = Normalizer.decompose(patternstr, false);
    868         }
    869         return textstr.equals(patternstr);
    870     }
    871 
    872     private boolean initTextProcessedIter() {
    873         if (textProcessedIter_ == null) {
    874             textProcessedIter_ = new CollationPCE(textIter_);
    875         } else {
    876             textProcessedIter_.init(textIter_);
    877         }
    878         return true;
    879     }
    880 
    881     /*
    882      * Find the next break boundary after startIndex. If the UStringSearch object
    883      * has an external break iterator, use that. Otherwise use the internal character
    884      * break iterator.
    885      */
    886     private int nextBoundaryAfter(int startIndex) {
    887         BreakIterator breakiterator = search_.breakIter();
    888 
    889         if (breakiterator == null) {
    890             breakiterator = search_.internalBreakIter_;
    891         }
    892 
    893         if (breakiterator != null) {
    894             return breakiterator.following(startIndex);
    895         }
    896 
    897         return startIndex;
    898     }
    899 
    900     /*
    901      * Returns TRUE if index is on a break boundary. If the UStringSearch
    902      * has an external break iterator, test using that, otherwise test
    903      * using the internal character break iterator.
    904      */
    905     private boolean isBreakBoundary(int index) {
    906         BreakIterator breakiterator = search_.breakIter();
    907 
    908         if (breakiterator == null) {
    909             breakiterator = search_.internalBreakIter_;
    910         }
    911 
    912         return (breakiterator != null && breakiterator.isBoundary(index));
    913     }
    914 
    915 
    916     // Java porting note: Followings are corresponding to UCompareCEsResult enum
    917     private static final int CE_MATCH = -1;
    918     private static final int CE_NO_MATCH = 0;
    919     private static final int CE_SKIP_TARG = 1;
    920     private static final int CE_SKIP_PATN = 2;
    921 
    922     private static int CE_LEVEL2_BASE = 0x00000005;
    923     private static int CE_LEVEL3_BASE = 0x00050000;
    924 
    925     private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) {
    926         if (targCE == patCE) {
    927             return CE_MATCH;
    928         }
    929         if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
    930             return CE_NO_MATCH;
    931         }
    932 
    933         long targCEshifted = targCE >>> 32;
    934         long patCEshifted = patCE >>> 32;
    935         long mask;
    936 
    937         mask = 0xFFFF0000L;
    938         int targLev1 = (int)(targCEshifted & mask);
    939         int patLev1 = (int)(patCEshifted & mask);
    940         if (targLev1 != patLev1) {
    941             if (targLev1 == 0) {
    942                 return CE_SKIP_TARG;
    943             }
    944             if (patLev1 == 0
    945                     && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
    946                 return CE_SKIP_PATN;
    947             }
    948             return CE_NO_MATCH;
    949         }
    950 
    951         mask = 0x0000FFFFL;
    952         int targLev2 = (int)(targCEshifted & mask);
    953         int patLev2 = (int)(patCEshifted & mask);
    954         if (targLev2 != patLev2) {
    955             if (targLev2 == 0) {
    956                 return CE_SKIP_TARG;
    957             }
    958             if (patLev2 == 0
    959                     && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
    960                 return CE_SKIP_PATN;
    961             }
    962             return (patLev2 == CE_LEVEL2_BASE ||
    963                     (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
    964                         targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH;
    965         }
    966 
    967         mask = 0xFFFF0000L;
    968         int targLev3 = (int)(targCE & mask);
    969         int patLev3 = (int)(patCE & mask);
    970         if (targLev3 != patLev3) {
    971             return (patLev3 == CE_LEVEL3_BASE ||
    972                     (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
    973                         targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH;
    974         }
    975 
    976         return CE_MATCH;
    977     }
    978 
    979     /**
    980      * An object used for receiving matched index in search() and
    981      * searchBackwards().
    982      */
    983     private static class Match {
    984         int start_ = -1;
    985         int limit_ = -1;
    986     }
    987 
    988     private boolean search(int startIdx, Match m) {
    989         // Input parameter sanity check.
    990         if (pattern_.CELength_ == 0
    991                 || startIdx < search_.beginIndex()
    992                 || startIdx > search_.endIndex()) {
    993             throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " +
    994                     search_.beginIndex() + " and " + search_.endIndex());
    995         }
    996 
    997         if (pattern_.PCE_ == null) {
    998             initializePatternPCETable();
    999         }
   1000 
   1001         textIter_.setOffset(startIdx);
   1002         CEBuffer ceb = new CEBuffer(this);
   1003 
   1004         int targetIx = 0;
   1005         CEI targetCEI = null;
   1006         int patIx;
   1007         boolean found;
   1008 
   1009         int mStart = -1;
   1010         int mLimit = -1;
   1011         int minLimit;
   1012         int maxLimit;
   1013 
   1014         // Outer loop moves over match starting positions in the
   1015         //      target CE space.
   1016         // Here we see the target as a sequence of collation elements, resulting from the following:
   1017         // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   1018         //    (for example, digraphs such as IJ may be broken into two characters).
   1019         // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   1020         //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   1021         //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   1022         //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   1023         //    then the CE is deleted, so the following code sees only CEs that are relevant.
   1024         // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   1025         // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   1026         // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   1027         for (targetIx = 0; ; targetIx++) {
   1028             found = true;
   1029             // Inner loop checks for a match beginning at each
   1030             // position from the outer loop.
   1031             int targetIxOffset = 0;
   1032             long patCE = 0;
   1033             // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
   1034             // (compared to the last CE fetched for the previous targetIx value) as we need to go
   1035             // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
   1036             CEI firstCEI = ceb.get(targetIx);
   1037             if (firstCEI == null) {
   1038                 throw new ICUException("CEBuffer.get(" + targetIx + ") returned null.");
   1039             }
   1040 
   1041             for (patIx = 0; patIx < pattern_.PCELength_; patIx++) {
   1042                 patCE = pattern_.PCE_[patIx];
   1043                 targetCEI = ceb.get(targetIx + patIx + targetIxOffset);
   1044                 // Compare CE from target string with CE from the pattern.
   1045                 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   1046                 // which will fail the compare, below.
   1047                 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
   1048                 if (ceMatch == CE_NO_MATCH) {
   1049                     found = false;
   1050                     break;
   1051                 } else if (ceMatch > CE_NO_MATCH) {
   1052                     if (ceMatch == CE_SKIP_TARG) {
   1053                         // redo with same patCE, next targCE
   1054                         patIx--;
   1055                         targetIxOffset++;
   1056                     } else { // ceMatch == CE_SKIP_PATN
   1057                         // redo with same targCE, next patCE
   1058                         targetIxOffset--;
   1059                     }
   1060                 }
   1061             }
   1062             targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far
   1063 
   1064             if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
   1065                 // No match at this targetIx.  Try again at the next.
   1066                 continue;
   1067             }
   1068 
   1069             if (!found) {
   1070                 // No match at all, we have run off the end of the target text.
   1071                 break;
   1072             }
   1073 
   1074             // We have found a match in CE space.
   1075             // Now determine the bounds in string index space.
   1076             // There still is a chance of match failure if the CE range not correspond to
   1077             // an acceptable character range.
   1078             //
   1079             CEI lastCEI = ceb.get(targetIx + targetIxOffset -1);
   1080 
   1081             mStart = firstCEI.lowIndex_;
   1082             minLimit = lastCEI.lowIndex_;
   1083 
   1084             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   1085             // extended to the end of input, and the match is good.
   1086 
   1087             // Look at the high and low indices of the CE following the match. If
   1088             // they are the same it means one of two things:
   1089             //    1. The match extended to the last CE from the target text, which is OK, or
   1090             //    2. The last CE that was part of the match is in an expansion that extends
   1091             //       to the first CE after the match. In this case, we reject the match.
   1092             CEI nextCEI = null;
   1093             if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
   1094                 nextCEI = ceb.get(targetIx + targetIxOffset);
   1095                 maxLimit = nextCEI.lowIndex_;
   1096                 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
   1097                     found = false;
   1098                 }
   1099             } else {
   1100                 for (;; ++targetIxOffset) {
   1101                     nextCEI = ceb.get(targetIx + targetIxOffset);
   1102                     maxLimit = nextCEI.lowIndex_;
   1103                     // If we are at the end of the target too, match succeeds
   1104                     if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) {
   1105                         break;
   1106                     }
   1107                     // As long as the next CE has primary weight of 0,
   1108                     // it is part of the last target element matched by the pattern;
   1109                     // make sure it can be part of a match with the last patCE
   1110                     if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) {
   1111                         int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_);
   1112                         if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) {
   1113                             found = false;
   1114                             break;
   1115                         }
   1116                     // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   1117                     // target element, but it has non-zero primary weight => match fails
   1118                     } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) {
   1119                         found = false;
   1120                         break;
   1121                     // Else the target CE is not part of an expansion of the last matched element, match succeeds
   1122                     } else {
   1123                         break;
   1124                     }
   1125                 }
   1126             }
   1127 
   1128             // Check for the start of the match being within a combining sequence.
   1129             // This can happen if the pattern itself begins with a combining char, and
   1130             // the match found combining marks in the target text that were attached
   1131             // to something else.
   1132             // This type of match should be rejected for not completely consuming a
   1133             // combining sequence.
   1134             if (!isBreakBoundary(mStart)) {
   1135                 found = false;
   1136             }
   1137 
   1138             // Check for the start of the match being within an Collation Element Expansion,
   1139             // meaning that the first char of the match is only partially matched.
   1140             // With expansions, the first CE will report the index of the source
   1141             // character, and all subsequent (expansions) CEs will report the source index of the
   1142             // _following_ character.
   1143             int secondIx = firstCEI.highIndex_;
   1144             if (mStart == secondIx) {
   1145                 found = false;
   1146             }
   1147 
   1148             // Allow matches to end in the middle of a grapheme cluster if the following
   1149             // conditions are met; this is needed to make prefix search work properly in
   1150             // Indic, see #11750
   1151             // * the default breakIter is being used
   1152             // * the next collation element after this combining sequence
   1153             //   - has non-zero primary weight
   1154             //   - corresponds to a separate character following the one at end of the current match
   1155             //   (the second of these conditions, and perhaps both, may be redundant given the
   1156             //   subsequent check for normalization boundary; however they are likely much faster
   1157             //   tests in any case)
   1158             // * the match limit is a normalization boundary
   1159             boolean allowMidclusterMatch =
   1160                             breakIterator == null &&
   1161                             (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 &&
   1162                             maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit &&
   1163                             (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) ||
   1164                                     nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit)));
   1165 
   1166             // If those conditions are met, then:
   1167             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
   1168             //   the match limit may be backed off to a previous break boundary. This handles
   1169             //   cases in which mLimit includes target characters that are ignorable with current
   1170             //   settings (such as space) and which extend beyond the pattern match.
   1171             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
   1172             // * do NOT require that match limit be on a breakIter boundary
   1173 
   1174             // Advance the match end position to the first acceptable match boundary.
   1175             // This advances the index over any combining characters.
   1176             mLimit = maxLimit;
   1177             if (minLimit < maxLimit) {
   1178                 // When the last CE's low index is same with its high index, the CE is likely
   1179                 // a part of expansion. In this case, the index is located just after the
   1180                 // character corresponding to the CEs compared above. If the index is right
   1181                 // at the break boundary, move the position to the next boundary will result
   1182                 // incorrect match length when there are ignorable characters exist between
   1183                 // the position and the next character produces CE(s). See ticket#8482.
   1184                 if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) {
   1185                     mLimit = minLimit;
   1186                 } else {
   1187                     int nba = nextBoundaryAfter(minLimit);
   1188                     // Note that we can have nba < maxLimit && nba >= minLImit, in which
   1189                     // case we want to set mLimit to nba regardless of allowMidclusterMatch
   1190                     // (i.e. we back off mLimit to the previous breakIterator boundary).
   1191                     if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) {
   1192                         mLimit = nba;
   1193                     }
   1194                 }
   1195             }
   1196 
   1197             if (!allowMidclusterMatch) {
   1198                 // If advancing to the end of a combining sequence in character indexing space
   1199                 // advanced us beyond the end of the match in CE space, reject this match.
   1200                 if (mLimit > maxLimit) {
   1201                     found = false;
   1202                 }
   1203 
   1204                 if (!isBreakBoundary(mLimit)) {
   1205                     found = false;
   1206                 }
   1207             }
   1208 
   1209             if (!checkIdentical(mStart, mLimit)) {
   1210                 found = false;
   1211             }
   1212 
   1213             if (found) {
   1214                 break;
   1215             }
   1216         }
   1217 
   1218         // All Done.  Store back the match bounds to the caller.
   1219         //
   1220         if (found == false) {
   1221             mLimit = -1;
   1222             mStart = -1;
   1223         }
   1224 
   1225         if (m != null) {
   1226             m.start_ = mStart;
   1227             m.limit_ = mLimit;
   1228         }
   1229 
   1230         return found;
   1231     }
   1232 
   1233     private static int codePointAt(CharacterIterator iter, int index) {
   1234         int currentIterIndex = iter.getIndex();
   1235         char codeUnit = iter.setIndex(index);
   1236         int cp = codeUnit;
   1237         if (Character.isHighSurrogate(codeUnit)) {
   1238             char nextUnit = iter.next();
   1239             if (Character.isLowSurrogate(nextUnit)) {
   1240                 cp = Character.toCodePoint(codeUnit, nextUnit);
   1241             }
   1242         }
   1243         iter.setIndex(currentIterIndex);  // restore iter position
   1244         return cp;
   1245     }
   1246 
   1247     private static int codePointBefore(CharacterIterator iter, int index) {
   1248         int currentIterIndex = iter.getIndex();
   1249         iter.setIndex(index);
   1250         char codeUnit = iter.previous();
   1251         int cp = codeUnit;
   1252         if (Character.isLowSurrogate(codeUnit)) {
   1253             char prevUnit = iter.previous();
   1254             if (Character.isHighSurrogate(prevUnit)) {
   1255                 cp = Character.toCodePoint(prevUnit, codeUnit);
   1256             }
   1257         }
   1258         iter.setIndex(currentIterIndex);  // restore iter position
   1259         return cp;
   1260     }
   1261 
   1262     private boolean searchBackwards(int startIdx, Match m) {
   1263         //ICU4C_TODO comment:  reject search patterns beginning with a combining char.
   1264 
   1265         // Input parameter sanity check.
   1266         if (pattern_.CELength_ == 0
   1267                 || startIdx < search_.beginIndex()
   1268                 || startIdx > search_.endIndex()) {
   1269             throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " +
   1270                     search_.beginIndex() + " and " + search_.endIndex());
   1271         }
   1272 
   1273         if (pattern_.PCE_ == null) {
   1274             initializePatternPCETable();
   1275         }
   1276 
   1277         CEBuffer ceb = new CEBuffer(this);
   1278         int targetIx = 0;
   1279 
   1280         /*
   1281          * Pre-load the buffer with the CE's for the grapheme
   1282          * after our starting position so that we're sure that
   1283          * we can look at the CE following the match when we
   1284          * check the match boundaries.
   1285          *
   1286          * This will also pre-fetch the first CE that we'll
   1287          * consider for the match.
   1288          */
   1289         if (startIdx < search_.endIndex()) {
   1290             BreakIterator bi = search_.internalBreakIter_;
   1291             int next = bi.following(startIdx);
   1292 
   1293             textIter_.setOffset(next);
   1294 
   1295             for (targetIx = 0; ; targetIx++) {
   1296                 if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) {
   1297                     break;
   1298                 }
   1299             }
   1300         } else {
   1301             textIter_.setOffset(startIdx);
   1302         }
   1303 
   1304         CEI targetCEI = null;
   1305         int patIx;
   1306         boolean found;
   1307 
   1308         int limitIx = targetIx;
   1309         int mStart = -1;
   1310         int mLimit = -1;
   1311         int minLimit;
   1312         int maxLimit;
   1313 
   1314         // Outer loop moves over match starting positions in the
   1315         //      target CE space.
   1316         // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   1317         // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   1318         // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   1319         // and the beginning of the base text.
   1320         for (targetIx = limitIx; ; targetIx++) {
   1321             found = true;
   1322             // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
   1323             // (compared to the last CE fetched for the previous targetIx value) as we need to go
   1324             // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
   1325             CEI lastCEI = ceb.getPrevious(targetIx);
   1326             if (lastCEI == null) {
   1327                 throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null.");
   1328             }
   1329             // Inner loop checks for a match beginning at each
   1330             // position from the outer loop.
   1331             int targetIxOffset = 0;
   1332             for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) {
   1333                 long patCE = pattern_.PCE_[patIx];
   1334 
   1335                 targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset);
   1336                 // Compare CE from target string with CE from the pattern.
   1337                 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   1338                 // which will fail the compare, below.
   1339                 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
   1340                 if (ceMatch == CE_NO_MATCH) {
   1341                     found = false;
   1342                     break;
   1343                 } else if (ceMatch > CE_NO_MATCH) {
   1344                     if (ceMatch == CE_SKIP_TARG) {
   1345                         // redo with same patCE, next targCE
   1346                         patIx++;
   1347                         targetIxOffset++;
   1348                     } else { // ceMatch == CE_SKIP_PATN
   1349                         // redo with same targCE, next patCE
   1350                         targetIxOffset--;
   1351                     }
   1352                 }
   1353             }
   1354 
   1355             if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
   1356                 // No match at this targetIx.  Try again at the next.
   1357                 continue;
   1358             }
   1359 
   1360             if (!found) {
   1361                 // No match at all, we have run off the end of the target text.
   1362                 break;
   1363             }
   1364 
   1365             // We have found a match in CE space.
   1366             // Now determine the bounds in string index space.
   1367             // There still is a chance of match failure if the CE range not correspond to
   1368             // an acceptable character range.
   1369             //
   1370             CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset);
   1371             mStart = firstCEI.lowIndex_;
   1372 
   1373             // Check for the start of the match being within a combining sequence.
   1374             // This can happen if the pattern itself begins with a combining char, and
   1375             // the match found combining marks in the target text that were attached
   1376             // to something else.
   1377             // This type of match should be rejected for not completely consuming a
   1378             // combining sequence.
   1379             if (!isBreakBoundary(mStart)) {
   1380                 found = false;
   1381             }
   1382 
   1383             // Look at the high index of the first CE in the match. If it's the same as the
   1384             // low index, the first CE in the match is in the middle of an expansion.
   1385             if (mStart == firstCEI.highIndex_) {
   1386                 found = false;
   1387             }
   1388 
   1389             minLimit = lastCEI.lowIndex_;
   1390 
   1391             if (targetIx > 0) {
   1392                 // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   1393                 // extended to the end of input, and the match is good.
   1394 
   1395                 // Look at the high and low indices of the CE following the match. If
   1396                 // they are the same it means one of two things:
   1397                 //    1. The match extended to the last CE from the target text, which is OK, or
   1398                 //    2. The last CE that was part of the match is in an expansion that extends
   1399                 //       to the first CE after the match. In this case, we reject the match.
   1400                 CEI nextCEI  = ceb.getPrevious(targetIx - 1);
   1401 
   1402                 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
   1403                     found = false;
   1404                 }
   1405 
   1406                 mLimit = maxLimit = nextCEI.lowIndex_;
   1407 
   1408                 // Allow matches to end in the middle of a grapheme cluster if the following
   1409                 // conditions are met; this is needed to make prefix search work properly in
   1410                 // Indic, see #11750
   1411                 // * the default breakIter is being used
   1412                 // * the next collation element after this combining sequence
   1413                 //   - has non-zero primary weight
   1414                 //   - corresponds to a separate character following the one at end of the current match
   1415                 //   (the second of these conditions, and perhaps both, may be redundant given the
   1416                 //   subsequent check for normalization boundary; however they are likely much faster
   1417                 //   tests in any case)
   1418                 // * the match limit is a normalization boundary
   1419                 boolean allowMidclusterMatch =
   1420                                 breakIterator == null &&
   1421                                 (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 &&
   1422                                 maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit &&
   1423                                 (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) ||
   1424                                         nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit)));
   1425 
   1426                 // If those conditions are met, then:
   1427                 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
   1428                 //   the match limit may be backed off to a previous break boundary. This handles
   1429                 //   cases in which mLimit includes target characters that are ignorable with current
   1430                 //   settings (such as space) and which extend beyond the pattern match.
   1431                 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
   1432                 // * do NOT require that match limit be on a breakIter boundary
   1433 
   1434                 // Advance the match end position to the first acceptable match boundary.
   1435                 // This advances the index over any combining charcters.
   1436                 if (minLimit < maxLimit) {
   1437                     int nba = nextBoundaryAfter(minLimit);
   1438                     // Note that we can have nba < maxLimit && nba >= minLImit, in which
   1439                     // case we want to set mLimit to nba regardless of allowMidclusterMatch
   1440                     // (i.e. we back off mLimit to the previous breakIterator boundary).
   1441                     if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) {
   1442                         mLimit = nba;
   1443                     }
   1444                 }
   1445 
   1446                 if (!allowMidclusterMatch) {
   1447                     // If advancing to the end of a combining sequence in character indexing space
   1448                     // advanced us beyond the end of the match in CE space, reject this match.
   1449                     if (mLimit > maxLimit) {
   1450                         found = false;
   1451                     }
   1452 
   1453                     // Make sure the end of the match is on a break boundary
   1454                     if (!isBreakBoundary(mLimit)) {
   1455                         found = false;
   1456                     }
   1457                 }
   1458 
   1459             } else {
   1460                 // No non-ignorable CEs after this point.
   1461                 // The maximum position is detected by boundary after
   1462                 // the last non-ignorable CE. Combining sequence
   1463                 // across the start index will be truncated.
   1464                 int nba = nextBoundaryAfter(minLimit);
   1465                 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
   1466             }
   1467 
   1468             if (!checkIdentical(mStart, mLimit)) {
   1469                 found = false;
   1470             }
   1471 
   1472             if (found) {
   1473                 break;
   1474             }
   1475         }
   1476 
   1477         // All Done.  Store back the match bounds to the caller.
   1478         //
   1479         if (found == false) {
   1480             mLimit = -1;
   1481             mStart = -1;
   1482         }
   1483 
   1484         if (m != null) {
   1485             m.start_ = mStart;
   1486             m.limit_ = mLimit;
   1487         }
   1488 
   1489         return found;
   1490     }
   1491 
   1492     // Java porting note:
   1493     //
   1494     // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical()
   1495     // for the linear search implementation. The differences are addressed in search().
   1496     //
   1497     private boolean handleNextExact() {
   1498         return handleNextCommonImpl();
   1499     }
   1500 
   1501     private boolean handleNextCanonical() {
   1502         return handleNextCommonImpl();
   1503     }
   1504 
   1505     private boolean handleNextCommonImpl() {
   1506         int textOffset = textIter_.getOffset();
   1507         Match match = new Match();
   1508 
   1509         if (search(textOffset, match)) {
   1510             search_.matchedIndex_ = match.start_;
   1511             search_.setMatchedLength(match.limit_ - match.start_);
   1512             return true;
   1513         } else {
   1514             setMatchNotFound();
   1515             return false;
   1516         }
   1517     }
   1518 
   1519     // Java porting note:
   1520     //
   1521     // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical()
   1522     // for the linear search implementation. The differences are addressed in searchBackwards().
   1523     //
   1524     private boolean handlePreviousExact() {
   1525         return handlePreviousCommonImpl();
   1526     }
   1527 
   1528     private boolean handlePreviousCanonical() {
   1529         return handlePreviousCommonImpl();
   1530     }
   1531 
   1532     private boolean handlePreviousCommonImpl() {
   1533         int textOffset;
   1534 
   1535         if (search_.isOverlap_) {
   1536             if (search_.matchedIndex_ != DONE) {
   1537                 textOffset = search_.matchedIndex_ + search_.matchedLength() - 1;
   1538             } else {
   1539                 // move the start position at the end of possible match
   1540                 initializePatternPCETable();
   1541                 if (!initTextProcessedIter()) {
   1542                     setMatchNotFound();
   1543                     return false;
   1544                 }
   1545                 for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) {
   1546                     long pce = textProcessedIter_.nextProcessed(null);
   1547                     if (pce == CollationPCE.PROCESSED_NULLORDER) {
   1548                         // at the end of the text
   1549                         break;
   1550                     }
   1551                 }
   1552                 textOffset = textIter_.getOffset();
   1553             }
   1554         } else {
   1555             textOffset = textIter_.getOffset();
   1556         }
   1557 
   1558         Match match = new Match();
   1559         if (searchBackwards(textOffset, match)) {
   1560             search_.matchedIndex_ = match.start_;
   1561             search_.setMatchedLength(match.limit_ - match.start_);
   1562             return true;
   1563         } else {
   1564             setMatchNotFound();
   1565             return false;
   1566         }
   1567     }
   1568 
   1569     /**
   1570      * Gets a substring out of a CharacterIterator
   1571      *
   1572      * Java porting note: Not available in ICU4C
   1573      *
   1574      * @param text CharacterIterator
   1575      * @param start start offset
   1576      * @param length of substring
   1577      * @return substring from text starting at start and length length
   1578      */
   1579     private static final String getString(CharacterIterator text, int start, int length) {
   1580         StringBuilder result = new StringBuilder(length);
   1581         int offset = text.getIndex();
   1582         text.setIndex(start);
   1583         for (int i = 0; i < length; i++) {
   1584             result.append(text.current());
   1585             text.next();
   1586         }
   1587         text.setIndex(offset);
   1588         return result.toString();
   1589     }
   1590 
   1591     /**
   1592      * Java port of ICU4C struct UPattern (usrchimp.h)
   1593      */
   1594     private static final class Pattern {
   1595         /** Pattern string */
   1596         String text_;
   1597 
   1598         long[] PCE_;
   1599         int PCELength_ = 0;
   1600 
   1601         // TODO: We probably do not need CE_ / CELength_
   1602         @SuppressWarnings("unused")
   1603         int[] CE_;
   1604         int CELength_ = 0;
   1605 
   1606         // *** Boyer-Moore ***
   1607         // boolean hasPrefixAccents_ = false;
   1608         // boolean hasSuffixAccents_ = false;
   1609         // int defaultShiftSize_;
   1610         // char[] shift_;
   1611         // char[] backShift_;
   1612 
   1613         protected Pattern(String pattern) {
   1614             text_ = pattern;
   1615         }
   1616     }
   1617 
   1618     /**
   1619      * Java port of ICU4C UCollationPCE (usrchimp.h)
   1620      */
   1621     private static class CollationPCE {
   1622         public static final long PROCESSED_NULLORDER = -1;
   1623 
   1624         private static final int DEFAULT_BUFFER_SIZE = 16;
   1625         private static final int BUFFER_GROW = 8;
   1626 
   1627         // Note: PRIMARYORDERMASK is also duplicated in StringSearch class
   1628         private static final int PRIMARYORDERMASK = 0xffff0000;
   1629         private static final int CONTINUATION_MARKER = 0xc0;
   1630 
   1631         private PCEBuffer pceBuffer_ = new PCEBuffer();
   1632         private CollationElementIterator cei_;
   1633         private int strength_;
   1634         private boolean toShift_;
   1635         private boolean isShifted_;
   1636         private int variableTop_;
   1637 
   1638         public CollationPCE(CollationElementIterator iter) {
   1639             init(iter);
   1640         }
   1641 
   1642         public void init(CollationElementIterator iter) {
   1643             cei_ = iter;
   1644             init(iter.getRuleBasedCollator());
   1645         }
   1646 
   1647         private void init(RuleBasedCollator coll) {
   1648             strength_ = coll.getStrength();
   1649             toShift_ = coll.isAlternateHandlingShifted();
   1650             isShifted_ = false;
   1651             variableTop_ = coll.getVariableTop();
   1652         }
   1653 
   1654         @SuppressWarnings("fallthrough")
   1655         private long processCE(int ce) {
   1656             long primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
   1657 
   1658             // This is clean, but somewhat slow...
   1659             // We could apply the mask to ce and then
   1660             // just get all three orders...
   1661             switch (strength_) {
   1662             default:
   1663                 tertiary = CollationElementIterator.tertiaryOrder(ce);
   1664                 /* note fall-through */
   1665 
   1666             case Collator.SECONDARY:
   1667                 secondary = CollationElementIterator.secondaryOrder(ce);
   1668                 /* note fall-through */
   1669 
   1670             case Collator.PRIMARY:
   1671                 primary = CollationElementIterator.primaryOrder(ce);
   1672             }
   1673 
   1674             // **** This should probably handle continuations too. ****
   1675             // **** That means that we need 24 bits for the primary ****
   1676             // **** instead of the 16 that we're currently using. ****
   1677             // **** So we can lay out the 64 bits as: 24.12.12.16. ****
   1678             // **** Another complication with continuations is that ****
   1679             // **** the *second* CE is marked as a continuation, so ****
   1680             // **** we always have to peek ahead to know how long ****
   1681             // **** the primary is... ****
   1682             if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) {
   1683 
   1684                 if (primary == 0) {
   1685                     return CollationElementIterator.IGNORABLE;
   1686                 }
   1687 
   1688                 if (strength_ >= Collator.QUATERNARY) {
   1689                     quaternary = primary;
   1690                 }
   1691 
   1692                 primary = secondary = tertiary = 0;
   1693                 isShifted_ = true;
   1694             } else {
   1695                 if (strength_ >= Collator.QUATERNARY) {
   1696                     quaternary = 0xFFFF;
   1697                 }
   1698 
   1699                 isShifted_ = false;
   1700             }
   1701 
   1702             return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
   1703         }
   1704 
   1705         /**
   1706          * Get the processed ordering priority of the next collation element in the text.
   1707          * A single character may contain more than one collation element.
   1708          *
   1709          * Note: This is equivalent to
   1710          * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
   1711          *
   1712          * @param range receiving the iterator index before/after fetching the CE.
   1713          * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER
   1714          *         if an error has occurred or if the end of string has been reached
   1715          */
   1716         public long nextProcessed(Range range) {
   1717             long result = CollationElementIterator.IGNORABLE;
   1718             int low = 0, high = 0;
   1719 
   1720             pceBuffer_.reset();
   1721 
   1722             do {
   1723                 low = cei_.getOffset();
   1724                 int ce = cei_.next();
   1725                 high = cei_.getOffset();
   1726 
   1727                 if (ce == CollationElementIterator.NULLORDER) {
   1728                      result = PROCESSED_NULLORDER;
   1729                      break;
   1730                 }
   1731 
   1732                 result = processCE(ce);
   1733             } while (result == CollationElementIterator.IGNORABLE);
   1734 
   1735             if (range != null) {
   1736                 range.ixLow_ = low;
   1737                 range.ixHigh_ = high;
   1738             }
   1739 
   1740             return result;
   1741         }
   1742 
   1743         /**
   1744          * Get the processed ordering priority of the previous collation element in the text.
   1745          * A single character may contain more than one collation element.
   1746          *
   1747          * Note: This is equivalent to
   1748          * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
   1749          *
   1750          * @param range receiving the iterator index before/after fetching the CE.
   1751          * @return The previous collation elements ordering, otherwise returns
   1752          *         PROCESSED_NULLORDER if an error has occurred or if the start of
   1753          *         string has been reached.
   1754          */
   1755         public long previousProcessed(Range range) {
   1756             long result = CollationElementIterator.IGNORABLE;
   1757             int low = 0, high = 0;
   1758 
   1759             // pceBuffer_.reset();
   1760 
   1761             while (pceBuffer_.empty()) {
   1762                 // buffer raw CEs up to non-ignorable primary
   1763                 RCEBuffer rceb = new RCEBuffer();
   1764                 int ce;
   1765 
   1766                 boolean finish = false;
   1767 
   1768                 // **** do we need to reset rceb, or will it always be empty at this point ****
   1769                 do {
   1770                     high = cei_.getOffset();
   1771                     ce = cei_.previous();
   1772                     low = cei_.getOffset();
   1773 
   1774                     if (ce == CollationElementIterator.NULLORDER) {
   1775                         if (!rceb.empty()) {
   1776                             break;
   1777                         }
   1778 
   1779                         finish = true;
   1780                         break;
   1781                     }
   1782 
   1783                     rceb.put(ce, low, high);
   1784                 } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce));
   1785 
   1786                 if (finish) {
   1787                     break;
   1788                 }
   1789 
   1790                 // process the raw CEs
   1791                 while (!rceb.empty()) {
   1792                     RCEI rcei = rceb.get();
   1793 
   1794                     result = processCE(rcei.ce_);
   1795 
   1796                     if (result != CollationElementIterator.IGNORABLE) {
   1797                         pceBuffer_.put(result, rcei.low_, rcei.high_);
   1798                     }
   1799                 }
   1800             }
   1801 
   1802             if (pceBuffer_.empty()) {
   1803                 // **** Is -1 the right value for ixLow, ixHigh? ****
   1804                 if (range != null) {
   1805                     range.ixLow_ = -1;
   1806                     range.ixHigh_ = -1;
   1807                 }
   1808                 return CollationElementIterator.NULLORDER;
   1809             }
   1810 
   1811             PCEI pcei = pceBuffer_.get();
   1812 
   1813             if (range != null) {
   1814                 range.ixLow_ = pcei.low_;
   1815                 range.ixHigh_ = pcei.high_;
   1816             }
   1817 
   1818             return pcei.ce_;
   1819         }
   1820 
   1821         private static boolean isContinuation(int ce) {
   1822             return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER);
   1823         }
   1824 
   1825         public static final class Range {
   1826             int ixLow_;
   1827             int ixHigh_;
   1828         }
   1829 
   1830         /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */
   1831         private static final class PCEI {
   1832             long ce_;
   1833             int low_;
   1834             int high_;
   1835         }
   1836 
   1837         private static final class PCEBuffer {
   1838             private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE];
   1839             private int bufferIndex_ = 0;
   1840 
   1841             void reset() {
   1842                 bufferIndex_ = 0;
   1843             }
   1844 
   1845             boolean empty() {
   1846                 return bufferIndex_ <= 0;
   1847             }
   1848 
   1849             void put(long ce, int ixLow, int ixHigh)
   1850             {
   1851                 if (bufferIndex_ >= buffer_.length) {
   1852                     PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW];
   1853                     System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
   1854                     buffer_ = newBuffer;
   1855                 }
   1856                 buffer_[bufferIndex_] = new PCEI();
   1857                 buffer_[bufferIndex_].ce_ = ce;
   1858                 buffer_[bufferIndex_].low_ = ixLow;
   1859                 buffer_[bufferIndex_].high_ = ixHigh;
   1860 
   1861                 bufferIndex_ += 1;
   1862             }
   1863 
   1864             PCEI get() {
   1865                 if (bufferIndex_ > 0) {
   1866                     return buffer_[--bufferIndex_];
   1867                 }
   1868                 return null;
   1869             }
   1870         }
   1871 
   1872         /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */
   1873         private static final class RCEI {
   1874             int ce_;
   1875             int low_;
   1876             int high_;
   1877         }
   1878 
   1879         private static final class RCEBuffer {
   1880             private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE];
   1881             private int bufferIndex_ = 0;
   1882 
   1883             boolean empty() {
   1884                 return bufferIndex_ <= 0;
   1885             }
   1886 
   1887             void put(int ce, int ixLow, int ixHigh) {
   1888                 if (bufferIndex_ >= buffer_.length) {
   1889                     RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW];
   1890                     System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
   1891                     buffer_ = newBuffer;
   1892                 }
   1893                 buffer_[bufferIndex_] = new RCEI();
   1894                 buffer_[bufferIndex_].ce_ = ce;
   1895                 buffer_[bufferIndex_].low_ = ixLow;
   1896                 buffer_[bufferIndex_].high_ = ixHigh;
   1897 
   1898                 bufferIndex_ += 1;
   1899             }
   1900 
   1901             RCEI get() {
   1902                 if (bufferIndex_ > 0) {
   1903                     return buffer_[--bufferIndex_];
   1904                 }
   1905                 return null;
   1906             }
   1907         }
   1908     }
   1909 
   1910     /**
   1911      * Java port of ICU4C CEI (usearch.cpp)
   1912      *
   1913      * CEI  Collation Element + source text index.
   1914      *      These structs are kept in the circular buffer.
   1915      */
   1916     private static class CEI {
   1917         long ce_;
   1918         int lowIndex_;
   1919         int highIndex_;
   1920     }
   1921 
   1922     /**
   1923      * CEBuffer A circular buffer of CEs from the text being searched
   1924      */
   1925     private static class CEBuffer {
   1926         // Java porting note: ICU4C uses the size for stack buffer
   1927         // static final int DEFAULT_CEBUFFER_SIZE = 96;
   1928 
   1929         static final int CEBUFFER_EXTRA = 32;
   1930         static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8;
   1931         static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3;
   1932 
   1933         CEI[] buf_;
   1934         int bufSize_;
   1935         int firstIx_;
   1936         int limitIx_;
   1937 
   1938         // Java porting note: No references in ICU4C implementation
   1939         // CollationElementIterator ceIter_;
   1940 
   1941         StringSearch strSearch_;
   1942 
   1943         CEBuffer(StringSearch ss) {
   1944             strSearch_ = ss;
   1945             bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA;
   1946             if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
   1947                 String patText = ss.pattern_.text_;
   1948                 if (patText != null) {
   1949                     for (int i = 0; i < patText.length(); i++) {
   1950                         char c = patText.charAt(i);
   1951                         if (MIGHT_BE_JAMO_L(c)) {
   1952                             bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
   1953                         } else {
   1954                             // No check for surrogates, we might allocate slightly more buffer than necessary.
   1955                             bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
   1956                         }
   1957                     }
   1958                 }
   1959             }
   1960 
   1961             // Not used - see above
   1962             // ceIter_ = ss.textIter_;
   1963 
   1964             firstIx_ = 0;
   1965             limitIx_ = 0;
   1966 
   1967             if (!ss.initTextProcessedIter()) {
   1968                 return;
   1969             }
   1970 
   1971             buf_ = new CEI[bufSize_];
   1972         }
   1973 
   1974         // Get the CE with the specified index.
   1975         //   Index must be in the range
   1976         //             n-history_size < index < n+1
   1977         //   where n is the largest index to have been fetched by some previous call to this function.
   1978         //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   1979         //
   1980         CEI get(int index) {
   1981             int i = index % bufSize_;
   1982 
   1983             if (index >= firstIx_ && index < limitIx_) {
   1984                 // The request was for an entry already in our buffer.
   1985                 // Just return it.
   1986                 return buf_[i];
   1987             }
   1988 
   1989             // Caller is requesting a new, never accessed before, CE.
   1990             // Verify that it is the next one in sequence, which is all
   1991             // that is allowed.
   1992             if (index != limitIx_) {
   1993                 assert(false);
   1994                 return null;
   1995             }
   1996 
   1997             // Manage the circular CE buffer indexing
   1998             limitIx_++;
   1999 
   2000             if (limitIx_ - firstIx_ >= bufSize_) {
   2001                 // The buffer is full, knock out the lowest-indexed entry.
   2002                 firstIx_++;
   2003             }
   2004 
   2005             CollationPCE.Range range = new CollationPCE.Range();
   2006             if (buf_[i] == null) {
   2007                 buf_[i] = new CEI();
   2008             }
   2009             buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range);
   2010             buf_[i].lowIndex_ = range.ixLow_;
   2011             buf_[i].highIndex_ = range.ixHigh_;
   2012 
   2013             return buf_[i];
   2014         }
   2015 
   2016         // Get the CE with the specified index.
   2017         //   Index must be in the range
   2018         //             n-history_size < index < n+1
   2019         //   where n is the largest index to have been fetched by some previous call to this function.
   2020         //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   2021         //
   2022         CEI getPrevious(int index) {
   2023             int i = index % bufSize_;
   2024 
   2025             if (index >= firstIx_ && index < limitIx_) {
   2026                 // The request was for an entry already in our buffer.
   2027                 // Just return it.
   2028                 return buf_[i];
   2029             }
   2030 
   2031             // Caller is requesting a new, never accessed before, CE.
   2032             // Verify that it is the next one in sequence, which is all
   2033             // that is allowed.
   2034             if (index != limitIx_) {
   2035                 assert(false);
   2036                 return null;
   2037             }
   2038 
   2039             // Manage the circular CE buffer indexing
   2040             limitIx_++;
   2041 
   2042             if (limitIx_ - firstIx_ >= bufSize_) {
   2043                 // The buffer is full, knock out the lowest-indexed entry.
   2044                 firstIx_++;
   2045             }
   2046 
   2047             CollationPCE.Range range = new CollationPCE.Range();
   2048             if (buf_[i] == null) {
   2049                 buf_[i] = new CEI();
   2050             }
   2051             buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range);
   2052             buf_[i].lowIndex_ = range.ixLow_;
   2053             buf_[i].highIndex_ = range.ixHigh_;
   2054 
   2055             return buf_[i];
   2056         }
   2057 
   2058         static boolean MIGHT_BE_JAMO_L(char c) {
   2059             return (c >= 0x1100 && c <= 0x115E)
   2060                     || (c >= 0x3131 && c <= 0x314E)
   2061                     || (c >= 0x3165 && c <= 0x3186);
   2062         }
   2063     }
   2064 }
   2065