Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2014 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  07/02/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/usearch.h"
     15 #include "unicode/ustring.h"
     16 #include "unicode/uchar.h"
     17 #include "unicode/utf16.h"
     18 #include "normalizer2impl.h"
     19 #include "usrchimp.h"
     20 #include "cmemory.h"
     21 #include "ucln_in.h"
     22 #include "uassert.h"
     23 #include "ustr_imp.h"
     24 
     25 U_NAMESPACE_USE
     26 
     27 // don't use Boyer-Moore
     28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
     29 #define BOYER_MOORE 0
     30 
     31 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     32 
     33 // internal definition ---------------------------------------------------
     34 
     35 #define LAST_BYTE_MASK_          0xFF
     36 #define SECOND_LAST_BYTE_SHIFT_  8
     37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     38 
     39 static const Normalizer2Impl *g_nfcImpl = NULL;
     40 
     41 // internal methods -------------------------------------------------
     42 
     43 /**
     44 * Fast collation element iterator setOffset.
     45 * This function does not check for bounds.
     46 * @param coleiter collation element iterator
     47 * @param offset to set
     48 */
     49 static
     50 inline void setColEIterOffset(UCollationElements *elems,
     51                       int32_t             offset)
     52 {
     53     // Note: Not "fast" any more after the 2013 collation rewrite.
     54     // We do not want to expose more internals than necessary.
     55     UErrorCode status = U_ZERO_ERROR;
     56     ucol_setOffset(elems, offset, &status);
     57 }
     58 
     59 /**
     60 * Getting the mask for collation strength
     61 * @param strength collation strength
     62 * @return collation element mask
     63 */
     64 static
     65 inline uint32_t getMask(UCollationStrength strength)
     66 {
     67     switch (strength)
     68     {
     69     case UCOL_PRIMARY:
     70         return UCOL_PRIMARYORDERMASK;
     71     case UCOL_SECONDARY:
     72         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     73     default:
     74         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     75                UCOL_PRIMARYORDERMASK;
     76     }
     77 }
     78 
     79 /**
     80 * This is to squeeze the 21bit ces into a 256 table
     81 * @param ce collation element
     82 * @return collapsed version of the collation element
     83 */
     84 static
     85 inline int hash(uint32_t ce)
     86 {
     87     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
     88     // well with the new collation where most of the latin 1 characters
     89     // are of the value xx000xxx. their hashes will most of the time be 0
     90     // to be discussed on the hash algo.
     91     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
     92 }
     93 
     94 U_CDECL_BEGIN
     95 static UBool U_CALLCONV
     96 usearch_cleanup(void) {
     97     g_nfcImpl = NULL;
     98     return TRUE;
     99 }
    100 U_CDECL_END
    101 
    102 /**
    103 * Initializing the fcd tables.
    104 * Internal method, status assumed to be a success.
    105 * @param status output error if any, caller to check status before calling
    106 *               method, status assumed to be success when passed in.
    107 */
    108 static
    109 inline void initializeFCD(UErrorCode *status)
    110 {
    111     if (g_nfcImpl == NULL) {
    112         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    113         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    114     }
    115 }
    116 
    117 /**
    118 * Gets the fcd value for a character at the argument index.
    119 * This method takes into accounts of the supplementary characters.
    120 * @param str UTF16 string where character for fcd retrieval resides
    121 * @param offset position of the character whose fcd is to be retrieved, to be
    122 *               overwritten with the next character position, taking
    123 *               surrogate characters into consideration.
    124 * @param strlength length of the argument string
    125 * @return fcd value
    126 */
    127 static
    128 uint16_t getFCD(const UChar   *str, int32_t *offset,
    129                              int32_t  strlength)
    130 {
    131     const UChar *temp = str + *offset;
    132     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
    133     *offset = (int32_t)(temp - str);
    134     return result;
    135 }
    136 
    137 /**
    138 * Getting the modified collation elements taking into account the collation
    139 * attributes
    140 * @param strsrch string search data
    141 * @param sourcece
    142 * @return the modified collation element
    143 */
    144 static
    145 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    146 {
    147     // note for tertiary we can't use the collator->tertiaryMask, that
    148     // is a preprocessed mask that takes into account case options. since
    149     // we are only concerned with exact matches, we don't need that.
    150     sourcece &= strsrch->ceMask;
    151 
    152     if (strsrch->toShift) {
    153         // alternate handling here, since only the 16 most significant digits
    154         // is only used, we can safely do a compare without masking
    155         // if the ce is a variable, we mask and get only the primary values
    156         // no shifting to quartenary is required since all primary values
    157         // less than variabletop will need to be masked off anyway.
    158         if (strsrch->variableTop > sourcece) {
    159             if (strsrch->strength >= UCOL_QUATERNARY) {
    160                 sourcece &= UCOL_PRIMARYORDERMASK;
    161             }
    162             else {
    163                 sourcece = UCOL_IGNORABLE;
    164             }
    165         }
    166     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    167         sourcece = 0xFFFF;
    168     }
    169 
    170     return sourcece;
    171 }
    172 
    173 /**
    174 * Allocate a memory and returns NULL if it failed.
    175 * Internal method, status assumed to be a success.
    176 * @param size to allocate
    177 * @param status output error if any, caller to check status before calling
    178 *               method, status assumed to be success when passed in.
    179 * @return newly allocated array, NULL otherwise
    180 */
    181 static
    182 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    183 {
    184     uint32_t *result = (uint32_t *)uprv_malloc(size);
    185     if (result == NULL) {
    186         *status = U_MEMORY_ALLOCATION_ERROR;
    187     }
    188     return result;
    189 }
    190 
    191 /**
    192 * Adds a uint32_t value to a destination array.
    193 * Creates a new array if we run out of space. The caller will have to
    194 * manually deallocate the newly allocated array.
    195 * Internal method, status assumed to be success, caller has to check status
    196 * before calling this method. destination not to be NULL and has at least
    197 * size destinationlength.
    198 * @param destination target array
    199 * @param offset destination offset to add value
    200 * @param destinationlength target array size, return value for the new size
    201 * @param value to be added
    202 * @param increments incremental size expected
    203 * @param status output error if any, caller to check status before calling
    204 *               method, status assumed to be success when passed in.
    205 * @return new destination array, destination if there was no new allocation
    206 */
    207 static
    208 inline int32_t * addTouint32_tArray(int32_t    *destination,
    209                                     uint32_t    offset,
    210                                     uint32_t   *destinationlength,
    211                                     uint32_t    value,
    212                                     uint32_t    increments,
    213                                     UErrorCode *status)
    214 {
    215     uint32_t newlength = *destinationlength;
    216     if (offset + 1 == newlength) {
    217         newlength += increments;
    218         int32_t *temp = (int32_t *)allocateMemory(
    219                                          sizeof(int32_t) * newlength, status);
    220         if (U_FAILURE(*status)) {
    221             return NULL;
    222         }
    223         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
    224         *destinationlength = newlength;
    225         destination        = temp;
    226     }
    227     destination[offset] = value;
    228     return destination;
    229 }
    230 
    231 /**
    232 * Adds a uint64_t value to a destination array.
    233 * Creates a new array if we run out of space. The caller will have to
    234 * manually deallocate the newly allocated array.
    235 * Internal method, status assumed to be success, caller has to check status
    236 * before calling this method. destination not to be NULL and has at least
    237 * size destinationlength.
    238 * @param destination target array
    239 * @param offset destination offset to add value
    240 * @param destinationlength target array size, return value for the new size
    241 * @param value to be added
    242 * @param increments incremental size expected
    243 * @param status output error if any, caller to check status before calling
    244 *               method, status assumed to be success when passed in.
    245 * @return new destination array, destination if there was no new allocation
    246 */
    247 static
    248 inline int64_t * addTouint64_tArray(int64_t    *destination,
    249                                     uint32_t    offset,
    250                                     uint32_t   *destinationlength,
    251                                     uint64_t    value,
    252                                     uint32_t    increments,
    253                                     UErrorCode *status)
    254 {
    255     uint32_t newlength = *destinationlength;
    256     if (offset + 1 == newlength) {
    257         newlength += increments;
    258         int64_t *temp = (int64_t *)allocateMemory(
    259                                          sizeof(int64_t) * newlength, status);
    260 
    261         if (U_FAILURE(*status)) {
    262             return NULL;
    263         }
    264 
    265         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
    266         *destinationlength = newlength;
    267         destination        = temp;
    268     }
    269 
    270     destination[offset] = value;
    271 
    272     return destination;
    273 }
    274 
    275 /**
    276 * Initializing the ce table for a pattern.
    277 * Stores non-ignorable collation keys.
    278 * Table size will be estimated by the size of the pattern text. Table
    279 * expansion will be perform as we go along. Adding 1 to ensure that the table
    280 * size definitely increases.
    281 * Internal method, status assumed to be a success.
    282 * @param strsrch string search data
    283 * @param status output error if any, caller to check status before calling
    284 *               method, status assumed to be success when passed in.
    285 * @return total number of expansions
    286 */
    287 static
    288 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    289                                          UErrorCode    *status)
    290 {
    291     UPattern *pattern            = &(strsrch->pattern);
    292     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    293     int32_t  *cetable            = pattern->CEBuffer;
    294     uint32_t  patternlength      = pattern->textLength;
    295     UCollationElements *coleiter = strsrch->utilIter;
    296 
    297     if (coleiter == NULL) {
    298         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    299                                      patternlength, status);
    300         // status will be checked in ucol_next(..) later and if it is an
    301         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    302         // returned.
    303         strsrch->utilIter = coleiter;
    304     }
    305     else {
    306         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    307     }
    308     if(U_FAILURE(*status)) {
    309         return 0;
    310     }
    311 
    312     if (pattern->CE != cetable && pattern->CE) {
    313         uprv_free(pattern->CE);
    314     }
    315 
    316     uint16_t  offset      = 0;
    317     uint16_t  result      = 0;
    318     int32_t   ce;
    319 
    320     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    321            U_SUCCESS(*status)) {
    322         uint32_t newce = getCE(strsrch, ce);
    323         if (newce) {
    324             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    325                                   newce,
    326                                   patternlength - ucol_getOffset(coleiter) + 1,
    327                                   status);
    328             if (U_FAILURE(*status)) {
    329                 return 0;
    330             }
    331             offset ++;
    332             if (cetable != temp && cetable != pattern->CEBuffer) {
    333                 uprv_free(cetable);
    334             }
    335             cetable = temp;
    336         }
    337         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    338     }
    339 
    340     cetable[offset]   = 0;
    341     pattern->CE       = cetable;
    342     pattern->CELength = offset;
    343 
    344     return result;
    345 }
    346 
    347 /**
    348 * Initializing the pce table for a pattern.
    349 * Stores non-ignorable collation keys.
    350 * Table size will be estimated by the size of the pattern text. Table
    351 * expansion will be perform as we go along. Adding 1 to ensure that the table
    352 * size definitely increases.
    353 * Internal method, status assumed to be a success.
    354 * @param strsrch string search data
    355 * @param status output error if any, caller to check status before calling
    356 *               method, status assumed to be success when passed in.
    357 * @return total number of expansions
    358 */
    359 static
    360 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    361                                           UErrorCode    *status)
    362 {
    363     UPattern *pattern            = &(strsrch->pattern);
    364     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    365     int64_t  *pcetable           = pattern->PCEBuffer;
    366     uint32_t  patternlength      = pattern->textLength;
    367     UCollationElements *coleiter = strsrch->utilIter;
    368 
    369     if (coleiter == NULL) {
    370         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    371                                      patternlength, status);
    372         // status will be checked in ucol_next(..) later and if it is an
    373         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    374         // returned.
    375         strsrch->utilIter = coleiter;
    376     } else {
    377         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    378     }
    379     if(U_FAILURE(*status)) {
    380         return 0;
    381     }
    382 
    383     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
    384         uprv_free(pattern->PCE);
    385     }
    386 
    387     uint16_t  offset = 0;
    388     uint16_t  result = 0;
    389     int64_t   pce;
    390 
    391     icu::UCollationPCE iter(coleiter);
    392 
    393     // ** Should processed CEs be signed or unsigned?
    394     // ** (the rest of the code in this file seems to play fast-and-loose with
    395     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    396     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    397            U_SUCCESS(*status)) {
    398         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    399                               pce,
    400                               patternlength - ucol_getOffset(coleiter) + 1,
    401                               status);
    402 
    403         if (U_FAILURE(*status)) {
    404             return 0;
    405         }
    406 
    407         offset += 1;
    408 
    409         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
    410             uprv_free(pcetable);
    411         }
    412 
    413         pcetable = temp;
    414         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    415     }
    416 
    417     pcetable[offset]   = 0;
    418     pattern->PCE       = pcetable;
    419     pattern->PCELength = offset;
    420 
    421     return result;
    422 }
    423 
    424 /**
    425 * Initializes the pattern struct.
    426 * Internal method, status assumed to be success.
    427 * @param strsrch UStringSearch data storage
    428 * @param status output error if any, caller to check status before calling
    429 *               method, status assumed to be success when passed in.
    430 * @return expansionsize the total expansion size of the pattern
    431 */
    432 static
    433 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    434 {
    435     if (U_FAILURE(*status)) { return 0; }
    436           UPattern   *pattern     = &(strsrch->pattern);
    437     const UChar      *patterntext = pattern->text;
    438           int32_t     length      = pattern->textLength;
    439           int32_t index       = 0;
    440 
    441     // Since the strength is primary, accents are ignored in the pattern.
    442     if (strsrch->strength == UCOL_PRIMARY) {
    443         pattern->hasPrefixAccents = 0;
    444         pattern->hasSuffixAccents = 0;
    445     } else {
    446         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    447                                                          SECOND_LAST_BYTE_SHIFT_;
    448         index = length;
    449         U16_BACK_1(patterntext, 0, index);
    450         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    451                                                                  LAST_BYTE_MASK_;
    452     }
    453 
    454     // ** HACK **
    455     if (strsrch->pattern.PCE != NULL) {
    456         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
    457             uprv_free(strsrch->pattern.PCE);
    458         }
    459 
    460         strsrch->pattern.PCE = NULL;
    461     }
    462 
    463     // since intializePattern is an internal method status is a success.
    464     return initializePatternCETable(strsrch, status);
    465 }
    466 
    467 /**
    468 * Initializing shift tables, with the default values.
    469 * If a corresponding default value is 0, the shift table is not set.
    470 * @param shift table for forwards shift
    471 * @param backshift table for backwards shift
    472 * @param cetable table containing pattern ce
    473 * @param cesize size of the pattern ces
    474 * @param expansionsize total size of the expansions
    475 * @param defaultforward the default forward value
    476 * @param defaultbackward the default backward value
    477 */
    478 static
    479 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    480                           int32_t  *cetable, int32_t cesize,
    481                           int16_t   expansionsize,
    482                           int16_t   defaultforward,
    483                           int16_t   defaultbackward)
    484 {
    485     // estimate the value to shift. to do that we estimate the smallest
    486     // number of characters to give the relevant ces, ie approximately
    487     // the number of ces minus their expansion, since expansions can come
    488     // from a character.
    489     int32_t count;
    490     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    491         shift[count] = defaultforward;
    492     }
    493     cesize --; // down to the last index
    494     for (count = 0; count < cesize; count ++) {
    495         // number of ces from right of array to the count
    496         int temp = defaultforward - count - 1;
    497         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
    498     }
    499     shift[hash(cetable[cesize])] = 1;
    500     // for ignorables we just shift by one. see test examples.
    501     shift[hash(0)] = 1;
    502 
    503     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    504         backshift[count] = defaultbackward;
    505     }
    506     for (count = cesize; count > 0; count --) {
    507         // the original value count does not seem to work
    508         backshift[hash(cetable[count])] = count > expansionsize ?
    509                                           (int16_t)(count - expansionsize) : 1;
    510     }
    511     backshift[hash(cetable[0])] = 1;
    512     backshift[hash(0)] = 1;
    513 }
    514 
    515 /**
    516 * Building of the pattern collation element list and the boyer moore strsrch
    517 * table.
    518 * The canonical match will only be performed after the default match fails.
    519 * For both cases we need to remember the size of the composed and decomposed
    520 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    521 * a number of characters in the text and tries to match the pattern from that
    522 * offset, the shift value can not be too large in case we miss some
    523 * characters. To choose a right shift size, we estimate the NFC form of the
    524 * and use its size as a shift guide. The NFC form should be the small
    525 * possible representation of the pattern. Anyways, we'll err on the smaller
    526 * shift size. Hence the calculation for minlength.
    527 * Canonical match will be performed slightly differently. We'll split the
    528 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    529 * the first and last base character (MS), the ending accents (EA). Matches
    530 * will be done on MS first, and only when we match MS then some processing
    531 * will be required for the prefix and end accents in order to determine if
    532 * they match PA and EA. Hence the default shift values
    533 * for the canonical match will take the size of either end's accent into
    534 * consideration. Forwards search will take the end accents into consideration
    535 * for the default shift values and the backwards search will take the prefix
    536 * accents into consideration.
    537 * If pattern has no non-ignorable ce, we return a illegal argument error.
    538 * Internal method, status assumed to be success.
    539 * @param strsrch UStringSearch data storage
    540 * @param status  for output errors if it occurs, status is assumed to be a
    541 *                success when it is passed in.
    542 */
    543 static
    544 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    545 {
    546     int16_t expandlength  = initializePattern(strsrch, status);
    547     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
    548         UPattern *pattern = &strsrch->pattern;
    549         int32_t   cesize  = pattern->CELength;
    550 
    551         int16_t minlength = cesize > expandlength
    552                             ? (int16_t)cesize - expandlength : 1;
    553         pattern->defaultShiftSize    = minlength;
    554         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
    555                       cesize, expandlength, minlength, minlength);
    556         return;
    557     }
    558     strsrch->pattern.defaultShiftSize = 0;
    559 }
    560 
    561 #if BOYER_MOORE
    562 /**
    563 * Check to make sure that the match length is at the end of the character by
    564 * using the breakiterator.
    565 * @param strsrch string search data
    566 * @param start target text start offset
    567 * @param end target text end offset
    568 */
    569 static
    570 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    571                                int32_t *end)
    572 {
    573 #if !UCONFIG_NO_BREAK_ITERATION
    574     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    575     if (breakiterator) {
    576         int32_t matchend = *end;
    577         //int32_t matchstart = *start;
    578 
    579         if (!ubrk_isBoundary(breakiterator, matchend)) {
    580             *end = ubrk_following(breakiterator, matchend);
    581         }
    582 
    583         /* Check the start of the matched text to make sure it doesn't have any accents
    584          * before it.  This code may not be necessary and so it is commented out */
    585         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    586             *start = ubrk_preceding(breakiterator, matchstart);
    587         }*/
    588     }
    589 #endif
    590 }
    591 
    592 /**
    593 * Determine whether the target text in UStringSearch bounded by the offset
    594 * start and end is one or more whole units of text as
    595 * determined by the breakiterator in UStringSearch.
    596 * @param strsrch string search data
    597 * @param start target text start offset
    598 * @param end target text end offset
    599 */
    600 static
    601 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    602                                int32_t    end)
    603 {
    604 #if !UCONFIG_NO_BREAK_ITERATION
    605     UBreakIterator *breakiterator = strsrch->search->breakIter;
    606     //TODO: Add here.
    607     if (breakiterator) {
    608         int32_t startindex = ubrk_first(breakiterator);
    609         int32_t endindex   = ubrk_last(breakiterator);
    610 
    611         // out-of-range indexes are never boundary positions
    612         if (start < startindex || start > endindex ||
    613             end < startindex || end > endindex) {
    614             return FALSE;
    615         }
    616         // otherwise, we can use following() on the position before the
    617         // specified one and return true of the position we get back is the
    618         // one the user specified
    619         UBool result = (start == startindex ||
    620                 ubrk_following(breakiterator, start - 1) == start) &&
    621                (end == endindex ||
    622                 ubrk_following(breakiterator, end - 1) == end);
    623         if (result) {
    624             // iterates the individual ces
    625                   UCollationElements *coleiter  = strsrch->utilIter;
    626             const UChar              *text      = strsrch->search->text +
    627                                                                       start;
    628                   UErrorCode          status    = U_ZERO_ERROR;
    629             ucol_setText(coleiter, text, end - start, &status);
    630             for (int32_t count = 0; count < strsrch->pattern.CELength;
    631                  count ++) {
    632                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    633                 if (ce == UCOL_IGNORABLE) {
    634                     count --;
    635                     continue;
    636                 }
    637                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
    638                     return FALSE;
    639                 }
    640             }
    641             int32_t nextce = ucol_next(coleiter, &status);
    642             while (ucol_getOffset(coleiter) == (end - start)
    643                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    644                 nextce = ucol_next(coleiter, &status);
    645             }
    646             if (ucol_getOffset(coleiter) == (end - start)
    647                 && nextce != UCOL_NULLORDER) {
    648                 // extra collation elements at the end of the match
    649                 return FALSE;
    650             }
    651         }
    652         return result;
    653     }
    654 #endif
    655     return TRUE;
    656 }
    657 
    658 /**
    659 * Getting the next base character offset if current offset is an accent,
    660 * or the current offset if the current character contains a base character.
    661 * accents the following base character will be returned
    662 * @param text string
    663 * @param textoffset current offset
    664 * @param textlength length of text string
    665 * @return the next base character or the current offset
    666 *         if the current character is contains a base character.
    667 */
    668 static
    669 inline int32_t getNextBaseOffset(const UChar       *text,
    670                                            int32_t  textoffset,
    671                                            int32_t      textlength)
    672 {
    673     if (textoffset < textlength) {
    674         int32_t temp = textoffset;
    675         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    676             while (temp < textlength) {
    677                 int32_t result = temp;
    678                 if ((getFCD(text, &temp, textlength) >>
    679                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    680                     return result;
    681                 }
    682             }
    683             return textlength;
    684         }
    685     }
    686     return textoffset;
    687 }
    688 
    689 /**
    690 * Gets the next base character offset depending on the string search pattern
    691 * data
    692 * @param strsrch string search data
    693 * @param textoffset current offset, one offset away from the last character
    694 *                   to search for.
    695 * @return start index of the next base character or the current offset
    696 *         if the current character is contains a base character.
    697 */
    698 static
    699 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    700                                                   int32_t    textoffset)
    701 {
    702     int32_t textlength = strsrch->search->textLength;
    703     if (strsrch->pattern.hasSuffixAccents &&
    704         textoffset < textlength) {
    705               int32_t  temp       = textoffset;
    706         const UChar       *text       = strsrch->search->text;
    707         U16_BACK_1(text, 0, temp);
    708         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    709             return getNextBaseOffset(text, textoffset, textlength);
    710         }
    711     }
    712     return textoffset;
    713 }
    714 
    715 /**
    716 * Shifting the collation element iterator position forward to prepare for
    717 * a following match. If the last character is a unsafe character, we'll only
    718 * shift by 1 to capture contractions, normalization etc.
    719 * Internal method, status assumed to be success.
    720 * @param text strsrch string search data
    721 * @param textoffset start text position to do search
    722 * @param ce the text ce which failed the match.
    723 * @param patternceindex index of the ce within the pattern ce buffer which
    724 *        failed the match
    725 * @return final offset
    726 */
    727 static
    728 inline int32_t shiftForward(UStringSearch *strsrch,
    729                                 int32_t    textoffset,
    730                                 int32_t       ce,
    731                                 int32_t        patternceindex)
    732 {
    733     UPattern *pattern = &(strsrch->pattern);
    734     if (ce != UCOL_NULLORDER) {
    735         int32_t shift = pattern->shift[hash(ce)];
    736         // this is to adjust for characters in the middle of the
    737         // substring for matching that failed.
    738         int32_t adjust = pattern->CELength - patternceindex;
    739         if (adjust > 1 && shift >= adjust) {
    740             shift -= adjust - 1;
    741         }
    742         textoffset += shift;
    743     }
    744     else {
    745         textoffset += pattern->defaultShiftSize;
    746     }
    747 
    748     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    749     // check for unsafe characters
    750     // * if it is the start or middle of a contraction: to be done after
    751     //   a initial match is found
    752     // * thai or lao base consonant character: similar to contraction
    753     // * high surrogate character: similar to contraction
    754     // * next character is a accent: shift to the next base character
    755     return textoffset;
    756 }
    757 #endif // #if BOYER_MOORE
    758 
    759 /**
    760 * sets match not found
    761 * @param strsrch string search data
    762 */
    763 static
    764 inline void setMatchNotFound(UStringSearch *strsrch)
    765 {
    766     // this method resets the match result regardless of the error status.
    767     strsrch->search->matchedIndex = USEARCH_DONE;
    768     strsrch->search->matchedLength = 0;
    769     if (strsrch->search->isForwardSearching) {
    770         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    771     }
    772     else {
    773         setColEIterOffset(strsrch->textIter, 0);
    774     }
    775 }
    776 
    777 #if BOYER_MOORE
    778 /**
    779 * Gets the offset to the next safe point in text.
    780 * ie. not the middle of a contraction, swappable characters or supplementary
    781 * characters.
    782 * @param collator collation sata
    783 * @param text string to work with
    784 * @param textoffset offset in string
    785 * @param textlength length of text string
    786 * @return offset to the next safe character
    787 */
    788 static
    789 inline int32_t getNextSafeOffset(const UCollator   *collator,
    790                                      const UChar       *text,
    791                                            int32_t  textoffset,
    792                                            int32_t      textlength)
    793 {
    794     int32_t result = textoffset; // first contraction character
    795     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    796         result ++;
    797     }
    798     return result;
    799 }
    800 
    801 /**
    802 * This checks for accents in the potential match started with a .
    803 * composite character.
    804 * This is really painful... we have to check that composite character do not
    805 * have any extra accents. We have to normalize the potential match and find
    806 * the immediate decomposed character before the match.
    807 * The first composite character would have been taken care of by the fcd
    808 * checks in checkForwardExactMatch.
    809 * This is the slow path after the fcd of the first character and
    810 * the last character has been checked by checkForwardExactMatch and we
    811 * determine that the potential match has extra non-ignorable preceding
    812 * ces.
    813 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    814 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    815 * Note here that accents checking are slow and cautioned in the API docs.
    816 * Internal method, status assumed to be a success, caller should check status
    817 * before calling this method
    818 * @param strsrch string search data
    819 * @param start index of the potential unfriendly composite character
    820 * @param end index of the potential unfriendly composite character
    821 * @param status output error status if any.
    822 * @return TRUE if there is non-ignorable accents before at the beginning
    823 *              of the match, FALSE otherwise.
    824 */
    825 
    826 static
    827 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    828                                    int32_t    end,
    829                                    UErrorCode    *status)
    830 {
    831     UBool result = FALSE;
    832     if (strsrch->pattern.hasPrefixAccents) {
    833               int32_t  length = end - start;
    834               int32_t  offset = 0;
    835         const UChar       *text   = strsrch->search->text + start;
    836 
    837         U16_FWD_1(text, offset, length);
    838         // we are only concerned with the first composite character
    839         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    840             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    841                                                        text, 0, length);
    842             if (safeoffset != length) {
    843                 safeoffset ++;
    844             }
    845             UChar   *norm = NULL;
    846             UChar    buffer[INITIAL_ARRAY_SIZE_];
    847             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    848                                             buffer, INITIAL_ARRAY_SIZE_,
    849                                             status);
    850             if (U_FAILURE(*status)) {
    851                 return FALSE;
    852             }
    853             if (size >= INITIAL_ARRAY_SIZE_) {
    854                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    855                                                status);
    856                 // if allocation failed, status will be set to
    857                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    858                 // checks for it.
    859                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    860                                        size, status);
    861                 if (U_FAILURE(*status) && norm != NULL) {
    862                     uprv_free(norm);
    863                     return FALSE;
    864                 }
    865             }
    866             else {
    867                 norm = buffer;
    868             }
    869 
    870             UCollationElements *coleiter  = strsrch->utilIter;
    871             ucol_setText(coleiter, norm, size, status);
    872             uint32_t            firstce   = strsrch->pattern.CE[0];
    873             UBool               ignorable = TRUE;
    874             uint32_t            ce        = UCOL_IGNORABLE;
    875             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    876                 offset = ucol_getOffset(coleiter);
    877                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    878                     ignorable = FALSE;
    879                 }
    880                 ce = ucol_next(coleiter, status);
    881             }
    882             UChar32 codepoint;
    883             U16_PREV(norm, 0, offset, codepoint);
    884             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    885 
    886             if (norm != buffer) {
    887                 uprv_free(norm);
    888             }
    889         }
    890     }
    891 
    892     return result;
    893 }
    894 
    895 /**
    896 * Used by exact matches, checks if there are accents before the match.
    897 * This is really painful... we have to check that composite characters at
    898 * the start of the matches have to not have any extra accents.
    899 * We check the FCD of the character first, if it starts with an accent and
    900 * the first pattern ce does not match the first ce of the character, we bail.
    901 * Otherwise we try normalizing the first composite
    902 * character and find the immediate decomposed character before the match to
    903 * see if it is an non-ignorable accent.
    904 * Now normalizing the first composite character is enough because we ensure
    905 * that when the match is passed in here with extra beginning ces, the
    906 * first or last ce that match has to occur within the first character.
    907 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    908 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    909 * Note here that accents checking are slow and cautioned in the API docs.
    910 * @param strsrch string search data
    911 * @param start offset
    912 * @param end offset
    913 * @return TRUE if there are accents on either side of the match,
    914 *         FALSE otherwise
    915 */
    916 static
    917 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    918                                   int32_t    end)
    919 {
    920     if (strsrch->pattern.hasPrefixAccents) {
    921         UCollationElements *coleiter  = strsrch->textIter;
    922         UErrorCode          status    = U_ZERO_ERROR;
    923         // we have been iterating forwards previously
    924         uint32_t            ignorable = TRUE;
    925         int32_t             firstce   = strsrch->pattern.CE[0];
    926 
    927         setColEIterOffset(coleiter, start);
    928         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    929         if (U_FAILURE(status)) {
    930             return TRUE;
    931         }
    932         while (ce != firstce) {
    933             if (ce != UCOL_IGNORABLE) {
    934                 ignorable = FALSE;
    935             }
    936             ce = getCE(strsrch, ucol_next(coleiter, &status));
    937             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    938                 return TRUE;
    939             }
    940         }
    941         if (!ignorable && inNormBuf(coleiter)) {
    942             // within normalization buffer, discontiguous handled here
    943             return TRUE;
    944         }
    945 
    946         // within text
    947         int32_t temp = start;
    948         // original code
    949         // accent = (getFCD(strsrch->search->text, &temp,
    950         //                  strsrch->search->textLength)
    951         //            >> SECOND_LAST_BYTE_SHIFT_);
    952         // however this code does not work well with VC7 .net in release mode.
    953         // maybe the inlines for getFCD combined with shifting has bugs in
    954         // VC7. anyways this is a work around.
    955         UBool accent = getFCD(strsrch->search->text, &temp,
    956                               strsrch->search->textLength) > 0xFF;
    957         if (!accent) {
    958             return checkExtraMatchAccents(strsrch, start, end, &status);
    959         }
    960         if (!ignorable) {
    961             return TRUE;
    962         }
    963         if (start > 0) {
    964             temp = start;
    965             U16_BACK_1(strsrch->search->text, 0, temp);
    966             if (getFCD(strsrch->search->text, &temp,
    967                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    968                 setColEIterOffset(coleiter, start);
    969                 ce = ucol_previous(coleiter, &status);
    970                 if (U_FAILURE(status) ||
    971                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    972                     return TRUE;
    973                 }
    974             }
    975         }
    976     }
    977 
    978     return FALSE;
    979 }
    980 
    981 /**
    982 * Used by exact matches, checks if there are accents bounding the match.
    983 * Note this is the initial boundary check. If the potential match
    984 * starts or ends with composite characters, the accents in those
    985 * characters will be determined later.
    986 * Not doing backwards iteration here, since discontiguos contraction for
    987 * backwards collation element iterator, use up too many characters.
    988 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
    989 * should fail since there is a acute at the end of \u01FA
    990 * Note here that accents checking are slow and cautioned in the API docs.
    991 * @param strsrch string search data
    992 * @param start offset of match
    993 * @param end end offset of the match
    994 * @return TRUE if there are accents on either side of the match,
    995 *         FALSE otherwise
    996 */
    997 static
    998 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
    999                                  int32_t    end)
   1000 {
   1001     if (strsrch->pattern.hasSuffixAccents) {
   1002         const UChar       *text       = strsrch->search->text;
   1003               int32_t  temp       = end;
   1004               int32_t      textlength = strsrch->search->textLength;
   1005         U16_BACK_1(text, 0, temp);
   1006         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1007             int32_t             firstce  = strsrch->pattern.CE[0];
   1008             UCollationElements *coleiter = strsrch->textIter;
   1009             UErrorCode          status   = U_ZERO_ERROR;
   1010             int32_t ce;
   1011             setColEIterOffset(coleiter, start);
   1012             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1013                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1014                     return TRUE;
   1015                 }
   1016             }
   1017             int32_t count = 1;
   1018             while (count < strsrch->pattern.CELength) {
   1019                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1020                     == UCOL_IGNORABLE) {
   1021                     // Thai can give an ignorable here.
   1022                     count --;
   1023                 }
   1024                 if (U_FAILURE(status)) {
   1025                     return TRUE;
   1026                 }
   1027                 count ++;
   1028             }
   1029 
   1030             ce = ucol_next(coleiter, &status);
   1031             if (U_FAILURE(status)) {
   1032                 return TRUE;
   1033             }
   1034             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1035                 ce = getCE(strsrch, ce);
   1036             }
   1037             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1038                 if (ucol_getOffset(coleiter) <= end) {
   1039                     return TRUE;
   1040                 }
   1041                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1042                     return TRUE;
   1043                 }
   1044             }
   1045         }
   1046     }
   1047     return FALSE;
   1048 }
   1049 #endif // #if BOYER_MOORE
   1050 
   1051 /**
   1052 * Checks if the offset runs out of the text string
   1053 * @param offset
   1054 * @param textlength of the text string
   1055 * @return TRUE if offset is out of bounds, FALSE otherwise
   1056 */
   1057 static
   1058 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1059 {
   1060     return offset < 0 || offset > textlength;
   1061 }
   1062 
   1063 /**
   1064 * Checks for identical match
   1065 * @param strsrch string search data
   1066 * @param start offset of possible match
   1067 * @param end offset of possible match
   1068 * @return TRUE if identical match is found
   1069 */
   1070 static
   1071 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1072                                   int32_t    end)
   1073 {
   1074     if (strsrch->strength != UCOL_IDENTICAL) {
   1075         return TRUE;
   1076     }
   1077 
   1078     // Note: We could use Normalizer::compare() or similar, but for short strings
   1079     // which may not be in FCD it might be faster to just NFD them.
   1080     UErrorCode status = U_ZERO_ERROR;
   1081     UnicodeString t2, p2;
   1082     strsrch->nfd->normalize(
   1083         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
   1084     strsrch->nfd->normalize(
   1085         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
   1086     // return FALSE if NFD failed
   1087     return U_SUCCESS(status) && t2 == p2;
   1088 }
   1089 
   1090 #if BOYER_MOORE
   1091 /**
   1092 * Checks to see if the match is repeated
   1093 * @param strsrch string search data
   1094 * @param start new match start index
   1095 * @param end new match end index
   1096 * @return TRUE if the the match is repeated, FALSE otherwise
   1097 */
   1098 static
   1099 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1100                                 int32_t    start,
   1101                                 int32_t    end)
   1102 {
   1103     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1104     UBool       result;
   1105     if (lastmatchindex == USEARCH_DONE) {
   1106         return FALSE;
   1107     }
   1108     if (strsrch->search->isForwardSearching) {
   1109         result = start <= lastmatchindex;
   1110     }
   1111     else {
   1112         result = start >= lastmatchindex;
   1113     }
   1114     if (!result && !strsrch->search->isOverlap) {
   1115         if (strsrch->search->isForwardSearching) {
   1116             result = start < lastmatchindex + strsrch->search->matchedLength;
   1117         }
   1118         else {
   1119             result = end > lastmatchindex;
   1120         }
   1121     }
   1122     return result;
   1123 }
   1124 
   1125 /**
   1126 * Gets the collation element iterator's current offset.
   1127 * @param coleiter collation element iterator
   1128 * @param forwards flag TRUE if we are moving in th forwards direction
   1129 * @return current offset
   1130 */
   1131 static
   1132 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1133                                               UBool               forwards)
   1134 {
   1135     int32_t result = ucol_getOffset(coleiter);
   1136     // intricacies of the the backwards collation element iterator
   1137     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1138         result ++;
   1139     }
   1140     return result;
   1141 }
   1142 
   1143 /**
   1144 * Checks match for contraction.
   1145 * If the match ends with a partial contraction we fail.
   1146 * If the match starts too far off (because of backwards iteration) we try to
   1147 * chip off the extra characters depending on whether a breakiterator has
   1148 * been used.
   1149 * Internal method, error assumed to be success, caller has to check status
   1150 * before calling this method.
   1151 * @param strsrch string search data
   1152 * @param start offset of potential match, to be modified if necessary
   1153 * @param end offset of potential match, to be modified if necessary
   1154 * @param status output error status if any
   1155 * @return TRUE if match passes the contraction test, FALSE otherwise
   1156 */
   1157 
   1158 static
   1159 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1160                                      int32_t   *start,
   1161                                      int32_t   *end, UErrorCode  *status)
   1162 {
   1163           UCollationElements *coleiter   = strsrch->textIter;
   1164           int32_t             textlength = strsrch->search->textLength;
   1165           int32_t             temp       = *start;
   1166     const UCollator          *collator   = strsrch->collator;
   1167     const UChar              *text       = strsrch->search->text;
   1168     // This part checks if either ends of the match contains potential
   1169     // contraction. If so we'll have to iterate through them
   1170     // The start contraction needs to be checked since ucol_previous dumps
   1171     // all characters till the first safe character into the buffer.
   1172     // *start + 1 is used to test for the unsafe characters instead of *start
   1173     // because ucol_prev takes all unsafe characters till the first safe
   1174     // character ie *start. so by testing *start + 1, we can estimate if
   1175     // excess prefix characters has been included in the potential search
   1176     // results.
   1177     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1178         (*start + 1 < textlength
   1179          && ucol_unsafeCP(text[*start + 1], collator))) {
   1180         int32_t expansion  = getExpansionPrefix(coleiter);
   1181         UBool   expandflag = expansion > 0;
   1182         setColEIterOffset(coleiter, *start);
   1183         while (expansion > 0) {
   1184             // getting rid of the redundant ce, caused by setOffset.
   1185             // since backward contraction/expansion may have extra ces if we
   1186             // are in the normalization buffer, hasAccentsBeforeMatch would
   1187             // have taken care of it.
   1188             // E.g. the character \u01FA will have an expansion of 3, but if
   1189             // we are only looking for acute and ring \u030A and \u0301, we'll
   1190             // have to skip the first ce in the expansion buffer.
   1191             ucol_next(coleiter, status);
   1192             if (U_FAILURE(*status)) {
   1193                 return FALSE;
   1194             }
   1195             if (ucol_getOffset(coleiter) != temp) {
   1196                 *start = temp;
   1197                 temp  = ucol_getOffset(coleiter);
   1198             }
   1199             expansion --;
   1200         }
   1201 
   1202         int32_t  *patternce       = strsrch->pattern.CE;
   1203         int32_t   patterncelength = strsrch->pattern.CELength;
   1204         int32_t   count           = 0;
   1205         while (count < patterncelength) {
   1206             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1207             if (ce == UCOL_IGNORABLE) {
   1208                 continue;
   1209             }
   1210             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1211                 *start = temp;
   1212                 temp   = ucol_getOffset(coleiter);
   1213             }
   1214             if (U_FAILURE(*status) || ce != patternce[count]) {
   1215                 (*end) ++;
   1216                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1217                 return FALSE;
   1218             }
   1219             count ++;
   1220         }
   1221     }
   1222     return TRUE;
   1223 }
   1224 
   1225 /**
   1226 * Checks and sets the match information if found.
   1227 * Checks
   1228 * <ul>
   1229 * <li> the potential match does not repeat the previous match
   1230 * <li> boundaries are correct
   1231 * <li> exact matches has no extra accents
   1232 * <li> identical matchesb
   1233 * <li> potential match does not end in the middle of a contraction
   1234 * <\ul>
   1235 * Otherwise the offset will be shifted to the next character.
   1236 * Internal method, status assumed to be success, caller has to check status
   1237 * before calling this method.
   1238 * @param strsrch string search data
   1239 * @param textoffset offset in the collation element text. the returned value
   1240 *        will be the truncated end offset of the match or the new start
   1241 *        search offset.
   1242 * @param status output error status if any
   1243 * @return TRUE if the match is valid, FALSE otherwise
   1244 */
   1245 static
   1246 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1247                                  int32_t   *textoffset, UErrorCode *status)
   1248 {
   1249     UCollationElements *coleiter = strsrch->textIter;
   1250     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1251 
   1252     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1253         return FALSE;
   1254     }
   1255 
   1256     // this totally matches, however we need to check if it is repeating
   1257     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1258         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1259         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1260         !checkIdentical(strsrch, start, *textoffset) ||
   1261         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1262 
   1263         (*textoffset) ++;
   1264         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1265         return FALSE;
   1266     }
   1267 
   1268     //Add breakiterator boundary check for primary strength search.
   1269     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1270         checkBreakBoundary(strsrch, &start, textoffset);
   1271     }
   1272 
   1273     // totally match, we will get rid of the ending ignorables.
   1274     strsrch->search->matchedIndex  = start;
   1275     strsrch->search->matchedLength = *textoffset - start;
   1276     return TRUE;
   1277 }
   1278 
   1279 /**
   1280 * Getting the previous base character offset, or the current offset if the
   1281 * current character is a base character
   1282 * @param text string
   1283 * @param textoffset one offset after the current character
   1284 * @return the offset of the next character after the base character or the first
   1285 *         composed character with accents
   1286 */
   1287 static
   1288 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1289                                                int32_t  textoffset)
   1290 {
   1291     if (textoffset > 0) {
   1292         for (;;) {
   1293             int32_t result = textoffset;
   1294             U16_BACK_1(text, 0, textoffset);
   1295             int32_t temp = textoffset;
   1296             uint16_t fcd = getFCD(text, &temp, result);
   1297             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1298                 if (fcd & LAST_BYTE_MASK_) {
   1299                     return textoffset;
   1300                 }
   1301                 return result;
   1302             }
   1303             if (textoffset == 0) {
   1304                 return 0;
   1305             }
   1306         }
   1307     }
   1308     return textoffset;
   1309 }
   1310 
   1311 /**
   1312 * Getting the indexes of the accents that are not blocked in the argument
   1313 * accent array
   1314 * @param accents array of accents in nfd terminated by a 0.
   1315 * @param accentsindex array of indexes of the accents that are not blocked
   1316 */
   1317 static
   1318 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1319 {
   1320     int32_t index     = 0;
   1321     int32_t     length    = u_strlen(accents);
   1322     UChar32     codepoint = 0;
   1323     int         cclass    = 0;
   1324     int         result    = 0;
   1325     int32_t temp;
   1326     while (index < length) {
   1327         temp = index;
   1328         U16_NEXT(accents, index, length, codepoint);
   1329         if (u_getCombiningClass(codepoint) != cclass) {
   1330             cclass        = u_getCombiningClass(codepoint);
   1331             accentsindex[result] = temp;
   1332             result ++;
   1333         }
   1334     }
   1335     accentsindex[result] = length;
   1336     return result;
   1337 }
   1338 
   1339 /**
   1340 * Appends 3 UChar arrays to a destination array.
   1341 * Creates a new array if we run out of space. The caller will have to
   1342 * manually deallocate the newly allocated array.
   1343 * Internal method, status assumed to be success, caller has to check status
   1344 * before calling this method. destination not to be NULL and has at least
   1345 * size destinationlength.
   1346 * @param destination target array
   1347 * @param destinationlength target array size, returning the appended length
   1348 * @param source1 null-terminated first array
   1349 * @param source2 second array
   1350 * @param source2length length of seond array
   1351 * @param source3 null-terminated third array
   1352 * @param status error status if any
   1353 * @return new destination array, destination if there was no new allocation
   1354 */
   1355 static
   1356 inline UChar * addToUCharArray(      UChar      *destination,
   1357                                      int32_t    *destinationlength,
   1358                                const UChar      *source1,
   1359                                const UChar      *source2,
   1360                                      int32_t     source2length,
   1361                                const UChar      *source3,
   1362                                      UErrorCode *status)
   1363 {
   1364     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1365     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1366     if (*destinationlength < source1length + source2length + source3length +
   1367                                                                            1)
   1368     {
   1369         destination = (UChar *)allocateMemory(
   1370           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1371           status);
   1372         // if error allocating memory, status will be
   1373         // U_MEMORY_ALLOCATION_ERROR
   1374         if (U_FAILURE(*status)) {
   1375             *destinationlength = 0;
   1376             return NULL;
   1377         }
   1378     }
   1379     if (source1length != 0) {
   1380         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
   1381     }
   1382     if (source2length != 0) {
   1383         uprv_memcpy(destination + source1length, source2,
   1384                     sizeof(UChar) * source2length);
   1385     }
   1386     if (source3length != 0) {
   1387         uprv_memcpy(destination + source1length + source2length, source3,
   1388                     sizeof(UChar) * source3length);
   1389     }
   1390     *destinationlength = source1length + source2length + source3length;
   1391     return destination;
   1392 }
   1393 
   1394 /**
   1395 * Running through a collation element iterator to see if the contents matches
   1396 * pattern in string search data
   1397 * @param strsrch string search data
   1398 * @param coleiter collation element iterator
   1399 * @return TRUE if a match if found, FALSE otherwise
   1400 */
   1401 static
   1402 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1403                                        UCollationElements *coleiter)
   1404 {
   1405     int         patternceindex = strsrch->pattern.CELength;
   1406     int32_t    *patternce      = strsrch->pattern.CE;
   1407     UErrorCode  status = U_ZERO_ERROR;
   1408     while (patternceindex > 0) {
   1409         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1410         if (ce == UCOL_IGNORABLE) {
   1411             continue;
   1412         }
   1413         if (U_FAILURE(status) || ce != *patternce) {
   1414             return FALSE;
   1415         }
   1416         patternce ++;
   1417         patternceindex --;
   1418     }
   1419     return TRUE;
   1420 }
   1421 
   1422 /**
   1423 * Rearranges the front accents to try matching.
   1424 * Prefix accents in the text will be grouped according to their combining
   1425 * class and the groups will be mixed and matched to try find the perfect
   1426 * match with the pattern.
   1427 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1428 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1429 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1430 *         "\u0301\u0325".
   1431 * step 2: check if any of the generated substrings matches the pattern.
   1432 * Internal method, status is assumed to be success, caller has to check status
   1433 * before calling this method.
   1434 * @param strsrch string search match
   1435 * @param start first offset of the accents to start searching
   1436 * @param end start of the last accent set
   1437 * @param status output error status if any
   1438 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1439 *         offset of the match. Note this start includes all preceding accents.
   1440 */
   1441 static
   1442 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1443                                        int32_t    start,
   1444                                        int32_t    end,
   1445                                        UErrorCode    *status)
   1446 {
   1447     const UChar       *text       = strsrch->search->text;
   1448           int32_t      textlength = strsrch->search->textLength;
   1449           int32_t  tempstart  = start;
   1450 
   1451     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1452         // die... failed at a base character
   1453         return USEARCH_DONE;
   1454     }
   1455 
   1456     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1457     start = getPreviousBaseOffset(text, tempstart);
   1458 
   1459     UChar       accents[INITIAL_ARRAY_SIZE_];
   1460     // normalizing the offensive string
   1461     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1462                     INITIAL_ARRAY_SIZE_, status);
   1463     if (U_FAILURE(*status)) {
   1464         return USEARCH_DONE;
   1465     }
   1466 
   1467     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1468     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1469                                                                  accentsindex);
   1470     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1471     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1472     UCollationElements *coleiter   = strsrch->utilIter;
   1473     while (U_SUCCESS(*status) && count > 0) {
   1474         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1475         // copy the base characters
   1476         for (int k = 0; k < accentsindex[0]; k ++) {
   1477             *rearrange ++ = accents[k];
   1478         }
   1479         // forming all possible canonical rearrangement by dropping
   1480         // sets of accents
   1481         for (int i = 0; i <= accentsize - 1; i ++) {
   1482             int32_t mask = 1 << (accentsize - i - 1);
   1483             if (count & mask) {
   1484                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1485                     *rearrange ++ = accents[j];
   1486                 }
   1487             }
   1488         }
   1489         *rearrange = 0;
   1490         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1491         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1492                                            strsrch->canonicalPrefixAccents,
   1493                                            strsrch->search->text + offset,
   1494                                            end - offset,
   1495                                            strsrch->canonicalSuffixAccents,
   1496                                            status);
   1497 
   1498         // if status is a failure, ucol_setText does nothing.
   1499         // run the collator iterator through this match
   1500         ucol_setText(coleiter, match, matchsize, status);
   1501         if (U_SUCCESS(*status)) {
   1502             if (checkCollationMatch(strsrch, coleiter)) {
   1503                 if (match != buffer) {
   1504                     uprv_free(match);
   1505                 }
   1506                 return start;
   1507             }
   1508         }
   1509         count --;
   1510     }
   1511     return USEARCH_DONE;
   1512 }
   1513 
   1514 /**
   1515 * Gets the offset to the safe point in text before textoffset.
   1516 * ie. not the middle of a contraction, swappable characters or supplementary
   1517 * characters.
   1518 * @param collator collation sata
   1519 * @param text string to work with
   1520 * @param textoffset offset in string
   1521 * @param textlength length of text string
   1522 * @return offset to the previous safe character
   1523 */
   1524 static
   1525 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1526                                       const UChar       *text,
   1527                                             int32_t  textoffset)
   1528 {
   1529     int32_t result = textoffset; // first contraction character
   1530     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1531         result --;
   1532     }
   1533     if (result != 0) {
   1534         // the first contraction character is consider unsafe here
   1535         result --;
   1536     }
   1537     return result;
   1538 }
   1539 
   1540 /**
   1541 * Cleaning up after we passed the safe zone
   1542 * @param strsrch string search data
   1543 * @param safetext safe text array
   1544 * @param safebuffer safe text buffer
   1545 * @param coleiter collation element iterator for safe text
   1546 */
   1547 static
   1548 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1549                                   UChar         *safebuffer)
   1550 {
   1551     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1552     {
   1553        uprv_free(safetext);
   1554     }
   1555 }
   1556 
   1557 /**
   1558 * Take the rearranged end accents and tries matching. If match failed at
   1559 * a seperate preceding set of accents (seperated from the rearranged on by
   1560 * at least a base character) then we rearrange the preceding accents and
   1561 * tries matching again.
   1562 * We allow skipping of the ends of the accent set if the ces do not match.
   1563 * However if the failure is found before the accent set, it fails.
   1564 * Internal method, status assumed to be success, caller has to check status
   1565 * before calling this method.
   1566 * @param strsrch string search data
   1567 * @param textoffset of the start of the rearranged accent
   1568 * @param status output error status if any
   1569 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1570 *         offset of the match. Note this start includes all preceding accents.
   1571 */
   1572 static
   1573 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1574                                        int32_t    textoffset,
   1575                                        UErrorCode    *status)
   1576 {
   1577     const UChar              *text           = strsrch->search->text;
   1578     const UCollator          *collator       = strsrch->collator;
   1579           int32_t             safelength     = 0;
   1580           UChar              *safetext;
   1581           int32_t             safetextlength;
   1582           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1583           UCollationElements *coleiter       = strsrch->utilIter;
   1584           int32_t         safeoffset     = textoffset;
   1585 
   1586     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1587                                          collator)) {
   1588         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1589         safelength     = textoffset - safeoffset;
   1590         safetextlength = INITIAL_ARRAY_SIZE_;
   1591         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1592                                          text + safeoffset, safelength,
   1593                                          strsrch->canonicalSuffixAccents,
   1594                                          status);
   1595     }
   1596     else {
   1597         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1598         safetext       = strsrch->canonicalSuffixAccents;
   1599     }
   1600 
   1601     // if status is a failure, ucol_setText does nothing
   1602     ucol_setText(coleiter, safetext, safetextlength, status);
   1603     // status checked in loop below
   1604 
   1605     int32_t  *ce        = strsrch->pattern.CE;
   1606     int32_t   celength  = strsrch->pattern.CELength;
   1607     int       ceindex   = celength - 1;
   1608     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1609 
   1610     while (ceindex >= 0) {
   1611         int32_t textce = ucol_previous(coleiter, status);
   1612         if (U_FAILURE(*status)) {
   1613             if (isSafe) {
   1614                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1615             }
   1616             return USEARCH_DONE;
   1617         }
   1618         if (textce == UCOL_NULLORDER) {
   1619             // check if we have passed the safe buffer
   1620             if (coleiter == strsrch->textIter) {
   1621                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1622                 return USEARCH_DONE;
   1623             }
   1624             cleanUpSafeText(strsrch, safetext, safebuffer);
   1625             safetext = safebuffer;
   1626             coleiter = strsrch->textIter;
   1627             setColEIterOffset(coleiter, safeoffset);
   1628             // status checked at the start of the loop
   1629             isSafe = FALSE;
   1630             continue;
   1631         }
   1632         textce = getCE(strsrch, textce);
   1633         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1634             // do the beginning stuff
   1635             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1636             if (isSafe && failedoffset >= safelength) {
   1637                 // alas... no hope. failed at rearranged accent set
   1638                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1639                 return USEARCH_DONE;
   1640             }
   1641             else {
   1642                 if (isSafe) {
   1643                     failedoffset += safeoffset;
   1644                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1645                 }
   1646 
   1647                 // try rearranging the front accents
   1648                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1649                                         failedoffset, textoffset, status);
   1650                 if (result != USEARCH_DONE) {
   1651                     // if status is a failure, ucol_setOffset does nothing
   1652                     setColEIterOffset(strsrch->textIter, result);
   1653                 }
   1654                 if (U_FAILURE(*status)) {
   1655                     return USEARCH_DONE;
   1656                 }
   1657                 return result;
   1658             }
   1659         }
   1660         if (textce == ce[ceindex]) {
   1661             ceindex --;
   1662         }
   1663     }
   1664     // set offset here
   1665     if (isSafe) {
   1666         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1667         // sets the text iterator here with the correct expansion and offset
   1668         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1669         cleanUpSafeText(strsrch, safetext, safebuffer);
   1670         if (result >= safelength) {
   1671             result = textoffset;
   1672         }
   1673         else {
   1674             result += safeoffset;
   1675         }
   1676         setColEIterOffset(strsrch->textIter, result);
   1677         strsrch->textIter->iteratordata_.toReturn =
   1678                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1679         return result;
   1680     }
   1681 
   1682     return ucol_getOffset(coleiter);
   1683 }
   1684 
   1685 /**
   1686 * Trying out the substring and sees if it can be a canonical match.
   1687 * This will try normalizing the end accents and arranging them into canonical
   1688 * equivalents and check their corresponding ces with the pattern ce.
   1689 * Suffix accents in the text will be grouped according to their combining
   1690 * class and the groups will be mixed and matched to try find the perfect
   1691 * match with the pattern.
   1692 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1693 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1694 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1695 *         "\u0301\u0325".
   1696 * step 2: check if any of the generated substrings matches the pattern.
   1697 * Internal method, status assumed to be success, caller has to check status
   1698 * before calling this method.
   1699 * @param strsrch string search data
   1700 * @param textoffset end offset in the collation element text that ends with
   1701 *                   the accents to be rearranged
   1702 * @param status error status if any
   1703 * @return TRUE if the match is valid, FALSE otherwise
   1704 */
   1705 static
   1706 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1707                            int32_t    textoffset,
   1708                            UErrorCode    *status)
   1709 {
   1710     const UChar       *text = strsrch->search->text;
   1711           int32_t  temp = textoffset;
   1712     U16_BACK_1(text, 0, temp);
   1713     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1714         UCollationElements *coleiter = strsrch->textIter;
   1715         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1716         if (strsrch->pattern.hasPrefixAccents) {
   1717             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1718                                                 status);
   1719             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1720                 setColEIterOffset(coleiter, offset);
   1721                 return TRUE;
   1722             }
   1723         }
   1724         return FALSE;
   1725     }
   1726 
   1727     if (!strsrch->pattern.hasSuffixAccents) {
   1728         return FALSE;
   1729     }
   1730 
   1731     UChar       accents[INITIAL_ARRAY_SIZE_];
   1732     // offset to the last base character in substring to search
   1733     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1734     // normalizing the offensive string
   1735     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1736                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1737     // status checked in loop below
   1738 
   1739     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1740     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1741 
   1742     // 2 power n - 1 plus the full set of accents
   1743     int32_t  count = (2 << (size - 1)) - 1;
   1744     while (U_SUCCESS(*status) && count > 0) {
   1745         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1746         // copy the base characters
   1747         for (int k = 0; k < accentsindex[0]; k ++) {
   1748             *rearrange ++ = accents[k];
   1749         }
   1750         // forming all possible canonical rearrangement by dropping
   1751         // sets of accents
   1752         for (int i = 0; i <= size - 1; i ++) {
   1753             int32_t mask = 1 << (size - i - 1);
   1754             if (count & mask) {
   1755                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1756                     *rearrange ++ = accents[j];
   1757                 }
   1758             }
   1759         }
   1760         *rearrange = 0;
   1761         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1762                                                         status);
   1763         if (offset != USEARCH_DONE) {
   1764             return TRUE; // match found
   1765         }
   1766         count --;
   1767     }
   1768     return FALSE;
   1769 }
   1770 
   1771 /**
   1772 * Gets the previous base character offset depending on the string search
   1773 * pattern data
   1774 * @param strsrch string search data
   1775 * @param textoffset current offset, current character
   1776 * @return the offset of the next character after this base character or itself
   1777 *         if it is a composed character with accents
   1778 */
   1779 static
   1780 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1781                                                       int32_t textoffset)
   1782 {
   1783     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1784         const UChar       *text = strsrch->search->text;
   1785               int32_t  offset = textoffset;
   1786         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1787                                                    SECOND_LAST_BYTE_SHIFT_) {
   1788             return getPreviousBaseOffset(text, textoffset);
   1789         }
   1790     }
   1791     return textoffset;
   1792 }
   1793 
   1794 /**
   1795 * Checks match for contraction.
   1796 * If the match ends with a partial contraction we fail.
   1797 * If the match starts too far off (because of backwards iteration) we try to
   1798 * chip off the extra characters
   1799 * Internal method, status assumed to be success, caller has to check status
   1800 * before calling this method.
   1801 * @param strsrch string search data
   1802 * @param start offset of potential match, to be modified if necessary
   1803 * @param end offset of potential match, to be modified if necessary
   1804 * @param status output error status if any
   1805 * @return TRUE if match passes the contraction test, FALSE otherwise
   1806 */
   1807 static
   1808 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1809                                          int32_t   *start,
   1810                                          int32_t   *end,
   1811                                          UErrorCode    *status)
   1812 {
   1813           UCollationElements *coleiter   = strsrch->textIter;
   1814           int32_t             textlength = strsrch->search->textLength;
   1815           int32_t         temp       = *start;
   1816     const UCollator          *collator   = strsrch->collator;
   1817     const UChar              *text       = strsrch->search->text;
   1818     // This part checks if either ends of the match contains potential
   1819     // contraction. If so we'll have to iterate through them
   1820     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1821         (*start + 1 < textlength
   1822          && ucol_unsafeCP(text[*start + 1], collator))) {
   1823         int32_t expansion  = getExpansionPrefix(coleiter);
   1824         UBool   expandflag = expansion > 0;
   1825         setColEIterOffset(coleiter, *start);
   1826         while (expansion > 0) {
   1827             // getting rid of the redundant ce, caused by setOffset.
   1828             // since backward contraction/expansion may have extra ces if we
   1829             // are in the normalization buffer, hasAccentsBeforeMatch would
   1830             // have taken care of it.
   1831             // E.g. the character \u01FA will have an expansion of 3, but if
   1832             // we are only looking for acute and ring \u030A and \u0301, we'll
   1833             // have to skip the first ce in the expansion buffer.
   1834             ucol_next(coleiter, status);
   1835             if (U_FAILURE(*status)) {
   1836                 return FALSE;
   1837             }
   1838             if (ucol_getOffset(coleiter) != temp) {
   1839                 *start = temp;
   1840                 temp  = ucol_getOffset(coleiter);
   1841             }
   1842             expansion --;
   1843         }
   1844 
   1845         int32_t  *patternce       = strsrch->pattern.CE;
   1846         int32_t   patterncelength = strsrch->pattern.CELength;
   1847         int32_t   count           = 0;
   1848         int32_t   textlength      = strsrch->search->textLength;
   1849         while (count < patterncelength) {
   1850             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1851             // status checked below, note that if status is a failure
   1852             // ucol_next returns UCOL_NULLORDER
   1853             if (ce == UCOL_IGNORABLE) {
   1854                 continue;
   1855             }
   1856             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1857                 *start = temp;
   1858                 temp   = ucol_getOffset(coleiter);
   1859             }
   1860 
   1861             if (count == 0 && ce != patternce[0]) {
   1862                 // accents may have extra starting ces, this occurs when a
   1863                 // pure accent pattern is matched without rearrangement
   1864                 // text \u0325\u0300 and looking for \u0300
   1865                 int32_t expected = patternce[0];
   1866                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1867                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1868                     while (U_SUCCESS(*status) && ce != expected &&
   1869                            ce != UCOL_NULLORDER &&
   1870                            ucol_getOffset(coleiter) <= *end) {
   1871                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1872                     }
   1873                 }
   1874             }
   1875             if (U_FAILURE(*status) || ce != patternce[count]) {
   1876                 (*end) ++;
   1877                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1878                 return FALSE;
   1879             }
   1880             count ++;
   1881         }
   1882     }
   1883     return TRUE;
   1884 }
   1885 
   1886 /**
   1887 * Checks and sets the match information if found.
   1888 * Checks
   1889 * <ul>
   1890 * <li> the potential match does not repeat the previous match
   1891 * <li> boundaries are correct
   1892 * <li> potential match does not end in the middle of a contraction
   1893 * <li> identical matches
   1894 * <\ul>
   1895 * Otherwise the offset will be shifted to the next character.
   1896 * Internal method, status assumed to be success, caller has to check the
   1897 * status before calling this method.
   1898 * @param strsrch string search data
   1899 * @param textoffset offset in the collation element text. the returned value
   1900 *        will be the truncated end offset of the match or the new start
   1901 *        search offset.
   1902 * @param status output error status if any
   1903 * @return TRUE if the match is valid, FALSE otherwise
   1904 */
   1905 static
   1906 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1907                                      int32_t   *textoffset,
   1908                                      UErrorCode    *status)
   1909 {
   1910     // to ensure that the start and ends are not composite characters
   1911     UCollationElements *coleiter = strsrch->textIter;
   1912     // if we have a canonical accent match
   1913     if ((strsrch->pattern.hasSuffixAccents &&
   1914         strsrch->canonicalSuffixAccents[0]) ||
   1915         (strsrch->pattern.hasPrefixAccents &&
   1916         strsrch->canonicalPrefixAccents[0])) {
   1917         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1918                                                     strsrch,
   1919                                                     ucol_getOffset(coleiter));
   1920         strsrch->search->matchedLength = *textoffset -
   1921                                                 strsrch->search->matchedIndex;
   1922         return TRUE;
   1923     }
   1924 
   1925     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1926     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1927                                             status) || U_FAILURE(*status)) {
   1928         return FALSE;
   1929     }
   1930 
   1931     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1932     // this totally matches, however we need to check if it is repeating
   1933     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1934         !isBreakUnit(strsrch, start, *textoffset) ||
   1935         !checkIdentical(strsrch, start, *textoffset)) {
   1936         (*textoffset) ++;
   1937         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1938                                         strsrch->search->textLength);
   1939         return FALSE;
   1940     }
   1941 
   1942     strsrch->search->matchedIndex  = start;
   1943     strsrch->search->matchedLength = *textoffset - start;
   1944     return TRUE;
   1945 }
   1946 
   1947 /**
   1948 * Shifting the collation element iterator position forward to prepare for
   1949 * a preceding match. If the first character is a unsafe character, we'll only
   1950 * shift by 1 to capture contractions, normalization etc.
   1951 * Internal method, status assumed to be success, caller has to check status
   1952 * before calling this method.
   1953 * @param text strsrch string search data
   1954 * @param textoffset start text position to do search
   1955 * @param ce the text ce which failed the match.
   1956 * @param patternceindex index of the ce within the pattern ce buffer which
   1957 *        failed the match
   1958 * @return final offset
   1959 */
   1960 static
   1961 inline int32_t reverseShift(UStringSearch *strsrch,
   1962                                 int32_t    textoffset,
   1963                                 int32_t       ce,
   1964                                 int32_t        patternceindex)
   1965 {
   1966     if (strsrch->search->isOverlap) {
   1967         if (textoffset != strsrch->search->textLength) {
   1968             textoffset --;
   1969         }
   1970         else {
   1971             textoffset -= strsrch->pattern.defaultShiftSize;
   1972         }
   1973     }
   1974     else {
   1975         if (ce != UCOL_NULLORDER) {
   1976             int32_t shift = strsrch->pattern.backShift[hash(ce)];
   1977 
   1978             // this is to adjust for characters in the middle of the substring
   1979             // for matching that failed.
   1980             int32_t adjust = patternceindex;
   1981             if (adjust > 1 && shift > adjust) {
   1982                 shift -= adjust - 1;
   1983             }
   1984             textoffset -= shift;
   1985         }
   1986         else {
   1987             textoffset -= strsrch->pattern.defaultShiftSize;
   1988         }
   1989     }
   1990     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   1991     return textoffset;
   1992 }
   1993 
   1994 /**
   1995 * Checks match for contraction.
   1996 * If the match starts with a partial contraction we fail.
   1997 * Internal method, status assumed to be success, caller has to check status
   1998 * before calling this method.
   1999 * @param strsrch string search data
   2000 * @param start offset of potential match, to be modified if necessary
   2001 * @param end offset of potential match, to be modified if necessary
   2002 * @param status output error status if any
   2003 * @return TRUE if match passes the contraction test, FALSE otherwise
   2004 */
   2005 static
   2006 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2007                                      int32_t   *start,
   2008                                      int32_t   *end, UErrorCode  *status)
   2009 {
   2010           UCollationElements *coleiter   = strsrch->textIter;
   2011           int32_t             textlength = strsrch->search->textLength;
   2012           int32_t             temp       = *end;
   2013     const UCollator          *collator   = strsrch->collator;
   2014     const UChar              *text       = strsrch->search->text;
   2015     // This part checks if either if the start of the match contains potential
   2016     // contraction. If so we'll have to iterate through them
   2017     // Since we used ucol_next while previously looking for the potential
   2018     // match, this guarantees that our end will not be a partial contraction,
   2019     // or a partial supplementary character.
   2020     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2021         int32_t expansion  = getExpansionSuffix(coleiter);
   2022         UBool   expandflag = expansion > 0;
   2023         setColEIterOffset(coleiter, *end);
   2024         while (U_SUCCESS(*status) && expansion > 0) {
   2025             // getting rid of the redundant ce
   2026             // since forward contraction/expansion may have extra ces
   2027             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2028             // would have taken care of it.
   2029             // E.g. the character \u01FA will have an expansion of 3, but if
   2030             // we are only looking for A ring A\u030A, we'll have to skip the
   2031             // last ce in the expansion buffer
   2032             ucol_previous(coleiter, status);
   2033             if (U_FAILURE(*status)) {
   2034                 return FALSE;
   2035             }
   2036             if (ucol_getOffset(coleiter) != temp) {
   2037                 *end = temp;
   2038                 temp  = ucol_getOffset(coleiter);
   2039             }
   2040             expansion --;
   2041         }
   2042 
   2043         int32_t  *patternce       = strsrch->pattern.CE;
   2044         int32_t   patterncelength = strsrch->pattern.CELength;
   2045         int32_t   count           = patterncelength;
   2046         while (count > 0) {
   2047             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2048             // status checked below, note that if status is a failure
   2049             // ucol_previous returns UCOL_NULLORDER
   2050             if (ce == UCOL_IGNORABLE) {
   2051                 continue;
   2052             }
   2053             if (expandflag && count == 0 &&
   2054                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2055                 *end = temp;
   2056                 temp  = ucol_getOffset(coleiter);
   2057             }
   2058             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2059                 (*start) --;
   2060                 *start = getPreviousBaseOffset(text, *start);
   2061                 return FALSE;
   2062             }
   2063             count --;
   2064         }
   2065     }
   2066     return TRUE;
   2067 }
   2068 
   2069 /**
   2070 * Checks and sets the match information if found.
   2071 * Checks
   2072 * <ul>
   2073 * <li> the current match does not repeat the last match
   2074 * <li> boundaries are correct
   2075 * <li> exact matches has no extra accents
   2076 * <li> identical matches
   2077 * <\ul>
   2078 * Otherwise the offset will be shifted to the preceding character.
   2079 * Internal method, status assumed to be success, caller has to check status
   2080 * before calling this method.
   2081 * @param strsrch string search data
   2082 * @param collator
   2083 * @param coleiter collation element iterator
   2084 * @param text string
   2085 * @param textoffset offset in the collation element text. the returned value
   2086 *        will be the truncated start offset of the match or the new start
   2087 *        search offset.
   2088 * @param status output error status if any
   2089 * @return TRUE if the match is valid, FALSE otherwise
   2090 */
   2091 static
   2092 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2093                                      int32_t   *textoffset,
   2094                                      UErrorCode    *status)
   2095 {
   2096     // to ensure that the start and ends are not composite characters
   2097     int32_t end = ucol_getOffset(strsrch->textIter);
   2098     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2099         || U_FAILURE(*status)) {
   2100             return FALSE;
   2101     }
   2102 
   2103     // this totally matches, however we need to check if it is repeating
   2104     // the old match
   2105     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2106         !isBreakUnit(strsrch, *textoffset, end) ||
   2107         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2108         !checkIdentical(strsrch, *textoffset, end) ||
   2109         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2110         (*textoffset) --;
   2111         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2112                                             *textoffset);
   2113         return FALSE;
   2114     }
   2115 
   2116     //Add breakiterator boundary check for primary strength search.
   2117     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2118         checkBreakBoundary(strsrch, textoffset, &end);
   2119     }
   2120 
   2121     strsrch->search->matchedIndex = *textoffset;
   2122     strsrch->search->matchedLength = end - *textoffset;
   2123     return TRUE;
   2124 }
   2125 
   2126 /**
   2127 * Rearranges the end accents to try matching.
   2128 * Suffix accents in the text will be grouped according to their combining
   2129 * class and the groups will be mixed and matched to try find the perfect
   2130 * match with the pattern.
   2131 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2132 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2133 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2134 *         "\u0301\u0325".
   2135 * step 2: check if any of the generated substrings matches the pattern.
   2136 * Internal method, status assumed to be success, user has to check status
   2137 * before calling this method.
   2138 * @param strsrch string search match
   2139 * @param start offset of the first base character
   2140 * @param end start of the last accent set
   2141 * @param status only error status if any
   2142 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2143 *         offset of the match. Note this start includes all following accents.
   2144 */
   2145 static
   2146 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2147                                            int32_t    start,
   2148                                            int32_t    end,
   2149                                            UErrorCode    *status)
   2150 {
   2151     const UChar       *text       = strsrch->search->text;
   2152           int32_t  tempend    = end;
   2153 
   2154     U16_BACK_1(text, 0, tempend);
   2155     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2156                                                            LAST_BYTE_MASK_)) {
   2157         // die... failed at a base character
   2158         return USEARCH_DONE;
   2159     }
   2160     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2161 
   2162     if (U_SUCCESS(*status)) {
   2163         UChar       accents[INITIAL_ARRAY_SIZE_];
   2164         int32_t offset = getPreviousBaseOffset(text, end);
   2165         // normalizing the offensive string
   2166         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2167                         INITIAL_ARRAY_SIZE_, status);
   2168 
   2169         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2170         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2171                                                          accentsindex);
   2172         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2173         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2174         UCollationElements *coleiter = strsrch->utilIter;
   2175         while (U_SUCCESS(*status) && count > 0) {
   2176             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2177             // copy the base characters
   2178             for (int k = 0; k < accentsindex[0]; k ++) {
   2179                 *rearrange ++ = accents[k];
   2180             }
   2181             // forming all possible canonical rearrangement by dropping
   2182             // sets of accents
   2183             for (int i = 0; i <= accentsize - 1; i ++) {
   2184                 int32_t mask = 1 << (accentsize - i - 1);
   2185                 if (count & mask) {
   2186                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2187                         *rearrange ++ = accents[j];
   2188                     }
   2189                 }
   2190             }
   2191             *rearrange = 0;
   2192             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2193             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2194                                            strsrch->canonicalPrefixAccents,
   2195                                            strsrch->search->text + start,
   2196                                            offset - start,
   2197                                            strsrch->canonicalSuffixAccents,
   2198                                            status);
   2199 
   2200             // run the collator iterator through this match
   2201             // if status is a failure ucol_setText does nothing
   2202             ucol_setText(coleiter, match, matchsize, status);
   2203             if (U_SUCCESS(*status)) {
   2204                 if (checkCollationMatch(strsrch, coleiter)) {
   2205                     if (match != buffer) {
   2206                         uprv_free(match);
   2207                     }
   2208                     return end;
   2209                 }
   2210             }
   2211             count --;
   2212         }
   2213     }
   2214     return USEARCH_DONE;
   2215 }
   2216 
   2217 /**
   2218 * Take the rearranged start accents and tries matching. If match failed at
   2219 * a seperate following set of accents (seperated from the rearranged on by
   2220 * at least a base character) then we rearrange the preceding accents and
   2221 * tries matching again.
   2222 * We allow skipping of the ends of the accent set if the ces do not match.
   2223 * However if the failure is found before the accent set, it fails.
   2224 * Internal method, status assumed to be success, caller has to check status
   2225 * before calling this method.
   2226 * @param strsrch string search data
   2227 * @param textoffset of the ends of the rearranged accent
   2228 * @param status output error status if any
   2229 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2230 *         offset of the match. Note this start includes all following accents.
   2231 */
   2232 static
   2233 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2234                                            int32_t    textoffset,
   2235                                            UErrorCode    *status)
   2236 {
   2237     const UChar       *text       = strsrch->search->text;
   2238     const UCollator   *collator   = strsrch->collator;
   2239           int32_t      safelength = 0;
   2240           UChar       *safetext;
   2241           int32_t      safetextlength;
   2242           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2243           int32_t  safeoffset = textoffset;
   2244 
   2245     if (textoffset &&
   2246         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2247                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2248                                          ], collator)) {
   2249         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2250                                            strsrch->search->textLength);
   2251         safelength     = safeoffset - textoffset;
   2252         safetextlength = INITIAL_ARRAY_SIZE_;
   2253         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2254                                          strsrch->canonicalPrefixAccents,
   2255                                          text + textoffset, safelength,
   2256                                          NULL, status);
   2257     }
   2258     else {
   2259         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2260         safetext       = strsrch->canonicalPrefixAccents;
   2261     }
   2262 
   2263     UCollationElements *coleiter = strsrch->utilIter;
   2264      // if status is a failure, ucol_setText does nothing
   2265     ucol_setText(coleiter, safetext, safetextlength, status);
   2266     // status checked in loop below
   2267 
   2268     int32_t  *ce           = strsrch->pattern.CE;
   2269     int32_t   celength     = strsrch->pattern.CELength;
   2270     int       ceindex      = 0;
   2271     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2272     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2273 
   2274     while (ceindex < celength) {
   2275         int32_t textce = ucol_next(coleiter, status);
   2276         if (U_FAILURE(*status)) {
   2277             if (isSafe) {
   2278                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2279             }
   2280             return USEARCH_DONE;
   2281         }
   2282         if (textce == UCOL_NULLORDER) {
   2283             // check if we have passed the safe buffer
   2284             if (coleiter == strsrch->textIter) {
   2285                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2286                 return USEARCH_DONE;
   2287             }
   2288             cleanUpSafeText(strsrch, safetext, safebuffer);
   2289             safetext = safebuffer;
   2290             coleiter = strsrch->textIter;
   2291             setColEIterOffset(coleiter, safeoffset);
   2292             // status checked at the start of the loop
   2293             isSafe = FALSE;
   2294             continue;
   2295         }
   2296         textce = getCE(strsrch, textce);
   2297         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2298             // do the beginning stuff
   2299             int32_t failedoffset = ucol_getOffset(coleiter);
   2300             if (isSafe && failedoffset <= prefixlength) {
   2301                 // alas... no hope. failed at rearranged accent set
   2302                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2303                 return USEARCH_DONE;
   2304             }
   2305             else {
   2306                 if (isSafe) {
   2307                     failedoffset = safeoffset - failedoffset;
   2308                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2309                 }
   2310 
   2311                 // try rearranging the end accents
   2312                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2313                                         textoffset, failedoffset, status);
   2314                 if (result != USEARCH_DONE) {
   2315                     // if status is a failure, ucol_setOffset does nothing
   2316                     setColEIterOffset(strsrch->textIter, result);
   2317                 }
   2318                 if (U_FAILURE(*status)) {
   2319                     return USEARCH_DONE;
   2320                 }
   2321                 return result;
   2322             }
   2323         }
   2324         if (textce == ce[ceindex]) {
   2325             ceindex ++;
   2326         }
   2327     }
   2328     // set offset here
   2329     if (isSafe) {
   2330         int32_t result      = ucol_getOffset(coleiter);
   2331         // sets the text iterator here with the correct expansion and offset
   2332         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2333         cleanUpSafeText(strsrch, safetext, safebuffer);
   2334         if (result <= prefixlength) {
   2335             result = textoffset;
   2336         }
   2337         else {
   2338             result = textoffset + (safeoffset - result);
   2339         }
   2340         setColEIterOffset(strsrch->textIter, result);
   2341         setExpansionSuffix(strsrch->textIter, leftoverces);
   2342         return result;
   2343     }
   2344 
   2345     return ucol_getOffset(coleiter);
   2346 }
   2347 
   2348 /**
   2349 * Trying out the substring and sees if it can be a canonical match.
   2350 * This will try normalizing the starting accents and arranging them into
   2351 * canonical equivalents and check their corresponding ces with the pattern ce.
   2352 * Prefix accents in the text will be grouped according to their combining
   2353 * class and the groups will be mixed and matched to try find the perfect
   2354 * match with the pattern.
   2355 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2356 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2357 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2358 *         "\u0301\u0325".
   2359 * step 2: check if any of the generated substrings matches the pattern.
   2360 * Internal method, status assumed to be success, caller has to check status
   2361 * before calling this method.
   2362 * @param strsrch string search data
   2363 * @param textoffset start offset in the collation element text that starts
   2364 *                   with the accents to be rearranged
   2365 * @param status output error status if any
   2366 * @return TRUE if the match is valid, FALSE otherwise
   2367 */
   2368 static
   2369 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2370                                int32_t    textoffset,
   2371                                UErrorCode    *status)
   2372 {
   2373     const UChar       *text       = strsrch->search->text;
   2374           int32_t  temp       = textoffset;
   2375           int32_t      textlength = strsrch->search->textLength;
   2376     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2377         UCollationElements *coleiter = strsrch->textIter;
   2378         int32_t         offset   = ucol_getOffset(coleiter);
   2379         if (strsrch->pattern.hasSuffixAccents) {
   2380             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2381                                                     offset, status);
   2382             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2383                 setColEIterOffset(coleiter, offset);
   2384                 return TRUE;
   2385             }
   2386         }
   2387         return FALSE;
   2388     }
   2389 
   2390     if (!strsrch->pattern.hasPrefixAccents) {
   2391         return FALSE;
   2392     }
   2393 
   2394     UChar       accents[INITIAL_ARRAY_SIZE_];
   2395     // offset to the last base character in substring to search
   2396     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2397     // normalizing the offensive string
   2398     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2399                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2400     // status checked in loop
   2401 
   2402     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2403     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2404 
   2405     // 2 power n - 1 plus the full set of accents
   2406     int32_t  count = (2 << (size - 1)) - 1;
   2407     while (U_SUCCESS(*status) && count > 0) {
   2408         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2409         // copy the base characters
   2410         for (int k = 0; k < accentsindex[0]; k ++) {
   2411             *rearrange ++ = accents[k];
   2412         }
   2413         // forming all possible canonical rearrangement by dropping
   2414         // sets of accents
   2415         for (int i = 0; i <= size - 1; i ++) {
   2416             int32_t mask = 1 << (size - i - 1);
   2417             if (count & mask) {
   2418                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2419                     *rearrange ++ = accents[j];
   2420                 }
   2421             }
   2422         }
   2423         *rearrange = 0;
   2424         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2425                                                           baseoffset, status);
   2426         if (offset != USEARCH_DONE) {
   2427             return TRUE; // match found
   2428         }
   2429         count --;
   2430     }
   2431     return FALSE;
   2432 }
   2433 
   2434 /**
   2435 * Checks match for contraction.
   2436 * If the match starts with a partial contraction we fail.
   2437 * Internal method, status assumed to be success, caller has to check status
   2438 * before calling this method.
   2439 * @param strsrch string search data
   2440 * @param start offset of potential match, to be modified if necessary
   2441 * @param end offset of potential match, to be modified if necessary
   2442 * @param status only error status if any
   2443 * @return TRUE if match passes the contraction test, FALSE otherwise
   2444 */
   2445 static
   2446 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2447                                      int32_t   *start,
   2448                                      int32_t   *end, UErrorCode  *status)
   2449 {
   2450           UCollationElements *coleiter   = strsrch->textIter;
   2451           int32_t             textlength = strsrch->search->textLength;
   2452           int32_t         temp       = *end;
   2453     const UCollator          *collator   = strsrch->collator;
   2454     const UChar              *text       = strsrch->search->text;
   2455     // This part checks if either if the start of the match contains potential
   2456     // contraction. If so we'll have to iterate through them
   2457     // Since we used ucol_next while previously looking for the potential
   2458     // match, this guarantees that our end will not be a partial contraction,
   2459     // or a partial supplementary character.
   2460     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2461         int32_t expansion  = getExpansionSuffix(coleiter);
   2462         UBool   expandflag = expansion > 0;
   2463         setColEIterOffset(coleiter, *end);
   2464         while (expansion > 0) {
   2465             // getting rid of the redundant ce
   2466             // since forward contraction/expansion may have extra ces
   2467             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2468             // would have taken care of it.
   2469             // E.g. the character \u01FA will have an expansion of 3, but if
   2470             // we are only looking for A ring A\u030A, we'll have to skip the
   2471             // last ce in the expansion buffer
   2472             ucol_previous(coleiter, status);
   2473             if (U_FAILURE(*status)) {
   2474                 return FALSE;
   2475             }
   2476             if (ucol_getOffset(coleiter) != temp) {
   2477                 *end = temp;
   2478                 temp  = ucol_getOffset(coleiter);
   2479             }
   2480             expansion --;
   2481         }
   2482 
   2483         int32_t  *patternce       = strsrch->pattern.CE;
   2484         int32_t   patterncelength = strsrch->pattern.CELength;
   2485         int32_t   count           = patterncelength;
   2486         while (count > 0) {
   2487             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2488             // status checked below, note that if status is a failure
   2489             // ucol_previous returns UCOL_NULLORDER
   2490             if (ce == UCOL_IGNORABLE) {
   2491                 continue;
   2492             }
   2493             if (expandflag && count == 0 &&
   2494                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2495                 *end = temp;
   2496                 temp  = ucol_getOffset(coleiter);
   2497             }
   2498             if (count == patterncelength &&
   2499                 ce != patternce[patterncelength - 1]) {
   2500                 // accents may have extra starting ces, this occurs when a
   2501                 // pure accent pattern is matched without rearrangement
   2502                 int32_t    expected = patternce[patterncelength - 1];
   2503                 U16_BACK_1(text, 0, *end);
   2504                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2505                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2506                     while (U_SUCCESS(*status) && ce != expected &&
   2507                            ce != UCOL_NULLORDER &&
   2508                            ucol_getOffset(coleiter) <= *start) {
   2509                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2510                     }
   2511                 }
   2512             }
   2513             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2514                 (*start) --;
   2515                 *start = getPreviousBaseOffset(text, *start);
   2516                 return FALSE;
   2517             }
   2518             count --;
   2519         }
   2520     }
   2521     return TRUE;
   2522 }
   2523 
   2524 /**
   2525 * Checks and sets the match information if found.
   2526 * Checks
   2527 * <ul>
   2528 * <li> the potential match does not repeat the previous match
   2529 * <li> boundaries are correct
   2530 * <li> potential match does not end in the middle of a contraction
   2531 * <li> identical matches
   2532 * <\ul>
   2533 * Otherwise the offset will be shifted to the next character.
   2534 * Internal method, status assumed to be success, caller has to check status
   2535 * before calling this method.
   2536 * @param strsrch string search data
   2537 * @param textoffset offset in the collation element text. the returned value
   2538 *        will be the truncated start offset of the match or the new start
   2539 *        search offset.
   2540 * @param status only error status if any
   2541 * @return TRUE if the match is valid, FALSE otherwise
   2542 */
   2543 static
   2544 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2545                                          int32_t   *textoffset,
   2546                                          UErrorCode    *status)
   2547 {
   2548     // to ensure that the start and ends are not composite characters
   2549     UCollationElements *coleiter = strsrch->textIter;
   2550     // if we have a canonical accent match
   2551     if ((strsrch->pattern.hasSuffixAccents &&
   2552         strsrch->canonicalSuffixAccents[0]) ||
   2553         (strsrch->pattern.hasPrefixAccents &&
   2554         strsrch->canonicalPrefixAccents[0])) {
   2555         strsrch->search->matchedIndex  = *textoffset;
   2556         strsrch->search->matchedLength =
   2557             getNextUStringSearchBaseOffset(strsrch,
   2558                                       getColElemIterOffset(coleiter, FALSE))
   2559             - *textoffset;
   2560         return TRUE;
   2561     }
   2562 
   2563     int32_t end = ucol_getOffset(coleiter);
   2564     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2565                                                 status) ||
   2566          U_FAILURE(*status)) {
   2567         return FALSE;
   2568     }
   2569 
   2570     end = getNextUStringSearchBaseOffset(strsrch, end);
   2571     // this totally matches, however we need to check if it is repeating
   2572     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2573         !isBreakUnit(strsrch, *textoffset, end) ||
   2574         !checkIdentical(strsrch, *textoffset, end)) {
   2575         (*textoffset) --;
   2576         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2577                                             *textoffset);
   2578         return FALSE;
   2579     }
   2580 
   2581     strsrch->search->matchedIndex  = *textoffset;
   2582     strsrch->search->matchedLength = end - *textoffset;
   2583     return TRUE;
   2584 }
   2585 #endif // #if BOYER_MOORE
   2586 
   2587 // constructors and destructor -------------------------------------------
   2588 
   2589 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2590                                           int32_t         patternlength,
   2591                                     const UChar          *text,
   2592                                           int32_t         textlength,
   2593                                     const char           *locale,
   2594                                           UBreakIterator *breakiter,
   2595                                           UErrorCode     *status)
   2596 {
   2597     if (U_FAILURE(*status)) {
   2598         return NULL;
   2599     }
   2600 #if UCONFIG_NO_BREAK_ITERATION
   2601     if (breakiter != NULL) {
   2602         *status = U_UNSUPPORTED_ERROR;
   2603         return NULL;
   2604     }
   2605 #endif
   2606     if (locale) {
   2607         // ucol_open internally checks for status
   2608         UCollator     *collator = ucol_open(locale, status);
   2609         // pattern, text checks are done in usearch_openFromCollator
   2610         UStringSearch *result   = usearch_openFromCollator(pattern,
   2611                                               patternlength, text, textlength,
   2612                                               collator, breakiter, status);
   2613 
   2614         if (result == NULL || U_FAILURE(*status)) {
   2615             if (collator) {
   2616                 ucol_close(collator);
   2617             }
   2618             return NULL;
   2619         }
   2620         else {
   2621             result->ownCollator = TRUE;
   2622         }
   2623         return result;
   2624     }
   2625     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2626     return NULL;
   2627 }
   2628 
   2629 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2630                                   const UChar          *pattern,
   2631                                         int32_t         patternlength,
   2632                                   const UChar          *text,
   2633                                         int32_t         textlength,
   2634                                   const UCollator      *collator,
   2635                                         UBreakIterator *breakiter,
   2636                                         UErrorCode     *status)
   2637 {
   2638     if (U_FAILURE(*status)) {
   2639         return NULL;
   2640     }
   2641 #if UCONFIG_NO_BREAK_ITERATION
   2642     if (breakiter != NULL) {
   2643         *status = U_UNSUPPORTED_ERROR;
   2644         return NULL;
   2645     }
   2646 #endif
   2647     if (pattern == NULL || text == NULL || collator == NULL) {
   2648         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2649         return NULL;
   2650     }
   2651 
   2652     // string search does not really work when numeric collation is turned on
   2653     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2654         *status = U_UNSUPPORTED_ERROR;
   2655         return NULL;
   2656     }
   2657 
   2658     if (U_SUCCESS(*status)) {
   2659         initializeFCD(status);
   2660         if (U_FAILURE(*status)) {
   2661             return NULL;
   2662         }
   2663 
   2664         UStringSearch *result;
   2665         if (textlength == -1) {
   2666             textlength = u_strlen(text);
   2667         }
   2668         if (patternlength == -1) {
   2669             patternlength = u_strlen(pattern);
   2670         }
   2671         if (textlength <= 0 || patternlength <= 0) {
   2672             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2673             return NULL;
   2674         }
   2675 
   2676         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2677         if (result == NULL) {
   2678             *status = U_MEMORY_ALLOCATION_ERROR;
   2679             return NULL;
   2680         }
   2681 
   2682         result->collator    = collator;
   2683         result->strength    = ucol_getStrength(collator);
   2684         result->ceMask      = getMask(result->strength);
   2685         result->toShift     =
   2686              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2687                                                             UCOL_SHIFTED;
   2688         result->variableTop = ucol_getVariableTop(collator, status);
   2689 
   2690         result->nfd         = Normalizer2Factory::getNFDInstance(*status);
   2691 
   2692         if (U_FAILURE(*status)) {
   2693             uprv_free(result);
   2694             return NULL;
   2695         }
   2696 
   2697         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2698         if (result->search == NULL) {
   2699             *status = U_MEMORY_ALLOCATION_ERROR;
   2700             uprv_free(result);
   2701             return NULL;
   2702         }
   2703 
   2704         result->search->text       = text;
   2705         result->search->textLength = textlength;
   2706 
   2707         result->pattern.text       = pattern;
   2708         result->pattern.textLength = patternlength;
   2709         result->pattern.CE         = NULL;
   2710         result->pattern.PCE        = NULL;
   2711 
   2712         result->search->breakIter  = breakiter;
   2713 #if !UCONFIG_NO_BREAK_ITERATION
   2714         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2715         if (breakiter) {
   2716             ubrk_setText(breakiter, text, textlength, status);
   2717         }
   2718 #endif
   2719 
   2720         result->ownCollator           = FALSE;
   2721         result->search->matchedLength = 0;
   2722         result->search->matchedIndex  = USEARCH_DONE;
   2723         result->utilIter              = NULL;
   2724         result->textIter              = ucol_openElements(collator, text,
   2725                                                           textlength, status);
   2726         result->textProcessedIter     = NULL;
   2727         if (U_FAILURE(*status)) {
   2728             usearch_close(result);
   2729             return NULL;
   2730         }
   2731 
   2732         result->search->isOverlap          = FALSE;
   2733         result->search->isCanonicalMatch   = FALSE;
   2734         result->search->elementComparisonType = 0;
   2735         result->search->isForwardSearching = TRUE;
   2736         result->search->reset              = TRUE;
   2737 
   2738         initialize(result, status);
   2739 
   2740         if (U_FAILURE(*status)) {
   2741             usearch_close(result);
   2742             return NULL;
   2743         }
   2744 
   2745         return result;
   2746     }
   2747     return NULL;
   2748 }
   2749 
   2750 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2751 {
   2752     if (strsrch) {
   2753         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
   2754             strsrch->pattern.CE) {
   2755             uprv_free(strsrch->pattern.CE);
   2756         }
   2757 
   2758         if (strsrch->pattern.PCE != NULL &&
   2759             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
   2760             uprv_free(strsrch->pattern.PCE);
   2761         }
   2762 
   2763         delete strsrch->textProcessedIter;
   2764         ucol_closeElements(strsrch->textIter);
   2765         ucol_closeElements(strsrch->utilIter);
   2766 
   2767         if (strsrch->ownCollator && strsrch->collator) {
   2768             ucol_close((UCollator *)strsrch->collator);
   2769         }
   2770 
   2771 #if !UCONFIG_NO_BREAK_ITERATION
   2772         if (strsrch->search->internalBreakIter) {
   2773             ubrk_close(strsrch->search->internalBreakIter);
   2774         }
   2775 #endif
   2776 
   2777         uprv_free(strsrch->search);
   2778         uprv_free(strsrch);
   2779     }
   2780 }
   2781 
   2782 namespace {
   2783 
   2784 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
   2785     if (U_FAILURE(*status)) { return FALSE; }
   2786     if (strsrch->textProcessedIter == NULL) {
   2787         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
   2788         if (strsrch->textProcessedIter == NULL) {
   2789             *status = U_MEMORY_ALLOCATION_ERROR;
   2790             return FALSE;
   2791         }
   2792     } else {
   2793         strsrch->textProcessedIter->init(strsrch->textIter);
   2794     }
   2795     return TRUE;
   2796 }
   2797 
   2798 }
   2799 
   2800 // set and get methods --------------------------------------------------
   2801 
   2802 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2803                                         int32_t    position,
   2804                                         UErrorCode    *status)
   2805 {
   2806     if (U_SUCCESS(*status) && strsrch) {
   2807         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2808             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2809         }
   2810         else {
   2811             setColEIterOffset(strsrch->textIter, position);
   2812         }
   2813         strsrch->search->matchedIndex  = USEARCH_DONE;
   2814         strsrch->search->matchedLength = 0;
   2815         strsrch->search->reset         = FALSE;
   2816     }
   2817 }
   2818 
   2819 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2820 {
   2821     if (strsrch) {
   2822         int32_t result = ucol_getOffset(strsrch->textIter);
   2823         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2824             return USEARCH_DONE;
   2825         }
   2826         return result;
   2827     }
   2828     return USEARCH_DONE;
   2829 }
   2830 
   2831 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2832                                  USearchAttribute attribute,
   2833                                  USearchAttributeValue value,
   2834                                  UErrorCode *status)
   2835 {
   2836     if (U_SUCCESS(*status) && strsrch) {
   2837         switch (attribute)
   2838         {
   2839         case USEARCH_OVERLAP :
   2840             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2841             break;
   2842         case USEARCH_CANONICAL_MATCH :
   2843             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2844                                                                       FALSE);
   2845             break;
   2846         case USEARCH_ELEMENT_COMPARISON :
   2847             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2848                 strsrch->search->elementComparisonType = (int16_t)value;
   2849             } else {
   2850                 strsrch->search->elementComparisonType = 0;
   2851             }
   2852             break;
   2853         case USEARCH_ATTRIBUTE_COUNT :
   2854         default:
   2855             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2856         }
   2857     }
   2858     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2859         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2860     }
   2861 }
   2862 
   2863 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2864                                                 const UStringSearch *strsrch,
   2865                                                 USearchAttribute attribute)
   2866 {
   2867     if (strsrch) {
   2868         switch (attribute) {
   2869         case USEARCH_OVERLAP :
   2870             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2871                                                         USEARCH_OFF);
   2872         case USEARCH_CANONICAL_MATCH :
   2873             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2874                                                                USEARCH_OFF);
   2875         case USEARCH_ELEMENT_COMPARISON :
   2876             {
   2877                 int16_t value = strsrch->search->elementComparisonType;
   2878                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2879                     return (USearchAttributeValue)value;
   2880                 } else {
   2881                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
   2882                 }
   2883             }
   2884         case USEARCH_ATTRIBUTE_COUNT :
   2885             return USEARCH_DEFAULT;
   2886         }
   2887     }
   2888     return USEARCH_DEFAULT;
   2889 }
   2890 
   2891 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2892                                                 const UStringSearch *strsrch)
   2893 {
   2894     if (strsrch == NULL) {
   2895         return USEARCH_DONE;
   2896     }
   2897     return strsrch->search->matchedIndex;
   2898 }
   2899 
   2900 
   2901 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2902                                             UChar         *result,
   2903                                             int32_t        resultCapacity,
   2904                                             UErrorCode    *status)
   2905 {
   2906     if (U_FAILURE(*status)) {
   2907         return USEARCH_DONE;
   2908     }
   2909     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2910         result == NULL)) {
   2911         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2912         return USEARCH_DONE;
   2913     }
   2914 
   2915     int32_t     copylength = strsrch->search->matchedLength;
   2916     int32_t copyindex  = strsrch->search->matchedIndex;
   2917     if (copyindex == USEARCH_DONE) {
   2918         u_terminateUChars(result, resultCapacity, 0, status);
   2919         return USEARCH_DONE;
   2920     }
   2921 
   2922     if (resultCapacity < copylength) {
   2923         copylength = resultCapacity;
   2924     }
   2925     if (copylength > 0) {
   2926         uprv_memcpy(result, strsrch->search->text + copyindex,
   2927                     copylength * sizeof(UChar));
   2928     }
   2929     return u_terminateUChars(result, resultCapacity,
   2930                              strsrch->search->matchedLength, status);
   2931 }
   2932 
   2933 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2934                                               const UStringSearch *strsrch)
   2935 {
   2936     if (strsrch) {
   2937         return strsrch->search->matchedLength;
   2938     }
   2939     return USEARCH_DONE;
   2940 }
   2941 
   2942 #if !UCONFIG_NO_BREAK_ITERATION
   2943 
   2944 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2945                                                UBreakIterator *breakiter,
   2946                                                UErrorCode     *status)
   2947 {
   2948     if (U_SUCCESS(*status) && strsrch) {
   2949         strsrch->search->breakIter = breakiter;
   2950         if (breakiter) {
   2951             ubrk_setText(breakiter, strsrch->search->text,
   2952                          strsrch->search->textLength, status);
   2953         }
   2954     }
   2955 }
   2956 
   2957 U_CAPI const UBreakIterator* U_EXPORT2
   2958 usearch_getBreakIterator(const UStringSearch *strsrch)
   2959 {
   2960     if (strsrch) {
   2961         return strsrch->search->breakIter;
   2962     }
   2963     return NULL;
   2964 }
   2965 
   2966 #endif
   2967 
   2968 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2969                                       const UChar         *text,
   2970                                             int32_t        textlength,
   2971                                             UErrorCode    *status)
   2972 {
   2973     if (U_SUCCESS(*status)) {
   2974         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2975             textlength == 0) {
   2976             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2977         }
   2978         else {
   2979             if (textlength == -1) {
   2980                 textlength = u_strlen(text);
   2981             }
   2982             strsrch->search->text       = text;
   2983             strsrch->search->textLength = textlength;
   2984             ucol_setText(strsrch->textIter, text, textlength, status);
   2985             strsrch->search->matchedIndex  = USEARCH_DONE;
   2986             strsrch->search->matchedLength = 0;
   2987             strsrch->search->reset         = TRUE;
   2988 #if !UCONFIG_NO_BREAK_ITERATION
   2989             if (strsrch->search->breakIter != NULL) {
   2990                 ubrk_setText(strsrch->search->breakIter, text,
   2991                              textlength, status);
   2992             }
   2993             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2994 #endif
   2995         }
   2996     }
   2997 }
   2998 
   2999 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   3000                                                      int32_t       *length)
   3001 {
   3002     if (strsrch) {
   3003         *length = strsrch->search->textLength;
   3004         return strsrch->search->text;
   3005     }
   3006     return NULL;
   3007 }
   3008 
   3009 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3010                                           const UCollator     *collator,
   3011                                                 UErrorCode    *status)
   3012 {
   3013     if (U_SUCCESS(*status)) {
   3014         if (collator == NULL) {
   3015             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3016             return;
   3017         }
   3018 
   3019         if (strsrch) {
   3020             delete strsrch->textProcessedIter;
   3021             strsrch->textProcessedIter = NULL;
   3022             ucol_closeElements(strsrch->textIter);
   3023             ucol_closeElements(strsrch->utilIter);
   3024             strsrch->textIter = strsrch->utilIter = NULL;
   3025             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3026                 ucol_close((UCollator *)strsrch->collator);
   3027                 strsrch->ownCollator = FALSE;
   3028             }
   3029             strsrch->collator    = collator;
   3030             strsrch->strength    = ucol_getStrength(collator);
   3031             strsrch->ceMask      = getMask(strsrch->strength);
   3032 #if !UCONFIG_NO_BREAK_ITERATION
   3033             ubrk_close(strsrch->search->internalBreakIter);
   3034             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3035                                                      strsrch->search->text, strsrch->search->textLength, status);
   3036 #endif
   3037             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3038             strsrch->toShift     =
   3039                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3040                                                                 UCOL_SHIFTED;
   3041             // if status is a failure, ucol_getVariableTop returns 0
   3042             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3043             strsrch->textIter = ucol_openElements(collator,
   3044                                       strsrch->search->text,
   3045                                       strsrch->search->textLength,
   3046                                       status);
   3047             strsrch->utilIter = ucol_openElements(
   3048                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
   3049             // initialize() _after_ setting the iterators for the new collator.
   3050             initialize(strsrch, status);
   3051         }
   3052 
   3053         // **** are these calls needed?
   3054         // **** we call uprv_init_pce in initializePatternPCETable
   3055         // **** and the CEBuffer constructor...
   3056 #if 0
   3057         uprv_init_pce(strsrch->textIter);
   3058         uprv_init_pce(strsrch->utilIter);
   3059 #endif
   3060     }
   3061 }
   3062 
   3063 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3064 {
   3065     if (strsrch) {
   3066         return (UCollator *)strsrch->collator;
   3067     }
   3068     return NULL;
   3069 }
   3070 
   3071 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3072                                          const UChar         *pattern,
   3073                                                int32_t        patternlength,
   3074                                                UErrorCode    *status)
   3075 {
   3076     if (U_SUCCESS(*status)) {
   3077         if (strsrch == NULL || pattern == NULL) {
   3078             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3079         }
   3080         else {
   3081             if (patternlength == -1) {
   3082                 patternlength = u_strlen(pattern);
   3083             }
   3084             if (patternlength == 0) {
   3085                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3086                 return;
   3087             }
   3088             strsrch->pattern.text       = pattern;
   3089             strsrch->pattern.textLength = patternlength;
   3090             initialize(strsrch, status);
   3091         }
   3092     }
   3093 }
   3094 
   3095 U_CAPI const UChar* U_EXPORT2
   3096 usearch_getPattern(const UStringSearch *strsrch,
   3097                    int32_t       *length)
   3098 {
   3099     if (strsrch) {
   3100         *length = strsrch->pattern.textLength;
   3101         return strsrch->pattern.text;
   3102     }
   3103     return NULL;
   3104 }
   3105 
   3106 // miscellanous methods --------------------------------------------------
   3107 
   3108 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3109                                            UErrorCode    *status)
   3110 {
   3111     if (strsrch && U_SUCCESS(*status)) {
   3112         strsrch->search->isForwardSearching = TRUE;
   3113         usearch_setOffset(strsrch, 0, status);
   3114         if (U_SUCCESS(*status)) {
   3115             return usearch_next(strsrch, status);
   3116         }
   3117     }
   3118     return USEARCH_DONE;
   3119 }
   3120 
   3121 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3122                                                int32_t    position,
   3123                                                UErrorCode    *status)
   3124 {
   3125     if (strsrch && U_SUCCESS(*status)) {
   3126         strsrch->search->isForwardSearching = TRUE;
   3127         // position checked in usearch_setOffset
   3128         usearch_setOffset(strsrch, position, status);
   3129         if (U_SUCCESS(*status)) {
   3130             return usearch_next(strsrch, status);
   3131         }
   3132     }
   3133     return USEARCH_DONE;
   3134 }
   3135 
   3136 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3137                                           UErrorCode    *status)
   3138 {
   3139     if (strsrch && U_SUCCESS(*status)) {
   3140         strsrch->search->isForwardSearching = FALSE;
   3141         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3142         if (U_SUCCESS(*status)) {
   3143             return usearch_previous(strsrch, status);
   3144         }
   3145     }
   3146     return USEARCH_DONE;
   3147 }
   3148 
   3149 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3150                                                int32_t    position,
   3151                                                UErrorCode    *status)
   3152 {
   3153     if (strsrch && U_SUCCESS(*status)) {
   3154         strsrch->search->isForwardSearching = FALSE;
   3155         // position checked in usearch_setOffset
   3156         usearch_setOffset(strsrch, position, status);
   3157         if (U_SUCCESS(*status)) {
   3158             return usearch_previous(strsrch, status);
   3159         }
   3160     }
   3161     return USEARCH_DONE;
   3162 }
   3163 
   3164 /**
   3165 * If a direction switch is required, we'll count the number of ces till the
   3166 * beginning of the collation element iterator and iterate forwards that
   3167 * number of times. This is so that we get to the correct point within the
   3168 * string to continue the search in. Imagine when we are in the middle of the
   3169 * normalization buffer when the change in direction is request. arrrgghh....
   3170 * After searching the offset within the collation element iterator will be
   3171 * shifted to the start of the match. If a match is not found, the offset would
   3172 * have been set to the end of the text string in the collation element
   3173 * iterator.
   3174 * Okay, here's my take on normalization buffer. The only time when there can
   3175 * be 2 matches within the same normalization is when the pattern is consists
   3176 * of all accents. But since the offset returned is from the text string, we
   3177 * should not confuse the caller by returning the second match within the
   3178 * same normalization buffer. If we do, the 2 results will have the same match
   3179 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3180 * fall within the same normalization buffer. Note this does not affect the
   3181 * results of matches spanning the text and the normalization buffer.
   3182 * The position to start searching is taken from the collation element
   3183 * iterator. Callers of this API would have to set the offset in the collation
   3184 * element iterator before using this method.
   3185 */
   3186 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3187                                           UErrorCode    *status)
   3188 {
   3189     if (U_SUCCESS(*status) && strsrch) {
   3190         // note offset is either equivalent to the start of the previous match
   3191         // or is set by the user
   3192         int32_t      offset       = usearch_getOffset(strsrch);
   3193         USearch     *search       = strsrch->search;
   3194         search->reset             = FALSE;
   3195         int32_t      textlength   = search->textLength;
   3196         if (search->isForwardSearching) {
   3197 #if BOYER_MOORE
   3198             if (offset == textlength
   3199                 || (!search->isOverlap &&
   3200                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3201                     (search->matchedIndex != USEARCH_DONE &&
   3202                      offset + search->matchedLength >= textlength)))) {
   3203                 // not enough characters to match
   3204                 setMatchNotFound(strsrch);
   3205                 return USEARCH_DONE;
   3206             }
   3207 #else
   3208             if (offset == textlength ||
   3209                 (! search->isOverlap &&
   3210                 (search->matchedIndex != USEARCH_DONE &&
   3211                 offset + search->matchedLength > textlength))) {
   3212                     // not enough characters to match
   3213                     setMatchNotFound(strsrch);
   3214                     return USEARCH_DONE;
   3215             }
   3216 #endif
   3217         }
   3218         else {
   3219             // switching direction.
   3220             // if matchedIndex == USEARCH_DONE, it means that either a
   3221             // setOffset has been called or that previous ran off the text
   3222             // string. the iterator would have been set to offset 0 if a
   3223             // match is not found.
   3224             search->isForwardSearching = TRUE;
   3225             if (search->matchedIndex != USEARCH_DONE) {
   3226                 // there's no need to set the collation element iterator
   3227                 // the next call to next will set the offset.
   3228                 return search->matchedIndex;
   3229             }
   3230         }
   3231 
   3232         if (U_SUCCESS(*status)) {
   3233             if (strsrch->pattern.CELength == 0) {
   3234                 if (search->matchedIndex == USEARCH_DONE) {
   3235                     search->matchedIndex = offset;
   3236                 }
   3237                 else { // moves by codepoints
   3238                     U16_FWD_1(search->text, search->matchedIndex, textlength);
   3239                 }
   3240 
   3241                 search->matchedLength = 0;
   3242                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3243                 // status checked below
   3244                 if (search->matchedIndex == textlength) {
   3245                     search->matchedIndex = USEARCH_DONE;
   3246                 }
   3247             }
   3248             else {
   3249                 if (search->matchedLength > 0) {
   3250                     // if matchlength is 0 we are at the start of the iteration
   3251                     if (search->isOverlap) {
   3252                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3253                     }
   3254                     else {
   3255                         ucol_setOffset(strsrch->textIter,
   3256                                        offset + search->matchedLength, status);
   3257                     }
   3258                 }
   3259                 else {
   3260                     // for boundary check purposes. this will ensure that the
   3261                     // next match will not preceed the current offset
   3262                     // note search->matchedIndex will always be set to something
   3263                     // in the code
   3264                     search->matchedIndex = offset - 1;
   3265                 }
   3266 
   3267                 if (search->isCanonicalMatch) {
   3268                     // can't use exact here since extra accents are allowed.
   3269                     usearch_handleNextCanonical(strsrch, status);
   3270                 }
   3271                 else {
   3272                     usearch_handleNextExact(strsrch, status);
   3273                 }
   3274             }
   3275 
   3276             if (U_FAILURE(*status)) {
   3277                 return USEARCH_DONE;
   3278             }
   3279 
   3280 #if !BOYER_MOORE
   3281             if (search->matchedIndex == USEARCH_DONE) {
   3282                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3283             } else {
   3284                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3285             }
   3286 #endif
   3287 
   3288             return search->matchedIndex;
   3289         }
   3290     }
   3291     return USEARCH_DONE;
   3292 }
   3293 
   3294 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3295                                               UErrorCode *status)
   3296 {
   3297     if (U_SUCCESS(*status) && strsrch) {
   3298         int32_t offset;
   3299         USearch *search = strsrch->search;
   3300         if (search->reset) {
   3301             offset                     = search->textLength;
   3302             search->isForwardSearching = FALSE;
   3303             search->reset              = FALSE;
   3304             setColEIterOffset(strsrch->textIter, offset);
   3305         }
   3306         else {
   3307             offset = usearch_getOffset(strsrch);
   3308         }
   3309 
   3310         int32_t matchedindex = search->matchedIndex;
   3311         if (search->isForwardSearching == TRUE) {
   3312             // switching direction.
   3313             // if matchedIndex == USEARCH_DONE, it means that either a
   3314             // setOffset has been called or that next ran off the text
   3315             // string. the iterator would have been set to offset textLength if
   3316             // a match is not found.
   3317             search->isForwardSearching = FALSE;
   3318             if (matchedindex != USEARCH_DONE) {
   3319                 return matchedindex;
   3320             }
   3321         }
   3322         else {
   3323 #if BOYER_MOORE
   3324             if (offset == 0 || matchedindex == 0 ||
   3325                 (!search->isOverlap &&
   3326                     (offset < strsrch->pattern.defaultShiftSize ||
   3327                     (matchedindex != USEARCH_DONE &&
   3328                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3329                 // not enough characters to match
   3330                 setMatchNotFound(strsrch);
   3331                 return USEARCH_DONE;
   3332             }
   3333 #else
   3334             // Could check pattern length, but the
   3335             // linear search will do the right thing
   3336             if (offset == 0 || matchedindex == 0) {
   3337                 setMatchNotFound(strsrch);
   3338                 return USEARCH_DONE;
   3339             }
   3340 #endif
   3341         }
   3342 
   3343         if (U_SUCCESS(*status)) {
   3344             if (strsrch->pattern.CELength == 0) {
   3345                 search->matchedIndex =
   3346                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3347                 if (search->matchedIndex == 0) {
   3348                     setMatchNotFound(strsrch);
   3349                     // status checked below
   3350                 }
   3351                 else { // move by codepoints
   3352                     U16_BACK_1(search->text, 0, search->matchedIndex);
   3353                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3354                     // status checked below
   3355                     search->matchedLength = 0;
   3356                 }
   3357             }
   3358             else {
   3359                 if (strsrch->search->isCanonicalMatch) {
   3360                     // can't use exact here since extra accents are allowed.
   3361                     usearch_handlePreviousCanonical(strsrch, status);
   3362                     // status checked below
   3363                 }
   3364                 else {
   3365                     usearch_handlePreviousExact(strsrch, status);
   3366                     // status checked below
   3367                 }
   3368             }
   3369 
   3370             if (U_FAILURE(*status)) {
   3371                 return USEARCH_DONE;
   3372             }
   3373 
   3374             return search->matchedIndex;
   3375         }
   3376     }
   3377     return USEARCH_DONE;
   3378 }
   3379 
   3380 
   3381 
   3382 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3383 {
   3384     /*
   3385     reset is setting the attributes that are already in
   3386     string search, hence all attributes in the collator should
   3387     be retrieved without any problems
   3388     */
   3389     if (strsrch) {
   3390         UErrorCode status            = U_ZERO_ERROR;
   3391         UBool      sameCollAttribute = TRUE;
   3392         uint32_t   ceMask;
   3393         UBool      shift;
   3394         uint32_t   varTop;
   3395 
   3396         // **** hack to deal w/ how processed CEs encode quaternary ****
   3397         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3398         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3399             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3400                 sameCollAttribute = FALSE;
   3401         }
   3402 
   3403         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3404         ceMask = getMask(strsrch->strength);
   3405         if (strsrch->ceMask != ceMask) {
   3406             strsrch->ceMask = ceMask;
   3407             sameCollAttribute = FALSE;
   3408         }
   3409 
   3410         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3411         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3412                                   &status) == UCOL_SHIFTED;
   3413         if (strsrch->toShift != shift) {
   3414             strsrch->toShift  = shift;
   3415             sameCollAttribute = FALSE;
   3416         }
   3417 
   3418         // if status is a failure, ucol_getVariableTop returns 0
   3419         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3420         if (strsrch->variableTop != varTop) {
   3421             strsrch->variableTop = varTop;
   3422             sameCollAttribute    = FALSE;
   3423         }
   3424         if (!sameCollAttribute) {
   3425             initialize(strsrch, &status);
   3426         }
   3427         ucol_setText(strsrch->textIter, strsrch->search->text,
   3428                               strsrch->search->textLength,
   3429                               &status);
   3430         strsrch->search->matchedLength      = 0;
   3431         strsrch->search->matchedIndex       = USEARCH_DONE;
   3432         strsrch->search->isOverlap          = FALSE;
   3433         strsrch->search->isCanonicalMatch   = FALSE;
   3434         strsrch->search->elementComparisonType = 0;
   3435         strsrch->search->isForwardSearching = TRUE;
   3436         strsrch->search->reset              = TRUE;
   3437     }
   3438 }
   3439 
   3440 //
   3441 //  CEI  Collation Element + source text index.
   3442 //       These structs are kept in the circular buffer.
   3443 //
   3444 struct  CEI {
   3445     int64_t ce;
   3446     int32_t lowIndex;
   3447     int32_t highIndex;
   3448 };
   3449 
   3450 U_NAMESPACE_BEGIN
   3451 
   3452 namespace {
   3453 //
   3454 //  CEBuffer   A circular buffer of CEs from the text being searched.
   3455 //
   3456 #define   DEFAULT_CEBUFFER_SIZE 96
   3457 #define   CEBUFFER_EXTRA 32
   3458 // Some typical max values to make buffer size more reasonable for asymmetric search.
   3459 // #8694 is for a better long-term solution to allocation of this buffer.
   3460 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
   3461 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
   3462 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
   3463 struct CEBuffer {
   3464     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3465     CEI                 *buf;
   3466     int32_t              bufSize;
   3467     int32_t              firstIx;
   3468     int32_t              limitIx;
   3469     UCollationElements  *ceIter;
   3470     UStringSearch       *strSearch;
   3471 
   3472 
   3473 
   3474                CEBuffer(UStringSearch *ss, UErrorCode *status);
   3475                ~CEBuffer();
   3476    const CEI   *get(int32_t index);
   3477    const CEI   *getPrevious(int32_t index);
   3478 };
   3479 
   3480 
   3481 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
   3482     buf = defBuf;
   3483     strSearch = ss;
   3484     bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA;
   3485     if (ss->search->elementComparisonType != 0) {
   3486         const UChar * patText = ss->pattern.text;
   3487         if (patText) {
   3488             const UChar * patTextLimit = patText + ss->pattern.textLength;
   3489             while ( patText < patTextLimit ) {
   3490                 UChar c = *patText++;
   3491                 if (MIGHT_BE_JAMO_L(c)) {
   3492                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
   3493                 } else {
   3494                     // No check for surrogates, we might allocate slightly more buffer than necessary.
   3495                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
   3496                 }
   3497             }
   3498         }
   3499     }
   3500     ceIter    = ss->textIter;
   3501     firstIx = 0;
   3502     limitIx = 0;
   3503 
   3504     if (!initTextProcessedIter(ss, status)) { return; }
   3505 
   3506     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3507         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3508         if (buf == NULL) {
   3509             *status = U_MEMORY_ALLOCATION_ERROR;
   3510         }
   3511     }
   3512 }
   3513 
   3514 // TODO: add a reset or init function so that allocated
   3515 //       buffers can be retained & reused.
   3516 
   3517 CEBuffer::~CEBuffer() {
   3518     if (buf != defBuf) {
   3519         uprv_free(buf);
   3520     }
   3521 }
   3522 
   3523 
   3524 // Get the CE with the specified index.
   3525 //   Index must be in the range
   3526 //          n-history_size < index < n+1
   3527 //   where n is the largest index to have been fetched by some previous call to this function.
   3528 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3529 //
   3530 const CEI *CEBuffer::get(int32_t index) {
   3531     int i = index % bufSize;
   3532 
   3533     if (index>=firstIx && index<limitIx) {
   3534         // The request was for an entry already in our buffer.
   3535         //  Just return it.
   3536         return &buf[i];
   3537     }
   3538 
   3539     // Caller is requesting a new, never accessed before, CE.
   3540     //   Verify that it is the next one in sequence, which is all
   3541     //   that is allowed.
   3542     if (index != limitIx) {
   3543         U_ASSERT(FALSE);
   3544 
   3545         return NULL;
   3546     }
   3547 
   3548     // Manage the circular CE buffer indexing
   3549     limitIx++;
   3550 
   3551     if (limitIx - firstIx >= bufSize) {
   3552         // The buffer is full, knock out the lowest-indexed entry.
   3553         firstIx++;
   3554     }
   3555 
   3556     UErrorCode status = U_ZERO_ERROR;
   3557 
   3558     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3559 
   3560     return &buf[i];
   3561 }
   3562 
   3563 // Get the CE with the specified index.
   3564 //   Index must be in the range
   3565 //          n-history_size < index < n+1
   3566 //   where n is the largest index to have been fetched by some previous call to this function.
   3567 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3568 //
   3569 const CEI *CEBuffer::getPrevious(int32_t index) {
   3570     int i = index % bufSize;
   3571 
   3572     if (index>=firstIx && index<limitIx) {
   3573         // The request was for an entry already in our buffer.
   3574         //  Just return it.
   3575         return &buf[i];
   3576     }
   3577 
   3578     // Caller is requesting a new, never accessed before, CE.
   3579     //   Verify that it is the next one in sequence, which is all
   3580     //   that is allowed.
   3581     if (index != limitIx) {
   3582         U_ASSERT(FALSE);
   3583 
   3584         return NULL;
   3585     }
   3586 
   3587     // Manage the circular CE buffer indexing
   3588     limitIx++;
   3589 
   3590     if (limitIx - firstIx >= bufSize) {
   3591         // The buffer is full, knock out the lowest-indexed entry.
   3592         firstIx++;
   3593     }
   3594 
   3595     UErrorCode status = U_ZERO_ERROR;
   3596 
   3597     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3598 
   3599     return &buf[i];
   3600 }
   3601 
   3602 }
   3603 
   3604 U_NAMESPACE_END
   3605 
   3606 
   3607 // #define USEARCH_DEBUG
   3608 
   3609 #ifdef USEARCH_DEBUG
   3610 #include <stdio.h>
   3611 #include <stdlib.h>
   3612 #endif
   3613 
   3614 /*
   3615  * Find the next break boundary after startIndex. If the UStringSearch object
   3616  * has an external break iterator, use that. Otherwise use the internal character
   3617  * break iterator.
   3618  */
   3619 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3620 #if 0
   3621     const UChar *text = strsrch->search->text;
   3622     int32_t textLen   = strsrch->search->textLength;
   3623 
   3624     U_ASSERT(startIndex>=0);
   3625     U_ASSERT(startIndex<=textLen);
   3626 
   3627     if (startIndex >= textLen) {
   3628         return startIndex;
   3629     }
   3630 
   3631     UChar32  c;
   3632     int32_t  i = startIndex;
   3633     U16_NEXT(text, i, textLen, c);
   3634 
   3635     // If we are on a control character, stop without looking for combining marks.
   3636     //    Control characters do not combine.
   3637     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3638     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3639         return i;
   3640     }
   3641 
   3642     // The initial character was not a control, and can thus accept trailing
   3643     //   combining characters.  Advance over however many of them there are.
   3644     int32_t  indexOfLastCharChecked;
   3645     for (;;) {
   3646         indexOfLastCharChecked = i;
   3647         if (i>=textLen) {
   3648             break;
   3649         }
   3650         U16_NEXT(text, i, textLen, c);
   3651         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3652         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3653             break;
   3654         }
   3655     }
   3656     return indexOfLastCharChecked;
   3657 #elif !UCONFIG_NO_BREAK_ITERATION
   3658     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3659 
   3660     if (breakiterator == NULL) {
   3661         breakiterator = strsrch->search->internalBreakIter;
   3662     }
   3663 
   3664     if (breakiterator != NULL) {
   3665         return ubrk_following(breakiterator, startIndex);
   3666     }
   3667 
   3668     return startIndex;
   3669 #else
   3670     // **** or should we use the original code? ****
   3671     return startIndex;
   3672 #endif
   3673 
   3674 }
   3675 
   3676 /*
   3677  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3678  * has an external break iterator, test using that, otherwise test
   3679  * using the internal character break iterator.
   3680  */
   3681 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3682 #if 0
   3683     const UChar *text = strsrch->search->text;
   3684     int32_t textLen   = strsrch->search->textLength;
   3685 
   3686     U_ASSERT(index>=0);
   3687     U_ASSERT(index<=textLen);
   3688 
   3689     if (index>=textLen || index<=0) {
   3690         return TRUE;
   3691     }
   3692 
   3693     // If the character at the current index is not a GRAPHEME_EXTEND
   3694     //    then we can not be within a combining sequence.
   3695     UChar32  c;
   3696     U16_GET(text, 0, index, textLen, c);
   3697     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3698     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3699         return TRUE;
   3700     }
   3701 
   3702     // We are at a combining mark.  If the preceding character is anything
   3703     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3704     U16_PREV(text, 0, index, c);
   3705     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3706     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3707     return !combining;
   3708 #elif !UCONFIG_NO_BREAK_ITERATION
   3709     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3710 
   3711     if (breakiterator == NULL) {
   3712         breakiterator = strsrch->search->internalBreakIter;
   3713     }
   3714 
   3715     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
   3716 #else
   3717     // **** or use the original code? ****
   3718     return TRUE;
   3719 #endif
   3720 }
   3721 
   3722 #if 0
   3723 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3724 {
   3725 #if !UCONFIG_NO_BREAK_ITERATION
   3726     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3727 
   3728     if (breakiterator != NULL) {
   3729         int32_t startindex = ubrk_first(breakiterator);
   3730         int32_t endindex   = ubrk_last(breakiterator);
   3731 
   3732         // out-of-range indexes are never boundary positions
   3733         if (start < startindex || start > endindex ||
   3734             end < startindex || end > endindex) {
   3735             return FALSE;
   3736         }
   3737 
   3738         return ubrk_isBoundary(breakiterator, start) &&
   3739                ubrk_isBoundary(breakiterator, end);
   3740     }
   3741 #endif
   3742 
   3743     return TRUE;
   3744 }
   3745 #endif
   3746 
   3747 typedef enum {
   3748     U_CE_MATCH = -1,
   3749     U_CE_NO_MATCH = 0,
   3750     U_CE_SKIP_TARG,
   3751     U_CE_SKIP_PATN
   3752 } UCompareCEsResult;
   3753 #define U_CE_LEVEL2_BASE 0x00000005
   3754 #define U_CE_LEVEL3_BASE 0x00050000
   3755 
   3756 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
   3757     if (targCE == patCE) {
   3758         return U_CE_MATCH;
   3759     }
   3760     if (compareType == 0) {
   3761         return U_CE_NO_MATCH;
   3762     }
   3763 
   3764     int64_t targCEshifted = targCE >> 32;
   3765     int64_t patCEshifted = patCE >> 32;
   3766     int64_t mask;
   3767 
   3768     mask = 0xFFFF0000;
   3769     int32_t targLev1 = (int32_t)(targCEshifted & mask);
   3770     int32_t patLev1 = (int32_t)(patCEshifted & mask);
   3771     if ( targLev1 != patLev1 ) {
   3772         if ( targLev1 == 0 ) {
   3773             return U_CE_SKIP_TARG;
   3774         }
   3775         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3776             return U_CE_SKIP_PATN;
   3777         }
   3778         return U_CE_NO_MATCH;
   3779     }
   3780 
   3781     mask = 0x0000FFFF;
   3782     int32_t targLev2 = (int32_t)(targCEshifted & mask);
   3783     int32_t patLev2 = (int32_t)(patCEshifted & mask);
   3784     if ( targLev2 != patLev2 ) {
   3785         if ( targLev2 == 0 ) {
   3786             return U_CE_SKIP_TARG;
   3787         }
   3788         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3789             return U_CE_SKIP_PATN;
   3790         }
   3791         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
   3792             U_CE_MATCH: U_CE_NO_MATCH;
   3793     }
   3794 
   3795     mask = 0xFFFF0000;
   3796     int32_t targLev3 = (int32_t)(targCE & mask);
   3797     int32_t patLev3 = (int32_t)(patCE & mask);
   3798     if ( targLev3 != patLev3 ) {
   3799         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
   3800             U_CE_MATCH: U_CE_NO_MATCH;
   3801    }
   3802 
   3803     return U_CE_MATCH;
   3804 }
   3805 
   3806 #if BOYER_MOORE
   3807 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
   3808 #endif
   3809 
   3810 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3811                                        int32_t        startIdx,
   3812                                        int32_t        *matchStart,
   3813                                        int32_t        *matchLimit,
   3814                                        UErrorCode     *status)
   3815 {
   3816     if (U_FAILURE(*status)) {
   3817         return FALSE;
   3818     }
   3819 
   3820     // TODO:  reject search patterns beginning with a combining char.
   3821 
   3822 #ifdef USEARCH_DEBUG
   3823     if (getenv("USEARCH_DEBUG") != NULL) {
   3824         printf("Pattern CEs\n");
   3825         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   3826             printf(" %8x", strsrch->pattern.CE[ii]);
   3827         }
   3828         printf("\n");
   3829     }
   3830 
   3831 #endif
   3832     // Input parameter sanity check.
   3833     //  TODO:  should input indicies clip to the text length
   3834     //         in the same way that UText does.
   3835     if(strsrch->pattern.CELength == 0         ||
   3836        startIdx < 0                           ||
   3837        startIdx > strsrch->search->textLength ||
   3838        strsrch->pattern.CE == NULL) {
   3839            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3840            return FALSE;
   3841     }
   3842 
   3843     if (strsrch->pattern.PCE == NULL) {
   3844         initializePatternPCETable(strsrch, status);
   3845     }
   3846 
   3847     ucol_setOffset(strsrch->textIter, startIdx, status);
   3848     CEBuffer ceb(strsrch, status);
   3849 
   3850 
   3851     int32_t    targetIx = 0;
   3852     const CEI *targetCEI = NULL;
   3853     int32_t    patIx;
   3854     UBool      found;
   3855 
   3856     int32_t  mStart = -1;
   3857     int32_t  mLimit = -1;
   3858     int32_t  minLimit;
   3859     int32_t  maxLimit;
   3860 
   3861 
   3862 
   3863     // Outer loop moves over match starting positions in the
   3864     //      target CE space.
   3865     // Here we see the target as a sequence of collation elements, resulting from the following:
   3866     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   3867     //    (for example, digraphs such as IJ may be broken into two characters).
   3868     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   3869     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   3870     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   3871     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   3872     //    then the CE is deleted, so the following code sees only CEs that are relevant.
   3873     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   3874     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   3875     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   3876     //
   3877     for(targetIx=0; ; targetIx++)
   3878     {
   3879         found = TRUE;
   3880         //  Inner loop checks for a match beginning at each
   3881         //  position from the outer loop.
   3882         int32_t targetIxOffset = 0;
   3883         int64_t patCE = 0;
   3884         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
   3885         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   3886         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
   3887         const CEI *firstCEI = ceb.get(targetIx);
   3888         if (firstCEI == NULL) {
   3889             *status = U_INTERNAL_PROGRAM_ERROR;
   3890             found = FALSE;
   3891             break;
   3892         }
   3893 
   3894         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
   3895             patCE = strsrch->pattern.PCE[patIx];
   3896             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
   3897             //  Compare CE from target string with CE from the pattern.
   3898             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   3899             //    which will fail the compare, below.
   3900             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   3901             if ( ceMatch == U_CE_NO_MATCH ) {
   3902                 found = FALSE;
   3903                 break;
   3904             } else if ( ceMatch > U_CE_NO_MATCH ) {
   3905                 if ( ceMatch == U_CE_SKIP_TARG ) {
   3906                     // redo with same patCE, next targCE
   3907                     patIx--;
   3908                     targetIxOffset++;
   3909                 } else { // ceMatch == U_CE_SKIP_PATN
   3910                     // redo with same targCE, next patCE
   3911                     targetIxOffset--;
   3912                 }
   3913             }
   3914         }
   3915         targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
   3916 
   3917         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   3918             // No match at this targetIx.  Try again at the next.
   3919             continue;
   3920         }
   3921 
   3922         if (!found) {
   3923             // No match at all, we have run off the end of the target text.
   3924             break;
   3925         }
   3926 
   3927 
   3928         // We have found a match in CE space.
   3929         // Now determine the bounds in string index space.
   3930         //  There still is a chance of match failure if the CE range not correspond to
   3931         //     an acceptable character range.
   3932         //
   3933         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
   3934 
   3935         mStart   = firstCEI->lowIndex;
   3936         minLimit = lastCEI->lowIndex;
   3937 
   3938         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3939         //   extended to the end of input, and the match is good.
   3940 
   3941         // Look at the high and low indices of the CE following the match. If
   3942         // they are the same it means one of two things:
   3943         //    1. The match extended to the last CE from the target text, which is OK, or
   3944         //    2. The last CE that was part of the match is in an expansion that extends
   3945         //       to the first CE after the match. In this case, we reject the match.
   3946         const CEI *nextCEI = 0;
   3947         if (strsrch->search->elementComparisonType == 0) {
   3948             nextCEI  = ceb.get(targetIx + targetIxOffset);
   3949             maxLimit = nextCEI->lowIndex;
   3950             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3951                 found = FALSE;
   3952             }
   3953         } else {
   3954             for ( ; ; ++targetIxOffset ) {
   3955                 nextCEI = ceb.get(targetIx + targetIxOffset);
   3956                 maxLimit = nextCEI->lowIndex;
   3957                 // If we are at the end of the target too, match succeeds
   3958                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
   3959                     break;
   3960                 }
   3961                 // As long as the next CE has primary weight of 0,
   3962                 // it is part of the last target element matched by the pattern;
   3963                 // make sure it can be part of a match with the last patCE
   3964                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
   3965                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
   3966                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
   3967                         found = FALSE;
   3968                         break;
   3969                     }
   3970                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   3971                 // target element, but it has non-zero primary weight => match fails
   3972                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
   3973                     found = false;
   3974                     break;
   3975                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
   3976                 } else {
   3977                     break;
   3978                 }
   3979             }
   3980         }
   3981 
   3982 
   3983         // Check for the start of the match being within a combining sequence.
   3984         //   This can happen if the pattern itself begins with a combining char, and
   3985         //   the match found combining marks in the target text that were attached
   3986         //    to something else.
   3987         //   This type of match should be rejected for not completely consuming a
   3988         //   combining sequence.
   3989         if (!isBreakBoundary(strsrch, mStart)) {
   3990             found = FALSE;
   3991         }
   3992 
   3993         // Check for the start of the match being within an Collation Element Expansion,
   3994         //   meaning that the first char of the match is only partially matched.
   3995         //   With exapnsions, the first CE will report the index of the source
   3996         //   character, and all subsequent (expansions) CEs will report the source index of the
   3997         //    _following_ character.
   3998         int32_t secondIx = firstCEI->highIndex;
   3999         if (mStart == secondIx) {
   4000             found = FALSE;
   4001         }
   4002 
   4003         //  Advance the match end position to the first acceptable match boundary.
   4004         //    This advances the index over any combining charcters.
   4005         mLimit = maxLimit;
   4006         if (minLimit < maxLimit) {
   4007             // When the last CE's low index is same with its high index, the CE is likely
   4008             // a part of expansion. In this case, the index is located just after the
   4009             // character corresponding to the CEs compared above. If the index is right
   4010             // at the break boundary, move the position to the next boundary will result
   4011             // incorrect match length when there are ignorable characters exist between
   4012             // the position and the next character produces CE(s). See ticket#8482.
   4013             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
   4014                 mLimit = minLimit;
   4015             } else {
   4016                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4017                 if (nba >= lastCEI->highIndex) {
   4018                     mLimit = nba;
   4019                 }
   4020             }
   4021         }
   4022 
   4023     #ifdef USEARCH_DEBUG
   4024         if (getenv("USEARCH_DEBUG") != NULL) {
   4025             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4026         }
   4027     #endif
   4028 
   4029         // If advancing to the end of a combining sequence in character indexing space
   4030         //   advanced us beyond the end of the match in CE space, reject this match.
   4031         if (mLimit > maxLimit) {
   4032             found = FALSE;
   4033         }
   4034 
   4035         if (!isBreakBoundary(strsrch, mLimit)) {
   4036             found = FALSE;
   4037         }
   4038 
   4039         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4040             found = FALSE;
   4041         }
   4042 
   4043         if (found) {
   4044             break;
   4045         }
   4046     }
   4047 
   4048     #ifdef USEARCH_DEBUG
   4049     if (getenv("USEARCH_DEBUG") != NULL) {
   4050         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4051         int32_t  lastToPrint = ceb.limitIx+2;
   4052         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4053             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4054         }
   4055         printf("\n%s\n", found? "match found" : "no match");
   4056     }
   4057     #endif
   4058 
   4059     // All Done.  Store back the match bounds to the caller.
   4060     //
   4061     if (found==FALSE) {
   4062         mLimit = -1;
   4063         mStart = -1;
   4064     }
   4065 
   4066     if (matchStart != NULL) {
   4067         *matchStart= mStart;
   4068     }
   4069 
   4070     if (matchLimit != NULL) {
   4071         *matchLimit = mLimit;
   4072     }
   4073 
   4074     return found;
   4075 }
   4076 
   4077 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   4078                                                 int32_t        startIdx,
   4079                                                 int32_t        *matchStart,
   4080                                                 int32_t        *matchLimit,
   4081                                                 UErrorCode     *status)
   4082 {
   4083     if (U_FAILURE(*status)) {
   4084         return FALSE;
   4085     }
   4086 
   4087     // TODO:  reject search patterns beginning with a combining char.
   4088 
   4089 #ifdef USEARCH_DEBUG
   4090     if (getenv("USEARCH_DEBUG") != NULL) {
   4091         printf("Pattern CEs\n");
   4092         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   4093             printf(" %8x", strsrch->pattern.CE[ii]);
   4094         }
   4095         printf("\n");
   4096     }
   4097 
   4098 #endif
   4099     // Input parameter sanity check.
   4100     //  TODO:  should input indicies clip to the text length
   4101     //         in the same way that UText does.
   4102     if(strsrch->pattern.CELength == 0         ||
   4103        startIdx < 0                           ||
   4104        startIdx > strsrch->search->textLength ||
   4105        strsrch->pattern.CE == NULL) {
   4106            *status = U_ILLEGAL_ARGUMENT_ERROR;
   4107            return FALSE;
   4108     }
   4109 
   4110     if (strsrch->pattern.PCE == NULL) {
   4111         initializePatternPCETable(strsrch, status);
   4112     }
   4113 
   4114     CEBuffer ceb(strsrch, status);
   4115     int32_t    targetIx = 0;
   4116 
   4117     /*
   4118      * Pre-load the buffer with the CE's for the grapheme
   4119      * after our starting position so that we're sure that
   4120      * we can look at the CE following the match when we
   4121      * check the match boundaries.
   4122      *
   4123      * This will also pre-fetch the first CE that we'll
   4124      * consider for the match.
   4125      */
   4126     if (startIdx < strsrch->search->textLength) {
   4127         UBreakIterator *bi = strsrch->search->internalBreakIter;
   4128         int32_t next = ubrk_following(bi, startIdx);
   4129 
   4130         ucol_setOffset(strsrch->textIter, next, status);
   4131 
   4132         for (targetIx = 0; ; targetIx += 1) {
   4133             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   4134                 break;
   4135             }
   4136         }
   4137     } else {
   4138         ucol_setOffset(strsrch->textIter, startIdx, status);
   4139     }
   4140 
   4141 
   4142     const CEI *targetCEI = NULL;
   4143     int32_t    patIx;
   4144     UBool      found;
   4145 
   4146     int32_t  limitIx = targetIx;
   4147     int32_t  mStart = -1;
   4148     int32_t  mLimit = -1;
   4149     int32_t  minLimit;
   4150     int32_t  maxLimit;
   4151 
   4152 
   4153 
   4154     // Outer loop moves over match starting positions in the
   4155     //      target CE space.
   4156     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   4157     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   4158     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   4159     // and the beginning of the base text.
   4160     for(targetIx = limitIx; ; targetIx += 1)
   4161     {
   4162         found = TRUE;
   4163         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
   4164         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   4165         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
   4166         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4167         if (lastCEI == NULL) {
   4168             *status = U_INTERNAL_PROGRAM_ERROR;
   4169             found = FALSE;
   4170              break;
   4171         }
   4172         //  Inner loop checks for a match beginning at each
   4173         //  position from the outer loop.
   4174         int32_t targetIxOffset = 0;
   4175         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
   4176             int64_t patCE = strsrch->pattern.PCE[patIx];
   4177 
   4178             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
   4179             //  Compare CE from target string with CE from the pattern.
   4180             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4181             //    which will fail the compare, below.
   4182             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   4183             if ( ceMatch == U_CE_NO_MATCH ) {
   4184                 found = FALSE;
   4185                 break;
   4186             } else if ( ceMatch > U_CE_NO_MATCH ) {
   4187                 if ( ceMatch == U_CE_SKIP_TARG ) {
   4188                     // redo with same patCE, next targCE
   4189                     patIx++;
   4190                     targetIxOffset++;
   4191                 } else { // ceMatch == U_CE_SKIP_PATN
   4192                     // redo with same targCE, next patCE
   4193                     targetIxOffset--;
   4194                 }
   4195             }
   4196         }
   4197 
   4198         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   4199             // No match at this targetIx.  Try again at the next.
   4200             continue;
   4201         }
   4202 
   4203         if (!found) {
   4204             // No match at all, we have run off the end of the target text.
   4205             break;
   4206         }
   4207 
   4208 
   4209         // We have found a match in CE space.
   4210         // Now determine the bounds in string index space.
   4211         //  There still is a chance of match failure if the CE range not correspond to
   4212         //     an acceptable character range.
   4213         //
   4214         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
   4215         mStart   = firstCEI->lowIndex;
   4216 
   4217         // Check for the start of the match being within a combining sequence.
   4218         //   This can happen if the pattern itself begins with a combining char, and
   4219         //   the match found combining marks in the target text that were attached
   4220         //    to something else.
   4221         //   This type of match should be rejected for not completely consuming a
   4222         //   combining sequence.
   4223         if (!isBreakBoundary(strsrch, mStart)) {
   4224             found = FALSE;
   4225         }
   4226 
   4227         // Look at the high index of the first CE in the match. If it's the same as the
   4228         // low index, the first CE in the match is in the middle of an expansion.
   4229         if (mStart == firstCEI->highIndex) {
   4230             found = FALSE;
   4231         }
   4232 
   4233 
   4234         minLimit = lastCEI->lowIndex;
   4235 
   4236         if (targetIx > 0) {
   4237             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4238             //   extended to the end of input, and the match is good.
   4239 
   4240             // Look at the high and low indices of the CE following the match. If
   4241             // they are the same it means one of two things:
   4242             //    1. The match extended to the last CE from the target text, which is OK, or
   4243             //    2. The last CE that was part of the match is in an expansion that extends
   4244             //       to the first CE after the match. In this case, we reject the match.
   4245             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
   4246 
   4247             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4248                 found = FALSE;
   4249             }
   4250 
   4251             mLimit = maxLimit = nextCEI->lowIndex;
   4252 
   4253             //  Advance the match end position to the first acceptable match boundary.
   4254             //    This advances the index over any combining charcters.
   4255             if (minLimit < maxLimit) {
   4256                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4257 
   4258                 if (nba >= lastCEI->highIndex) {
   4259                     mLimit = nba;
   4260                 }
   4261             }
   4262 
   4263             // If advancing to the end of a combining sequence in character indexing space
   4264             //   advanced us beyond the end of the match in CE space, reject this match.
   4265             if (mLimit > maxLimit) {
   4266                 found = FALSE;
   4267             }
   4268 
   4269             // Make sure the end of the match is on a break boundary
   4270             if (!isBreakBoundary(strsrch, mLimit)) {
   4271                 found = FALSE;
   4272             }
   4273 
   4274         } else {
   4275             // No non-ignorable CEs after this point.
   4276             // The maximum position is detected by boundary after
   4277             // the last non-ignorable CE. Combining sequence
   4278             // across the start index will be truncated.
   4279             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4280             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
   4281         }
   4282 
   4283     #ifdef USEARCH_DEBUG
   4284         if (getenv("USEARCH_DEBUG") != NULL) {
   4285             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4286         }
   4287     #endif
   4288 
   4289 
   4290         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4291             found = FALSE;
   4292         }
   4293 
   4294         if (found) {
   4295             break;
   4296         }
   4297     }
   4298 
   4299     #ifdef USEARCH_DEBUG
   4300     if (getenv("USEARCH_DEBUG") != NULL) {
   4301         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4302         int32_t  lastToPrint = ceb.limitIx+2;
   4303         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4304             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4305         }
   4306         printf("\n%s\n", found? "match found" : "no match");
   4307     }
   4308     #endif
   4309 
   4310     // All Done.  Store back the match bounds to the caller.
   4311     //
   4312     if (found==FALSE) {
   4313         mLimit = -1;
   4314         mStart = -1;
   4315     }
   4316 
   4317     if (matchStart != NULL) {
   4318         *matchStart= mStart;
   4319     }
   4320 
   4321     if (matchLimit != NULL) {
   4322         *matchLimit = mLimit;
   4323     }
   4324 
   4325     return found;
   4326 }
   4327 
   4328 // internal use methods declared in usrchimp.h -----------------------------
   4329 
   4330 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4331 {
   4332     if (U_FAILURE(*status)) {
   4333         setMatchNotFound(strsrch);
   4334         return FALSE;
   4335     }
   4336 
   4337 #if BOYER_MOORE
   4338     UCollationElements *coleiter        = strsrch->textIter;
   4339     int32_t             textlength      = strsrch->search->textLength;
   4340     int32_t            *patternce       = strsrch->pattern.CE;
   4341     int32_t             patterncelength = strsrch->pattern.CELength;
   4342     int32_t             textoffset      = ucol_getOffset(coleiter);
   4343 
   4344     // status used in setting coleiter offset, since offset is checked in
   4345     // shiftForward before setting the coleiter offset, status never
   4346     // a failure
   4347     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4348                               patterncelength);
   4349     while (textoffset <= textlength)
   4350     {
   4351         uint32_t    patternceindex = patterncelength - 1;
   4352         int32_t     targetce;
   4353         UBool       found          = FALSE;
   4354         int32_t    lastce          = UCOL_NULLORDER;
   4355 
   4356         setColEIterOffset(coleiter, textoffset);
   4357 
   4358         for (;;) {
   4359             // finding the last pattern ce match, imagine composite characters
   4360             // for example: search for pattern A in text \u00C0
   4361             // we'll have to skip \u0300 the grave first before we get to A
   4362             targetce = ucol_previous(coleiter, status);
   4363             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4364                 found = FALSE;
   4365                 break;
   4366             }
   4367             targetce = getCE(strsrch, targetce);
   4368             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4369                 // this is for the text \u0315\u0300 that requires
   4370                 // normalization and pattern \u0300, where \u0315 is ignorable
   4371                 continue;
   4372             }
   4373             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4374                 lastce = targetce;
   4375             }
   4376             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4377             if (targetce == patternce[patternceindex]) {
   4378                 // the first ce can be a contraction
   4379                 found = TRUE;
   4380                 break;
   4381             }
   4382             if (!hasExpansion(coleiter)) {
   4383                 found = FALSE;
   4384                 break;
   4385             }
   4386         }
   4387 
   4388         //targetce = lastce;
   4389 
   4390         while (found && patternceindex > 0) {
   4391             lastce = targetce;
   4392             targetce    = ucol_previous(coleiter, status);
   4393             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4394                 found = FALSE;
   4395                 break;
   4396             }
   4397             targetce    = getCE(strsrch, targetce);
   4398             if (targetce == UCOL_IGNORABLE) {
   4399                 continue;
   4400             }
   4401 
   4402             patternceindex --;
   4403             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4404             found = found && targetce == patternce[patternceindex];
   4405         }
   4406 
   4407         targetce = lastce;
   4408 
   4409         if (!found) {
   4410             if (U_FAILURE(*status)) {
   4411                 break;
   4412             }
   4413             textoffset = shiftForward(strsrch, textoffset, lastce,
   4414                                       patternceindex);
   4415             // status checked at loop.
   4416             patternceindex = patterncelength;
   4417             continue;
   4418         }
   4419 
   4420         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4421             // status checked in ucol_setOffset
   4422             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4423             return TRUE;
   4424         }
   4425     }
   4426     setMatchNotFound(strsrch);
   4427     return FALSE;
   4428 #else
   4429     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4430     int32_t start = -1;
   4431     int32_t end = -1;
   4432 
   4433     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4434         strsrch->search->matchedIndex  = start;
   4435         strsrch->search->matchedLength = end - start;
   4436         return TRUE;
   4437     } else {
   4438         setMatchNotFound(strsrch);
   4439         return FALSE;
   4440     }
   4441 #endif
   4442 }
   4443 
   4444 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4445 {
   4446     if (U_FAILURE(*status)) {
   4447         setMatchNotFound(strsrch);
   4448         return FALSE;
   4449     }
   4450 
   4451 #if BOYER_MOORE
   4452     UCollationElements *coleiter        = strsrch->textIter;
   4453     int32_t             textlength      = strsrch->search->textLength;
   4454     int32_t            *patternce       = strsrch->pattern.CE;
   4455     int32_t             patterncelength = strsrch->pattern.CELength;
   4456     int32_t             textoffset      = ucol_getOffset(coleiter);
   4457     UBool               hasPatternAccents =
   4458        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4459 
   4460     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4461                               patterncelength);
   4462     strsrch->canonicalPrefixAccents[0] = 0;
   4463     strsrch->canonicalSuffixAccents[0] = 0;
   4464 
   4465     while (textoffset <= textlength)
   4466     {
   4467         int32_t     patternceindex = patterncelength - 1;
   4468         int32_t     targetce;
   4469         UBool       found          = FALSE;
   4470         int32_t     lastce         = UCOL_NULLORDER;
   4471 
   4472         setColEIterOffset(coleiter, textoffset);
   4473 
   4474         for (;;) {
   4475             // finding the last pattern ce match, imagine composite characters
   4476             // for example: search for pattern A in text \u00C0
   4477             // we'll have to skip \u0300 the grave first before we get to A
   4478             targetce = ucol_previous(coleiter, status);
   4479             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4480                 found = FALSE;
   4481                 break;
   4482             }
   4483             targetce = getCE(strsrch, targetce);
   4484             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4485                 lastce = targetce;
   4486             }
   4487             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4488             if (targetce == patternce[patternceindex]) {
   4489                 // the first ce can be a contraction
   4490                 found = TRUE;
   4491                 break;
   4492             }
   4493             if (!hasExpansion(coleiter)) {
   4494                 found = FALSE;
   4495                 break;
   4496             }
   4497         }
   4498 
   4499         while (found && patternceindex > 0) {
   4500             targetce    = ucol_previous(coleiter, status);
   4501             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4502                 found = FALSE;
   4503                 break;
   4504             }
   4505             targetce    = getCE(strsrch, targetce);
   4506             if (targetce == UCOL_IGNORABLE) {
   4507                 continue;
   4508             }
   4509 
   4510             patternceindex --;
   4511             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4512             found = found && targetce == patternce[patternceindex];
   4513         }
   4514 
   4515         // initializing the rearranged accent array
   4516         if (hasPatternAccents && !found) {
   4517             strsrch->canonicalPrefixAccents[0] = 0;
   4518             strsrch->canonicalSuffixAccents[0] = 0;
   4519             if (U_FAILURE(*status)) {
   4520                 break;
   4521             }
   4522             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4523         }
   4524 
   4525         if (!found) {
   4526             if (U_FAILURE(*status)) {
   4527                 break;
   4528             }
   4529             textoffset = shiftForward(strsrch, textoffset, lastce,
   4530                                       patternceindex);
   4531             // status checked at loop
   4532             patternceindex = patterncelength;
   4533             continue;
   4534         }
   4535 
   4536         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4537             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4538             return TRUE;
   4539         }
   4540     }
   4541     setMatchNotFound(strsrch);
   4542     return FALSE;
   4543 #else
   4544     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4545     int32_t start = -1;
   4546     int32_t end = -1;
   4547 
   4548     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4549         strsrch->search->matchedIndex  = start;
   4550         strsrch->search->matchedLength = end - start;
   4551         return TRUE;
   4552     } else {
   4553         setMatchNotFound(strsrch);
   4554         return FALSE;
   4555     }
   4556 #endif
   4557 }
   4558 
   4559 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4560 {
   4561     if (U_FAILURE(*status)) {
   4562         setMatchNotFound(strsrch);
   4563         return FALSE;
   4564     }
   4565 
   4566 #if BOYER_MOORE
   4567     UCollationElements *coleiter        = strsrch->textIter;
   4568     int32_t            *patternce       = strsrch->pattern.CE;
   4569     int32_t             patterncelength = strsrch->pattern.CELength;
   4570     int32_t             textoffset      = ucol_getOffset(coleiter);
   4571 
   4572     // shifting it check for setting offset
   4573     // if setOffset is called previously or there was no previous match, we
   4574     // leave the offset as it is.
   4575     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4576         textoffset = strsrch->search->matchedIndex;
   4577     }
   4578 
   4579     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4580                               patterncelength);
   4581 
   4582     while (textoffset >= 0)
   4583     {
   4584         int32_t     patternceindex = 1;
   4585         int32_t     targetce;
   4586         UBool       found          = FALSE;
   4587         int32_t     firstce        = UCOL_NULLORDER;
   4588 
   4589         // if status is a failure, ucol_setOffset does nothing
   4590         setColEIterOffset(coleiter, textoffset);
   4591 
   4592         for (;;) {
   4593             // finding the first pattern ce match, imagine composite
   4594             // characters. for example: search for pattern \u0300 in text
   4595             // \u00C0, we'll have to skip A first before we get to
   4596             // \u0300 the grave accent
   4597             targetce = ucol_next(coleiter, status);
   4598             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4599                 found = FALSE;
   4600                 break;
   4601             }
   4602             targetce = getCE(strsrch, targetce);
   4603             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4604                 firstce = targetce;
   4605             }
   4606             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4607                 continue;
   4608             }
   4609             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4610             if (targetce == patternce[0]) {
   4611                 found = TRUE;
   4612                 break;
   4613             }
   4614             if (!hasExpansion(coleiter)) {
   4615                 // checking for accents in composite character
   4616                 found = FALSE;
   4617                 break;
   4618             }
   4619         }
   4620 
   4621         //targetce = firstce;
   4622 
   4623         while (found && (patternceindex < patterncelength)) {
   4624             firstce = targetce;
   4625             targetce    = ucol_next(coleiter, status);
   4626             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4627                 found = FALSE;
   4628                 break;
   4629             }
   4630             targetce    = getCE(strsrch, targetce);
   4631             if (targetce == UCOL_IGNORABLE) {
   4632                 continue;
   4633             }
   4634 
   4635             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4636             found = found && targetce == patternce[patternceindex];
   4637             patternceindex ++;
   4638         }
   4639 
   4640         targetce = firstce;
   4641 
   4642         if (!found) {
   4643             if (U_FAILURE(*status)) {
   4644                 break;
   4645             }
   4646 
   4647             textoffset = reverseShift(strsrch, textoffset, targetce,
   4648                                       patternceindex);
   4649             patternceindex = 0;
   4650             continue;
   4651         }
   4652 
   4653         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4654             setColEIterOffset(coleiter, textoffset);
   4655             return TRUE;
   4656         }
   4657     }
   4658     setMatchNotFound(strsrch);
   4659     return FALSE;
   4660 #else
   4661     int32_t textOffset;
   4662 
   4663     if (strsrch->search->isOverlap) {
   4664         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4665             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4666         } else {
   4667             // move the start position at the end of possible match
   4668             initializePatternPCETable(strsrch, status);
   4669             if (!initTextProcessedIter(strsrch, status)) {
   4670                 setMatchNotFound(strsrch);
   4671                 return FALSE;
   4672             }
   4673             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
   4674                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4675                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4676                     // at the end of the text
   4677                     break;
   4678                 }
   4679             }
   4680             if (U_FAILURE(*status)) {
   4681                 setMatchNotFound(strsrch);
   4682                 return FALSE;
   4683             }
   4684             textOffset = ucol_getOffset(strsrch->textIter);
   4685         }
   4686     } else {
   4687         textOffset = ucol_getOffset(strsrch->textIter);
   4688     }
   4689 
   4690     int32_t start = -1;
   4691     int32_t end = -1;
   4692 
   4693     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4694         strsrch->search->matchedIndex = start;
   4695         strsrch->search->matchedLength = end - start;
   4696         return TRUE;
   4697     } else {
   4698         setMatchNotFound(strsrch);
   4699         return FALSE;
   4700     }
   4701 #endif
   4702 }
   4703 
   4704 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4705                                       UErrorCode    *status)
   4706 {
   4707     if (U_FAILURE(*status)) {
   4708         setMatchNotFound(strsrch);
   4709         return FALSE;
   4710     }
   4711 
   4712 #if BOYER_MOORE
   4713     UCollationElements *coleiter        = strsrch->textIter;
   4714     int32_t            *patternce       = strsrch->pattern.CE;
   4715     int32_t             patterncelength = strsrch->pattern.CELength;
   4716     int32_t             textoffset      = ucol_getOffset(coleiter);
   4717     UBool               hasPatternAccents =
   4718        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4719 
   4720     // shifting it check for setting offset
   4721     // if setOffset is called previously or there was no previous match, we
   4722     // leave the offset as it is.
   4723     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4724         textoffset = strsrch->search->matchedIndex;
   4725     }
   4726 
   4727     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4728                               patterncelength);
   4729     strsrch->canonicalPrefixAccents[0] = 0;
   4730     strsrch->canonicalSuffixAccents[0] = 0;
   4731 
   4732     while (textoffset >= 0)
   4733     {
   4734         int32_t     patternceindex = 1;
   4735         int32_t     targetce;
   4736         UBool       found          = FALSE;
   4737         int32_t     firstce        = UCOL_NULLORDER;
   4738 
   4739         setColEIterOffset(coleiter, textoffset);
   4740         for (;;) {
   4741             // finding the first pattern ce match, imagine composite
   4742             // characters. for example: search for pattern \u0300 in text
   4743             // \u00C0, we'll have to skip A first before we get to
   4744             // \u0300 the grave accent
   4745             targetce = ucol_next(coleiter, status);
   4746             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4747                 found = FALSE;
   4748                 break;
   4749             }
   4750             targetce = getCE(strsrch, targetce);
   4751             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4752                 firstce = targetce;
   4753             }
   4754 
   4755             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4756             if (targetce == patternce[0]) {
   4757                 // the first ce can be a contraction
   4758                 found = TRUE;
   4759                 break;
   4760             }
   4761             if (!hasExpansion(coleiter)) {
   4762                 // checking for accents in composite character
   4763                 found = FALSE;
   4764                 break;
   4765             }
   4766         }
   4767 
   4768         targetce = firstce;
   4769 
   4770         while (found && patternceindex < patterncelength) {
   4771             targetce    = ucol_next(coleiter, status);
   4772             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4773                 found = FALSE;
   4774                 break;
   4775             }
   4776             targetce = getCE(strsrch, targetce);
   4777             if (targetce == UCOL_IGNORABLE) {
   4778                 continue;
   4779             }
   4780 
   4781             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4782             found = found && targetce == patternce[patternceindex];
   4783             patternceindex ++;
   4784         }
   4785 
   4786         // initializing the rearranged accent array
   4787         if (hasPatternAccents && !found) {
   4788             strsrch->canonicalPrefixAccents[0] = 0;
   4789             strsrch->canonicalSuffixAccents[0] = 0;
   4790             if (U_FAILURE(*status)) {
   4791                 break;
   4792             }
   4793             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4794         }
   4795 
   4796         if (!found) {
   4797             if (U_FAILURE(*status)) {
   4798                 break;
   4799             }
   4800             textoffset = reverseShift(strsrch, textoffset, targetce,
   4801                                       patternceindex);
   4802             patternceindex = 0;
   4803             continue;
   4804         }
   4805 
   4806         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4807             setColEIterOffset(coleiter, textoffset);
   4808             return TRUE;
   4809         }
   4810     }
   4811     setMatchNotFound(strsrch);
   4812     return FALSE;
   4813 #else
   4814     int32_t textOffset;
   4815 
   4816     if (strsrch->search->isOverlap) {
   4817         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4818             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4819         } else {
   4820             // move the start position at the end of possible match
   4821             initializePatternPCETable(strsrch, status);
   4822             if (!initTextProcessedIter(strsrch, status)) {
   4823                 setMatchNotFound(strsrch);
   4824                 return FALSE;
   4825             }
   4826             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
   4827                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4828                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4829                     // at the end of the text
   4830                     break;
   4831                 }
   4832             }
   4833             if (U_FAILURE(*status)) {
   4834                 setMatchNotFound(strsrch);
   4835                 return FALSE;
   4836             }
   4837             textOffset = ucol_getOffset(strsrch->textIter);
   4838         }
   4839     } else {
   4840         textOffset = ucol_getOffset(strsrch->textIter);
   4841     }
   4842 
   4843     int32_t start = -1;
   4844     int32_t end = -1;
   4845 
   4846     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4847         strsrch->search->matchedIndex = start;
   4848         strsrch->search->matchedLength = end - start;
   4849         return TRUE;
   4850     } else {
   4851         setMatchNotFound(strsrch);
   4852         return FALSE;
   4853     }
   4854 #endif
   4855 }
   4856 
   4857 #endif /* #if !UCONFIG_NO_COLLATION */
   4858