Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2014 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  07/02/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/usearch.h"
     15 #include "unicode/ustring.h"
     16 #include "unicode/uchar.h"
     17 #include "unicode/utf16.h"
     18 #include "normalizer2impl.h"
     19 #include "usrchimp.h"
     20 #include "cmemory.h"
     21 #include "ucln_in.h"
     22 #include "uassert.h"
     23 #include "ustr_imp.h"
     24 
     25 U_NAMESPACE_USE
     26 
     27 // don't use Boyer-Moore
     28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
     29 #define BOYER_MOORE 0
     30 
     31 // internal definition ---------------------------------------------------
     32 
     33 #define LAST_BYTE_MASK_          0xFF
     34 #define SECOND_LAST_BYTE_SHIFT_  8
     35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     36 
     37 static const Normalizer2Impl *g_nfcImpl = NULL;
     38 
     39 // internal methods -------------------------------------------------
     40 
     41 /**
     42 * Fast collation element iterator setOffset.
     43 * This function does not check for bounds.
     44 * @param coleiter collation element iterator
     45 * @param offset to set
     46 */
     47 static
     48 inline void setColEIterOffset(UCollationElements *elems,
     49                       int32_t             offset)
     50 {
     51     // Note: Not "fast" any more after the 2013 collation rewrite.
     52     // We do not want to expose more internals than necessary.
     53     UErrorCode status = U_ZERO_ERROR;
     54     ucol_setOffset(elems, offset, &status);
     55 }
     56 
     57 /**
     58 * Getting the mask for collation strength
     59 * @param strength collation strength
     60 * @return collation element mask
     61 */
     62 static
     63 inline uint32_t getMask(UCollationStrength strength)
     64 {
     65     switch (strength)
     66     {
     67     case UCOL_PRIMARY:
     68         return UCOL_PRIMARYORDERMASK;
     69     case UCOL_SECONDARY:
     70         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     71     default:
     72         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     73                UCOL_PRIMARYORDERMASK;
     74     }
     75 }
     76 
     77 /**
     78 * This is to squeeze the 21bit ces into a 256 table
     79 * @param ce collation element
     80 * @return collapsed version of the collation element
     81 */
     82 static
     83 inline int hash(uint32_t ce)
     84 {
     85     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
     86     // well with the new collation where most of the latin 1 characters
     87     // are of the value xx000xxx. their hashes will most of the time be 0
     88     // to be discussed on the hash algo.
     89     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
     90 }
     91 
     92 U_CDECL_BEGIN
     93 static UBool U_CALLCONV
     94 usearch_cleanup(void) {
     95     g_nfcImpl = NULL;
     96     return TRUE;
     97 }
     98 U_CDECL_END
     99 
    100 /**
    101 * Initializing the fcd tables.
    102 * Internal method, status assumed to be a success.
    103 * @param status output error if any, caller to check status before calling
    104 *               method, status assumed to be success when passed in.
    105 */
    106 static
    107 inline void initializeFCD(UErrorCode *status)
    108 {
    109     if (g_nfcImpl == NULL) {
    110         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    111         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    112     }
    113 }
    114 
    115 /**
    116 * Gets the fcd value for a character at the argument index.
    117 * This method takes into accounts of the supplementary characters.
    118 * @param str UTF16 string where character for fcd retrieval resides
    119 * @param offset position of the character whose fcd is to be retrieved, to be
    120 *               overwritten with the next character position, taking
    121 *               surrogate characters into consideration.
    122 * @param strlength length of the argument string
    123 * @return fcd value
    124 */
    125 static
    126 uint16_t getFCD(const UChar   *str, int32_t *offset,
    127                              int32_t  strlength)
    128 {
    129     const UChar *temp = str + *offset;
    130     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
    131     *offset = (int32_t)(temp - str);
    132     return result;
    133 }
    134 
    135 /**
    136 * Getting the modified collation elements taking into account the collation
    137 * attributes
    138 * @param strsrch string search data
    139 * @param sourcece
    140 * @return the modified collation element
    141 */
    142 static
    143 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    144 {
    145     // note for tertiary we can't use the collator->tertiaryMask, that
    146     // is a preprocessed mask that takes into account case options. since
    147     // we are only concerned with exact matches, we don't need that.
    148     sourcece &= strsrch->ceMask;
    149 
    150     if (strsrch->toShift) {
    151         // alternate handling here, since only the 16 most significant digits
    152         // is only used, we can safely do a compare without masking
    153         // if the ce is a variable, we mask and get only the primary values
    154         // no shifting to quartenary is required since all primary values
    155         // less than variabletop will need to be masked off anyway.
    156         if (strsrch->variableTop > sourcece) {
    157             if (strsrch->strength >= UCOL_QUATERNARY) {
    158                 sourcece &= UCOL_PRIMARYORDERMASK;
    159             }
    160             else {
    161                 sourcece = UCOL_IGNORABLE;
    162             }
    163         }
    164     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    165         sourcece = 0xFFFF;
    166     }
    167 
    168     return sourcece;
    169 }
    170 
    171 /**
    172 * Allocate a memory and returns NULL if it failed.
    173 * Internal method, status assumed to be a success.
    174 * @param size to allocate
    175 * @param status output error if any, caller to check status before calling
    176 *               method, status assumed to be success when passed in.
    177 * @return newly allocated array, NULL otherwise
    178 */
    179 static
    180 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    181 {
    182     uint32_t *result = (uint32_t *)uprv_malloc(size);
    183     if (result == NULL) {
    184         *status = U_MEMORY_ALLOCATION_ERROR;
    185     }
    186     return result;
    187 }
    188 
    189 /**
    190 * Adds a uint32_t value to a destination array.
    191 * Creates a new array if we run out of space. The caller will have to
    192 * manually deallocate the newly allocated array.
    193 * Internal method, status assumed to be success, caller has to check status
    194 * before calling this method. destination not to be NULL and has at least
    195 * size destinationlength.
    196 * @param destination target array
    197 * @param offset destination offset to add value
    198 * @param destinationlength target array size, return value for the new size
    199 * @param value to be added
    200 * @param increments incremental size expected
    201 * @param status output error if any, caller to check status before calling
    202 *               method, status assumed to be success when passed in.
    203 * @return new destination array, destination if there was no new allocation
    204 */
    205 static
    206 inline int32_t * addTouint32_tArray(int32_t    *destination,
    207                                     uint32_t    offset,
    208                                     uint32_t   *destinationlength,
    209                                     uint32_t    value,
    210                                     uint32_t    increments,
    211                                     UErrorCode *status)
    212 {
    213     uint32_t newlength = *destinationlength;
    214     if (offset + 1 == newlength) {
    215         newlength += increments;
    216         int32_t *temp = (int32_t *)allocateMemory(
    217                                          sizeof(int32_t) * newlength, status);
    218         if (U_FAILURE(*status)) {
    219             return NULL;
    220         }
    221         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
    222         *destinationlength = newlength;
    223         destination        = temp;
    224     }
    225     destination[offset] = value;
    226     return destination;
    227 }
    228 
    229 /**
    230 * Adds a uint64_t value to a destination array.
    231 * Creates a new array if we run out of space. The caller will have to
    232 * manually deallocate the newly allocated array.
    233 * Internal method, status assumed to be success, caller has to check status
    234 * before calling this method. destination not to be NULL and has at least
    235 * size destinationlength.
    236 * @param destination target array
    237 * @param offset destination offset to add value
    238 * @param destinationlength target array size, return value for the new size
    239 * @param value to be added
    240 * @param increments incremental size expected
    241 * @param status output error if any, caller to check status before calling
    242 *               method, status assumed to be success when passed in.
    243 * @return new destination array, destination if there was no new allocation
    244 */
    245 static
    246 inline int64_t * addTouint64_tArray(int64_t    *destination,
    247                                     uint32_t    offset,
    248                                     uint32_t   *destinationlength,
    249                                     uint64_t    value,
    250                                     uint32_t    increments,
    251                                     UErrorCode *status)
    252 {
    253     uint32_t newlength = *destinationlength;
    254     if (offset + 1 == newlength) {
    255         newlength += increments;
    256         int64_t *temp = (int64_t *)allocateMemory(
    257                                          sizeof(int64_t) * newlength, status);
    258 
    259         if (U_FAILURE(*status)) {
    260             return NULL;
    261         }
    262 
    263         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
    264         *destinationlength = newlength;
    265         destination        = temp;
    266     }
    267 
    268     destination[offset] = value;
    269 
    270     return destination;
    271 }
    272 
    273 /**
    274 * Initializing the ce table for a pattern.
    275 * Stores non-ignorable collation keys.
    276 * Table size will be estimated by the size of the pattern text. Table
    277 * expansion will be perform as we go along. Adding 1 to ensure that the table
    278 * size definitely increases.
    279 * Internal method, status assumed to be a success.
    280 * @param strsrch string search data
    281 * @param status output error if any, caller to check status before calling
    282 *               method, status assumed to be success when passed in.
    283 * @return total number of expansions
    284 */
    285 static
    286 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    287                                          UErrorCode    *status)
    288 {
    289     UPattern *pattern            = &(strsrch->pattern);
    290     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    291     int32_t  *cetable            = pattern->cesBuffer;
    292     uint32_t  patternlength      = pattern->textLength;
    293     UCollationElements *coleiter = strsrch->utilIter;
    294 
    295     if (coleiter == NULL) {
    296         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    297                                      patternlength, status);
    298         // status will be checked in ucol_next(..) later and if it is an
    299         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    300         // returned.
    301         strsrch->utilIter = coleiter;
    302     }
    303     else {
    304         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    305     }
    306     if(U_FAILURE(*status)) {
    307         return 0;
    308     }
    309 
    310     if (pattern->ces != cetable && pattern->ces) {
    311         uprv_free(pattern->ces);
    312     }
    313 
    314     uint16_t  offset      = 0;
    315     uint16_t  result      = 0;
    316     int32_t   ce;
    317 
    318     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    319            U_SUCCESS(*status)) {
    320         uint32_t newce = getCE(strsrch, ce);
    321         if (newce) {
    322             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    323                                   newce,
    324                                   patternlength - ucol_getOffset(coleiter) + 1,
    325                                   status);
    326             if (U_FAILURE(*status)) {
    327                 return 0;
    328             }
    329             offset ++;
    330             if (cetable != temp && cetable != pattern->cesBuffer) {
    331                 uprv_free(cetable);
    332             }
    333             cetable = temp;
    334         }
    335         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    336     }
    337 
    338     cetable[offset]   = 0;
    339     pattern->ces       = cetable;
    340     pattern->cesLength = offset;
    341 
    342     return result;
    343 }
    344 
    345 /**
    346 * Initializing the pce table for a pattern.
    347 * Stores non-ignorable collation keys.
    348 * Table size will be estimated by the size of the pattern text. Table
    349 * expansion will be perform as we go along. Adding 1 to ensure that the table
    350 * size definitely increases.
    351 * Internal method, status assumed to be a success.
    352 * @param strsrch string search data
    353 * @param status output error if any, caller to check status before calling
    354 *               method, status assumed to be success when passed in.
    355 * @return total number of expansions
    356 */
    357 static
    358 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    359                                           UErrorCode    *status)
    360 {
    361     UPattern *pattern            = &(strsrch->pattern);
    362     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    363     int64_t  *pcetable           = pattern->pcesBuffer;
    364     uint32_t  patternlength      = pattern->textLength;
    365     UCollationElements *coleiter = strsrch->utilIter;
    366 
    367     if (coleiter == NULL) {
    368         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    369                                      patternlength, status);
    370         // status will be checked in ucol_next(..) later and if it is an
    371         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    372         // returned.
    373         strsrch->utilIter = coleiter;
    374     } else {
    375         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    376     }
    377     if(U_FAILURE(*status)) {
    378         return 0;
    379     }
    380 
    381     if (pattern->pces != pcetable && pattern->pces != NULL) {
    382         uprv_free(pattern->pces);
    383     }
    384 
    385     uint16_t  offset = 0;
    386     uint16_t  result = 0;
    387     int64_t   pce;
    388 
    389     icu::UCollationPCE iter(coleiter);
    390 
    391     // ** Should processed CEs be signed or unsigned?
    392     // ** (the rest of the code in this file seems to play fast-and-loose with
    393     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    394     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    395            U_SUCCESS(*status)) {
    396         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    397                               pce,
    398                               patternlength - ucol_getOffset(coleiter) + 1,
    399                               status);
    400 
    401         if (U_FAILURE(*status)) {
    402             return 0;
    403         }
    404 
    405         offset += 1;
    406 
    407         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
    408             uprv_free(pcetable);
    409         }
    410 
    411         pcetable = temp;
    412         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    413     }
    414 
    415     pcetable[offset]   = 0;
    416     pattern->pces       = pcetable;
    417     pattern->pcesLength = offset;
    418 
    419     return result;
    420 }
    421 
    422 /**
    423 * Initializes the pattern struct.
    424 * Internal method, status assumed to be success.
    425 * @param strsrch UStringSearch data storage
    426 * @param status output error if any, caller to check status before calling
    427 *               method, status assumed to be success when passed in.
    428 * @return expansionsize the total expansion size of the pattern
    429 */
    430 static
    431 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    432 {
    433     if (U_FAILURE(*status)) { return 0; }
    434           UPattern   *pattern     = &(strsrch->pattern);
    435     const UChar      *patterntext = pattern->text;
    436           int32_t     length      = pattern->textLength;
    437           int32_t index       = 0;
    438 
    439     // Since the strength is primary, accents are ignored in the pattern.
    440     if (strsrch->strength == UCOL_PRIMARY) {
    441         pattern->hasPrefixAccents = 0;
    442         pattern->hasSuffixAccents = 0;
    443     } else {
    444         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    445                                                          SECOND_LAST_BYTE_SHIFT_;
    446         index = length;
    447         U16_BACK_1(patterntext, 0, index);
    448         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    449                                                                  LAST_BYTE_MASK_;
    450     }
    451 
    452     // ** HACK **
    453     if (strsrch->pattern.pces != NULL) {
    454         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
    455             uprv_free(strsrch->pattern.pces);
    456         }
    457 
    458         strsrch->pattern.pces = NULL;
    459     }
    460 
    461     // since intializePattern is an internal method status is a success.
    462     return initializePatternCETable(strsrch, status);
    463 }
    464 
    465 /**
    466 * Initializing shift tables, with the default values.
    467 * If a corresponding default value is 0, the shift table is not set.
    468 * @param shift table for forwards shift
    469 * @param backshift table for backwards shift
    470 * @param cetable table containing pattern ce
    471 * @param cesize size of the pattern ces
    472 * @param expansionsize total size of the expansions
    473 * @param defaultforward the default forward value
    474 * @param defaultbackward the default backward value
    475 */
    476 static
    477 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    478                           int32_t  *cetable, int32_t cesize,
    479                           int16_t   expansionsize,
    480                           int16_t   defaultforward,
    481                           int16_t   defaultbackward)
    482 {
    483     // estimate the value to shift. to do that we estimate the smallest
    484     // number of characters to give the relevant ces, ie approximately
    485     // the number of ces minus their expansion, since expansions can come
    486     // from a character.
    487     int32_t count;
    488     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    489         shift[count] = defaultforward;
    490     }
    491     cesize --; // down to the last index
    492     for (count = 0; count < cesize; count ++) {
    493         // number of ces from right of array to the count
    494         int temp = defaultforward - count - 1;
    495         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
    496     }
    497     shift[hash(cetable[cesize])] = 1;
    498     // for ignorables we just shift by one. see test examples.
    499     shift[hash(0)] = 1;
    500 
    501     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    502         backshift[count] = defaultbackward;
    503     }
    504     for (count = cesize; count > 0; count --) {
    505         // the original value count does not seem to work
    506         backshift[hash(cetable[count])] = count > expansionsize ?
    507                                           (int16_t)(count - expansionsize) : 1;
    508     }
    509     backshift[hash(cetable[0])] = 1;
    510     backshift[hash(0)] = 1;
    511 }
    512 
    513 /**
    514 * Building of the pattern collation element list and the boyer moore strsrch
    515 * table.
    516 * The canonical match will only be performed after the default match fails.
    517 * For both cases we need to remember the size of the composed and decomposed
    518 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    519 * a number of characters in the text and tries to match the pattern from that
    520 * offset, the shift value can not be too large in case we miss some
    521 * characters. To choose a right shift size, we estimate the NFC form of the
    522 * and use its size as a shift guide. The NFC form should be the small
    523 * possible representation of the pattern. Anyways, we'll err on the smaller
    524 * shift size. Hence the calculation for minlength.
    525 * Canonical match will be performed slightly differently. We'll split the
    526 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    527 * the first and last base character (MS), the ending accents (EA). Matches
    528 * will be done on MS first, and only when we match MS then some processing
    529 * will be required for the prefix and end accents in order to determine if
    530 * they match PA and EA. Hence the default shift values
    531 * for the canonical match will take the size of either end's accent into
    532 * consideration. Forwards search will take the end accents into consideration
    533 * for the default shift values and the backwards search will take the prefix
    534 * accents into consideration.
    535 * If pattern has no non-ignorable ce, we return a illegal argument error.
    536 * Internal method, status assumed to be success.
    537 * @param strsrch UStringSearch data storage
    538 * @param status  for output errors if it occurs, status is assumed to be a
    539 *                success when it is passed in.
    540 */
    541 static
    542 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    543 {
    544     int16_t expandlength  = initializePattern(strsrch, status);
    545     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
    546         UPattern *pattern = &strsrch->pattern;
    547         int32_t   cesize  = pattern->cesLength;
    548 
    549         int16_t minlength = cesize > expandlength
    550                             ? (int16_t)cesize - expandlength : 1;
    551         pattern->defaultShiftSize    = minlength;
    552         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
    553                       cesize, expandlength, minlength, minlength);
    554         return;
    555     }
    556     strsrch->pattern.defaultShiftSize = 0;
    557 }
    558 
    559 #if BOYER_MOORE
    560 /**
    561 * Check to make sure that the match length is at the end of the character by
    562 * using the breakiterator.
    563 * @param strsrch string search data
    564 * @param start target text start offset
    565 * @param end target text end offset
    566 */
    567 static
    568 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    569                                int32_t *end)
    570 {
    571 #if !UCONFIG_NO_BREAK_ITERATION
    572     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    573     if (breakiterator) {
    574         int32_t matchend = *end;
    575         //int32_t matchstart = *start;
    576 
    577         if (!ubrk_isBoundary(breakiterator, matchend)) {
    578             *end = ubrk_following(breakiterator, matchend);
    579         }
    580 
    581         /* Check the start of the matched text to make sure it doesn't have any accents
    582          * before it.  This code may not be necessary and so it is commented out */
    583         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    584             *start = ubrk_preceding(breakiterator, matchstart);
    585         }*/
    586     }
    587 #endif
    588 }
    589 
    590 /**
    591 * Determine whether the target text in UStringSearch bounded by the offset
    592 * start and end is one or more whole units of text as
    593 * determined by the breakiterator in UStringSearch.
    594 * @param strsrch string search data
    595 * @param start target text start offset
    596 * @param end target text end offset
    597 */
    598 static
    599 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    600                                int32_t    end)
    601 {
    602 #if !UCONFIG_NO_BREAK_ITERATION
    603     UBreakIterator *breakiterator = strsrch->search->breakIter;
    604     //TODO: Add here.
    605     if (breakiterator) {
    606         int32_t startindex = ubrk_first(breakiterator);
    607         int32_t endindex   = ubrk_last(breakiterator);
    608 
    609         // out-of-range indexes are never boundary positions
    610         if (start < startindex || start > endindex ||
    611             end < startindex || end > endindex) {
    612             return FALSE;
    613         }
    614         // otherwise, we can use following() on the position before the
    615         // specified one and return true of the position we get back is the
    616         // one the user specified
    617         UBool result = (start == startindex ||
    618                 ubrk_following(breakiterator, start - 1) == start) &&
    619                (end == endindex ||
    620                 ubrk_following(breakiterator, end - 1) == end);
    621         if (result) {
    622             // iterates the individual ces
    623                   UCollationElements *coleiter  = strsrch->utilIter;
    624             const UChar              *text      = strsrch->search->text +
    625                                                                       start;
    626                   UErrorCode          status    = U_ZERO_ERROR;
    627             ucol_setText(coleiter, text, end - start, &status);
    628             for (int32_t count = 0; count < strsrch->pattern.cesLength;
    629                  count ++) {
    630                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    631                 if (ce == UCOL_IGNORABLE) {
    632                     count --;
    633                     continue;
    634                 }
    635                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
    636                     return FALSE;
    637                 }
    638             }
    639             int32_t nextce = ucol_next(coleiter, &status);
    640             while (ucol_getOffset(coleiter) == (end - start)
    641                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    642                 nextce = ucol_next(coleiter, &status);
    643             }
    644             if (ucol_getOffset(coleiter) == (end - start)
    645                 && nextce != UCOL_NULLORDER) {
    646                 // extra collation elements at the end of the match
    647                 return FALSE;
    648             }
    649         }
    650         return result;
    651     }
    652 #endif
    653     return TRUE;
    654 }
    655 
    656 /**
    657 * Getting the next base character offset if current offset is an accent,
    658 * or the current offset if the current character contains a base character.
    659 * accents the following base character will be returned
    660 * @param text string
    661 * @param textoffset current offset
    662 * @param textlength length of text string
    663 * @return the next base character or the current offset
    664 *         if the current character is contains a base character.
    665 */
    666 static
    667 inline int32_t getNextBaseOffset(const UChar       *text,
    668                                            int32_t  textoffset,
    669                                            int32_t      textlength)
    670 {
    671     if (textoffset < textlength) {
    672         int32_t temp = textoffset;
    673         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    674             while (temp < textlength) {
    675                 int32_t result = temp;
    676                 if ((getFCD(text, &temp, textlength) >>
    677                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    678                     return result;
    679                 }
    680             }
    681             return textlength;
    682         }
    683     }
    684     return textoffset;
    685 }
    686 
    687 /**
    688 * Gets the next base character offset depending on the string search pattern
    689 * data
    690 * @param strsrch string search data
    691 * @param textoffset current offset, one offset away from the last character
    692 *                   to search for.
    693 * @return start index of the next base character or the current offset
    694 *         if the current character is contains a base character.
    695 */
    696 static
    697 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    698                                                   int32_t    textoffset)
    699 {
    700     int32_t textlength = strsrch->search->textLength;
    701     if (strsrch->pattern.hasSuffixAccents &&
    702         textoffset < textlength) {
    703               int32_t  temp       = textoffset;
    704         const UChar       *text       = strsrch->search->text;
    705         U16_BACK_1(text, 0, temp);
    706         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    707             return getNextBaseOffset(text, textoffset, textlength);
    708         }
    709     }
    710     return textoffset;
    711 }
    712 
    713 /**
    714 * Shifting the collation element iterator position forward to prepare for
    715 * a following match. If the last character is a unsafe character, we'll only
    716 * shift by 1 to capture contractions, normalization etc.
    717 * Internal method, status assumed to be success.
    718 * @param text strsrch string search data
    719 * @param textoffset start text position to do search
    720 * @param ce the text ce which failed the match.
    721 * @param patternceindex index of the ce within the pattern ce buffer which
    722 *        failed the match
    723 * @return final offset
    724 */
    725 static
    726 inline int32_t shiftForward(UStringSearch *strsrch,
    727                                 int32_t    textoffset,
    728                                 int32_t       ce,
    729                                 int32_t        patternceindex)
    730 {
    731     UPattern *pattern = &(strsrch->pattern);
    732     if (ce != UCOL_NULLORDER) {
    733         int32_t shift = pattern->shift[hash(ce)];
    734         // this is to adjust for characters in the middle of the
    735         // substring for matching that failed.
    736         int32_t adjust = pattern->cesLength - patternceindex;
    737         if (adjust > 1 && shift >= adjust) {
    738             shift -= adjust - 1;
    739         }
    740         textoffset += shift;
    741     }
    742     else {
    743         textoffset += pattern->defaultShiftSize;
    744     }
    745 
    746     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    747     // check for unsafe characters
    748     // * if it is the start or middle of a contraction: to be done after
    749     //   a initial match is found
    750     // * thai or lao base consonant character: similar to contraction
    751     // * high surrogate character: similar to contraction
    752     // * next character is a accent: shift to the next base character
    753     return textoffset;
    754 }
    755 #endif // #if BOYER_MOORE
    756 
    757 /**
    758 * sets match not found
    759 * @param strsrch string search data
    760 */
    761 static
    762 inline void setMatchNotFound(UStringSearch *strsrch)
    763 {
    764     // this method resets the match result regardless of the error status.
    765     strsrch->search->matchedIndex = USEARCH_DONE;
    766     strsrch->search->matchedLength = 0;
    767     if (strsrch->search->isForwardSearching) {
    768         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    769     }
    770     else {
    771         setColEIterOffset(strsrch->textIter, 0);
    772     }
    773 }
    774 
    775 #if BOYER_MOORE
    776 /**
    777 * Gets the offset to the next safe point in text.
    778 * ie. not the middle of a contraction, swappable characters or supplementary
    779 * characters.
    780 * @param collator collation sata
    781 * @param text string to work with
    782 * @param textoffset offset in string
    783 * @param textlength length of text string
    784 * @return offset to the next safe character
    785 */
    786 static
    787 inline int32_t getNextSafeOffset(const UCollator   *collator,
    788                                      const UChar       *text,
    789                                            int32_t  textoffset,
    790                                            int32_t      textlength)
    791 {
    792     int32_t result = textoffset; // first contraction character
    793     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    794         result ++;
    795     }
    796     return result;
    797 }
    798 
    799 /**
    800 * This checks for accents in the potential match started with a .
    801 * composite character.
    802 * This is really painful... we have to check that composite character do not
    803 * have any extra accents. We have to normalize the potential match and find
    804 * the immediate decomposed character before the match.
    805 * The first composite character would have been taken care of by the fcd
    806 * checks in checkForwardExactMatch.
    807 * This is the slow path after the fcd of the first character and
    808 * the last character has been checked by checkForwardExactMatch and we
    809 * determine that the potential match has extra non-ignorable preceding
    810 * ces.
    811 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    812 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    813 * Note here that accents checking are slow and cautioned in the API docs.
    814 * Internal method, status assumed to be a success, caller should check status
    815 * before calling this method
    816 * @param strsrch string search data
    817 * @param start index of the potential unfriendly composite character
    818 * @param end index of the potential unfriendly composite character
    819 * @param status output error status if any.
    820 * @return TRUE if there is non-ignorable accents before at the beginning
    821 *              of the match, FALSE otherwise.
    822 */
    823 
    824 static
    825 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    826                                    int32_t    end,
    827                                    UErrorCode    *status)
    828 {
    829     UBool result = FALSE;
    830     if (strsrch->pattern.hasPrefixAccents) {
    831               int32_t  length = end - start;
    832               int32_t  offset = 0;
    833         const UChar       *text   = strsrch->search->text + start;
    834 
    835         U16_FWD_1(text, offset, length);
    836         // we are only concerned with the first composite character
    837         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    838             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    839                                                        text, 0, length);
    840             if (safeoffset != length) {
    841                 safeoffset ++;
    842             }
    843             UChar   *norm = NULL;
    844             UChar    buffer[INITIAL_ARRAY_SIZE_];
    845             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    846                                             buffer, INITIAL_ARRAY_SIZE_,
    847                                             status);
    848             if (U_FAILURE(*status)) {
    849                 return FALSE;
    850             }
    851             if (size >= INITIAL_ARRAY_SIZE_) {
    852                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    853                                                status);
    854                 // if allocation failed, status will be set to
    855                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    856                 // checks for it.
    857                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    858                                        size, status);
    859                 if (U_FAILURE(*status) && norm != NULL) {
    860                     uprv_free(norm);
    861                     return FALSE;
    862                 }
    863             }
    864             else {
    865                 norm = buffer;
    866             }
    867 
    868             UCollationElements *coleiter  = strsrch->utilIter;
    869             ucol_setText(coleiter, norm, size, status);
    870             uint32_t            firstce   = strsrch->pattern.ces[0];
    871             UBool               ignorable = TRUE;
    872             uint32_t            ce        = UCOL_IGNORABLE;
    873             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    874                 offset = ucol_getOffset(coleiter);
    875                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    876                     ignorable = FALSE;
    877                 }
    878                 ce = ucol_next(coleiter, status);
    879             }
    880             UChar32 codepoint;
    881             U16_PREV(norm, 0, offset, codepoint);
    882             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    883 
    884             if (norm != buffer) {
    885                 uprv_free(norm);
    886             }
    887         }
    888     }
    889 
    890     return result;
    891 }
    892 
    893 /**
    894 * Used by exact matches, checks if there are accents before the match.
    895 * This is really painful... we have to check that composite characters at
    896 * the start of the matches have to not have any extra accents.
    897 * We check the FCD of the character first, if it starts with an accent and
    898 * the first pattern ce does not match the first ce of the character, we bail.
    899 * Otherwise we try normalizing the first composite
    900 * character and find the immediate decomposed character before the match to
    901 * see if it is an non-ignorable accent.
    902 * Now normalizing the first composite character is enough because we ensure
    903 * that when the match is passed in here with extra beginning ces, the
    904 * first or last ce that match has to occur within the first character.
    905 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    906 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    907 * Note here that accents checking are slow and cautioned in the API docs.
    908 * @param strsrch string search data
    909 * @param start offset
    910 * @param end offset
    911 * @return TRUE if there are accents on either side of the match,
    912 *         FALSE otherwise
    913 */
    914 static
    915 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    916                                   int32_t    end)
    917 {
    918     if (strsrch->pattern.hasPrefixAccents) {
    919         UCollationElements *coleiter  = strsrch->textIter;
    920         UErrorCode          status    = U_ZERO_ERROR;
    921         // we have been iterating forwards previously
    922         uint32_t            ignorable = TRUE;
    923         int32_t             firstce   = strsrch->pattern.ces[0];
    924 
    925         setColEIterOffset(coleiter, start);
    926         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    927         if (U_FAILURE(status)) {
    928             return TRUE;
    929         }
    930         while (ce != firstce) {
    931             if (ce != UCOL_IGNORABLE) {
    932                 ignorable = FALSE;
    933             }
    934             ce = getCE(strsrch, ucol_next(coleiter, &status));
    935             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    936                 return TRUE;
    937             }
    938         }
    939         if (!ignorable && inNormBuf(coleiter)) {
    940             // within normalization buffer, discontiguous handled here
    941             return TRUE;
    942         }
    943 
    944         // within text
    945         int32_t temp = start;
    946         // original code
    947         // accent = (getFCD(strsrch->search->text, &temp,
    948         //                  strsrch->search->textLength)
    949         //            >> SECOND_LAST_BYTE_SHIFT_);
    950         // however this code does not work well with VC7 .net in release mode.
    951         // maybe the inlines for getFCD combined with shifting has bugs in
    952         // VC7. anyways this is a work around.
    953         UBool accent = getFCD(strsrch->search->text, &temp,
    954                               strsrch->search->textLength) > 0xFF;
    955         if (!accent) {
    956             return checkExtraMatchAccents(strsrch, start, end, &status);
    957         }
    958         if (!ignorable) {
    959             return TRUE;
    960         }
    961         if (start > 0) {
    962             temp = start;
    963             U16_BACK_1(strsrch->search->text, 0, temp);
    964             if (getFCD(strsrch->search->text, &temp,
    965                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    966                 setColEIterOffset(coleiter, start);
    967                 ce = ucol_previous(coleiter, &status);
    968                 if (U_FAILURE(status) ||
    969                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    970                     return TRUE;
    971                 }
    972             }
    973         }
    974     }
    975 
    976     return FALSE;
    977 }
    978 
    979 /**
    980 * Used by exact matches, checks if there are accents bounding the match.
    981 * Note this is the initial boundary check. If the potential match
    982 * starts or ends with composite characters, the accents in those
    983 * characters will be determined later.
    984 * Not doing backwards iteration here, since discontiguos contraction for
    985 * backwards collation element iterator, use up too many characters.
    986 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
    987 * should fail since there is a acute at the end of \u01FA
    988 * Note here that accents checking are slow and cautioned in the API docs.
    989 * @param strsrch string search data
    990 * @param start offset of match
    991 * @param end end offset of the match
    992 * @return TRUE if there are accents on either side of the match,
    993 *         FALSE otherwise
    994 */
    995 static
    996 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
    997                                  int32_t    end)
    998 {
    999     if (strsrch->pattern.hasSuffixAccents) {
   1000         const UChar       *text       = strsrch->search->text;
   1001               int32_t  temp       = end;
   1002               int32_t      textlength = strsrch->search->textLength;
   1003         U16_BACK_1(text, 0, temp);
   1004         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1005             int32_t             firstce  = strsrch->pattern.ces[0];
   1006             UCollationElements *coleiter = strsrch->textIter;
   1007             UErrorCode          status   = U_ZERO_ERROR;
   1008             int32_t ce;
   1009             setColEIterOffset(coleiter, start);
   1010             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1011                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1012                     return TRUE;
   1013                 }
   1014             }
   1015             int32_t count = 1;
   1016             while (count < strsrch->pattern.cesLength) {
   1017                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1018                     == UCOL_IGNORABLE) {
   1019                     // Thai can give an ignorable here.
   1020                     count --;
   1021                 }
   1022                 if (U_FAILURE(status)) {
   1023                     return TRUE;
   1024                 }
   1025                 count ++;
   1026             }
   1027 
   1028             ce = ucol_next(coleiter, &status);
   1029             if (U_FAILURE(status)) {
   1030                 return TRUE;
   1031             }
   1032             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1033                 ce = getCE(strsrch, ce);
   1034             }
   1035             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1036                 if (ucol_getOffset(coleiter) <= end) {
   1037                     return TRUE;
   1038                 }
   1039                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1040                     return TRUE;
   1041                 }
   1042             }
   1043         }
   1044     }
   1045     return FALSE;
   1046 }
   1047 #endif // #if BOYER_MOORE
   1048 
   1049 /**
   1050 * Checks if the offset runs out of the text string
   1051 * @param offset
   1052 * @param textlength of the text string
   1053 * @return TRUE if offset is out of bounds, FALSE otherwise
   1054 */
   1055 static
   1056 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1057 {
   1058     return offset < 0 || offset > textlength;
   1059 }
   1060 
   1061 /**
   1062 * Checks for identical match
   1063 * @param strsrch string search data
   1064 * @param start offset of possible match
   1065 * @param end offset of possible match
   1066 * @return TRUE if identical match is found
   1067 */
   1068 static
   1069 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1070                                   int32_t    end)
   1071 {
   1072     if (strsrch->strength != UCOL_IDENTICAL) {
   1073         return TRUE;
   1074     }
   1075 
   1076     // Note: We could use Normalizer::compare() or similar, but for short strings
   1077     // which may not be in FCD it might be faster to just NFD them.
   1078     UErrorCode status = U_ZERO_ERROR;
   1079     UnicodeString t2, p2;
   1080     strsrch->nfd->normalize(
   1081         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
   1082     strsrch->nfd->normalize(
   1083         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
   1084     // return FALSE if NFD failed
   1085     return U_SUCCESS(status) && t2 == p2;
   1086 }
   1087 
   1088 #if BOYER_MOORE
   1089 /**
   1090 * Checks to see if the match is repeated
   1091 * @param strsrch string search data
   1092 * @param start new match start index
   1093 * @param end new match end index
   1094 * @return TRUE if the the match is repeated, FALSE otherwise
   1095 */
   1096 static
   1097 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1098                                 int32_t    start,
   1099                                 int32_t    end)
   1100 {
   1101     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1102     UBool       result;
   1103     if (lastmatchindex == USEARCH_DONE) {
   1104         return FALSE;
   1105     }
   1106     if (strsrch->search->isForwardSearching) {
   1107         result = start <= lastmatchindex;
   1108     }
   1109     else {
   1110         result = start >= lastmatchindex;
   1111     }
   1112     if (!result && !strsrch->search->isOverlap) {
   1113         if (strsrch->search->isForwardSearching) {
   1114             result = start < lastmatchindex + strsrch->search->matchedLength;
   1115         }
   1116         else {
   1117             result = end > lastmatchindex;
   1118         }
   1119     }
   1120     return result;
   1121 }
   1122 
   1123 /**
   1124 * Gets the collation element iterator's current offset.
   1125 * @param coleiter collation element iterator
   1126 * @param forwards flag TRUE if we are moving in th forwards direction
   1127 * @return current offset
   1128 */
   1129 static
   1130 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1131                                               UBool               forwards)
   1132 {
   1133     int32_t result = ucol_getOffset(coleiter);
   1134     // intricacies of the the backwards collation element iterator
   1135     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1136         result ++;
   1137     }
   1138     return result;
   1139 }
   1140 
   1141 /**
   1142 * Checks match for contraction.
   1143 * If the match ends with a partial contraction we fail.
   1144 * If the match starts too far off (because of backwards iteration) we try to
   1145 * chip off the extra characters depending on whether a breakiterator has
   1146 * been used.
   1147 * Internal method, error assumed to be success, caller has to check status
   1148 * before calling this method.
   1149 * @param strsrch string search data
   1150 * @param start offset of potential match, to be modified if necessary
   1151 * @param end offset of potential match, to be modified if necessary
   1152 * @param status output error status if any
   1153 * @return TRUE if match passes the contraction test, FALSE otherwise
   1154 */
   1155 
   1156 static
   1157 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1158                                      int32_t   *start,
   1159                                      int32_t   *end, UErrorCode  *status)
   1160 {
   1161           UCollationElements *coleiter   = strsrch->textIter;
   1162           int32_t             textlength = strsrch->search->textLength;
   1163           int32_t             temp       = *start;
   1164     const UCollator          *collator   = strsrch->collator;
   1165     const UChar              *text       = strsrch->search->text;
   1166     // This part checks if either ends of the match contains potential
   1167     // contraction. If so we'll have to iterate through them
   1168     // The start contraction needs to be checked since ucol_previous dumps
   1169     // all characters till the first safe character into the buffer.
   1170     // *start + 1 is used to test for the unsafe characters instead of *start
   1171     // because ucol_prev takes all unsafe characters till the first safe
   1172     // character ie *start. so by testing *start + 1, we can estimate if
   1173     // excess prefix characters has been included in the potential search
   1174     // results.
   1175     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1176         (*start + 1 < textlength
   1177          && ucol_unsafeCP(text[*start + 1], collator))) {
   1178         int32_t expansion  = getExpansionPrefix(coleiter);
   1179         UBool   expandflag = expansion > 0;
   1180         setColEIterOffset(coleiter, *start);
   1181         while (expansion > 0) {
   1182             // getting rid of the redundant ce, caused by setOffset.
   1183             // since backward contraction/expansion may have extra ces if we
   1184             // are in the normalization buffer, hasAccentsBeforeMatch would
   1185             // have taken care of it.
   1186             // E.g. the character \u01FA will have an expansion of 3, but if
   1187             // we are only looking for acute and ring \u030A and \u0301, we'll
   1188             // have to skip the first ce in the expansion buffer.
   1189             ucol_next(coleiter, status);
   1190             if (U_FAILURE(*status)) {
   1191                 return FALSE;
   1192             }
   1193             if (ucol_getOffset(coleiter) != temp) {
   1194                 *start = temp;
   1195                 temp  = ucol_getOffset(coleiter);
   1196             }
   1197             expansion --;
   1198         }
   1199 
   1200         int32_t  *patternce       = strsrch->pattern.ces;
   1201         int32_t   patterncelength = strsrch->pattern.cesLength;
   1202         int32_t   count           = 0;
   1203         while (count < patterncelength) {
   1204             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1205             if (ce == UCOL_IGNORABLE) {
   1206                 continue;
   1207             }
   1208             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1209                 *start = temp;
   1210                 temp   = ucol_getOffset(coleiter);
   1211             }
   1212             if (U_FAILURE(*status) || ce != patternce[count]) {
   1213                 (*end) ++;
   1214                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1215                 return FALSE;
   1216             }
   1217             count ++;
   1218         }
   1219     }
   1220     return TRUE;
   1221 }
   1222 
   1223 /**
   1224 * Checks and sets the match information if found.
   1225 * Checks
   1226 * <ul>
   1227 * <li> the potential match does not repeat the previous match
   1228 * <li> boundaries are correct
   1229 * <li> exact matches has no extra accents
   1230 * <li> identical matchesb
   1231 * <li> potential match does not end in the middle of a contraction
   1232 * <\ul>
   1233 * Otherwise the offset will be shifted to the next character.
   1234 * Internal method, status assumed to be success, caller has to check status
   1235 * before calling this method.
   1236 * @param strsrch string search data
   1237 * @param textoffset offset in the collation element text. the returned value
   1238 *        will be the truncated end offset of the match or the new start
   1239 *        search offset.
   1240 * @param status output error status if any
   1241 * @return TRUE if the match is valid, FALSE otherwise
   1242 */
   1243 static
   1244 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1245                                  int32_t   *textoffset, UErrorCode *status)
   1246 {
   1247     UCollationElements *coleiter = strsrch->textIter;
   1248     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1249 
   1250     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1251         return FALSE;
   1252     }
   1253 
   1254     // this totally matches, however we need to check if it is repeating
   1255     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1256         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1257         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1258         !checkIdentical(strsrch, start, *textoffset) ||
   1259         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1260 
   1261         (*textoffset) ++;
   1262         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1263         return FALSE;
   1264     }
   1265 
   1266     //Add breakiterator boundary check for primary strength search.
   1267     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1268         checkBreakBoundary(strsrch, &start, textoffset);
   1269     }
   1270 
   1271     // totally match, we will get rid of the ending ignorables.
   1272     strsrch->search->matchedIndex  = start;
   1273     strsrch->search->matchedLength = *textoffset - start;
   1274     return TRUE;
   1275 }
   1276 
   1277 /**
   1278 * Getting the previous base character offset, or the current offset if the
   1279 * current character is a base character
   1280 * @param text string
   1281 * @param textoffset one offset after the current character
   1282 * @return the offset of the next character after the base character or the first
   1283 *         composed character with accents
   1284 */
   1285 static
   1286 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1287                                                int32_t  textoffset)
   1288 {
   1289     if (textoffset > 0) {
   1290         for (;;) {
   1291             int32_t result = textoffset;
   1292             U16_BACK_1(text, 0, textoffset);
   1293             int32_t temp = textoffset;
   1294             uint16_t fcd = getFCD(text, &temp, result);
   1295             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1296                 if (fcd & LAST_BYTE_MASK_) {
   1297                     return textoffset;
   1298                 }
   1299                 return result;
   1300             }
   1301             if (textoffset == 0) {
   1302                 return 0;
   1303             }
   1304         }
   1305     }
   1306     return textoffset;
   1307 }
   1308 
   1309 /**
   1310 * Getting the indexes of the accents that are not blocked in the argument
   1311 * accent array
   1312 * @param accents array of accents in nfd terminated by a 0.
   1313 * @param accentsindex array of indexes of the accents that are not blocked
   1314 */
   1315 static
   1316 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1317 {
   1318     int32_t index     = 0;
   1319     int32_t     length    = u_strlen(accents);
   1320     UChar32     codepoint = 0;
   1321     int         cclass    = 0;
   1322     int         result    = 0;
   1323     int32_t temp;
   1324     while (index < length) {
   1325         temp = index;
   1326         U16_NEXT(accents, index, length, codepoint);
   1327         if (u_getCombiningClass(codepoint) != cclass) {
   1328             cclass        = u_getCombiningClass(codepoint);
   1329             accentsindex[result] = temp;
   1330             result ++;
   1331         }
   1332     }
   1333     accentsindex[result] = length;
   1334     return result;
   1335 }
   1336 
   1337 /**
   1338 * Appends 3 UChar arrays to a destination array.
   1339 * Creates a new array if we run out of space. The caller will have to
   1340 * manually deallocate the newly allocated array.
   1341 * Internal method, status assumed to be success, caller has to check status
   1342 * before calling this method. destination not to be NULL and has at least
   1343 * size destinationlength.
   1344 * @param destination target array
   1345 * @param destinationlength target array size, returning the appended length
   1346 * @param source1 null-terminated first array
   1347 * @param source2 second array
   1348 * @param source2length length of seond array
   1349 * @param source3 null-terminated third array
   1350 * @param status error status if any
   1351 * @return new destination array, destination if there was no new allocation
   1352 */
   1353 static
   1354 inline UChar * addToUCharArray(      UChar      *destination,
   1355                                      int32_t    *destinationlength,
   1356                                const UChar      *source1,
   1357                                const UChar      *source2,
   1358                                      int32_t     source2length,
   1359                                const UChar      *source3,
   1360                                      UErrorCode *status)
   1361 {
   1362     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1363     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1364     if (*destinationlength < source1length + source2length + source3length +
   1365                                                                            1)
   1366     {
   1367         destination = (UChar *)allocateMemory(
   1368           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1369           status);
   1370         // if error allocating memory, status will be
   1371         // U_MEMORY_ALLOCATION_ERROR
   1372         if (U_FAILURE(*status)) {
   1373             *destinationlength = 0;
   1374             return NULL;
   1375         }
   1376     }
   1377     if (source1length != 0) {
   1378         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
   1379     }
   1380     if (source2length != 0) {
   1381         uprv_memcpy(destination + source1length, source2,
   1382                     sizeof(UChar) * source2length);
   1383     }
   1384     if (source3length != 0) {
   1385         uprv_memcpy(destination + source1length + source2length, source3,
   1386                     sizeof(UChar) * source3length);
   1387     }
   1388     *destinationlength = source1length + source2length + source3length;
   1389     return destination;
   1390 }
   1391 
   1392 /**
   1393 * Running through a collation element iterator to see if the contents matches
   1394 * pattern in string search data
   1395 * @param strsrch string search data
   1396 * @param coleiter collation element iterator
   1397 * @return TRUE if a match if found, FALSE otherwise
   1398 */
   1399 static
   1400 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1401                                        UCollationElements *coleiter)
   1402 {
   1403     int         patternceindex = strsrch->pattern.cesLength;
   1404     int32_t    *patternce      = strsrch->pattern.ces;
   1405     UErrorCode  status = U_ZERO_ERROR;
   1406     while (patternceindex > 0) {
   1407         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1408         if (ce == UCOL_IGNORABLE) {
   1409             continue;
   1410         }
   1411         if (U_FAILURE(status) || ce != *patternce) {
   1412             return FALSE;
   1413         }
   1414         patternce ++;
   1415         patternceindex --;
   1416     }
   1417     return TRUE;
   1418 }
   1419 
   1420 /**
   1421 * Rearranges the front accents to try matching.
   1422 * Prefix accents in the text will be grouped according to their combining
   1423 * class and the groups will be mixed and matched to try find the perfect
   1424 * match with the pattern.
   1425 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1426 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1427 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1428 *         "\u0301\u0325".
   1429 * step 2: check if any of the generated substrings matches the pattern.
   1430 * Internal method, status is assumed to be success, caller has to check status
   1431 * before calling this method.
   1432 * @param strsrch string search match
   1433 * @param start first offset of the accents to start searching
   1434 * @param end start of the last accent set
   1435 * @param status output error status if any
   1436 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1437 *         offset of the match. Note this start includes all preceding accents.
   1438 */
   1439 static
   1440 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1441                                        int32_t    start,
   1442                                        int32_t    end,
   1443                                        UErrorCode    *status)
   1444 {
   1445     const UChar       *text       = strsrch->search->text;
   1446           int32_t      textlength = strsrch->search->textLength;
   1447           int32_t  tempstart  = start;
   1448 
   1449     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1450         // die... failed at a base character
   1451         return USEARCH_DONE;
   1452     }
   1453 
   1454     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1455     start = getPreviousBaseOffset(text, tempstart);
   1456 
   1457     UChar       accents[INITIAL_ARRAY_SIZE_];
   1458     // normalizing the offensive string
   1459     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1460                     INITIAL_ARRAY_SIZE_, status);
   1461     if (U_FAILURE(*status)) {
   1462         return USEARCH_DONE;
   1463     }
   1464 
   1465     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1466     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1467                                                                  accentsindex);
   1468     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1469     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1470     UCollationElements *coleiter   = strsrch->utilIter;
   1471     while (U_SUCCESS(*status) && count > 0) {
   1472         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1473         // copy the base characters
   1474         for (int k = 0; k < accentsindex[0]; k ++) {
   1475             *rearrange ++ = accents[k];
   1476         }
   1477         // forming all possible canonical rearrangement by dropping
   1478         // sets of accents
   1479         for (int i = 0; i <= accentsize - 1; i ++) {
   1480             int32_t mask = 1 << (accentsize - i - 1);
   1481             if (count & mask) {
   1482                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1483                     *rearrange ++ = accents[j];
   1484                 }
   1485             }
   1486         }
   1487         *rearrange = 0;
   1488         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1489         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1490                                            strsrch->canonicalPrefixAccents,
   1491                                            strsrch->search->text + offset,
   1492                                            end - offset,
   1493                                            strsrch->canonicalSuffixAccents,
   1494                                            status);
   1495 
   1496         // if status is a failure, ucol_setText does nothing.
   1497         // run the collator iterator through this match
   1498         ucol_setText(coleiter, match, matchsize, status);
   1499         if (U_SUCCESS(*status)) {
   1500             if (checkCollationMatch(strsrch, coleiter)) {
   1501                 if (match != buffer) {
   1502                     uprv_free(match);
   1503                 }
   1504                 return start;
   1505             }
   1506         }
   1507         count --;
   1508     }
   1509     return USEARCH_DONE;
   1510 }
   1511 
   1512 /**
   1513 * Gets the offset to the safe point in text before textoffset.
   1514 * ie. not the middle of a contraction, swappable characters or supplementary
   1515 * characters.
   1516 * @param collator collation sata
   1517 * @param text string to work with
   1518 * @param textoffset offset in string
   1519 * @param textlength length of text string
   1520 * @return offset to the previous safe character
   1521 */
   1522 static
   1523 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1524                                       const UChar       *text,
   1525                                             int32_t  textoffset)
   1526 {
   1527     int32_t result = textoffset; // first contraction character
   1528     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1529         result --;
   1530     }
   1531     if (result != 0) {
   1532         // the first contraction character is consider unsafe here
   1533         result --;
   1534     }
   1535     return result;
   1536 }
   1537 
   1538 /**
   1539 * Cleaning up after we passed the safe zone
   1540 * @param strsrch string search data
   1541 * @param safetext safe text array
   1542 * @param safebuffer safe text buffer
   1543 * @param coleiter collation element iterator for safe text
   1544 */
   1545 static
   1546 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1547                                   UChar         *safebuffer)
   1548 {
   1549     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1550     {
   1551        uprv_free(safetext);
   1552     }
   1553 }
   1554 
   1555 /**
   1556 * Take the rearranged end accents and tries matching. If match failed at
   1557 * a seperate preceding set of accents (seperated from the rearranged on by
   1558 * at least a base character) then we rearrange the preceding accents and
   1559 * tries matching again.
   1560 * We allow skipping of the ends of the accent set if the ces do not match.
   1561 * However if the failure is found before the accent set, it fails.
   1562 * Internal method, status assumed to be success, caller has to check status
   1563 * before calling this method.
   1564 * @param strsrch string search data
   1565 * @param textoffset of the start of the rearranged accent
   1566 * @param status output error status if any
   1567 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1568 *         offset of the match. Note this start includes all preceding accents.
   1569 */
   1570 static
   1571 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1572                                        int32_t    textoffset,
   1573                                        UErrorCode    *status)
   1574 {
   1575     const UChar              *text           = strsrch->search->text;
   1576     const UCollator          *collator       = strsrch->collator;
   1577           int32_t             safelength     = 0;
   1578           UChar              *safetext;
   1579           int32_t             safetextlength;
   1580           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1581           UCollationElements *coleiter       = strsrch->utilIter;
   1582           int32_t         safeoffset     = textoffset;
   1583 
   1584     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1585                                          collator)) {
   1586         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1587         safelength     = textoffset - safeoffset;
   1588         safetextlength = INITIAL_ARRAY_SIZE_;
   1589         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1590                                          text + safeoffset, safelength,
   1591                                          strsrch->canonicalSuffixAccents,
   1592                                          status);
   1593     }
   1594     else {
   1595         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1596         safetext       = strsrch->canonicalSuffixAccents;
   1597     }
   1598 
   1599     // if status is a failure, ucol_setText does nothing
   1600     ucol_setText(coleiter, safetext, safetextlength, status);
   1601     // status checked in loop below
   1602 
   1603     int32_t  *ce        = strsrch->pattern.ces;
   1604     int32_t   celength  = strsrch->pattern.cesLength;
   1605     int       ceindex   = celength - 1;
   1606     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1607 
   1608     while (ceindex >= 0) {
   1609         int32_t textce = ucol_previous(coleiter, status);
   1610         if (U_FAILURE(*status)) {
   1611             if (isSafe) {
   1612                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1613             }
   1614             return USEARCH_DONE;
   1615         }
   1616         if (textce == UCOL_NULLORDER) {
   1617             // check if we have passed the safe buffer
   1618             if (coleiter == strsrch->textIter) {
   1619                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1620                 return USEARCH_DONE;
   1621             }
   1622             cleanUpSafeText(strsrch, safetext, safebuffer);
   1623             safetext = safebuffer;
   1624             coleiter = strsrch->textIter;
   1625             setColEIterOffset(coleiter, safeoffset);
   1626             // status checked at the start of the loop
   1627             isSafe = FALSE;
   1628             continue;
   1629         }
   1630         textce = getCE(strsrch, textce);
   1631         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1632             // do the beginning stuff
   1633             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1634             if (isSafe && failedoffset >= safelength) {
   1635                 // alas... no hope. failed at rearranged accent set
   1636                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1637                 return USEARCH_DONE;
   1638             }
   1639             else {
   1640                 if (isSafe) {
   1641                     failedoffset += safeoffset;
   1642                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1643                 }
   1644 
   1645                 // try rearranging the front accents
   1646                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1647                                         failedoffset, textoffset, status);
   1648                 if (result != USEARCH_DONE) {
   1649                     // if status is a failure, ucol_setOffset does nothing
   1650                     setColEIterOffset(strsrch->textIter, result);
   1651                 }
   1652                 if (U_FAILURE(*status)) {
   1653                     return USEARCH_DONE;
   1654                 }
   1655                 return result;
   1656             }
   1657         }
   1658         if (textce == ce[ceindex]) {
   1659             ceindex --;
   1660         }
   1661     }
   1662     // set offset here
   1663     if (isSafe) {
   1664         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1665         // sets the text iterator here with the correct expansion and offset
   1666         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1667         cleanUpSafeText(strsrch, safetext, safebuffer);
   1668         if (result >= safelength) {
   1669             result = textoffset;
   1670         }
   1671         else {
   1672             result += safeoffset;
   1673         }
   1674         setColEIterOffset(strsrch->textIter, result);
   1675         strsrch->textIter->iteratordata_.toReturn =
   1676                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1677         return result;
   1678     }
   1679 
   1680     return ucol_getOffset(coleiter);
   1681 }
   1682 
   1683 /**
   1684 * Trying out the substring and sees if it can be a canonical match.
   1685 * This will try normalizing the end accents and arranging them into canonical
   1686 * equivalents and check their corresponding ces with the pattern ce.
   1687 * Suffix accents in the text will be grouped according to their combining
   1688 * class and the groups will be mixed and matched to try find the perfect
   1689 * match with the pattern.
   1690 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1691 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1692 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1693 *         "\u0301\u0325".
   1694 * step 2: check if any of the generated substrings matches the pattern.
   1695 * Internal method, status assumed to be success, caller has to check status
   1696 * before calling this method.
   1697 * @param strsrch string search data
   1698 * @param textoffset end offset in the collation element text that ends with
   1699 *                   the accents to be rearranged
   1700 * @param status error status if any
   1701 * @return TRUE if the match is valid, FALSE otherwise
   1702 */
   1703 static
   1704 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1705                            int32_t    textoffset,
   1706                            UErrorCode    *status)
   1707 {
   1708     const UChar       *text = strsrch->search->text;
   1709           int32_t  temp = textoffset;
   1710     U16_BACK_1(text, 0, temp);
   1711     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1712         UCollationElements *coleiter = strsrch->textIter;
   1713         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1714         if (strsrch->pattern.hasPrefixAccents) {
   1715             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1716                                                 status);
   1717             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1718                 setColEIterOffset(coleiter, offset);
   1719                 return TRUE;
   1720             }
   1721         }
   1722         return FALSE;
   1723     }
   1724 
   1725     if (!strsrch->pattern.hasSuffixAccents) {
   1726         return FALSE;
   1727     }
   1728 
   1729     UChar       accents[INITIAL_ARRAY_SIZE_];
   1730     // offset to the last base character in substring to search
   1731     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1732     // normalizing the offensive string
   1733     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1734                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1735     // status checked in loop below
   1736 
   1737     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1738     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1739 
   1740     // 2 power n - 1 plus the full set of accents
   1741     int32_t  count = (2 << (size - 1)) - 1;
   1742     while (U_SUCCESS(*status) && count > 0) {
   1743         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1744         // copy the base characters
   1745         for (int k = 0; k < accentsindex[0]; k ++) {
   1746             *rearrange ++ = accents[k];
   1747         }
   1748         // forming all possible canonical rearrangement by dropping
   1749         // sets of accents
   1750         for (int i = 0; i <= size - 1; i ++) {
   1751             int32_t mask = 1 << (size - i - 1);
   1752             if (count & mask) {
   1753                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1754                     *rearrange ++ = accents[j];
   1755                 }
   1756             }
   1757         }
   1758         *rearrange = 0;
   1759         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1760                                                         status);
   1761         if (offset != USEARCH_DONE) {
   1762             return TRUE; // match found
   1763         }
   1764         count --;
   1765     }
   1766     return FALSE;
   1767 }
   1768 
   1769 /**
   1770 * Gets the previous base character offset depending on the string search
   1771 * pattern data
   1772 * @param strsrch string search data
   1773 * @param textoffset current offset, current character
   1774 * @return the offset of the next character after this base character or itself
   1775 *         if it is a composed character with accents
   1776 */
   1777 static
   1778 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1779                                                       int32_t textoffset)
   1780 {
   1781     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1782         const UChar       *text = strsrch->search->text;
   1783               int32_t  offset = textoffset;
   1784         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1785                                                    SECOND_LAST_BYTE_SHIFT_) {
   1786             return getPreviousBaseOffset(text, textoffset);
   1787         }
   1788     }
   1789     return textoffset;
   1790 }
   1791 
   1792 /**
   1793 * Checks match for contraction.
   1794 * If the match ends with a partial contraction we fail.
   1795 * If the match starts too far off (because of backwards iteration) we try to
   1796 * chip off the extra characters
   1797 * Internal method, status assumed to be success, caller has to check status
   1798 * before calling this method.
   1799 * @param strsrch string search data
   1800 * @param start offset of potential match, to be modified if necessary
   1801 * @param end offset of potential match, to be modified if necessary
   1802 * @param status output error status if any
   1803 * @return TRUE if match passes the contraction test, FALSE otherwise
   1804 */
   1805 static
   1806 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1807                                          int32_t   *start,
   1808                                          int32_t   *end,
   1809                                          UErrorCode    *status)
   1810 {
   1811           UCollationElements *coleiter   = strsrch->textIter;
   1812           int32_t             textlength = strsrch->search->textLength;
   1813           int32_t         temp       = *start;
   1814     const UCollator          *collator   = strsrch->collator;
   1815     const UChar              *text       = strsrch->search->text;
   1816     // This part checks if either ends of the match contains potential
   1817     // contraction. If so we'll have to iterate through them
   1818     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1819         (*start + 1 < textlength
   1820          && ucol_unsafeCP(text[*start + 1], collator))) {
   1821         int32_t expansion  = getExpansionPrefix(coleiter);
   1822         UBool   expandflag = expansion > 0;
   1823         setColEIterOffset(coleiter, *start);
   1824         while (expansion > 0) {
   1825             // getting rid of the redundant ce, caused by setOffset.
   1826             // since backward contraction/expansion may have extra ces if we
   1827             // are in the normalization buffer, hasAccentsBeforeMatch would
   1828             // have taken care of it.
   1829             // E.g. the character \u01FA will have an expansion of 3, but if
   1830             // we are only looking for acute and ring \u030A and \u0301, we'll
   1831             // have to skip the first ce in the expansion buffer.
   1832             ucol_next(coleiter, status);
   1833             if (U_FAILURE(*status)) {
   1834                 return FALSE;
   1835             }
   1836             if (ucol_getOffset(coleiter) != temp) {
   1837                 *start = temp;
   1838                 temp  = ucol_getOffset(coleiter);
   1839             }
   1840             expansion --;
   1841         }
   1842 
   1843         int32_t  *patternce       = strsrch->pattern.ces;
   1844         int32_t   patterncelength = strsrch->pattern.cesLength;
   1845         int32_t   count           = 0;
   1846         int32_t   textlength      = strsrch->search->textLength;
   1847         while (count < patterncelength) {
   1848             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1849             // status checked below, note that if status is a failure
   1850             // ucol_next returns UCOL_NULLORDER
   1851             if (ce == UCOL_IGNORABLE) {
   1852                 continue;
   1853             }
   1854             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1855                 *start = temp;
   1856                 temp   = ucol_getOffset(coleiter);
   1857             }
   1858 
   1859             if (count == 0 && ce != patternce[0]) {
   1860                 // accents may have extra starting ces, this occurs when a
   1861                 // pure accent pattern is matched without rearrangement
   1862                 // text \u0325\u0300 and looking for \u0300
   1863                 int32_t expected = patternce[0];
   1864                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1865                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1866                     while (U_SUCCESS(*status) && ce != expected &&
   1867                            ce != UCOL_NULLORDER &&
   1868                            ucol_getOffset(coleiter) <= *end) {
   1869                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1870                     }
   1871                 }
   1872             }
   1873             if (U_FAILURE(*status) || ce != patternce[count]) {
   1874                 (*end) ++;
   1875                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1876                 return FALSE;
   1877             }
   1878             count ++;
   1879         }
   1880     }
   1881     return TRUE;
   1882 }
   1883 
   1884 /**
   1885 * Checks and sets the match information if found.
   1886 * Checks
   1887 * <ul>
   1888 * <li> the potential match does not repeat the previous match
   1889 * <li> boundaries are correct
   1890 * <li> potential match does not end in the middle of a contraction
   1891 * <li> identical matches
   1892 * <\ul>
   1893 * Otherwise the offset will be shifted to the next character.
   1894 * Internal method, status assumed to be success, caller has to check the
   1895 * status before calling this method.
   1896 * @param strsrch string search data
   1897 * @param textoffset offset in the collation element text. the returned value
   1898 *        will be the truncated end offset of the match or the new start
   1899 *        search offset.
   1900 * @param status output error status if any
   1901 * @return TRUE if the match is valid, FALSE otherwise
   1902 */
   1903 static
   1904 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1905                                      int32_t   *textoffset,
   1906                                      UErrorCode    *status)
   1907 {
   1908     // to ensure that the start and ends are not composite characters
   1909     UCollationElements *coleiter = strsrch->textIter;
   1910     // if we have a canonical accent match
   1911     if ((strsrch->pattern.hasSuffixAccents &&
   1912         strsrch->canonicalSuffixAccents[0]) ||
   1913         (strsrch->pattern.hasPrefixAccents &&
   1914         strsrch->canonicalPrefixAccents[0])) {
   1915         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1916                                                     strsrch,
   1917                                                     ucol_getOffset(coleiter));
   1918         strsrch->search->matchedLength = *textoffset -
   1919                                                 strsrch->search->matchedIndex;
   1920         return TRUE;
   1921     }
   1922 
   1923     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1924     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1925                                             status) || U_FAILURE(*status)) {
   1926         return FALSE;
   1927     }
   1928 
   1929     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1930     // this totally matches, however we need to check if it is repeating
   1931     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1932         !isBreakUnit(strsrch, start, *textoffset) ||
   1933         !checkIdentical(strsrch, start, *textoffset)) {
   1934         (*textoffset) ++;
   1935         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1936                                         strsrch->search->textLength);
   1937         return FALSE;
   1938     }
   1939 
   1940     strsrch->search->matchedIndex  = start;
   1941     strsrch->search->matchedLength = *textoffset - start;
   1942     return TRUE;
   1943 }
   1944 
   1945 /**
   1946 * Shifting the collation element iterator position forward to prepare for
   1947 * a preceding match. If the first character is a unsafe character, we'll only
   1948 * shift by 1 to capture contractions, normalization etc.
   1949 * Internal method, status assumed to be success, caller has to check status
   1950 * before calling this method.
   1951 * @param text strsrch string search data
   1952 * @param textoffset start text position to do search
   1953 * @param ce the text ce which failed the match.
   1954 * @param patternceindex index of the ce within the pattern ce buffer which
   1955 *        failed the match
   1956 * @return final offset
   1957 */
   1958 static
   1959 inline int32_t reverseShift(UStringSearch *strsrch,
   1960                                 int32_t    textoffset,
   1961                                 int32_t       ce,
   1962                                 int32_t        patternceindex)
   1963 {
   1964     if (strsrch->search->isOverlap) {
   1965         if (textoffset != strsrch->search->textLength) {
   1966             textoffset --;
   1967         }
   1968         else {
   1969             textoffset -= strsrch->pattern.defaultShiftSize;
   1970         }
   1971     }
   1972     else {
   1973         if (ce != UCOL_NULLORDER) {
   1974             int32_t shift = strsrch->pattern.backShift[hash(ce)];
   1975 
   1976             // this is to adjust for characters in the middle of the substring
   1977             // for matching that failed.
   1978             int32_t adjust = patternceindex;
   1979             if (adjust > 1 && shift > adjust) {
   1980                 shift -= adjust - 1;
   1981             }
   1982             textoffset -= shift;
   1983         }
   1984         else {
   1985             textoffset -= strsrch->pattern.defaultShiftSize;
   1986         }
   1987     }
   1988     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   1989     return textoffset;
   1990 }
   1991 
   1992 /**
   1993 * Checks match for contraction.
   1994 * If the match starts with a partial contraction we fail.
   1995 * Internal method, status assumed to be success, caller has to check status
   1996 * before calling this method.
   1997 * @param strsrch string search data
   1998 * @param start offset of potential match, to be modified if necessary
   1999 * @param end offset of potential match, to be modified if necessary
   2000 * @param status output error status if any
   2001 * @return TRUE if match passes the contraction test, FALSE otherwise
   2002 */
   2003 static
   2004 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2005                                      int32_t   *start,
   2006                                      int32_t   *end, UErrorCode  *status)
   2007 {
   2008           UCollationElements *coleiter   = strsrch->textIter;
   2009           int32_t             textlength = strsrch->search->textLength;
   2010           int32_t             temp       = *end;
   2011     const UCollator          *collator   = strsrch->collator;
   2012     const UChar              *text       = strsrch->search->text;
   2013     // This part checks if either if the start of the match contains potential
   2014     // contraction. If so we'll have to iterate through them
   2015     // Since we used ucol_next while previously looking for the potential
   2016     // match, this guarantees that our end will not be a partial contraction,
   2017     // or a partial supplementary character.
   2018     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2019         int32_t expansion  = getExpansionSuffix(coleiter);
   2020         UBool   expandflag = expansion > 0;
   2021         setColEIterOffset(coleiter, *end);
   2022         while (U_SUCCESS(*status) && expansion > 0) {
   2023             // getting rid of the redundant ce
   2024             // since forward contraction/expansion may have extra ces
   2025             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2026             // would have taken care of it.
   2027             // E.g. the character \u01FA will have an expansion of 3, but if
   2028             // we are only looking for A ring A\u030A, we'll have to skip the
   2029             // last ce in the expansion buffer
   2030             ucol_previous(coleiter, status);
   2031             if (U_FAILURE(*status)) {
   2032                 return FALSE;
   2033             }
   2034             if (ucol_getOffset(coleiter) != temp) {
   2035                 *end = temp;
   2036                 temp  = ucol_getOffset(coleiter);
   2037             }
   2038             expansion --;
   2039         }
   2040 
   2041         int32_t  *patternce       = strsrch->pattern.ces;
   2042         int32_t   patterncelength = strsrch->pattern.cesLength;
   2043         int32_t   count           = patterncelength;
   2044         while (count > 0) {
   2045             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2046             // status checked below, note that if status is a failure
   2047             // ucol_previous returns UCOL_NULLORDER
   2048             if (ce == UCOL_IGNORABLE) {
   2049                 continue;
   2050             }
   2051             if (expandflag && count == 0 &&
   2052                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2053                 *end = temp;
   2054                 temp  = ucol_getOffset(coleiter);
   2055             }
   2056             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2057                 (*start) --;
   2058                 *start = getPreviousBaseOffset(text, *start);
   2059                 return FALSE;
   2060             }
   2061             count --;
   2062         }
   2063     }
   2064     return TRUE;
   2065 }
   2066 
   2067 /**
   2068 * Checks and sets the match information if found.
   2069 * Checks
   2070 * <ul>
   2071 * <li> the current match does not repeat the last match
   2072 * <li> boundaries are correct
   2073 * <li> exact matches has no extra accents
   2074 * <li> identical matches
   2075 * <\ul>
   2076 * Otherwise the offset will be shifted to the preceding character.
   2077 * Internal method, status assumed to be success, caller has to check status
   2078 * before calling this method.
   2079 * @param strsrch string search data
   2080 * @param collator
   2081 * @param coleiter collation element iterator
   2082 * @param text string
   2083 * @param textoffset offset in the collation element text. the returned value
   2084 *        will be the truncated start offset of the match or the new start
   2085 *        search offset.
   2086 * @param status output error status if any
   2087 * @return TRUE if the match is valid, FALSE otherwise
   2088 */
   2089 static
   2090 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2091                                      int32_t   *textoffset,
   2092                                      UErrorCode    *status)
   2093 {
   2094     // to ensure that the start and ends are not composite characters
   2095     int32_t end = ucol_getOffset(strsrch->textIter);
   2096     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2097         || U_FAILURE(*status)) {
   2098             return FALSE;
   2099     }
   2100 
   2101     // this totally matches, however we need to check if it is repeating
   2102     // the old match
   2103     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2104         !isBreakUnit(strsrch, *textoffset, end) ||
   2105         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2106         !checkIdentical(strsrch, *textoffset, end) ||
   2107         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2108         (*textoffset) --;
   2109         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2110                                             *textoffset);
   2111         return FALSE;
   2112     }
   2113 
   2114     //Add breakiterator boundary check for primary strength search.
   2115     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2116         checkBreakBoundary(strsrch, textoffset, &end);
   2117     }
   2118 
   2119     strsrch->search->matchedIndex = *textoffset;
   2120     strsrch->search->matchedLength = end - *textoffset;
   2121     return TRUE;
   2122 }
   2123 
   2124 /**
   2125 * Rearranges the end accents to try matching.
   2126 * Suffix accents in the text will be grouped according to their combining
   2127 * class and the groups will be mixed and matched to try find the perfect
   2128 * match with the pattern.
   2129 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2130 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2131 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2132 *         "\u0301\u0325".
   2133 * step 2: check if any of the generated substrings matches the pattern.
   2134 * Internal method, status assumed to be success, user has to check status
   2135 * before calling this method.
   2136 * @param strsrch string search match
   2137 * @param start offset of the first base character
   2138 * @param end start of the last accent set
   2139 * @param status only error status if any
   2140 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2141 *         offset of the match. Note this start includes all following accents.
   2142 */
   2143 static
   2144 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2145                                            int32_t    start,
   2146                                            int32_t    end,
   2147                                            UErrorCode    *status)
   2148 {
   2149     const UChar       *text       = strsrch->search->text;
   2150           int32_t  tempend    = end;
   2151 
   2152     U16_BACK_1(text, 0, tempend);
   2153     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2154                                                            LAST_BYTE_MASK_)) {
   2155         // die... failed at a base character
   2156         return USEARCH_DONE;
   2157     }
   2158     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2159 
   2160     if (U_SUCCESS(*status)) {
   2161         UChar       accents[INITIAL_ARRAY_SIZE_];
   2162         int32_t offset = getPreviousBaseOffset(text, end);
   2163         // normalizing the offensive string
   2164         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2165                         INITIAL_ARRAY_SIZE_, status);
   2166 
   2167         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2168         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2169                                                          accentsindex);
   2170         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2171         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2172         UCollationElements *coleiter = strsrch->utilIter;
   2173         while (U_SUCCESS(*status) && count > 0) {
   2174             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2175             // copy the base characters
   2176             for (int k = 0; k < accentsindex[0]; k ++) {
   2177                 *rearrange ++ = accents[k];
   2178             }
   2179             // forming all possible canonical rearrangement by dropping
   2180             // sets of accents
   2181             for (int i = 0; i <= accentsize - 1; i ++) {
   2182                 int32_t mask = 1 << (accentsize - i - 1);
   2183                 if (count & mask) {
   2184                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2185                         *rearrange ++ = accents[j];
   2186                     }
   2187                 }
   2188             }
   2189             *rearrange = 0;
   2190             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2191             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2192                                            strsrch->canonicalPrefixAccents,
   2193                                            strsrch->search->text + start,
   2194                                            offset - start,
   2195                                            strsrch->canonicalSuffixAccents,
   2196                                            status);
   2197 
   2198             // run the collator iterator through this match
   2199             // if status is a failure ucol_setText does nothing
   2200             ucol_setText(coleiter, match, matchsize, status);
   2201             if (U_SUCCESS(*status)) {
   2202                 if (checkCollationMatch(strsrch, coleiter)) {
   2203                     if (match != buffer) {
   2204                         uprv_free(match);
   2205                     }
   2206                     return end;
   2207                 }
   2208             }
   2209             count --;
   2210         }
   2211     }
   2212     return USEARCH_DONE;
   2213 }
   2214 
   2215 /**
   2216 * Take the rearranged start accents and tries matching. If match failed at
   2217 * a seperate following set of accents (seperated from the rearranged on by
   2218 * at least a base character) then we rearrange the preceding accents and
   2219 * tries matching again.
   2220 * We allow skipping of the ends of the accent set if the ces do not match.
   2221 * However if the failure is found before the accent set, it fails.
   2222 * Internal method, status assumed to be success, caller has to check status
   2223 * before calling this method.
   2224 * @param strsrch string search data
   2225 * @param textoffset of the ends of the rearranged accent
   2226 * @param status output error status if any
   2227 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2228 *         offset of the match. Note this start includes all following accents.
   2229 */
   2230 static
   2231 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2232                                            int32_t    textoffset,
   2233                                            UErrorCode    *status)
   2234 {
   2235     const UChar       *text       = strsrch->search->text;
   2236     const UCollator   *collator   = strsrch->collator;
   2237           int32_t      safelength = 0;
   2238           UChar       *safetext;
   2239           int32_t      safetextlength;
   2240           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2241           int32_t  safeoffset = textoffset;
   2242 
   2243     if (textoffset &&
   2244         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2245                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2246                                          ], collator)) {
   2247         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2248                                            strsrch->search->textLength);
   2249         safelength     = safeoffset - textoffset;
   2250         safetextlength = INITIAL_ARRAY_SIZE_;
   2251         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2252                                          strsrch->canonicalPrefixAccents,
   2253                                          text + textoffset, safelength,
   2254                                          NULL, status);
   2255     }
   2256     else {
   2257         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2258         safetext       = strsrch->canonicalPrefixAccents;
   2259     }
   2260 
   2261     UCollationElements *coleiter = strsrch->utilIter;
   2262      // if status is a failure, ucol_setText does nothing
   2263     ucol_setText(coleiter, safetext, safetextlength, status);
   2264     // status checked in loop below
   2265 
   2266     int32_t  *ce           = strsrch->pattern.ces;
   2267     int32_t   celength     = strsrch->pattern.cesLength;
   2268     int       ceindex      = 0;
   2269     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2270     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2271 
   2272     while (ceindex < celength) {
   2273         int32_t textce = ucol_next(coleiter, status);
   2274         if (U_FAILURE(*status)) {
   2275             if (isSafe) {
   2276                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2277             }
   2278             return USEARCH_DONE;
   2279         }
   2280         if (textce == UCOL_NULLORDER) {
   2281             // check if we have passed the safe buffer
   2282             if (coleiter == strsrch->textIter) {
   2283                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2284                 return USEARCH_DONE;
   2285             }
   2286             cleanUpSafeText(strsrch, safetext, safebuffer);
   2287             safetext = safebuffer;
   2288             coleiter = strsrch->textIter;
   2289             setColEIterOffset(coleiter, safeoffset);
   2290             // status checked at the start of the loop
   2291             isSafe = FALSE;
   2292             continue;
   2293         }
   2294         textce = getCE(strsrch, textce);
   2295         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2296             // do the beginning stuff
   2297             int32_t failedoffset = ucol_getOffset(coleiter);
   2298             if (isSafe && failedoffset <= prefixlength) {
   2299                 // alas... no hope. failed at rearranged accent set
   2300                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2301                 return USEARCH_DONE;
   2302             }
   2303             else {
   2304                 if (isSafe) {
   2305                     failedoffset = safeoffset - failedoffset;
   2306                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2307                 }
   2308 
   2309                 // try rearranging the end accents
   2310                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2311                                         textoffset, failedoffset, status);
   2312                 if (result != USEARCH_DONE) {
   2313                     // if status is a failure, ucol_setOffset does nothing
   2314                     setColEIterOffset(strsrch->textIter, result);
   2315                 }
   2316                 if (U_FAILURE(*status)) {
   2317                     return USEARCH_DONE;
   2318                 }
   2319                 return result;
   2320             }
   2321         }
   2322         if (textce == ce[ceindex]) {
   2323             ceindex ++;
   2324         }
   2325     }
   2326     // set offset here
   2327     if (isSafe) {
   2328         int32_t result      = ucol_getOffset(coleiter);
   2329         // sets the text iterator here with the correct expansion and offset
   2330         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2331         cleanUpSafeText(strsrch, safetext, safebuffer);
   2332         if (result <= prefixlength) {
   2333             result = textoffset;
   2334         }
   2335         else {
   2336             result = textoffset + (safeoffset - result);
   2337         }
   2338         setColEIterOffset(strsrch->textIter, result);
   2339         setExpansionSuffix(strsrch->textIter, leftoverces);
   2340         return result;
   2341     }
   2342 
   2343     return ucol_getOffset(coleiter);
   2344 }
   2345 
   2346 /**
   2347 * Trying out the substring and sees if it can be a canonical match.
   2348 * This will try normalizing the starting accents and arranging them into
   2349 * canonical equivalents and check their corresponding ces with the pattern ce.
   2350 * Prefix accents in the text will be grouped according to their combining
   2351 * class and the groups will be mixed and matched to try find the perfect
   2352 * match with the pattern.
   2353 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2354 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2355 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2356 *         "\u0301\u0325".
   2357 * step 2: check if any of the generated substrings matches the pattern.
   2358 * Internal method, status assumed to be success, caller has to check status
   2359 * before calling this method.
   2360 * @param strsrch string search data
   2361 * @param textoffset start offset in the collation element text that starts
   2362 *                   with the accents to be rearranged
   2363 * @param status output error status if any
   2364 * @return TRUE if the match is valid, FALSE otherwise
   2365 */
   2366 static
   2367 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2368                                int32_t    textoffset,
   2369                                UErrorCode    *status)
   2370 {
   2371     const UChar       *text       = strsrch->search->text;
   2372           int32_t  temp       = textoffset;
   2373           int32_t      textlength = strsrch->search->textLength;
   2374     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2375         UCollationElements *coleiter = strsrch->textIter;
   2376         int32_t         offset   = ucol_getOffset(coleiter);
   2377         if (strsrch->pattern.hasSuffixAccents) {
   2378             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2379                                                     offset, status);
   2380             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2381                 setColEIterOffset(coleiter, offset);
   2382                 return TRUE;
   2383             }
   2384         }
   2385         return FALSE;
   2386     }
   2387 
   2388     if (!strsrch->pattern.hasPrefixAccents) {
   2389         return FALSE;
   2390     }
   2391 
   2392     UChar       accents[INITIAL_ARRAY_SIZE_];
   2393     // offset to the last base character in substring to search
   2394     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2395     // normalizing the offensive string
   2396     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2397                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2398     // status checked in loop
   2399 
   2400     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2401     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2402 
   2403     // 2 power n - 1 plus the full set of accents
   2404     int32_t  count = (2 << (size - 1)) - 1;
   2405     while (U_SUCCESS(*status) && count > 0) {
   2406         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2407         // copy the base characters
   2408         for (int k = 0; k < accentsindex[0]; k ++) {
   2409             *rearrange ++ = accents[k];
   2410         }
   2411         // forming all possible canonical rearrangement by dropping
   2412         // sets of accents
   2413         for (int i = 0; i <= size - 1; i ++) {
   2414             int32_t mask = 1 << (size - i - 1);
   2415             if (count & mask) {
   2416                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2417                     *rearrange ++ = accents[j];
   2418                 }
   2419             }
   2420         }
   2421         *rearrange = 0;
   2422         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2423                                                           baseoffset, status);
   2424         if (offset != USEARCH_DONE) {
   2425             return TRUE; // match found
   2426         }
   2427         count --;
   2428     }
   2429     return FALSE;
   2430 }
   2431 
   2432 /**
   2433 * Checks match for contraction.
   2434 * If the match starts with a partial contraction we fail.
   2435 * Internal method, status assumed to be success, caller has to check status
   2436 * before calling this method.
   2437 * @param strsrch string search data
   2438 * @param start offset of potential match, to be modified if necessary
   2439 * @param end offset of potential match, to be modified if necessary
   2440 * @param status only error status if any
   2441 * @return TRUE if match passes the contraction test, FALSE otherwise
   2442 */
   2443 static
   2444 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2445                                      int32_t   *start,
   2446                                      int32_t   *end, UErrorCode  *status)
   2447 {
   2448           UCollationElements *coleiter   = strsrch->textIter;
   2449           int32_t             textlength = strsrch->search->textLength;
   2450           int32_t         temp       = *end;
   2451     const UCollator          *collator   = strsrch->collator;
   2452     const UChar              *text       = strsrch->search->text;
   2453     // This part checks if either if the start of the match contains potential
   2454     // contraction. If so we'll have to iterate through them
   2455     // Since we used ucol_next while previously looking for the potential
   2456     // match, this guarantees that our end will not be a partial contraction,
   2457     // or a partial supplementary character.
   2458     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2459         int32_t expansion  = getExpansionSuffix(coleiter);
   2460         UBool   expandflag = expansion > 0;
   2461         setColEIterOffset(coleiter, *end);
   2462         while (expansion > 0) {
   2463             // getting rid of the redundant ce
   2464             // since forward contraction/expansion may have extra ces
   2465             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2466             // would have taken care of it.
   2467             // E.g. the character \u01FA will have an expansion of 3, but if
   2468             // we are only looking for A ring A\u030A, we'll have to skip the
   2469             // last ce in the expansion buffer
   2470             ucol_previous(coleiter, status);
   2471             if (U_FAILURE(*status)) {
   2472                 return FALSE;
   2473             }
   2474             if (ucol_getOffset(coleiter) != temp) {
   2475                 *end = temp;
   2476                 temp  = ucol_getOffset(coleiter);
   2477             }
   2478             expansion --;
   2479         }
   2480 
   2481         int32_t  *patternce       = strsrch->pattern.ces;
   2482         int32_t   patterncelength = strsrch->pattern.cesLength;
   2483         int32_t   count           = patterncelength;
   2484         while (count > 0) {
   2485             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2486             // status checked below, note that if status is a failure
   2487             // ucol_previous returns UCOL_NULLORDER
   2488             if (ce == UCOL_IGNORABLE) {
   2489                 continue;
   2490             }
   2491             if (expandflag && count == 0 &&
   2492                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2493                 *end = temp;
   2494                 temp  = ucol_getOffset(coleiter);
   2495             }
   2496             if (count == patterncelength &&
   2497                 ce != patternce[patterncelength - 1]) {
   2498                 // accents may have extra starting ces, this occurs when a
   2499                 // pure accent pattern is matched without rearrangement
   2500                 int32_t    expected = patternce[patterncelength - 1];
   2501                 U16_BACK_1(text, 0, *end);
   2502                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2503                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2504                     while (U_SUCCESS(*status) && ce != expected &&
   2505                            ce != UCOL_NULLORDER &&
   2506                            ucol_getOffset(coleiter) <= *start) {
   2507                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2508                     }
   2509                 }
   2510             }
   2511             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2512                 (*start) --;
   2513                 *start = getPreviousBaseOffset(text, *start);
   2514                 return FALSE;
   2515             }
   2516             count --;
   2517         }
   2518     }
   2519     return TRUE;
   2520 }
   2521 
   2522 /**
   2523 * Checks and sets the match information if found.
   2524 * Checks
   2525 * <ul>
   2526 * <li> the potential match does not repeat the previous match
   2527 * <li> boundaries are correct
   2528 * <li> potential match does not end in the middle of a contraction
   2529 * <li> identical matches
   2530 * <\ul>
   2531 * Otherwise the offset will be shifted to the next character.
   2532 * Internal method, status assumed to be success, caller has to check status
   2533 * before calling this method.
   2534 * @param strsrch string search data
   2535 * @param textoffset offset in the collation element text. the returned value
   2536 *        will be the truncated start offset of the match or the new start
   2537 *        search offset.
   2538 * @param status only error status if any
   2539 * @return TRUE if the match is valid, FALSE otherwise
   2540 */
   2541 static
   2542 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2543                                          int32_t   *textoffset,
   2544                                          UErrorCode    *status)
   2545 {
   2546     // to ensure that the start and ends are not composite characters
   2547     UCollationElements *coleiter = strsrch->textIter;
   2548     // if we have a canonical accent match
   2549     if ((strsrch->pattern.hasSuffixAccents &&
   2550         strsrch->canonicalSuffixAccents[0]) ||
   2551         (strsrch->pattern.hasPrefixAccents &&
   2552         strsrch->canonicalPrefixAccents[0])) {
   2553         strsrch->search->matchedIndex  = *textoffset;
   2554         strsrch->search->matchedLength =
   2555             getNextUStringSearchBaseOffset(strsrch,
   2556                                       getColElemIterOffset(coleiter, FALSE))
   2557             - *textoffset;
   2558         return TRUE;
   2559     }
   2560 
   2561     int32_t end = ucol_getOffset(coleiter);
   2562     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2563                                                 status) ||
   2564          U_FAILURE(*status)) {
   2565         return FALSE;
   2566     }
   2567 
   2568     end = getNextUStringSearchBaseOffset(strsrch, end);
   2569     // this totally matches, however we need to check if it is repeating
   2570     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2571         !isBreakUnit(strsrch, *textoffset, end) ||
   2572         !checkIdentical(strsrch, *textoffset, end)) {
   2573         (*textoffset) --;
   2574         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2575                                             *textoffset);
   2576         return FALSE;
   2577     }
   2578 
   2579     strsrch->search->matchedIndex  = *textoffset;
   2580     strsrch->search->matchedLength = end - *textoffset;
   2581     return TRUE;
   2582 }
   2583 #endif // #if BOYER_MOORE
   2584 
   2585 // constructors and destructor -------------------------------------------
   2586 
   2587 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2588                                           int32_t         patternlength,
   2589                                     const UChar          *text,
   2590                                           int32_t         textlength,
   2591                                     const char           *locale,
   2592                                           UBreakIterator *breakiter,
   2593                                           UErrorCode     *status)
   2594 {
   2595     if (U_FAILURE(*status)) {
   2596         return NULL;
   2597     }
   2598 #if UCONFIG_NO_BREAK_ITERATION
   2599     if (breakiter != NULL) {
   2600         *status = U_UNSUPPORTED_ERROR;
   2601         return NULL;
   2602     }
   2603 #endif
   2604     if (locale) {
   2605         // ucol_open internally checks for status
   2606         UCollator     *collator = ucol_open(locale, status);
   2607         // pattern, text checks are done in usearch_openFromCollator
   2608         UStringSearch *result   = usearch_openFromCollator(pattern,
   2609                                               patternlength, text, textlength,
   2610                                               collator, breakiter, status);
   2611 
   2612         if (result == NULL || U_FAILURE(*status)) {
   2613             if (collator) {
   2614                 ucol_close(collator);
   2615             }
   2616             return NULL;
   2617         }
   2618         else {
   2619             result->ownCollator = TRUE;
   2620         }
   2621         return result;
   2622     }
   2623     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2624     return NULL;
   2625 }
   2626 
   2627 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2628                                   const UChar          *pattern,
   2629                                         int32_t         patternlength,
   2630                                   const UChar          *text,
   2631                                         int32_t         textlength,
   2632                                   const UCollator      *collator,
   2633                                         UBreakIterator *breakiter,
   2634                                         UErrorCode     *status)
   2635 {
   2636     if (U_FAILURE(*status)) {
   2637         return NULL;
   2638     }
   2639 #if UCONFIG_NO_BREAK_ITERATION
   2640     if (breakiter != NULL) {
   2641         *status = U_UNSUPPORTED_ERROR;
   2642         return NULL;
   2643     }
   2644 #endif
   2645     if (pattern == NULL || text == NULL || collator == NULL) {
   2646         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2647         return NULL;
   2648     }
   2649 
   2650     // string search does not really work when numeric collation is turned on
   2651     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2652         *status = U_UNSUPPORTED_ERROR;
   2653         return NULL;
   2654     }
   2655 
   2656     if (U_SUCCESS(*status)) {
   2657         initializeFCD(status);
   2658         if (U_FAILURE(*status)) {
   2659             return NULL;
   2660         }
   2661 
   2662         UStringSearch *result;
   2663         if (textlength == -1) {
   2664             textlength = u_strlen(text);
   2665         }
   2666         if (patternlength == -1) {
   2667             patternlength = u_strlen(pattern);
   2668         }
   2669         if (textlength <= 0 || patternlength <= 0) {
   2670             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2671             return NULL;
   2672         }
   2673 
   2674         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2675         if (result == NULL) {
   2676             *status = U_MEMORY_ALLOCATION_ERROR;
   2677             return NULL;
   2678         }
   2679 
   2680         result->collator    = collator;
   2681         result->strength    = ucol_getStrength(collator);
   2682         result->ceMask      = getMask(result->strength);
   2683         result->toShift     =
   2684              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2685                                                             UCOL_SHIFTED;
   2686         result->variableTop = ucol_getVariableTop(collator, status);
   2687 
   2688         result->nfd         = Normalizer2::getNFDInstance(*status);
   2689 
   2690         if (U_FAILURE(*status)) {
   2691             uprv_free(result);
   2692             return NULL;
   2693         }
   2694 
   2695         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2696         if (result->search == NULL) {
   2697             *status = U_MEMORY_ALLOCATION_ERROR;
   2698             uprv_free(result);
   2699             return NULL;
   2700         }
   2701 
   2702         result->search->text       = text;
   2703         result->search->textLength = textlength;
   2704 
   2705         result->pattern.text       = pattern;
   2706         result->pattern.textLength = patternlength;
   2707         result->pattern.ces         = NULL;
   2708         result->pattern.pces        = NULL;
   2709 
   2710         result->search->breakIter  = breakiter;
   2711 #if !UCONFIG_NO_BREAK_ITERATION
   2712         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2713         if (breakiter) {
   2714             ubrk_setText(breakiter, text, textlength, status);
   2715         }
   2716 #endif
   2717 
   2718         result->ownCollator           = FALSE;
   2719         result->search->matchedLength = 0;
   2720         result->search->matchedIndex  = USEARCH_DONE;
   2721         result->utilIter              = NULL;
   2722         result->textIter              = ucol_openElements(collator, text,
   2723                                                           textlength, status);
   2724         result->textProcessedIter     = NULL;
   2725         if (U_FAILURE(*status)) {
   2726             usearch_close(result);
   2727             return NULL;
   2728         }
   2729 
   2730         result->search->isOverlap          = FALSE;
   2731         result->search->isCanonicalMatch   = FALSE;
   2732         result->search->elementComparisonType = 0;
   2733         result->search->isForwardSearching = TRUE;
   2734         result->search->reset              = TRUE;
   2735 
   2736         initialize(result, status);
   2737 
   2738         if (U_FAILURE(*status)) {
   2739             usearch_close(result);
   2740             return NULL;
   2741         }
   2742 
   2743         return result;
   2744     }
   2745     return NULL;
   2746 }
   2747 
   2748 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2749 {
   2750     if (strsrch) {
   2751         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
   2752             strsrch->pattern.ces) {
   2753             uprv_free(strsrch->pattern.ces);
   2754         }
   2755 
   2756         if (strsrch->pattern.pces != NULL &&
   2757             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
   2758             uprv_free(strsrch->pattern.pces);
   2759         }
   2760 
   2761         delete strsrch->textProcessedIter;
   2762         ucol_closeElements(strsrch->textIter);
   2763         ucol_closeElements(strsrch->utilIter);
   2764 
   2765         if (strsrch->ownCollator && strsrch->collator) {
   2766             ucol_close((UCollator *)strsrch->collator);
   2767         }
   2768 
   2769 #if !UCONFIG_NO_BREAK_ITERATION
   2770         if (strsrch->search->internalBreakIter) {
   2771             ubrk_close(strsrch->search->internalBreakIter);
   2772         }
   2773 #endif
   2774 
   2775         uprv_free(strsrch->search);
   2776         uprv_free(strsrch);
   2777     }
   2778 }
   2779 
   2780 namespace {
   2781 
   2782 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
   2783     if (U_FAILURE(*status)) { return FALSE; }
   2784     if (strsrch->textProcessedIter == NULL) {
   2785         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
   2786         if (strsrch->textProcessedIter == NULL) {
   2787             *status = U_MEMORY_ALLOCATION_ERROR;
   2788             return FALSE;
   2789         }
   2790     } else {
   2791         strsrch->textProcessedIter->init(strsrch->textIter);
   2792     }
   2793     return TRUE;
   2794 }
   2795 
   2796 }
   2797 
   2798 // set and get methods --------------------------------------------------
   2799 
   2800 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2801                                         int32_t    position,
   2802                                         UErrorCode    *status)
   2803 {
   2804     if (U_SUCCESS(*status) && strsrch) {
   2805         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2806             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2807         }
   2808         else {
   2809             setColEIterOffset(strsrch->textIter, position);
   2810         }
   2811         strsrch->search->matchedIndex  = USEARCH_DONE;
   2812         strsrch->search->matchedLength = 0;
   2813         strsrch->search->reset         = FALSE;
   2814     }
   2815 }
   2816 
   2817 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2818 {
   2819     if (strsrch) {
   2820         int32_t result = ucol_getOffset(strsrch->textIter);
   2821         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2822             return USEARCH_DONE;
   2823         }
   2824         return result;
   2825     }
   2826     return USEARCH_DONE;
   2827 }
   2828 
   2829 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2830                                  USearchAttribute attribute,
   2831                                  USearchAttributeValue value,
   2832                                  UErrorCode *status)
   2833 {
   2834     if (U_SUCCESS(*status) && strsrch) {
   2835         switch (attribute)
   2836         {
   2837         case USEARCH_OVERLAP :
   2838             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2839             break;
   2840         case USEARCH_CANONICAL_MATCH :
   2841             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2842                                                                       FALSE);
   2843             break;
   2844         case USEARCH_ELEMENT_COMPARISON :
   2845             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2846                 strsrch->search->elementComparisonType = (int16_t)value;
   2847             } else {
   2848                 strsrch->search->elementComparisonType = 0;
   2849             }
   2850             break;
   2851         case USEARCH_ATTRIBUTE_COUNT :
   2852         default:
   2853             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2854         }
   2855     }
   2856     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2857         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2858     }
   2859 }
   2860 
   2861 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2862                                                 const UStringSearch *strsrch,
   2863                                                 USearchAttribute attribute)
   2864 {
   2865     if (strsrch) {
   2866         switch (attribute) {
   2867         case USEARCH_OVERLAP :
   2868             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2869                                                         USEARCH_OFF);
   2870         case USEARCH_CANONICAL_MATCH :
   2871             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2872                                                                USEARCH_OFF);
   2873         case USEARCH_ELEMENT_COMPARISON :
   2874             {
   2875                 int16_t value = strsrch->search->elementComparisonType;
   2876                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2877                     return (USearchAttributeValue)value;
   2878                 } else {
   2879                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
   2880                 }
   2881             }
   2882         case USEARCH_ATTRIBUTE_COUNT :
   2883             return USEARCH_DEFAULT;
   2884         }
   2885     }
   2886     return USEARCH_DEFAULT;
   2887 }
   2888 
   2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2890                                                 const UStringSearch *strsrch)
   2891 {
   2892     if (strsrch == NULL) {
   2893         return USEARCH_DONE;
   2894     }
   2895     return strsrch->search->matchedIndex;
   2896 }
   2897 
   2898 
   2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2900                                             UChar         *result,
   2901                                             int32_t        resultCapacity,
   2902                                             UErrorCode    *status)
   2903 {
   2904     if (U_FAILURE(*status)) {
   2905         return USEARCH_DONE;
   2906     }
   2907     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2908         result == NULL)) {
   2909         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2910         return USEARCH_DONE;
   2911     }
   2912 
   2913     int32_t     copylength = strsrch->search->matchedLength;
   2914     int32_t copyindex  = strsrch->search->matchedIndex;
   2915     if (copyindex == USEARCH_DONE) {
   2916         u_terminateUChars(result, resultCapacity, 0, status);
   2917         return USEARCH_DONE;
   2918     }
   2919 
   2920     if (resultCapacity < copylength) {
   2921         copylength = resultCapacity;
   2922     }
   2923     if (copylength > 0) {
   2924         uprv_memcpy(result, strsrch->search->text + copyindex,
   2925                     copylength * sizeof(UChar));
   2926     }
   2927     return u_terminateUChars(result, resultCapacity,
   2928                              strsrch->search->matchedLength, status);
   2929 }
   2930 
   2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2932                                               const UStringSearch *strsrch)
   2933 {
   2934     if (strsrch) {
   2935         return strsrch->search->matchedLength;
   2936     }
   2937     return USEARCH_DONE;
   2938 }
   2939 
   2940 #if !UCONFIG_NO_BREAK_ITERATION
   2941 
   2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2943                                                UBreakIterator *breakiter,
   2944                                                UErrorCode     *status)
   2945 {
   2946     if (U_SUCCESS(*status) && strsrch) {
   2947         strsrch->search->breakIter = breakiter;
   2948         if (breakiter) {
   2949             ubrk_setText(breakiter, strsrch->search->text,
   2950                          strsrch->search->textLength, status);
   2951         }
   2952     }
   2953 }
   2954 
   2955 U_CAPI const UBreakIterator* U_EXPORT2
   2956 usearch_getBreakIterator(const UStringSearch *strsrch)
   2957 {
   2958     if (strsrch) {
   2959         return strsrch->search->breakIter;
   2960     }
   2961     return NULL;
   2962 }
   2963 
   2964 #endif
   2965 
   2966 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2967                                       const UChar         *text,
   2968                                             int32_t        textlength,
   2969                                             UErrorCode    *status)
   2970 {
   2971     if (U_SUCCESS(*status)) {
   2972         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2973             textlength == 0) {
   2974             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2975         }
   2976         else {
   2977             if (textlength == -1) {
   2978                 textlength = u_strlen(text);
   2979             }
   2980             strsrch->search->text       = text;
   2981             strsrch->search->textLength = textlength;
   2982             ucol_setText(strsrch->textIter, text, textlength, status);
   2983             strsrch->search->matchedIndex  = USEARCH_DONE;
   2984             strsrch->search->matchedLength = 0;
   2985             strsrch->search->reset         = TRUE;
   2986 #if !UCONFIG_NO_BREAK_ITERATION
   2987             if (strsrch->search->breakIter != NULL) {
   2988                 ubrk_setText(strsrch->search->breakIter, text,
   2989                              textlength, status);
   2990             }
   2991             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2992 #endif
   2993         }
   2994     }
   2995 }
   2996 
   2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   2998                                                      int32_t       *length)
   2999 {
   3000     if (strsrch) {
   3001         *length = strsrch->search->textLength;
   3002         return strsrch->search->text;
   3003     }
   3004     return NULL;
   3005 }
   3006 
   3007 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3008                                           const UCollator     *collator,
   3009                                                 UErrorCode    *status)
   3010 {
   3011     if (U_SUCCESS(*status)) {
   3012         if (collator == NULL) {
   3013             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3014             return;
   3015         }
   3016 
   3017         if (strsrch) {
   3018             delete strsrch->textProcessedIter;
   3019             strsrch->textProcessedIter = NULL;
   3020             ucol_closeElements(strsrch->textIter);
   3021             ucol_closeElements(strsrch->utilIter);
   3022             strsrch->textIter = strsrch->utilIter = NULL;
   3023             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3024                 ucol_close((UCollator *)strsrch->collator);
   3025                 strsrch->ownCollator = FALSE;
   3026             }
   3027             strsrch->collator    = collator;
   3028             strsrch->strength    = ucol_getStrength(collator);
   3029             strsrch->ceMask      = getMask(strsrch->strength);
   3030 #if !UCONFIG_NO_BREAK_ITERATION
   3031             ubrk_close(strsrch->search->internalBreakIter);
   3032             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3033                                                      strsrch->search->text, strsrch->search->textLength, status);
   3034 #endif
   3035             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3036             strsrch->toShift     =
   3037                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3038                                                                 UCOL_SHIFTED;
   3039             // if status is a failure, ucol_getVariableTop returns 0
   3040             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3041             strsrch->textIter = ucol_openElements(collator,
   3042                                       strsrch->search->text,
   3043                                       strsrch->search->textLength,
   3044                                       status);
   3045             strsrch->utilIter = ucol_openElements(
   3046                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
   3047             // initialize() _after_ setting the iterators for the new collator.
   3048             initialize(strsrch, status);
   3049         }
   3050 
   3051         // **** are these calls needed?
   3052         // **** we call uprv_init_pce in initializePatternPCETable
   3053         // **** and the CEIBuffer constructor...
   3054 #if 0
   3055         uprv_init_pce(strsrch->textIter);
   3056         uprv_init_pce(strsrch->utilIter);
   3057 #endif
   3058     }
   3059 }
   3060 
   3061 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3062 {
   3063     if (strsrch) {
   3064         return (UCollator *)strsrch->collator;
   3065     }
   3066     return NULL;
   3067 }
   3068 
   3069 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3070                                          const UChar         *pattern,
   3071                                                int32_t        patternlength,
   3072                                                UErrorCode    *status)
   3073 {
   3074     if (U_SUCCESS(*status)) {
   3075         if (strsrch == NULL || pattern == NULL) {
   3076             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3077         }
   3078         else {
   3079             if (patternlength == -1) {
   3080                 patternlength = u_strlen(pattern);
   3081             }
   3082             if (patternlength == 0) {
   3083                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3084                 return;
   3085             }
   3086             strsrch->pattern.text       = pattern;
   3087             strsrch->pattern.textLength = patternlength;
   3088             initialize(strsrch, status);
   3089         }
   3090     }
   3091 }
   3092 
   3093 U_CAPI const UChar* U_EXPORT2
   3094 usearch_getPattern(const UStringSearch *strsrch,
   3095                    int32_t       *length)
   3096 {
   3097     if (strsrch) {
   3098         *length = strsrch->pattern.textLength;
   3099         return strsrch->pattern.text;
   3100     }
   3101     return NULL;
   3102 }
   3103 
   3104 // miscellanous methods --------------------------------------------------
   3105 
   3106 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3107                                            UErrorCode    *status)
   3108 {
   3109     if (strsrch && U_SUCCESS(*status)) {
   3110         strsrch->search->isForwardSearching = TRUE;
   3111         usearch_setOffset(strsrch, 0, status);
   3112         if (U_SUCCESS(*status)) {
   3113             return usearch_next(strsrch, status);
   3114         }
   3115     }
   3116     return USEARCH_DONE;
   3117 }
   3118 
   3119 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3120                                                int32_t    position,
   3121                                                UErrorCode    *status)
   3122 {
   3123     if (strsrch && U_SUCCESS(*status)) {
   3124         strsrch->search->isForwardSearching = TRUE;
   3125         // position checked in usearch_setOffset
   3126         usearch_setOffset(strsrch, position, status);
   3127         if (U_SUCCESS(*status)) {
   3128             return usearch_next(strsrch, status);
   3129         }
   3130     }
   3131     return USEARCH_DONE;
   3132 }
   3133 
   3134 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3135                                           UErrorCode    *status)
   3136 {
   3137     if (strsrch && U_SUCCESS(*status)) {
   3138         strsrch->search->isForwardSearching = FALSE;
   3139         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3140         if (U_SUCCESS(*status)) {
   3141             return usearch_previous(strsrch, status);
   3142         }
   3143     }
   3144     return USEARCH_DONE;
   3145 }
   3146 
   3147 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3148                                                int32_t    position,
   3149                                                UErrorCode    *status)
   3150 {
   3151     if (strsrch && U_SUCCESS(*status)) {
   3152         strsrch->search->isForwardSearching = FALSE;
   3153         // position checked in usearch_setOffset
   3154         usearch_setOffset(strsrch, position, status);
   3155         if (U_SUCCESS(*status)) {
   3156             return usearch_previous(strsrch, status);
   3157         }
   3158     }
   3159     return USEARCH_DONE;
   3160 }
   3161 
   3162 /**
   3163 * If a direction switch is required, we'll count the number of ces till the
   3164 * beginning of the collation element iterator and iterate forwards that
   3165 * number of times. This is so that we get to the correct point within the
   3166 * string to continue the search in. Imagine when we are in the middle of the
   3167 * normalization buffer when the change in direction is request. arrrgghh....
   3168 * After searching the offset within the collation element iterator will be
   3169 * shifted to the start of the match. If a match is not found, the offset would
   3170 * have been set to the end of the text string in the collation element
   3171 * iterator.
   3172 * Okay, here's my take on normalization buffer. The only time when there can
   3173 * be 2 matches within the same normalization is when the pattern is consists
   3174 * of all accents. But since the offset returned is from the text string, we
   3175 * should not confuse the caller by returning the second match within the
   3176 * same normalization buffer. If we do, the 2 results will have the same match
   3177 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3178 * fall within the same normalization buffer. Note this does not affect the
   3179 * results of matches spanning the text and the normalization buffer.
   3180 * The position to start searching is taken from the collation element
   3181 * iterator. Callers of this API would have to set the offset in the collation
   3182 * element iterator before using this method.
   3183 */
   3184 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3185                                           UErrorCode    *status)
   3186 {
   3187     if (U_SUCCESS(*status) && strsrch) {
   3188         // note offset is either equivalent to the start of the previous match
   3189         // or is set by the user
   3190         int32_t      offset       = usearch_getOffset(strsrch);
   3191         USearch     *search       = strsrch->search;
   3192         search->reset             = FALSE;
   3193         int32_t      textlength   = search->textLength;
   3194         if (search->isForwardSearching) {
   3195 #if BOYER_MOORE
   3196             if (offset == textlength
   3197                 || (!search->isOverlap &&
   3198                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3199                     (search->matchedIndex != USEARCH_DONE &&
   3200                      offset + search->matchedLength >= textlength)))) {
   3201                 // not enough characters to match
   3202                 setMatchNotFound(strsrch);
   3203                 return USEARCH_DONE;
   3204             }
   3205 #else
   3206             if (offset == textlength ||
   3207                 (! search->isOverlap &&
   3208                 (search->matchedIndex != USEARCH_DONE &&
   3209                 offset + search->matchedLength > textlength))) {
   3210                     // not enough characters to match
   3211                     setMatchNotFound(strsrch);
   3212                     return USEARCH_DONE;
   3213             }
   3214 #endif
   3215         }
   3216         else {
   3217             // switching direction.
   3218             // if matchedIndex == USEARCH_DONE, it means that either a
   3219             // setOffset has been called or that previous ran off the text
   3220             // string. the iterator would have been set to offset 0 if a
   3221             // match is not found.
   3222             search->isForwardSearching = TRUE;
   3223             if (search->matchedIndex != USEARCH_DONE) {
   3224                 // there's no need to set the collation element iterator
   3225                 // the next call to next will set the offset.
   3226                 return search->matchedIndex;
   3227             }
   3228         }
   3229 
   3230         if (U_SUCCESS(*status)) {
   3231             if (strsrch->pattern.cesLength == 0) {
   3232                 if (search->matchedIndex == USEARCH_DONE) {
   3233                     search->matchedIndex = offset;
   3234                 }
   3235                 else { // moves by codepoints
   3236                     U16_FWD_1(search->text, search->matchedIndex, textlength);
   3237                 }
   3238 
   3239                 search->matchedLength = 0;
   3240                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3241                 // status checked below
   3242                 if (search->matchedIndex == textlength) {
   3243                     search->matchedIndex = USEARCH_DONE;
   3244                 }
   3245             }
   3246             else {
   3247                 if (search->matchedLength > 0) {
   3248                     // if matchlength is 0 we are at the start of the iteration
   3249                     if (search->isOverlap) {
   3250                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3251                     }
   3252                     else {
   3253                         ucol_setOffset(strsrch->textIter,
   3254                                        offset + search->matchedLength, status);
   3255                     }
   3256                 }
   3257                 else {
   3258                     // for boundary check purposes. this will ensure that the
   3259                     // next match will not preceed the current offset
   3260                     // note search->matchedIndex will always be set to something
   3261                     // in the code
   3262                     search->matchedIndex = offset - 1;
   3263                 }
   3264 
   3265                 if (search->isCanonicalMatch) {
   3266                     // can't use exact here since extra accents are allowed.
   3267                     usearch_handleNextCanonical(strsrch, status);
   3268                 }
   3269                 else {
   3270                     usearch_handleNextExact(strsrch, status);
   3271                 }
   3272             }
   3273 
   3274             if (U_FAILURE(*status)) {
   3275                 return USEARCH_DONE;
   3276             }
   3277 
   3278 #if !BOYER_MOORE
   3279             if (search->matchedIndex == USEARCH_DONE) {
   3280                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3281             } else {
   3282                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3283             }
   3284 #endif
   3285 
   3286             return search->matchedIndex;
   3287         }
   3288     }
   3289     return USEARCH_DONE;
   3290 }
   3291 
   3292 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3293                                               UErrorCode *status)
   3294 {
   3295     if (U_SUCCESS(*status) && strsrch) {
   3296         int32_t offset;
   3297         USearch *search = strsrch->search;
   3298         if (search->reset) {
   3299             offset                     = search->textLength;
   3300             search->isForwardSearching = FALSE;
   3301             search->reset              = FALSE;
   3302             setColEIterOffset(strsrch->textIter, offset);
   3303         }
   3304         else {
   3305             offset = usearch_getOffset(strsrch);
   3306         }
   3307 
   3308         int32_t matchedindex = search->matchedIndex;
   3309         if (search->isForwardSearching == TRUE) {
   3310             // switching direction.
   3311             // if matchedIndex == USEARCH_DONE, it means that either a
   3312             // setOffset has been called or that next ran off the text
   3313             // string. the iterator would have been set to offset textLength if
   3314             // a match is not found.
   3315             search->isForwardSearching = FALSE;
   3316             if (matchedindex != USEARCH_DONE) {
   3317                 return matchedindex;
   3318             }
   3319         }
   3320         else {
   3321 #if BOYER_MOORE
   3322             if (offset == 0 || matchedindex == 0 ||
   3323                 (!search->isOverlap &&
   3324                     (offset < strsrch->pattern.defaultShiftSize ||
   3325                     (matchedindex != USEARCH_DONE &&
   3326                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3327                 // not enough characters to match
   3328                 setMatchNotFound(strsrch);
   3329                 return USEARCH_DONE;
   3330             }
   3331 #else
   3332             // Could check pattern length, but the
   3333             // linear search will do the right thing
   3334             if (offset == 0 || matchedindex == 0) {
   3335                 setMatchNotFound(strsrch);
   3336                 return USEARCH_DONE;
   3337             }
   3338 #endif
   3339         }
   3340 
   3341         if (U_SUCCESS(*status)) {
   3342             if (strsrch->pattern.cesLength == 0) {
   3343                 search->matchedIndex =
   3344                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3345                 if (search->matchedIndex == 0) {
   3346                     setMatchNotFound(strsrch);
   3347                     // status checked below
   3348                 }
   3349                 else { // move by codepoints
   3350                     U16_BACK_1(search->text, 0, search->matchedIndex);
   3351                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3352                     // status checked below
   3353                     search->matchedLength = 0;
   3354                 }
   3355             }
   3356             else {
   3357                 if (strsrch->search->isCanonicalMatch) {
   3358                     // can't use exact here since extra accents are allowed.
   3359                     usearch_handlePreviousCanonical(strsrch, status);
   3360                     // status checked below
   3361                 }
   3362                 else {
   3363                     usearch_handlePreviousExact(strsrch, status);
   3364                     // status checked below
   3365                 }
   3366             }
   3367 
   3368             if (U_FAILURE(*status)) {
   3369                 return USEARCH_DONE;
   3370             }
   3371 
   3372             return search->matchedIndex;
   3373         }
   3374     }
   3375     return USEARCH_DONE;
   3376 }
   3377 
   3378 
   3379 
   3380 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3381 {
   3382     /*
   3383     reset is setting the attributes that are already in
   3384     string search, hence all attributes in the collator should
   3385     be retrieved without any problems
   3386     */
   3387     if (strsrch) {
   3388         UErrorCode status            = U_ZERO_ERROR;
   3389         UBool      sameCollAttribute = TRUE;
   3390         uint32_t   ceMask;
   3391         UBool      shift;
   3392         uint32_t   varTop;
   3393 
   3394         // **** hack to deal w/ how processed CEs encode quaternary ****
   3395         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3396         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3397             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3398                 sameCollAttribute = FALSE;
   3399         }
   3400 
   3401         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3402         ceMask = getMask(strsrch->strength);
   3403         if (strsrch->ceMask != ceMask) {
   3404             strsrch->ceMask = ceMask;
   3405             sameCollAttribute = FALSE;
   3406         }
   3407 
   3408         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3409         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3410                                   &status) == UCOL_SHIFTED;
   3411         if (strsrch->toShift != shift) {
   3412             strsrch->toShift  = shift;
   3413             sameCollAttribute = FALSE;
   3414         }
   3415 
   3416         // if status is a failure, ucol_getVariableTop returns 0
   3417         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3418         if (strsrch->variableTop != varTop) {
   3419             strsrch->variableTop = varTop;
   3420             sameCollAttribute    = FALSE;
   3421         }
   3422         if (!sameCollAttribute) {
   3423             initialize(strsrch, &status);
   3424         }
   3425         ucol_setText(strsrch->textIter, strsrch->search->text,
   3426                               strsrch->search->textLength,
   3427                               &status);
   3428         strsrch->search->matchedLength      = 0;
   3429         strsrch->search->matchedIndex       = USEARCH_DONE;
   3430         strsrch->search->isOverlap          = FALSE;
   3431         strsrch->search->isCanonicalMatch   = FALSE;
   3432         strsrch->search->elementComparisonType = 0;
   3433         strsrch->search->isForwardSearching = TRUE;
   3434         strsrch->search->reset              = TRUE;
   3435     }
   3436 }
   3437 
   3438 //
   3439 //  CEI  Collation Element + source text index.
   3440 //       These structs are kept in the circular buffer.
   3441 //
   3442 struct  CEI {
   3443     int64_t ce;
   3444     int32_t lowIndex;
   3445     int32_t highIndex;
   3446 };
   3447 
   3448 U_NAMESPACE_BEGIN
   3449 
   3450 namespace {
   3451 //
   3452 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
   3453 //
   3454 #define   DEFAULT_CEBUFFER_SIZE 96
   3455 #define   CEBUFFER_EXTRA 32
   3456 // Some typical max values to make buffer size more reasonable for asymmetric search.
   3457 // #8694 is for a better long-term solution to allocation of this buffer.
   3458 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
   3459 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
   3460 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
   3461 struct CEIBuffer {
   3462     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3463     CEI                 *buf;
   3464     int32_t              bufSize;
   3465     int32_t              firstIx;
   3466     int32_t              limitIx;
   3467     UCollationElements  *ceIter;
   3468     UStringSearch       *strSearch;
   3469 
   3470 
   3471 
   3472                CEIBuffer(UStringSearch *ss, UErrorCode *status);
   3473                ~CEIBuffer();
   3474    const CEI   *get(int32_t index);
   3475    const CEI   *getPrevious(int32_t index);
   3476 };
   3477 
   3478 
   3479 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
   3480     buf = defBuf;
   3481     strSearch = ss;
   3482     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
   3483     if (ss->search->elementComparisonType != 0) {
   3484         const UChar * patText = ss->pattern.text;
   3485         if (patText) {
   3486             const UChar * patTextLimit = patText + ss->pattern.textLength;
   3487             while ( patText < patTextLimit ) {
   3488                 UChar c = *patText++;
   3489                 if (MIGHT_BE_JAMO_L(c)) {
   3490                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
   3491                 } else {
   3492                     // No check for surrogates, we might allocate slightly more buffer than necessary.
   3493                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
   3494                 }
   3495             }
   3496         }
   3497     }
   3498     ceIter    = ss->textIter;
   3499     firstIx = 0;
   3500     limitIx = 0;
   3501 
   3502     if (!initTextProcessedIter(ss, status)) { return; }
   3503 
   3504     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3505         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3506         if (buf == NULL) {
   3507             *status = U_MEMORY_ALLOCATION_ERROR;
   3508         }
   3509     }
   3510 }
   3511 
   3512 // TODO: add a reset or init function so that allocated
   3513 //       buffers can be retained & reused.
   3514 
   3515 CEIBuffer::~CEIBuffer() {
   3516     if (buf != defBuf) {
   3517         uprv_free(buf);
   3518     }
   3519 }
   3520 
   3521 
   3522 // Get the CE with the specified index.
   3523 //   Index must be in the range
   3524 //          n-history_size < index < n+1
   3525 //   where n is the largest index to have been fetched by some previous call to this function.
   3526 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3527 //
   3528 const CEI *CEIBuffer::get(int32_t index) {
   3529     int i = index % bufSize;
   3530 
   3531     if (index>=firstIx && index<limitIx) {
   3532         // The request was for an entry already in our buffer.
   3533         //  Just return it.
   3534         return &buf[i];
   3535     }
   3536 
   3537     // Caller is requesting a new, never accessed before, CE.
   3538     //   Verify that it is the next one in sequence, which is all
   3539     //   that is allowed.
   3540     if (index != limitIx) {
   3541         U_ASSERT(FALSE);
   3542 
   3543         return NULL;
   3544     }
   3545 
   3546     // Manage the circular CE buffer indexing
   3547     limitIx++;
   3548 
   3549     if (limitIx - firstIx >= bufSize) {
   3550         // The buffer is full, knock out the lowest-indexed entry.
   3551         firstIx++;
   3552     }
   3553 
   3554     UErrorCode status = U_ZERO_ERROR;
   3555 
   3556     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3557 
   3558     return &buf[i];
   3559 }
   3560 
   3561 // Get the CE with the specified index.
   3562 //   Index must be in the range
   3563 //          n-history_size < index < n+1
   3564 //   where n is the largest index to have been fetched by some previous call to this function.
   3565 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3566 //
   3567 const CEI *CEIBuffer::getPrevious(int32_t index) {
   3568     int i = index % bufSize;
   3569 
   3570     if (index>=firstIx && index<limitIx) {
   3571         // The request was for an entry already in our buffer.
   3572         //  Just return it.
   3573         return &buf[i];
   3574     }
   3575 
   3576     // Caller is requesting a new, never accessed before, CE.
   3577     //   Verify that it is the next one in sequence, which is all
   3578     //   that is allowed.
   3579     if (index != limitIx) {
   3580         U_ASSERT(FALSE);
   3581 
   3582         return NULL;
   3583     }
   3584 
   3585     // Manage the circular CE buffer indexing
   3586     limitIx++;
   3587 
   3588     if (limitIx - firstIx >= bufSize) {
   3589         // The buffer is full, knock out the lowest-indexed entry.
   3590         firstIx++;
   3591     }
   3592 
   3593     UErrorCode status = U_ZERO_ERROR;
   3594 
   3595     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3596 
   3597     return &buf[i];
   3598 }
   3599 
   3600 }
   3601 
   3602 U_NAMESPACE_END
   3603 
   3604 
   3605 // #define USEARCH_DEBUG
   3606 
   3607 #ifdef USEARCH_DEBUG
   3608 #include <stdio.h>
   3609 #include <stdlib.h>
   3610 #endif
   3611 
   3612 /*
   3613  * Find the next break boundary after startIndex. If the UStringSearch object
   3614  * has an external break iterator, use that. Otherwise use the internal character
   3615  * break iterator.
   3616  */
   3617 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3618 #if 0
   3619     const UChar *text = strsrch->search->text;
   3620     int32_t textLen   = strsrch->search->textLength;
   3621 
   3622     U_ASSERT(startIndex>=0);
   3623     U_ASSERT(startIndex<=textLen);
   3624 
   3625     if (startIndex >= textLen) {
   3626         return startIndex;
   3627     }
   3628 
   3629     UChar32  c;
   3630     int32_t  i = startIndex;
   3631     U16_NEXT(text, i, textLen, c);
   3632 
   3633     // If we are on a control character, stop without looking for combining marks.
   3634     //    Control characters do not combine.
   3635     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3636     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3637         return i;
   3638     }
   3639 
   3640     // The initial character was not a control, and can thus accept trailing
   3641     //   combining characters.  Advance over however many of them there are.
   3642     int32_t  indexOfLastCharChecked;
   3643     for (;;) {
   3644         indexOfLastCharChecked = i;
   3645         if (i>=textLen) {
   3646             break;
   3647         }
   3648         U16_NEXT(text, i, textLen, c);
   3649         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3650         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3651             break;
   3652         }
   3653     }
   3654     return indexOfLastCharChecked;
   3655 #elif !UCONFIG_NO_BREAK_ITERATION
   3656     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3657 
   3658     if (breakiterator == NULL) {
   3659         breakiterator = strsrch->search->internalBreakIter;
   3660     }
   3661 
   3662     if (breakiterator != NULL) {
   3663         return ubrk_following(breakiterator, startIndex);
   3664     }
   3665 
   3666     return startIndex;
   3667 #else
   3668     // **** or should we use the original code? ****
   3669     return startIndex;
   3670 #endif
   3671 
   3672 }
   3673 
   3674 /*
   3675  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3676  * has an external break iterator, test using that, otherwise test
   3677  * using the internal character break iterator.
   3678  */
   3679 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3680 #if 0
   3681     const UChar *text = strsrch->search->text;
   3682     int32_t textLen   = strsrch->search->textLength;
   3683 
   3684     U_ASSERT(index>=0);
   3685     U_ASSERT(index<=textLen);
   3686 
   3687     if (index>=textLen || index<=0) {
   3688         return TRUE;
   3689     }
   3690 
   3691     // If the character at the current index is not a GRAPHEME_EXTEND
   3692     //    then we can not be within a combining sequence.
   3693     UChar32  c;
   3694     U16_GET(text, 0, index, textLen, c);
   3695     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3696     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3697         return TRUE;
   3698     }
   3699 
   3700     // We are at a combining mark.  If the preceding character is anything
   3701     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3702     U16_PREV(text, 0, index, c);
   3703     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3704     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3705     return !combining;
   3706 #elif !UCONFIG_NO_BREAK_ITERATION
   3707     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3708 
   3709     if (breakiterator == NULL) {
   3710         breakiterator = strsrch->search->internalBreakIter;
   3711     }
   3712 
   3713     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
   3714 #else
   3715     // **** or use the original code? ****
   3716     return TRUE;
   3717 #endif
   3718 }
   3719 
   3720 #if 0
   3721 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3722 {
   3723 #if !UCONFIG_NO_BREAK_ITERATION
   3724     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3725 
   3726     if (breakiterator != NULL) {
   3727         int32_t startindex = ubrk_first(breakiterator);
   3728         int32_t endindex   = ubrk_last(breakiterator);
   3729 
   3730         // out-of-range indexes are never boundary positions
   3731         if (start < startindex || start > endindex ||
   3732             end < startindex || end > endindex) {
   3733             return FALSE;
   3734         }
   3735 
   3736         return ubrk_isBoundary(breakiterator, start) &&
   3737                ubrk_isBoundary(breakiterator, end);
   3738     }
   3739 #endif
   3740 
   3741     return TRUE;
   3742 }
   3743 #endif
   3744 
   3745 typedef enum {
   3746     U_CE_MATCH = -1,
   3747     U_CE_NO_MATCH = 0,
   3748     U_CE_SKIP_TARG,
   3749     U_CE_SKIP_PATN
   3750 } UCompareCEsResult;
   3751 #define U_CE_LEVEL2_BASE 0x00000005
   3752 #define U_CE_LEVEL3_BASE 0x00050000
   3753 
   3754 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
   3755     if (targCE == patCE) {
   3756         return U_CE_MATCH;
   3757     }
   3758     if (compareType == 0) {
   3759         return U_CE_NO_MATCH;
   3760     }
   3761 
   3762     int64_t targCEshifted = targCE >> 32;
   3763     int64_t patCEshifted = patCE >> 32;
   3764     int64_t mask;
   3765 
   3766     mask = 0xFFFF0000;
   3767     int32_t targLev1 = (int32_t)(targCEshifted & mask);
   3768     int32_t patLev1 = (int32_t)(patCEshifted & mask);
   3769     if ( targLev1 != patLev1 ) {
   3770         if ( targLev1 == 0 ) {
   3771             return U_CE_SKIP_TARG;
   3772         }
   3773         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3774             return U_CE_SKIP_PATN;
   3775         }
   3776         return U_CE_NO_MATCH;
   3777     }
   3778 
   3779     mask = 0x0000FFFF;
   3780     int32_t targLev2 = (int32_t)(targCEshifted & mask);
   3781     int32_t patLev2 = (int32_t)(patCEshifted & mask);
   3782     if ( targLev2 != patLev2 ) {
   3783         if ( targLev2 == 0 ) {
   3784             return U_CE_SKIP_TARG;
   3785         }
   3786         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3787             return U_CE_SKIP_PATN;
   3788         }
   3789         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
   3790             U_CE_MATCH: U_CE_NO_MATCH;
   3791     }
   3792 
   3793     mask = 0xFFFF0000;
   3794     int32_t targLev3 = (int32_t)(targCE & mask);
   3795     int32_t patLev3 = (int32_t)(patCE & mask);
   3796     if ( targLev3 != patLev3 ) {
   3797         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
   3798             U_CE_MATCH: U_CE_NO_MATCH;
   3799    }
   3800 
   3801     return U_CE_MATCH;
   3802 }
   3803 
   3804 #if BOYER_MOORE
   3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
   3806 #endif
   3807 
   3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3809                                        int32_t        startIdx,
   3810                                        int32_t        *matchStart,
   3811                                        int32_t        *matchLimit,
   3812                                        UErrorCode     *status)
   3813 {
   3814     if (U_FAILURE(*status)) {
   3815         return FALSE;
   3816     }
   3817 
   3818     // TODO:  reject search patterns beginning with a combining char.
   3819 
   3820 #ifdef USEARCH_DEBUG
   3821     if (getenv("USEARCH_DEBUG") != NULL) {
   3822         printf("Pattern CEs\n");
   3823         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
   3824             printf(" %8x", strsrch->pattern.ces[ii]);
   3825         }
   3826         printf("\n");
   3827     }
   3828 
   3829 #endif
   3830     // Input parameter sanity check.
   3831     //  TODO:  should input indicies clip to the text length
   3832     //         in the same way that UText does.
   3833     if(strsrch->pattern.cesLength == 0         ||
   3834        startIdx < 0                           ||
   3835        startIdx > strsrch->search->textLength ||
   3836        strsrch->pattern.ces == NULL) {
   3837            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3838            return FALSE;
   3839     }
   3840 
   3841     if (strsrch->pattern.pces == NULL) {
   3842         initializePatternPCETable(strsrch, status);
   3843     }
   3844 
   3845     ucol_setOffset(strsrch->textIter, startIdx, status);
   3846     CEIBuffer ceb(strsrch, status);
   3847 
   3848 
   3849     int32_t    targetIx = 0;
   3850     const CEI *targetCEI = NULL;
   3851     int32_t    patIx;
   3852     UBool      found;
   3853 
   3854     int32_t  mStart = -1;
   3855     int32_t  mLimit = -1;
   3856     int32_t  minLimit;
   3857     int32_t  maxLimit;
   3858 
   3859 
   3860 
   3861     // Outer loop moves over match starting positions in the
   3862     //      target CE space.
   3863     // Here we see the target as a sequence of collation elements, resulting from the following:
   3864     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   3865     //    (for example, digraphs such as IJ may be broken into two characters).
   3866     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   3867     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   3868     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   3869     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   3870     //    then the CE is deleted, so the following code sees only CEs that are relevant.
   3871     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   3872     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   3873     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   3874     //
   3875     for(targetIx=0; ; targetIx++)
   3876     {
   3877         found = TRUE;
   3878         //  Inner loop checks for a match beginning at each
   3879         //  position from the outer loop.
   3880         int32_t targetIxOffset = 0;
   3881         int64_t patCE = 0;
   3882         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
   3883         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   3884         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
   3885         const CEI *firstCEI = ceb.get(targetIx);
   3886         if (firstCEI == NULL) {
   3887             *status = U_INTERNAL_PROGRAM_ERROR;
   3888             found = FALSE;
   3889             break;
   3890         }
   3891 
   3892         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
   3893             patCE = strsrch->pattern.pces[patIx];
   3894             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
   3895             //  Compare CE from target string with CE from the pattern.
   3896             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   3897             //    which will fail the compare, below.
   3898             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   3899             if ( ceMatch == U_CE_NO_MATCH ) {
   3900                 found = FALSE;
   3901                 break;
   3902             } else if ( ceMatch > U_CE_NO_MATCH ) {
   3903                 if ( ceMatch == U_CE_SKIP_TARG ) {
   3904                     // redo with same patCE, next targCE
   3905                     patIx--;
   3906                     targetIxOffset++;
   3907                 } else { // ceMatch == U_CE_SKIP_PATN
   3908                     // redo with same targCE, next patCE
   3909                     targetIxOffset--;
   3910                 }
   3911             }
   3912         }
   3913         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
   3914 
   3915         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   3916             // No match at this targetIx.  Try again at the next.
   3917             continue;
   3918         }
   3919 
   3920         if (!found) {
   3921             // No match at all, we have run off the end of the target text.
   3922             break;
   3923         }
   3924 
   3925 
   3926         // We have found a match in CE space.
   3927         // Now determine the bounds in string index space.
   3928         //  There still is a chance of match failure if the CE range not correspond to
   3929         //     an acceptable character range.
   3930         //
   3931         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
   3932 
   3933         mStart   = firstCEI->lowIndex;
   3934         minLimit = lastCEI->lowIndex;
   3935 
   3936         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3937         //   extended to the end of input, and the match is good.
   3938 
   3939         // Look at the high and low indices of the CE following the match. If
   3940         // they are the same it means one of two things:
   3941         //    1. The match extended to the last CE from the target text, which is OK, or
   3942         //    2. The last CE that was part of the match is in an expansion that extends
   3943         //       to the first CE after the match. In this case, we reject the match.
   3944         const CEI *nextCEI = 0;
   3945         if (strsrch->search->elementComparisonType == 0) {
   3946             nextCEI  = ceb.get(targetIx + targetIxOffset);
   3947             maxLimit = nextCEI->lowIndex;
   3948             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3949                 found = FALSE;
   3950             }
   3951         } else {
   3952             for ( ; ; ++targetIxOffset ) {
   3953                 nextCEI = ceb.get(targetIx + targetIxOffset);
   3954                 maxLimit = nextCEI->lowIndex;
   3955                 // If we are at the end of the target too, match succeeds
   3956                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
   3957                     break;
   3958                 }
   3959                 // As long as the next CE has primary weight of 0,
   3960                 // it is part of the last target element matched by the pattern;
   3961                 // make sure it can be part of a match with the last patCE
   3962                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
   3963                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
   3964                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
   3965                         found = FALSE;
   3966                         break;
   3967                     }
   3968                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   3969                 // target element, but it has non-zero primary weight => match fails
   3970                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
   3971                     found = false;
   3972                     break;
   3973                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
   3974                 } else {
   3975                     break;
   3976                 }
   3977             }
   3978         }
   3979 
   3980 
   3981         // Check for the start of the match being within a combining sequence.
   3982         //   This can happen if the pattern itself begins with a combining char, and
   3983         //   the match found combining marks in the target text that were attached
   3984         //    to something else.
   3985         //   This type of match should be rejected for not completely consuming a
   3986         //   combining sequence.
   3987         if (!isBreakBoundary(strsrch, mStart)) {
   3988             found = FALSE;
   3989         }
   3990 
   3991         // Check for the start of the match being within an Collation Element Expansion,
   3992         //   meaning that the first char of the match is only partially matched.
   3993         //   With exapnsions, the first CE will report the index of the source
   3994         //   character, and all subsequent (expansions) CEs will report the source index of the
   3995         //    _following_ character.
   3996         int32_t secondIx = firstCEI->highIndex;
   3997         if (mStart == secondIx) {
   3998             found = FALSE;
   3999         }
   4000 
   4001         //  Advance the match end position to the first acceptable match boundary.
   4002         //    This advances the index over any combining charcters.
   4003         mLimit = maxLimit;
   4004         if (minLimit < maxLimit) {
   4005             // When the last CE's low index is same with its high index, the CE is likely
   4006             // a part of expansion. In this case, the index is located just after the
   4007             // character corresponding to the CEs compared above. If the index is right
   4008             // at the break boundary, move the position to the next boundary will result
   4009             // incorrect match length when there are ignorable characters exist between
   4010             // the position and the next character produces CE(s). See ticket#8482.
   4011             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
   4012                 mLimit = minLimit;
   4013             } else {
   4014                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4015                 if (nba >= lastCEI->highIndex) {
   4016                     mLimit = nba;
   4017                 }
   4018             }
   4019         }
   4020 
   4021     #ifdef USEARCH_DEBUG
   4022         if (getenv("USEARCH_DEBUG") != NULL) {
   4023             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4024         }
   4025     #endif
   4026 
   4027         // If advancing to the end of a combining sequence in character indexing space
   4028         //   advanced us beyond the end of the match in CE space, reject this match.
   4029         if (mLimit > maxLimit) {
   4030             found = FALSE;
   4031         }
   4032 
   4033         if (!isBreakBoundary(strsrch, mLimit)) {
   4034             found = FALSE;
   4035         }
   4036 
   4037         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4038             found = FALSE;
   4039         }
   4040 
   4041         if (found) {
   4042             break;
   4043         }
   4044     }
   4045 
   4046     #ifdef USEARCH_DEBUG
   4047     if (getenv("USEARCH_DEBUG") != NULL) {
   4048         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4049         int32_t  lastToPrint = ceb.limitIx+2;
   4050         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4051             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4052         }
   4053         printf("\n%s\n", found? "match found" : "no match");
   4054     }
   4055     #endif
   4056 
   4057     // All Done.  Store back the match bounds to the caller.
   4058     //
   4059     if (found==FALSE) {
   4060         mLimit = -1;
   4061         mStart = -1;
   4062     }
   4063 
   4064     if (matchStart != NULL) {
   4065         *matchStart= mStart;
   4066     }
   4067 
   4068     if (matchLimit != NULL) {
   4069         *matchLimit = mLimit;
   4070     }
   4071 
   4072     return found;
   4073 }
   4074 
   4075 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   4076                                                 int32_t        startIdx,
   4077                                                 int32_t        *matchStart,
   4078                                                 int32_t        *matchLimit,
   4079                                                 UErrorCode     *status)
   4080 {
   4081     if (U_FAILURE(*status)) {
   4082         return FALSE;
   4083     }
   4084 
   4085     // TODO:  reject search patterns beginning with a combining char.
   4086 
   4087 #ifdef USEARCH_DEBUG
   4088     if (getenv("USEARCH_DEBUG") != NULL) {
   4089         printf("Pattern CEs\n");
   4090         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
   4091             printf(" %8x", strsrch->pattern.ces[ii]);
   4092         }
   4093         printf("\n");
   4094     }
   4095 
   4096 #endif
   4097     // Input parameter sanity check.
   4098     //  TODO:  should input indicies clip to the text length
   4099     //         in the same way that UText does.
   4100     if(strsrch->pattern.cesLength == 0         ||
   4101        startIdx < 0                           ||
   4102        startIdx > strsrch->search->textLength ||
   4103        strsrch->pattern.ces == NULL) {
   4104            *status = U_ILLEGAL_ARGUMENT_ERROR;
   4105            return FALSE;
   4106     }
   4107 
   4108     if (strsrch->pattern.pces == NULL) {
   4109         initializePatternPCETable(strsrch, status);
   4110     }
   4111 
   4112     CEIBuffer ceb(strsrch, status);
   4113     int32_t    targetIx = 0;
   4114 
   4115     /*
   4116      * Pre-load the buffer with the CE's for the grapheme
   4117      * after our starting position so that we're sure that
   4118      * we can look at the CE following the match when we
   4119      * check the match boundaries.
   4120      *
   4121      * This will also pre-fetch the first CE that we'll
   4122      * consider for the match.
   4123      */
   4124     if (startIdx < strsrch->search->textLength) {
   4125         UBreakIterator *bi = strsrch->search->internalBreakIter;
   4126         int32_t next = ubrk_following(bi, startIdx);
   4127 
   4128         ucol_setOffset(strsrch->textIter, next, status);
   4129 
   4130         for (targetIx = 0; ; targetIx += 1) {
   4131             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   4132                 break;
   4133             }
   4134         }
   4135     } else {
   4136         ucol_setOffset(strsrch->textIter, startIdx, status);
   4137     }
   4138 
   4139 
   4140     const CEI *targetCEI = NULL;
   4141     int32_t    patIx;
   4142     UBool      found;
   4143 
   4144     int32_t  limitIx = targetIx;
   4145     int32_t  mStart = -1;
   4146     int32_t  mLimit = -1;
   4147     int32_t  minLimit;
   4148     int32_t  maxLimit;
   4149 
   4150 
   4151 
   4152     // Outer loop moves over match starting positions in the
   4153     //      target CE space.
   4154     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   4155     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   4156     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   4157     // and the beginning of the base text.
   4158     for(targetIx = limitIx; ; targetIx += 1)
   4159     {
   4160         found = TRUE;
   4161         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
   4162         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   4163         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
   4164         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4165         if (lastCEI == NULL) {
   4166             *status = U_INTERNAL_PROGRAM_ERROR;
   4167             found = FALSE;
   4168              break;
   4169         }
   4170         //  Inner loop checks for a match beginning at each
   4171         //  position from the outer loop.
   4172         int32_t targetIxOffset = 0;
   4173         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
   4174             int64_t patCE = strsrch->pattern.pces[patIx];
   4175 
   4176             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
   4177             //  Compare CE from target string with CE from the pattern.
   4178             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4179             //    which will fail the compare, below.
   4180             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   4181             if ( ceMatch == U_CE_NO_MATCH ) {
   4182                 found = FALSE;
   4183                 break;
   4184             } else if ( ceMatch > U_CE_NO_MATCH ) {
   4185                 if ( ceMatch == U_CE_SKIP_TARG ) {
   4186                     // redo with same patCE, next targCE
   4187                     patIx++;
   4188                     targetIxOffset++;
   4189                 } else { // ceMatch == U_CE_SKIP_PATN
   4190                     // redo with same targCE, next patCE
   4191                     targetIxOffset--;
   4192                 }
   4193             }
   4194         }
   4195 
   4196         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   4197             // No match at this targetIx.  Try again at the next.
   4198             continue;
   4199         }
   4200 
   4201         if (!found) {
   4202             // No match at all, we have run off the end of the target text.
   4203             break;
   4204         }
   4205 
   4206 
   4207         // We have found a match in CE space.
   4208         // Now determine the bounds in string index space.
   4209         //  There still is a chance of match failure if the CE range not correspond to
   4210         //     an acceptable character range.
   4211         //
   4212         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
   4213         mStart   = firstCEI->lowIndex;
   4214 
   4215         // Check for the start of the match being within a combining sequence.
   4216         //   This can happen if the pattern itself begins with a combining char, and
   4217         //   the match found combining marks in the target text that were attached
   4218         //    to something else.
   4219         //   This type of match should be rejected for not completely consuming a
   4220         //   combining sequence.
   4221         if (!isBreakBoundary(strsrch, mStart)) {
   4222             found = FALSE;
   4223         }
   4224 
   4225         // Look at the high index of the first CE in the match. If it's the same as the
   4226         // low index, the first CE in the match is in the middle of an expansion.
   4227         if (mStart == firstCEI->highIndex) {
   4228             found = FALSE;
   4229         }
   4230 
   4231 
   4232         minLimit = lastCEI->lowIndex;
   4233 
   4234         if (targetIx > 0) {
   4235             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4236             //   extended to the end of input, and the match is good.
   4237 
   4238             // Look at the high and low indices of the CE following the match. If
   4239             // they are the same it means one of two things:
   4240             //    1. The match extended to the last CE from the target text, which is OK, or
   4241             //    2. The last CE that was part of the match is in an expansion that extends
   4242             //       to the first CE after the match. In this case, we reject the match.
   4243             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
   4244 
   4245             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4246                 found = FALSE;
   4247             }
   4248 
   4249             mLimit = maxLimit = nextCEI->lowIndex;
   4250 
   4251             //  Advance the match end position to the first acceptable match boundary.
   4252             //    This advances the index over any combining charcters.
   4253             if (minLimit < maxLimit) {
   4254                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4255 
   4256                 if (nba >= lastCEI->highIndex) {
   4257                     mLimit = nba;
   4258                 }
   4259             }
   4260 
   4261             // If advancing to the end of a combining sequence in character indexing space
   4262             //   advanced us beyond the end of the match in CE space, reject this match.
   4263             if (mLimit > maxLimit) {
   4264                 found = FALSE;
   4265             }
   4266 
   4267             // Make sure the end of the match is on a break boundary
   4268             if (!isBreakBoundary(strsrch, mLimit)) {
   4269                 found = FALSE;
   4270             }
   4271 
   4272         } else {
   4273             // No non-ignorable CEs after this point.
   4274             // The maximum position is detected by boundary after
   4275             // the last non-ignorable CE. Combining sequence
   4276             // across the start index will be truncated.
   4277             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4278             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
   4279         }
   4280 
   4281     #ifdef USEARCH_DEBUG
   4282         if (getenv("USEARCH_DEBUG") != NULL) {
   4283             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4284         }
   4285     #endif
   4286 
   4287 
   4288         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4289             found = FALSE;
   4290         }
   4291 
   4292         if (found) {
   4293             break;
   4294         }
   4295     }
   4296 
   4297     #ifdef USEARCH_DEBUG
   4298     if (getenv("USEARCH_DEBUG") != NULL) {
   4299         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4300         int32_t  lastToPrint = ceb.limitIx+2;
   4301         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4302             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4303         }
   4304         printf("\n%s\n", found? "match found" : "no match");
   4305     }
   4306     #endif
   4307 
   4308     // All Done.  Store back the match bounds to the caller.
   4309     //
   4310     if (found==FALSE) {
   4311         mLimit = -1;
   4312         mStart = -1;
   4313     }
   4314 
   4315     if (matchStart != NULL) {
   4316         *matchStart= mStart;
   4317     }
   4318 
   4319     if (matchLimit != NULL) {
   4320         *matchLimit = mLimit;
   4321     }
   4322 
   4323     return found;
   4324 }
   4325 
   4326 // internal use methods declared in usrchimp.h -----------------------------
   4327 
   4328 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4329 {
   4330     if (U_FAILURE(*status)) {
   4331         setMatchNotFound(strsrch);
   4332         return FALSE;
   4333     }
   4334 
   4335 #if BOYER_MOORE
   4336     UCollationElements *coleiter        = strsrch->textIter;
   4337     int32_t             textlength      = strsrch->search->textLength;
   4338     int32_t            *patternce       = strsrch->pattern.ces;
   4339     int32_t             patterncelength = strsrch->pattern.cesLength;
   4340     int32_t             textoffset      = ucol_getOffset(coleiter);
   4341 
   4342     // status used in setting coleiter offset, since offset is checked in
   4343     // shiftForward before setting the coleiter offset, status never
   4344     // a failure
   4345     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4346                               patterncelength);
   4347     while (textoffset <= textlength)
   4348     {
   4349         uint32_t    patternceindex = patterncelength - 1;
   4350         int32_t     targetce;
   4351         UBool       found          = FALSE;
   4352         int32_t    lastce          = UCOL_NULLORDER;
   4353 
   4354         setColEIterOffset(coleiter, textoffset);
   4355 
   4356         for (;;) {
   4357             // finding the last pattern ce match, imagine composite characters
   4358             // for example: search for pattern A in text \u00C0
   4359             // we'll have to skip \u0300 the grave first before we get to A
   4360             targetce = ucol_previous(coleiter, status);
   4361             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4362                 found = FALSE;
   4363                 break;
   4364             }
   4365             targetce = getCE(strsrch, targetce);
   4366             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4367                 // this is for the text \u0315\u0300 that requires
   4368                 // normalization and pattern \u0300, where \u0315 is ignorable
   4369                 continue;
   4370             }
   4371             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4372                 lastce = targetce;
   4373             }
   4374             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4375             if (targetce == patternce[patternceindex]) {
   4376                 // the first ce can be a contraction
   4377                 found = TRUE;
   4378                 break;
   4379             }
   4380             if (!hasExpansion(coleiter)) {
   4381                 found = FALSE;
   4382                 break;
   4383             }
   4384         }
   4385 
   4386         //targetce = lastce;
   4387 
   4388         while (found && patternceindex > 0) {
   4389             lastce = targetce;
   4390             targetce    = ucol_previous(coleiter, status);
   4391             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4392                 found = FALSE;
   4393                 break;
   4394             }
   4395             targetce    = getCE(strsrch, targetce);
   4396             if (targetce == UCOL_IGNORABLE) {
   4397                 continue;
   4398             }
   4399 
   4400             patternceindex --;
   4401             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4402             found = found && targetce == patternce[patternceindex];
   4403         }
   4404 
   4405         targetce = lastce;
   4406 
   4407         if (!found) {
   4408             if (U_FAILURE(*status)) {
   4409                 break;
   4410             }
   4411             textoffset = shiftForward(strsrch, textoffset, lastce,
   4412                                       patternceindex);
   4413             // status checked at loop.
   4414             patternceindex = patterncelength;
   4415             continue;
   4416         }
   4417 
   4418         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4419             // status checked in ucol_setOffset
   4420             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4421             return TRUE;
   4422         }
   4423     }
   4424     setMatchNotFound(strsrch);
   4425     return FALSE;
   4426 #else
   4427     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4428     int32_t start = -1;
   4429     int32_t end = -1;
   4430 
   4431     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4432         strsrch->search->matchedIndex  = start;
   4433         strsrch->search->matchedLength = end - start;
   4434         return TRUE;
   4435     } else {
   4436         setMatchNotFound(strsrch);
   4437         return FALSE;
   4438     }
   4439 #endif
   4440 }
   4441 
   4442 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4443 {
   4444     if (U_FAILURE(*status)) {
   4445         setMatchNotFound(strsrch);
   4446         return FALSE;
   4447     }
   4448 
   4449 #if BOYER_MOORE
   4450     UCollationElements *coleiter        = strsrch->textIter;
   4451     int32_t             textlength      = strsrch->search->textLength;
   4452     int32_t            *patternce       = strsrch->pattern.ces;
   4453     int32_t             patterncelength = strsrch->pattern.cesLength;
   4454     int32_t             textoffset      = ucol_getOffset(coleiter);
   4455     UBool               hasPatternAccents =
   4456        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4457 
   4458     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4459                               patterncelength);
   4460     strsrch->canonicalPrefixAccents[0] = 0;
   4461     strsrch->canonicalSuffixAccents[0] = 0;
   4462 
   4463     while (textoffset <= textlength)
   4464     {
   4465         int32_t     patternceindex = patterncelength - 1;
   4466         int32_t     targetce;
   4467         UBool       found          = FALSE;
   4468         int32_t     lastce         = UCOL_NULLORDER;
   4469 
   4470         setColEIterOffset(coleiter, textoffset);
   4471 
   4472         for (;;) {
   4473             // finding the last pattern ce match, imagine composite characters
   4474             // for example: search for pattern A in text \u00C0
   4475             // we'll have to skip \u0300 the grave first before we get to A
   4476             targetce = ucol_previous(coleiter, status);
   4477             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4478                 found = FALSE;
   4479                 break;
   4480             }
   4481             targetce = getCE(strsrch, targetce);
   4482             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4483                 lastce = targetce;
   4484             }
   4485             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4486             if (targetce == patternce[patternceindex]) {
   4487                 // the first ce can be a contraction
   4488                 found = TRUE;
   4489                 break;
   4490             }
   4491             if (!hasExpansion(coleiter)) {
   4492                 found = FALSE;
   4493                 break;
   4494             }
   4495         }
   4496 
   4497         while (found && patternceindex > 0) {
   4498             targetce    = ucol_previous(coleiter, status);
   4499             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4500                 found = FALSE;
   4501                 break;
   4502             }
   4503             targetce    = getCE(strsrch, targetce);
   4504             if (targetce == UCOL_IGNORABLE) {
   4505                 continue;
   4506             }
   4507 
   4508             patternceindex --;
   4509             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4510             found = found && targetce == patternce[patternceindex];
   4511         }
   4512 
   4513         // initializing the rearranged accent array
   4514         if (hasPatternAccents && !found) {
   4515             strsrch->canonicalPrefixAccents[0] = 0;
   4516             strsrch->canonicalSuffixAccents[0] = 0;
   4517             if (U_FAILURE(*status)) {
   4518                 break;
   4519             }
   4520             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4521         }
   4522 
   4523         if (!found) {
   4524             if (U_FAILURE(*status)) {
   4525                 break;
   4526             }
   4527             textoffset = shiftForward(strsrch, textoffset, lastce,
   4528                                       patternceindex);
   4529             // status checked at loop
   4530             patternceindex = patterncelength;
   4531             continue;
   4532         }
   4533 
   4534         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4535             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4536             return TRUE;
   4537         }
   4538     }
   4539     setMatchNotFound(strsrch);
   4540     return FALSE;
   4541 #else
   4542     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4543     int32_t start = -1;
   4544     int32_t end = -1;
   4545 
   4546     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4547         strsrch->search->matchedIndex  = start;
   4548         strsrch->search->matchedLength = end - start;
   4549         return TRUE;
   4550     } else {
   4551         setMatchNotFound(strsrch);
   4552         return FALSE;
   4553     }
   4554 #endif
   4555 }
   4556 
   4557 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4558 {
   4559     if (U_FAILURE(*status)) {
   4560         setMatchNotFound(strsrch);
   4561         return FALSE;
   4562     }
   4563 
   4564 #if BOYER_MOORE
   4565     UCollationElements *coleiter        = strsrch->textIter;
   4566     int32_t            *patternce       = strsrch->pattern.ces;
   4567     int32_t             patterncelength = strsrch->pattern.cesLength;
   4568     int32_t             textoffset      = ucol_getOffset(coleiter);
   4569 
   4570     // shifting it check for setting offset
   4571     // if setOffset is called previously or there was no previous match, we
   4572     // leave the offset as it is.
   4573     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4574         textoffset = strsrch->search->matchedIndex;
   4575     }
   4576 
   4577     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4578                               patterncelength);
   4579 
   4580     while (textoffset >= 0)
   4581     {
   4582         int32_t     patternceindex = 1;
   4583         int32_t     targetce;
   4584         UBool       found          = FALSE;
   4585         int32_t     firstce        = UCOL_NULLORDER;
   4586 
   4587         // if status is a failure, ucol_setOffset does nothing
   4588         setColEIterOffset(coleiter, textoffset);
   4589 
   4590         for (;;) {
   4591             // finding the first pattern ce match, imagine composite
   4592             // characters. for example: search for pattern \u0300 in text
   4593             // \u00C0, we'll have to skip A first before we get to
   4594             // \u0300 the grave accent
   4595             targetce = ucol_next(coleiter, status);
   4596             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4597                 found = FALSE;
   4598                 break;
   4599             }
   4600             targetce = getCE(strsrch, targetce);
   4601             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4602                 firstce = targetce;
   4603             }
   4604             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4605                 continue;
   4606             }
   4607             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4608             if (targetce == patternce[0]) {
   4609                 found = TRUE;
   4610                 break;
   4611             }
   4612             if (!hasExpansion(coleiter)) {
   4613                 // checking for accents in composite character
   4614                 found = FALSE;
   4615                 break;
   4616             }
   4617         }
   4618 
   4619         //targetce = firstce;
   4620 
   4621         while (found && (patternceindex < patterncelength)) {
   4622             firstce = targetce;
   4623             targetce    = ucol_next(coleiter, status);
   4624             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4625                 found = FALSE;
   4626                 break;
   4627             }
   4628             targetce    = getCE(strsrch, targetce);
   4629             if (targetce == UCOL_IGNORABLE) {
   4630                 continue;
   4631             }
   4632 
   4633             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4634             found = found && targetce == patternce[patternceindex];
   4635             patternceindex ++;
   4636         }
   4637 
   4638         targetce = firstce;
   4639 
   4640         if (!found) {
   4641             if (U_FAILURE(*status)) {
   4642                 break;
   4643             }
   4644 
   4645             textoffset = reverseShift(strsrch, textoffset, targetce,
   4646                                       patternceindex);
   4647             patternceindex = 0;
   4648             continue;
   4649         }
   4650 
   4651         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4652             setColEIterOffset(coleiter, textoffset);
   4653             return TRUE;
   4654         }
   4655     }
   4656     setMatchNotFound(strsrch);
   4657     return FALSE;
   4658 #else
   4659     int32_t textOffset;
   4660 
   4661     if (strsrch->search->isOverlap) {
   4662         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4663             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4664         } else {
   4665             // move the start position at the end of possible match
   4666             initializePatternPCETable(strsrch, status);
   4667             if (!initTextProcessedIter(strsrch, status)) {
   4668                 setMatchNotFound(strsrch);
   4669                 return FALSE;
   4670             }
   4671             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
   4672                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4673                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4674                     // at the end of the text
   4675                     break;
   4676                 }
   4677             }
   4678             if (U_FAILURE(*status)) {
   4679                 setMatchNotFound(strsrch);
   4680                 return FALSE;
   4681             }
   4682             textOffset = ucol_getOffset(strsrch->textIter);
   4683         }
   4684     } else {
   4685         textOffset = ucol_getOffset(strsrch->textIter);
   4686     }
   4687 
   4688     int32_t start = -1;
   4689     int32_t end = -1;
   4690 
   4691     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4692         strsrch->search->matchedIndex = start;
   4693         strsrch->search->matchedLength = end - start;
   4694         return TRUE;
   4695     } else {
   4696         setMatchNotFound(strsrch);
   4697         return FALSE;
   4698     }
   4699 #endif
   4700 }
   4701 
   4702 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4703                                       UErrorCode    *status)
   4704 {
   4705     if (U_FAILURE(*status)) {
   4706         setMatchNotFound(strsrch);
   4707         return FALSE;
   4708     }
   4709 
   4710 #if BOYER_MOORE
   4711     UCollationElements *coleiter        = strsrch->textIter;
   4712     int32_t            *patternce       = strsrch->pattern.ces;
   4713     int32_t             patterncelength = strsrch->pattern.cesLength;
   4714     int32_t             textoffset      = ucol_getOffset(coleiter);
   4715     UBool               hasPatternAccents =
   4716        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4717 
   4718     // shifting it check for setting offset
   4719     // if setOffset is called previously or there was no previous match, we
   4720     // leave the offset as it is.
   4721     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4722         textoffset = strsrch->search->matchedIndex;
   4723     }
   4724 
   4725     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4726                               patterncelength);
   4727     strsrch->canonicalPrefixAccents[0] = 0;
   4728     strsrch->canonicalSuffixAccents[0] = 0;
   4729 
   4730     while (textoffset >= 0)
   4731     {
   4732         int32_t     patternceindex = 1;
   4733         int32_t     targetce;
   4734         UBool       found          = FALSE;
   4735         int32_t     firstce        = UCOL_NULLORDER;
   4736 
   4737         setColEIterOffset(coleiter, textoffset);
   4738         for (;;) {
   4739             // finding the first pattern ce match, imagine composite
   4740             // characters. for example: search for pattern \u0300 in text
   4741             // \u00C0, we'll have to skip A first before we get to
   4742             // \u0300 the grave accent
   4743             targetce = ucol_next(coleiter, status);
   4744             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4745                 found = FALSE;
   4746                 break;
   4747             }
   4748             targetce = getCE(strsrch, targetce);
   4749             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4750                 firstce = targetce;
   4751             }
   4752 
   4753             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4754             if (targetce == patternce[0]) {
   4755                 // the first ce can be a contraction
   4756                 found = TRUE;
   4757                 break;
   4758             }
   4759             if (!hasExpansion(coleiter)) {
   4760                 // checking for accents in composite character
   4761                 found = FALSE;
   4762                 break;
   4763             }
   4764         }
   4765 
   4766         targetce = firstce;
   4767 
   4768         while (found && patternceindex < patterncelength) {
   4769             targetce    = ucol_next(coleiter, status);
   4770             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4771                 found = FALSE;
   4772                 break;
   4773             }
   4774             targetce = getCE(strsrch, targetce);
   4775             if (targetce == UCOL_IGNORABLE) {
   4776                 continue;
   4777             }
   4778 
   4779             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4780             found = found && targetce == patternce[patternceindex];
   4781             patternceindex ++;
   4782         }
   4783 
   4784         // initializing the rearranged accent array
   4785         if (hasPatternAccents && !found) {
   4786             strsrch->canonicalPrefixAccents[0] = 0;
   4787             strsrch->canonicalSuffixAccents[0] = 0;
   4788             if (U_FAILURE(*status)) {
   4789                 break;
   4790             }
   4791             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4792         }
   4793 
   4794         if (!found) {
   4795             if (U_FAILURE(*status)) {
   4796                 break;
   4797             }
   4798             textoffset = reverseShift(strsrch, textoffset, targetce,
   4799                                       patternceindex);
   4800             patternceindex = 0;
   4801             continue;
   4802         }
   4803 
   4804         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4805             setColEIterOffset(coleiter, textoffset);
   4806             return TRUE;
   4807         }
   4808     }
   4809     setMatchNotFound(strsrch);
   4810     return FALSE;
   4811 #else
   4812     int32_t textOffset;
   4813 
   4814     if (strsrch->search->isOverlap) {
   4815         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4816             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4817         } else {
   4818             // move the start position at the end of possible match
   4819             initializePatternPCETable(strsrch, status);
   4820             if (!initTextProcessedIter(strsrch, status)) {
   4821                 setMatchNotFound(strsrch);
   4822                 return FALSE;
   4823             }
   4824             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
   4825                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4826                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4827                     // at the end of the text
   4828                     break;
   4829                 }
   4830             }
   4831             if (U_FAILURE(*status)) {
   4832                 setMatchNotFound(strsrch);
   4833                 return FALSE;
   4834             }
   4835             textOffset = ucol_getOffset(strsrch->textIter);
   4836         }
   4837     } else {
   4838         textOffset = ucol_getOffset(strsrch->textIter);
   4839     }
   4840 
   4841     int32_t start = -1;
   4842     int32_t end = -1;
   4843 
   4844     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4845         strsrch->search->matchedIndex = start;
   4846         strsrch->search->matchedLength = end - start;
   4847         return TRUE;
   4848     } else {
   4849         setMatchNotFound(strsrch);
   4850         return FALSE;
   4851     }
   4852 #endif
   4853 }
   4854 
   4855 #endif /* #if !UCONFIG_NO_COLLATION */
   4856