Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  07/02/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/usearch.h"
     15 #include "unicode/ustring.h"
     16 #include "unicode/uchar.h"
     17 #include "unicode/utf16.h"
     18 #include "normalizer2impl.h"
     19 #include "ucol_imp.h"
     20 #include "usrchimp.h"
     21 #include "cmemory.h"
     22 #include "ucln_in.h"
     23 #include "uassert.h"
     24 #include "ustr_imp.h"
     25 
     26 U_NAMESPACE_USE
     27 
     28 // don't use Boyer-Moore
     29 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
     30 #define BOYER_MOORE 0
     31 
     32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     33 
     34 // internal definition ---------------------------------------------------
     35 
     36 #define LAST_BYTE_MASK_          0xFF
     37 #define SECOND_LAST_BYTE_SHIFT_  8
     38 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     39 
     40 static const Normalizer2Impl *g_nfcImpl = NULL;
     41 
     42 // internal methods -------------------------------------------------
     43 
     44 /**
     45 * Fast collation element iterator setOffset.
     46 * This function does not check for bounds.
     47 * @param coleiter collation element iterator
     48 * @param offset to set
     49 */
     50 static
     51 inline void setColEIterOffset(UCollationElements *elems,
     52                       int32_t             offset)
     53 {
     54     collIterate *ci = &(elems->iteratordata_);
     55     ci->pos         = ci->string + offset;
     56     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
     57     if (ci->flags & UCOL_ITER_INNORMBUF) {
     58         ci->flags = ci->origFlags;
     59     }
     60     ci->fcdPosition = NULL;
     61 
     62     ci->offsetReturn = NULL;
     63     ci->offsetStore  = ci->offsetBuffer;
     64     ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
     65 }
     66 
     67 /**
     68 * Getting the mask for collation strength
     69 * @param strength collation strength
     70 * @return collation element mask
     71 */
     72 static
     73 inline uint32_t getMask(UCollationStrength strength)
     74 {
     75     switch (strength)
     76     {
     77     case UCOL_PRIMARY:
     78         return UCOL_PRIMARYORDERMASK;
     79     case UCOL_SECONDARY:
     80         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     81     default:
     82         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     83                UCOL_PRIMARYORDERMASK;
     84     }
     85 }
     86 
     87 /**
     88 * This is to squeeze the 21bit ces into a 256 table
     89 * @param ce collation element
     90 * @return collapsed version of the collation element
     91 */
     92 static
     93 inline int hash(uint32_t ce)
     94 {
     95     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
     96     // well with the new collation where most of the latin 1 characters
     97     // are of the value xx000xxx. their hashes will most of the time be 0
     98     // to be discussed on the hash algo.
     99     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
    100 }
    101 
    102 U_CDECL_BEGIN
    103 static UBool U_CALLCONV
    104 usearch_cleanup(void) {
    105     g_nfcImpl = NULL;
    106     return TRUE;
    107 }
    108 U_CDECL_END
    109 
    110 /**
    111 * Initializing the fcd tables.
    112 * Internal method, status assumed to be a success.
    113 * @param status output error if any, caller to check status before calling
    114 *               method, status assumed to be success when passed in.
    115 */
    116 static
    117 inline void initializeFCD(UErrorCode *status)
    118 {
    119     if (g_nfcImpl == NULL) {
    120         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    121         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    122     }
    123 }
    124 
    125 /**
    126 * Gets the fcd value for a character at the argument index.
    127 * This method takes into accounts of the supplementary characters.
    128 * @param str UTF16 string where character for fcd retrieval resides
    129 * @param offset position of the character whose fcd is to be retrieved, to be
    130 *               overwritten with the next character position, taking
    131 *               surrogate characters into consideration.
    132 * @param strlength length of the argument string
    133 * @return fcd value
    134 */
    135 static
    136 uint16_t getFCD(const UChar   *str, int32_t *offset,
    137                              int32_t  strlength)
    138 {
    139     const UChar *temp = str + *offset;
    140     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
    141     *offset = (int32_t)(temp - str);
    142     return result;
    143 }
    144 
    145 /**
    146 * Getting the modified collation elements taking into account the collation
    147 * attributes
    148 * @param strsrch string search data
    149 * @param sourcece
    150 * @return the modified collation element
    151 */
    152 static
    153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    154 {
    155     // note for tertiary we can't use the collator->tertiaryMask, that
    156     // is a preprocessed mask that takes into account case options. since
    157     // we are only concerned with exact matches, we don't need that.
    158     sourcece &= strsrch->ceMask;
    159 
    160     if (strsrch->toShift) {
    161         // alternate handling here, since only the 16 most significant digits
    162         // is only used, we can safely do a compare without masking
    163         // if the ce is a variable, we mask and get only the primary values
    164         // no shifting to quartenary is required since all primary values
    165         // less than variabletop will need to be masked off anyway.
    166         if (strsrch->variableTop > sourcece) {
    167             if (strsrch->strength >= UCOL_QUATERNARY) {
    168                 sourcece &= UCOL_PRIMARYORDERMASK;
    169             }
    170             else {
    171                 sourcece = UCOL_IGNORABLE;
    172             }
    173         }
    174     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    175         sourcece = 0xFFFF;
    176     }
    177 
    178     return sourcece;
    179 }
    180 
    181 /**
    182 * Allocate a memory and returns NULL if it failed.
    183 * Internal method, status assumed to be a success.
    184 * @param size to allocate
    185 * @param status output error if any, caller to check status before calling
    186 *               method, status assumed to be success when passed in.
    187 * @return newly allocated array, NULL otherwise
    188 */
    189 static
    190 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    191 {
    192     uint32_t *result = (uint32_t *)uprv_malloc(size);
    193     if (result == NULL) {
    194         *status = U_MEMORY_ALLOCATION_ERROR;
    195     }
    196     return result;
    197 }
    198 
    199 /**
    200 * Adds a uint32_t value to a destination array.
    201 * Creates a new array if we run out of space. The caller will have to
    202 * manually deallocate the newly allocated array.
    203 * Internal method, status assumed to be success, caller has to check status
    204 * before calling this method. destination not to be NULL and has at least
    205 * size destinationlength.
    206 * @param destination target array
    207 * @param offset destination offset to add value
    208 * @param destinationlength target array size, return value for the new size
    209 * @param value to be added
    210 * @param increments incremental size expected
    211 * @param status output error if any, caller to check status before calling
    212 *               method, status assumed to be success when passed in.
    213 * @return new destination array, destination if there was no new allocation
    214 */
    215 static
    216 inline int32_t * addTouint32_tArray(int32_t    *destination,
    217                                     uint32_t    offset,
    218                                     uint32_t   *destinationlength,
    219                                     uint32_t    value,
    220                                     uint32_t    increments,
    221                                     UErrorCode *status)
    222 {
    223     uint32_t newlength = *destinationlength;
    224     if (offset + 1 == newlength) {
    225         newlength += increments;
    226         int32_t *temp = (int32_t *)allocateMemory(
    227                                          sizeof(int32_t) * newlength, status);
    228         if (U_FAILURE(*status)) {
    229             return NULL;
    230         }
    231         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
    232         *destinationlength = newlength;
    233         destination        = temp;
    234     }
    235     destination[offset] = value;
    236     return destination;
    237 }
    238 
    239 /**
    240 * Adds a uint64_t value to a destination array.
    241 * Creates a new array if we run out of space. The caller will have to
    242 * manually deallocate the newly allocated array.
    243 * Internal method, status assumed to be success, caller has to check status
    244 * before calling this method. destination not to be NULL and has at least
    245 * size destinationlength.
    246 * @param destination target array
    247 * @param offset destination offset to add value
    248 * @param destinationlength target array size, return value for the new size
    249 * @param value to be added
    250 * @param increments incremental size expected
    251 * @param status output error if any, caller to check status before calling
    252 *               method, status assumed to be success when passed in.
    253 * @return new destination array, destination if there was no new allocation
    254 */
    255 static
    256 inline int64_t * addTouint64_tArray(int64_t    *destination,
    257                                     uint32_t    offset,
    258                                     uint32_t   *destinationlength,
    259                                     uint64_t    value,
    260                                     uint32_t    increments,
    261                                     UErrorCode *status)
    262 {
    263     uint32_t newlength = *destinationlength;
    264     if (offset + 1 == newlength) {
    265         newlength += increments;
    266         int64_t *temp = (int64_t *)allocateMemory(
    267                                          sizeof(int64_t) * newlength, status);
    268 
    269         if (U_FAILURE(*status)) {
    270             return NULL;
    271         }
    272 
    273         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
    274         *destinationlength = newlength;
    275         destination        = temp;
    276     }
    277 
    278     destination[offset] = value;
    279 
    280     return destination;
    281 }
    282 
    283 /**
    284 * Initializing the ce table for a pattern.
    285 * Stores non-ignorable collation keys.
    286 * Table size will be estimated by the size of the pattern text. Table
    287 * expansion will be perform as we go along. Adding 1 to ensure that the table
    288 * size definitely increases.
    289 * Internal method, status assumed to be a success.
    290 * @param strsrch string search data
    291 * @param status output error if any, caller to check status before calling
    292 *               method, status assumed to be success when passed in.
    293 * @return total number of expansions
    294 */
    295 static
    296 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    297                                          UErrorCode    *status)
    298 {
    299     UPattern *pattern            = &(strsrch->pattern);
    300     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    301     int32_t  *cetable            = pattern->CEBuffer;
    302     uint32_t  patternlength      = pattern->textLength;
    303     UCollationElements *coleiter = strsrch->utilIter;
    304 
    305     if (coleiter == NULL) {
    306         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    307                                      patternlength, status);
    308         // status will be checked in ucol_next(..) later and if it is an
    309         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    310         // returned.
    311         strsrch->utilIter = coleiter;
    312     }
    313     else {
    314         uprv_init_collIterate(strsrch->collator, pattern->text,
    315                          pattern->textLength,
    316                          &coleiter->iteratordata_,
    317                          status);
    318     }
    319     if(U_FAILURE(*status)) {
    320         return 0;
    321     }
    322 
    323     if (pattern->CE != cetable && pattern->CE) {
    324         uprv_free(pattern->CE);
    325     }
    326 
    327     uint16_t  offset      = 0;
    328     uint16_t  result      = 0;
    329     int32_t   ce;
    330 
    331     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    332            U_SUCCESS(*status)) {
    333         uint32_t newce = getCE(strsrch, ce);
    334         if (newce) {
    335             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    336                                   newce,
    337                                   patternlength - ucol_getOffset(coleiter) + 1,
    338                                   status);
    339             if (U_FAILURE(*status)) {
    340                 return 0;
    341             }
    342             offset ++;
    343             if (cetable != temp && cetable != pattern->CEBuffer) {
    344                 uprv_free(cetable);
    345             }
    346             cetable = temp;
    347         }
    348         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    349     }
    350 
    351     cetable[offset]   = 0;
    352     pattern->CE       = cetable;
    353     pattern->CELength = offset;
    354 
    355     return result;
    356 }
    357 
    358 /**
    359 * Initializing the pce table for a pattern.
    360 * Stores non-ignorable collation keys.
    361 * Table size will be estimated by the size of the pattern text. Table
    362 * expansion will be perform as we go along. Adding 1 to ensure that the table
    363 * size definitely increases.
    364 * Internal method, status assumed to be a success.
    365 * @param strsrch string search data
    366 * @param status output error if any, caller to check status before calling
    367 *               method, status assumed to be success when passed in.
    368 * @return total number of expansions
    369 */
    370 static
    371 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    372                                           UErrorCode    *status)
    373 {
    374     UPattern *pattern            = &(strsrch->pattern);
    375     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    376     int64_t  *pcetable           = pattern->PCEBuffer;
    377     uint32_t  patternlength      = pattern->textLength;
    378     UCollationElements *coleiter = strsrch->utilIter;
    379 
    380     if (coleiter == NULL) {
    381         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    382                                      patternlength, status);
    383         // status will be checked in ucol_next(..) later and if it is an
    384         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    385         // returned.
    386         strsrch->utilIter = coleiter;
    387     } else {
    388         uprv_init_collIterate(strsrch->collator, pattern->text,
    389                               pattern->textLength,
    390                               &coleiter->iteratordata_,
    391                               status);
    392     }
    393     if(U_FAILURE(*status)) {
    394         return 0;
    395     }
    396 
    397     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
    398         uprv_free(pattern->PCE);
    399     }
    400 
    401     uint16_t  offset = 0;
    402     uint16_t  result = 0;
    403     int64_t   pce;
    404 
    405     uprv_init_pce(coleiter);
    406 
    407     // ** Should processed CEs be signed or unsigned?
    408     // ** (the rest of the code in this file seems to play fast-and-loose with
    409     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    410     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    411            U_SUCCESS(*status)) {
    412         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    413                               pce,
    414                               patternlength - ucol_getOffset(coleiter) + 1,
    415                               status);
    416 
    417         if (U_FAILURE(*status)) {
    418             return 0;
    419         }
    420 
    421         offset += 1;
    422 
    423         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
    424             uprv_free(pcetable);
    425         }
    426 
    427         pcetable = temp;
    428         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    429     }
    430 
    431     pcetable[offset]   = 0;
    432     pattern->PCE       = pcetable;
    433     pattern->PCELength = offset;
    434 
    435     return result;
    436 }
    437 
    438 /**
    439 * Initializes the pattern struct.
    440 * Internal method, status assumed to be success.
    441 * @param strsrch UStringSearch data storage
    442 * @param status output error if any, caller to check status before calling
    443 *               method, status assumed to be success when passed in.
    444 * @return expansionsize the total expansion size of the pattern
    445 */
    446 static
    447 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    448 {
    449           UPattern   *pattern     = &(strsrch->pattern);
    450     const UChar      *patterntext = pattern->text;
    451           int32_t     length      = pattern->textLength;
    452           int32_t index       = 0;
    453 
    454     // Since the strength is primary, accents are ignored in the pattern.
    455     if (strsrch->strength == UCOL_PRIMARY) {
    456         pattern->hasPrefixAccents = 0;
    457         pattern->hasSuffixAccents = 0;
    458     } else {
    459         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    460                                                          SECOND_LAST_BYTE_SHIFT_;
    461         index = length;
    462         U16_BACK_1(patterntext, 0, index);
    463         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    464                                                                  LAST_BYTE_MASK_;
    465     }
    466 
    467     // ** HACK **
    468     if (strsrch->pattern.PCE != NULL) {
    469         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
    470             uprv_free(strsrch->pattern.PCE);
    471         }
    472 
    473         strsrch->pattern.PCE = NULL;
    474     }
    475 
    476     // since intializePattern is an internal method status is a success.
    477     return initializePatternCETable(strsrch, status);
    478 }
    479 
    480 /**
    481 * Initializing shift tables, with the default values.
    482 * If a corresponding default value is 0, the shift table is not set.
    483 * @param shift table for forwards shift
    484 * @param backshift table for backwards shift
    485 * @param cetable table containing pattern ce
    486 * @param cesize size of the pattern ces
    487 * @param expansionsize total size of the expansions
    488 * @param defaultforward the default forward value
    489 * @param defaultbackward the default backward value
    490 */
    491 static
    492 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    493                           int32_t  *cetable, int32_t cesize,
    494                           int16_t   expansionsize,
    495                           int16_t   defaultforward,
    496                           int16_t   defaultbackward)
    497 {
    498     // estimate the value to shift. to do that we estimate the smallest
    499     // number of characters to give the relevant ces, ie approximately
    500     // the number of ces minus their expansion, since expansions can come
    501     // from a character.
    502     int32_t count;
    503     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    504         shift[count] = defaultforward;
    505     }
    506     cesize --; // down to the last index
    507     for (count = 0; count < cesize; count ++) {
    508         // number of ces from right of array to the count
    509         int temp = defaultforward - count - 1;
    510         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
    511     }
    512     shift[hash(cetable[cesize])] = 1;
    513     // for ignorables we just shift by one. see test examples.
    514     shift[hash(0)] = 1;
    515 
    516     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    517         backshift[count] = defaultbackward;
    518     }
    519     for (count = cesize; count > 0; count --) {
    520         // the original value count does not seem to work
    521         backshift[hash(cetable[count])] = count > expansionsize ?
    522                                           (int16_t)(count - expansionsize) : 1;
    523     }
    524     backshift[hash(cetable[0])] = 1;
    525     backshift[hash(0)] = 1;
    526 }
    527 
    528 /**
    529 * Building of the pattern collation element list and the boyer moore strsrch
    530 * table.
    531 * The canonical match will only be performed after the default match fails.
    532 * For both cases we need to remember the size of the composed and decomposed
    533 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    534 * a number of characters in the text and tries to match the pattern from that
    535 * offset, the shift value can not be too large in case we miss some
    536 * characters. To choose a right shift size, we estimate the NFC form of the
    537 * and use its size as a shift guide. The NFC form should be the small
    538 * possible representation of the pattern. Anyways, we'll err on the smaller
    539 * shift size. Hence the calculation for minlength.
    540 * Canonical match will be performed slightly differently. We'll split the
    541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    542 * the first and last base character (MS), the ending accents (EA). Matches
    543 * will be done on MS first, and only when we match MS then some processing
    544 * will be required for the prefix and end accents in order to determine if
    545 * they match PA and EA. Hence the default shift values
    546 * for the canonical match will take the size of either end's accent into
    547 * consideration. Forwards search will take the end accents into consideration
    548 * for the default shift values and the backwards search will take the prefix
    549 * accents into consideration.
    550 * If pattern has no non-ignorable ce, we return a illegal argument error.
    551 * Internal method, status assumed to be success.
    552 * @param strsrch UStringSearch data storage
    553 * @param status  for output errors if it occurs, status is assumed to be a
    554 *                success when it is passed in.
    555 */
    556 static
    557 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    558 {
    559     int16_t expandlength  = initializePattern(strsrch, status);
    560     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
    561         UPattern *pattern = &strsrch->pattern;
    562         int32_t   cesize  = pattern->CELength;
    563 
    564         int16_t minlength = cesize > expandlength
    565                             ? (int16_t)cesize - expandlength : 1;
    566         pattern->defaultShiftSize    = minlength;
    567         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
    568                       cesize, expandlength, minlength, minlength);
    569         return;
    570     }
    571     strsrch->pattern.defaultShiftSize = 0;
    572 }
    573 
    574 #if BOYER_MOORE
    575 /**
    576 * Check to make sure that the match length is at the end of the character by
    577 * using the breakiterator.
    578 * @param strsrch string search data
    579 * @param start target text start offset
    580 * @param end target text end offset
    581 */
    582 static
    583 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    584                                int32_t *end)
    585 {
    586 #if !UCONFIG_NO_BREAK_ITERATION
    587     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    588     if (breakiterator) {
    589         int32_t matchend = *end;
    590         //int32_t matchstart = *start;
    591 
    592         if (!ubrk_isBoundary(breakiterator, matchend)) {
    593             *end = ubrk_following(breakiterator, matchend);
    594         }
    595 
    596         /* Check the start of the matched text to make sure it doesn't have any accents
    597          * before it.  This code may not be necessary and so it is commented out */
    598         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    599             *start = ubrk_preceding(breakiterator, matchstart);
    600         }*/
    601     }
    602 #endif
    603 }
    604 
    605 /**
    606 * Determine whether the target text in UStringSearch bounded by the offset
    607 * start and end is one or more whole units of text as
    608 * determined by the breakiterator in UStringSearch.
    609 * @param strsrch string search data
    610 * @param start target text start offset
    611 * @param end target text end offset
    612 */
    613 static
    614 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    615                                int32_t    end)
    616 {
    617 #if !UCONFIG_NO_BREAK_ITERATION
    618     UBreakIterator *breakiterator = strsrch->search->breakIter;
    619     //TODO: Add here.
    620     if (breakiterator) {
    621         int32_t startindex = ubrk_first(breakiterator);
    622         int32_t endindex   = ubrk_last(breakiterator);
    623 
    624         // out-of-range indexes are never boundary positions
    625         if (start < startindex || start > endindex ||
    626             end < startindex || end > endindex) {
    627             return FALSE;
    628         }
    629         // otherwise, we can use following() on the position before the
    630         // specified one and return true of the position we get back is the
    631         // one the user specified
    632         UBool result = (start == startindex ||
    633                 ubrk_following(breakiterator, start - 1) == start) &&
    634                (end == endindex ||
    635                 ubrk_following(breakiterator, end - 1) == end);
    636         if (result) {
    637             // iterates the individual ces
    638                   UCollationElements *coleiter  = strsrch->utilIter;
    639             const UChar              *text      = strsrch->search->text +
    640                                                                       start;
    641                   UErrorCode          status    = U_ZERO_ERROR;
    642             ucol_setText(coleiter, text, end - start, &status);
    643             for (int32_t count = 0; count < strsrch->pattern.CELength;
    644                  count ++) {
    645                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    646                 if (ce == UCOL_IGNORABLE) {
    647                     count --;
    648                     continue;
    649                 }
    650                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
    651                     return FALSE;
    652                 }
    653             }
    654             int32_t nextce = ucol_next(coleiter, &status);
    655             while (ucol_getOffset(coleiter) == (end - start)
    656                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    657                 nextce = ucol_next(coleiter, &status);
    658             }
    659             if (ucol_getOffset(coleiter) == (end - start)
    660                 && nextce != UCOL_NULLORDER) {
    661                 // extra collation elements at the end of the match
    662                 return FALSE;
    663             }
    664         }
    665         return result;
    666     }
    667 #endif
    668     return TRUE;
    669 }
    670 
    671 /**
    672 * Getting the next base character offset if current offset is an accent,
    673 * or the current offset if the current character contains a base character.
    674 * accents the following base character will be returned
    675 * @param text string
    676 * @param textoffset current offset
    677 * @param textlength length of text string
    678 * @return the next base character or the current offset
    679 *         if the current character is contains a base character.
    680 */
    681 static
    682 inline int32_t getNextBaseOffset(const UChar       *text,
    683                                            int32_t  textoffset,
    684                                            int32_t      textlength)
    685 {
    686     if (textoffset < textlength) {
    687         int32_t temp = textoffset;
    688         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    689             while (temp < textlength) {
    690                 int32_t result = temp;
    691                 if ((getFCD(text, &temp, textlength) >>
    692                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    693                     return result;
    694                 }
    695             }
    696             return textlength;
    697         }
    698     }
    699     return textoffset;
    700 }
    701 
    702 /**
    703 * Gets the next base character offset depending on the string search pattern
    704 * data
    705 * @param strsrch string search data
    706 * @param textoffset current offset, one offset away from the last character
    707 *                   to search for.
    708 * @return start index of the next base character or the current offset
    709 *         if the current character is contains a base character.
    710 */
    711 static
    712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    713                                                   int32_t    textoffset)
    714 {
    715     int32_t textlength = strsrch->search->textLength;
    716     if (strsrch->pattern.hasSuffixAccents &&
    717         textoffset < textlength) {
    718               int32_t  temp       = textoffset;
    719         const UChar       *text       = strsrch->search->text;
    720         U16_BACK_1(text, 0, temp);
    721         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    722             return getNextBaseOffset(text, textoffset, textlength);
    723         }
    724     }
    725     return textoffset;
    726 }
    727 
    728 /**
    729 * Shifting the collation element iterator position forward to prepare for
    730 * a following match. If the last character is a unsafe character, we'll only
    731 * shift by 1 to capture contractions, normalization etc.
    732 * Internal method, status assumed to be success.
    733 * @param text strsrch string search data
    734 * @param textoffset start text position to do search
    735 * @param ce the text ce which failed the match.
    736 * @param patternceindex index of the ce within the pattern ce buffer which
    737 *        failed the match
    738 * @return final offset
    739 */
    740 static
    741 inline int32_t shiftForward(UStringSearch *strsrch,
    742                                 int32_t    textoffset,
    743                                 int32_t       ce,
    744                                 int32_t        patternceindex)
    745 {
    746     UPattern *pattern = &(strsrch->pattern);
    747     if (ce != UCOL_NULLORDER) {
    748         int32_t shift = pattern->shift[hash(ce)];
    749         // this is to adjust for characters in the middle of the
    750         // substring for matching that failed.
    751         int32_t adjust = pattern->CELength - patternceindex;
    752         if (adjust > 1 && shift >= adjust) {
    753             shift -= adjust - 1;
    754         }
    755         textoffset += shift;
    756     }
    757     else {
    758         textoffset += pattern->defaultShiftSize;
    759     }
    760 
    761     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    762     // check for unsafe characters
    763     // * if it is the start or middle of a contraction: to be done after
    764     //   a initial match is found
    765     // * thai or lao base consonant character: similar to contraction
    766     // * high surrogate character: similar to contraction
    767     // * next character is a accent: shift to the next base character
    768     return textoffset;
    769 }
    770 #endif // #if BOYER_MOORE
    771 
    772 /**
    773 * sets match not found
    774 * @param strsrch string search data
    775 */
    776 static
    777 inline void setMatchNotFound(UStringSearch *strsrch)
    778 {
    779     // this method resets the match result regardless of the error status.
    780     strsrch->search->matchedIndex = USEARCH_DONE;
    781     strsrch->search->matchedLength = 0;
    782     if (strsrch->search->isForwardSearching) {
    783         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    784     }
    785     else {
    786         setColEIterOffset(strsrch->textIter, 0);
    787     }
    788 }
    789 
    790 #if BOYER_MOORE
    791 /**
    792 * Gets the offset to the next safe point in text.
    793 * ie. not the middle of a contraction, swappable characters or supplementary
    794 * characters.
    795 * @param collator collation sata
    796 * @param text string to work with
    797 * @param textoffset offset in string
    798 * @param textlength length of text string
    799 * @return offset to the next safe character
    800 */
    801 static
    802 inline int32_t getNextSafeOffset(const UCollator   *collator,
    803                                      const UChar       *text,
    804                                            int32_t  textoffset,
    805                                            int32_t      textlength)
    806 {
    807     int32_t result = textoffset; // first contraction character
    808     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    809         result ++;
    810     }
    811     return result;
    812 }
    813 
    814 /**
    815 * This checks for accents in the potential match started with a .
    816 * composite character.
    817 * This is really painful... we have to check that composite character do not
    818 * have any extra accents. We have to normalize the potential match and find
    819 * the immediate decomposed character before the match.
    820 * The first composite character would have been taken care of by the fcd
    821 * checks in checkForwardExactMatch.
    822 * This is the slow path after the fcd of the first character and
    823 * the last character has been checked by checkForwardExactMatch and we
    824 * determine that the potential match has extra non-ignorable preceding
    825 * ces.
    826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    828 * Note here that accents checking are slow and cautioned in the API docs.
    829 * Internal method, status assumed to be a success, caller should check status
    830 * before calling this method
    831 * @param strsrch string search data
    832 * @param start index of the potential unfriendly composite character
    833 * @param end index of the potential unfriendly composite character
    834 * @param status output error status if any.
    835 * @return TRUE if there is non-ignorable accents before at the beginning
    836 *              of the match, FALSE otherwise.
    837 */
    838 
    839 static
    840 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    841                                    int32_t    end,
    842                                    UErrorCode    *status)
    843 {
    844     UBool result = FALSE;
    845     if (strsrch->pattern.hasPrefixAccents) {
    846               int32_t  length = end - start;
    847               int32_t  offset = 0;
    848         const UChar       *text   = strsrch->search->text + start;
    849 
    850         U16_FWD_1(text, offset, length);
    851         // we are only concerned with the first composite character
    852         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    853             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    854                                                        text, 0, length);
    855             if (safeoffset != length) {
    856                 safeoffset ++;
    857             }
    858             UChar   *norm = NULL;
    859             UChar    buffer[INITIAL_ARRAY_SIZE_];
    860             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    861                                             buffer, INITIAL_ARRAY_SIZE_,
    862                                             status);
    863             if (U_FAILURE(*status)) {
    864                 return FALSE;
    865             }
    866             if (size >= INITIAL_ARRAY_SIZE_) {
    867                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    868                                                status);
    869                 // if allocation failed, status will be set to
    870                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    871                 // checks for it.
    872                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    873                                        size, status);
    874                 if (U_FAILURE(*status) && norm != NULL) {
    875                     uprv_free(norm);
    876                     return FALSE;
    877                 }
    878             }
    879             else {
    880                 norm = buffer;
    881             }
    882 
    883             UCollationElements *coleiter  = strsrch->utilIter;
    884             ucol_setText(coleiter, norm, size, status);
    885             uint32_t            firstce   = strsrch->pattern.CE[0];
    886             UBool               ignorable = TRUE;
    887             uint32_t            ce        = UCOL_IGNORABLE;
    888             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    889                 offset = ucol_getOffset(coleiter);
    890                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    891                     ignorable = FALSE;
    892                 }
    893                 ce = ucol_next(coleiter, status);
    894             }
    895             UChar32 codepoint;
    896             U16_PREV(norm, 0, offset, codepoint);
    897             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    898 
    899             if (norm != buffer) {
    900                 uprv_free(norm);
    901             }
    902         }
    903     }
    904 
    905     return result;
    906 }
    907 
    908 /**
    909 * Used by exact matches, checks if there are accents before the match.
    910 * This is really painful... we have to check that composite characters at
    911 * the start of the matches have to not have any extra accents.
    912 * We check the FCD of the character first, if it starts with an accent and
    913 * the first pattern ce does not match the first ce of the character, we bail.
    914 * Otherwise we try normalizing the first composite
    915 * character and find the immediate decomposed character before the match to
    916 * see if it is an non-ignorable accent.
    917 * Now normalizing the first composite character is enough because we ensure
    918 * that when the match is passed in here with extra beginning ces, the
    919 * first or last ce that match has to occur within the first character.
    920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    921 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    922 * Note here that accents checking are slow and cautioned in the API docs.
    923 * @param strsrch string search data
    924 * @param start offset
    925 * @param end offset
    926 * @return TRUE if there are accents on either side of the match,
    927 *         FALSE otherwise
    928 */
    929 static
    930 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    931                                   int32_t    end)
    932 {
    933     if (strsrch->pattern.hasPrefixAccents) {
    934         UCollationElements *coleiter  = strsrch->textIter;
    935         UErrorCode          status    = U_ZERO_ERROR;
    936         // we have been iterating forwards previously
    937         uint32_t            ignorable = TRUE;
    938         int32_t             firstce   = strsrch->pattern.CE[0];
    939 
    940         setColEIterOffset(coleiter, start);
    941         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    942         if (U_FAILURE(status)) {
    943             return TRUE;
    944         }
    945         while (ce != firstce) {
    946             if (ce != UCOL_IGNORABLE) {
    947                 ignorable = FALSE;
    948             }
    949             ce = getCE(strsrch, ucol_next(coleiter, &status));
    950             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    951                 return TRUE;
    952             }
    953         }
    954         if (!ignorable && inNormBuf(coleiter)) {
    955             // within normalization buffer, discontiguous handled here
    956             return TRUE;
    957         }
    958 
    959         // within text
    960         int32_t temp = start;
    961         // original code
    962         // accent = (getFCD(strsrch->search->text, &temp,
    963         //                  strsrch->search->textLength)
    964         //            >> SECOND_LAST_BYTE_SHIFT_);
    965         // however this code does not work well with VC7 .net in release mode.
    966         // maybe the inlines for getFCD combined with shifting has bugs in
    967         // VC7. anyways this is a work around.
    968         UBool accent = getFCD(strsrch->search->text, &temp,
    969                               strsrch->search->textLength) > 0xFF;
    970         if (!accent) {
    971             return checkExtraMatchAccents(strsrch, start, end, &status);
    972         }
    973         if (!ignorable) {
    974             return TRUE;
    975         }
    976         if (start > 0) {
    977             temp = start;
    978             U16_BACK_1(strsrch->search->text, 0, temp);
    979             if (getFCD(strsrch->search->text, &temp,
    980                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    981                 setColEIterOffset(coleiter, start);
    982                 ce = ucol_previous(coleiter, &status);
    983                 if (U_FAILURE(status) ||
    984                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    985                     return TRUE;
    986                 }
    987             }
    988         }
    989     }
    990 
    991     return FALSE;
    992 }
    993 
    994 /**
    995 * Used by exact matches, checks if there are accents bounding the match.
    996 * Note this is the initial boundary check. If the potential match
    997 * starts or ends with composite characters, the accents in those
    998 * characters will be determined later.
    999 * Not doing backwards iteration here, since discontiguos contraction for
   1000 * backwards collation element iterator, use up too many characters.
   1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
   1002 * should fail since there is a acute at the end of \u01FA
   1003 * Note here that accents checking are slow and cautioned in the API docs.
   1004 * @param strsrch string search data
   1005 * @param start offset of match
   1006 * @param end end offset of the match
   1007 * @return TRUE if there are accents on either side of the match,
   1008 *         FALSE otherwise
   1009 */
   1010 static
   1011 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
   1012                                  int32_t    end)
   1013 {
   1014     if (strsrch->pattern.hasSuffixAccents) {
   1015         const UChar       *text       = strsrch->search->text;
   1016               int32_t  temp       = end;
   1017               int32_t      textlength = strsrch->search->textLength;
   1018         U16_BACK_1(text, 0, temp);
   1019         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1020             int32_t             firstce  = strsrch->pattern.CE[0];
   1021             UCollationElements *coleiter = strsrch->textIter;
   1022             UErrorCode          status   = U_ZERO_ERROR;
   1023             int32_t ce;
   1024             setColEIterOffset(coleiter, start);
   1025             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1026                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1027                     return TRUE;
   1028                 }
   1029             }
   1030             int32_t count = 1;
   1031             while (count < strsrch->pattern.CELength) {
   1032                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1033                     == UCOL_IGNORABLE) {
   1034                     // Thai can give an ignorable here.
   1035                     count --;
   1036                 }
   1037                 if (U_FAILURE(status)) {
   1038                     return TRUE;
   1039                 }
   1040                 count ++;
   1041             }
   1042 
   1043             ce = ucol_next(coleiter, &status);
   1044             if (U_FAILURE(status)) {
   1045                 return TRUE;
   1046             }
   1047             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1048                 ce = getCE(strsrch, ce);
   1049             }
   1050             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1051                 if (ucol_getOffset(coleiter) <= end) {
   1052                     return TRUE;
   1053                 }
   1054                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1055                     return TRUE;
   1056                 }
   1057             }
   1058         }
   1059     }
   1060     return FALSE;
   1061 }
   1062 #endif // #if BOYER_MOORE
   1063 
   1064 /**
   1065 * Checks if the offset runs out of the text string
   1066 * @param offset
   1067 * @param textlength of the text string
   1068 * @return TRUE if offset is out of bounds, FALSE otherwise
   1069 */
   1070 static
   1071 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1072 {
   1073     return offset < 0 || offset > textlength;
   1074 }
   1075 
   1076 /**
   1077 * Checks for identical match
   1078 * @param strsrch string search data
   1079 * @param start offset of possible match
   1080 * @param end offset of possible match
   1081 * @return TRUE if identical match is found
   1082 */
   1083 static
   1084 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1085                                   int32_t    end)
   1086 {
   1087     if (strsrch->strength != UCOL_IDENTICAL) {
   1088         return TRUE;
   1089     }
   1090 
   1091     // Note: We could use Normalizer::compare() or similar, but for short strings
   1092     // which may not be in FCD it might be faster to just NFD them.
   1093     UErrorCode status = U_ZERO_ERROR;
   1094     UnicodeString t2, p2;
   1095     strsrch->nfd->normalize(
   1096         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
   1097     strsrch->nfd->normalize(
   1098         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
   1099     // return FALSE if NFD failed
   1100     return U_SUCCESS(status) && t2 == p2;
   1101 }
   1102 
   1103 #if BOYER_MOORE
   1104 /**
   1105 * Checks to see if the match is repeated
   1106 * @param strsrch string search data
   1107 * @param start new match start index
   1108 * @param end new match end index
   1109 * @return TRUE if the the match is repeated, FALSE otherwise
   1110 */
   1111 static
   1112 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1113                                 int32_t    start,
   1114                                 int32_t    end)
   1115 {
   1116     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1117     UBool       result;
   1118     if (lastmatchindex == USEARCH_DONE) {
   1119         return FALSE;
   1120     }
   1121     if (strsrch->search->isForwardSearching) {
   1122         result = start <= lastmatchindex;
   1123     }
   1124     else {
   1125         result = start >= lastmatchindex;
   1126     }
   1127     if (!result && !strsrch->search->isOverlap) {
   1128         if (strsrch->search->isForwardSearching) {
   1129             result = start < lastmatchindex + strsrch->search->matchedLength;
   1130         }
   1131         else {
   1132             result = end > lastmatchindex;
   1133         }
   1134     }
   1135     return result;
   1136 }
   1137 
   1138 /**
   1139 * Gets the collation element iterator's current offset.
   1140 * @param coleiter collation element iterator
   1141 * @param forwards flag TRUE if we are moving in th forwards direction
   1142 * @return current offset
   1143 */
   1144 static
   1145 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1146                                               UBool               forwards)
   1147 {
   1148     int32_t result = ucol_getOffset(coleiter);
   1149     // intricacies of the the backwards collation element iterator
   1150     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1151         result ++;
   1152     }
   1153     return result;
   1154 }
   1155 
   1156 /**
   1157 * Checks match for contraction.
   1158 * If the match ends with a partial contraction we fail.
   1159 * If the match starts too far off (because of backwards iteration) we try to
   1160 * chip off the extra characters depending on whether a breakiterator has
   1161 * been used.
   1162 * Internal method, error assumed to be success, caller has to check status
   1163 * before calling this method.
   1164 * @param strsrch string search data
   1165 * @param start offset of potential match, to be modified if necessary
   1166 * @param end offset of potential match, to be modified if necessary
   1167 * @param status output error status if any
   1168 * @return TRUE if match passes the contraction test, FALSE otherwise
   1169 */
   1170 
   1171 static
   1172 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1173                                      int32_t   *start,
   1174                                      int32_t   *end, UErrorCode  *status)
   1175 {
   1176           UCollationElements *coleiter   = strsrch->textIter;
   1177           int32_t             textlength = strsrch->search->textLength;
   1178           int32_t             temp       = *start;
   1179     const UCollator          *collator   = strsrch->collator;
   1180     const UChar              *text       = strsrch->search->text;
   1181     // This part checks if either ends of the match contains potential
   1182     // contraction. If so we'll have to iterate through them
   1183     // The start contraction needs to be checked since ucol_previous dumps
   1184     // all characters till the first safe character into the buffer.
   1185     // *start + 1 is used to test for the unsafe characters instead of *start
   1186     // because ucol_prev takes all unsafe characters till the first safe
   1187     // character ie *start. so by testing *start + 1, we can estimate if
   1188     // excess prefix characters has been included in the potential search
   1189     // results.
   1190     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1191         (*start + 1 < textlength
   1192          && ucol_unsafeCP(text[*start + 1], collator))) {
   1193         int32_t expansion  = getExpansionPrefix(coleiter);
   1194         UBool   expandflag = expansion > 0;
   1195         setColEIterOffset(coleiter, *start);
   1196         while (expansion > 0) {
   1197             // getting rid of the redundant ce, caused by setOffset.
   1198             // since backward contraction/expansion may have extra ces if we
   1199             // are in the normalization buffer, hasAccentsBeforeMatch would
   1200             // have taken care of it.
   1201             // E.g. the character \u01FA will have an expansion of 3, but if
   1202             // we are only looking for acute and ring \u030A and \u0301, we'll
   1203             // have to skip the first ce in the expansion buffer.
   1204             ucol_next(coleiter, status);
   1205             if (U_FAILURE(*status)) {
   1206                 return FALSE;
   1207             }
   1208             if (ucol_getOffset(coleiter) != temp) {
   1209                 *start = temp;
   1210                 temp  = ucol_getOffset(coleiter);
   1211             }
   1212             expansion --;
   1213         }
   1214 
   1215         int32_t  *patternce       = strsrch->pattern.CE;
   1216         int32_t   patterncelength = strsrch->pattern.CELength;
   1217         int32_t   count           = 0;
   1218         while (count < patterncelength) {
   1219             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1220             if (ce == UCOL_IGNORABLE) {
   1221                 continue;
   1222             }
   1223             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1224                 *start = temp;
   1225                 temp   = ucol_getOffset(coleiter);
   1226             }
   1227             if (U_FAILURE(*status) || ce != patternce[count]) {
   1228                 (*end) ++;
   1229                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1230                 return FALSE;
   1231             }
   1232             count ++;
   1233         }
   1234     }
   1235     return TRUE;
   1236 }
   1237 
   1238 /**
   1239 * Checks and sets the match information if found.
   1240 * Checks
   1241 * <ul>
   1242 * <li> the potential match does not repeat the previous match
   1243 * <li> boundaries are correct
   1244 * <li> exact matches has no extra accents
   1245 * <li> identical matchesb
   1246 * <li> potential match does not end in the middle of a contraction
   1247 * <\ul>
   1248 * Otherwise the offset will be shifted to the next character.
   1249 * Internal method, status assumed to be success, caller has to check status
   1250 * before calling this method.
   1251 * @param strsrch string search data
   1252 * @param textoffset offset in the collation element text. the returned value
   1253 *        will be the truncated end offset of the match or the new start
   1254 *        search offset.
   1255 * @param status output error status if any
   1256 * @return TRUE if the match is valid, FALSE otherwise
   1257 */
   1258 static
   1259 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1260                                  int32_t   *textoffset, UErrorCode *status)
   1261 {
   1262     UCollationElements *coleiter = strsrch->textIter;
   1263     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1264 
   1265     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1266         return FALSE;
   1267     }
   1268 
   1269     // this totally matches, however we need to check if it is repeating
   1270     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1271         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1272         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1273         !checkIdentical(strsrch, start, *textoffset) ||
   1274         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1275 
   1276         (*textoffset) ++;
   1277         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1278         return FALSE;
   1279     }
   1280 
   1281     //Add breakiterator boundary check for primary strength search.
   1282     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1283         checkBreakBoundary(strsrch, &start, textoffset);
   1284     }
   1285 
   1286     // totally match, we will get rid of the ending ignorables.
   1287     strsrch->search->matchedIndex  = start;
   1288     strsrch->search->matchedLength = *textoffset - start;
   1289     return TRUE;
   1290 }
   1291 
   1292 /**
   1293 * Getting the previous base character offset, or the current offset if the
   1294 * current character is a base character
   1295 * @param text string
   1296 * @param textoffset one offset after the current character
   1297 * @return the offset of the next character after the base character or the first
   1298 *         composed character with accents
   1299 */
   1300 static
   1301 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1302                                                int32_t  textoffset)
   1303 {
   1304     if (textoffset > 0) {
   1305         for (;;) {
   1306             int32_t result = textoffset;
   1307             U16_BACK_1(text, 0, textoffset);
   1308             int32_t temp = textoffset;
   1309             uint16_t fcd = getFCD(text, &temp, result);
   1310             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1311                 if (fcd & LAST_BYTE_MASK_) {
   1312                     return textoffset;
   1313                 }
   1314                 return result;
   1315             }
   1316             if (textoffset == 0) {
   1317                 return 0;
   1318             }
   1319         }
   1320     }
   1321     return textoffset;
   1322 }
   1323 
   1324 /**
   1325 * Getting the indexes of the accents that are not blocked in the argument
   1326 * accent array
   1327 * @param accents array of accents in nfd terminated by a 0.
   1328 * @param accentsindex array of indexes of the accents that are not blocked
   1329 */
   1330 static
   1331 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1332 {
   1333     int32_t index     = 0;
   1334     int32_t     length    = u_strlen(accents);
   1335     UChar32     codepoint = 0;
   1336     int         cclass    = 0;
   1337     int         result    = 0;
   1338     int32_t temp;
   1339     while (index < length) {
   1340         temp = index;
   1341         U16_NEXT(accents, index, length, codepoint);
   1342         if (u_getCombiningClass(codepoint) != cclass) {
   1343             cclass        = u_getCombiningClass(codepoint);
   1344             accentsindex[result] = temp;
   1345             result ++;
   1346         }
   1347     }
   1348     accentsindex[result] = length;
   1349     return result;
   1350 }
   1351 
   1352 /**
   1353 * Appends 3 UChar arrays to a destination array.
   1354 * Creates a new array if we run out of space. The caller will have to
   1355 * manually deallocate the newly allocated array.
   1356 * Internal method, status assumed to be success, caller has to check status
   1357 * before calling this method. destination not to be NULL and has at least
   1358 * size destinationlength.
   1359 * @param destination target array
   1360 * @param destinationlength target array size, returning the appended length
   1361 * @param source1 null-terminated first array
   1362 * @param source2 second array
   1363 * @param source2length length of seond array
   1364 * @param source3 null-terminated third array
   1365 * @param status error status if any
   1366 * @return new destination array, destination if there was no new allocation
   1367 */
   1368 static
   1369 inline UChar * addToUCharArray(      UChar      *destination,
   1370                                      int32_t    *destinationlength,
   1371                                const UChar      *source1,
   1372                                const UChar      *source2,
   1373                                      int32_t     source2length,
   1374                                const UChar      *source3,
   1375                                      UErrorCode *status)
   1376 {
   1377     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1378     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1379     if (*destinationlength < source1length + source2length + source3length +
   1380                                                                            1)
   1381     {
   1382         destination = (UChar *)allocateMemory(
   1383           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1384           status);
   1385         // if error allocating memory, status will be
   1386         // U_MEMORY_ALLOCATION_ERROR
   1387         if (U_FAILURE(*status)) {
   1388             *destinationlength = 0;
   1389             return NULL;
   1390         }
   1391     }
   1392     if (source1length != 0) {
   1393         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
   1394     }
   1395     if (source2length != 0) {
   1396         uprv_memcpy(destination + source1length, source2,
   1397                     sizeof(UChar) * source2length);
   1398     }
   1399     if (source3length != 0) {
   1400         uprv_memcpy(destination + source1length + source2length, source3,
   1401                     sizeof(UChar) * source3length);
   1402     }
   1403     *destinationlength = source1length + source2length + source3length;
   1404     return destination;
   1405 }
   1406 
   1407 /**
   1408 * Running through a collation element iterator to see if the contents matches
   1409 * pattern in string search data
   1410 * @param strsrch string search data
   1411 * @param coleiter collation element iterator
   1412 * @return TRUE if a match if found, FALSE otherwise
   1413 */
   1414 static
   1415 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1416                                        UCollationElements *coleiter)
   1417 {
   1418     int         patternceindex = strsrch->pattern.CELength;
   1419     int32_t    *patternce      = strsrch->pattern.CE;
   1420     UErrorCode  status = U_ZERO_ERROR;
   1421     while (patternceindex > 0) {
   1422         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1423         if (ce == UCOL_IGNORABLE) {
   1424             continue;
   1425         }
   1426         if (U_FAILURE(status) || ce != *patternce) {
   1427             return FALSE;
   1428         }
   1429         patternce ++;
   1430         patternceindex --;
   1431     }
   1432     return TRUE;
   1433 }
   1434 
   1435 /**
   1436 * Rearranges the front accents to try matching.
   1437 * Prefix accents in the text will be grouped according to their combining
   1438 * class and the groups will be mixed and matched to try find the perfect
   1439 * match with the pattern.
   1440 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1441 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1442 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1443 *         "\u0301\u0325".
   1444 * step 2: check if any of the generated substrings matches the pattern.
   1445 * Internal method, status is assumed to be success, caller has to check status
   1446 * before calling this method.
   1447 * @param strsrch string search match
   1448 * @param start first offset of the accents to start searching
   1449 * @param end start of the last accent set
   1450 * @param status output error status if any
   1451 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1452 *         offset of the match. Note this start includes all preceding accents.
   1453 */
   1454 static
   1455 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1456                                        int32_t    start,
   1457                                        int32_t    end,
   1458                                        UErrorCode    *status)
   1459 {
   1460     const UChar       *text       = strsrch->search->text;
   1461           int32_t      textlength = strsrch->search->textLength;
   1462           int32_t  tempstart  = start;
   1463 
   1464     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1465         // die... failed at a base character
   1466         return USEARCH_DONE;
   1467     }
   1468 
   1469     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1470     start = getPreviousBaseOffset(text, tempstart);
   1471 
   1472     UChar       accents[INITIAL_ARRAY_SIZE_];
   1473     // normalizing the offensive string
   1474     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1475                     INITIAL_ARRAY_SIZE_, status);
   1476     if (U_FAILURE(*status)) {
   1477         return USEARCH_DONE;
   1478     }
   1479 
   1480     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1481     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1482                                                                  accentsindex);
   1483     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1484     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1485     UCollationElements *coleiter   = strsrch->utilIter;
   1486     while (U_SUCCESS(*status) && count > 0) {
   1487         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1488         // copy the base characters
   1489         for (int k = 0; k < accentsindex[0]; k ++) {
   1490             *rearrange ++ = accents[k];
   1491         }
   1492         // forming all possible canonical rearrangement by dropping
   1493         // sets of accents
   1494         for (int i = 0; i <= accentsize - 1; i ++) {
   1495             int32_t mask = 1 << (accentsize - i - 1);
   1496             if (count & mask) {
   1497                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1498                     *rearrange ++ = accents[j];
   1499                 }
   1500             }
   1501         }
   1502         *rearrange = 0;
   1503         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1504         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1505                                            strsrch->canonicalPrefixAccents,
   1506                                            strsrch->search->text + offset,
   1507                                            end - offset,
   1508                                            strsrch->canonicalSuffixAccents,
   1509                                            status);
   1510 
   1511         // if status is a failure, ucol_setText does nothing.
   1512         // run the collator iterator through this match
   1513         ucol_setText(coleiter, match, matchsize, status);
   1514         if (U_SUCCESS(*status)) {
   1515             if (checkCollationMatch(strsrch, coleiter)) {
   1516                 if (match != buffer) {
   1517                     uprv_free(match);
   1518                 }
   1519                 return start;
   1520             }
   1521         }
   1522         count --;
   1523     }
   1524     return USEARCH_DONE;
   1525 }
   1526 
   1527 /**
   1528 * Gets the offset to the safe point in text before textoffset.
   1529 * ie. not the middle of a contraction, swappable characters or supplementary
   1530 * characters.
   1531 * @param collator collation sata
   1532 * @param text string to work with
   1533 * @param textoffset offset in string
   1534 * @param textlength length of text string
   1535 * @return offset to the previous safe character
   1536 */
   1537 static
   1538 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1539                                       const UChar       *text,
   1540                                             int32_t  textoffset)
   1541 {
   1542     int32_t result = textoffset; // first contraction character
   1543     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1544         result --;
   1545     }
   1546     if (result != 0) {
   1547         // the first contraction character is consider unsafe here
   1548         result --;
   1549     }
   1550     return result;
   1551 }
   1552 
   1553 /**
   1554 * Cleaning up after we passed the safe zone
   1555 * @param strsrch string search data
   1556 * @param safetext safe text array
   1557 * @param safebuffer safe text buffer
   1558 * @param coleiter collation element iterator for safe text
   1559 */
   1560 static
   1561 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1562                                   UChar         *safebuffer)
   1563 {
   1564     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1565     {
   1566        uprv_free(safetext);
   1567     }
   1568 }
   1569 
   1570 /**
   1571 * Take the rearranged end accents and tries matching. If match failed at
   1572 * a seperate preceding set of accents (seperated from the rearranged on by
   1573 * at least a base character) then we rearrange the preceding accents and
   1574 * tries matching again.
   1575 * We allow skipping of the ends of the accent set if the ces do not match.
   1576 * However if the failure is found before the accent set, it fails.
   1577 * Internal method, status assumed to be success, caller has to check status
   1578 * before calling this method.
   1579 * @param strsrch string search data
   1580 * @param textoffset of the start of the rearranged accent
   1581 * @param status output error status if any
   1582 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1583 *         offset of the match. Note this start includes all preceding accents.
   1584 */
   1585 static
   1586 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1587                                        int32_t    textoffset,
   1588                                        UErrorCode    *status)
   1589 {
   1590     const UChar              *text           = strsrch->search->text;
   1591     const UCollator          *collator       = strsrch->collator;
   1592           int32_t             safelength     = 0;
   1593           UChar              *safetext;
   1594           int32_t             safetextlength;
   1595           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1596           UCollationElements *coleiter       = strsrch->utilIter;
   1597           int32_t         safeoffset     = textoffset;
   1598 
   1599     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1600                                          collator)) {
   1601         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1602         safelength     = textoffset - safeoffset;
   1603         safetextlength = INITIAL_ARRAY_SIZE_;
   1604         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1605                                          text + safeoffset, safelength,
   1606                                          strsrch->canonicalSuffixAccents,
   1607                                          status);
   1608     }
   1609     else {
   1610         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1611         safetext       = strsrch->canonicalSuffixAccents;
   1612     }
   1613 
   1614     // if status is a failure, ucol_setText does nothing
   1615     ucol_setText(coleiter, safetext, safetextlength, status);
   1616     // status checked in loop below
   1617 
   1618     int32_t  *ce        = strsrch->pattern.CE;
   1619     int32_t   celength  = strsrch->pattern.CELength;
   1620     int       ceindex   = celength - 1;
   1621     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1622 
   1623     while (ceindex >= 0) {
   1624         int32_t textce = ucol_previous(coleiter, status);
   1625         if (U_FAILURE(*status)) {
   1626             if (isSafe) {
   1627                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1628             }
   1629             return USEARCH_DONE;
   1630         }
   1631         if (textce == UCOL_NULLORDER) {
   1632             // check if we have passed the safe buffer
   1633             if (coleiter == strsrch->textIter) {
   1634                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1635                 return USEARCH_DONE;
   1636             }
   1637             cleanUpSafeText(strsrch, safetext, safebuffer);
   1638             safetext = safebuffer;
   1639             coleiter = strsrch->textIter;
   1640             setColEIterOffset(coleiter, safeoffset);
   1641             // status checked at the start of the loop
   1642             isSafe = FALSE;
   1643             continue;
   1644         }
   1645         textce = getCE(strsrch, textce);
   1646         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1647             // do the beginning stuff
   1648             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1649             if (isSafe && failedoffset >= safelength) {
   1650                 // alas... no hope. failed at rearranged accent set
   1651                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1652                 return USEARCH_DONE;
   1653             }
   1654             else {
   1655                 if (isSafe) {
   1656                     failedoffset += safeoffset;
   1657                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1658                 }
   1659 
   1660                 // try rearranging the front accents
   1661                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1662                                         failedoffset, textoffset, status);
   1663                 if (result != USEARCH_DONE) {
   1664                     // if status is a failure, ucol_setOffset does nothing
   1665                     setColEIterOffset(strsrch->textIter, result);
   1666                 }
   1667                 if (U_FAILURE(*status)) {
   1668                     return USEARCH_DONE;
   1669                 }
   1670                 return result;
   1671             }
   1672         }
   1673         if (textce == ce[ceindex]) {
   1674             ceindex --;
   1675         }
   1676     }
   1677     // set offset here
   1678     if (isSafe) {
   1679         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1680         // sets the text iterator here with the correct expansion and offset
   1681         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1682         cleanUpSafeText(strsrch, safetext, safebuffer);
   1683         if (result >= safelength) {
   1684             result = textoffset;
   1685         }
   1686         else {
   1687             result += safeoffset;
   1688         }
   1689         setColEIterOffset(strsrch->textIter, result);
   1690         strsrch->textIter->iteratordata_.toReturn =
   1691                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1692         return result;
   1693     }
   1694 
   1695     return ucol_getOffset(coleiter);
   1696 }
   1697 
   1698 /**
   1699 * Trying out the substring and sees if it can be a canonical match.
   1700 * This will try normalizing the end accents and arranging them into canonical
   1701 * equivalents and check their corresponding ces with the pattern ce.
   1702 * Suffix accents in the text will be grouped according to their combining
   1703 * class and the groups will be mixed and matched to try find the perfect
   1704 * match with the pattern.
   1705 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1706 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1707 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1708 *         "\u0301\u0325".
   1709 * step 2: check if any of the generated substrings matches the pattern.
   1710 * Internal method, status assumed to be success, caller has to check status
   1711 * before calling this method.
   1712 * @param strsrch string search data
   1713 * @param textoffset end offset in the collation element text that ends with
   1714 *                   the accents to be rearranged
   1715 * @param status error status if any
   1716 * @return TRUE if the match is valid, FALSE otherwise
   1717 */
   1718 static
   1719 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1720                            int32_t    textoffset,
   1721                            UErrorCode    *status)
   1722 {
   1723     const UChar       *text = strsrch->search->text;
   1724           int32_t  temp = textoffset;
   1725     U16_BACK_1(text, 0, temp);
   1726     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1727         UCollationElements *coleiter = strsrch->textIter;
   1728         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1729         if (strsrch->pattern.hasPrefixAccents) {
   1730             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1731                                                 status);
   1732             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1733                 setColEIterOffset(coleiter, offset);
   1734                 return TRUE;
   1735             }
   1736         }
   1737         return FALSE;
   1738     }
   1739 
   1740     if (!strsrch->pattern.hasSuffixAccents) {
   1741         return FALSE;
   1742     }
   1743 
   1744     UChar       accents[INITIAL_ARRAY_SIZE_];
   1745     // offset to the last base character in substring to search
   1746     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1747     // normalizing the offensive string
   1748     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1749                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1750     // status checked in loop below
   1751 
   1752     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1753     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1754 
   1755     // 2 power n - 1 plus the full set of accents
   1756     int32_t  count = (2 << (size - 1)) - 1;
   1757     while (U_SUCCESS(*status) && count > 0) {
   1758         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1759         // copy the base characters
   1760         for (int k = 0; k < accentsindex[0]; k ++) {
   1761             *rearrange ++ = accents[k];
   1762         }
   1763         // forming all possible canonical rearrangement by dropping
   1764         // sets of accents
   1765         for (int i = 0; i <= size - 1; i ++) {
   1766             int32_t mask = 1 << (size - i - 1);
   1767             if (count & mask) {
   1768                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1769                     *rearrange ++ = accents[j];
   1770                 }
   1771             }
   1772         }
   1773         *rearrange = 0;
   1774         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1775                                                         status);
   1776         if (offset != USEARCH_DONE) {
   1777             return TRUE; // match found
   1778         }
   1779         count --;
   1780     }
   1781     return FALSE;
   1782 }
   1783 
   1784 /**
   1785 * Gets the previous base character offset depending on the string search
   1786 * pattern data
   1787 * @param strsrch string search data
   1788 * @param textoffset current offset, current character
   1789 * @return the offset of the next character after this base character or itself
   1790 *         if it is a composed character with accents
   1791 */
   1792 static
   1793 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1794                                                       int32_t textoffset)
   1795 {
   1796     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1797         const UChar       *text = strsrch->search->text;
   1798               int32_t  offset = textoffset;
   1799         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1800                                                    SECOND_LAST_BYTE_SHIFT_) {
   1801             return getPreviousBaseOffset(text, textoffset);
   1802         }
   1803     }
   1804     return textoffset;
   1805 }
   1806 
   1807 /**
   1808 * Checks match for contraction.
   1809 * If the match ends with a partial contraction we fail.
   1810 * If the match starts too far off (because of backwards iteration) we try to
   1811 * chip off the extra characters
   1812 * Internal method, status assumed to be success, caller has to check status
   1813 * before calling this method.
   1814 * @param strsrch string search data
   1815 * @param start offset of potential match, to be modified if necessary
   1816 * @param end offset of potential match, to be modified if necessary
   1817 * @param status output error status if any
   1818 * @return TRUE if match passes the contraction test, FALSE otherwise
   1819 */
   1820 static
   1821 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1822                                          int32_t   *start,
   1823                                          int32_t   *end,
   1824                                          UErrorCode    *status)
   1825 {
   1826           UCollationElements *coleiter   = strsrch->textIter;
   1827           int32_t             textlength = strsrch->search->textLength;
   1828           int32_t         temp       = *start;
   1829     const UCollator          *collator   = strsrch->collator;
   1830     const UChar              *text       = strsrch->search->text;
   1831     // This part checks if either ends of the match contains potential
   1832     // contraction. If so we'll have to iterate through them
   1833     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1834         (*start + 1 < textlength
   1835          && ucol_unsafeCP(text[*start + 1], collator))) {
   1836         int32_t expansion  = getExpansionPrefix(coleiter);
   1837         UBool   expandflag = expansion > 0;
   1838         setColEIterOffset(coleiter, *start);
   1839         while (expansion > 0) {
   1840             // getting rid of the redundant ce, caused by setOffset.
   1841             // since backward contraction/expansion may have extra ces if we
   1842             // are in the normalization buffer, hasAccentsBeforeMatch would
   1843             // have taken care of it.
   1844             // E.g. the character \u01FA will have an expansion of 3, but if
   1845             // we are only looking for acute and ring \u030A and \u0301, we'll
   1846             // have to skip the first ce in the expansion buffer.
   1847             ucol_next(coleiter, status);
   1848             if (U_FAILURE(*status)) {
   1849                 return FALSE;
   1850             }
   1851             if (ucol_getOffset(coleiter) != temp) {
   1852                 *start = temp;
   1853                 temp  = ucol_getOffset(coleiter);
   1854             }
   1855             expansion --;
   1856         }
   1857 
   1858         int32_t  *patternce       = strsrch->pattern.CE;
   1859         int32_t   patterncelength = strsrch->pattern.CELength;
   1860         int32_t   count           = 0;
   1861         int32_t   textlength      = strsrch->search->textLength;
   1862         while (count < patterncelength) {
   1863             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1864             // status checked below, note that if status is a failure
   1865             // ucol_next returns UCOL_NULLORDER
   1866             if (ce == UCOL_IGNORABLE) {
   1867                 continue;
   1868             }
   1869             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1870                 *start = temp;
   1871                 temp   = ucol_getOffset(coleiter);
   1872             }
   1873 
   1874             if (count == 0 && ce != patternce[0]) {
   1875                 // accents may have extra starting ces, this occurs when a
   1876                 // pure accent pattern is matched without rearrangement
   1877                 // text \u0325\u0300 and looking for \u0300
   1878                 int32_t expected = patternce[0];
   1879                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1880                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1881                     while (U_SUCCESS(*status) && ce != expected &&
   1882                            ce != UCOL_NULLORDER &&
   1883                            ucol_getOffset(coleiter) <= *end) {
   1884                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1885                     }
   1886                 }
   1887             }
   1888             if (U_FAILURE(*status) || ce != patternce[count]) {
   1889                 (*end) ++;
   1890                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1891                 return FALSE;
   1892             }
   1893             count ++;
   1894         }
   1895     }
   1896     return TRUE;
   1897 }
   1898 
   1899 /**
   1900 * Checks and sets the match information if found.
   1901 * Checks
   1902 * <ul>
   1903 * <li> the potential match does not repeat the previous match
   1904 * <li> boundaries are correct
   1905 * <li> potential match does not end in the middle of a contraction
   1906 * <li> identical matches
   1907 * <\ul>
   1908 * Otherwise the offset will be shifted to the next character.
   1909 * Internal method, status assumed to be success, caller has to check the
   1910 * status before calling this method.
   1911 * @param strsrch string search data
   1912 * @param textoffset offset in the collation element text. the returned value
   1913 *        will be the truncated end offset of the match or the new start
   1914 *        search offset.
   1915 * @param status output error status if any
   1916 * @return TRUE if the match is valid, FALSE otherwise
   1917 */
   1918 static
   1919 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1920                                      int32_t   *textoffset,
   1921                                      UErrorCode    *status)
   1922 {
   1923     // to ensure that the start and ends are not composite characters
   1924     UCollationElements *coleiter = strsrch->textIter;
   1925     // if we have a canonical accent match
   1926     if ((strsrch->pattern.hasSuffixAccents &&
   1927         strsrch->canonicalSuffixAccents[0]) ||
   1928         (strsrch->pattern.hasPrefixAccents &&
   1929         strsrch->canonicalPrefixAccents[0])) {
   1930         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1931                                                     strsrch,
   1932                                                     ucol_getOffset(coleiter));
   1933         strsrch->search->matchedLength = *textoffset -
   1934                                                 strsrch->search->matchedIndex;
   1935         return TRUE;
   1936     }
   1937 
   1938     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1939     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1940                                             status) || U_FAILURE(*status)) {
   1941         return FALSE;
   1942     }
   1943 
   1944     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1945     // this totally matches, however we need to check if it is repeating
   1946     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1947         !isBreakUnit(strsrch, start, *textoffset) ||
   1948         !checkIdentical(strsrch, start, *textoffset)) {
   1949         (*textoffset) ++;
   1950         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1951                                         strsrch->search->textLength);
   1952         return FALSE;
   1953     }
   1954 
   1955     strsrch->search->matchedIndex  = start;
   1956     strsrch->search->matchedLength = *textoffset - start;
   1957     return TRUE;
   1958 }
   1959 
   1960 /**
   1961 * Shifting the collation element iterator position forward to prepare for
   1962 * a preceding match. If the first character is a unsafe character, we'll only
   1963 * shift by 1 to capture contractions, normalization etc.
   1964 * Internal method, status assumed to be success, caller has to check status
   1965 * before calling this method.
   1966 * @param text strsrch string search data
   1967 * @param textoffset start text position to do search
   1968 * @param ce the text ce which failed the match.
   1969 * @param patternceindex index of the ce within the pattern ce buffer which
   1970 *        failed the match
   1971 * @return final offset
   1972 */
   1973 static
   1974 inline int32_t reverseShift(UStringSearch *strsrch,
   1975                                 int32_t    textoffset,
   1976                                 int32_t       ce,
   1977                                 int32_t        patternceindex)
   1978 {
   1979     if (strsrch->search->isOverlap) {
   1980         if (textoffset != strsrch->search->textLength) {
   1981             textoffset --;
   1982         }
   1983         else {
   1984             textoffset -= strsrch->pattern.defaultShiftSize;
   1985         }
   1986     }
   1987     else {
   1988         if (ce != UCOL_NULLORDER) {
   1989             int32_t shift = strsrch->pattern.backShift[hash(ce)];
   1990 
   1991             // this is to adjust for characters in the middle of the substring
   1992             // for matching that failed.
   1993             int32_t adjust = patternceindex;
   1994             if (adjust > 1 && shift > adjust) {
   1995                 shift -= adjust - 1;
   1996             }
   1997             textoffset -= shift;
   1998         }
   1999         else {
   2000             textoffset -= strsrch->pattern.defaultShiftSize;
   2001         }
   2002     }
   2003     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   2004     return textoffset;
   2005 }
   2006 
   2007 /**
   2008 * Checks match for contraction.
   2009 * If the match starts with a partial contraction we fail.
   2010 * Internal method, status assumed to be success, caller has to check status
   2011 * before calling this method.
   2012 * @param strsrch string search data
   2013 * @param start offset of potential match, to be modified if necessary
   2014 * @param end offset of potential match, to be modified if necessary
   2015 * @param status output error status if any
   2016 * @return TRUE if match passes the contraction test, FALSE otherwise
   2017 */
   2018 static
   2019 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2020                                      int32_t   *start,
   2021                                      int32_t   *end, UErrorCode  *status)
   2022 {
   2023           UCollationElements *coleiter   = strsrch->textIter;
   2024           int32_t             textlength = strsrch->search->textLength;
   2025           int32_t             temp       = *end;
   2026     const UCollator          *collator   = strsrch->collator;
   2027     const UChar              *text       = strsrch->search->text;
   2028     // This part checks if either if the start of the match contains potential
   2029     // contraction. If so we'll have to iterate through them
   2030     // Since we used ucol_next while previously looking for the potential
   2031     // match, this guarantees that our end will not be a partial contraction,
   2032     // or a partial supplementary character.
   2033     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2034         int32_t expansion  = getExpansionSuffix(coleiter);
   2035         UBool   expandflag = expansion > 0;
   2036         setColEIterOffset(coleiter, *end);
   2037         while (U_SUCCESS(*status) && expansion > 0) {
   2038             // getting rid of the redundant ce
   2039             // since forward contraction/expansion may have extra ces
   2040             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2041             // would have taken care of it.
   2042             // E.g. the character \u01FA will have an expansion of 3, but if
   2043             // we are only looking for A ring A\u030A, we'll have to skip the
   2044             // last ce in the expansion buffer
   2045             ucol_previous(coleiter, status);
   2046             if (U_FAILURE(*status)) {
   2047                 return FALSE;
   2048             }
   2049             if (ucol_getOffset(coleiter) != temp) {
   2050                 *end = temp;
   2051                 temp  = ucol_getOffset(coleiter);
   2052             }
   2053             expansion --;
   2054         }
   2055 
   2056         int32_t  *patternce       = strsrch->pattern.CE;
   2057         int32_t   patterncelength = strsrch->pattern.CELength;
   2058         int32_t   count           = patterncelength;
   2059         while (count > 0) {
   2060             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2061             // status checked below, note that if status is a failure
   2062             // ucol_previous returns UCOL_NULLORDER
   2063             if (ce == UCOL_IGNORABLE) {
   2064                 continue;
   2065             }
   2066             if (expandflag && count == 0 &&
   2067                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2068                 *end = temp;
   2069                 temp  = ucol_getOffset(coleiter);
   2070             }
   2071             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2072                 (*start) --;
   2073                 *start = getPreviousBaseOffset(text, *start);
   2074                 return FALSE;
   2075             }
   2076             count --;
   2077         }
   2078     }
   2079     return TRUE;
   2080 }
   2081 
   2082 /**
   2083 * Checks and sets the match information if found.
   2084 * Checks
   2085 * <ul>
   2086 * <li> the current match does not repeat the last match
   2087 * <li> boundaries are correct
   2088 * <li> exact matches has no extra accents
   2089 * <li> identical matches
   2090 * <\ul>
   2091 * Otherwise the offset will be shifted to the preceding character.
   2092 * Internal method, status assumed to be success, caller has to check status
   2093 * before calling this method.
   2094 * @param strsrch string search data
   2095 * @param collator
   2096 * @param coleiter collation element iterator
   2097 * @param text string
   2098 * @param textoffset offset in the collation element text. the returned value
   2099 *        will be the truncated start offset of the match or the new start
   2100 *        search offset.
   2101 * @param status output error status if any
   2102 * @return TRUE if the match is valid, FALSE otherwise
   2103 */
   2104 static
   2105 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2106                                      int32_t   *textoffset,
   2107                                      UErrorCode    *status)
   2108 {
   2109     // to ensure that the start and ends are not composite characters
   2110     int32_t end = ucol_getOffset(strsrch->textIter);
   2111     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2112         || U_FAILURE(*status)) {
   2113             return FALSE;
   2114     }
   2115 
   2116     // this totally matches, however we need to check if it is repeating
   2117     // the old match
   2118     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2119         !isBreakUnit(strsrch, *textoffset, end) ||
   2120         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2121         !checkIdentical(strsrch, *textoffset, end) ||
   2122         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2123         (*textoffset) --;
   2124         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2125                                             *textoffset);
   2126         return FALSE;
   2127     }
   2128 
   2129     //Add breakiterator boundary check for primary strength search.
   2130     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2131         checkBreakBoundary(strsrch, textoffset, &end);
   2132     }
   2133 
   2134     strsrch->search->matchedIndex = *textoffset;
   2135     strsrch->search->matchedLength = end - *textoffset;
   2136     return TRUE;
   2137 }
   2138 
   2139 /**
   2140 * Rearranges the end accents to try matching.
   2141 * Suffix accents in the text will be grouped according to their combining
   2142 * class and the groups will be mixed and matched to try find the perfect
   2143 * match with the pattern.
   2144 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2145 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2146 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2147 *         "\u0301\u0325".
   2148 * step 2: check if any of the generated substrings matches the pattern.
   2149 * Internal method, status assumed to be success, user has to check status
   2150 * before calling this method.
   2151 * @param strsrch string search match
   2152 * @param start offset of the first base character
   2153 * @param end start of the last accent set
   2154 * @param status only error status if any
   2155 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2156 *         offset of the match. Note this start includes all following accents.
   2157 */
   2158 static
   2159 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2160                                            int32_t    start,
   2161                                            int32_t    end,
   2162                                            UErrorCode    *status)
   2163 {
   2164     const UChar       *text       = strsrch->search->text;
   2165           int32_t  tempend    = end;
   2166 
   2167     U16_BACK_1(text, 0, tempend);
   2168     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2169                                                            LAST_BYTE_MASK_)) {
   2170         // die... failed at a base character
   2171         return USEARCH_DONE;
   2172     }
   2173     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2174 
   2175     if (U_SUCCESS(*status)) {
   2176         UChar       accents[INITIAL_ARRAY_SIZE_];
   2177         int32_t offset = getPreviousBaseOffset(text, end);
   2178         // normalizing the offensive string
   2179         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2180                         INITIAL_ARRAY_SIZE_, status);
   2181 
   2182         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2183         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2184                                                          accentsindex);
   2185         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2186         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2187         UCollationElements *coleiter = strsrch->utilIter;
   2188         while (U_SUCCESS(*status) && count > 0) {
   2189             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2190             // copy the base characters
   2191             for (int k = 0; k < accentsindex[0]; k ++) {
   2192                 *rearrange ++ = accents[k];
   2193             }
   2194             // forming all possible canonical rearrangement by dropping
   2195             // sets of accents
   2196             for (int i = 0; i <= accentsize - 1; i ++) {
   2197                 int32_t mask = 1 << (accentsize - i - 1);
   2198                 if (count & mask) {
   2199                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2200                         *rearrange ++ = accents[j];
   2201                     }
   2202                 }
   2203             }
   2204             *rearrange = 0;
   2205             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2206             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2207                                            strsrch->canonicalPrefixAccents,
   2208                                            strsrch->search->text + start,
   2209                                            offset - start,
   2210                                            strsrch->canonicalSuffixAccents,
   2211                                            status);
   2212 
   2213             // run the collator iterator through this match
   2214             // if status is a failure ucol_setText does nothing
   2215             ucol_setText(coleiter, match, matchsize, status);
   2216             if (U_SUCCESS(*status)) {
   2217                 if (checkCollationMatch(strsrch, coleiter)) {
   2218                     if (match != buffer) {
   2219                         uprv_free(match);
   2220                     }
   2221                     return end;
   2222                 }
   2223             }
   2224             count --;
   2225         }
   2226     }
   2227     return USEARCH_DONE;
   2228 }
   2229 
   2230 /**
   2231 * Take the rearranged start accents and tries matching. If match failed at
   2232 * a seperate following set of accents (seperated from the rearranged on by
   2233 * at least a base character) then we rearrange the preceding accents and
   2234 * tries matching again.
   2235 * We allow skipping of the ends of the accent set if the ces do not match.
   2236 * However if the failure is found before the accent set, it fails.
   2237 * Internal method, status assumed to be success, caller has to check status
   2238 * before calling this method.
   2239 * @param strsrch string search data
   2240 * @param textoffset of the ends of the rearranged accent
   2241 * @param status output error status if any
   2242 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2243 *         offset of the match. Note this start includes all following accents.
   2244 */
   2245 static
   2246 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2247                                            int32_t    textoffset,
   2248                                            UErrorCode    *status)
   2249 {
   2250     const UChar       *text       = strsrch->search->text;
   2251     const UCollator   *collator   = strsrch->collator;
   2252           int32_t      safelength = 0;
   2253           UChar       *safetext;
   2254           int32_t      safetextlength;
   2255           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2256           int32_t  safeoffset = textoffset;
   2257 
   2258     if (textoffset &&
   2259         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2260                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2261                                          ], collator)) {
   2262         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2263                                            strsrch->search->textLength);
   2264         safelength     = safeoffset - textoffset;
   2265         safetextlength = INITIAL_ARRAY_SIZE_;
   2266         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2267                                          strsrch->canonicalPrefixAccents,
   2268                                          text + textoffset, safelength,
   2269                                          NULL, status);
   2270     }
   2271     else {
   2272         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2273         safetext       = strsrch->canonicalPrefixAccents;
   2274     }
   2275 
   2276     UCollationElements *coleiter = strsrch->utilIter;
   2277      // if status is a failure, ucol_setText does nothing
   2278     ucol_setText(coleiter, safetext, safetextlength, status);
   2279     // status checked in loop below
   2280 
   2281     int32_t  *ce           = strsrch->pattern.CE;
   2282     int32_t   celength     = strsrch->pattern.CELength;
   2283     int       ceindex      = 0;
   2284     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2285     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2286 
   2287     while (ceindex < celength) {
   2288         int32_t textce = ucol_next(coleiter, status);
   2289         if (U_FAILURE(*status)) {
   2290             if (isSafe) {
   2291                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2292             }
   2293             return USEARCH_DONE;
   2294         }
   2295         if (textce == UCOL_NULLORDER) {
   2296             // check if we have passed the safe buffer
   2297             if (coleiter == strsrch->textIter) {
   2298                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2299                 return USEARCH_DONE;
   2300             }
   2301             cleanUpSafeText(strsrch, safetext, safebuffer);
   2302             safetext = safebuffer;
   2303             coleiter = strsrch->textIter;
   2304             setColEIterOffset(coleiter, safeoffset);
   2305             // status checked at the start of the loop
   2306             isSafe = FALSE;
   2307             continue;
   2308         }
   2309         textce = getCE(strsrch, textce);
   2310         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2311             // do the beginning stuff
   2312             int32_t failedoffset = ucol_getOffset(coleiter);
   2313             if (isSafe && failedoffset <= prefixlength) {
   2314                 // alas... no hope. failed at rearranged accent set
   2315                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2316                 return USEARCH_DONE;
   2317             }
   2318             else {
   2319                 if (isSafe) {
   2320                     failedoffset = safeoffset - failedoffset;
   2321                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2322                 }
   2323 
   2324                 // try rearranging the end accents
   2325                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2326                                         textoffset, failedoffset, status);
   2327                 if (result != USEARCH_DONE) {
   2328                     // if status is a failure, ucol_setOffset does nothing
   2329                     setColEIterOffset(strsrch->textIter, result);
   2330                 }
   2331                 if (U_FAILURE(*status)) {
   2332                     return USEARCH_DONE;
   2333                 }
   2334                 return result;
   2335             }
   2336         }
   2337         if (textce == ce[ceindex]) {
   2338             ceindex ++;
   2339         }
   2340     }
   2341     // set offset here
   2342     if (isSafe) {
   2343         int32_t result      = ucol_getOffset(coleiter);
   2344         // sets the text iterator here with the correct expansion and offset
   2345         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2346         cleanUpSafeText(strsrch, safetext, safebuffer);
   2347         if (result <= prefixlength) {
   2348             result = textoffset;
   2349         }
   2350         else {
   2351             result = textoffset + (safeoffset - result);
   2352         }
   2353         setColEIterOffset(strsrch->textIter, result);
   2354         setExpansionSuffix(strsrch->textIter, leftoverces);
   2355         return result;
   2356     }
   2357 
   2358     return ucol_getOffset(coleiter);
   2359 }
   2360 
   2361 /**
   2362 * Trying out the substring and sees if it can be a canonical match.
   2363 * This will try normalizing the starting accents and arranging them into
   2364 * canonical equivalents and check their corresponding ces with the pattern ce.
   2365 * Prefix accents in the text will be grouped according to their combining
   2366 * class and the groups will be mixed and matched to try find the perfect
   2367 * match with the pattern.
   2368 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2369 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2370 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2371 *         "\u0301\u0325".
   2372 * step 2: check if any of the generated substrings matches the pattern.
   2373 * Internal method, status assumed to be success, caller has to check status
   2374 * before calling this method.
   2375 * @param strsrch string search data
   2376 * @param textoffset start offset in the collation element text that starts
   2377 *                   with the accents to be rearranged
   2378 * @param status output error status if any
   2379 * @return TRUE if the match is valid, FALSE otherwise
   2380 */
   2381 static
   2382 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2383                                int32_t    textoffset,
   2384                                UErrorCode    *status)
   2385 {
   2386     const UChar       *text       = strsrch->search->text;
   2387           int32_t  temp       = textoffset;
   2388           int32_t      textlength = strsrch->search->textLength;
   2389     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2390         UCollationElements *coleiter = strsrch->textIter;
   2391         int32_t         offset   = ucol_getOffset(coleiter);
   2392         if (strsrch->pattern.hasSuffixAccents) {
   2393             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2394                                                     offset, status);
   2395             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2396                 setColEIterOffset(coleiter, offset);
   2397                 return TRUE;
   2398             }
   2399         }
   2400         return FALSE;
   2401     }
   2402 
   2403     if (!strsrch->pattern.hasPrefixAccents) {
   2404         return FALSE;
   2405     }
   2406 
   2407     UChar       accents[INITIAL_ARRAY_SIZE_];
   2408     // offset to the last base character in substring to search
   2409     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2410     // normalizing the offensive string
   2411     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2412                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2413     // status checked in loop
   2414 
   2415     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2416     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2417 
   2418     // 2 power n - 1 plus the full set of accents
   2419     int32_t  count = (2 << (size - 1)) - 1;
   2420     while (U_SUCCESS(*status) && count > 0) {
   2421         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2422         // copy the base characters
   2423         for (int k = 0; k < accentsindex[0]; k ++) {
   2424             *rearrange ++ = accents[k];
   2425         }
   2426         // forming all possible canonical rearrangement by dropping
   2427         // sets of accents
   2428         for (int i = 0; i <= size - 1; i ++) {
   2429             int32_t mask = 1 << (size - i - 1);
   2430             if (count & mask) {
   2431                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2432                     *rearrange ++ = accents[j];
   2433                 }
   2434             }
   2435         }
   2436         *rearrange = 0;
   2437         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2438                                                           baseoffset, status);
   2439         if (offset != USEARCH_DONE) {
   2440             return TRUE; // match found
   2441         }
   2442         count --;
   2443     }
   2444     return FALSE;
   2445 }
   2446 
   2447 /**
   2448 * Checks match for contraction.
   2449 * If the match starts with a partial contraction we fail.
   2450 * Internal method, status assumed to be success, caller has to check status
   2451 * before calling this method.
   2452 * @param strsrch string search data
   2453 * @param start offset of potential match, to be modified if necessary
   2454 * @param end offset of potential match, to be modified if necessary
   2455 * @param status only error status if any
   2456 * @return TRUE if match passes the contraction test, FALSE otherwise
   2457 */
   2458 static
   2459 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2460                                      int32_t   *start,
   2461                                      int32_t   *end, UErrorCode  *status)
   2462 {
   2463           UCollationElements *coleiter   = strsrch->textIter;
   2464           int32_t             textlength = strsrch->search->textLength;
   2465           int32_t         temp       = *end;
   2466     const UCollator          *collator   = strsrch->collator;
   2467     const UChar              *text       = strsrch->search->text;
   2468     // This part checks if either if the start of the match contains potential
   2469     // contraction. If so we'll have to iterate through them
   2470     // Since we used ucol_next while previously looking for the potential
   2471     // match, this guarantees that our end will not be a partial contraction,
   2472     // or a partial supplementary character.
   2473     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2474         int32_t expansion  = getExpansionSuffix(coleiter);
   2475         UBool   expandflag = expansion > 0;
   2476         setColEIterOffset(coleiter, *end);
   2477         while (expansion > 0) {
   2478             // getting rid of the redundant ce
   2479             // since forward contraction/expansion may have extra ces
   2480             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2481             // would have taken care of it.
   2482             // E.g. the character \u01FA will have an expansion of 3, but if
   2483             // we are only looking for A ring A\u030A, we'll have to skip the
   2484             // last ce in the expansion buffer
   2485             ucol_previous(coleiter, status);
   2486             if (U_FAILURE(*status)) {
   2487                 return FALSE;
   2488             }
   2489             if (ucol_getOffset(coleiter) != temp) {
   2490                 *end = temp;
   2491                 temp  = ucol_getOffset(coleiter);
   2492             }
   2493             expansion --;
   2494         }
   2495 
   2496         int32_t  *patternce       = strsrch->pattern.CE;
   2497         int32_t   patterncelength = strsrch->pattern.CELength;
   2498         int32_t   count           = patterncelength;
   2499         while (count > 0) {
   2500             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2501             // status checked below, note that if status is a failure
   2502             // ucol_previous returns UCOL_NULLORDER
   2503             if (ce == UCOL_IGNORABLE) {
   2504                 continue;
   2505             }
   2506             if (expandflag && count == 0 &&
   2507                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2508                 *end = temp;
   2509                 temp  = ucol_getOffset(coleiter);
   2510             }
   2511             if (count == patterncelength &&
   2512                 ce != patternce[patterncelength - 1]) {
   2513                 // accents may have extra starting ces, this occurs when a
   2514                 // pure accent pattern is matched without rearrangement
   2515                 int32_t    expected = patternce[patterncelength - 1];
   2516                 U16_BACK_1(text, 0, *end);
   2517                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2518                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2519                     while (U_SUCCESS(*status) && ce != expected &&
   2520                            ce != UCOL_NULLORDER &&
   2521                            ucol_getOffset(coleiter) <= *start) {
   2522                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2523                     }
   2524                 }
   2525             }
   2526             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2527                 (*start) --;
   2528                 *start = getPreviousBaseOffset(text, *start);
   2529                 return FALSE;
   2530             }
   2531             count --;
   2532         }
   2533     }
   2534     return TRUE;
   2535 }
   2536 
   2537 /**
   2538 * Checks and sets the match information if found.
   2539 * Checks
   2540 * <ul>
   2541 * <li> the potential match does not repeat the previous match
   2542 * <li> boundaries are correct
   2543 * <li> potential match does not end in the middle of a contraction
   2544 * <li> identical matches
   2545 * <\ul>
   2546 * Otherwise the offset will be shifted to the next character.
   2547 * Internal method, status assumed to be success, caller has to check status
   2548 * before calling this method.
   2549 * @param strsrch string search data
   2550 * @param textoffset offset in the collation element text. the returned value
   2551 *        will be the truncated start offset of the match or the new start
   2552 *        search offset.
   2553 * @param status only error status if any
   2554 * @return TRUE if the match is valid, FALSE otherwise
   2555 */
   2556 static
   2557 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2558                                          int32_t   *textoffset,
   2559                                          UErrorCode    *status)
   2560 {
   2561     // to ensure that the start and ends are not composite characters
   2562     UCollationElements *coleiter = strsrch->textIter;
   2563     // if we have a canonical accent match
   2564     if ((strsrch->pattern.hasSuffixAccents &&
   2565         strsrch->canonicalSuffixAccents[0]) ||
   2566         (strsrch->pattern.hasPrefixAccents &&
   2567         strsrch->canonicalPrefixAccents[0])) {
   2568         strsrch->search->matchedIndex  = *textoffset;
   2569         strsrch->search->matchedLength =
   2570             getNextUStringSearchBaseOffset(strsrch,
   2571                                       getColElemIterOffset(coleiter, FALSE))
   2572             - *textoffset;
   2573         return TRUE;
   2574     }
   2575 
   2576     int32_t end = ucol_getOffset(coleiter);
   2577     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2578                                                 status) ||
   2579          U_FAILURE(*status)) {
   2580         return FALSE;
   2581     }
   2582 
   2583     end = getNextUStringSearchBaseOffset(strsrch, end);
   2584     // this totally matches, however we need to check if it is repeating
   2585     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2586         !isBreakUnit(strsrch, *textoffset, end) ||
   2587         !checkIdentical(strsrch, *textoffset, end)) {
   2588         (*textoffset) --;
   2589         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2590                                             *textoffset);
   2591         return FALSE;
   2592     }
   2593 
   2594     strsrch->search->matchedIndex  = *textoffset;
   2595     strsrch->search->matchedLength = end - *textoffset;
   2596     return TRUE;
   2597 }
   2598 #endif // #if BOYER_MOORE
   2599 
   2600 // constructors and destructor -------------------------------------------
   2601 
   2602 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2603                                           int32_t         patternlength,
   2604                                     const UChar          *text,
   2605                                           int32_t         textlength,
   2606                                     const char           *locale,
   2607                                           UBreakIterator *breakiter,
   2608                                           UErrorCode     *status)
   2609 {
   2610     if (U_FAILURE(*status)) {
   2611         return NULL;
   2612     }
   2613 #if UCONFIG_NO_BREAK_ITERATION
   2614     if (breakiter != NULL) {
   2615         *status = U_UNSUPPORTED_ERROR;
   2616         return NULL;
   2617     }
   2618 #endif
   2619     if (locale) {
   2620         // ucol_open internally checks for status
   2621         UCollator     *collator = ucol_open(locale, status);
   2622         // pattern, text checks are done in usearch_openFromCollator
   2623         UStringSearch *result   = usearch_openFromCollator(pattern,
   2624                                               patternlength, text, textlength,
   2625                                               collator, breakiter, status);
   2626 
   2627         if (result == NULL || U_FAILURE(*status)) {
   2628             if (collator) {
   2629                 ucol_close(collator);
   2630             }
   2631             return NULL;
   2632         }
   2633         else {
   2634             result->ownCollator = TRUE;
   2635         }
   2636         return result;
   2637     }
   2638     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2639     return NULL;
   2640 }
   2641 
   2642 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2643                                   const UChar          *pattern,
   2644                                         int32_t         patternlength,
   2645                                   const UChar          *text,
   2646                                         int32_t         textlength,
   2647                                   const UCollator      *collator,
   2648                                         UBreakIterator *breakiter,
   2649                                         UErrorCode     *status)
   2650 {
   2651     if (U_FAILURE(*status)) {
   2652         return NULL;
   2653     }
   2654 #if UCONFIG_NO_BREAK_ITERATION
   2655     if (breakiter != NULL) {
   2656         *status = U_UNSUPPORTED_ERROR;
   2657         return NULL;
   2658     }
   2659 #endif
   2660     if (pattern == NULL || text == NULL || collator == NULL) {
   2661         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2662         return NULL;
   2663     }
   2664 
   2665     // string search does not really work when numeric collation is turned on
   2666     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2667         *status = U_UNSUPPORTED_ERROR;
   2668         return NULL;
   2669     }
   2670 
   2671     if (U_SUCCESS(*status)) {
   2672         initializeFCD(status);
   2673         if (U_FAILURE(*status)) {
   2674             return NULL;
   2675         }
   2676 
   2677         UStringSearch *result;
   2678         if (textlength == -1) {
   2679             textlength = u_strlen(text);
   2680         }
   2681         if (patternlength == -1) {
   2682             patternlength = u_strlen(pattern);
   2683         }
   2684         if (textlength <= 0 || patternlength <= 0) {
   2685             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2686             return NULL;
   2687         }
   2688 
   2689         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2690         if (result == NULL) {
   2691             *status = U_MEMORY_ALLOCATION_ERROR;
   2692             return NULL;
   2693         }
   2694 
   2695         result->collator    = collator;
   2696         result->strength    = ucol_getStrength(collator);
   2697         result->ceMask      = getMask(result->strength);
   2698         result->toShift     =
   2699              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2700                                                             UCOL_SHIFTED;
   2701         result->variableTop = ucol_getVariableTop(collator, status);
   2702 
   2703         result->nfd         = Normalizer2Factory::getNFDInstance(*status);
   2704 
   2705         if (U_FAILURE(*status)) {
   2706             uprv_free(result);
   2707             return NULL;
   2708         }
   2709 
   2710         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2711         if (result->search == NULL) {
   2712             *status = U_MEMORY_ALLOCATION_ERROR;
   2713             uprv_free(result);
   2714             return NULL;
   2715         }
   2716 
   2717         result->search->text       = text;
   2718         result->search->textLength = textlength;
   2719 
   2720         result->pattern.text       = pattern;
   2721         result->pattern.textLength = patternlength;
   2722         result->pattern.CE         = NULL;
   2723         result->pattern.PCE        = NULL;
   2724 
   2725         result->search->breakIter  = breakiter;
   2726 #if !UCONFIG_NO_BREAK_ITERATION
   2727         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2728         if (breakiter) {
   2729             ubrk_setText(breakiter, text, textlength, status);
   2730         }
   2731 #endif
   2732 
   2733         result->ownCollator           = FALSE;
   2734         result->search->matchedLength = 0;
   2735         result->search->matchedIndex  = USEARCH_DONE;
   2736         result->utilIter              = NULL;
   2737         result->textIter              = ucol_openElements(collator, text,
   2738                                                           textlength, status);
   2739         if (U_FAILURE(*status)) {
   2740             usearch_close(result);
   2741             return NULL;
   2742         }
   2743 
   2744         result->search->isOverlap          = FALSE;
   2745         result->search->isCanonicalMatch   = FALSE;
   2746         result->search->elementComparisonType = 0;
   2747         result->search->isForwardSearching = TRUE;
   2748         result->search->reset              = TRUE;
   2749 
   2750         initialize(result, status);
   2751 
   2752         if (U_FAILURE(*status)) {
   2753             usearch_close(result);
   2754             return NULL;
   2755         }
   2756 
   2757         return result;
   2758     }
   2759     return NULL;
   2760 }
   2761 
   2762 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2763 {
   2764     if (strsrch) {
   2765         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
   2766             strsrch->pattern.CE) {
   2767             uprv_free(strsrch->pattern.CE);
   2768         }
   2769 
   2770         if (strsrch->pattern.PCE != NULL &&
   2771             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
   2772             uprv_free(strsrch->pattern.PCE);
   2773         }
   2774 
   2775         ucol_closeElements(strsrch->textIter);
   2776         ucol_closeElements(strsrch->utilIter);
   2777 
   2778         if (strsrch->ownCollator && strsrch->collator) {
   2779             ucol_close((UCollator *)strsrch->collator);
   2780         }
   2781 
   2782 #if !UCONFIG_NO_BREAK_ITERATION
   2783         if (strsrch->search->internalBreakIter) {
   2784             ubrk_close(strsrch->search->internalBreakIter);
   2785         }
   2786 #endif
   2787 
   2788         uprv_free(strsrch->search);
   2789         uprv_free(strsrch);
   2790     }
   2791 }
   2792 
   2793 // set and get methods --------------------------------------------------
   2794 
   2795 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2796                                         int32_t    position,
   2797                                         UErrorCode    *status)
   2798 {
   2799     if (U_SUCCESS(*status) && strsrch) {
   2800         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2801             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2802         }
   2803         else {
   2804             setColEIterOffset(strsrch->textIter, position);
   2805         }
   2806         strsrch->search->matchedIndex  = USEARCH_DONE;
   2807         strsrch->search->matchedLength = 0;
   2808         strsrch->search->reset         = FALSE;
   2809     }
   2810 }
   2811 
   2812 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2813 {
   2814     if (strsrch) {
   2815         int32_t result = ucol_getOffset(strsrch->textIter);
   2816         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2817             return USEARCH_DONE;
   2818         }
   2819         return result;
   2820     }
   2821     return USEARCH_DONE;
   2822 }
   2823 
   2824 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2825                                  USearchAttribute attribute,
   2826                                  USearchAttributeValue value,
   2827                                  UErrorCode *status)
   2828 {
   2829     if (U_SUCCESS(*status) && strsrch) {
   2830         switch (attribute)
   2831         {
   2832         case USEARCH_OVERLAP :
   2833             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2834             break;
   2835         case USEARCH_CANONICAL_MATCH :
   2836             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2837                                                                       FALSE);
   2838             break;
   2839         case USEARCH_ELEMENT_COMPARISON :
   2840             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2841                 strsrch->search->elementComparisonType = (int16_t)value;
   2842             } else {
   2843                 strsrch->search->elementComparisonType = 0;
   2844             }
   2845             break;
   2846         case USEARCH_ATTRIBUTE_COUNT :
   2847         default:
   2848             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2849         }
   2850     }
   2851     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2852         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2853     }
   2854 }
   2855 
   2856 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2857                                                 const UStringSearch *strsrch,
   2858                                                 USearchAttribute attribute)
   2859 {
   2860     if (strsrch) {
   2861         switch (attribute) {
   2862         case USEARCH_OVERLAP :
   2863             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2864                                                         USEARCH_OFF);
   2865         case USEARCH_CANONICAL_MATCH :
   2866             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2867                                                                USEARCH_OFF);
   2868         case USEARCH_ELEMENT_COMPARISON :
   2869             {
   2870                 int16_t value = strsrch->search->elementComparisonType;
   2871                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2872                     return (USearchAttributeValue)value;
   2873                 } else {
   2874                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
   2875                 }
   2876             }
   2877         case USEARCH_ATTRIBUTE_COUNT :
   2878             return USEARCH_DEFAULT;
   2879         }
   2880     }
   2881     return USEARCH_DEFAULT;
   2882 }
   2883 
   2884 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2885                                                 const UStringSearch *strsrch)
   2886 {
   2887     if (strsrch == NULL) {
   2888         return USEARCH_DONE;
   2889     }
   2890     return strsrch->search->matchedIndex;
   2891 }
   2892 
   2893 
   2894 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2895                                             UChar         *result,
   2896                                             int32_t        resultCapacity,
   2897                                             UErrorCode    *status)
   2898 {
   2899     if (U_FAILURE(*status)) {
   2900         return USEARCH_DONE;
   2901     }
   2902     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2903         result == NULL)) {
   2904         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2905         return USEARCH_DONE;
   2906     }
   2907 
   2908     int32_t     copylength = strsrch->search->matchedLength;
   2909     int32_t copyindex  = strsrch->search->matchedIndex;
   2910     if (copyindex == USEARCH_DONE) {
   2911         u_terminateUChars(result, resultCapacity, 0, status);
   2912         return USEARCH_DONE;
   2913     }
   2914 
   2915     if (resultCapacity < copylength) {
   2916         copylength = resultCapacity;
   2917     }
   2918     if (copylength > 0) {
   2919         uprv_memcpy(result, strsrch->search->text + copyindex,
   2920                     copylength * sizeof(UChar));
   2921     }
   2922     return u_terminateUChars(result, resultCapacity,
   2923                              strsrch->search->matchedLength, status);
   2924 }
   2925 
   2926 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2927                                               const UStringSearch *strsrch)
   2928 {
   2929     if (strsrch) {
   2930         return strsrch->search->matchedLength;
   2931     }
   2932     return USEARCH_DONE;
   2933 }
   2934 
   2935 #if !UCONFIG_NO_BREAK_ITERATION
   2936 
   2937 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2938                                                UBreakIterator *breakiter,
   2939                                                UErrorCode     *status)
   2940 {
   2941     if (U_SUCCESS(*status) && strsrch) {
   2942         strsrch->search->breakIter = breakiter;
   2943         if (breakiter) {
   2944             ubrk_setText(breakiter, strsrch->search->text,
   2945                          strsrch->search->textLength, status);
   2946         }
   2947     }
   2948 }
   2949 
   2950 U_CAPI const UBreakIterator* U_EXPORT2
   2951 usearch_getBreakIterator(const UStringSearch *strsrch)
   2952 {
   2953     if (strsrch) {
   2954         return strsrch->search->breakIter;
   2955     }
   2956     return NULL;
   2957 }
   2958 
   2959 #endif
   2960 
   2961 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2962                                       const UChar         *text,
   2963                                             int32_t        textlength,
   2964                                             UErrorCode    *status)
   2965 {
   2966     if (U_SUCCESS(*status)) {
   2967         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2968             textlength == 0) {
   2969             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2970         }
   2971         else {
   2972             if (textlength == -1) {
   2973                 textlength = u_strlen(text);
   2974             }
   2975             strsrch->search->text       = text;
   2976             strsrch->search->textLength = textlength;
   2977             ucol_setText(strsrch->textIter, text, textlength, status);
   2978             strsrch->search->matchedIndex  = USEARCH_DONE;
   2979             strsrch->search->matchedLength = 0;
   2980             strsrch->search->reset         = TRUE;
   2981 #if !UCONFIG_NO_BREAK_ITERATION
   2982             if (strsrch->search->breakIter != NULL) {
   2983                 ubrk_setText(strsrch->search->breakIter, text,
   2984                              textlength, status);
   2985             }
   2986             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2987 #endif
   2988         }
   2989     }
   2990 }
   2991 
   2992 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   2993                                                      int32_t       *length)
   2994 {
   2995     if (strsrch) {
   2996         *length = strsrch->search->textLength;
   2997         return strsrch->search->text;
   2998     }
   2999     return NULL;
   3000 }
   3001 
   3002 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3003                                           const UCollator     *collator,
   3004                                                 UErrorCode    *status)
   3005 {
   3006     if (U_SUCCESS(*status)) {
   3007         if (collator == NULL) {
   3008             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3009             return;
   3010         }
   3011 
   3012         if (strsrch) {
   3013             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3014                 ucol_close((UCollator *)strsrch->collator);
   3015                 strsrch->ownCollator = FALSE;
   3016             }
   3017             strsrch->collator    = collator;
   3018             strsrch->strength    = ucol_getStrength(collator);
   3019             strsrch->ceMask      = getMask(strsrch->strength);
   3020 #if !UCONFIG_NO_BREAK_ITERATION
   3021             ubrk_close(strsrch->search->internalBreakIter);
   3022             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3023                                                      strsrch->search->text, strsrch->search->textLength, status);
   3024 #endif
   3025             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3026             strsrch->toShift     =
   3027                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3028                                                                 UCOL_SHIFTED;
   3029             // if status is a failure, ucol_getVariableTop returns 0
   3030             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3031             if (U_SUCCESS(*status)) {
   3032                 initialize(strsrch, status);
   3033                 if (U_SUCCESS(*status)) {
   3034                     /* free offset buffer to avoid memory leak before initializing. */
   3035                     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
   3036                     uprv_init_collIterate(collator, strsrch->search->text,
   3037                                           strsrch->search->textLength,
   3038                                           &(strsrch->textIter->iteratordata_),
   3039                                           status);
   3040                     strsrch->utilIter->iteratordata_.coll = collator;
   3041                 }
   3042             }
   3043         }
   3044 
   3045         // **** are these calls needed?
   3046         // **** we call uprv_init_pce in initializePatternPCETable
   3047         // **** and the CEBuffer constructor...
   3048 #if 0
   3049         uprv_init_pce(strsrch->textIter);
   3050         uprv_init_pce(strsrch->utilIter);
   3051 #endif
   3052     }
   3053 }
   3054 
   3055 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3056 {
   3057     if (strsrch) {
   3058         return (UCollator *)strsrch->collator;
   3059     }
   3060     return NULL;
   3061 }
   3062 
   3063 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3064                                          const UChar         *pattern,
   3065                                                int32_t        patternlength,
   3066                                                UErrorCode    *status)
   3067 {
   3068     if (U_SUCCESS(*status)) {
   3069         if (strsrch == NULL || pattern == NULL) {
   3070             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3071         }
   3072         else {
   3073             if (patternlength == -1) {
   3074                 patternlength = u_strlen(pattern);
   3075             }
   3076             if (patternlength == 0) {
   3077                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3078                 return;
   3079             }
   3080             strsrch->pattern.text       = pattern;
   3081             strsrch->pattern.textLength = patternlength;
   3082             initialize(strsrch, status);
   3083         }
   3084     }
   3085 }
   3086 
   3087 U_CAPI const UChar* U_EXPORT2
   3088 usearch_getPattern(const UStringSearch *strsrch,
   3089                    int32_t       *length)
   3090 {
   3091     if (strsrch) {
   3092         *length = strsrch->pattern.textLength;
   3093         return strsrch->pattern.text;
   3094     }
   3095     return NULL;
   3096 }
   3097 
   3098 // miscellanous methods --------------------------------------------------
   3099 
   3100 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3101                                            UErrorCode    *status)
   3102 {
   3103     if (strsrch && U_SUCCESS(*status)) {
   3104         strsrch->search->isForwardSearching = TRUE;
   3105         usearch_setOffset(strsrch, 0, status);
   3106         if (U_SUCCESS(*status)) {
   3107             return usearch_next(strsrch, status);
   3108         }
   3109     }
   3110     return USEARCH_DONE;
   3111 }
   3112 
   3113 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3114                                                int32_t    position,
   3115                                                UErrorCode    *status)
   3116 {
   3117     if (strsrch && U_SUCCESS(*status)) {
   3118         strsrch->search->isForwardSearching = TRUE;
   3119         // position checked in usearch_setOffset
   3120         usearch_setOffset(strsrch, position, status);
   3121         if (U_SUCCESS(*status)) {
   3122             return usearch_next(strsrch, status);
   3123         }
   3124     }
   3125     return USEARCH_DONE;
   3126 }
   3127 
   3128 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3129                                           UErrorCode    *status)
   3130 {
   3131     if (strsrch && U_SUCCESS(*status)) {
   3132         strsrch->search->isForwardSearching = FALSE;
   3133         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3134         if (U_SUCCESS(*status)) {
   3135             return usearch_previous(strsrch, status);
   3136         }
   3137     }
   3138     return USEARCH_DONE;
   3139 }
   3140 
   3141 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3142                                                int32_t    position,
   3143                                                UErrorCode    *status)
   3144 {
   3145     if (strsrch && U_SUCCESS(*status)) {
   3146         strsrch->search->isForwardSearching = FALSE;
   3147         // position checked in usearch_setOffset
   3148         usearch_setOffset(strsrch, position, status);
   3149         if (U_SUCCESS(*status)) {
   3150             return usearch_previous(strsrch, status);
   3151         }
   3152     }
   3153     return USEARCH_DONE;
   3154 }
   3155 
   3156 /**
   3157 * If a direction switch is required, we'll count the number of ces till the
   3158 * beginning of the collation element iterator and iterate forwards that
   3159 * number of times. This is so that we get to the correct point within the
   3160 * string to continue the search in. Imagine when we are in the middle of the
   3161 * normalization buffer when the change in direction is request. arrrgghh....
   3162 * After searching the offset within the collation element iterator will be
   3163 * shifted to the start of the match. If a match is not found, the offset would
   3164 * have been set to the end of the text string in the collation element
   3165 * iterator.
   3166 * Okay, here's my take on normalization buffer. The only time when there can
   3167 * be 2 matches within the same normalization is when the pattern is consists
   3168 * of all accents. But since the offset returned is from the text string, we
   3169 * should not confuse the caller by returning the second match within the
   3170 * same normalization buffer. If we do, the 2 results will have the same match
   3171 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3172 * fall within the same normalization buffer. Note this does not affect the
   3173 * results of matches spanning the text and the normalization buffer.
   3174 * The position to start searching is taken from the collation element
   3175 * iterator. Callers of this API would have to set the offset in the collation
   3176 * element iterator before using this method.
   3177 */
   3178 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3179                                           UErrorCode    *status)
   3180 {
   3181     if (U_SUCCESS(*status) && strsrch) {
   3182         // note offset is either equivalent to the start of the previous match
   3183         // or is set by the user
   3184         int32_t      offset       = usearch_getOffset(strsrch);
   3185         USearch     *search       = strsrch->search;
   3186         search->reset             = FALSE;
   3187         int32_t      textlength   = search->textLength;
   3188         if (search->isForwardSearching) {
   3189 #if BOYER_MOORE
   3190             if (offset == textlength
   3191                 || (!search->isOverlap &&
   3192                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3193                     (search->matchedIndex != USEARCH_DONE &&
   3194                      offset + search->matchedLength >= textlength)))) {
   3195                 // not enough characters to match
   3196                 setMatchNotFound(strsrch);
   3197                 return USEARCH_DONE;
   3198             }
   3199 #else
   3200             if (offset == textlength ||
   3201                 (! search->isOverlap &&
   3202                 (search->matchedIndex != USEARCH_DONE &&
   3203                 offset + search->matchedLength > textlength))) {
   3204                     // not enough characters to match
   3205                     setMatchNotFound(strsrch);
   3206                     return USEARCH_DONE;
   3207             }
   3208 #endif
   3209         }
   3210         else {
   3211             // switching direction.
   3212             // if matchedIndex == USEARCH_DONE, it means that either a
   3213             // setOffset has been called or that previous ran off the text
   3214             // string. the iterator would have been set to offset 0 if a
   3215             // match is not found.
   3216             search->isForwardSearching = TRUE;
   3217             if (search->matchedIndex != USEARCH_DONE) {
   3218                 // there's no need to set the collation element iterator
   3219                 // the next call to next will set the offset.
   3220                 return search->matchedIndex;
   3221             }
   3222         }
   3223 
   3224         if (U_SUCCESS(*status)) {
   3225             if (strsrch->pattern.CELength == 0) {
   3226                 if (search->matchedIndex == USEARCH_DONE) {
   3227                     search->matchedIndex = offset;
   3228                 }
   3229                 else { // moves by codepoints
   3230                     U16_FWD_1(search->text, search->matchedIndex, textlength);
   3231                 }
   3232 
   3233                 search->matchedLength = 0;
   3234                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3235                 // status checked below
   3236                 if (search->matchedIndex == textlength) {
   3237                     search->matchedIndex = USEARCH_DONE;
   3238                 }
   3239             }
   3240             else {
   3241                 if (search->matchedLength > 0) {
   3242                     // if matchlength is 0 we are at the start of the iteration
   3243                     if (search->isOverlap) {
   3244                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3245                     }
   3246                     else {
   3247                         ucol_setOffset(strsrch->textIter,
   3248                                        offset + search->matchedLength, status);
   3249                     }
   3250                 }
   3251                 else {
   3252                     // for boundary check purposes. this will ensure that the
   3253                     // next match will not preceed the current offset
   3254                     // note search->matchedIndex will always be set to something
   3255                     // in the code
   3256                     search->matchedIndex = offset - 1;
   3257                 }
   3258 
   3259                 if (search->isCanonicalMatch) {
   3260                     // can't use exact here since extra accents are allowed.
   3261                     usearch_handleNextCanonical(strsrch, status);
   3262                 }
   3263                 else {
   3264                     usearch_handleNextExact(strsrch, status);
   3265                 }
   3266             }
   3267 
   3268             if (U_FAILURE(*status)) {
   3269                 return USEARCH_DONE;
   3270             }
   3271 
   3272 #if !BOYER_MOORE
   3273             if (search->matchedIndex == USEARCH_DONE) {
   3274                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3275             } else {
   3276                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3277             }
   3278 #endif
   3279 
   3280             return search->matchedIndex;
   3281         }
   3282     }
   3283     return USEARCH_DONE;
   3284 }
   3285 
   3286 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3287                                               UErrorCode *status)
   3288 {
   3289     if (U_SUCCESS(*status) && strsrch) {
   3290         int32_t offset;
   3291         USearch *search = strsrch->search;
   3292         if (search->reset) {
   3293             offset                     = search->textLength;
   3294             search->isForwardSearching = FALSE;
   3295             search->reset              = FALSE;
   3296             setColEIterOffset(strsrch->textIter, offset);
   3297         }
   3298         else {
   3299             offset = usearch_getOffset(strsrch);
   3300         }
   3301 
   3302         int32_t matchedindex = search->matchedIndex;
   3303         if (search->isForwardSearching == TRUE) {
   3304             // switching direction.
   3305             // if matchedIndex == USEARCH_DONE, it means that either a
   3306             // setOffset has been called or that next ran off the text
   3307             // string. the iterator would have been set to offset textLength if
   3308             // a match is not found.
   3309             search->isForwardSearching = FALSE;
   3310             if (matchedindex != USEARCH_DONE) {
   3311                 return matchedindex;
   3312             }
   3313         }
   3314         else {
   3315 #if BOYER_MOORE
   3316             if (offset == 0 || matchedindex == 0 ||
   3317                 (!search->isOverlap &&
   3318                     (offset < strsrch->pattern.defaultShiftSize ||
   3319                     (matchedindex != USEARCH_DONE &&
   3320                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3321                 // not enough characters to match
   3322                 setMatchNotFound(strsrch);
   3323                 return USEARCH_DONE;
   3324             }
   3325 #else
   3326             // Could check pattern length, but the
   3327             // linear search will do the right thing
   3328             if (offset == 0 || matchedindex == 0) {
   3329                 setMatchNotFound(strsrch);
   3330                 return USEARCH_DONE;
   3331             }
   3332 #endif
   3333         }
   3334 
   3335         if (U_SUCCESS(*status)) {
   3336             if (strsrch->pattern.CELength == 0) {
   3337                 search->matchedIndex =
   3338                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3339                 if (search->matchedIndex == 0) {
   3340                     setMatchNotFound(strsrch);
   3341                     // status checked below
   3342                 }
   3343                 else { // move by codepoints
   3344                     U16_BACK_1(search->text, 0, search->matchedIndex);
   3345                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3346                     // status checked below
   3347                     search->matchedLength = 0;
   3348                 }
   3349             }
   3350             else {
   3351                 if (strsrch->search->isCanonicalMatch) {
   3352                     // can't use exact here since extra accents are allowed.
   3353                     usearch_handlePreviousCanonical(strsrch, status);
   3354                     // status checked below
   3355                 }
   3356                 else {
   3357                     usearch_handlePreviousExact(strsrch, status);
   3358                     // status checked below
   3359                 }
   3360             }
   3361 
   3362             if (U_FAILURE(*status)) {
   3363                 return USEARCH_DONE;
   3364             }
   3365 
   3366             return search->matchedIndex;
   3367         }
   3368     }
   3369     return USEARCH_DONE;
   3370 }
   3371 
   3372 
   3373 
   3374 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3375 {
   3376     /*
   3377     reset is setting the attributes that are already in
   3378     string search, hence all attributes in the collator should
   3379     be retrieved without any problems
   3380     */
   3381     if (strsrch) {
   3382         UErrorCode status            = U_ZERO_ERROR;
   3383         UBool      sameCollAttribute = TRUE;
   3384         uint32_t   ceMask;
   3385         UBool      shift;
   3386         uint32_t   varTop;
   3387 
   3388         // **** hack to deal w/ how processed CEs encode quaternary ****
   3389         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3390         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3391             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3392                 sameCollAttribute = FALSE;
   3393         }
   3394 
   3395         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3396         ceMask = getMask(strsrch->strength);
   3397         if (strsrch->ceMask != ceMask) {
   3398             strsrch->ceMask = ceMask;
   3399             sameCollAttribute = FALSE;
   3400         }
   3401 
   3402         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3403         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3404                                   &status) == UCOL_SHIFTED;
   3405         if (strsrch->toShift != shift) {
   3406             strsrch->toShift  = shift;
   3407             sameCollAttribute = FALSE;
   3408         }
   3409 
   3410         // if status is a failure, ucol_getVariableTop returns 0
   3411         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3412         if (strsrch->variableTop != varTop) {
   3413             strsrch->variableTop = varTop;
   3414             sameCollAttribute    = FALSE;
   3415         }
   3416         if (!sameCollAttribute) {
   3417             initialize(strsrch, &status);
   3418         }
   3419         /* free offset buffer to avoid memory leak before initializing. */
   3420         ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
   3421         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
   3422                               strsrch->search->textLength,
   3423                               &(strsrch->textIter->iteratordata_),
   3424                               &status);
   3425         strsrch->search->matchedLength      = 0;
   3426         strsrch->search->matchedIndex       = USEARCH_DONE;
   3427         strsrch->search->isOverlap          = FALSE;
   3428         strsrch->search->isCanonicalMatch   = FALSE;
   3429         strsrch->search->elementComparisonType = 0;
   3430         strsrch->search->isForwardSearching = TRUE;
   3431         strsrch->search->reset              = TRUE;
   3432     }
   3433 }
   3434 
   3435 //
   3436 //  CEI  Collation Element + source text index.
   3437 //       These structs are kept in the circular buffer.
   3438 //
   3439 struct  CEI {
   3440     int64_t ce;
   3441     int32_t lowIndex;
   3442     int32_t highIndex;
   3443 };
   3444 
   3445 U_NAMESPACE_BEGIN
   3446 
   3447 
   3448 //
   3449 //  CEBuffer   A circular buffer of CEs from the text being searched.
   3450 //
   3451 #define   DEFAULT_CEBUFFER_SIZE 96
   3452 #define   CEBUFFER_EXTRA 32
   3453 // Some typical max values to make buffer size more reasonable for asymmetric search.
   3454 // #8694 is for a better long-term solution to allocation of this buffer.
   3455 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
   3456 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
   3457 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
   3458 struct CEBuffer {
   3459     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3460     CEI                 *buf;
   3461     int32_t              bufSize;
   3462     int32_t              firstIx;
   3463     int32_t              limitIx;
   3464     UCollationElements  *ceIter;
   3465     UStringSearch       *strSearch;
   3466 
   3467 
   3468 
   3469                CEBuffer(UStringSearch *ss, UErrorCode *status);
   3470                ~CEBuffer();
   3471    const CEI   *get(int32_t index);
   3472    const CEI   *getPrevious(int32_t index);
   3473 };
   3474 
   3475 
   3476 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
   3477     buf = defBuf;
   3478     strSearch = ss;
   3479     bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA;
   3480     if (ss->search->elementComparisonType != 0) {
   3481         const UChar * patText = ss->pattern.text;
   3482         if (patText) {
   3483             const UChar * patTextLimit = patText + ss->pattern.textLength;
   3484             while ( patText < patTextLimit ) {
   3485                 UChar c = *patText++;
   3486                 if (MIGHT_BE_JAMO_L(c)) {
   3487                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
   3488                 } else {
   3489                     // No check for surrogates, we might allocate slightly more buffer than necessary.
   3490                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
   3491                 }
   3492             }
   3493         }
   3494     }
   3495     ceIter    = ss->textIter;
   3496     firstIx = 0;
   3497     limitIx = 0;
   3498 
   3499     uprv_init_pce(ceIter);
   3500 
   3501     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3502         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3503         if (buf == NULL) {
   3504             *status = U_MEMORY_ALLOCATION_ERROR;
   3505         }
   3506     }
   3507 }
   3508 
   3509 // TODO: add a reset or init function so that allocated
   3510 //       buffers can be retained & reused.
   3511 
   3512 CEBuffer::~CEBuffer() {
   3513     if (buf != defBuf) {
   3514         uprv_free(buf);
   3515     }
   3516 }
   3517 
   3518 
   3519 // Get the CE with the specified index.
   3520 //   Index must be in the range
   3521 //          n-history_size < index < n+1
   3522 //   where n is the largest index to have been fetched by some previous call to this function.
   3523 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3524 //
   3525 const CEI *CEBuffer::get(int32_t index) {
   3526     int i = index % bufSize;
   3527 
   3528     if (index>=firstIx && index<limitIx) {
   3529         // The request was for an entry already in our buffer.
   3530         //  Just return it.
   3531         return &buf[i];
   3532     }
   3533 
   3534     // Caller is requesting a new, never accessed before, CE.
   3535     //   Verify that it is the next one in sequence, which is all
   3536     //   that is allowed.
   3537     if (index != limitIx) {
   3538         U_ASSERT(FALSE);
   3539 
   3540         return NULL;
   3541     }
   3542 
   3543     // Manage the circular CE buffer indexing
   3544     limitIx++;
   3545 
   3546     if (limitIx - firstIx >= bufSize) {
   3547         // The buffer is full, knock out the lowest-indexed entry.
   3548         firstIx++;
   3549     }
   3550 
   3551     UErrorCode status = U_ZERO_ERROR;
   3552 
   3553     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
   3554 
   3555     return &buf[i];
   3556 }
   3557 
   3558 // Get the CE with the specified index.
   3559 //   Index must be in the range
   3560 //          n-history_size < index < n+1
   3561 //   where n is the largest index to have been fetched by some previous call to this function.
   3562 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3563 //
   3564 const CEI *CEBuffer::getPrevious(int32_t index) {
   3565     int i = index % bufSize;
   3566 
   3567     if (index>=firstIx && index<limitIx) {
   3568         // The request was for an entry already in our buffer.
   3569         //  Just return it.
   3570         return &buf[i];
   3571     }
   3572 
   3573     // Caller is requesting a new, never accessed before, CE.
   3574     //   Verify that it is the next one in sequence, which is all
   3575     //   that is allowed.
   3576     if (index != limitIx) {
   3577         U_ASSERT(FALSE);
   3578 
   3579         return NULL;
   3580     }
   3581 
   3582     // Manage the circular CE buffer indexing
   3583     limitIx++;
   3584 
   3585     if (limitIx - firstIx >= bufSize) {
   3586         // The buffer is full, knock out the lowest-indexed entry.
   3587         firstIx++;
   3588     }
   3589 
   3590     UErrorCode status = U_ZERO_ERROR;
   3591 
   3592     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
   3593 
   3594     return &buf[i];
   3595 }
   3596 
   3597 U_NAMESPACE_END
   3598 
   3599 
   3600 // #define USEARCH_DEBUG
   3601 
   3602 #ifdef USEARCH_DEBUG
   3603 #include <stdio.h>
   3604 #include <stdlib.h>
   3605 #endif
   3606 
   3607 /*
   3608  * Find the next break boundary after startIndex. If the UStringSearch object
   3609  * has an external break iterator, use that. Otherwise use the internal character
   3610  * break iterator.
   3611  */
   3612 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3613 #if 0
   3614     const UChar *text = strsrch->search->text;
   3615     int32_t textLen   = strsrch->search->textLength;
   3616 
   3617     U_ASSERT(startIndex>=0);
   3618     U_ASSERT(startIndex<=textLen);
   3619 
   3620     if (startIndex >= textLen) {
   3621         return startIndex;
   3622     }
   3623 
   3624     UChar32  c;
   3625     int32_t  i = startIndex;
   3626     U16_NEXT(text, i, textLen, c);
   3627 
   3628     // If we are on a control character, stop without looking for combining marks.
   3629     //    Control characters do not combine.
   3630     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3631     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3632         return i;
   3633     }
   3634 
   3635     // The initial character was not a control, and can thus accept trailing
   3636     //   combining characters.  Advance over however many of them there are.
   3637     int32_t  indexOfLastCharChecked;
   3638     for (;;) {
   3639         indexOfLastCharChecked = i;
   3640         if (i>=textLen) {
   3641             break;
   3642         }
   3643         U16_NEXT(text, i, textLen, c);
   3644         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3645         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3646             break;
   3647         }
   3648     }
   3649     return indexOfLastCharChecked;
   3650 #elif !UCONFIG_NO_BREAK_ITERATION
   3651     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3652 
   3653     if (breakiterator == NULL) {
   3654         breakiterator = strsrch->search->internalBreakIter;
   3655     }
   3656 
   3657     if (breakiterator != NULL) {
   3658         return ubrk_following(breakiterator, startIndex);
   3659     }
   3660 
   3661     return startIndex;
   3662 #else
   3663     // **** or should we use the original code? ****
   3664     return startIndex;
   3665 #endif
   3666 
   3667 }
   3668 
   3669 /*
   3670  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3671  * has an external break iterator, test using that, otherwise test
   3672  * using the internal character break iterator.
   3673  */
   3674 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3675 #if 0
   3676     const UChar *text = strsrch->search->text;
   3677     int32_t textLen   = strsrch->search->textLength;
   3678 
   3679     U_ASSERT(index>=0);
   3680     U_ASSERT(index<=textLen);
   3681 
   3682     if (index>=textLen || index<=0) {
   3683         return TRUE;
   3684     }
   3685 
   3686     // If the character at the current index is not a GRAPHEME_EXTEND
   3687     //    then we can not be within a combining sequence.
   3688     UChar32  c;
   3689     U16_GET(text, 0, index, textLen, c);
   3690     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3691     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3692         return TRUE;
   3693     }
   3694 
   3695     // We are at a combining mark.  If the preceding character is anything
   3696     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3697     U16_PREV(text, 0, index, c);
   3698     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3699     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3700     return !combining;
   3701 #elif !UCONFIG_NO_BREAK_ITERATION
   3702     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3703 
   3704     if (breakiterator == NULL) {
   3705         breakiterator = strsrch->search->internalBreakIter;
   3706     }
   3707 
   3708     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
   3709 #else
   3710     // **** or use the original code? ****
   3711     return TRUE;
   3712 #endif
   3713 }
   3714 
   3715 #if 0
   3716 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3717 {
   3718 #if !UCONFIG_NO_BREAK_ITERATION
   3719     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3720 
   3721     if (breakiterator != NULL) {
   3722         int32_t startindex = ubrk_first(breakiterator);
   3723         int32_t endindex   = ubrk_last(breakiterator);
   3724 
   3725         // out-of-range indexes are never boundary positions
   3726         if (start < startindex || start > endindex ||
   3727             end < startindex || end > endindex) {
   3728             return FALSE;
   3729         }
   3730 
   3731         return ubrk_isBoundary(breakiterator, start) &&
   3732                ubrk_isBoundary(breakiterator, end);
   3733     }
   3734 #endif
   3735 
   3736     return TRUE;
   3737 }
   3738 #endif
   3739 
   3740 typedef enum {
   3741     U_CE_MATCH = -1,
   3742     U_CE_NO_MATCH = 0,
   3743     U_CE_SKIP_TARG,
   3744     U_CE_SKIP_PATN
   3745 } UCompareCEsResult;
   3746 #define U_CE_LEVEL2_BASE 0x00000005
   3747 #define U_CE_LEVEL3_BASE 0x00050000
   3748 
   3749 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
   3750     if (targCE == patCE) {
   3751         return U_CE_MATCH;
   3752     }
   3753     if (compareType == 0) {
   3754         return U_CE_NO_MATCH;
   3755     }
   3756 
   3757     int64_t targCEshifted = targCE >> 32;
   3758     int64_t patCEshifted = patCE >> 32;
   3759     int64_t mask;
   3760 
   3761     mask = 0xFFFF0000;
   3762     int32_t targLev1 = (int32_t)(targCEshifted & mask);
   3763     int32_t patLev1 = (int32_t)(patCEshifted & mask);
   3764     if ( targLev1 != patLev1 ) {
   3765         if ( targLev1 == 0 ) {
   3766             return U_CE_SKIP_TARG;
   3767         }
   3768         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3769             return U_CE_SKIP_PATN;
   3770         }
   3771         return U_CE_NO_MATCH;
   3772     }
   3773 
   3774     mask = 0x0000FFFF;
   3775     int32_t targLev2 = (int32_t)(targCEshifted & mask);
   3776     int32_t patLev2 = (int32_t)(patCEshifted & mask);
   3777     if ( targLev2 != patLev2 ) {
   3778         if ( targLev2 == 0 ) {
   3779             return U_CE_SKIP_TARG;
   3780         }
   3781         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3782             return U_CE_SKIP_PATN;
   3783         }
   3784         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
   3785             U_CE_MATCH: U_CE_NO_MATCH;
   3786     }
   3787 
   3788     mask = 0xFFFF0000;
   3789     int32_t targLev3 = (int32_t)(targCE & mask);
   3790     int32_t patLev3 = (int32_t)(patCE & mask);
   3791     if ( targLev3 != patLev3 ) {
   3792         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
   3793             U_CE_MATCH: U_CE_NO_MATCH;
   3794    }
   3795 
   3796     return U_CE_MATCH;
   3797 }
   3798 
   3799 #if BOYER_MOORE
   3800 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
   3801 #endif
   3802 
   3803 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3804                                        int32_t        startIdx,
   3805                                        int32_t        *matchStart,
   3806                                        int32_t        *matchLimit,
   3807                                        UErrorCode     *status)
   3808 {
   3809     if (U_FAILURE(*status)) {
   3810         return FALSE;
   3811     }
   3812 
   3813     // TODO:  reject search patterns beginning with a combining char.
   3814 
   3815 #ifdef USEARCH_DEBUG
   3816     if (getenv("USEARCH_DEBUG") != NULL) {
   3817         printf("Pattern CEs\n");
   3818         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   3819             printf(" %8x", strsrch->pattern.CE[ii]);
   3820         }
   3821         printf("\n");
   3822     }
   3823 
   3824 #endif
   3825     // Input parameter sanity check.
   3826     //  TODO:  should input indicies clip to the text length
   3827     //         in the same way that UText does.
   3828     if(strsrch->pattern.CELength == 0         ||
   3829        startIdx < 0                           ||
   3830        startIdx > strsrch->search->textLength ||
   3831        strsrch->pattern.CE == NULL) {
   3832            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3833            return FALSE;
   3834     }
   3835 
   3836     if (strsrch->pattern.PCE == NULL) {
   3837         initializePatternPCETable(strsrch, status);
   3838     }
   3839 
   3840     ucol_setOffset(strsrch->textIter, startIdx, status);
   3841     CEBuffer ceb(strsrch, status);
   3842 
   3843 
   3844     int32_t    targetIx = 0;
   3845     const CEI *targetCEI = NULL;
   3846     int32_t    patIx;
   3847     UBool      found;
   3848 
   3849     int32_t  mStart = -1;
   3850     int32_t  mLimit = -1;
   3851     int32_t  minLimit;
   3852     int32_t  maxLimit;
   3853 
   3854 
   3855 
   3856     // Outer loop moves over match starting positions in the
   3857     //      target CE space.
   3858     // Here we see the target as a sequence of collation elements, resulting from the following:
   3859     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   3860     //    (for example, digraphs such as IJ may be broken into two characters).
   3861     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   3862     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   3863     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   3864     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   3865     //    then the CE is deleted, so the following code sees only CEs that are relevant.
   3866     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   3867     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   3868     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   3869     //
   3870     for(targetIx=0; ; targetIx++)
   3871     {
   3872         found = TRUE;
   3873         //  Inner loop checks for a match beginning at each
   3874         //  position from the outer loop.
   3875         int32_t targetIxOffset = 0;
   3876         int64_t patCE = 0;
   3877         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
   3878         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   3879         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
   3880         const CEI *firstCEI = ceb.get(targetIx);
   3881         if (firstCEI == NULL) {
   3882             *status = U_INTERNAL_PROGRAM_ERROR;
   3883             found = FALSE;
   3884             break;
   3885         }
   3886 
   3887         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
   3888             patCE = strsrch->pattern.PCE[patIx];
   3889             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
   3890             //  Compare CE from target string with CE from the pattern.
   3891             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   3892             //    which will fail the compare, below.
   3893             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   3894             if ( ceMatch == U_CE_NO_MATCH ) {
   3895                 found = FALSE;
   3896                 break;
   3897             } else if ( ceMatch > U_CE_NO_MATCH ) {
   3898                 if ( ceMatch == U_CE_SKIP_TARG ) {
   3899                     // redo with same patCE, next targCE
   3900                     patIx--;
   3901                     targetIxOffset++;
   3902                 } else { // ceMatch == U_CE_SKIP_PATN
   3903                     // redo with same targCE, next patCE
   3904                     targetIxOffset--;
   3905                 }
   3906             }
   3907         }
   3908         targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
   3909 
   3910         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   3911             // No match at this targetIx.  Try again at the next.
   3912             continue;
   3913         }
   3914 
   3915         if (!found) {
   3916             // No match at all, we have run off the end of the target text.
   3917             break;
   3918         }
   3919 
   3920 
   3921         // We have found a match in CE space.
   3922         // Now determine the bounds in string index space.
   3923         //  There still is a chance of match failure if the CE range not correspond to
   3924         //     an acceptable character range.
   3925         //
   3926         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
   3927 
   3928         mStart   = firstCEI->lowIndex;
   3929         minLimit = lastCEI->lowIndex;
   3930 
   3931         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3932         //   extended to the end of input, and the match is good.
   3933 
   3934         // Look at the high and low indices of the CE following the match. If
   3935         // they are the same it means one of two things:
   3936         //    1. The match extended to the last CE from the target text, which is OK, or
   3937         //    2. The last CE that was part of the match is in an expansion that extends
   3938         //       to the first CE after the match. In this case, we reject the match.
   3939         const CEI *nextCEI = 0;
   3940         if (strsrch->search->elementComparisonType == 0) {
   3941             nextCEI  = ceb.get(targetIx + targetIxOffset);
   3942             maxLimit = nextCEI->lowIndex;
   3943             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3944                 found = FALSE;
   3945             }
   3946         } else {
   3947             for ( ; ; ++targetIxOffset ) {
   3948                 nextCEI = ceb.get(targetIx + targetIxOffset);
   3949                 maxLimit = nextCEI->lowIndex;
   3950                 // If we are at the end of the target too, match succeeds
   3951                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
   3952                     break;
   3953                 }
   3954                 // As long as the next CE has primary weight of 0,
   3955                 // it is part of the last target element matched by the pattern;
   3956                 // make sure it can be part of a match with the last patCE
   3957                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
   3958                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
   3959                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
   3960                         found = FALSE;
   3961                         break;
   3962                     }
   3963                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   3964                 // target element, but it has non-zero primary weight => match fails
   3965                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
   3966                     found = false;
   3967                     break;
   3968                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
   3969                 } else {
   3970                     break;
   3971                 }
   3972             }
   3973         }
   3974 
   3975 
   3976         // Check for the start of the match being within a combining sequence.
   3977         //   This can happen if the pattern itself begins with a combining char, and
   3978         //   the match found combining marks in the target text that were attached
   3979         //    to something else.
   3980         //   This type of match should be rejected for not completely consuming a
   3981         //   combining sequence.
   3982         if (!isBreakBoundary(strsrch, mStart)) {
   3983             found = FALSE;
   3984         }
   3985 
   3986         // Check for the start of the match being within an Collation Element Expansion,
   3987         //   meaning that the first char of the match is only partially matched.
   3988         //   With exapnsions, the first CE will report the index of the source
   3989         //   character, and all subsequent (expansions) CEs will report the source index of the
   3990         //    _following_ character.
   3991         int32_t secondIx = firstCEI->highIndex;
   3992         if (mStart == secondIx) {
   3993             found = FALSE;
   3994         }
   3995 
   3996         //  Advance the match end position to the first acceptable match boundary.
   3997         //    This advances the index over any combining charcters.
   3998         mLimit = maxLimit;
   3999         if (minLimit < maxLimit) {
   4000             // When the last CE's low index is same with its high index, the CE is likely
   4001             // a part of expansion. In this case, the index is located just after the
   4002             // character corresponding to the CEs compared above. If the index is right
   4003             // at the break boundary, move the position to the next boundary will result
   4004             // incorrect match length when there are ignorable characters exist between
   4005             // the position and the next character produces CE(s). See ticket#8482.
   4006             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
   4007                 mLimit = minLimit;
   4008             } else {
   4009                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4010                 if (nba >= lastCEI->highIndex) {
   4011                     mLimit = nba;
   4012                 }
   4013             }
   4014         }
   4015 
   4016     #ifdef USEARCH_DEBUG
   4017         if (getenv("USEARCH_DEBUG") != NULL) {
   4018             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4019         }
   4020     #endif
   4021 
   4022         // If advancing to the end of a combining sequence in character indexing space
   4023         //   advanced us beyond the end of the match in CE space, reject this match.
   4024         if (mLimit > maxLimit) {
   4025             found = FALSE;
   4026         }
   4027 
   4028         if (!isBreakBoundary(strsrch, mLimit)) {
   4029             found = FALSE;
   4030         }
   4031 
   4032         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4033             found = FALSE;
   4034         }
   4035 
   4036         if (found) {
   4037             break;
   4038         }
   4039     }
   4040 
   4041     #ifdef USEARCH_DEBUG
   4042     if (getenv("USEARCH_DEBUG") != NULL) {
   4043         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4044         int32_t  lastToPrint = ceb.limitIx+2;
   4045         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4046             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4047         }
   4048         printf("\n%s\n", found? "match found" : "no match");
   4049     }
   4050     #endif
   4051 
   4052     // All Done.  Store back the match bounds to the caller.
   4053     //
   4054     if (found==FALSE) {
   4055         mLimit = -1;
   4056         mStart = -1;
   4057     }
   4058 
   4059     if (matchStart != NULL) {
   4060         *matchStart= mStart;
   4061     }
   4062 
   4063     if (matchLimit != NULL) {
   4064         *matchLimit = mLimit;
   4065     }
   4066 
   4067     return found;
   4068 }
   4069 
   4070 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   4071                                                 int32_t        startIdx,
   4072                                                 int32_t        *matchStart,
   4073                                                 int32_t        *matchLimit,
   4074                                                 UErrorCode     *status)
   4075 {
   4076     if (U_FAILURE(*status)) {
   4077         return FALSE;
   4078     }
   4079 
   4080     // TODO:  reject search patterns beginning with a combining char.
   4081 
   4082 #ifdef USEARCH_DEBUG
   4083     if (getenv("USEARCH_DEBUG") != NULL) {
   4084         printf("Pattern CEs\n");
   4085         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   4086             printf(" %8x", strsrch->pattern.CE[ii]);
   4087         }
   4088         printf("\n");
   4089     }
   4090 
   4091 #endif
   4092     // Input parameter sanity check.
   4093     //  TODO:  should input indicies clip to the text length
   4094     //         in the same way that UText does.
   4095     if(strsrch->pattern.CELength == 0         ||
   4096        startIdx < 0                           ||
   4097        startIdx > strsrch->search->textLength ||
   4098        strsrch->pattern.CE == NULL) {
   4099            *status = U_ILLEGAL_ARGUMENT_ERROR;
   4100            return FALSE;
   4101     }
   4102 
   4103     if (strsrch->pattern.PCE == NULL) {
   4104         initializePatternPCETable(strsrch, status);
   4105     }
   4106 
   4107     CEBuffer ceb(strsrch, status);
   4108     int32_t    targetIx = 0;
   4109 
   4110     /*
   4111      * Pre-load the buffer with the CE's for the grapheme
   4112      * after our starting position so that we're sure that
   4113      * we can look at the CE following the match when we
   4114      * check the match boundaries.
   4115      *
   4116      * This will also pre-fetch the first CE that we'll
   4117      * consider for the match.
   4118      */
   4119     if (startIdx < strsrch->search->textLength) {
   4120         UBreakIterator *bi = strsrch->search->internalBreakIter;
   4121         int32_t next = ubrk_following(bi, startIdx);
   4122 
   4123         ucol_setOffset(strsrch->textIter, next, status);
   4124 
   4125         for (targetIx = 0; ; targetIx += 1) {
   4126             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   4127                 break;
   4128             }
   4129         }
   4130     } else {
   4131         ucol_setOffset(strsrch->textIter, startIdx, status);
   4132     }
   4133 
   4134 
   4135     const CEI *targetCEI = NULL;
   4136     int32_t    patIx;
   4137     UBool      found;
   4138 
   4139     int32_t  limitIx = targetIx;
   4140     int32_t  mStart = -1;
   4141     int32_t  mLimit = -1;
   4142     int32_t  minLimit;
   4143     int32_t  maxLimit;
   4144 
   4145 
   4146 
   4147     // Outer loop moves over match starting positions in the
   4148     //      target CE space.
   4149     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   4150     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   4151     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   4152     // and the beginning of the base text.
   4153     for(targetIx = limitIx; ; targetIx += 1)
   4154     {
   4155         found = TRUE;
   4156         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
   4157         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   4158         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
   4159         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4160         if (lastCEI == NULL) {
   4161             *status = U_INTERNAL_PROGRAM_ERROR;
   4162             found = FALSE;
   4163              break;
   4164         }
   4165         //  Inner loop checks for a match beginning at each
   4166         //  position from the outer loop.
   4167         int32_t targetIxOffset = 0;
   4168         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
   4169             int64_t patCE = strsrch->pattern.PCE[patIx];
   4170 
   4171             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
   4172             //  Compare CE from target string with CE from the pattern.
   4173             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4174             //    which will fail the compare, below.
   4175             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   4176             if ( ceMatch == U_CE_NO_MATCH ) {
   4177                 found = FALSE;
   4178                 break;
   4179             } else if ( ceMatch > U_CE_NO_MATCH ) {
   4180                 if ( ceMatch == U_CE_SKIP_TARG ) {
   4181                     // redo with same patCE, next targCE
   4182                     patIx++;
   4183                     targetIxOffset++;
   4184                 } else { // ceMatch == U_CE_SKIP_PATN
   4185                     // redo with same targCE, next patCE
   4186                     targetIxOffset--;
   4187                 }
   4188             }
   4189         }
   4190 
   4191         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   4192             // No match at this targetIx.  Try again at the next.
   4193             continue;
   4194         }
   4195 
   4196         if (!found) {
   4197             // No match at all, we have run off the end of the target text.
   4198             break;
   4199         }
   4200 
   4201 
   4202         // We have found a match in CE space.
   4203         // Now determine the bounds in string index space.
   4204         //  There still is a chance of match failure if the CE range not correspond to
   4205         //     an acceptable character range.
   4206         //
   4207         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
   4208         mStart   = firstCEI->lowIndex;
   4209 
   4210         // Check for the start of the match being within a combining sequence.
   4211         //   This can happen if the pattern itself begins with a combining char, and
   4212         //   the match found combining marks in the target text that were attached
   4213         //    to something else.
   4214         //   This type of match should be rejected for not completely consuming a
   4215         //   combining sequence.
   4216         if (!isBreakBoundary(strsrch, mStart)) {
   4217             found = FALSE;
   4218         }
   4219 
   4220         // Look at the high index of the first CE in the match. If it's the same as the
   4221         // low index, the first CE in the match is in the middle of an expansion.
   4222         if (mStart == firstCEI->highIndex) {
   4223             found = FALSE;
   4224         }
   4225 
   4226 
   4227         minLimit = lastCEI->lowIndex;
   4228 
   4229         if (targetIx > 0) {
   4230             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4231             //   extended to the end of input, and the match is good.
   4232 
   4233             // Look at the high and low indices of the CE following the match. If
   4234             // they are the same it means one of two things:
   4235             //    1. The match extended to the last CE from the target text, which is OK, or
   4236             //    2. The last CE that was part of the match is in an expansion that extends
   4237             //       to the first CE after the match. In this case, we reject the match.
   4238             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
   4239 
   4240             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4241                 found = FALSE;
   4242             }
   4243 
   4244             mLimit = maxLimit = nextCEI->lowIndex;
   4245 
   4246             //  Advance the match end position to the first acceptable match boundary.
   4247             //    This advances the index over any combining charcters.
   4248             if (minLimit < maxLimit) {
   4249                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4250 
   4251                 if (nba >= lastCEI->highIndex) {
   4252                     mLimit = nba;
   4253                 }
   4254             }
   4255 
   4256             // If advancing to the end of a combining sequence in character indexing space
   4257             //   advanced us beyond the end of the match in CE space, reject this match.
   4258             if (mLimit > maxLimit) {
   4259                 found = FALSE;
   4260             }
   4261 
   4262             // Make sure the end of the match is on a break boundary
   4263             if (!isBreakBoundary(strsrch, mLimit)) {
   4264                 found = FALSE;
   4265             }
   4266 
   4267         } else {
   4268             // No non-ignorable CEs after this point.
   4269             // The maximum position is detected by boundary after
   4270             // the last non-ignorable CE. Combining sequence
   4271             // across the start index will be truncated.
   4272             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4273             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
   4274         }
   4275 
   4276     #ifdef USEARCH_DEBUG
   4277         if (getenv("USEARCH_DEBUG") != NULL) {
   4278             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4279         }
   4280     #endif
   4281 
   4282 
   4283         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4284             found = FALSE;
   4285         }
   4286 
   4287         if (found) {
   4288             break;
   4289         }
   4290     }
   4291 
   4292     #ifdef USEARCH_DEBUG
   4293     if (getenv("USEARCH_DEBUG") != NULL) {
   4294         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4295         int32_t  lastToPrint = ceb.limitIx+2;
   4296         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4297             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4298         }
   4299         printf("\n%s\n", found? "match found" : "no match");
   4300     }
   4301     #endif
   4302 
   4303     // All Done.  Store back the match bounds to the caller.
   4304     //
   4305     if (found==FALSE) {
   4306         mLimit = -1;
   4307         mStart = -1;
   4308     }
   4309 
   4310     if (matchStart != NULL) {
   4311         *matchStart= mStart;
   4312     }
   4313 
   4314     if (matchLimit != NULL) {
   4315         *matchLimit = mLimit;
   4316     }
   4317 
   4318     return found;
   4319 }
   4320 
   4321 // internal use methods declared in usrchimp.h -----------------------------
   4322 
   4323 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4324 {
   4325     if (U_FAILURE(*status)) {
   4326         setMatchNotFound(strsrch);
   4327         return FALSE;
   4328     }
   4329 
   4330 #if BOYER_MOORE
   4331     UCollationElements *coleiter        = strsrch->textIter;
   4332     int32_t             textlength      = strsrch->search->textLength;
   4333     int32_t            *patternce       = strsrch->pattern.CE;
   4334     int32_t             patterncelength = strsrch->pattern.CELength;
   4335     int32_t             textoffset      = ucol_getOffset(coleiter);
   4336 
   4337     // status used in setting coleiter offset, since offset is checked in
   4338     // shiftForward before setting the coleiter offset, status never
   4339     // a failure
   4340     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4341                               patterncelength);
   4342     while (textoffset <= textlength)
   4343     {
   4344         uint32_t    patternceindex = patterncelength - 1;
   4345         int32_t     targetce;
   4346         UBool       found          = FALSE;
   4347         int32_t    lastce          = UCOL_NULLORDER;
   4348 
   4349         setColEIterOffset(coleiter, textoffset);
   4350 
   4351         for (;;) {
   4352             // finding the last pattern ce match, imagine composite characters
   4353             // for example: search for pattern A in text \u00C0
   4354             // we'll have to skip \u0300 the grave first before we get to A
   4355             targetce = ucol_previous(coleiter, status);
   4356             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4357                 found = FALSE;
   4358                 break;
   4359             }
   4360             targetce = getCE(strsrch, targetce);
   4361             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4362                 // this is for the text \u0315\u0300 that requires
   4363                 // normalization and pattern \u0300, where \u0315 is ignorable
   4364                 continue;
   4365             }
   4366             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4367                 lastce = targetce;
   4368             }
   4369             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4370             if (targetce == patternce[patternceindex]) {
   4371                 // the first ce can be a contraction
   4372                 found = TRUE;
   4373                 break;
   4374             }
   4375             if (!hasExpansion(coleiter)) {
   4376                 found = FALSE;
   4377                 break;
   4378             }
   4379         }
   4380 
   4381         //targetce = lastce;
   4382 
   4383         while (found && patternceindex > 0) {
   4384             lastce = targetce;
   4385             targetce    = ucol_previous(coleiter, status);
   4386             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4387                 found = FALSE;
   4388                 break;
   4389             }
   4390             targetce    = getCE(strsrch, targetce);
   4391             if (targetce == UCOL_IGNORABLE) {
   4392                 continue;
   4393             }
   4394 
   4395             patternceindex --;
   4396             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4397             found = found && targetce == patternce[patternceindex];
   4398         }
   4399 
   4400         targetce = lastce;
   4401 
   4402         if (!found) {
   4403             if (U_FAILURE(*status)) {
   4404                 break;
   4405             }
   4406             textoffset = shiftForward(strsrch, textoffset, lastce,
   4407                                       patternceindex);
   4408             // status checked at loop.
   4409             patternceindex = patterncelength;
   4410             continue;
   4411         }
   4412 
   4413         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4414             // status checked in ucol_setOffset
   4415             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4416             return TRUE;
   4417         }
   4418     }
   4419     setMatchNotFound(strsrch);
   4420     return FALSE;
   4421 #else
   4422     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4423     int32_t start = -1;
   4424     int32_t end = -1;
   4425 
   4426     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4427         strsrch->search->matchedIndex  = start;
   4428         strsrch->search->matchedLength = end - start;
   4429         return TRUE;
   4430     } else {
   4431         setMatchNotFound(strsrch);
   4432         return FALSE;
   4433     }
   4434 #endif
   4435 }
   4436 
   4437 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4438 {
   4439     if (U_FAILURE(*status)) {
   4440         setMatchNotFound(strsrch);
   4441         return FALSE;
   4442     }
   4443 
   4444 #if BOYER_MOORE
   4445     UCollationElements *coleiter        = strsrch->textIter;
   4446     int32_t             textlength      = strsrch->search->textLength;
   4447     int32_t            *patternce       = strsrch->pattern.CE;
   4448     int32_t             patterncelength = strsrch->pattern.CELength;
   4449     int32_t             textoffset      = ucol_getOffset(coleiter);
   4450     UBool               hasPatternAccents =
   4451        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4452 
   4453     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4454                               patterncelength);
   4455     strsrch->canonicalPrefixAccents[0] = 0;
   4456     strsrch->canonicalSuffixAccents[0] = 0;
   4457 
   4458     while (textoffset <= textlength)
   4459     {
   4460         int32_t     patternceindex = patterncelength - 1;
   4461         int32_t     targetce;
   4462         UBool       found          = FALSE;
   4463         int32_t     lastce         = UCOL_NULLORDER;
   4464 
   4465         setColEIterOffset(coleiter, textoffset);
   4466 
   4467         for (;;) {
   4468             // finding the last pattern ce match, imagine composite characters
   4469             // for example: search for pattern A in text \u00C0
   4470             // we'll have to skip \u0300 the grave first before we get to A
   4471             targetce = ucol_previous(coleiter, status);
   4472             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4473                 found = FALSE;
   4474                 break;
   4475             }
   4476             targetce = getCE(strsrch, targetce);
   4477             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4478                 lastce = targetce;
   4479             }
   4480             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4481             if (targetce == patternce[patternceindex]) {
   4482                 // the first ce can be a contraction
   4483                 found = TRUE;
   4484                 break;
   4485             }
   4486             if (!hasExpansion(coleiter)) {
   4487                 found = FALSE;
   4488                 break;
   4489             }
   4490         }
   4491 
   4492         while (found && patternceindex > 0) {
   4493             targetce    = ucol_previous(coleiter, status);
   4494             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4495                 found = FALSE;
   4496                 break;
   4497             }
   4498             targetce    = getCE(strsrch, targetce);
   4499             if (targetce == UCOL_IGNORABLE) {
   4500                 continue;
   4501             }
   4502 
   4503             patternceindex --;
   4504             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4505             found = found && targetce == patternce[patternceindex];
   4506         }
   4507 
   4508         // initializing the rearranged accent array
   4509         if (hasPatternAccents && !found) {
   4510             strsrch->canonicalPrefixAccents[0] = 0;
   4511             strsrch->canonicalSuffixAccents[0] = 0;
   4512             if (U_FAILURE(*status)) {
   4513                 break;
   4514             }
   4515             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4516         }
   4517 
   4518         if (!found) {
   4519             if (U_FAILURE(*status)) {
   4520                 break;
   4521             }
   4522             textoffset = shiftForward(strsrch, textoffset, lastce,
   4523                                       patternceindex);
   4524             // status checked at loop
   4525             patternceindex = patterncelength;
   4526             continue;
   4527         }
   4528 
   4529         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4530             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4531             return TRUE;
   4532         }
   4533     }
   4534     setMatchNotFound(strsrch);
   4535     return FALSE;
   4536 #else
   4537     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4538     int32_t start = -1;
   4539     int32_t end = -1;
   4540 
   4541     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4542         strsrch->search->matchedIndex  = start;
   4543         strsrch->search->matchedLength = end - start;
   4544         return TRUE;
   4545     } else {
   4546         setMatchNotFound(strsrch);
   4547         return FALSE;
   4548     }
   4549 #endif
   4550 }
   4551 
   4552 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4553 {
   4554     if (U_FAILURE(*status)) {
   4555         setMatchNotFound(strsrch);
   4556         return FALSE;
   4557     }
   4558 
   4559 #if BOYER_MOORE
   4560     UCollationElements *coleiter        = strsrch->textIter;
   4561     int32_t            *patternce       = strsrch->pattern.CE;
   4562     int32_t             patterncelength = strsrch->pattern.CELength;
   4563     int32_t             textoffset      = ucol_getOffset(coleiter);
   4564 
   4565     // shifting it check for setting offset
   4566     // if setOffset is called previously or there was no previous match, we
   4567     // leave the offset as it is.
   4568     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4569         textoffset = strsrch->search->matchedIndex;
   4570     }
   4571 
   4572     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4573                               patterncelength);
   4574 
   4575     while (textoffset >= 0)
   4576     {
   4577         int32_t     patternceindex = 1;
   4578         int32_t     targetce;
   4579         UBool       found          = FALSE;
   4580         int32_t     firstce        = UCOL_NULLORDER;
   4581 
   4582         // if status is a failure, ucol_setOffset does nothing
   4583         setColEIterOffset(coleiter, textoffset);
   4584 
   4585         for (;;) {
   4586             // finding the first pattern ce match, imagine composite
   4587             // characters. for example: search for pattern \u0300 in text
   4588             // \u00C0, we'll have to skip A first before we get to
   4589             // \u0300 the grave accent
   4590             targetce = ucol_next(coleiter, status);
   4591             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4592                 found = FALSE;
   4593                 break;
   4594             }
   4595             targetce = getCE(strsrch, targetce);
   4596             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4597                 firstce = targetce;
   4598             }
   4599             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4600                 continue;
   4601             }
   4602             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4603             if (targetce == patternce[0]) {
   4604                 found = TRUE;
   4605                 break;
   4606             }
   4607             if (!hasExpansion(coleiter)) {
   4608                 // checking for accents in composite character
   4609                 found = FALSE;
   4610                 break;
   4611             }
   4612         }
   4613 
   4614         //targetce = firstce;
   4615 
   4616         while (found && (patternceindex < patterncelength)) {
   4617             firstce = targetce;
   4618             targetce    = ucol_next(coleiter, status);
   4619             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4620                 found = FALSE;
   4621                 break;
   4622             }
   4623             targetce    = getCE(strsrch, targetce);
   4624             if (targetce == UCOL_IGNORABLE) {
   4625                 continue;
   4626             }
   4627 
   4628             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4629             found = found && targetce == patternce[patternceindex];
   4630             patternceindex ++;
   4631         }
   4632 
   4633         targetce = firstce;
   4634 
   4635         if (!found) {
   4636             if (U_FAILURE(*status)) {
   4637                 break;
   4638             }
   4639 
   4640             textoffset = reverseShift(strsrch, textoffset, targetce,
   4641                                       patternceindex);
   4642             patternceindex = 0;
   4643             continue;
   4644         }
   4645 
   4646         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4647             setColEIterOffset(coleiter, textoffset);
   4648             return TRUE;
   4649         }
   4650     }
   4651     setMatchNotFound(strsrch);
   4652     return FALSE;
   4653 #else
   4654     int32_t textOffset;
   4655 
   4656     if (strsrch->search->isOverlap) {
   4657         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4658             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4659         } else {
   4660             // move the start position at the end of possible match
   4661             initializePatternPCETable(strsrch, status);
   4662             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
   4663                 int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status);
   4664                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4665                     // at the end of the text
   4666                     break;
   4667                 }
   4668             }
   4669             if (U_FAILURE(*status)) {
   4670                 setMatchNotFound(strsrch);
   4671                 return FALSE;
   4672             }
   4673             textOffset = ucol_getOffset(strsrch->textIter);
   4674         }
   4675     } else {
   4676         textOffset = ucol_getOffset(strsrch->textIter);
   4677     }
   4678 
   4679     int32_t start = -1;
   4680     int32_t end = -1;
   4681 
   4682     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4683         strsrch->search->matchedIndex = start;
   4684         strsrch->search->matchedLength = end - start;
   4685         return TRUE;
   4686     } else {
   4687         setMatchNotFound(strsrch);
   4688         return FALSE;
   4689     }
   4690 #endif
   4691 }
   4692 
   4693 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4694                                       UErrorCode    *status)
   4695 {
   4696     if (U_FAILURE(*status)) {
   4697         setMatchNotFound(strsrch);
   4698         return FALSE;
   4699     }
   4700 
   4701 #if BOYER_MOORE
   4702     UCollationElements *coleiter        = strsrch->textIter;
   4703     int32_t            *patternce       = strsrch->pattern.CE;
   4704     int32_t             patterncelength = strsrch->pattern.CELength;
   4705     int32_t             textoffset      = ucol_getOffset(coleiter);
   4706     UBool               hasPatternAccents =
   4707        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4708 
   4709     // shifting it check for setting offset
   4710     // if setOffset is called previously or there was no previous match, we
   4711     // leave the offset as it is.
   4712     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4713         textoffset = strsrch->search->matchedIndex;
   4714     }
   4715 
   4716     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4717                               patterncelength);
   4718     strsrch->canonicalPrefixAccents[0] = 0;
   4719     strsrch->canonicalSuffixAccents[0] = 0;
   4720 
   4721     while (textoffset >= 0)
   4722     {
   4723         int32_t     patternceindex = 1;
   4724         int32_t     targetce;
   4725         UBool       found          = FALSE;
   4726         int32_t     firstce        = UCOL_NULLORDER;
   4727 
   4728         setColEIterOffset(coleiter, textoffset);
   4729         for (;;) {
   4730             // finding the first pattern ce match, imagine composite
   4731             // characters. for example: search for pattern \u0300 in text
   4732             // \u00C0, we'll have to skip A first before we get to
   4733             // \u0300 the grave accent
   4734             targetce = ucol_next(coleiter, status);
   4735             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4736                 found = FALSE;
   4737                 break;
   4738             }
   4739             targetce = getCE(strsrch, targetce);
   4740             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4741                 firstce = targetce;
   4742             }
   4743 
   4744             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4745             if (targetce == patternce[0]) {
   4746                 // the first ce can be a contraction
   4747                 found = TRUE;
   4748                 break;
   4749             }
   4750             if (!hasExpansion(coleiter)) {
   4751                 // checking for accents in composite character
   4752                 found = FALSE;
   4753                 break;
   4754             }
   4755         }
   4756 
   4757         targetce = firstce;
   4758 
   4759         while (found && patternceindex < patterncelength) {
   4760             targetce    = ucol_next(coleiter, status);
   4761             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4762                 found = FALSE;
   4763                 break;
   4764             }
   4765             targetce = getCE(strsrch, targetce);
   4766             if (targetce == UCOL_IGNORABLE) {
   4767                 continue;
   4768             }
   4769 
   4770             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4771             found = found && targetce == patternce[patternceindex];
   4772             patternceindex ++;
   4773         }
   4774 
   4775         // initializing the rearranged accent array
   4776         if (hasPatternAccents && !found) {
   4777             strsrch->canonicalPrefixAccents[0] = 0;
   4778             strsrch->canonicalSuffixAccents[0] = 0;
   4779             if (U_FAILURE(*status)) {
   4780                 break;
   4781             }
   4782             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4783         }
   4784 
   4785         if (!found) {
   4786             if (U_FAILURE(*status)) {
   4787                 break;
   4788             }
   4789             textoffset = reverseShift(strsrch, textoffset, targetce,
   4790                                       patternceindex);
   4791             patternceindex = 0;
   4792             continue;
   4793         }
   4794 
   4795         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4796             setColEIterOffset(coleiter, textoffset);
   4797             return TRUE;
   4798         }
   4799     }
   4800     setMatchNotFound(strsrch);
   4801     return FALSE;
   4802 #else
   4803     int32_t textOffset;
   4804 
   4805     if (strsrch->search->isOverlap) {
   4806         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4807             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4808         } else {
   4809             // move the start position at the end of possible match
   4810             initializePatternPCETable(strsrch, status);
   4811             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
   4812                 int64_t pce = ucol_nextProcessed(strsrch->textIter, NULL, NULL, status);
   4813                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4814                     // at the end of the text
   4815                     break;
   4816                 }
   4817             }
   4818             if (U_FAILURE(*status)) {
   4819                 setMatchNotFound(strsrch);
   4820                 return FALSE;
   4821             }
   4822             textOffset = ucol_getOffset(strsrch->textIter);
   4823         }
   4824     } else {
   4825         textOffset = ucol_getOffset(strsrch->textIter);
   4826     }
   4827 
   4828     int32_t start = -1;
   4829     int32_t end = -1;
   4830 
   4831     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4832         strsrch->search->matchedIndex = start;
   4833         strsrch->search->matchedLength = end - start;
   4834         return TRUE;
   4835     } else {
   4836         setMatchNotFound(strsrch);
   4837         return FALSE;
   4838     }
   4839 #endif
   4840 }
   4841 
   4842 #endif /* #if !UCONFIG_NO_COLLATION */
   4843