Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  07/02/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/usearch.h"
     15 #include "unicode/ustring.h"
     16 #include "unicode/uchar.h"
     17 #include "unicode/utf16.h"
     18 #include "normalizer2impl.h"
     19 #include "usrchimp.h"
     20 #include "cmemory.h"
     21 #include "ucln_in.h"
     22 #include "uassert.h"
     23 #include "ustr_imp.h"
     24 
     25 U_NAMESPACE_USE
     26 
     27 // don't use Boyer-Moore
     28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
     29 #define BOYER_MOORE 0
     30 
     31 // internal definition ---------------------------------------------------
     32 
     33 #define LAST_BYTE_MASK_          0xFF
     34 #define SECOND_LAST_BYTE_SHIFT_  8
     35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     36 
     37 static const Normalizer2Impl *g_nfcImpl = NULL;
     38 
     39 // internal methods -------------------------------------------------
     40 
     41 /**
     42 * Fast collation element iterator setOffset.
     43 * This function does not check for bounds.
     44 * @param coleiter collation element iterator
     45 * @param offset to set
     46 */
     47 static
     48 inline void setColEIterOffset(UCollationElements *elems,
     49                       int32_t             offset)
     50 {
     51     // Note: Not "fast" any more after the 2013 collation rewrite.
     52     // We do not want to expose more internals than necessary.
     53     UErrorCode status = U_ZERO_ERROR;
     54     ucol_setOffset(elems, offset, &status);
     55 }
     56 
     57 /**
     58 * Getting the mask for collation strength
     59 * @param strength collation strength
     60 * @return collation element mask
     61 */
     62 static
     63 inline uint32_t getMask(UCollationStrength strength)
     64 {
     65     switch (strength)
     66     {
     67     case UCOL_PRIMARY:
     68         return UCOL_PRIMARYORDERMASK;
     69     case UCOL_SECONDARY:
     70         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     71     default:
     72         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     73                UCOL_PRIMARYORDERMASK;
     74     }
     75 }
     76 
     77 /**
     78 * @param ce 32-bit collation element
     79 * @return hash code
     80 */
     81 static
     82 inline int hashFromCE32(uint32_t ce)
     83 {
     84     int hc = (int)(
     85             ((((((ce >> 24) * 37) +
     86             (ce >> 16)) * 37) +
     87             (ce >> 8)) * 37) +
     88             ce);
     89     hc %= MAX_TABLE_SIZE_;
     90     if (hc < 0) {
     91         hc += MAX_TABLE_SIZE_;
     92     }
     93     return hc;
     94 }
     95 
     96 U_CDECL_BEGIN
     97 static UBool U_CALLCONV
     98 usearch_cleanup(void) {
     99     g_nfcImpl = NULL;
    100     return TRUE;
    101 }
    102 U_CDECL_END
    103 
    104 /**
    105 * Initializing the fcd tables.
    106 * Internal method, status assumed to be a success.
    107 * @param status output error if any, caller to check status before calling
    108 *               method, status assumed to be success when passed in.
    109 */
    110 static
    111 inline void initializeFCD(UErrorCode *status)
    112 {
    113     if (g_nfcImpl == NULL) {
    114         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    115         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    116     }
    117 }
    118 
    119 /**
    120 * Gets the fcd value for a character at the argument index.
    121 * This method takes into accounts of the supplementary characters.
    122 * @param str UTF16 string where character for fcd retrieval resides
    123 * @param offset position of the character whose fcd is to be retrieved, to be
    124 *               overwritten with the next character position, taking
    125 *               surrogate characters into consideration.
    126 * @param strlength length of the argument string
    127 * @return fcd value
    128 */
    129 static
    130 uint16_t getFCD(const UChar   *str, int32_t *offset,
    131                              int32_t  strlength)
    132 {
    133     const UChar *temp = str + *offset;
    134     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
    135     *offset = (int32_t)(temp - str);
    136     return result;
    137 }
    138 
    139 /**
    140 * Getting the modified collation elements taking into account the collation
    141 * attributes
    142 * @param strsrch string search data
    143 * @param sourcece
    144 * @return the modified collation element
    145 */
    146 static
    147 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    148 {
    149     // note for tertiary we can't use the collator->tertiaryMask, that
    150     // is a preprocessed mask that takes into account case options. since
    151     // we are only concerned with exact matches, we don't need that.
    152     sourcece &= strsrch->ceMask;
    153 
    154     if (strsrch->toShift) {
    155         // alternate handling here, since only the 16 most significant digits
    156         // is only used, we can safely do a compare without masking
    157         // if the ce is a variable, we mask and get only the primary values
    158         // no shifting to quartenary is required since all primary values
    159         // less than variabletop will need to be masked off anyway.
    160         if (strsrch->variableTop > sourcece) {
    161             if (strsrch->strength >= UCOL_QUATERNARY) {
    162                 sourcece &= UCOL_PRIMARYORDERMASK;
    163             }
    164             else {
    165                 sourcece = UCOL_IGNORABLE;
    166             }
    167         }
    168     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    169         sourcece = 0xFFFF;
    170     }
    171 
    172     return sourcece;
    173 }
    174 
    175 /**
    176 * Allocate a memory and returns NULL if it failed.
    177 * Internal method, status assumed to be a success.
    178 * @param size to allocate
    179 * @param status output error if any, caller to check status before calling
    180 *               method, status assumed to be success when passed in.
    181 * @return newly allocated array, NULL otherwise
    182 */
    183 static
    184 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    185 {
    186     uint32_t *result = (uint32_t *)uprv_malloc(size);
    187     if (result == NULL) {
    188         *status = U_MEMORY_ALLOCATION_ERROR;
    189     }
    190     return result;
    191 }
    192 
    193 /**
    194 * Adds a uint32_t value to a destination array.
    195 * Creates a new array if we run out of space. The caller will have to
    196 * manually deallocate the newly allocated array.
    197 * Internal method, status assumed to be success, caller has to check status
    198 * before calling this method. destination not to be NULL and has at least
    199 * size destinationlength.
    200 * @param destination target array
    201 * @param offset destination offset to add value
    202 * @param destinationlength target array size, return value for the new size
    203 * @param value to be added
    204 * @param increments incremental size expected
    205 * @param status output error if any, caller to check status before calling
    206 *               method, status assumed to be success when passed in.
    207 * @return new destination array, destination if there was no new allocation
    208 */
    209 static
    210 inline int32_t * addTouint32_tArray(int32_t    *destination,
    211                                     uint32_t    offset,
    212                                     uint32_t   *destinationlength,
    213                                     uint32_t    value,
    214                                     uint32_t    increments,
    215                                     UErrorCode *status)
    216 {
    217     uint32_t newlength = *destinationlength;
    218     if (offset + 1 == newlength) {
    219         newlength += increments;
    220         int32_t *temp = (int32_t *)allocateMemory(
    221                                          sizeof(int32_t) * newlength, status);
    222         if (U_FAILURE(*status)) {
    223             return NULL;
    224         }
    225         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
    226         *destinationlength = newlength;
    227         destination        = temp;
    228     }
    229     destination[offset] = value;
    230     return destination;
    231 }
    232 
    233 /**
    234 * Adds a uint64_t value to a destination array.
    235 * Creates a new array if we run out of space. The caller will have to
    236 * manually deallocate the newly allocated array.
    237 * Internal method, status assumed to be success, caller has to check status
    238 * before calling this method. destination not to be NULL and has at least
    239 * size destinationlength.
    240 * @param destination target array
    241 * @param offset destination offset to add value
    242 * @param destinationlength target array size, return value for the new size
    243 * @param value to be added
    244 * @param increments incremental size expected
    245 * @param status output error if any, caller to check status before calling
    246 *               method, status assumed to be success when passed in.
    247 * @return new destination array, destination if there was no new allocation
    248 */
    249 static
    250 inline int64_t * addTouint64_tArray(int64_t    *destination,
    251                                     uint32_t    offset,
    252                                     uint32_t   *destinationlength,
    253                                     uint64_t    value,
    254                                     uint32_t    increments,
    255                                     UErrorCode *status)
    256 {
    257     uint32_t newlength = *destinationlength;
    258     if (offset + 1 == newlength) {
    259         newlength += increments;
    260         int64_t *temp = (int64_t *)allocateMemory(
    261                                          sizeof(int64_t) * newlength, status);
    262 
    263         if (U_FAILURE(*status)) {
    264             return NULL;
    265         }
    266 
    267         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
    268         *destinationlength = newlength;
    269         destination        = temp;
    270     }
    271 
    272     destination[offset] = value;
    273 
    274     return destination;
    275 }
    276 
    277 /**
    278 * Initializing the ce table for a pattern.
    279 * Stores non-ignorable collation keys.
    280 * Table size will be estimated by the size of the pattern text. Table
    281 * expansion will be perform as we go along. Adding 1 to ensure that the table
    282 * size definitely increases.
    283 * Internal method, status assumed to be a success.
    284 * @param strsrch string search data
    285 * @param status output error if any, caller to check status before calling
    286 *               method, status assumed to be success when passed in.
    287 * @return total number of expansions
    288 */
    289 static
    290 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    291                                          UErrorCode    *status)
    292 {
    293     UPattern *pattern            = &(strsrch->pattern);
    294     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    295     int32_t  *cetable            = pattern->cesBuffer;
    296     uint32_t  patternlength      = pattern->textLength;
    297     UCollationElements *coleiter = strsrch->utilIter;
    298 
    299     if (coleiter == NULL) {
    300         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    301                                      patternlength, status);
    302         // status will be checked in ucol_next(..) later and if it is an
    303         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    304         // returned.
    305         strsrch->utilIter = coleiter;
    306     }
    307     else {
    308         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    309     }
    310     if(U_FAILURE(*status)) {
    311         return 0;
    312     }
    313 
    314     if (pattern->ces != cetable && pattern->ces) {
    315         uprv_free(pattern->ces);
    316     }
    317 
    318     uint16_t  offset      = 0;
    319     uint16_t  result      = 0;
    320     int32_t   ce;
    321 
    322     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    323            U_SUCCESS(*status)) {
    324         uint32_t newce = getCE(strsrch, ce);
    325         if (newce) {
    326             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    327                                   newce,
    328                                   patternlength - ucol_getOffset(coleiter) + 1,
    329                                   status);
    330             if (U_FAILURE(*status)) {
    331                 return 0;
    332             }
    333             offset ++;
    334             if (cetable != temp && cetable != pattern->cesBuffer) {
    335                 uprv_free(cetable);
    336             }
    337             cetable = temp;
    338         }
    339         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    340     }
    341 
    342     cetable[offset]   = 0;
    343     pattern->ces       = cetable;
    344     pattern->cesLength = offset;
    345 
    346     return result;
    347 }
    348 
    349 /**
    350 * Initializing the pce table for a pattern.
    351 * Stores non-ignorable collation keys.
    352 * Table size will be estimated by the size of the pattern text. Table
    353 * expansion will be perform as we go along. Adding 1 to ensure that the table
    354 * size definitely increases.
    355 * Internal method, status assumed to be a success.
    356 * @param strsrch string search data
    357 * @param status output error if any, caller to check status before calling
    358 *               method, status assumed to be success when passed in.
    359 * @return total number of expansions
    360 */
    361 static
    362 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    363                                           UErrorCode    *status)
    364 {
    365     UPattern *pattern            = &(strsrch->pattern);
    366     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    367     int64_t  *pcetable           = pattern->pcesBuffer;
    368     uint32_t  patternlength      = pattern->textLength;
    369     UCollationElements *coleiter = strsrch->utilIter;
    370 
    371     if (coleiter == NULL) {
    372         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    373                                      patternlength, status);
    374         // status will be checked in ucol_next(..) later and if it is an
    375         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    376         // returned.
    377         strsrch->utilIter = coleiter;
    378     } else {
    379         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    380     }
    381     if(U_FAILURE(*status)) {
    382         return 0;
    383     }
    384 
    385     if (pattern->pces != pcetable && pattern->pces != NULL) {
    386         uprv_free(pattern->pces);
    387     }
    388 
    389     uint16_t  offset = 0;
    390     uint16_t  result = 0;
    391     int64_t   pce;
    392 
    393     icu::UCollationPCE iter(coleiter);
    394 
    395     // ** Should processed CEs be signed or unsigned?
    396     // ** (the rest of the code in this file seems to play fast-and-loose with
    397     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    398     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    399            U_SUCCESS(*status)) {
    400         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    401                               pce,
    402                               patternlength - ucol_getOffset(coleiter) + 1,
    403                               status);
    404 
    405         if (U_FAILURE(*status)) {
    406             return 0;
    407         }
    408 
    409         offset += 1;
    410 
    411         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
    412             uprv_free(pcetable);
    413         }
    414 
    415         pcetable = temp;
    416         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    417     }
    418 
    419     pcetable[offset]   = 0;
    420     pattern->pces       = pcetable;
    421     pattern->pcesLength = offset;
    422 
    423     return result;
    424 }
    425 
    426 /**
    427 * Initializes the pattern struct.
    428 * Internal method, status assumed to be success.
    429 * @param strsrch UStringSearch data storage
    430 * @param status output error if any, caller to check status before calling
    431 *               method, status assumed to be success when passed in.
    432 * @return expansionsize the total expansion size of the pattern
    433 */
    434 static
    435 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    436 {
    437     if (U_FAILURE(*status)) { return 0; }
    438           UPattern   *pattern     = &(strsrch->pattern);
    439     const UChar      *patterntext = pattern->text;
    440           int32_t     length      = pattern->textLength;
    441           int32_t index       = 0;
    442 
    443     // Since the strength is primary, accents are ignored in the pattern.
    444     if (strsrch->strength == UCOL_PRIMARY) {
    445         pattern->hasPrefixAccents = 0;
    446         pattern->hasSuffixAccents = 0;
    447     } else {
    448         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    449                                                          SECOND_LAST_BYTE_SHIFT_;
    450         index = length;
    451         U16_BACK_1(patterntext, 0, index);
    452         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    453                                                                  LAST_BYTE_MASK_;
    454     }
    455 
    456     // ** HACK **
    457     if (strsrch->pattern.pces != NULL) {
    458         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
    459             uprv_free(strsrch->pattern.pces);
    460         }
    461 
    462         strsrch->pattern.pces = NULL;
    463     }
    464 
    465     // since intializePattern is an internal method status is a success.
    466     return initializePatternCETable(strsrch, status);
    467 }
    468 
    469 /**
    470 * Initializing shift tables, with the default values.
    471 * If a corresponding default value is 0, the shift table is not set.
    472 * @param shift table for forwards shift
    473 * @param backshift table for backwards shift
    474 * @param cetable table containing pattern ce
    475 * @param cesize size of the pattern ces
    476 * @param expansionsize total size of the expansions
    477 * @param defaultforward the default forward value
    478 * @param defaultbackward the default backward value
    479 */
    480 static
    481 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    482                           int32_t  *cetable, int32_t cesize,
    483                           int16_t   expansionsize,
    484                           int16_t   defaultforward,
    485                           int16_t   defaultbackward)
    486 {
    487     // estimate the value to shift. to do that we estimate the smallest
    488     // number of characters to give the relevant ces, ie approximately
    489     // the number of ces minus their expansion, since expansions can come
    490     // from a character.
    491     int32_t count;
    492     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    493         shift[count] = defaultforward;
    494     }
    495     cesize --; // down to the last index
    496     for (count = 0; count < cesize; count ++) {
    497         // number of ces from right of array to the count
    498         int temp = defaultforward - count - 1;
    499         shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
    500     }
    501     shift[hashFromCE32(cetable[cesize])] = 1;
    502     // for ignorables we just shift by one. see test examples.
    503     shift[hashFromCE32(0)] = 1;
    504 
    505     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    506         backshift[count] = defaultbackward;
    507     }
    508     for (count = cesize; count > 0; count --) {
    509         // the original value count does not seem to work
    510         backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
    511                                           (int16_t)(count - expansionsize) : 1;
    512     }
    513     backshift[hashFromCE32(cetable[0])] = 1;
    514     backshift[hashFromCE32(0)] = 1;
    515 }
    516 
    517 /**
    518 * Building of the pattern collation element list and the boyer moore strsrch
    519 * table.
    520 * The canonical match will only be performed after the default match fails.
    521 * For both cases we need to remember the size of the composed and decomposed
    522 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    523 * a number of characters in the text and tries to match the pattern from that
    524 * offset, the shift value can not be too large in case we miss some
    525 * characters. To choose a right shift size, we estimate the NFC form of the
    526 * and use its size as a shift guide. The NFC form should be the small
    527 * possible representation of the pattern. Anyways, we'll err on the smaller
    528 * shift size. Hence the calculation for minlength.
    529 * Canonical match will be performed slightly differently. We'll split the
    530 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    531 * the first and last base character (MS), the ending accents (EA). Matches
    532 * will be done on MS first, and only when we match MS then some processing
    533 * will be required for the prefix and end accents in order to determine if
    534 * they match PA and EA. Hence the default shift values
    535 * for the canonical match will take the size of either end's accent into
    536 * consideration. Forwards search will take the end accents into consideration
    537 * for the default shift values and the backwards search will take the prefix
    538 * accents into consideration.
    539 * If pattern has no non-ignorable ce, we return a illegal argument error.
    540 * Internal method, status assumed to be success.
    541 * @param strsrch UStringSearch data storage
    542 * @param status  for output errors if it occurs, status is assumed to be a
    543 *                success when it is passed in.
    544 */
    545 static
    546 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    547 {
    548     int16_t expandlength  = initializePattern(strsrch, status);
    549     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
    550         UPattern *pattern = &strsrch->pattern;
    551         int32_t   cesize  = pattern->cesLength;
    552 
    553         int16_t minlength = cesize > expandlength
    554                             ? (int16_t)cesize - expandlength : 1;
    555         pattern->defaultShiftSize    = minlength;
    556         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
    557                       cesize, expandlength, minlength, minlength);
    558         return;
    559     }
    560     strsrch->pattern.defaultShiftSize = 0;
    561 }
    562 
    563 #if BOYER_MOORE
    564 /**
    565 * Check to make sure that the match length is at the end of the character by
    566 * using the breakiterator.
    567 * @param strsrch string search data
    568 * @param start target text start offset
    569 * @param end target text end offset
    570 */
    571 static
    572 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    573                                int32_t *end)
    574 {
    575 #if !UCONFIG_NO_BREAK_ITERATION
    576     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    577     if (breakiterator) {
    578         int32_t matchend = *end;
    579         //int32_t matchstart = *start;
    580 
    581         if (!ubrk_isBoundary(breakiterator, matchend)) {
    582             *end = ubrk_following(breakiterator, matchend);
    583         }
    584 
    585         /* Check the start of the matched text to make sure it doesn't have any accents
    586          * before it.  This code may not be necessary and so it is commented out */
    587         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    588             *start = ubrk_preceding(breakiterator, matchstart);
    589         }*/
    590     }
    591 #endif
    592 }
    593 
    594 /**
    595 * Determine whether the target text in UStringSearch bounded by the offset
    596 * start and end is one or more whole units of text as
    597 * determined by the breakiterator in UStringSearch.
    598 * @param strsrch string search data
    599 * @param start target text start offset
    600 * @param end target text end offset
    601 */
    602 static
    603 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    604                                int32_t    end)
    605 {
    606 #if !UCONFIG_NO_BREAK_ITERATION
    607     UBreakIterator *breakiterator = strsrch->search->breakIter;
    608     //TODO: Add here.
    609     if (breakiterator) {
    610         int32_t startindex = ubrk_first(breakiterator);
    611         int32_t endindex   = ubrk_last(breakiterator);
    612 
    613         // out-of-range indexes are never boundary positions
    614         if (start < startindex || start > endindex ||
    615             end < startindex || end > endindex) {
    616             return FALSE;
    617         }
    618         // otherwise, we can use following() on the position before the
    619         // specified one and return true of the position we get back is the
    620         // one the user specified
    621         UBool result = (start == startindex ||
    622                 ubrk_following(breakiterator, start - 1) == start) &&
    623                (end == endindex ||
    624                 ubrk_following(breakiterator, end - 1) == end);
    625         if (result) {
    626             // iterates the individual ces
    627                   UCollationElements *coleiter  = strsrch->utilIter;
    628             const UChar              *text      = strsrch->search->text +
    629                                                                       start;
    630                   UErrorCode          status    = U_ZERO_ERROR;
    631             ucol_setText(coleiter, text, end - start, &status);
    632             for (int32_t count = 0; count < strsrch->pattern.cesLength;
    633                  count ++) {
    634                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    635                 if (ce == UCOL_IGNORABLE) {
    636                     count --;
    637                     continue;
    638                 }
    639                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
    640                     return FALSE;
    641                 }
    642             }
    643             int32_t nextce = ucol_next(coleiter, &status);
    644             while (ucol_getOffset(coleiter) == (end - start)
    645                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    646                 nextce = ucol_next(coleiter, &status);
    647             }
    648             if (ucol_getOffset(coleiter) == (end - start)
    649                 && nextce != UCOL_NULLORDER) {
    650                 // extra collation elements at the end of the match
    651                 return FALSE;
    652             }
    653         }
    654         return result;
    655     }
    656 #endif
    657     return TRUE;
    658 }
    659 
    660 /**
    661 * Getting the next base character offset if current offset is an accent,
    662 * or the current offset if the current character contains a base character.
    663 * accents the following base character will be returned
    664 * @param text string
    665 * @param textoffset current offset
    666 * @param textlength length of text string
    667 * @return the next base character or the current offset
    668 *         if the current character is contains a base character.
    669 */
    670 static
    671 inline int32_t getNextBaseOffset(const UChar       *text,
    672                                            int32_t  textoffset,
    673                                            int32_t      textlength)
    674 {
    675     if (textoffset < textlength) {
    676         int32_t temp = textoffset;
    677         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    678             while (temp < textlength) {
    679                 int32_t result = temp;
    680                 if ((getFCD(text, &temp, textlength) >>
    681                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    682                     return result;
    683                 }
    684             }
    685             return textlength;
    686         }
    687     }
    688     return textoffset;
    689 }
    690 
    691 /**
    692 * Gets the next base character offset depending on the string search pattern
    693 * data
    694 * @param strsrch string search data
    695 * @param textoffset current offset, one offset away from the last character
    696 *                   to search for.
    697 * @return start index of the next base character or the current offset
    698 *         if the current character is contains a base character.
    699 */
    700 static
    701 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    702                                                   int32_t    textoffset)
    703 {
    704     int32_t textlength = strsrch->search->textLength;
    705     if (strsrch->pattern.hasSuffixAccents &&
    706         textoffset < textlength) {
    707               int32_t  temp       = textoffset;
    708         const UChar       *text       = strsrch->search->text;
    709         U16_BACK_1(text, 0, temp);
    710         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    711             return getNextBaseOffset(text, textoffset, textlength);
    712         }
    713     }
    714     return textoffset;
    715 }
    716 
    717 /**
    718 * Shifting the collation element iterator position forward to prepare for
    719 * a following match. If the last character is a unsafe character, we'll only
    720 * shift by 1 to capture contractions, normalization etc.
    721 * Internal method, status assumed to be success.
    722 * @param text strsrch string search data
    723 * @param textoffset start text position to do search
    724 * @param ce the text ce which failed the match.
    725 * @param patternceindex index of the ce within the pattern ce buffer which
    726 *        failed the match
    727 * @return final offset
    728 */
    729 static
    730 inline int32_t shiftForward(UStringSearch *strsrch,
    731                                 int32_t    textoffset,
    732                                 int32_t       ce,
    733                                 int32_t        patternceindex)
    734 {
    735     UPattern *pattern = &(strsrch->pattern);
    736     if (ce != UCOL_NULLORDER) {
    737         int32_t shift = pattern->shift[hashFromCE32(ce)];
    738         // this is to adjust for characters in the middle of the
    739         // substring for matching that failed.
    740         int32_t adjust = pattern->cesLength - patternceindex;
    741         if (adjust > 1 && shift >= adjust) {
    742             shift -= adjust - 1;
    743         }
    744         textoffset += shift;
    745     }
    746     else {
    747         textoffset += pattern->defaultShiftSize;
    748     }
    749 
    750     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    751     // check for unsafe characters
    752     // * if it is the start or middle of a contraction: to be done after
    753     //   a initial match is found
    754     // * thai or lao base consonant character: similar to contraction
    755     // * high surrogate character: similar to contraction
    756     // * next character is a accent: shift to the next base character
    757     return textoffset;
    758 }
    759 #endif // #if BOYER_MOORE
    760 
    761 /**
    762 * sets match not found
    763 * @param strsrch string search data
    764 */
    765 static
    766 inline void setMatchNotFound(UStringSearch *strsrch)
    767 {
    768     // this method resets the match result regardless of the error status.
    769     strsrch->search->matchedIndex = USEARCH_DONE;
    770     strsrch->search->matchedLength = 0;
    771     if (strsrch->search->isForwardSearching) {
    772         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    773     }
    774     else {
    775         setColEIterOffset(strsrch->textIter, 0);
    776     }
    777 }
    778 
    779 #if BOYER_MOORE
    780 /**
    781 * Gets the offset to the next safe point in text.
    782 * ie. not the middle of a contraction, swappable characters or supplementary
    783 * characters.
    784 * @param collator collation sata
    785 * @param text string to work with
    786 * @param textoffset offset in string
    787 * @param textlength length of text string
    788 * @return offset to the next safe character
    789 */
    790 static
    791 inline int32_t getNextSafeOffset(const UCollator   *collator,
    792                                      const UChar       *text,
    793                                            int32_t  textoffset,
    794                                            int32_t      textlength)
    795 {
    796     int32_t result = textoffset; // first contraction character
    797     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    798         result ++;
    799     }
    800     return result;
    801 }
    802 
    803 /**
    804 * This checks for accents in the potential match started with a .
    805 * composite character.
    806 * This is really painful... we have to check that composite character do not
    807 * have any extra accents. We have to normalize the potential match and find
    808 * the immediate decomposed character before the match.
    809 * The first composite character would have been taken care of by the fcd
    810 * checks in checkForwardExactMatch.
    811 * This is the slow path after the fcd of the first character and
    812 * the last character has been checked by checkForwardExactMatch and we
    813 * determine that the potential match has extra non-ignorable preceding
    814 * ces.
    815 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    816 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    817 * Note here that accents checking are slow and cautioned in the API docs.
    818 * Internal method, status assumed to be a success, caller should check status
    819 * before calling this method
    820 * @param strsrch string search data
    821 * @param start index of the potential unfriendly composite character
    822 * @param end index of the potential unfriendly composite character
    823 * @param status output error status if any.
    824 * @return TRUE if there is non-ignorable accents before at the beginning
    825 *              of the match, FALSE otherwise.
    826 */
    827 
    828 static
    829 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    830                                    int32_t    end,
    831                                    UErrorCode    *status)
    832 {
    833     UBool result = FALSE;
    834     if (strsrch->pattern.hasPrefixAccents) {
    835               int32_t  length = end - start;
    836               int32_t  offset = 0;
    837         const UChar       *text   = strsrch->search->text + start;
    838 
    839         U16_FWD_1(text, offset, length);
    840         // we are only concerned with the first composite character
    841         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    842             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    843                                                        text, 0, length);
    844             if (safeoffset != length) {
    845                 safeoffset ++;
    846             }
    847             UChar   *norm = NULL;
    848             UChar    buffer[INITIAL_ARRAY_SIZE_];
    849             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    850                                             buffer, INITIAL_ARRAY_SIZE_,
    851                                             status);
    852             if (U_FAILURE(*status)) {
    853                 return FALSE;
    854             }
    855             if (size >= INITIAL_ARRAY_SIZE_) {
    856                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    857                                                status);
    858                 // if allocation failed, status will be set to
    859                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    860                 // checks for it.
    861                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    862                                        size, status);
    863                 if (U_FAILURE(*status) && norm != NULL) {
    864                     uprv_free(norm);
    865                     return FALSE;
    866                 }
    867             }
    868             else {
    869                 norm = buffer;
    870             }
    871 
    872             UCollationElements *coleiter  = strsrch->utilIter;
    873             ucol_setText(coleiter, norm, size, status);
    874             uint32_t            firstce   = strsrch->pattern.ces[0];
    875             UBool               ignorable = TRUE;
    876             uint32_t            ce        = UCOL_IGNORABLE;
    877             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    878                 offset = ucol_getOffset(coleiter);
    879                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    880                     ignorable = FALSE;
    881                 }
    882                 ce = ucol_next(coleiter, status);
    883             }
    884             UChar32 codepoint;
    885             U16_PREV(norm, 0, offset, codepoint);
    886             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    887 
    888             if (norm != buffer) {
    889                 uprv_free(norm);
    890             }
    891         }
    892     }
    893 
    894     return result;
    895 }
    896 
    897 /**
    898 * Used by exact matches, checks if there are accents before the match.
    899 * This is really painful... we have to check that composite characters at
    900 * the start of the matches have to not have any extra accents.
    901 * We check the FCD of the character first, if it starts with an accent and
    902 * the first pattern ce does not match the first ce of the character, we bail.
    903 * Otherwise we try normalizing the first composite
    904 * character and find the immediate decomposed character before the match to
    905 * see if it is an non-ignorable accent.
    906 * Now normalizing the first composite character is enough because we ensure
    907 * that when the match is passed in here with extra beginning ces, the
    908 * first or last ce that match has to occur within the first character.
    909 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    910 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    911 * Note here that accents checking are slow and cautioned in the API docs.
    912 * @param strsrch string search data
    913 * @param start offset
    914 * @param end offset
    915 * @return TRUE if there are accents on either side of the match,
    916 *         FALSE otherwise
    917 */
    918 static
    919 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    920                                   int32_t    end)
    921 {
    922     if (strsrch->pattern.hasPrefixAccents) {
    923         UCollationElements *coleiter  = strsrch->textIter;
    924         UErrorCode          status    = U_ZERO_ERROR;
    925         // we have been iterating forwards previously
    926         uint32_t            ignorable = TRUE;
    927         int32_t             firstce   = strsrch->pattern.ces[0];
    928 
    929         setColEIterOffset(coleiter, start);
    930         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    931         if (U_FAILURE(status)) {
    932             return TRUE;
    933         }
    934         while (ce != firstce) {
    935             if (ce != UCOL_IGNORABLE) {
    936                 ignorable = FALSE;
    937             }
    938             ce = getCE(strsrch, ucol_next(coleiter, &status));
    939             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    940                 return TRUE;
    941             }
    942         }
    943         if (!ignorable && inNormBuf(coleiter)) {
    944             // within normalization buffer, discontiguous handled here
    945             return TRUE;
    946         }
    947 
    948         // within text
    949         int32_t temp = start;
    950         // original code
    951         // accent = (getFCD(strsrch->search->text, &temp,
    952         //                  strsrch->search->textLength)
    953         //            >> SECOND_LAST_BYTE_SHIFT_);
    954         // however this code does not work well with VC7 .net in release mode.
    955         // maybe the inlines for getFCD combined with shifting has bugs in
    956         // VC7. anyways this is a work around.
    957         UBool accent = getFCD(strsrch->search->text, &temp,
    958                               strsrch->search->textLength) > 0xFF;
    959         if (!accent) {
    960             return checkExtraMatchAccents(strsrch, start, end, &status);
    961         }
    962         if (!ignorable) {
    963             return TRUE;
    964         }
    965         if (start > 0) {
    966             temp = start;
    967             U16_BACK_1(strsrch->search->text, 0, temp);
    968             if (getFCD(strsrch->search->text, &temp,
    969                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    970                 setColEIterOffset(coleiter, start);
    971                 ce = ucol_previous(coleiter, &status);
    972                 if (U_FAILURE(status) ||
    973                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    974                     return TRUE;
    975                 }
    976             }
    977         }
    978     }
    979 
    980     return FALSE;
    981 }
    982 
    983 /**
    984 * Used by exact matches, checks if there are accents bounding the match.
    985 * Note this is the initial boundary check. If the potential match
    986 * starts or ends with composite characters, the accents in those
    987 * characters will be determined later.
    988 * Not doing backwards iteration here, since discontiguos contraction for
    989 * backwards collation element iterator, use up too many characters.
    990 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
    991 * should fail since there is a acute at the end of \u01FA
    992 * Note here that accents checking are slow and cautioned in the API docs.
    993 * @param strsrch string search data
    994 * @param start offset of match
    995 * @param end end offset of the match
    996 * @return TRUE if there are accents on either side of the match,
    997 *         FALSE otherwise
    998 */
    999 static
   1000 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
   1001                                  int32_t    end)
   1002 {
   1003     if (strsrch->pattern.hasSuffixAccents) {
   1004         const UChar       *text       = strsrch->search->text;
   1005               int32_t  temp       = end;
   1006               int32_t      textlength = strsrch->search->textLength;
   1007         U16_BACK_1(text, 0, temp);
   1008         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1009             int32_t             firstce  = strsrch->pattern.ces[0];
   1010             UCollationElements *coleiter = strsrch->textIter;
   1011             UErrorCode          status   = U_ZERO_ERROR;
   1012             int32_t ce;
   1013             setColEIterOffset(coleiter, start);
   1014             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1015                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1016                     return TRUE;
   1017                 }
   1018             }
   1019             int32_t count = 1;
   1020             while (count < strsrch->pattern.cesLength) {
   1021                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1022                     == UCOL_IGNORABLE) {
   1023                     // Thai can give an ignorable here.
   1024                     count --;
   1025                 }
   1026                 if (U_FAILURE(status)) {
   1027                     return TRUE;
   1028                 }
   1029                 count ++;
   1030             }
   1031 
   1032             ce = ucol_next(coleiter, &status);
   1033             if (U_FAILURE(status)) {
   1034                 return TRUE;
   1035             }
   1036             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1037                 ce = getCE(strsrch, ce);
   1038             }
   1039             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1040                 if (ucol_getOffset(coleiter) <= end) {
   1041                     return TRUE;
   1042                 }
   1043                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1044                     return TRUE;
   1045                 }
   1046             }
   1047         }
   1048     }
   1049     return FALSE;
   1050 }
   1051 #endif // #if BOYER_MOORE
   1052 
   1053 /**
   1054 * Checks if the offset runs out of the text string
   1055 * @param offset
   1056 * @param textlength of the text string
   1057 * @return TRUE if offset is out of bounds, FALSE otherwise
   1058 */
   1059 static
   1060 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1061 {
   1062     return offset < 0 || offset > textlength;
   1063 }
   1064 
   1065 /**
   1066 * Checks for identical match
   1067 * @param strsrch string search data
   1068 * @param start offset of possible match
   1069 * @param end offset of possible match
   1070 * @return TRUE if identical match is found
   1071 */
   1072 static
   1073 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1074                                   int32_t    end)
   1075 {
   1076     if (strsrch->strength != UCOL_IDENTICAL) {
   1077         return TRUE;
   1078     }
   1079 
   1080     // Note: We could use Normalizer::compare() or similar, but for short strings
   1081     // which may not be in FCD it might be faster to just NFD them.
   1082     UErrorCode status = U_ZERO_ERROR;
   1083     UnicodeString t2, p2;
   1084     strsrch->nfd->normalize(
   1085         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
   1086     strsrch->nfd->normalize(
   1087         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
   1088     // return FALSE if NFD failed
   1089     return U_SUCCESS(status) && t2 == p2;
   1090 }
   1091 
   1092 #if BOYER_MOORE
   1093 /**
   1094 * Checks to see if the match is repeated
   1095 * @param strsrch string search data
   1096 * @param start new match start index
   1097 * @param end new match end index
   1098 * @return TRUE if the the match is repeated, FALSE otherwise
   1099 */
   1100 static
   1101 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1102                                 int32_t    start,
   1103                                 int32_t    end)
   1104 {
   1105     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1106     UBool       result;
   1107     if (lastmatchindex == USEARCH_DONE) {
   1108         return FALSE;
   1109     }
   1110     if (strsrch->search->isForwardSearching) {
   1111         result = start <= lastmatchindex;
   1112     }
   1113     else {
   1114         result = start >= lastmatchindex;
   1115     }
   1116     if (!result && !strsrch->search->isOverlap) {
   1117         if (strsrch->search->isForwardSearching) {
   1118             result = start < lastmatchindex + strsrch->search->matchedLength;
   1119         }
   1120         else {
   1121             result = end > lastmatchindex;
   1122         }
   1123     }
   1124     return result;
   1125 }
   1126 
   1127 /**
   1128 * Gets the collation element iterator's current offset.
   1129 * @param coleiter collation element iterator
   1130 * @param forwards flag TRUE if we are moving in th forwards direction
   1131 * @return current offset
   1132 */
   1133 static
   1134 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1135                                               UBool               forwards)
   1136 {
   1137     int32_t result = ucol_getOffset(coleiter);
   1138     // intricacies of the the backwards collation element iterator
   1139     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1140         result ++;
   1141     }
   1142     return result;
   1143 }
   1144 
   1145 /**
   1146 * Checks match for contraction.
   1147 * If the match ends with a partial contraction we fail.
   1148 * If the match starts too far off (because of backwards iteration) we try to
   1149 * chip off the extra characters depending on whether a breakiterator has
   1150 * been used.
   1151 * Internal method, error assumed to be success, caller has to check status
   1152 * before calling this method.
   1153 * @param strsrch string search data
   1154 * @param start offset of potential match, to be modified if necessary
   1155 * @param end offset of potential match, to be modified if necessary
   1156 * @param status output error status if any
   1157 * @return TRUE if match passes the contraction test, FALSE otherwise
   1158 */
   1159 
   1160 static
   1161 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1162                                      int32_t   *start,
   1163                                      int32_t   *end, UErrorCode  *status)
   1164 {
   1165           UCollationElements *coleiter   = strsrch->textIter;
   1166           int32_t             textlength = strsrch->search->textLength;
   1167           int32_t             temp       = *start;
   1168     const UCollator          *collator   = strsrch->collator;
   1169     const UChar              *text       = strsrch->search->text;
   1170     // This part checks if either ends of the match contains potential
   1171     // contraction. If so we'll have to iterate through them
   1172     // The start contraction needs to be checked since ucol_previous dumps
   1173     // all characters till the first safe character into the buffer.
   1174     // *start + 1 is used to test for the unsafe characters instead of *start
   1175     // because ucol_prev takes all unsafe characters till the first safe
   1176     // character ie *start. so by testing *start + 1, we can estimate if
   1177     // excess prefix characters has been included in the potential search
   1178     // results.
   1179     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1180         (*start + 1 < textlength
   1181          && ucol_unsafeCP(text[*start + 1], collator))) {
   1182         int32_t expansion  = getExpansionPrefix(coleiter);
   1183         UBool   expandflag = expansion > 0;
   1184         setColEIterOffset(coleiter, *start);
   1185         while (expansion > 0) {
   1186             // getting rid of the redundant ce, caused by setOffset.
   1187             // since backward contraction/expansion may have extra ces if we
   1188             // are in the normalization buffer, hasAccentsBeforeMatch would
   1189             // have taken care of it.
   1190             // E.g. the character \u01FA will have an expansion of 3, but if
   1191             // we are only looking for acute and ring \u030A and \u0301, we'll
   1192             // have to skip the first ce in the expansion buffer.
   1193             ucol_next(coleiter, status);
   1194             if (U_FAILURE(*status)) {
   1195                 return FALSE;
   1196             }
   1197             if (ucol_getOffset(coleiter) != temp) {
   1198                 *start = temp;
   1199                 temp  = ucol_getOffset(coleiter);
   1200             }
   1201             expansion --;
   1202         }
   1203 
   1204         int32_t  *patternce       = strsrch->pattern.ces;
   1205         int32_t   patterncelength = strsrch->pattern.cesLength;
   1206         int32_t   count           = 0;
   1207         while (count < patterncelength) {
   1208             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1209             if (ce == UCOL_IGNORABLE) {
   1210                 continue;
   1211             }
   1212             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1213                 *start = temp;
   1214                 temp   = ucol_getOffset(coleiter);
   1215             }
   1216             if (U_FAILURE(*status) || ce != patternce[count]) {
   1217                 (*end) ++;
   1218                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1219                 return FALSE;
   1220             }
   1221             count ++;
   1222         }
   1223     }
   1224     return TRUE;
   1225 }
   1226 
   1227 /**
   1228 * Checks and sets the match information if found.
   1229 * Checks
   1230 * <ul>
   1231 * <li> the potential match does not repeat the previous match
   1232 * <li> boundaries are correct
   1233 * <li> exact matches has no extra accents
   1234 * <li> identical matchesb
   1235 * <li> potential match does not end in the middle of a contraction
   1236 * <\ul>
   1237 * Otherwise the offset will be shifted to the next character.
   1238 * Internal method, status assumed to be success, caller has to check status
   1239 * before calling this method.
   1240 * @param strsrch string search data
   1241 * @param textoffset offset in the collation element text. the returned value
   1242 *        will be the truncated end offset of the match or the new start
   1243 *        search offset.
   1244 * @param status output error status if any
   1245 * @return TRUE if the match is valid, FALSE otherwise
   1246 */
   1247 static
   1248 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1249                                  int32_t   *textoffset, UErrorCode *status)
   1250 {
   1251     UCollationElements *coleiter = strsrch->textIter;
   1252     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1253 
   1254     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1255         return FALSE;
   1256     }
   1257 
   1258     // this totally matches, however we need to check if it is repeating
   1259     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1260         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1261         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1262         !checkIdentical(strsrch, start, *textoffset) ||
   1263         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1264 
   1265         (*textoffset) ++;
   1266         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1267         return FALSE;
   1268     }
   1269 
   1270     //Add breakiterator boundary check for primary strength search.
   1271     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1272         checkBreakBoundary(strsrch, &start, textoffset);
   1273     }
   1274 
   1275     // totally match, we will get rid of the ending ignorables.
   1276     strsrch->search->matchedIndex  = start;
   1277     strsrch->search->matchedLength = *textoffset - start;
   1278     return TRUE;
   1279 }
   1280 
   1281 /**
   1282 * Getting the previous base character offset, or the current offset if the
   1283 * current character is a base character
   1284 * @param text string
   1285 * @param textoffset one offset after the current character
   1286 * @return the offset of the next character after the base character or the first
   1287 *         composed character with accents
   1288 */
   1289 static
   1290 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1291                                                int32_t  textoffset)
   1292 {
   1293     if (textoffset > 0) {
   1294         for (;;) {
   1295             int32_t result = textoffset;
   1296             U16_BACK_1(text, 0, textoffset);
   1297             int32_t temp = textoffset;
   1298             uint16_t fcd = getFCD(text, &temp, result);
   1299             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1300                 if (fcd & LAST_BYTE_MASK_) {
   1301                     return textoffset;
   1302                 }
   1303                 return result;
   1304             }
   1305             if (textoffset == 0) {
   1306                 return 0;
   1307             }
   1308         }
   1309     }
   1310     return textoffset;
   1311 }
   1312 
   1313 /**
   1314 * Getting the indexes of the accents that are not blocked in the argument
   1315 * accent array
   1316 * @param accents array of accents in nfd terminated by a 0.
   1317 * @param accentsindex array of indexes of the accents that are not blocked
   1318 */
   1319 static
   1320 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1321 {
   1322     int32_t index     = 0;
   1323     int32_t     length    = u_strlen(accents);
   1324     UChar32     codepoint = 0;
   1325     int         cclass    = 0;
   1326     int         result    = 0;
   1327     int32_t temp;
   1328     while (index < length) {
   1329         temp = index;
   1330         U16_NEXT(accents, index, length, codepoint);
   1331         if (u_getCombiningClass(codepoint) != cclass) {
   1332             cclass        = u_getCombiningClass(codepoint);
   1333             accentsindex[result] = temp;
   1334             result ++;
   1335         }
   1336     }
   1337     accentsindex[result] = length;
   1338     return result;
   1339 }
   1340 
   1341 /**
   1342 * Appends 3 UChar arrays to a destination array.
   1343 * Creates a new array if we run out of space. The caller will have to
   1344 * manually deallocate the newly allocated array.
   1345 * Internal method, status assumed to be success, caller has to check status
   1346 * before calling this method. destination not to be NULL and has at least
   1347 * size destinationlength.
   1348 * @param destination target array
   1349 * @param destinationlength target array size, returning the appended length
   1350 * @param source1 null-terminated first array
   1351 * @param source2 second array
   1352 * @param source2length length of seond array
   1353 * @param source3 null-terminated third array
   1354 * @param status error status if any
   1355 * @return new destination array, destination if there was no new allocation
   1356 */
   1357 static
   1358 inline UChar * addToUCharArray(      UChar      *destination,
   1359                                      int32_t    *destinationlength,
   1360                                const UChar      *source1,
   1361                                const UChar      *source2,
   1362                                      int32_t     source2length,
   1363                                const UChar      *source3,
   1364                                      UErrorCode *status)
   1365 {
   1366     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1367     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1368     if (*destinationlength < source1length + source2length + source3length +
   1369                                                                            1)
   1370     {
   1371         destination = (UChar *)allocateMemory(
   1372           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1373           status);
   1374         // if error allocating memory, status will be
   1375         // U_MEMORY_ALLOCATION_ERROR
   1376         if (U_FAILURE(*status)) {
   1377             *destinationlength = 0;
   1378             return NULL;
   1379         }
   1380     }
   1381     if (source1length != 0) {
   1382         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
   1383     }
   1384     if (source2length != 0) {
   1385         uprv_memcpy(destination + source1length, source2,
   1386                     sizeof(UChar) * source2length);
   1387     }
   1388     if (source3length != 0) {
   1389         uprv_memcpy(destination + source1length + source2length, source3,
   1390                     sizeof(UChar) * source3length);
   1391     }
   1392     *destinationlength = source1length + source2length + source3length;
   1393     return destination;
   1394 }
   1395 
   1396 /**
   1397 * Running through a collation element iterator to see if the contents matches
   1398 * pattern in string search data
   1399 * @param strsrch string search data
   1400 * @param coleiter collation element iterator
   1401 * @return TRUE if a match if found, FALSE otherwise
   1402 */
   1403 static
   1404 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1405                                        UCollationElements *coleiter)
   1406 {
   1407     int         patternceindex = strsrch->pattern.cesLength;
   1408     int32_t    *patternce      = strsrch->pattern.ces;
   1409     UErrorCode  status = U_ZERO_ERROR;
   1410     while (patternceindex > 0) {
   1411         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1412         if (ce == UCOL_IGNORABLE) {
   1413             continue;
   1414         }
   1415         if (U_FAILURE(status) || ce != *patternce) {
   1416             return FALSE;
   1417         }
   1418         patternce ++;
   1419         patternceindex --;
   1420     }
   1421     return TRUE;
   1422 }
   1423 
   1424 /**
   1425 * Rearranges the front accents to try matching.
   1426 * Prefix accents in the text will be grouped according to their combining
   1427 * class and the groups will be mixed and matched to try find the perfect
   1428 * match with the pattern.
   1429 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1430 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1431 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1432 *         "\u0301\u0325".
   1433 * step 2: check if any of the generated substrings matches the pattern.
   1434 * Internal method, status is assumed to be success, caller has to check status
   1435 * before calling this method.
   1436 * @param strsrch string search match
   1437 * @param start first offset of the accents to start searching
   1438 * @param end start of the last accent set
   1439 * @param status output error status if any
   1440 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1441 *         offset of the match. Note this start includes all preceding accents.
   1442 */
   1443 static
   1444 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1445                                        int32_t    start,
   1446                                        int32_t    end,
   1447                                        UErrorCode    *status)
   1448 {
   1449     const UChar       *text       = strsrch->search->text;
   1450           int32_t      textlength = strsrch->search->textLength;
   1451           int32_t  tempstart  = start;
   1452 
   1453     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1454         // die... failed at a base character
   1455         return USEARCH_DONE;
   1456     }
   1457 
   1458     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1459     start = getPreviousBaseOffset(text, tempstart);
   1460 
   1461     UChar       accents[INITIAL_ARRAY_SIZE_];
   1462     // normalizing the offensive string
   1463     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1464                     INITIAL_ARRAY_SIZE_, status);
   1465     if (U_FAILURE(*status)) {
   1466         return USEARCH_DONE;
   1467     }
   1468 
   1469     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1470     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1471                                                                  accentsindex);
   1472     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1473     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1474     UCollationElements *coleiter   = strsrch->utilIter;
   1475     while (U_SUCCESS(*status) && count > 0) {
   1476         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1477         // copy the base characters
   1478         for (int k = 0; k < accentsindex[0]; k ++) {
   1479             *rearrange ++ = accents[k];
   1480         }
   1481         // forming all possible canonical rearrangement by dropping
   1482         // sets of accents
   1483         for (int i = 0; i <= accentsize - 1; i ++) {
   1484             int32_t mask = 1 << (accentsize - i - 1);
   1485             if (count & mask) {
   1486                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1487                     *rearrange ++ = accents[j];
   1488                 }
   1489             }
   1490         }
   1491         *rearrange = 0;
   1492         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1493         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1494                                            strsrch->canonicalPrefixAccents,
   1495                                            strsrch->search->text + offset,
   1496                                            end - offset,
   1497                                            strsrch->canonicalSuffixAccents,
   1498                                            status);
   1499 
   1500         // if status is a failure, ucol_setText does nothing.
   1501         // run the collator iterator through this match
   1502         ucol_setText(coleiter, match, matchsize, status);
   1503         if (U_SUCCESS(*status)) {
   1504             if (checkCollationMatch(strsrch, coleiter)) {
   1505                 if (match != buffer) {
   1506                     uprv_free(match);
   1507                 }
   1508                 return start;
   1509             }
   1510         }
   1511         count --;
   1512     }
   1513     return USEARCH_DONE;
   1514 }
   1515 
   1516 /**
   1517 * Gets the offset to the safe point in text before textoffset.
   1518 * ie. not the middle of a contraction, swappable characters or supplementary
   1519 * characters.
   1520 * @param collator collation sata
   1521 * @param text string to work with
   1522 * @param textoffset offset in string
   1523 * @param textlength length of text string
   1524 * @return offset to the previous safe character
   1525 */
   1526 static
   1527 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1528                                       const UChar       *text,
   1529                                             int32_t  textoffset)
   1530 {
   1531     int32_t result = textoffset; // first contraction character
   1532     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1533         result --;
   1534     }
   1535     if (result != 0) {
   1536         // the first contraction character is consider unsafe here
   1537         result --;
   1538     }
   1539     return result;
   1540 }
   1541 
   1542 /**
   1543 * Cleaning up after we passed the safe zone
   1544 * @param strsrch string search data
   1545 * @param safetext safe text array
   1546 * @param safebuffer safe text buffer
   1547 * @param coleiter collation element iterator for safe text
   1548 */
   1549 static
   1550 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1551                                   UChar         *safebuffer)
   1552 {
   1553     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1554     {
   1555        uprv_free(safetext);
   1556     }
   1557 }
   1558 
   1559 /**
   1560 * Take the rearranged end accents and tries matching. If match failed at
   1561 * a seperate preceding set of accents (seperated from the rearranged on by
   1562 * at least a base character) then we rearrange the preceding accents and
   1563 * tries matching again.
   1564 * We allow skipping of the ends of the accent set if the ces do not match.
   1565 * However if the failure is found before the accent set, it fails.
   1566 * Internal method, status assumed to be success, caller has to check status
   1567 * before calling this method.
   1568 * @param strsrch string search data
   1569 * @param textoffset of the start of the rearranged accent
   1570 * @param status output error status if any
   1571 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1572 *         offset of the match. Note this start includes all preceding accents.
   1573 */
   1574 static
   1575 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1576                                        int32_t    textoffset,
   1577                                        UErrorCode    *status)
   1578 {
   1579     const UChar              *text           = strsrch->search->text;
   1580     const UCollator          *collator       = strsrch->collator;
   1581           int32_t             safelength     = 0;
   1582           UChar              *safetext;
   1583           int32_t             safetextlength;
   1584           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1585           UCollationElements *coleiter       = strsrch->utilIter;
   1586           int32_t         safeoffset     = textoffset;
   1587 
   1588     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1589                                          collator)) {
   1590         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1591         safelength     = textoffset - safeoffset;
   1592         safetextlength = INITIAL_ARRAY_SIZE_;
   1593         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1594                                          text + safeoffset, safelength,
   1595                                          strsrch->canonicalSuffixAccents,
   1596                                          status);
   1597     }
   1598     else {
   1599         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1600         safetext       = strsrch->canonicalSuffixAccents;
   1601     }
   1602 
   1603     // if status is a failure, ucol_setText does nothing
   1604     ucol_setText(coleiter, safetext, safetextlength, status);
   1605     // status checked in loop below
   1606 
   1607     int32_t  *ce        = strsrch->pattern.ces;
   1608     int32_t   celength  = strsrch->pattern.cesLength;
   1609     int       ceindex   = celength - 1;
   1610     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1611 
   1612     while (ceindex >= 0) {
   1613         int32_t textce = ucol_previous(coleiter, status);
   1614         if (U_FAILURE(*status)) {
   1615             if (isSafe) {
   1616                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1617             }
   1618             return USEARCH_DONE;
   1619         }
   1620         if (textce == UCOL_NULLORDER) {
   1621             // check if we have passed the safe buffer
   1622             if (coleiter == strsrch->textIter) {
   1623                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1624                 return USEARCH_DONE;
   1625             }
   1626             cleanUpSafeText(strsrch, safetext, safebuffer);
   1627             safetext = safebuffer;
   1628             coleiter = strsrch->textIter;
   1629             setColEIterOffset(coleiter, safeoffset);
   1630             // status checked at the start of the loop
   1631             isSafe = FALSE;
   1632             continue;
   1633         }
   1634         textce = getCE(strsrch, textce);
   1635         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1636             // do the beginning stuff
   1637             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1638             if (isSafe && failedoffset >= safelength) {
   1639                 // alas... no hope. failed at rearranged accent set
   1640                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1641                 return USEARCH_DONE;
   1642             }
   1643             else {
   1644                 if (isSafe) {
   1645                     failedoffset += safeoffset;
   1646                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1647                 }
   1648 
   1649                 // try rearranging the front accents
   1650                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1651                                         failedoffset, textoffset, status);
   1652                 if (result != USEARCH_DONE) {
   1653                     // if status is a failure, ucol_setOffset does nothing
   1654                     setColEIterOffset(strsrch->textIter, result);
   1655                 }
   1656                 if (U_FAILURE(*status)) {
   1657                     return USEARCH_DONE;
   1658                 }
   1659                 return result;
   1660             }
   1661         }
   1662         if (textce == ce[ceindex]) {
   1663             ceindex --;
   1664         }
   1665     }
   1666     // set offset here
   1667     if (isSafe) {
   1668         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1669         // sets the text iterator here with the correct expansion and offset
   1670         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1671         cleanUpSafeText(strsrch, safetext, safebuffer);
   1672         if (result >= safelength) {
   1673             result = textoffset;
   1674         }
   1675         else {
   1676             result += safeoffset;
   1677         }
   1678         setColEIterOffset(strsrch->textIter, result);
   1679         strsrch->textIter->iteratordata_.toReturn =
   1680                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1681         return result;
   1682     }
   1683 
   1684     return ucol_getOffset(coleiter);
   1685 }
   1686 
   1687 /**
   1688 * Trying out the substring and sees if it can be a canonical match.
   1689 * This will try normalizing the end accents and arranging them into canonical
   1690 * equivalents and check their corresponding ces with the pattern ce.
   1691 * Suffix accents in the text will be grouped according to their combining
   1692 * class and the groups will be mixed and matched to try find the perfect
   1693 * match with the pattern.
   1694 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1695 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1696 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1697 *         "\u0301\u0325".
   1698 * step 2: check if any of the generated substrings matches the pattern.
   1699 * Internal method, status assumed to be success, caller has to check status
   1700 * before calling this method.
   1701 * @param strsrch string search data
   1702 * @param textoffset end offset in the collation element text that ends with
   1703 *                   the accents to be rearranged
   1704 * @param status error status if any
   1705 * @return TRUE if the match is valid, FALSE otherwise
   1706 */
   1707 static
   1708 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1709                            int32_t    textoffset,
   1710                            UErrorCode    *status)
   1711 {
   1712     const UChar       *text = strsrch->search->text;
   1713           int32_t  temp = textoffset;
   1714     U16_BACK_1(text, 0, temp);
   1715     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1716         UCollationElements *coleiter = strsrch->textIter;
   1717         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1718         if (strsrch->pattern.hasPrefixAccents) {
   1719             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1720                                                 status);
   1721             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1722                 setColEIterOffset(coleiter, offset);
   1723                 return TRUE;
   1724             }
   1725         }
   1726         return FALSE;
   1727     }
   1728 
   1729     if (!strsrch->pattern.hasSuffixAccents) {
   1730         return FALSE;
   1731     }
   1732 
   1733     UChar       accents[INITIAL_ARRAY_SIZE_];
   1734     // offset to the last base character in substring to search
   1735     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1736     // normalizing the offensive string
   1737     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1738                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1739     // status checked in loop below
   1740 
   1741     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1742     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1743 
   1744     // 2 power n - 1 plus the full set of accents
   1745     int32_t  count = (2 << (size - 1)) - 1;
   1746     while (U_SUCCESS(*status) && count > 0) {
   1747         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1748         // copy the base characters
   1749         for (int k = 0; k < accentsindex[0]; k ++) {
   1750             *rearrange ++ = accents[k];
   1751         }
   1752         // forming all possible canonical rearrangement by dropping
   1753         // sets of accents
   1754         for (int i = 0; i <= size - 1; i ++) {
   1755             int32_t mask = 1 << (size - i - 1);
   1756             if (count & mask) {
   1757                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1758                     *rearrange ++ = accents[j];
   1759                 }
   1760             }
   1761         }
   1762         *rearrange = 0;
   1763         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1764                                                         status);
   1765         if (offset != USEARCH_DONE) {
   1766             return TRUE; // match found
   1767         }
   1768         count --;
   1769     }
   1770     return FALSE;
   1771 }
   1772 
   1773 /**
   1774 * Gets the previous base character offset depending on the string search
   1775 * pattern data
   1776 * @param strsrch string search data
   1777 * @param textoffset current offset, current character
   1778 * @return the offset of the next character after this base character or itself
   1779 *         if it is a composed character with accents
   1780 */
   1781 static
   1782 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1783                                                       int32_t textoffset)
   1784 {
   1785     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1786         const UChar       *text = strsrch->search->text;
   1787               int32_t  offset = textoffset;
   1788         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1789                                                    SECOND_LAST_BYTE_SHIFT_) {
   1790             return getPreviousBaseOffset(text, textoffset);
   1791         }
   1792     }
   1793     return textoffset;
   1794 }
   1795 
   1796 /**
   1797 * Checks match for contraction.
   1798 * If the match ends with a partial contraction we fail.
   1799 * If the match starts too far off (because of backwards iteration) we try to
   1800 * chip off the extra characters
   1801 * Internal method, status assumed to be success, caller has to check status
   1802 * before calling this method.
   1803 * @param strsrch string search data
   1804 * @param start offset of potential match, to be modified if necessary
   1805 * @param end offset of potential match, to be modified if necessary
   1806 * @param status output error status if any
   1807 * @return TRUE if match passes the contraction test, FALSE otherwise
   1808 */
   1809 static
   1810 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1811                                          int32_t   *start,
   1812                                          int32_t   *end,
   1813                                          UErrorCode    *status)
   1814 {
   1815           UCollationElements *coleiter   = strsrch->textIter;
   1816           int32_t             textlength = strsrch->search->textLength;
   1817           int32_t         temp       = *start;
   1818     const UCollator          *collator   = strsrch->collator;
   1819     const UChar              *text       = strsrch->search->text;
   1820     // This part checks if either ends of the match contains potential
   1821     // contraction. If so we'll have to iterate through them
   1822     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1823         (*start + 1 < textlength
   1824          && ucol_unsafeCP(text[*start + 1], collator))) {
   1825         int32_t expansion  = getExpansionPrefix(coleiter);
   1826         UBool   expandflag = expansion > 0;
   1827         setColEIterOffset(coleiter, *start);
   1828         while (expansion > 0) {
   1829             // getting rid of the redundant ce, caused by setOffset.
   1830             // since backward contraction/expansion may have extra ces if we
   1831             // are in the normalization buffer, hasAccentsBeforeMatch would
   1832             // have taken care of it.
   1833             // E.g. the character \u01FA will have an expansion of 3, but if
   1834             // we are only looking for acute and ring \u030A and \u0301, we'll
   1835             // have to skip the first ce in the expansion buffer.
   1836             ucol_next(coleiter, status);
   1837             if (U_FAILURE(*status)) {
   1838                 return FALSE;
   1839             }
   1840             if (ucol_getOffset(coleiter) != temp) {
   1841                 *start = temp;
   1842                 temp  = ucol_getOffset(coleiter);
   1843             }
   1844             expansion --;
   1845         }
   1846 
   1847         int32_t  *patternce       = strsrch->pattern.ces;
   1848         int32_t   patterncelength = strsrch->pattern.cesLength;
   1849         int32_t   count           = 0;
   1850         int32_t   textlength      = strsrch->search->textLength;
   1851         while (count < patterncelength) {
   1852             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1853             // status checked below, note that if status is a failure
   1854             // ucol_next returns UCOL_NULLORDER
   1855             if (ce == UCOL_IGNORABLE) {
   1856                 continue;
   1857             }
   1858             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1859                 *start = temp;
   1860                 temp   = ucol_getOffset(coleiter);
   1861             }
   1862 
   1863             if (count == 0 && ce != patternce[0]) {
   1864                 // accents may have extra starting ces, this occurs when a
   1865                 // pure accent pattern is matched without rearrangement
   1866                 // text \u0325\u0300 and looking for \u0300
   1867                 int32_t expected = patternce[0];
   1868                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1869                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1870                     while (U_SUCCESS(*status) && ce != expected &&
   1871                            ce != UCOL_NULLORDER &&
   1872                            ucol_getOffset(coleiter) <= *end) {
   1873                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1874                     }
   1875                 }
   1876             }
   1877             if (U_FAILURE(*status) || ce != patternce[count]) {
   1878                 (*end) ++;
   1879                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1880                 return FALSE;
   1881             }
   1882             count ++;
   1883         }
   1884     }
   1885     return TRUE;
   1886 }
   1887 
   1888 /**
   1889 * Checks and sets the match information if found.
   1890 * Checks
   1891 * <ul>
   1892 * <li> the potential match does not repeat the previous match
   1893 * <li> boundaries are correct
   1894 * <li> potential match does not end in the middle of a contraction
   1895 * <li> identical matches
   1896 * <\ul>
   1897 * Otherwise the offset will be shifted to the next character.
   1898 * Internal method, status assumed to be success, caller has to check the
   1899 * status before calling this method.
   1900 * @param strsrch string search data
   1901 * @param textoffset offset in the collation element text. the returned value
   1902 *        will be the truncated end offset of the match or the new start
   1903 *        search offset.
   1904 * @param status output error status if any
   1905 * @return TRUE if the match is valid, FALSE otherwise
   1906 */
   1907 static
   1908 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1909                                      int32_t   *textoffset,
   1910                                      UErrorCode    *status)
   1911 {
   1912     // to ensure that the start and ends are not composite characters
   1913     UCollationElements *coleiter = strsrch->textIter;
   1914     // if we have a canonical accent match
   1915     if ((strsrch->pattern.hasSuffixAccents &&
   1916         strsrch->canonicalSuffixAccents[0]) ||
   1917         (strsrch->pattern.hasPrefixAccents &&
   1918         strsrch->canonicalPrefixAccents[0])) {
   1919         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1920                                                     strsrch,
   1921                                                     ucol_getOffset(coleiter));
   1922         strsrch->search->matchedLength = *textoffset -
   1923                                                 strsrch->search->matchedIndex;
   1924         return TRUE;
   1925     }
   1926 
   1927     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1928     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1929                                             status) || U_FAILURE(*status)) {
   1930         return FALSE;
   1931     }
   1932 
   1933     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1934     // this totally matches, however we need to check if it is repeating
   1935     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1936         !isBreakUnit(strsrch, start, *textoffset) ||
   1937         !checkIdentical(strsrch, start, *textoffset)) {
   1938         (*textoffset) ++;
   1939         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1940                                         strsrch->search->textLength);
   1941         return FALSE;
   1942     }
   1943 
   1944     strsrch->search->matchedIndex  = start;
   1945     strsrch->search->matchedLength = *textoffset - start;
   1946     return TRUE;
   1947 }
   1948 
   1949 /**
   1950 * Shifting the collation element iterator position forward to prepare for
   1951 * a preceding match. If the first character is a unsafe character, we'll only
   1952 * shift by 1 to capture contractions, normalization etc.
   1953 * Internal method, status assumed to be success, caller has to check status
   1954 * before calling this method.
   1955 * @param text strsrch string search data
   1956 * @param textoffset start text position to do search
   1957 * @param ce the text ce which failed the match.
   1958 * @param patternceindex index of the ce within the pattern ce buffer which
   1959 *        failed the match
   1960 * @return final offset
   1961 */
   1962 static
   1963 inline int32_t reverseShift(UStringSearch *strsrch,
   1964                                 int32_t    textoffset,
   1965                                 int32_t       ce,
   1966                                 int32_t        patternceindex)
   1967 {
   1968     if (strsrch->search->isOverlap) {
   1969         if (textoffset != strsrch->search->textLength) {
   1970             textoffset --;
   1971         }
   1972         else {
   1973             textoffset -= strsrch->pattern.defaultShiftSize;
   1974         }
   1975     }
   1976     else {
   1977         if (ce != UCOL_NULLORDER) {
   1978             int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
   1979 
   1980             // this is to adjust for characters in the middle of the substring
   1981             // for matching that failed.
   1982             int32_t adjust = patternceindex;
   1983             if (adjust > 1 && shift > adjust) {
   1984                 shift -= adjust - 1;
   1985             }
   1986             textoffset -= shift;
   1987         }
   1988         else {
   1989             textoffset -= strsrch->pattern.defaultShiftSize;
   1990         }
   1991     }
   1992     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   1993     return textoffset;
   1994 }
   1995 
   1996 /**
   1997 * Checks match for contraction.
   1998 * If the match starts with a partial contraction we fail.
   1999 * Internal method, status assumed to be success, caller has to check status
   2000 * before calling this method.
   2001 * @param strsrch string search data
   2002 * @param start offset of potential match, to be modified if necessary
   2003 * @param end offset of potential match, to be modified if necessary
   2004 * @param status output error status if any
   2005 * @return TRUE if match passes the contraction test, FALSE otherwise
   2006 */
   2007 static
   2008 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2009                                      int32_t   *start,
   2010                                      int32_t   *end, UErrorCode  *status)
   2011 {
   2012           UCollationElements *coleiter   = strsrch->textIter;
   2013           int32_t             textlength = strsrch->search->textLength;
   2014           int32_t             temp       = *end;
   2015     const UCollator          *collator   = strsrch->collator;
   2016     const UChar              *text       = strsrch->search->text;
   2017     // This part checks if either if the start of the match contains potential
   2018     // contraction. If so we'll have to iterate through them
   2019     // Since we used ucol_next while previously looking for the potential
   2020     // match, this guarantees that our end will not be a partial contraction,
   2021     // or a partial supplementary character.
   2022     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2023         int32_t expansion  = getExpansionSuffix(coleiter);
   2024         UBool   expandflag = expansion > 0;
   2025         setColEIterOffset(coleiter, *end);
   2026         while (U_SUCCESS(*status) && expansion > 0) {
   2027             // getting rid of the redundant ce
   2028             // since forward contraction/expansion may have extra ces
   2029             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2030             // would have taken care of it.
   2031             // E.g. the character \u01FA will have an expansion of 3, but if
   2032             // we are only looking for A ring A\u030A, we'll have to skip the
   2033             // last ce in the expansion buffer
   2034             ucol_previous(coleiter, status);
   2035             if (U_FAILURE(*status)) {
   2036                 return FALSE;
   2037             }
   2038             if (ucol_getOffset(coleiter) != temp) {
   2039                 *end = temp;
   2040                 temp  = ucol_getOffset(coleiter);
   2041             }
   2042             expansion --;
   2043         }
   2044 
   2045         int32_t  *patternce       = strsrch->pattern.ces;
   2046         int32_t   patterncelength = strsrch->pattern.cesLength;
   2047         int32_t   count           = patterncelength;
   2048         while (count > 0) {
   2049             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2050             // status checked below, note that if status is a failure
   2051             // ucol_previous returns UCOL_NULLORDER
   2052             if (ce == UCOL_IGNORABLE) {
   2053                 continue;
   2054             }
   2055             if (expandflag && count == 0 &&
   2056                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2057                 *end = temp;
   2058                 temp  = ucol_getOffset(coleiter);
   2059             }
   2060             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2061                 (*start) --;
   2062                 *start = getPreviousBaseOffset(text, *start);
   2063                 return FALSE;
   2064             }
   2065             count --;
   2066         }
   2067     }
   2068     return TRUE;
   2069 }
   2070 
   2071 /**
   2072 * Checks and sets the match information if found.
   2073 * Checks
   2074 * <ul>
   2075 * <li> the current match does not repeat the last match
   2076 * <li> boundaries are correct
   2077 * <li> exact matches has no extra accents
   2078 * <li> identical matches
   2079 * <\ul>
   2080 * Otherwise the offset will be shifted to the preceding character.
   2081 * Internal method, status assumed to be success, caller has to check status
   2082 * before calling this method.
   2083 * @param strsrch string search data
   2084 * @param collator
   2085 * @param coleiter collation element iterator
   2086 * @param text string
   2087 * @param textoffset offset in the collation element text. the returned value
   2088 *        will be the truncated start offset of the match or the new start
   2089 *        search offset.
   2090 * @param status output error status if any
   2091 * @return TRUE if the match is valid, FALSE otherwise
   2092 */
   2093 static
   2094 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2095                                      int32_t   *textoffset,
   2096                                      UErrorCode    *status)
   2097 {
   2098     // to ensure that the start and ends are not composite characters
   2099     int32_t end = ucol_getOffset(strsrch->textIter);
   2100     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2101         || U_FAILURE(*status)) {
   2102             return FALSE;
   2103     }
   2104 
   2105     // this totally matches, however we need to check if it is repeating
   2106     // the old match
   2107     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2108         !isBreakUnit(strsrch, *textoffset, end) ||
   2109         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2110         !checkIdentical(strsrch, *textoffset, end) ||
   2111         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2112         (*textoffset) --;
   2113         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2114                                             *textoffset);
   2115         return FALSE;
   2116     }
   2117 
   2118     //Add breakiterator boundary check for primary strength search.
   2119     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2120         checkBreakBoundary(strsrch, textoffset, &end);
   2121     }
   2122 
   2123     strsrch->search->matchedIndex = *textoffset;
   2124     strsrch->search->matchedLength = end - *textoffset;
   2125     return TRUE;
   2126 }
   2127 
   2128 /**
   2129 * Rearranges the end accents to try matching.
   2130 * Suffix accents in the text will be grouped according to their combining
   2131 * class and the groups will be mixed and matched to try find the perfect
   2132 * match with the pattern.
   2133 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2134 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2135 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2136 *         "\u0301\u0325".
   2137 * step 2: check if any of the generated substrings matches the pattern.
   2138 * Internal method, status assumed to be success, user has to check status
   2139 * before calling this method.
   2140 * @param strsrch string search match
   2141 * @param start offset of the first base character
   2142 * @param end start of the last accent set
   2143 * @param status only error status if any
   2144 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2145 *         offset of the match. Note this start includes all following accents.
   2146 */
   2147 static
   2148 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2149                                            int32_t    start,
   2150                                            int32_t    end,
   2151                                            UErrorCode    *status)
   2152 {
   2153     const UChar       *text       = strsrch->search->text;
   2154           int32_t  tempend    = end;
   2155 
   2156     U16_BACK_1(text, 0, tempend);
   2157     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2158                                                            LAST_BYTE_MASK_)) {
   2159         // die... failed at a base character
   2160         return USEARCH_DONE;
   2161     }
   2162     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2163 
   2164     if (U_SUCCESS(*status)) {
   2165         UChar       accents[INITIAL_ARRAY_SIZE_];
   2166         int32_t offset = getPreviousBaseOffset(text, end);
   2167         // normalizing the offensive string
   2168         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2169                         INITIAL_ARRAY_SIZE_, status);
   2170 
   2171         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2172         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2173                                                          accentsindex);
   2174         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2175         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2176         UCollationElements *coleiter = strsrch->utilIter;
   2177         while (U_SUCCESS(*status) && count > 0) {
   2178             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2179             // copy the base characters
   2180             for (int k = 0; k < accentsindex[0]; k ++) {
   2181                 *rearrange ++ = accents[k];
   2182             }
   2183             // forming all possible canonical rearrangement by dropping
   2184             // sets of accents
   2185             for (int i = 0; i <= accentsize - 1; i ++) {
   2186                 int32_t mask = 1 << (accentsize - i - 1);
   2187                 if (count & mask) {
   2188                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2189                         *rearrange ++ = accents[j];
   2190                     }
   2191                 }
   2192             }
   2193             *rearrange = 0;
   2194             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2195             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2196                                            strsrch->canonicalPrefixAccents,
   2197                                            strsrch->search->text + start,
   2198                                            offset - start,
   2199                                            strsrch->canonicalSuffixAccents,
   2200                                            status);
   2201 
   2202             // run the collator iterator through this match
   2203             // if status is a failure ucol_setText does nothing
   2204             ucol_setText(coleiter, match, matchsize, status);
   2205             if (U_SUCCESS(*status)) {
   2206                 if (checkCollationMatch(strsrch, coleiter)) {
   2207                     if (match != buffer) {
   2208                         uprv_free(match);
   2209                     }
   2210                     return end;
   2211                 }
   2212             }
   2213             count --;
   2214         }
   2215     }
   2216     return USEARCH_DONE;
   2217 }
   2218 
   2219 /**
   2220 * Take the rearranged start accents and tries matching. If match failed at
   2221 * a seperate following set of accents (seperated from the rearranged on by
   2222 * at least a base character) then we rearrange the preceding accents and
   2223 * tries matching again.
   2224 * We allow skipping of the ends of the accent set if the ces do not match.
   2225 * However if the failure is found before the accent set, it fails.
   2226 * Internal method, status assumed to be success, caller has to check status
   2227 * before calling this method.
   2228 * @param strsrch string search data
   2229 * @param textoffset of the ends of the rearranged accent
   2230 * @param status output error status if any
   2231 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2232 *         offset of the match. Note this start includes all following accents.
   2233 */
   2234 static
   2235 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2236                                            int32_t    textoffset,
   2237                                            UErrorCode    *status)
   2238 {
   2239     const UChar       *text       = strsrch->search->text;
   2240     const UCollator   *collator   = strsrch->collator;
   2241           int32_t      safelength = 0;
   2242           UChar       *safetext;
   2243           int32_t      safetextlength;
   2244           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2245           int32_t  safeoffset = textoffset;
   2246 
   2247     if (textoffset &&
   2248         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2249                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2250                                          ], collator)) {
   2251         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2252                                            strsrch->search->textLength);
   2253         safelength     = safeoffset - textoffset;
   2254         safetextlength = INITIAL_ARRAY_SIZE_;
   2255         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2256                                          strsrch->canonicalPrefixAccents,
   2257                                          text + textoffset, safelength,
   2258                                          NULL, status);
   2259     }
   2260     else {
   2261         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2262         safetext       = strsrch->canonicalPrefixAccents;
   2263     }
   2264 
   2265     UCollationElements *coleiter = strsrch->utilIter;
   2266      // if status is a failure, ucol_setText does nothing
   2267     ucol_setText(coleiter, safetext, safetextlength, status);
   2268     // status checked in loop below
   2269 
   2270     int32_t  *ce           = strsrch->pattern.ces;
   2271     int32_t   celength     = strsrch->pattern.cesLength;
   2272     int       ceindex      = 0;
   2273     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2274     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2275 
   2276     while (ceindex < celength) {
   2277         int32_t textce = ucol_next(coleiter, status);
   2278         if (U_FAILURE(*status)) {
   2279             if (isSafe) {
   2280                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2281             }
   2282             return USEARCH_DONE;
   2283         }
   2284         if (textce == UCOL_NULLORDER) {
   2285             // check if we have passed the safe buffer
   2286             if (coleiter == strsrch->textIter) {
   2287                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2288                 return USEARCH_DONE;
   2289             }
   2290             cleanUpSafeText(strsrch, safetext, safebuffer);
   2291             safetext = safebuffer;
   2292             coleiter = strsrch->textIter;
   2293             setColEIterOffset(coleiter, safeoffset);
   2294             // status checked at the start of the loop
   2295             isSafe = FALSE;
   2296             continue;
   2297         }
   2298         textce = getCE(strsrch, textce);
   2299         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2300             // do the beginning stuff
   2301             int32_t failedoffset = ucol_getOffset(coleiter);
   2302             if (isSafe && failedoffset <= prefixlength) {
   2303                 // alas... no hope. failed at rearranged accent set
   2304                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2305                 return USEARCH_DONE;
   2306             }
   2307             else {
   2308                 if (isSafe) {
   2309                     failedoffset = safeoffset - failedoffset;
   2310                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2311                 }
   2312 
   2313                 // try rearranging the end accents
   2314                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2315                                         textoffset, failedoffset, status);
   2316                 if (result != USEARCH_DONE) {
   2317                     // if status is a failure, ucol_setOffset does nothing
   2318                     setColEIterOffset(strsrch->textIter, result);
   2319                 }
   2320                 if (U_FAILURE(*status)) {
   2321                     return USEARCH_DONE;
   2322                 }
   2323                 return result;
   2324             }
   2325         }
   2326         if (textce == ce[ceindex]) {
   2327             ceindex ++;
   2328         }
   2329     }
   2330     // set offset here
   2331     if (isSafe) {
   2332         int32_t result      = ucol_getOffset(coleiter);
   2333         // sets the text iterator here with the correct expansion and offset
   2334         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2335         cleanUpSafeText(strsrch, safetext, safebuffer);
   2336         if (result <= prefixlength) {
   2337             result = textoffset;
   2338         }
   2339         else {
   2340             result = textoffset + (safeoffset - result);
   2341         }
   2342         setColEIterOffset(strsrch->textIter, result);
   2343         setExpansionSuffix(strsrch->textIter, leftoverces);
   2344         return result;
   2345     }
   2346 
   2347     return ucol_getOffset(coleiter);
   2348 }
   2349 
   2350 /**
   2351 * Trying out the substring and sees if it can be a canonical match.
   2352 * This will try normalizing the starting accents and arranging them into
   2353 * canonical equivalents and check their corresponding ces with the pattern ce.
   2354 * Prefix accents in the text will be grouped according to their combining
   2355 * class and the groups will be mixed and matched to try find the perfect
   2356 * match with the pattern.
   2357 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2358 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2359 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2360 *         "\u0301\u0325".
   2361 * step 2: check if any of the generated substrings matches the pattern.
   2362 * Internal method, status assumed to be success, caller has to check status
   2363 * before calling this method.
   2364 * @param strsrch string search data
   2365 * @param textoffset start offset in the collation element text that starts
   2366 *                   with the accents to be rearranged
   2367 * @param status output error status if any
   2368 * @return TRUE if the match is valid, FALSE otherwise
   2369 */
   2370 static
   2371 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2372                                int32_t    textoffset,
   2373                                UErrorCode    *status)
   2374 {
   2375     const UChar       *text       = strsrch->search->text;
   2376           int32_t  temp       = textoffset;
   2377           int32_t      textlength = strsrch->search->textLength;
   2378     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2379         UCollationElements *coleiter = strsrch->textIter;
   2380         int32_t         offset   = ucol_getOffset(coleiter);
   2381         if (strsrch->pattern.hasSuffixAccents) {
   2382             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2383                                                     offset, status);
   2384             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2385                 setColEIterOffset(coleiter, offset);
   2386                 return TRUE;
   2387             }
   2388         }
   2389         return FALSE;
   2390     }
   2391 
   2392     if (!strsrch->pattern.hasPrefixAccents) {
   2393         return FALSE;
   2394     }
   2395 
   2396     UChar       accents[INITIAL_ARRAY_SIZE_];
   2397     // offset to the last base character in substring to search
   2398     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2399     // normalizing the offensive string
   2400     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2401                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2402     // status checked in loop
   2403 
   2404     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2405     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2406 
   2407     // 2 power n - 1 plus the full set of accents
   2408     int32_t  count = (2 << (size - 1)) - 1;
   2409     while (U_SUCCESS(*status) && count > 0) {
   2410         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2411         // copy the base characters
   2412         for (int k = 0; k < accentsindex[0]; k ++) {
   2413             *rearrange ++ = accents[k];
   2414         }
   2415         // forming all possible canonical rearrangement by dropping
   2416         // sets of accents
   2417         for (int i = 0; i <= size - 1; i ++) {
   2418             int32_t mask = 1 << (size - i - 1);
   2419             if (count & mask) {
   2420                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2421                     *rearrange ++ = accents[j];
   2422                 }
   2423             }
   2424         }
   2425         *rearrange = 0;
   2426         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2427                                                           baseoffset, status);
   2428         if (offset != USEARCH_DONE) {
   2429             return TRUE; // match found
   2430         }
   2431         count --;
   2432     }
   2433     return FALSE;
   2434 }
   2435 
   2436 /**
   2437 * Checks match for contraction.
   2438 * If the match starts with a partial contraction we fail.
   2439 * Internal method, status assumed to be success, caller has to check status
   2440 * before calling this method.
   2441 * @param strsrch string search data
   2442 * @param start offset of potential match, to be modified if necessary
   2443 * @param end offset of potential match, to be modified if necessary
   2444 * @param status only error status if any
   2445 * @return TRUE if match passes the contraction test, FALSE otherwise
   2446 */
   2447 static
   2448 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2449                                      int32_t   *start,
   2450                                      int32_t   *end, UErrorCode  *status)
   2451 {
   2452           UCollationElements *coleiter   = strsrch->textIter;
   2453           int32_t             textlength = strsrch->search->textLength;
   2454           int32_t         temp       = *end;
   2455     const UCollator          *collator   = strsrch->collator;
   2456     const UChar              *text       = strsrch->search->text;
   2457     // This part checks if either if the start of the match contains potential
   2458     // contraction. If so we'll have to iterate through them
   2459     // Since we used ucol_next while previously looking for the potential
   2460     // match, this guarantees that our end will not be a partial contraction,
   2461     // or a partial supplementary character.
   2462     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2463         int32_t expansion  = getExpansionSuffix(coleiter);
   2464         UBool   expandflag = expansion > 0;
   2465         setColEIterOffset(coleiter, *end);
   2466         while (expansion > 0) {
   2467             // getting rid of the redundant ce
   2468             // since forward contraction/expansion may have extra ces
   2469             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2470             // would have taken care of it.
   2471             // E.g. the character \u01FA will have an expansion of 3, but if
   2472             // we are only looking for A ring A\u030A, we'll have to skip the
   2473             // last ce in the expansion buffer
   2474             ucol_previous(coleiter, status);
   2475             if (U_FAILURE(*status)) {
   2476                 return FALSE;
   2477             }
   2478             if (ucol_getOffset(coleiter) != temp) {
   2479                 *end = temp;
   2480                 temp  = ucol_getOffset(coleiter);
   2481             }
   2482             expansion --;
   2483         }
   2484 
   2485         int32_t  *patternce       = strsrch->pattern.ces;
   2486         int32_t   patterncelength = strsrch->pattern.cesLength;
   2487         int32_t   count           = patterncelength;
   2488         while (count > 0) {
   2489             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2490             // status checked below, note that if status is a failure
   2491             // ucol_previous returns UCOL_NULLORDER
   2492             if (ce == UCOL_IGNORABLE) {
   2493                 continue;
   2494             }
   2495             if (expandflag && count == 0 &&
   2496                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2497                 *end = temp;
   2498                 temp  = ucol_getOffset(coleiter);
   2499             }
   2500             if (count == patterncelength &&
   2501                 ce != patternce[patterncelength - 1]) {
   2502                 // accents may have extra starting ces, this occurs when a
   2503                 // pure accent pattern is matched without rearrangement
   2504                 int32_t    expected = patternce[patterncelength - 1];
   2505                 U16_BACK_1(text, 0, *end);
   2506                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2507                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2508                     while (U_SUCCESS(*status) && ce != expected &&
   2509                            ce != UCOL_NULLORDER &&
   2510                            ucol_getOffset(coleiter) <= *start) {
   2511                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2512                     }
   2513                 }
   2514             }
   2515             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2516                 (*start) --;
   2517                 *start = getPreviousBaseOffset(text, *start);
   2518                 return FALSE;
   2519             }
   2520             count --;
   2521         }
   2522     }
   2523     return TRUE;
   2524 }
   2525 
   2526 /**
   2527 * Checks and sets the match information if found.
   2528 * Checks
   2529 * <ul>
   2530 * <li> the potential match does not repeat the previous match
   2531 * <li> boundaries are correct
   2532 * <li> potential match does not end in the middle of a contraction
   2533 * <li> identical matches
   2534 * <\ul>
   2535 * Otherwise the offset will be shifted to the next character.
   2536 * Internal method, status assumed to be success, caller has to check status
   2537 * before calling this method.
   2538 * @param strsrch string search data
   2539 * @param textoffset offset in the collation element text. the returned value
   2540 *        will be the truncated start offset of the match or the new start
   2541 *        search offset.
   2542 * @param status only error status if any
   2543 * @return TRUE if the match is valid, FALSE otherwise
   2544 */
   2545 static
   2546 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2547                                          int32_t   *textoffset,
   2548                                          UErrorCode    *status)
   2549 {
   2550     // to ensure that the start and ends are not composite characters
   2551     UCollationElements *coleiter = strsrch->textIter;
   2552     // if we have a canonical accent match
   2553     if ((strsrch->pattern.hasSuffixAccents &&
   2554         strsrch->canonicalSuffixAccents[0]) ||
   2555         (strsrch->pattern.hasPrefixAccents &&
   2556         strsrch->canonicalPrefixAccents[0])) {
   2557         strsrch->search->matchedIndex  = *textoffset;
   2558         strsrch->search->matchedLength =
   2559             getNextUStringSearchBaseOffset(strsrch,
   2560                                       getColElemIterOffset(coleiter, FALSE))
   2561             - *textoffset;
   2562         return TRUE;
   2563     }
   2564 
   2565     int32_t end = ucol_getOffset(coleiter);
   2566     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2567                                                 status) ||
   2568          U_FAILURE(*status)) {
   2569         return FALSE;
   2570     }
   2571 
   2572     end = getNextUStringSearchBaseOffset(strsrch, end);
   2573     // this totally matches, however we need to check if it is repeating
   2574     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2575         !isBreakUnit(strsrch, *textoffset, end) ||
   2576         !checkIdentical(strsrch, *textoffset, end)) {
   2577         (*textoffset) --;
   2578         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2579                                             *textoffset);
   2580         return FALSE;
   2581     }
   2582 
   2583     strsrch->search->matchedIndex  = *textoffset;
   2584     strsrch->search->matchedLength = end - *textoffset;
   2585     return TRUE;
   2586 }
   2587 #endif // #if BOYER_MOORE
   2588 
   2589 // constructors and destructor -------------------------------------------
   2590 
   2591 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2592                                           int32_t         patternlength,
   2593                                     const UChar          *text,
   2594                                           int32_t         textlength,
   2595                                     const char           *locale,
   2596                                           UBreakIterator *breakiter,
   2597                                           UErrorCode     *status)
   2598 {
   2599     if (U_FAILURE(*status)) {
   2600         return NULL;
   2601     }
   2602 #if UCONFIG_NO_BREAK_ITERATION
   2603     if (breakiter != NULL) {
   2604         *status = U_UNSUPPORTED_ERROR;
   2605         return NULL;
   2606     }
   2607 #endif
   2608     if (locale) {
   2609         // ucol_open internally checks for status
   2610         UCollator     *collator = ucol_open(locale, status);
   2611         // pattern, text checks are done in usearch_openFromCollator
   2612         UStringSearch *result   = usearch_openFromCollator(pattern,
   2613                                               patternlength, text, textlength,
   2614                                               collator, breakiter, status);
   2615 
   2616         if (result == NULL || U_FAILURE(*status)) {
   2617             if (collator) {
   2618                 ucol_close(collator);
   2619             }
   2620             return NULL;
   2621         }
   2622         else {
   2623             result->ownCollator = TRUE;
   2624         }
   2625         return result;
   2626     }
   2627     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2628     return NULL;
   2629 }
   2630 
   2631 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2632                                   const UChar          *pattern,
   2633                                         int32_t         patternlength,
   2634                                   const UChar          *text,
   2635                                         int32_t         textlength,
   2636                                   const UCollator      *collator,
   2637                                         UBreakIterator *breakiter,
   2638                                         UErrorCode     *status)
   2639 {
   2640     if (U_FAILURE(*status)) {
   2641         return NULL;
   2642     }
   2643 #if UCONFIG_NO_BREAK_ITERATION
   2644     if (breakiter != NULL) {
   2645         *status = U_UNSUPPORTED_ERROR;
   2646         return NULL;
   2647     }
   2648 #endif
   2649     if (pattern == NULL || text == NULL || collator == NULL) {
   2650         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2651         return NULL;
   2652     }
   2653 
   2654     // string search does not really work when numeric collation is turned on
   2655     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2656         *status = U_UNSUPPORTED_ERROR;
   2657         return NULL;
   2658     }
   2659 
   2660     if (U_SUCCESS(*status)) {
   2661         initializeFCD(status);
   2662         if (U_FAILURE(*status)) {
   2663             return NULL;
   2664         }
   2665 
   2666         UStringSearch *result;
   2667         if (textlength == -1) {
   2668             textlength = u_strlen(text);
   2669         }
   2670         if (patternlength == -1) {
   2671             patternlength = u_strlen(pattern);
   2672         }
   2673         if (textlength <= 0 || patternlength <= 0) {
   2674             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2675             return NULL;
   2676         }
   2677 
   2678         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2679         if (result == NULL) {
   2680             *status = U_MEMORY_ALLOCATION_ERROR;
   2681             return NULL;
   2682         }
   2683 
   2684         result->collator    = collator;
   2685         result->strength    = ucol_getStrength(collator);
   2686         result->ceMask      = getMask(result->strength);
   2687         result->toShift     =
   2688              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2689                                                             UCOL_SHIFTED;
   2690         result->variableTop = ucol_getVariableTop(collator, status);
   2691 
   2692         result->nfd         = Normalizer2::getNFDInstance(*status);
   2693 
   2694         if (U_FAILURE(*status)) {
   2695             uprv_free(result);
   2696             return NULL;
   2697         }
   2698 
   2699         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2700         if (result->search == NULL) {
   2701             *status = U_MEMORY_ALLOCATION_ERROR;
   2702             uprv_free(result);
   2703             return NULL;
   2704         }
   2705 
   2706         result->search->text       = text;
   2707         result->search->textLength = textlength;
   2708 
   2709         result->pattern.text       = pattern;
   2710         result->pattern.textLength = patternlength;
   2711         result->pattern.ces         = NULL;
   2712         result->pattern.pces        = NULL;
   2713 
   2714         result->search->breakIter  = breakiter;
   2715 #if !UCONFIG_NO_BREAK_ITERATION
   2716         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2717         if (breakiter) {
   2718             ubrk_setText(breakiter, text, textlength, status);
   2719         }
   2720 #endif
   2721 
   2722         result->ownCollator           = FALSE;
   2723         result->search->matchedLength = 0;
   2724         result->search->matchedIndex  = USEARCH_DONE;
   2725         result->utilIter              = NULL;
   2726         result->textIter              = ucol_openElements(collator, text,
   2727                                                           textlength, status);
   2728         result->textProcessedIter     = NULL;
   2729         if (U_FAILURE(*status)) {
   2730             usearch_close(result);
   2731             return NULL;
   2732         }
   2733 
   2734         result->search->isOverlap          = FALSE;
   2735         result->search->isCanonicalMatch   = FALSE;
   2736         result->search->elementComparisonType = 0;
   2737         result->search->isForwardSearching = TRUE;
   2738         result->search->reset              = TRUE;
   2739 
   2740         initialize(result, status);
   2741 
   2742         if (U_FAILURE(*status)) {
   2743             usearch_close(result);
   2744             return NULL;
   2745         }
   2746 
   2747         return result;
   2748     }
   2749     return NULL;
   2750 }
   2751 
   2752 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2753 {
   2754     if (strsrch) {
   2755         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
   2756             strsrch->pattern.ces) {
   2757             uprv_free(strsrch->pattern.ces);
   2758         }
   2759 
   2760         if (strsrch->pattern.pces != NULL &&
   2761             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
   2762             uprv_free(strsrch->pattern.pces);
   2763         }
   2764 
   2765         delete strsrch->textProcessedIter;
   2766         ucol_closeElements(strsrch->textIter);
   2767         ucol_closeElements(strsrch->utilIter);
   2768 
   2769         if (strsrch->ownCollator && strsrch->collator) {
   2770             ucol_close((UCollator *)strsrch->collator);
   2771         }
   2772 
   2773 #if !UCONFIG_NO_BREAK_ITERATION
   2774         if (strsrch->search->internalBreakIter) {
   2775             ubrk_close(strsrch->search->internalBreakIter);
   2776         }
   2777 #endif
   2778 
   2779         uprv_free(strsrch->search);
   2780         uprv_free(strsrch);
   2781     }
   2782 }
   2783 
   2784 namespace {
   2785 
   2786 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
   2787     if (U_FAILURE(*status)) { return FALSE; }
   2788     if (strsrch->textProcessedIter == NULL) {
   2789         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
   2790         if (strsrch->textProcessedIter == NULL) {
   2791             *status = U_MEMORY_ALLOCATION_ERROR;
   2792             return FALSE;
   2793         }
   2794     } else {
   2795         strsrch->textProcessedIter->init(strsrch->textIter);
   2796     }
   2797     return TRUE;
   2798 }
   2799 
   2800 }
   2801 
   2802 // set and get methods --------------------------------------------------
   2803 
   2804 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2805                                         int32_t    position,
   2806                                         UErrorCode    *status)
   2807 {
   2808     if (U_SUCCESS(*status) && strsrch) {
   2809         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2810             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2811         }
   2812         else {
   2813             setColEIterOffset(strsrch->textIter, position);
   2814         }
   2815         strsrch->search->matchedIndex  = USEARCH_DONE;
   2816         strsrch->search->matchedLength = 0;
   2817         strsrch->search->reset         = FALSE;
   2818     }
   2819 }
   2820 
   2821 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2822 {
   2823     if (strsrch) {
   2824         int32_t result = ucol_getOffset(strsrch->textIter);
   2825         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2826             return USEARCH_DONE;
   2827         }
   2828         return result;
   2829     }
   2830     return USEARCH_DONE;
   2831 }
   2832 
   2833 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2834                                  USearchAttribute attribute,
   2835                                  USearchAttributeValue value,
   2836                                  UErrorCode *status)
   2837 {
   2838     if (U_SUCCESS(*status) && strsrch) {
   2839         switch (attribute)
   2840         {
   2841         case USEARCH_OVERLAP :
   2842             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2843             break;
   2844         case USEARCH_CANONICAL_MATCH :
   2845             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2846                                                                       FALSE);
   2847             break;
   2848         case USEARCH_ELEMENT_COMPARISON :
   2849             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2850                 strsrch->search->elementComparisonType = (int16_t)value;
   2851             } else {
   2852                 strsrch->search->elementComparisonType = 0;
   2853             }
   2854             break;
   2855         case USEARCH_ATTRIBUTE_COUNT :
   2856         default:
   2857             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2858         }
   2859     }
   2860     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2861         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2862     }
   2863 }
   2864 
   2865 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2866                                                 const UStringSearch *strsrch,
   2867                                                 USearchAttribute attribute)
   2868 {
   2869     if (strsrch) {
   2870         switch (attribute) {
   2871         case USEARCH_OVERLAP :
   2872             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2873                                                         USEARCH_OFF);
   2874         case USEARCH_CANONICAL_MATCH :
   2875             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2876                                                                USEARCH_OFF);
   2877         case USEARCH_ELEMENT_COMPARISON :
   2878             {
   2879                 int16_t value = strsrch->search->elementComparisonType;
   2880                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2881                     return (USearchAttributeValue)value;
   2882                 } else {
   2883                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
   2884                 }
   2885             }
   2886         case USEARCH_ATTRIBUTE_COUNT :
   2887             return USEARCH_DEFAULT;
   2888         }
   2889     }
   2890     return USEARCH_DEFAULT;
   2891 }
   2892 
   2893 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2894                                                 const UStringSearch *strsrch)
   2895 {
   2896     if (strsrch == NULL) {
   2897         return USEARCH_DONE;
   2898     }
   2899     return strsrch->search->matchedIndex;
   2900 }
   2901 
   2902 
   2903 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2904                                             UChar         *result,
   2905                                             int32_t        resultCapacity,
   2906                                             UErrorCode    *status)
   2907 {
   2908     if (U_FAILURE(*status)) {
   2909         return USEARCH_DONE;
   2910     }
   2911     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2912         result == NULL)) {
   2913         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2914         return USEARCH_DONE;
   2915     }
   2916 
   2917     int32_t     copylength = strsrch->search->matchedLength;
   2918     int32_t copyindex  = strsrch->search->matchedIndex;
   2919     if (copyindex == USEARCH_DONE) {
   2920         u_terminateUChars(result, resultCapacity, 0, status);
   2921         return USEARCH_DONE;
   2922     }
   2923 
   2924     if (resultCapacity < copylength) {
   2925         copylength = resultCapacity;
   2926     }
   2927     if (copylength > 0) {
   2928         uprv_memcpy(result, strsrch->search->text + copyindex,
   2929                     copylength * sizeof(UChar));
   2930     }
   2931     return u_terminateUChars(result, resultCapacity,
   2932                              strsrch->search->matchedLength, status);
   2933 }
   2934 
   2935 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2936                                               const UStringSearch *strsrch)
   2937 {
   2938     if (strsrch) {
   2939         return strsrch->search->matchedLength;
   2940     }
   2941     return USEARCH_DONE;
   2942 }
   2943 
   2944 #if !UCONFIG_NO_BREAK_ITERATION
   2945 
   2946 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2947                                                UBreakIterator *breakiter,
   2948                                                UErrorCode     *status)
   2949 {
   2950     if (U_SUCCESS(*status) && strsrch) {
   2951         strsrch->search->breakIter = breakiter;
   2952         if (breakiter) {
   2953             ubrk_setText(breakiter, strsrch->search->text,
   2954                          strsrch->search->textLength, status);
   2955         }
   2956     }
   2957 }
   2958 
   2959 U_CAPI const UBreakIterator* U_EXPORT2
   2960 usearch_getBreakIterator(const UStringSearch *strsrch)
   2961 {
   2962     if (strsrch) {
   2963         return strsrch->search->breakIter;
   2964     }
   2965     return NULL;
   2966 }
   2967 
   2968 #endif
   2969 
   2970 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2971                                       const UChar         *text,
   2972                                             int32_t        textlength,
   2973                                             UErrorCode    *status)
   2974 {
   2975     if (U_SUCCESS(*status)) {
   2976         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2977             textlength == 0) {
   2978             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2979         }
   2980         else {
   2981             if (textlength == -1) {
   2982                 textlength = u_strlen(text);
   2983             }
   2984             strsrch->search->text       = text;
   2985             strsrch->search->textLength = textlength;
   2986             ucol_setText(strsrch->textIter, text, textlength, status);
   2987             strsrch->search->matchedIndex  = USEARCH_DONE;
   2988             strsrch->search->matchedLength = 0;
   2989             strsrch->search->reset         = TRUE;
   2990 #if !UCONFIG_NO_BREAK_ITERATION
   2991             if (strsrch->search->breakIter != NULL) {
   2992                 ubrk_setText(strsrch->search->breakIter, text,
   2993                              textlength, status);
   2994             }
   2995             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2996 #endif
   2997         }
   2998     }
   2999 }
   3000 
   3001 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   3002                                                      int32_t       *length)
   3003 {
   3004     if (strsrch) {
   3005         *length = strsrch->search->textLength;
   3006         return strsrch->search->text;
   3007     }
   3008     return NULL;
   3009 }
   3010 
   3011 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3012                                           const UCollator     *collator,
   3013                                                 UErrorCode    *status)
   3014 {
   3015     if (U_SUCCESS(*status)) {
   3016         if (collator == NULL) {
   3017             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3018             return;
   3019         }
   3020 
   3021         if (strsrch) {
   3022             delete strsrch->textProcessedIter;
   3023             strsrch->textProcessedIter = NULL;
   3024             ucol_closeElements(strsrch->textIter);
   3025             ucol_closeElements(strsrch->utilIter);
   3026             strsrch->textIter = strsrch->utilIter = NULL;
   3027             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3028                 ucol_close((UCollator *)strsrch->collator);
   3029                 strsrch->ownCollator = FALSE;
   3030             }
   3031             strsrch->collator    = collator;
   3032             strsrch->strength    = ucol_getStrength(collator);
   3033             strsrch->ceMask      = getMask(strsrch->strength);
   3034 #if !UCONFIG_NO_BREAK_ITERATION
   3035             ubrk_close(strsrch->search->internalBreakIter);
   3036             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3037                                                      strsrch->search->text, strsrch->search->textLength, status);
   3038 #endif
   3039             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3040             strsrch->toShift     =
   3041                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3042                                                                 UCOL_SHIFTED;
   3043             // if status is a failure, ucol_getVariableTop returns 0
   3044             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3045             strsrch->textIter = ucol_openElements(collator,
   3046                                       strsrch->search->text,
   3047                                       strsrch->search->textLength,
   3048                                       status);
   3049             strsrch->utilIter = ucol_openElements(
   3050                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
   3051             // initialize() _after_ setting the iterators for the new collator.
   3052             initialize(strsrch, status);
   3053         }
   3054 
   3055         // **** are these calls needed?
   3056         // **** we call uprv_init_pce in initializePatternPCETable
   3057         // **** and the CEIBuffer constructor...
   3058 #if 0
   3059         uprv_init_pce(strsrch->textIter);
   3060         uprv_init_pce(strsrch->utilIter);
   3061 #endif
   3062     }
   3063 }
   3064 
   3065 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3066 {
   3067     if (strsrch) {
   3068         return (UCollator *)strsrch->collator;
   3069     }
   3070     return NULL;
   3071 }
   3072 
   3073 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3074                                          const UChar         *pattern,
   3075                                                int32_t        patternlength,
   3076                                                UErrorCode    *status)
   3077 {
   3078     if (U_SUCCESS(*status)) {
   3079         if (strsrch == NULL || pattern == NULL) {
   3080             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3081         }
   3082         else {
   3083             if (patternlength == -1) {
   3084                 patternlength = u_strlen(pattern);
   3085             }
   3086             if (patternlength == 0) {
   3087                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3088                 return;
   3089             }
   3090             strsrch->pattern.text       = pattern;
   3091             strsrch->pattern.textLength = patternlength;
   3092             initialize(strsrch, status);
   3093         }
   3094     }
   3095 }
   3096 
   3097 U_CAPI const UChar* U_EXPORT2
   3098 usearch_getPattern(const UStringSearch *strsrch,
   3099                    int32_t       *length)
   3100 {
   3101     if (strsrch) {
   3102         *length = strsrch->pattern.textLength;
   3103         return strsrch->pattern.text;
   3104     }
   3105     return NULL;
   3106 }
   3107 
   3108 // miscellanous methods --------------------------------------------------
   3109 
   3110 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3111                                            UErrorCode    *status)
   3112 {
   3113     if (strsrch && U_SUCCESS(*status)) {
   3114         strsrch->search->isForwardSearching = TRUE;
   3115         usearch_setOffset(strsrch, 0, status);
   3116         if (U_SUCCESS(*status)) {
   3117             return usearch_next(strsrch, status);
   3118         }
   3119     }
   3120     return USEARCH_DONE;
   3121 }
   3122 
   3123 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3124                                                int32_t    position,
   3125                                                UErrorCode    *status)
   3126 {
   3127     if (strsrch && U_SUCCESS(*status)) {
   3128         strsrch->search->isForwardSearching = TRUE;
   3129         // position checked in usearch_setOffset
   3130         usearch_setOffset(strsrch, position, status);
   3131         if (U_SUCCESS(*status)) {
   3132             return usearch_next(strsrch, status);
   3133         }
   3134     }
   3135     return USEARCH_DONE;
   3136 }
   3137 
   3138 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3139                                           UErrorCode    *status)
   3140 {
   3141     if (strsrch && U_SUCCESS(*status)) {
   3142         strsrch->search->isForwardSearching = FALSE;
   3143         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3144         if (U_SUCCESS(*status)) {
   3145             return usearch_previous(strsrch, status);
   3146         }
   3147     }
   3148     return USEARCH_DONE;
   3149 }
   3150 
   3151 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3152                                                int32_t    position,
   3153                                                UErrorCode    *status)
   3154 {
   3155     if (strsrch && U_SUCCESS(*status)) {
   3156         strsrch->search->isForwardSearching = FALSE;
   3157         // position checked in usearch_setOffset
   3158         usearch_setOffset(strsrch, position, status);
   3159         if (U_SUCCESS(*status)) {
   3160             return usearch_previous(strsrch, status);
   3161         }
   3162     }
   3163     return USEARCH_DONE;
   3164 }
   3165 
   3166 /**
   3167 * If a direction switch is required, we'll count the number of ces till the
   3168 * beginning of the collation element iterator and iterate forwards that
   3169 * number of times. This is so that we get to the correct point within the
   3170 * string to continue the search in. Imagine when we are in the middle of the
   3171 * normalization buffer when the change in direction is request. arrrgghh....
   3172 * After searching the offset within the collation element iterator will be
   3173 * shifted to the start of the match. If a match is not found, the offset would
   3174 * have been set to the end of the text string in the collation element
   3175 * iterator.
   3176 * Okay, here's my take on normalization buffer. The only time when there can
   3177 * be 2 matches within the same normalization is when the pattern is consists
   3178 * of all accents. But since the offset returned is from the text string, we
   3179 * should not confuse the caller by returning the second match within the
   3180 * same normalization buffer. If we do, the 2 results will have the same match
   3181 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3182 * fall within the same normalization buffer. Note this does not affect the
   3183 * results of matches spanning the text and the normalization buffer.
   3184 * The position to start searching is taken from the collation element
   3185 * iterator. Callers of this API would have to set the offset in the collation
   3186 * element iterator before using this method.
   3187 */
   3188 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3189                                           UErrorCode    *status)
   3190 {
   3191     if (U_SUCCESS(*status) && strsrch) {
   3192         // note offset is either equivalent to the start of the previous match
   3193         // or is set by the user
   3194         int32_t      offset       = usearch_getOffset(strsrch);
   3195         USearch     *search       = strsrch->search;
   3196         search->reset             = FALSE;
   3197         int32_t      textlength   = search->textLength;
   3198         if (search->isForwardSearching) {
   3199 #if BOYER_MOORE
   3200             if (offset == textlength
   3201                 || (!search->isOverlap &&
   3202                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3203                     (search->matchedIndex != USEARCH_DONE &&
   3204                      offset + search->matchedLength >= textlength)))) {
   3205                 // not enough characters to match
   3206                 setMatchNotFound(strsrch);
   3207                 return USEARCH_DONE;
   3208             }
   3209 #else
   3210             if (offset == textlength ||
   3211                 (! search->isOverlap &&
   3212                 (search->matchedIndex != USEARCH_DONE &&
   3213                 offset + search->matchedLength > textlength))) {
   3214                     // not enough characters to match
   3215                     setMatchNotFound(strsrch);
   3216                     return USEARCH_DONE;
   3217             }
   3218 #endif
   3219         }
   3220         else {
   3221             // switching direction.
   3222             // if matchedIndex == USEARCH_DONE, it means that either a
   3223             // setOffset has been called or that previous ran off the text
   3224             // string. the iterator would have been set to offset 0 if a
   3225             // match is not found.
   3226             search->isForwardSearching = TRUE;
   3227             if (search->matchedIndex != USEARCH_DONE) {
   3228                 // there's no need to set the collation element iterator
   3229                 // the next call to next will set the offset.
   3230                 return search->matchedIndex;
   3231             }
   3232         }
   3233 
   3234         if (U_SUCCESS(*status)) {
   3235             if (strsrch->pattern.cesLength == 0) {
   3236                 if (search->matchedIndex == USEARCH_DONE) {
   3237                     search->matchedIndex = offset;
   3238                 }
   3239                 else { // moves by codepoints
   3240                     U16_FWD_1(search->text, search->matchedIndex, textlength);
   3241                 }
   3242 
   3243                 search->matchedLength = 0;
   3244                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3245                 // status checked below
   3246                 if (search->matchedIndex == textlength) {
   3247                     search->matchedIndex = USEARCH_DONE;
   3248                 }
   3249             }
   3250             else {
   3251                 if (search->matchedLength > 0) {
   3252                     // if matchlength is 0 we are at the start of the iteration
   3253                     if (search->isOverlap) {
   3254                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3255                     }
   3256                     else {
   3257                         ucol_setOffset(strsrch->textIter,
   3258                                        offset + search->matchedLength, status);
   3259                     }
   3260                 }
   3261                 else {
   3262                     // for boundary check purposes. this will ensure that the
   3263                     // next match will not preceed the current offset
   3264                     // note search->matchedIndex will always be set to something
   3265                     // in the code
   3266                     search->matchedIndex = offset - 1;
   3267                 }
   3268 
   3269                 if (search->isCanonicalMatch) {
   3270                     // can't use exact here since extra accents are allowed.
   3271                     usearch_handleNextCanonical(strsrch, status);
   3272                 }
   3273                 else {
   3274                     usearch_handleNextExact(strsrch, status);
   3275                 }
   3276             }
   3277 
   3278             if (U_FAILURE(*status)) {
   3279                 return USEARCH_DONE;
   3280             }
   3281 
   3282 #if !BOYER_MOORE
   3283             if (search->matchedIndex == USEARCH_DONE) {
   3284                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3285             } else {
   3286                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3287             }
   3288 #endif
   3289 
   3290             return search->matchedIndex;
   3291         }
   3292     }
   3293     return USEARCH_DONE;
   3294 }
   3295 
   3296 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3297                                               UErrorCode *status)
   3298 {
   3299     if (U_SUCCESS(*status) && strsrch) {
   3300         int32_t offset;
   3301         USearch *search = strsrch->search;
   3302         if (search->reset) {
   3303             offset                     = search->textLength;
   3304             search->isForwardSearching = FALSE;
   3305             search->reset              = FALSE;
   3306             setColEIterOffset(strsrch->textIter, offset);
   3307         }
   3308         else {
   3309             offset = usearch_getOffset(strsrch);
   3310         }
   3311 
   3312         int32_t matchedindex = search->matchedIndex;
   3313         if (search->isForwardSearching == TRUE) {
   3314             // switching direction.
   3315             // if matchedIndex == USEARCH_DONE, it means that either a
   3316             // setOffset has been called or that next ran off the text
   3317             // string. the iterator would have been set to offset textLength if
   3318             // a match is not found.
   3319             search->isForwardSearching = FALSE;
   3320             if (matchedindex != USEARCH_DONE) {
   3321                 return matchedindex;
   3322             }
   3323         }
   3324         else {
   3325 #if BOYER_MOORE
   3326             if (offset == 0 || matchedindex == 0 ||
   3327                 (!search->isOverlap &&
   3328                     (offset < strsrch->pattern.defaultShiftSize ||
   3329                     (matchedindex != USEARCH_DONE &&
   3330                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3331                 // not enough characters to match
   3332                 setMatchNotFound(strsrch);
   3333                 return USEARCH_DONE;
   3334             }
   3335 #else
   3336             // Could check pattern length, but the
   3337             // linear search will do the right thing
   3338             if (offset == 0 || matchedindex == 0) {
   3339                 setMatchNotFound(strsrch);
   3340                 return USEARCH_DONE;
   3341             }
   3342 #endif
   3343         }
   3344 
   3345         if (U_SUCCESS(*status)) {
   3346             if (strsrch->pattern.cesLength == 0) {
   3347                 search->matchedIndex =
   3348                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3349                 if (search->matchedIndex == 0) {
   3350                     setMatchNotFound(strsrch);
   3351                     // status checked below
   3352                 }
   3353                 else { // move by codepoints
   3354                     U16_BACK_1(search->text, 0, search->matchedIndex);
   3355                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3356                     // status checked below
   3357                     search->matchedLength = 0;
   3358                 }
   3359             }
   3360             else {
   3361                 if (strsrch->search->isCanonicalMatch) {
   3362                     // can't use exact here since extra accents are allowed.
   3363                     usearch_handlePreviousCanonical(strsrch, status);
   3364                     // status checked below
   3365                 }
   3366                 else {
   3367                     usearch_handlePreviousExact(strsrch, status);
   3368                     // status checked below
   3369                 }
   3370             }
   3371 
   3372             if (U_FAILURE(*status)) {
   3373                 return USEARCH_DONE;
   3374             }
   3375 
   3376             return search->matchedIndex;
   3377         }
   3378     }
   3379     return USEARCH_DONE;
   3380 }
   3381 
   3382 
   3383 
   3384 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3385 {
   3386     /*
   3387     reset is setting the attributes that are already in
   3388     string search, hence all attributes in the collator should
   3389     be retrieved without any problems
   3390     */
   3391     if (strsrch) {
   3392         UErrorCode status            = U_ZERO_ERROR;
   3393         UBool      sameCollAttribute = TRUE;
   3394         uint32_t   ceMask;
   3395         UBool      shift;
   3396         uint32_t   varTop;
   3397 
   3398         // **** hack to deal w/ how processed CEs encode quaternary ****
   3399         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3400         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3401             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3402                 sameCollAttribute = FALSE;
   3403         }
   3404 
   3405         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3406         ceMask = getMask(strsrch->strength);
   3407         if (strsrch->ceMask != ceMask) {
   3408             strsrch->ceMask = ceMask;
   3409             sameCollAttribute = FALSE;
   3410         }
   3411 
   3412         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3413         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3414                                   &status) == UCOL_SHIFTED;
   3415         if (strsrch->toShift != shift) {
   3416             strsrch->toShift  = shift;
   3417             sameCollAttribute = FALSE;
   3418         }
   3419 
   3420         // if status is a failure, ucol_getVariableTop returns 0
   3421         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3422         if (strsrch->variableTop != varTop) {
   3423             strsrch->variableTop = varTop;
   3424             sameCollAttribute    = FALSE;
   3425         }
   3426         if (!sameCollAttribute) {
   3427             initialize(strsrch, &status);
   3428         }
   3429         ucol_setText(strsrch->textIter, strsrch->search->text,
   3430                               strsrch->search->textLength,
   3431                               &status);
   3432         strsrch->search->matchedLength      = 0;
   3433         strsrch->search->matchedIndex       = USEARCH_DONE;
   3434         strsrch->search->isOverlap          = FALSE;
   3435         strsrch->search->isCanonicalMatch   = FALSE;
   3436         strsrch->search->elementComparisonType = 0;
   3437         strsrch->search->isForwardSearching = TRUE;
   3438         strsrch->search->reset              = TRUE;
   3439     }
   3440 }
   3441 
   3442 //
   3443 //  CEI  Collation Element + source text index.
   3444 //       These structs are kept in the circular buffer.
   3445 //
   3446 struct  CEI {
   3447     int64_t ce;
   3448     int32_t lowIndex;
   3449     int32_t highIndex;
   3450 };
   3451 
   3452 U_NAMESPACE_BEGIN
   3453 
   3454 namespace {
   3455 //
   3456 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
   3457 //
   3458 #define   DEFAULT_CEBUFFER_SIZE 96
   3459 #define   CEBUFFER_EXTRA 32
   3460 // Some typical max values to make buffer size more reasonable for asymmetric search.
   3461 // #8694 is for a better long-term solution to allocation of this buffer.
   3462 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
   3463 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
   3464 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
   3465 struct CEIBuffer {
   3466     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3467     CEI                 *buf;
   3468     int32_t              bufSize;
   3469     int32_t              firstIx;
   3470     int32_t              limitIx;
   3471     UCollationElements  *ceIter;
   3472     UStringSearch       *strSearch;
   3473 
   3474 
   3475 
   3476                CEIBuffer(UStringSearch *ss, UErrorCode *status);
   3477                ~CEIBuffer();
   3478    const CEI   *get(int32_t index);
   3479    const CEI   *getPrevious(int32_t index);
   3480 };
   3481 
   3482 
   3483 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
   3484     buf = defBuf;
   3485     strSearch = ss;
   3486     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
   3487     if (ss->search->elementComparisonType != 0) {
   3488         const UChar * patText = ss->pattern.text;
   3489         if (patText) {
   3490             const UChar * patTextLimit = patText + ss->pattern.textLength;
   3491             while ( patText < patTextLimit ) {
   3492                 UChar c = *patText++;
   3493                 if (MIGHT_BE_JAMO_L(c)) {
   3494                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
   3495                 } else {
   3496                     // No check for surrogates, we might allocate slightly more buffer than necessary.
   3497                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
   3498                 }
   3499             }
   3500         }
   3501     }
   3502     ceIter    = ss->textIter;
   3503     firstIx = 0;
   3504     limitIx = 0;
   3505 
   3506     if (!initTextProcessedIter(ss, status)) { return; }
   3507 
   3508     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3509         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3510         if (buf == NULL) {
   3511             *status = U_MEMORY_ALLOCATION_ERROR;
   3512         }
   3513     }
   3514 }
   3515 
   3516 // TODO: add a reset or init function so that allocated
   3517 //       buffers can be retained & reused.
   3518 
   3519 CEIBuffer::~CEIBuffer() {
   3520     if (buf != defBuf) {
   3521         uprv_free(buf);
   3522     }
   3523 }
   3524 
   3525 
   3526 // Get the CE with the specified index.
   3527 //   Index must be in the range
   3528 //          n-history_size < index < n+1
   3529 //   where n is the largest index to have been fetched by some previous call to this function.
   3530 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3531 //
   3532 const CEI *CEIBuffer::get(int32_t index) {
   3533     int i = index % bufSize;
   3534 
   3535     if (index>=firstIx && index<limitIx) {
   3536         // The request was for an entry already in our buffer.
   3537         //  Just return it.
   3538         return &buf[i];
   3539     }
   3540 
   3541     // Caller is requesting a new, never accessed before, CE.
   3542     //   Verify that it is the next one in sequence, which is all
   3543     //   that is allowed.
   3544     if (index != limitIx) {
   3545         U_ASSERT(FALSE);
   3546 
   3547         return NULL;
   3548     }
   3549 
   3550     // Manage the circular CE buffer indexing
   3551     limitIx++;
   3552 
   3553     if (limitIx - firstIx >= bufSize) {
   3554         // The buffer is full, knock out the lowest-indexed entry.
   3555         firstIx++;
   3556     }
   3557 
   3558     UErrorCode status = U_ZERO_ERROR;
   3559 
   3560     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3561 
   3562     return &buf[i];
   3563 }
   3564 
   3565 // Get the CE with the specified index.
   3566 //   Index must be in the range
   3567 //          n-history_size < index < n+1
   3568 //   where n is the largest index to have been fetched by some previous call to this function.
   3569 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3570 //
   3571 const CEI *CEIBuffer::getPrevious(int32_t index) {
   3572     int i = index % bufSize;
   3573 
   3574     if (index>=firstIx && index<limitIx) {
   3575         // The request was for an entry already in our buffer.
   3576         //  Just return it.
   3577         return &buf[i];
   3578     }
   3579 
   3580     // Caller is requesting a new, never accessed before, CE.
   3581     //   Verify that it is the next one in sequence, which is all
   3582     //   that is allowed.
   3583     if (index != limitIx) {
   3584         U_ASSERT(FALSE);
   3585 
   3586         return NULL;
   3587     }
   3588 
   3589     // Manage the circular CE buffer indexing
   3590     limitIx++;
   3591 
   3592     if (limitIx - firstIx >= bufSize) {
   3593         // The buffer is full, knock out the lowest-indexed entry.
   3594         firstIx++;
   3595     }
   3596 
   3597     UErrorCode status = U_ZERO_ERROR;
   3598 
   3599     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3600 
   3601     return &buf[i];
   3602 }
   3603 
   3604 }
   3605 
   3606 U_NAMESPACE_END
   3607 
   3608 
   3609 // #define USEARCH_DEBUG
   3610 
   3611 #ifdef USEARCH_DEBUG
   3612 #include <stdio.h>
   3613 #include <stdlib.h>
   3614 #endif
   3615 
   3616 /*
   3617  * Find the next break boundary after startIndex. If the UStringSearch object
   3618  * has an external break iterator, use that. Otherwise use the internal character
   3619  * break iterator.
   3620  */
   3621 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3622 #if 0
   3623     const UChar *text = strsrch->search->text;
   3624     int32_t textLen   = strsrch->search->textLength;
   3625 
   3626     U_ASSERT(startIndex>=0);
   3627     U_ASSERT(startIndex<=textLen);
   3628 
   3629     if (startIndex >= textLen) {
   3630         return startIndex;
   3631     }
   3632 
   3633     UChar32  c;
   3634     int32_t  i = startIndex;
   3635     U16_NEXT(text, i, textLen, c);
   3636 
   3637     // If we are on a control character, stop without looking for combining marks.
   3638     //    Control characters do not combine.
   3639     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3640     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3641         return i;
   3642     }
   3643 
   3644     // The initial character was not a control, and can thus accept trailing
   3645     //   combining characters.  Advance over however many of them there are.
   3646     int32_t  indexOfLastCharChecked;
   3647     for (;;) {
   3648         indexOfLastCharChecked = i;
   3649         if (i>=textLen) {
   3650             break;
   3651         }
   3652         U16_NEXT(text, i, textLen, c);
   3653         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3654         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3655             break;
   3656         }
   3657     }
   3658     return indexOfLastCharChecked;
   3659 #elif !UCONFIG_NO_BREAK_ITERATION
   3660     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3661 
   3662     if (breakiterator == NULL) {
   3663         breakiterator = strsrch->search->internalBreakIter;
   3664     }
   3665 
   3666     if (breakiterator != NULL) {
   3667         return ubrk_following(breakiterator, startIndex);
   3668     }
   3669 
   3670     return startIndex;
   3671 #else
   3672     // **** or should we use the original code? ****
   3673     return startIndex;
   3674 #endif
   3675 
   3676 }
   3677 
   3678 /*
   3679  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3680  * has an external break iterator, test using that, otherwise test
   3681  * using the internal character break iterator.
   3682  */
   3683 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3684 #if 0
   3685     const UChar *text = strsrch->search->text;
   3686     int32_t textLen   = strsrch->search->textLength;
   3687 
   3688     U_ASSERT(index>=0);
   3689     U_ASSERT(index<=textLen);
   3690 
   3691     if (index>=textLen || index<=0) {
   3692         return TRUE;
   3693     }
   3694 
   3695     // If the character at the current index is not a GRAPHEME_EXTEND
   3696     //    then we can not be within a combining sequence.
   3697     UChar32  c;
   3698     U16_GET(text, 0, index, textLen, c);
   3699     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3700     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3701         return TRUE;
   3702     }
   3703 
   3704     // We are at a combining mark.  If the preceding character is anything
   3705     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3706     U16_PREV(text, 0, index, c);
   3707     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3708     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3709     return !combining;
   3710 #elif !UCONFIG_NO_BREAK_ITERATION
   3711     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3712 
   3713     if (breakiterator == NULL) {
   3714         breakiterator = strsrch->search->internalBreakIter;
   3715     }
   3716 
   3717     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
   3718 #else
   3719     // **** or use the original code? ****
   3720     return TRUE;
   3721 #endif
   3722 }
   3723 
   3724 #if 0
   3725 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3726 {
   3727 #if !UCONFIG_NO_BREAK_ITERATION
   3728     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3729 
   3730     if (breakiterator != NULL) {
   3731         int32_t startindex = ubrk_first(breakiterator);
   3732         int32_t endindex   = ubrk_last(breakiterator);
   3733 
   3734         // out-of-range indexes are never boundary positions
   3735         if (start < startindex || start > endindex ||
   3736             end < startindex || end > endindex) {
   3737             return FALSE;
   3738         }
   3739 
   3740         return ubrk_isBoundary(breakiterator, start) &&
   3741                ubrk_isBoundary(breakiterator, end);
   3742     }
   3743 #endif
   3744 
   3745     return TRUE;
   3746 }
   3747 #endif
   3748 
   3749 typedef enum {
   3750     U_CE_MATCH = -1,
   3751     U_CE_NO_MATCH = 0,
   3752     U_CE_SKIP_TARG,
   3753     U_CE_SKIP_PATN
   3754 } UCompareCEsResult;
   3755 #define U_CE_LEVEL2_BASE 0x00000005
   3756 #define U_CE_LEVEL3_BASE 0x00050000
   3757 
   3758 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
   3759     if (targCE == patCE) {
   3760         return U_CE_MATCH;
   3761     }
   3762     if (compareType == 0) {
   3763         return U_CE_NO_MATCH;
   3764     }
   3765 
   3766     int64_t targCEshifted = targCE >> 32;
   3767     int64_t patCEshifted = patCE >> 32;
   3768     int64_t mask;
   3769 
   3770     mask = 0xFFFF0000;
   3771     int32_t targLev1 = (int32_t)(targCEshifted & mask);
   3772     int32_t patLev1 = (int32_t)(patCEshifted & mask);
   3773     if ( targLev1 != patLev1 ) {
   3774         if ( targLev1 == 0 ) {
   3775             return U_CE_SKIP_TARG;
   3776         }
   3777         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3778             return U_CE_SKIP_PATN;
   3779         }
   3780         return U_CE_NO_MATCH;
   3781     }
   3782 
   3783     mask = 0x0000FFFF;
   3784     int32_t targLev2 = (int32_t)(targCEshifted & mask);
   3785     int32_t patLev2 = (int32_t)(patCEshifted & mask);
   3786     if ( targLev2 != patLev2 ) {
   3787         if ( targLev2 == 0 ) {
   3788             return U_CE_SKIP_TARG;
   3789         }
   3790         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3791             return U_CE_SKIP_PATN;
   3792         }
   3793         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
   3794             U_CE_MATCH: U_CE_NO_MATCH;
   3795     }
   3796 
   3797     mask = 0xFFFF0000;
   3798     int32_t targLev3 = (int32_t)(targCE & mask);
   3799     int32_t patLev3 = (int32_t)(patCE & mask);
   3800     if ( targLev3 != patLev3 ) {
   3801         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
   3802             U_CE_MATCH: U_CE_NO_MATCH;
   3803    }
   3804 
   3805     return U_CE_MATCH;
   3806 }
   3807 
   3808 #if BOYER_MOORE
   3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
   3810 #endif
   3811 
   3812 namespace {
   3813 
   3814 UChar32 codePointAt(const USearch &search, int32_t index) {
   3815     if (index < search.textLength) {
   3816         UChar32 c;
   3817         U16_NEXT(search.text, index, search.textLength, c);
   3818         return c;
   3819     }
   3820     return U_SENTINEL;
   3821 }
   3822 
   3823 UChar32 codePointBefore(const USearch &search, int32_t index) {
   3824     if (0 < index) {
   3825         UChar32 c;
   3826         U16_PREV(search.text, 0, index, c);
   3827         return c;
   3828     }
   3829     return U_SENTINEL;
   3830 }
   3831 
   3832 }  // namespace
   3833 
   3834 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3835                                        int32_t        startIdx,
   3836                                        int32_t        *matchStart,
   3837                                        int32_t        *matchLimit,
   3838                                        UErrorCode     *status)
   3839 {
   3840     if (U_FAILURE(*status)) {
   3841         return FALSE;
   3842     }
   3843 
   3844     // TODO:  reject search patterns beginning with a combining char.
   3845 
   3846 #ifdef USEARCH_DEBUG
   3847     if (getenv("USEARCH_DEBUG") != NULL) {
   3848         printf("Pattern CEs\n");
   3849         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
   3850             printf(" %8x", strsrch->pattern.ces[ii]);
   3851         }
   3852         printf("\n");
   3853     }
   3854 
   3855 #endif
   3856     // Input parameter sanity check.
   3857     //  TODO:  should input indicies clip to the text length
   3858     //         in the same way that UText does.
   3859     if(strsrch->pattern.cesLength == 0         ||
   3860        startIdx < 0                           ||
   3861        startIdx > strsrch->search->textLength ||
   3862        strsrch->pattern.ces == NULL) {
   3863            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3864            return FALSE;
   3865     }
   3866 
   3867     if (strsrch->pattern.pces == NULL) {
   3868         initializePatternPCETable(strsrch, status);
   3869     }
   3870 
   3871     ucol_setOffset(strsrch->textIter, startIdx, status);
   3872     CEIBuffer ceb(strsrch, status);
   3873 
   3874 
   3875     int32_t    targetIx = 0;
   3876     const CEI *targetCEI = NULL;
   3877     int32_t    patIx;
   3878     UBool      found;
   3879 
   3880     int32_t  mStart = -1;
   3881     int32_t  mLimit = -1;
   3882     int32_t  minLimit;
   3883     int32_t  maxLimit;
   3884 
   3885 
   3886 
   3887     // Outer loop moves over match starting positions in the
   3888     //      target CE space.
   3889     // Here we see the target as a sequence of collation elements, resulting from the following:
   3890     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   3891     //    (for example, digraphs such as IJ may be broken into two characters).
   3892     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   3893     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   3894     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   3895     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   3896     //    then the CE is deleted, so the following code sees only CEs that are relevant.
   3897     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   3898     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   3899     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   3900     //
   3901     for(targetIx=0; ; targetIx++)
   3902     {
   3903         found = TRUE;
   3904         //  Inner loop checks for a match beginning at each
   3905         //  position from the outer loop.
   3906         int32_t targetIxOffset = 0;
   3907         int64_t patCE = 0;
   3908         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
   3909         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   3910         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
   3911         const CEI *firstCEI = ceb.get(targetIx);
   3912         if (firstCEI == NULL) {
   3913             *status = U_INTERNAL_PROGRAM_ERROR;
   3914             found = FALSE;
   3915             break;
   3916         }
   3917 
   3918         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
   3919             patCE = strsrch->pattern.pces[patIx];
   3920             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
   3921             //  Compare CE from target string with CE from the pattern.
   3922             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   3923             //    which will fail the compare, below.
   3924             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   3925             if ( ceMatch == U_CE_NO_MATCH ) {
   3926                 found = FALSE;
   3927                 break;
   3928             } else if ( ceMatch > U_CE_NO_MATCH ) {
   3929                 if ( ceMatch == U_CE_SKIP_TARG ) {
   3930                     // redo with same patCE, next targCE
   3931                     patIx--;
   3932                     targetIxOffset++;
   3933                 } else { // ceMatch == U_CE_SKIP_PATN
   3934                     // redo with same targCE, next patCE
   3935                     targetIxOffset--;
   3936                 }
   3937             }
   3938         }
   3939         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
   3940 
   3941         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   3942             // No match at this targetIx.  Try again at the next.
   3943             continue;
   3944         }
   3945 
   3946         if (!found) {
   3947             // No match at all, we have run off the end of the target text.
   3948             break;
   3949         }
   3950 
   3951 
   3952         // We have found a match in CE space.
   3953         // Now determine the bounds in string index space.
   3954         //  There still is a chance of match failure if the CE range not correspond to
   3955         //     an acceptable character range.
   3956         //
   3957         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
   3958 
   3959         mStart   = firstCEI->lowIndex;
   3960         minLimit = lastCEI->lowIndex;
   3961 
   3962         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3963         //   extended to the end of input, and the match is good.
   3964 
   3965         // Look at the high and low indices of the CE following the match. If
   3966         // they are the same it means one of two things:
   3967         //    1. The match extended to the last CE from the target text, which is OK, or
   3968         //    2. The last CE that was part of the match is in an expansion that extends
   3969         //       to the first CE after the match. In this case, we reject the match.
   3970         const CEI *nextCEI = 0;
   3971         if (strsrch->search->elementComparisonType == 0) {
   3972             nextCEI  = ceb.get(targetIx + targetIxOffset);
   3973             maxLimit = nextCEI->lowIndex;
   3974             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3975                 found = FALSE;
   3976             }
   3977         } else {
   3978             for ( ; ; ++targetIxOffset ) {
   3979                 nextCEI = ceb.get(targetIx + targetIxOffset);
   3980                 maxLimit = nextCEI->lowIndex;
   3981                 // If we are at the end of the target too, match succeeds
   3982                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
   3983                     break;
   3984                 }
   3985                 // As long as the next CE has primary weight of 0,
   3986                 // it is part of the last target element matched by the pattern;
   3987                 // make sure it can be part of a match with the last patCE
   3988                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
   3989                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
   3990                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
   3991                         found = FALSE;
   3992                         break;
   3993                     }
   3994                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   3995                 // target element, but it has non-zero primary weight => match fails
   3996                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
   3997                     found = false;
   3998                     break;
   3999                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
   4000                 } else {
   4001                     break;
   4002                 }
   4003             }
   4004         }
   4005 
   4006 
   4007         // Check for the start of the match being within a combining sequence.
   4008         //   This can happen if the pattern itself begins with a combining char, and
   4009         //   the match found combining marks in the target text that were attached
   4010         //    to something else.
   4011         //   This type of match should be rejected for not completely consuming a
   4012         //   combining sequence.
   4013         if (!isBreakBoundary(strsrch, mStart)) {
   4014             found = FALSE;
   4015         }
   4016 
   4017         // Check for the start of the match being within an Collation Element Expansion,
   4018         //   meaning that the first char of the match is only partially matched.
   4019         //   With exapnsions, the first CE will report the index of the source
   4020         //   character, and all subsequent (expansions) CEs will report the source index of the
   4021         //    _following_ character.
   4022         int32_t secondIx = firstCEI->highIndex;
   4023         if (mStart == secondIx) {
   4024             found = FALSE;
   4025         }
   4026 
   4027         // Allow matches to end in the middle of a grapheme cluster if the following
   4028         // conditions are met; this is needed to make prefix search work properly in
   4029         // Indic, see #11750
   4030         // * the default breakIter is being used
   4031         // * the next collation element after this combining sequence
   4032         //   - has non-zero primary weight
   4033         //   - corresponds to a separate character following the one at end of the current match
   4034         //   (the second of these conditions, and perhaps both, may be redundant given the
   4035         //   subsequent check for normalization boundary; however they are likely much faster
   4036         //   tests in any case)
   4037         // * the match limit is a normalization boundary
   4038         UBool allowMidclusterMatch = FALSE;
   4039         if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
   4040             allowMidclusterMatch =
   4041                     strsrch->search->breakIter == NULL &&
   4042                     nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
   4043                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
   4044                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
   4045                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
   4046         }
   4047         // If those conditions are met, then:
   4048         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
   4049         //   the match limit may be backed off to a previous break boundary. This handles
   4050         //   cases in which mLimit includes target characters that are ignorable with current
   4051         //   settings (such as space) and which extend beyond the pattern match.
   4052         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
   4053         // * do NOT require that match limit be on a breakIter boundary
   4054 
   4055         //  Advance the match end position to the first acceptable match boundary.
   4056         //    This advances the index over any combining charcters.
   4057         mLimit = maxLimit;
   4058         if (minLimit < maxLimit) {
   4059             // When the last CE's low index is same with its high index, the CE is likely
   4060             // a part of expansion. In this case, the index is located just after the
   4061             // character corresponding to the CEs compared above. If the index is right
   4062             // at the break boundary, move the position to the next boundary will result
   4063             // incorrect match length when there are ignorable characters exist between
   4064             // the position and the next character produces CE(s). See ticket#8482.
   4065             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
   4066                 mLimit = minLimit;
   4067             } else {
   4068                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4069                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
   4070                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
   4071                 // (i.e. we back off mLimit to the previous breakIterator boundary).
   4072                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
   4073                     mLimit = nba;
   4074                 }
   4075             }
   4076         }
   4077 
   4078     #ifdef USEARCH_DEBUG
   4079         if (getenv("USEARCH_DEBUG") != NULL) {
   4080             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4081         }
   4082     #endif
   4083 
   4084         if (!allowMidclusterMatch) {
   4085             // If advancing to the end of a combining sequence in character indexing space
   4086             //   advanced us beyond the end of the match in CE space, reject this match.
   4087             if (mLimit > maxLimit) {
   4088                 found = FALSE;
   4089             }
   4090 
   4091             if (!isBreakBoundary(strsrch, mLimit)) {
   4092                 found = FALSE;
   4093             }
   4094         }
   4095 
   4096         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4097             found = FALSE;
   4098         }
   4099 
   4100         if (found) {
   4101             break;
   4102         }
   4103     }
   4104 
   4105     #ifdef USEARCH_DEBUG
   4106     if (getenv("USEARCH_DEBUG") != NULL) {
   4107         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4108         int32_t  lastToPrint = ceb.limitIx+2;
   4109         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4110             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4111         }
   4112         printf("\n%s\n", found? "match found" : "no match");
   4113     }
   4114     #endif
   4115 
   4116     // All Done.  Store back the match bounds to the caller.
   4117     //
   4118     if (found==FALSE) {
   4119         mLimit = -1;
   4120         mStart = -1;
   4121     }
   4122 
   4123     if (matchStart != NULL) {
   4124         *matchStart= mStart;
   4125     }
   4126 
   4127     if (matchLimit != NULL) {
   4128         *matchLimit = mLimit;
   4129     }
   4130 
   4131     return found;
   4132 }
   4133 
   4134 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   4135                                                 int32_t        startIdx,
   4136                                                 int32_t        *matchStart,
   4137                                                 int32_t        *matchLimit,
   4138                                                 UErrorCode     *status)
   4139 {
   4140     if (U_FAILURE(*status)) {
   4141         return FALSE;
   4142     }
   4143 
   4144     // TODO:  reject search patterns beginning with a combining char.
   4145 
   4146 #ifdef USEARCH_DEBUG
   4147     if (getenv("USEARCH_DEBUG") != NULL) {
   4148         printf("Pattern CEs\n");
   4149         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
   4150             printf(" %8x", strsrch->pattern.ces[ii]);
   4151         }
   4152         printf("\n");
   4153     }
   4154 
   4155 #endif
   4156     // Input parameter sanity check.
   4157     //  TODO:  should input indicies clip to the text length
   4158     //         in the same way that UText does.
   4159     if(strsrch->pattern.cesLength == 0         ||
   4160        startIdx < 0                           ||
   4161        startIdx > strsrch->search->textLength ||
   4162        strsrch->pattern.ces == NULL) {
   4163            *status = U_ILLEGAL_ARGUMENT_ERROR;
   4164            return FALSE;
   4165     }
   4166 
   4167     if (strsrch->pattern.pces == NULL) {
   4168         initializePatternPCETable(strsrch, status);
   4169     }
   4170 
   4171     CEIBuffer ceb(strsrch, status);
   4172     int32_t    targetIx = 0;
   4173 
   4174     /*
   4175      * Pre-load the buffer with the CE's for the grapheme
   4176      * after our starting position so that we're sure that
   4177      * we can look at the CE following the match when we
   4178      * check the match boundaries.
   4179      *
   4180      * This will also pre-fetch the first CE that we'll
   4181      * consider for the match.
   4182      */
   4183     if (startIdx < strsrch->search->textLength) {
   4184         UBreakIterator *bi = strsrch->search->internalBreakIter;
   4185         int32_t next = ubrk_following(bi, startIdx);
   4186 
   4187         ucol_setOffset(strsrch->textIter, next, status);
   4188 
   4189         for (targetIx = 0; ; targetIx += 1) {
   4190             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   4191                 break;
   4192             }
   4193         }
   4194     } else {
   4195         ucol_setOffset(strsrch->textIter, startIdx, status);
   4196     }
   4197 
   4198 
   4199     const CEI *targetCEI = NULL;
   4200     int32_t    patIx;
   4201     UBool      found;
   4202 
   4203     int32_t  limitIx = targetIx;
   4204     int32_t  mStart = -1;
   4205     int32_t  mLimit = -1;
   4206     int32_t  minLimit;
   4207     int32_t  maxLimit;
   4208 
   4209 
   4210 
   4211     // Outer loop moves over match starting positions in the
   4212     //      target CE space.
   4213     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   4214     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   4215     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   4216     // and the beginning of the base text.
   4217     for(targetIx = limitIx; ; targetIx += 1)
   4218     {
   4219         found = TRUE;
   4220         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
   4221         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   4222         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
   4223         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4224         if (lastCEI == NULL) {
   4225             *status = U_INTERNAL_PROGRAM_ERROR;
   4226             found = FALSE;
   4227              break;
   4228         }
   4229         //  Inner loop checks for a match beginning at each
   4230         //  position from the outer loop.
   4231         int32_t targetIxOffset = 0;
   4232         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
   4233             int64_t patCE = strsrch->pattern.pces[patIx];
   4234 
   4235             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
   4236             //  Compare CE from target string with CE from the pattern.
   4237             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4238             //    which will fail the compare, below.
   4239             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   4240             if ( ceMatch == U_CE_NO_MATCH ) {
   4241                 found = FALSE;
   4242                 break;
   4243             } else if ( ceMatch > U_CE_NO_MATCH ) {
   4244                 if ( ceMatch == U_CE_SKIP_TARG ) {
   4245                     // redo with same patCE, next targCE
   4246                     patIx++;
   4247                     targetIxOffset++;
   4248                 } else { // ceMatch == U_CE_SKIP_PATN
   4249                     // redo with same targCE, next patCE
   4250                     targetIxOffset--;
   4251                 }
   4252             }
   4253         }
   4254 
   4255         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   4256             // No match at this targetIx.  Try again at the next.
   4257             continue;
   4258         }
   4259 
   4260         if (!found) {
   4261             // No match at all, we have run off the end of the target text.
   4262             break;
   4263         }
   4264 
   4265 
   4266         // We have found a match in CE space.
   4267         // Now determine the bounds in string index space.
   4268         //  There still is a chance of match failure if the CE range not correspond to
   4269         //     an acceptable character range.
   4270         //
   4271         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
   4272         mStart   = firstCEI->lowIndex;
   4273 
   4274         // Check for the start of the match being within a combining sequence.
   4275         //   This can happen if the pattern itself begins with a combining char, and
   4276         //   the match found combining marks in the target text that were attached
   4277         //    to something else.
   4278         //   This type of match should be rejected for not completely consuming a
   4279         //   combining sequence.
   4280         if (!isBreakBoundary(strsrch, mStart)) {
   4281             found = FALSE;
   4282         }
   4283 
   4284         // Look at the high index of the first CE in the match. If it's the same as the
   4285         // low index, the first CE in the match is in the middle of an expansion.
   4286         if (mStart == firstCEI->highIndex) {
   4287             found = FALSE;
   4288         }
   4289 
   4290 
   4291         minLimit = lastCEI->lowIndex;
   4292 
   4293         if (targetIx > 0) {
   4294             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4295             //   extended to the end of input, and the match is good.
   4296 
   4297             // Look at the high and low indices of the CE following the match. If
   4298             // they are the same it means one of two things:
   4299             //    1. The match extended to the last CE from the target text, which is OK, or
   4300             //    2. The last CE that was part of the match is in an expansion that extends
   4301             //       to the first CE after the match. In this case, we reject the match.
   4302             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
   4303 
   4304             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4305                 found = FALSE;
   4306             }
   4307 
   4308             mLimit = maxLimit = nextCEI->lowIndex;
   4309 
   4310             // Allow matches to end in the middle of a grapheme cluster if the following
   4311             // conditions are met; this is needed to make prefix search work properly in
   4312             // Indic, see #11750
   4313             // * the default breakIter is being used
   4314             // * the next collation element after this combining sequence
   4315             //   - has non-zero primary weight
   4316             //   - corresponds to a separate character following the one at end of the current match
   4317             //   (the second of these conditions, and perhaps both, may be redundant given the
   4318             //   subsequent check for normalization boundary; however they are likely much faster
   4319             //   tests in any case)
   4320             // * the match limit is a normalization boundary
   4321             UBool allowMidclusterMatch = FALSE;
   4322             if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
   4323                 allowMidclusterMatch =
   4324                         strsrch->search->breakIter == NULL &&
   4325                         nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
   4326                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
   4327                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
   4328                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
   4329             }
   4330             // If those conditions are met, then:
   4331             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
   4332             //   the match limit may be backed off to a previous break boundary. This handles
   4333             //   cases in which mLimit includes target characters that are ignorable with current
   4334             //   settings (such as space) and which extend beyond the pattern match.
   4335             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
   4336             // * do NOT require that match limit be on a breakIter boundary
   4337 
   4338             //  Advance the match end position to the first acceptable match boundary.
   4339             //    This advances the index over any combining characters.
   4340             if (minLimit < maxLimit) {
   4341                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4342                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
   4343                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
   4344                 // (i.e. we back off mLimit to the previous breakIterator boundary).
   4345                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
   4346                     mLimit = nba;
   4347                 }
   4348             }
   4349 
   4350             if (!allowMidclusterMatch) {
   4351                 // If advancing to the end of a combining sequence in character indexing space
   4352                 //   advanced us beyond the end of the match in CE space, reject this match.
   4353                 if (mLimit > maxLimit) {
   4354                     found = FALSE;
   4355                 }
   4356 
   4357                 // Make sure the end of the match is on a break boundary
   4358                 if (!isBreakBoundary(strsrch, mLimit)) {
   4359                     found = FALSE;
   4360                 }
   4361             }
   4362 
   4363         } else {
   4364             // No non-ignorable CEs after this point.
   4365             // The maximum position is detected by boundary after
   4366             // the last non-ignorable CE. Combining sequence
   4367             // across the start index will be truncated.
   4368             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4369             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
   4370         }
   4371 
   4372     #ifdef USEARCH_DEBUG
   4373         if (getenv("USEARCH_DEBUG") != NULL) {
   4374             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4375         }
   4376     #endif
   4377 
   4378 
   4379         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4380             found = FALSE;
   4381         }
   4382 
   4383         if (found) {
   4384             break;
   4385         }
   4386     }
   4387 
   4388     #ifdef USEARCH_DEBUG
   4389     if (getenv("USEARCH_DEBUG") != NULL) {
   4390         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4391         int32_t  lastToPrint = ceb.limitIx+2;
   4392         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4393             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4394         }
   4395         printf("\n%s\n", found? "match found" : "no match");
   4396     }
   4397     #endif
   4398 
   4399     // All Done.  Store back the match bounds to the caller.
   4400     //
   4401     if (found==FALSE) {
   4402         mLimit = -1;
   4403         mStart = -1;
   4404     }
   4405 
   4406     if (matchStart != NULL) {
   4407         *matchStart= mStart;
   4408     }
   4409 
   4410     if (matchLimit != NULL) {
   4411         *matchLimit = mLimit;
   4412     }
   4413 
   4414     return found;
   4415 }
   4416 
   4417 // internal use methods declared in usrchimp.h -----------------------------
   4418 
   4419 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4420 {
   4421     if (U_FAILURE(*status)) {
   4422         setMatchNotFound(strsrch);
   4423         return FALSE;
   4424     }
   4425 
   4426 #if BOYER_MOORE
   4427     UCollationElements *coleiter        = strsrch->textIter;
   4428     int32_t             textlength      = strsrch->search->textLength;
   4429     int32_t            *patternce       = strsrch->pattern.ces;
   4430     int32_t             patterncelength = strsrch->pattern.cesLength;
   4431     int32_t             textoffset      = ucol_getOffset(coleiter);
   4432 
   4433     // status used in setting coleiter offset, since offset is checked in
   4434     // shiftForward before setting the coleiter offset, status never
   4435     // a failure
   4436     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4437                               patterncelength);
   4438     while (textoffset <= textlength)
   4439     {
   4440         uint32_t    patternceindex = patterncelength - 1;
   4441         int32_t     targetce;
   4442         UBool       found          = FALSE;
   4443         int32_t    lastce          = UCOL_NULLORDER;
   4444 
   4445         setColEIterOffset(coleiter, textoffset);
   4446 
   4447         for (;;) {
   4448             // finding the last pattern ce match, imagine composite characters
   4449             // for example: search for pattern A in text \u00C0
   4450             // we'll have to skip \u0300 the grave first before we get to A
   4451             targetce = ucol_previous(coleiter, status);
   4452             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4453                 found = FALSE;
   4454                 break;
   4455             }
   4456             targetce = getCE(strsrch, targetce);
   4457             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4458                 // this is for the text \u0315\u0300 that requires
   4459                 // normalization and pattern \u0300, where \u0315 is ignorable
   4460                 continue;
   4461             }
   4462             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4463                 lastce = targetce;
   4464             }
   4465             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4466             if (targetce == patternce[patternceindex]) {
   4467                 // the first ce can be a contraction
   4468                 found = TRUE;
   4469                 break;
   4470             }
   4471             if (!hasExpansion(coleiter)) {
   4472                 found = FALSE;
   4473                 break;
   4474             }
   4475         }
   4476 
   4477         //targetce = lastce;
   4478 
   4479         while (found && patternceindex > 0) {
   4480             lastce = targetce;
   4481             targetce    = ucol_previous(coleiter, status);
   4482             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4483                 found = FALSE;
   4484                 break;
   4485             }
   4486             targetce    = getCE(strsrch, targetce);
   4487             if (targetce == UCOL_IGNORABLE) {
   4488                 continue;
   4489             }
   4490 
   4491             patternceindex --;
   4492             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4493             found = found && targetce == patternce[patternceindex];
   4494         }
   4495 
   4496         targetce = lastce;
   4497 
   4498         if (!found) {
   4499             if (U_FAILURE(*status)) {
   4500                 break;
   4501             }
   4502             textoffset = shiftForward(strsrch, textoffset, lastce,
   4503                                       patternceindex);
   4504             // status checked at loop.
   4505             patternceindex = patterncelength;
   4506             continue;
   4507         }
   4508 
   4509         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4510             // status checked in ucol_setOffset
   4511             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4512             return TRUE;
   4513         }
   4514     }
   4515     setMatchNotFound(strsrch);
   4516     return FALSE;
   4517 #else
   4518     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4519     int32_t start = -1;
   4520     int32_t end = -1;
   4521 
   4522     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4523         strsrch->search->matchedIndex  = start;
   4524         strsrch->search->matchedLength = end - start;
   4525         return TRUE;
   4526     } else {
   4527         setMatchNotFound(strsrch);
   4528         return FALSE;
   4529     }
   4530 #endif
   4531 }
   4532 
   4533 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4534 {
   4535     if (U_FAILURE(*status)) {
   4536         setMatchNotFound(strsrch);
   4537         return FALSE;
   4538     }
   4539 
   4540 #if BOYER_MOORE
   4541     UCollationElements *coleiter        = strsrch->textIter;
   4542     int32_t             textlength      = strsrch->search->textLength;
   4543     int32_t            *patternce       = strsrch->pattern.ces;
   4544     int32_t             patterncelength = strsrch->pattern.cesLength;
   4545     int32_t             textoffset      = ucol_getOffset(coleiter);
   4546     UBool               hasPatternAccents =
   4547        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4548 
   4549     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4550                               patterncelength);
   4551     strsrch->canonicalPrefixAccents[0] = 0;
   4552     strsrch->canonicalSuffixAccents[0] = 0;
   4553 
   4554     while (textoffset <= textlength)
   4555     {
   4556         int32_t     patternceindex = patterncelength - 1;
   4557         int32_t     targetce;
   4558         UBool       found          = FALSE;
   4559         int32_t     lastce         = UCOL_NULLORDER;
   4560 
   4561         setColEIterOffset(coleiter, textoffset);
   4562 
   4563         for (;;) {
   4564             // finding the last pattern ce match, imagine composite characters
   4565             // for example: search for pattern A in text \u00C0
   4566             // we'll have to skip \u0300 the grave first before we get to A
   4567             targetce = ucol_previous(coleiter, status);
   4568             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4569                 found = FALSE;
   4570                 break;
   4571             }
   4572             targetce = getCE(strsrch, targetce);
   4573             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4574                 lastce = targetce;
   4575             }
   4576             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4577             if (targetce == patternce[patternceindex]) {
   4578                 // the first ce can be a contraction
   4579                 found = TRUE;
   4580                 break;
   4581             }
   4582             if (!hasExpansion(coleiter)) {
   4583                 found = FALSE;
   4584                 break;
   4585             }
   4586         }
   4587 
   4588         while (found && patternceindex > 0) {
   4589             targetce    = ucol_previous(coleiter, status);
   4590             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4591                 found = FALSE;
   4592                 break;
   4593             }
   4594             targetce    = getCE(strsrch, targetce);
   4595             if (targetce == UCOL_IGNORABLE) {
   4596                 continue;
   4597             }
   4598 
   4599             patternceindex --;
   4600             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4601             found = found && targetce == patternce[patternceindex];
   4602         }
   4603 
   4604         // initializing the rearranged accent array
   4605         if (hasPatternAccents && !found) {
   4606             strsrch->canonicalPrefixAccents[0] = 0;
   4607             strsrch->canonicalSuffixAccents[0] = 0;
   4608             if (U_FAILURE(*status)) {
   4609                 break;
   4610             }
   4611             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4612         }
   4613 
   4614         if (!found) {
   4615             if (U_FAILURE(*status)) {
   4616                 break;
   4617             }
   4618             textoffset = shiftForward(strsrch, textoffset, lastce,
   4619                                       patternceindex);
   4620             // status checked at loop
   4621             patternceindex = patterncelength;
   4622             continue;
   4623         }
   4624 
   4625         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4626             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4627             return TRUE;
   4628         }
   4629     }
   4630     setMatchNotFound(strsrch);
   4631     return FALSE;
   4632 #else
   4633     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4634     int32_t start = -1;
   4635     int32_t end = -1;
   4636 
   4637     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4638         strsrch->search->matchedIndex  = start;
   4639         strsrch->search->matchedLength = end - start;
   4640         return TRUE;
   4641     } else {
   4642         setMatchNotFound(strsrch);
   4643         return FALSE;
   4644     }
   4645 #endif
   4646 }
   4647 
   4648 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4649 {
   4650     if (U_FAILURE(*status)) {
   4651         setMatchNotFound(strsrch);
   4652         return FALSE;
   4653     }
   4654 
   4655 #if BOYER_MOORE
   4656     UCollationElements *coleiter        = strsrch->textIter;
   4657     int32_t            *patternce       = strsrch->pattern.ces;
   4658     int32_t             patterncelength = strsrch->pattern.cesLength;
   4659     int32_t             textoffset      = ucol_getOffset(coleiter);
   4660 
   4661     // shifting it check for setting offset
   4662     // if setOffset is called previously or there was no previous match, we
   4663     // leave the offset as it is.
   4664     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4665         textoffset = strsrch->search->matchedIndex;
   4666     }
   4667 
   4668     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4669                               patterncelength);
   4670 
   4671     while (textoffset >= 0)
   4672     {
   4673         int32_t     patternceindex = 1;
   4674         int32_t     targetce;
   4675         UBool       found          = FALSE;
   4676         int32_t     firstce        = UCOL_NULLORDER;
   4677 
   4678         // if status is a failure, ucol_setOffset does nothing
   4679         setColEIterOffset(coleiter, textoffset);
   4680 
   4681         for (;;) {
   4682             // finding the first pattern ce match, imagine composite
   4683             // characters. for example: search for pattern \u0300 in text
   4684             // \u00C0, we'll have to skip A first before we get to
   4685             // \u0300 the grave accent
   4686             targetce = ucol_next(coleiter, status);
   4687             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4688                 found = FALSE;
   4689                 break;
   4690             }
   4691             targetce = getCE(strsrch, targetce);
   4692             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4693                 firstce = targetce;
   4694             }
   4695             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4696                 continue;
   4697             }
   4698             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4699             if (targetce == patternce[0]) {
   4700                 found = TRUE;
   4701                 break;
   4702             }
   4703             if (!hasExpansion(coleiter)) {
   4704                 // checking for accents in composite character
   4705                 found = FALSE;
   4706                 break;
   4707             }
   4708         }
   4709 
   4710         //targetce = firstce;
   4711 
   4712         while (found && (patternceindex < patterncelength)) {
   4713             firstce = targetce;
   4714             targetce    = ucol_next(coleiter, status);
   4715             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4716                 found = FALSE;
   4717                 break;
   4718             }
   4719             targetce    = getCE(strsrch, targetce);
   4720             if (targetce == UCOL_IGNORABLE) {
   4721                 continue;
   4722             }
   4723 
   4724             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4725             found = found && targetce == patternce[patternceindex];
   4726             patternceindex ++;
   4727         }
   4728 
   4729         targetce = firstce;
   4730 
   4731         if (!found) {
   4732             if (U_FAILURE(*status)) {
   4733                 break;
   4734             }
   4735 
   4736             textoffset = reverseShift(strsrch, textoffset, targetce,
   4737                                       patternceindex);
   4738             patternceindex = 0;
   4739             continue;
   4740         }
   4741 
   4742         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4743             setColEIterOffset(coleiter, textoffset);
   4744             return TRUE;
   4745         }
   4746     }
   4747     setMatchNotFound(strsrch);
   4748     return FALSE;
   4749 #else
   4750     int32_t textOffset;
   4751 
   4752     if (strsrch->search->isOverlap) {
   4753         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4754             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4755         } else {
   4756             // move the start position at the end of possible match
   4757             initializePatternPCETable(strsrch, status);
   4758             if (!initTextProcessedIter(strsrch, status)) {
   4759                 setMatchNotFound(strsrch);
   4760                 return FALSE;
   4761             }
   4762             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
   4763                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4764                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4765                     // at the end of the text
   4766                     break;
   4767                 }
   4768             }
   4769             if (U_FAILURE(*status)) {
   4770                 setMatchNotFound(strsrch);
   4771                 return FALSE;
   4772             }
   4773             textOffset = ucol_getOffset(strsrch->textIter);
   4774         }
   4775     } else {
   4776         textOffset = ucol_getOffset(strsrch->textIter);
   4777     }
   4778 
   4779     int32_t start = -1;
   4780     int32_t end = -1;
   4781 
   4782     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4783         strsrch->search->matchedIndex = start;
   4784         strsrch->search->matchedLength = end - start;
   4785         return TRUE;
   4786     } else {
   4787         setMatchNotFound(strsrch);
   4788         return FALSE;
   4789     }
   4790 #endif
   4791 }
   4792 
   4793 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4794                                       UErrorCode    *status)
   4795 {
   4796     if (U_FAILURE(*status)) {
   4797         setMatchNotFound(strsrch);
   4798         return FALSE;
   4799     }
   4800 
   4801 #if BOYER_MOORE
   4802     UCollationElements *coleiter        = strsrch->textIter;
   4803     int32_t            *patternce       = strsrch->pattern.ces;
   4804     int32_t             patterncelength = strsrch->pattern.cesLength;
   4805     int32_t             textoffset      = ucol_getOffset(coleiter);
   4806     UBool               hasPatternAccents =
   4807        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4808 
   4809     // shifting it check for setting offset
   4810     // if setOffset is called previously or there was no previous match, we
   4811     // leave the offset as it is.
   4812     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4813         textoffset = strsrch->search->matchedIndex;
   4814     }
   4815 
   4816     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4817                               patterncelength);
   4818     strsrch->canonicalPrefixAccents[0] = 0;
   4819     strsrch->canonicalSuffixAccents[0] = 0;
   4820 
   4821     while (textoffset >= 0)
   4822     {
   4823         int32_t     patternceindex = 1;
   4824         int32_t     targetce;
   4825         UBool       found          = FALSE;
   4826         int32_t     firstce        = UCOL_NULLORDER;
   4827 
   4828         setColEIterOffset(coleiter, textoffset);
   4829         for (;;) {
   4830             // finding the first pattern ce match, imagine composite
   4831             // characters. for example: search for pattern \u0300 in text
   4832             // \u00C0, we'll have to skip A first before we get to
   4833             // \u0300 the grave accent
   4834             targetce = ucol_next(coleiter, status);
   4835             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4836                 found = FALSE;
   4837                 break;
   4838             }
   4839             targetce = getCE(strsrch, targetce);
   4840             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4841                 firstce = targetce;
   4842             }
   4843 
   4844             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4845             if (targetce == patternce[0]) {
   4846                 // the first ce can be a contraction
   4847                 found = TRUE;
   4848                 break;
   4849             }
   4850             if (!hasExpansion(coleiter)) {
   4851                 // checking for accents in composite character
   4852                 found = FALSE;
   4853                 break;
   4854             }
   4855         }
   4856 
   4857         targetce = firstce;
   4858 
   4859         while (found && patternceindex < patterncelength) {
   4860             targetce    = ucol_next(coleiter, status);
   4861             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4862                 found = FALSE;
   4863                 break;
   4864             }
   4865             targetce = getCE(strsrch, targetce);
   4866             if (targetce == UCOL_IGNORABLE) {
   4867                 continue;
   4868             }
   4869 
   4870             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4871             found = found && targetce == patternce[patternceindex];
   4872             patternceindex ++;
   4873         }
   4874 
   4875         // initializing the rearranged accent array
   4876         if (hasPatternAccents && !found) {
   4877             strsrch->canonicalPrefixAccents[0] = 0;
   4878             strsrch->canonicalSuffixAccents[0] = 0;
   4879             if (U_FAILURE(*status)) {
   4880                 break;
   4881             }
   4882             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4883         }
   4884 
   4885         if (!found) {
   4886             if (U_FAILURE(*status)) {
   4887                 break;
   4888             }
   4889             textoffset = reverseShift(strsrch, textoffset, targetce,
   4890                                       patternceindex);
   4891             patternceindex = 0;
   4892             continue;
   4893         }
   4894 
   4895         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4896             setColEIterOffset(coleiter, textoffset);
   4897             return TRUE;
   4898         }
   4899     }
   4900     setMatchNotFound(strsrch);
   4901     return FALSE;
   4902 #else
   4903     int32_t textOffset;
   4904 
   4905     if (strsrch->search->isOverlap) {
   4906         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4907             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4908         } else {
   4909             // move the start position at the end of possible match
   4910             initializePatternPCETable(strsrch, status);
   4911             if (!initTextProcessedIter(strsrch, status)) {
   4912                 setMatchNotFound(strsrch);
   4913                 return FALSE;
   4914             }
   4915             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
   4916                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4917                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4918                     // at the end of the text
   4919                     break;
   4920                 }
   4921             }
   4922             if (U_FAILURE(*status)) {
   4923                 setMatchNotFound(strsrch);
   4924                 return FALSE;
   4925             }
   4926             textOffset = ucol_getOffset(strsrch->textIter);
   4927         }
   4928     } else {
   4929         textOffset = ucol_getOffset(strsrch->textIter);
   4930     }
   4931 
   4932     int32_t start = -1;
   4933     int32_t end = -1;
   4934 
   4935     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4936         strsrch->search->matchedIndex = start;
   4937         strsrch->search->matchedLength = end - start;
   4938         return TRUE;
   4939     } else {
   4940         setMatchNotFound(strsrch);
   4941         return FALSE;
   4942     }
   4943 #endif
   4944 }
   4945 
   4946 #endif /* #if !UCONFIG_NO_COLLATION */
   4947