Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *  07/02/2001   synwee      Creation.
      9 **********************************************************************
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     15 
     16 #include "unicode/usearch.h"
     17 #include "unicode/ustring.h"
     18 #include "unicode/uchar.h"
     19 #include "unicode/utf16.h"
     20 #include "normalizer2impl.h"
     21 #include "usrchimp.h"
     22 #include "cmemory.h"
     23 #include "ucln_in.h"
     24 #include "uassert.h"
     25 #include "ustr_imp.h"
     26 
     27 U_NAMESPACE_USE
     28 
     29 // don't use Boyer-Moore
     30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
     31 #define BOYER_MOORE 0
     32 
     33 // internal definition ---------------------------------------------------
     34 
     35 #define LAST_BYTE_MASK_          0xFF
     36 #define SECOND_LAST_BYTE_SHIFT_  8
     37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     38 
     39 static const Normalizer2Impl *g_nfcImpl = NULL;
     40 
     41 // internal methods -------------------------------------------------
     42 
     43 /**
     44 * Fast collation element iterator setOffset.
     45 * This function does not check for bounds.
     46 * @param coleiter collation element iterator
     47 * @param offset to set
     48 */
     49 static
     50 inline void setColEIterOffset(UCollationElements *elems,
     51                       int32_t             offset)
     52 {
     53     // Note: Not "fast" any more after the 2013 collation rewrite.
     54     // We do not want to expose more internals than necessary.
     55     UErrorCode status = U_ZERO_ERROR;
     56     ucol_setOffset(elems, offset, &status);
     57 }
     58 
     59 /**
     60 * Getting the mask for collation strength
     61 * @param strength collation strength
     62 * @return collation element mask
     63 */
     64 static
     65 inline uint32_t getMask(UCollationStrength strength)
     66 {
     67     switch (strength)
     68     {
     69     case UCOL_PRIMARY:
     70         return UCOL_PRIMARYORDERMASK;
     71     case UCOL_SECONDARY:
     72         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     73     default:
     74         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     75                UCOL_PRIMARYORDERMASK;
     76     }
     77 }
     78 
     79 /**
     80 * @param ce 32-bit collation element
     81 * @return hash code
     82 */
     83 static
     84 inline int hashFromCE32(uint32_t ce)
     85 {
     86     int hc = (int)(
     87             ((((((ce >> 24) * 37) +
     88             (ce >> 16)) * 37) +
     89             (ce >> 8)) * 37) +
     90             ce);
     91     hc %= MAX_TABLE_SIZE_;
     92     if (hc < 0) {
     93         hc += MAX_TABLE_SIZE_;
     94     }
     95     return hc;
     96 }
     97 
     98 U_CDECL_BEGIN
     99 static UBool U_CALLCONV
    100 usearch_cleanup(void) {
    101     g_nfcImpl = NULL;
    102     return TRUE;
    103 }
    104 U_CDECL_END
    105 
    106 /**
    107 * Initializing the fcd tables.
    108 * Internal method, status assumed to be a success.
    109 * @param status output error if any, caller to check status before calling
    110 *               method, status assumed to be success when passed in.
    111 */
    112 static
    113 inline void initializeFCD(UErrorCode *status)
    114 {
    115     if (g_nfcImpl == NULL) {
    116         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    117         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    118     }
    119 }
    120 
    121 /**
    122 * Gets the fcd value for a character at the argument index.
    123 * This method takes into accounts of the supplementary characters.
    124 * @param str UTF16 string where character for fcd retrieval resides
    125 * @param offset position of the character whose fcd is to be retrieved, to be
    126 *               overwritten with the next character position, taking
    127 *               surrogate characters into consideration.
    128 * @param strlength length of the argument string
    129 * @return fcd value
    130 */
    131 static
    132 uint16_t getFCD(const UChar   *str, int32_t *offset,
    133                              int32_t  strlength)
    134 {
    135     const UChar *temp = str + *offset;
    136     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
    137     *offset = (int32_t)(temp - str);
    138     return result;
    139 }
    140 
    141 /**
    142 * Getting the modified collation elements taking into account the collation
    143 * attributes
    144 * @param strsrch string search data
    145 * @param sourcece
    146 * @return the modified collation element
    147 */
    148 static
    149 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    150 {
    151     // note for tertiary we can't use the collator->tertiaryMask, that
    152     // is a preprocessed mask that takes into account case options. since
    153     // we are only concerned with exact matches, we don't need that.
    154     sourcece &= strsrch->ceMask;
    155 
    156     if (strsrch->toShift) {
    157         // alternate handling here, since only the 16 most significant digits
    158         // is only used, we can safely do a compare without masking
    159         // if the ce is a variable, we mask and get only the primary values
    160         // no shifting to quartenary is required since all primary values
    161         // less than variabletop will need to be masked off anyway.
    162         if (strsrch->variableTop > sourcece) {
    163             if (strsrch->strength >= UCOL_QUATERNARY) {
    164                 sourcece &= UCOL_PRIMARYORDERMASK;
    165             }
    166             else {
    167                 sourcece = UCOL_IGNORABLE;
    168             }
    169         }
    170     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    171         sourcece = 0xFFFF;
    172     }
    173 
    174     return sourcece;
    175 }
    176 
    177 /**
    178 * Allocate a memory and returns NULL if it failed.
    179 * Internal method, status assumed to be a success.
    180 * @param size to allocate
    181 * @param status output error if any, caller to check status before calling
    182 *               method, status assumed to be success when passed in.
    183 * @return newly allocated array, NULL otherwise
    184 */
    185 static
    186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    187 {
    188     uint32_t *result = (uint32_t *)uprv_malloc(size);
    189     if (result == NULL) {
    190         *status = U_MEMORY_ALLOCATION_ERROR;
    191     }
    192     return result;
    193 }
    194 
    195 /**
    196 * Adds a uint32_t value to a destination array.
    197 * Creates a new array if we run out of space. The caller will have to
    198 * manually deallocate the newly allocated array.
    199 * Internal method, status assumed to be success, caller has to check status
    200 * before calling this method. destination not to be NULL and has at least
    201 * size destinationlength.
    202 * @param destination target array
    203 * @param offset destination offset to add value
    204 * @param destinationlength target array size, return value for the new size
    205 * @param value to be added
    206 * @param increments incremental size expected
    207 * @param status output error if any, caller to check status before calling
    208 *               method, status assumed to be success when passed in.
    209 * @return new destination array, destination if there was no new allocation
    210 */
    211 static
    212 inline int32_t * addTouint32_tArray(int32_t    *destination,
    213                                     uint32_t    offset,
    214                                     uint32_t   *destinationlength,
    215                                     uint32_t    value,
    216                                     uint32_t    increments,
    217                                     UErrorCode *status)
    218 {
    219     uint32_t newlength = *destinationlength;
    220     if (offset + 1 == newlength) {
    221         newlength += increments;
    222         int32_t *temp = (int32_t *)allocateMemory(
    223                                          sizeof(int32_t) * newlength, status);
    224         if (U_FAILURE(*status)) {
    225             return NULL;
    226         }
    227         uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
    228         *destinationlength = newlength;
    229         destination        = temp;
    230     }
    231     destination[offset] = value;
    232     return destination;
    233 }
    234 
    235 /**
    236 * Adds a uint64_t value to a destination array.
    237 * Creates a new array if we run out of space. The caller will have to
    238 * manually deallocate the newly allocated array.
    239 * Internal method, status assumed to be success, caller has to check status
    240 * before calling this method. destination not to be NULL and has at least
    241 * size destinationlength.
    242 * @param destination target array
    243 * @param offset destination offset to add value
    244 * @param destinationlength target array size, return value for the new size
    245 * @param value to be added
    246 * @param increments incremental size expected
    247 * @param status output error if any, caller to check status before calling
    248 *               method, status assumed to be success when passed in.
    249 * @return new destination array, destination if there was no new allocation
    250 */
    251 static
    252 inline int64_t * addTouint64_tArray(int64_t    *destination,
    253                                     uint32_t    offset,
    254                                     uint32_t   *destinationlength,
    255                                     uint64_t    value,
    256                                     uint32_t    increments,
    257                                     UErrorCode *status)
    258 {
    259     uint32_t newlength = *destinationlength;
    260     if (offset + 1 == newlength) {
    261         newlength += increments;
    262         int64_t *temp = (int64_t *)allocateMemory(
    263                                          sizeof(int64_t) * newlength, status);
    264 
    265         if (U_FAILURE(*status)) {
    266             return NULL;
    267         }
    268 
    269         uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
    270         *destinationlength = newlength;
    271         destination        = temp;
    272     }
    273 
    274     destination[offset] = value;
    275 
    276     return destination;
    277 }
    278 
    279 /**
    280 * Initializing the ce table for a pattern.
    281 * Stores non-ignorable collation keys.
    282 * Table size will be estimated by the size of the pattern text. Table
    283 * expansion will be perform as we go along. Adding 1 to ensure that the table
    284 * size definitely increases.
    285 * Internal method, status assumed to be a success.
    286 * @param strsrch string search data
    287 * @param status output error if any, caller to check status before calling
    288 *               method, status assumed to be success when passed in.
    289 * @return total number of expansions
    290 */
    291 static
    292 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    293                                          UErrorCode    *status)
    294 {
    295     UPattern *pattern            = &(strsrch->pattern);
    296     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    297     int32_t  *cetable            = pattern->cesBuffer;
    298     uint32_t  patternlength      = pattern->textLength;
    299     UCollationElements *coleiter = strsrch->utilIter;
    300 
    301     if (coleiter == NULL) {
    302         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    303                                      patternlength, status);
    304         // status will be checked in ucol_next(..) later and if it is an
    305         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    306         // returned.
    307         strsrch->utilIter = coleiter;
    308     }
    309     else {
    310         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    311     }
    312     if(U_FAILURE(*status)) {
    313         return 0;
    314     }
    315 
    316     if (pattern->ces != cetable && pattern->ces) {
    317         uprv_free(pattern->ces);
    318     }
    319 
    320     uint16_t  offset      = 0;
    321     uint16_t  result      = 0;
    322     int32_t   ce;
    323 
    324     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    325            U_SUCCESS(*status)) {
    326         uint32_t newce = getCE(strsrch, ce);
    327         if (newce) {
    328             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    329                                   newce,
    330                                   patternlength - ucol_getOffset(coleiter) + 1,
    331                                   status);
    332             if (U_FAILURE(*status)) {
    333                 return 0;
    334             }
    335             offset ++;
    336             if (cetable != temp && cetable != pattern->cesBuffer) {
    337                 uprv_free(cetable);
    338             }
    339             cetable = temp;
    340         }
    341         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    342     }
    343 
    344     cetable[offset]   = 0;
    345     pattern->ces       = cetable;
    346     pattern->cesLength = offset;
    347 
    348     return result;
    349 }
    350 
    351 /**
    352 * Initializing the pce table for a pattern.
    353 * Stores non-ignorable collation keys.
    354 * Table size will be estimated by the size of the pattern text. Table
    355 * expansion will be perform as we go along. Adding 1 to ensure that the table
    356 * size definitely increases.
    357 * Internal method, status assumed to be a success.
    358 * @param strsrch string search data
    359 * @param status output error if any, caller to check status before calling
    360 *               method, status assumed to be success when passed in.
    361 * @return total number of expansions
    362 */
    363 static
    364 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    365                                           UErrorCode    *status)
    366 {
    367     UPattern *pattern            = &(strsrch->pattern);
    368     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    369     int64_t  *pcetable           = pattern->pcesBuffer;
    370     uint32_t  patternlength      = pattern->textLength;
    371     UCollationElements *coleiter = strsrch->utilIter;
    372 
    373     if (coleiter == NULL) {
    374         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    375                                      patternlength, status);
    376         // status will be checked in ucol_next(..) later and if it is an
    377         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    378         // returned.
    379         strsrch->utilIter = coleiter;
    380     } else {
    381         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
    382     }
    383     if(U_FAILURE(*status)) {
    384         return 0;
    385     }
    386 
    387     if (pattern->pces != pcetable && pattern->pces != NULL) {
    388         uprv_free(pattern->pces);
    389     }
    390 
    391     uint16_t  offset = 0;
    392     uint16_t  result = 0;
    393     int64_t   pce;
    394 
    395     icu::UCollationPCE iter(coleiter);
    396 
    397     // ** Should processed CEs be signed or unsigned?
    398     // ** (the rest of the code in this file seems to play fast-and-loose with
    399     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    400     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    401            U_SUCCESS(*status)) {
    402         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    403                               pce,
    404                               patternlength - ucol_getOffset(coleiter) + 1,
    405                               status);
    406 
    407         if (U_FAILURE(*status)) {
    408             return 0;
    409         }
    410 
    411         offset += 1;
    412 
    413         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
    414             uprv_free(pcetable);
    415         }
    416 
    417         pcetable = temp;
    418         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    419     }
    420 
    421     pcetable[offset]   = 0;
    422     pattern->pces       = pcetable;
    423     pattern->pcesLength = offset;
    424 
    425     return result;
    426 }
    427 
    428 /**
    429 * Initializes the pattern struct.
    430 * Internal method, status assumed to be success.
    431 * @param strsrch UStringSearch data storage
    432 * @param status output error if any, caller to check status before calling
    433 *               method, status assumed to be success when passed in.
    434 * @return expansionsize the total expansion size of the pattern
    435 */
    436 static
    437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    438 {
    439     if (U_FAILURE(*status)) { return 0; }
    440           UPattern   *pattern     = &(strsrch->pattern);
    441     const UChar      *patterntext = pattern->text;
    442           int32_t     length      = pattern->textLength;
    443           int32_t index       = 0;
    444 
    445     // Since the strength is primary, accents are ignored in the pattern.
    446     if (strsrch->strength == UCOL_PRIMARY) {
    447         pattern->hasPrefixAccents = 0;
    448         pattern->hasSuffixAccents = 0;
    449     } else {
    450         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    451                                                          SECOND_LAST_BYTE_SHIFT_;
    452         index = length;
    453         U16_BACK_1(patterntext, 0, index);
    454         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    455                                                                  LAST_BYTE_MASK_;
    456     }
    457 
    458     // ** HACK **
    459     if (strsrch->pattern.pces != NULL) {
    460         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
    461             uprv_free(strsrch->pattern.pces);
    462         }
    463 
    464         strsrch->pattern.pces = NULL;
    465     }
    466 
    467     // since intializePattern is an internal method status is a success.
    468     return initializePatternCETable(strsrch, status);
    469 }
    470 
    471 /**
    472 * Initializing shift tables, with the default values.
    473 * If a corresponding default value is 0, the shift table is not set.
    474 * @param shift table for forwards shift
    475 * @param backshift table for backwards shift
    476 * @param cetable table containing pattern ce
    477 * @param cesize size of the pattern ces
    478 * @param expansionsize total size of the expansions
    479 * @param defaultforward the default forward value
    480 * @param defaultbackward the default backward value
    481 */
    482 static
    483 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    484                           int32_t  *cetable, int32_t cesize,
    485                           int16_t   expansionsize,
    486                           int16_t   defaultforward,
    487                           int16_t   defaultbackward)
    488 {
    489     // estimate the value to shift. to do that we estimate the smallest
    490     // number of characters to give the relevant ces, ie approximately
    491     // the number of ces minus their expansion, since expansions can come
    492     // from a character.
    493     int32_t count;
    494     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    495         shift[count] = defaultforward;
    496     }
    497     cesize --; // down to the last index
    498     for (count = 0; count < cesize; count ++) {
    499         // number of ces from right of array to the count
    500         int temp = defaultforward - count - 1;
    501         shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
    502     }
    503     shift[hashFromCE32(cetable[cesize])] = 1;
    504     // for ignorables we just shift by one. see test examples.
    505     shift[hashFromCE32(0)] = 1;
    506 
    507     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    508         backshift[count] = defaultbackward;
    509     }
    510     for (count = cesize; count > 0; count --) {
    511         // the original value count does not seem to work
    512         backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
    513                                           (int16_t)(count - expansionsize) : 1;
    514     }
    515     backshift[hashFromCE32(cetable[0])] = 1;
    516     backshift[hashFromCE32(0)] = 1;
    517 }
    518 
    519 /**
    520 * Building of the pattern collation element list and the boyer moore strsrch
    521 * table.
    522 * The canonical match will only be performed after the default match fails.
    523 * For both cases we need to remember the size of the composed and decomposed
    524 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    525 * a number of characters in the text and tries to match the pattern from that
    526 * offset, the shift value can not be too large in case we miss some
    527 * characters. To choose a right shift size, we estimate the NFC form of the
    528 * and use its size as a shift guide. The NFC form should be the small
    529 * possible representation of the pattern. Anyways, we'll err on the smaller
    530 * shift size. Hence the calculation for minlength.
    531 * Canonical match will be performed slightly differently. We'll split the
    532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    533 * the first and last base character (MS), the ending accents (EA). Matches
    534 * will be done on MS first, and only when we match MS then some processing
    535 * will be required for the prefix and end accents in order to determine if
    536 * they match PA and EA. Hence the default shift values
    537 * for the canonical match will take the size of either end's accent into
    538 * consideration. Forwards search will take the end accents into consideration
    539 * for the default shift values and the backwards search will take the prefix
    540 * accents into consideration.
    541 * If pattern has no non-ignorable ce, we return a illegal argument error.
    542 * Internal method, status assumed to be success.
    543 * @param strsrch UStringSearch data storage
    544 * @param status  for output errors if it occurs, status is assumed to be a
    545 *                success when it is passed in.
    546 */
    547 static
    548 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    549 {
    550     int16_t expandlength  = initializePattern(strsrch, status);
    551     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
    552         UPattern *pattern = &strsrch->pattern;
    553         int32_t   cesize  = pattern->cesLength;
    554 
    555         int16_t minlength = cesize > expandlength
    556                             ? (int16_t)cesize - expandlength : 1;
    557         pattern->defaultShiftSize    = minlength;
    558         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
    559                       cesize, expandlength, minlength, minlength);
    560         return;
    561     }
    562     strsrch->pattern.defaultShiftSize = 0;
    563 }
    564 
    565 #if BOYER_MOORE
    566 /**
    567 * Check to make sure that the match length is at the end of the character by
    568 * using the breakiterator.
    569 * @param strsrch string search data
    570 * @param start target text start offset
    571 * @param end target text end offset
    572 */
    573 static
    574 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    575                                int32_t *end)
    576 {
    577 #if !UCONFIG_NO_BREAK_ITERATION
    578     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    579     if (breakiterator) {
    580         int32_t matchend = *end;
    581         //int32_t matchstart = *start;
    582 
    583         if (!ubrk_isBoundary(breakiterator, matchend)) {
    584             *end = ubrk_following(breakiterator, matchend);
    585         }
    586 
    587         /* Check the start of the matched text to make sure it doesn't have any accents
    588          * before it.  This code may not be necessary and so it is commented out */
    589         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    590             *start = ubrk_preceding(breakiterator, matchstart);
    591         }*/
    592     }
    593 #endif
    594 }
    595 
    596 /**
    597 * Determine whether the target text in UStringSearch bounded by the offset
    598 * start and end is one or more whole units of text as
    599 * determined by the breakiterator in UStringSearch.
    600 * @param strsrch string search data
    601 * @param start target text start offset
    602 * @param end target text end offset
    603 */
    604 static
    605 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    606                                int32_t    end)
    607 {
    608 #if !UCONFIG_NO_BREAK_ITERATION
    609     UBreakIterator *breakiterator = strsrch->search->breakIter;
    610     //TODO: Add here.
    611     if (breakiterator) {
    612         int32_t startindex = ubrk_first(breakiterator);
    613         int32_t endindex   = ubrk_last(breakiterator);
    614 
    615         // out-of-range indexes are never boundary positions
    616         if (start < startindex || start > endindex ||
    617             end < startindex || end > endindex) {
    618             return FALSE;
    619         }
    620         // otherwise, we can use following() on the position before the
    621         // specified one and return true of the position we get back is the
    622         // one the user specified
    623         UBool result = (start == startindex ||
    624                 ubrk_following(breakiterator, start - 1) == start) &&
    625                (end == endindex ||
    626                 ubrk_following(breakiterator, end - 1) == end);
    627         if (result) {
    628             // iterates the individual ces
    629                   UCollationElements *coleiter  = strsrch->utilIter;
    630             const UChar              *text      = strsrch->search->text +
    631                                                                       start;
    632                   UErrorCode          status    = U_ZERO_ERROR;
    633             ucol_setText(coleiter, text, end - start, &status);
    634             for (int32_t count = 0; count < strsrch->pattern.cesLength;
    635                  count ++) {
    636                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    637                 if (ce == UCOL_IGNORABLE) {
    638                     count --;
    639                     continue;
    640                 }
    641                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
    642                     return FALSE;
    643                 }
    644             }
    645             int32_t nextce = ucol_next(coleiter, &status);
    646             while (ucol_getOffset(coleiter) == (end - start)
    647                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    648                 nextce = ucol_next(coleiter, &status);
    649             }
    650             if (ucol_getOffset(coleiter) == (end - start)
    651                 && nextce != UCOL_NULLORDER) {
    652                 // extra collation elements at the end of the match
    653                 return FALSE;
    654             }
    655         }
    656         return result;
    657     }
    658 #endif
    659     return TRUE;
    660 }
    661 
    662 /**
    663 * Getting the next base character offset if current offset is an accent,
    664 * or the current offset if the current character contains a base character.
    665 * accents the following base character will be returned
    666 * @param text string
    667 * @param textoffset current offset
    668 * @param textlength length of text string
    669 * @return the next base character or the current offset
    670 *         if the current character is contains a base character.
    671 */
    672 static
    673 inline int32_t getNextBaseOffset(const UChar       *text,
    674                                            int32_t  textoffset,
    675                                            int32_t      textlength)
    676 {
    677     if (textoffset < textlength) {
    678         int32_t temp = textoffset;
    679         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    680             while (temp < textlength) {
    681                 int32_t result = temp;
    682                 if ((getFCD(text, &temp, textlength) >>
    683                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    684                     return result;
    685                 }
    686             }
    687             return textlength;
    688         }
    689     }
    690     return textoffset;
    691 }
    692 
    693 /**
    694 * Gets the next base character offset depending on the string search pattern
    695 * data
    696 * @param strsrch string search data
    697 * @param textoffset current offset, one offset away from the last character
    698 *                   to search for.
    699 * @return start index of the next base character or the current offset
    700 *         if the current character is contains a base character.
    701 */
    702 static
    703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    704                                                   int32_t    textoffset)
    705 {
    706     int32_t textlength = strsrch->search->textLength;
    707     if (strsrch->pattern.hasSuffixAccents &&
    708         textoffset < textlength) {
    709               int32_t  temp       = textoffset;
    710         const UChar       *text       = strsrch->search->text;
    711         U16_BACK_1(text, 0, temp);
    712         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    713             return getNextBaseOffset(text, textoffset, textlength);
    714         }
    715     }
    716     return textoffset;
    717 }
    718 
    719 /**
    720 * Shifting the collation element iterator position forward to prepare for
    721 * a following match. If the last character is a unsafe character, we'll only
    722 * shift by 1 to capture contractions, normalization etc.
    723 * Internal method, status assumed to be success.
    724 * @param text strsrch string search data
    725 * @param textoffset start text position to do search
    726 * @param ce the text ce which failed the match.
    727 * @param patternceindex index of the ce within the pattern ce buffer which
    728 *        failed the match
    729 * @return final offset
    730 */
    731 static
    732 inline int32_t shiftForward(UStringSearch *strsrch,
    733                                 int32_t    textoffset,
    734                                 int32_t       ce,
    735                                 int32_t        patternceindex)
    736 {
    737     UPattern *pattern = &(strsrch->pattern);
    738     if (ce != UCOL_NULLORDER) {
    739         int32_t shift = pattern->shift[hashFromCE32(ce)];
    740         // this is to adjust for characters in the middle of the
    741         // substring for matching that failed.
    742         int32_t adjust = pattern->cesLength - patternceindex;
    743         if (adjust > 1 && shift >= adjust) {
    744             shift -= adjust - 1;
    745         }
    746         textoffset += shift;
    747     }
    748     else {
    749         textoffset += pattern->defaultShiftSize;
    750     }
    751 
    752     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    753     // check for unsafe characters
    754     // * if it is the start or middle of a contraction: to be done after
    755     //   a initial match is found
    756     // * thai or lao base consonant character: similar to contraction
    757     // * high surrogate character: similar to contraction
    758     // * next character is a accent: shift to the next base character
    759     return textoffset;
    760 }
    761 #endif // #if BOYER_MOORE
    762 
    763 /**
    764 * sets match not found
    765 * @param strsrch string search data
    766 */
    767 static
    768 inline void setMatchNotFound(UStringSearch *strsrch)
    769 {
    770     // this method resets the match result regardless of the error status.
    771     strsrch->search->matchedIndex = USEARCH_DONE;
    772     strsrch->search->matchedLength = 0;
    773     if (strsrch->search->isForwardSearching) {
    774         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    775     }
    776     else {
    777         setColEIterOffset(strsrch->textIter, 0);
    778     }
    779 }
    780 
    781 #if BOYER_MOORE
    782 /**
    783 * Gets the offset to the next safe point in text.
    784 * ie. not the middle of a contraction, swappable characters or supplementary
    785 * characters.
    786 * @param collator collation sata
    787 * @param text string to work with
    788 * @param textoffset offset in string
    789 * @param textlength length of text string
    790 * @return offset to the next safe character
    791 */
    792 static
    793 inline int32_t getNextSafeOffset(const UCollator   *collator,
    794                                      const UChar       *text,
    795                                            int32_t  textoffset,
    796                                            int32_t      textlength)
    797 {
    798     int32_t result = textoffset; // first contraction character
    799     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    800         result ++;
    801     }
    802     return result;
    803 }
    804 
    805 /**
    806 * This checks for accents in the potential match started with a .
    807 * composite character.
    808 * This is really painful... we have to check that composite character do not
    809 * have any extra accents. We have to normalize the potential match and find
    810 * the immediate decomposed character before the match.
    811 * The first composite character would have been taken care of by the fcd
    812 * checks in checkForwardExactMatch.
    813 * This is the slow path after the fcd of the first character and
    814 * the last character has been checked by checkForwardExactMatch and we
    815 * determine that the potential match has extra non-ignorable preceding
    816 * ces.
    817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    819 * Note here that accents checking are slow and cautioned in the API docs.
    820 * Internal method, status assumed to be a success, caller should check status
    821 * before calling this method
    822 * @param strsrch string search data
    823 * @param start index of the potential unfriendly composite character
    824 * @param end index of the potential unfriendly composite character
    825 * @param status output error status if any.
    826 * @return TRUE if there is non-ignorable accents before at the beginning
    827 *              of the match, FALSE otherwise.
    828 */
    829 
    830 static
    831 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    832                                    int32_t    end,
    833                                    UErrorCode    *status)
    834 {
    835     UBool result = FALSE;
    836     if (strsrch->pattern.hasPrefixAccents) {
    837               int32_t  length = end - start;
    838               int32_t  offset = 0;
    839         const UChar       *text   = strsrch->search->text + start;
    840 
    841         U16_FWD_1(text, offset, length);
    842         // we are only concerned with the first composite character
    843         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    844             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    845                                                        text, 0, length);
    846             if (safeoffset != length) {
    847                 safeoffset ++;
    848             }
    849             UChar   *norm = NULL;
    850             UChar    buffer[INITIAL_ARRAY_SIZE_];
    851             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    852                                             buffer, INITIAL_ARRAY_SIZE_,
    853                                             status);
    854             if (U_FAILURE(*status)) {
    855                 return FALSE;
    856             }
    857             if (size >= INITIAL_ARRAY_SIZE_) {
    858                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    859                                                status);
    860                 // if allocation failed, status will be set to
    861                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    862                 // checks for it.
    863                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    864                                        size, status);
    865                 if (U_FAILURE(*status) && norm != NULL) {
    866                     uprv_free(norm);
    867                     return FALSE;
    868                 }
    869             }
    870             else {
    871                 norm = buffer;
    872             }
    873 
    874             UCollationElements *coleiter  = strsrch->utilIter;
    875             ucol_setText(coleiter, norm, size, status);
    876             uint32_t            firstce   = strsrch->pattern.ces[0];
    877             UBool               ignorable = TRUE;
    878             uint32_t            ce        = UCOL_IGNORABLE;
    879             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    880                 offset = ucol_getOffset(coleiter);
    881                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    882                     ignorable = FALSE;
    883                 }
    884                 ce = ucol_next(coleiter, status);
    885             }
    886             UChar32 codepoint;
    887             U16_PREV(norm, 0, offset, codepoint);
    888             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    889 
    890             if (norm != buffer) {
    891                 uprv_free(norm);
    892             }
    893         }
    894     }
    895 
    896     return result;
    897 }
    898 
    899 /**
    900 * Used by exact matches, checks if there are accents before the match.
    901 * This is really painful... we have to check that composite characters at
    902 * the start of the matches have to not have any extra accents.
    903 * We check the FCD of the character first, if it starts with an accent and
    904 * the first pattern ce does not match the first ce of the character, we bail.
    905 * Otherwise we try normalizing the first composite
    906 * character and find the immediate decomposed character before the match to
    907 * see if it is an non-ignorable accent.
    908 * Now normalizing the first composite character is enough because we ensure
    909 * that when the match is passed in here with extra beginning ces, the
    910 * first or last ce that match has to occur within the first character.
    911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    913 * Note here that accents checking are slow and cautioned in the API docs.
    914 * @param strsrch string search data
    915 * @param start offset
    916 * @param end offset
    917 * @return TRUE if there are accents on either side of the match,
    918 *         FALSE otherwise
    919 */
    920 static
    921 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    922                                   int32_t    end)
    923 {
    924     if (strsrch->pattern.hasPrefixAccents) {
    925         UCollationElements *coleiter  = strsrch->textIter;
    926         UErrorCode          status    = U_ZERO_ERROR;
    927         // we have been iterating forwards previously
    928         uint32_t            ignorable = TRUE;
    929         int32_t             firstce   = strsrch->pattern.ces[0];
    930 
    931         setColEIterOffset(coleiter, start);
    932         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    933         if (U_FAILURE(status)) {
    934             return TRUE;
    935         }
    936         while (ce != firstce) {
    937             if (ce != UCOL_IGNORABLE) {
    938                 ignorable = FALSE;
    939             }
    940             ce = getCE(strsrch, ucol_next(coleiter, &status));
    941             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    942                 return TRUE;
    943             }
    944         }
    945         if (!ignorable && inNormBuf(coleiter)) {
    946             // within normalization buffer, discontiguous handled here
    947             return TRUE;
    948         }
    949 
    950         // within text
    951         int32_t temp = start;
    952         // original code
    953         // accent = (getFCD(strsrch->search->text, &temp,
    954         //                  strsrch->search->textLength)
    955         //            >> SECOND_LAST_BYTE_SHIFT_);
    956         // however this code does not work well with VC7 .net in release mode.
    957         // maybe the inlines for getFCD combined with shifting has bugs in
    958         // VC7. anyways this is a work around.
    959         UBool accent = getFCD(strsrch->search->text, &temp,
    960                               strsrch->search->textLength) > 0xFF;
    961         if (!accent) {
    962             return checkExtraMatchAccents(strsrch, start, end, &status);
    963         }
    964         if (!ignorable) {
    965             return TRUE;
    966         }
    967         if (start > 0) {
    968             temp = start;
    969             U16_BACK_1(strsrch->search->text, 0, temp);
    970             if (getFCD(strsrch->search->text, &temp,
    971                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    972                 setColEIterOffset(coleiter, start);
    973                 ce = ucol_previous(coleiter, &status);
    974                 if (U_FAILURE(status) ||
    975                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    976                     return TRUE;
    977                 }
    978             }
    979         }
    980     }
    981 
    982     return FALSE;
    983 }
    984 
    985 /**
    986 * Used by exact matches, checks if there are accents bounding the match.
    987 * Note this is the initial boundary check. If the potential match
    988 * starts or ends with composite characters, the accents in those
    989 * characters will be determined later.
    990 * Not doing backwards iteration here, since discontiguos contraction for
    991 * backwards collation element iterator, use up too many characters.
    992 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
    993 * should fail since there is a acute at the end of \u01FA
    994 * Note here that accents checking are slow and cautioned in the API docs.
    995 * @param strsrch string search data
    996 * @param start offset of match
    997 * @param end end offset of the match
    998 * @return TRUE if there are accents on either side of the match,
    999 *         FALSE otherwise
   1000 */
   1001 static
   1002 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
   1003                                  int32_t    end)
   1004 {
   1005     if (strsrch->pattern.hasSuffixAccents) {
   1006         const UChar       *text       = strsrch->search->text;
   1007               int32_t  temp       = end;
   1008               int32_t      textlength = strsrch->search->textLength;
   1009         U16_BACK_1(text, 0, temp);
   1010         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1011             int32_t             firstce  = strsrch->pattern.ces[0];
   1012             UCollationElements *coleiter = strsrch->textIter;
   1013             UErrorCode          status   = U_ZERO_ERROR;
   1014             int32_t ce;
   1015             setColEIterOffset(coleiter, start);
   1016             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1017                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1018                     return TRUE;
   1019                 }
   1020             }
   1021             int32_t count = 1;
   1022             while (count < strsrch->pattern.cesLength) {
   1023                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1024                     == UCOL_IGNORABLE) {
   1025                     // Thai can give an ignorable here.
   1026                     count --;
   1027                 }
   1028                 if (U_FAILURE(status)) {
   1029                     return TRUE;
   1030                 }
   1031                 count ++;
   1032             }
   1033 
   1034             ce = ucol_next(coleiter, &status);
   1035             if (U_FAILURE(status)) {
   1036                 return TRUE;
   1037             }
   1038             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1039                 ce = getCE(strsrch, ce);
   1040             }
   1041             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1042                 if (ucol_getOffset(coleiter) <= end) {
   1043                     return TRUE;
   1044                 }
   1045                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1046                     return TRUE;
   1047                 }
   1048             }
   1049         }
   1050     }
   1051     return FALSE;
   1052 }
   1053 #endif // #if BOYER_MOORE
   1054 
   1055 /**
   1056 * Checks if the offset runs out of the text string
   1057 * @param offset
   1058 * @param textlength of the text string
   1059 * @return TRUE if offset is out of bounds, FALSE otherwise
   1060 */
   1061 static
   1062 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1063 {
   1064     return offset < 0 || offset > textlength;
   1065 }
   1066 
   1067 /**
   1068 * Checks for identical match
   1069 * @param strsrch string search data
   1070 * @param start offset of possible match
   1071 * @param end offset of possible match
   1072 * @return TRUE if identical match is found
   1073 */
   1074 static
   1075 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1076                                   int32_t    end)
   1077 {
   1078     if (strsrch->strength != UCOL_IDENTICAL) {
   1079         return TRUE;
   1080     }
   1081 
   1082     // Note: We could use Normalizer::compare() or similar, but for short strings
   1083     // which may not be in FCD it might be faster to just NFD them.
   1084     UErrorCode status = U_ZERO_ERROR;
   1085     UnicodeString t2, p2;
   1086     strsrch->nfd->normalize(
   1087         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
   1088     strsrch->nfd->normalize(
   1089         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
   1090     // return FALSE if NFD failed
   1091     return U_SUCCESS(status) && t2 == p2;
   1092 }
   1093 
   1094 #if BOYER_MOORE
   1095 /**
   1096 * Checks to see if the match is repeated
   1097 * @param strsrch string search data
   1098 * @param start new match start index
   1099 * @param end new match end index
   1100 * @return TRUE if the the match is repeated, FALSE otherwise
   1101 */
   1102 static
   1103 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1104                                 int32_t    start,
   1105                                 int32_t    end)
   1106 {
   1107     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1108     UBool       result;
   1109     if (lastmatchindex == USEARCH_DONE) {
   1110         return FALSE;
   1111     }
   1112     if (strsrch->search->isForwardSearching) {
   1113         result = start <= lastmatchindex;
   1114     }
   1115     else {
   1116         result = start >= lastmatchindex;
   1117     }
   1118     if (!result && !strsrch->search->isOverlap) {
   1119         if (strsrch->search->isForwardSearching) {
   1120             result = start < lastmatchindex + strsrch->search->matchedLength;
   1121         }
   1122         else {
   1123             result = end > lastmatchindex;
   1124         }
   1125     }
   1126     return result;
   1127 }
   1128 
   1129 /**
   1130 * Gets the collation element iterator's current offset.
   1131 * @param coleiter collation element iterator
   1132 * @param forwards flag TRUE if we are moving in th forwards direction
   1133 * @return current offset
   1134 */
   1135 static
   1136 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1137                                               UBool               forwards)
   1138 {
   1139     int32_t result = ucol_getOffset(coleiter);
   1140     // intricacies of the the backwards collation element iterator
   1141     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1142         result ++;
   1143     }
   1144     return result;
   1145 }
   1146 
   1147 /**
   1148 * Checks match for contraction.
   1149 * If the match ends with a partial contraction we fail.
   1150 * If the match starts too far off (because of backwards iteration) we try to
   1151 * chip off the extra characters depending on whether a breakiterator has
   1152 * been used.
   1153 * Internal method, error assumed to be success, caller has to check status
   1154 * before calling this method.
   1155 * @param strsrch string search data
   1156 * @param start offset of potential match, to be modified if necessary
   1157 * @param end offset of potential match, to be modified if necessary
   1158 * @param status output error status if any
   1159 * @return TRUE if match passes the contraction test, FALSE otherwise
   1160 */
   1161 
   1162 static
   1163 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1164                                      int32_t   *start,
   1165                                      int32_t   *end, UErrorCode  *status)
   1166 {
   1167           UCollationElements *coleiter   = strsrch->textIter;
   1168           int32_t             textlength = strsrch->search->textLength;
   1169           int32_t             temp       = *start;
   1170     const UCollator          *collator   = strsrch->collator;
   1171     const UChar              *text       = strsrch->search->text;
   1172     // This part checks if either ends of the match contains potential
   1173     // contraction. If so we'll have to iterate through them
   1174     // The start contraction needs to be checked since ucol_previous dumps
   1175     // all characters till the first safe character into the buffer.
   1176     // *start + 1 is used to test for the unsafe characters instead of *start
   1177     // because ucol_prev takes all unsafe characters till the first safe
   1178     // character ie *start. so by testing *start + 1, we can estimate if
   1179     // excess prefix characters has been included in the potential search
   1180     // results.
   1181     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1182         (*start + 1 < textlength
   1183          && ucol_unsafeCP(text[*start + 1], collator))) {
   1184         int32_t expansion  = getExpansionPrefix(coleiter);
   1185         UBool   expandflag = expansion > 0;
   1186         setColEIterOffset(coleiter, *start);
   1187         while (expansion > 0) {
   1188             // getting rid of the redundant ce, caused by setOffset.
   1189             // since backward contraction/expansion may have extra ces if we
   1190             // are in the normalization buffer, hasAccentsBeforeMatch would
   1191             // have taken care of it.
   1192             // E.g. the character \u01FA will have an expansion of 3, but if
   1193             // we are only looking for acute and ring \u030A and \u0301, we'll
   1194             // have to skip the first ce in the expansion buffer.
   1195             ucol_next(coleiter, status);
   1196             if (U_FAILURE(*status)) {
   1197                 return FALSE;
   1198             }
   1199             if (ucol_getOffset(coleiter) != temp) {
   1200                 *start = temp;
   1201                 temp  = ucol_getOffset(coleiter);
   1202             }
   1203             expansion --;
   1204         }
   1205 
   1206         int32_t  *patternce       = strsrch->pattern.ces;
   1207         int32_t   patterncelength = strsrch->pattern.cesLength;
   1208         int32_t   count           = 0;
   1209         while (count < patterncelength) {
   1210             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1211             if (ce == UCOL_IGNORABLE) {
   1212                 continue;
   1213             }
   1214             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1215                 *start = temp;
   1216                 temp   = ucol_getOffset(coleiter);
   1217             }
   1218             if (U_FAILURE(*status) || ce != patternce[count]) {
   1219                 (*end) ++;
   1220                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1221                 return FALSE;
   1222             }
   1223             count ++;
   1224         }
   1225     }
   1226     return TRUE;
   1227 }
   1228 
   1229 /**
   1230 * Checks and sets the match information if found.
   1231 * Checks
   1232 * <ul>
   1233 * <li> the potential match does not repeat the previous match
   1234 * <li> boundaries are correct
   1235 * <li> exact matches has no extra accents
   1236 * <li> identical matchesb
   1237 * <li> potential match does not end in the middle of a contraction
   1238 * <\ul>
   1239 * Otherwise the offset will be shifted to the next character.
   1240 * Internal method, status assumed to be success, caller has to check status
   1241 * before calling this method.
   1242 * @param strsrch string search data
   1243 * @param textoffset offset in the collation element text. the returned value
   1244 *        will be the truncated end offset of the match or the new start
   1245 *        search offset.
   1246 * @param status output error status if any
   1247 * @return TRUE if the match is valid, FALSE otherwise
   1248 */
   1249 static
   1250 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1251                                  int32_t   *textoffset, UErrorCode *status)
   1252 {
   1253     UCollationElements *coleiter = strsrch->textIter;
   1254     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1255 
   1256     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1257         return FALSE;
   1258     }
   1259 
   1260     // this totally matches, however we need to check if it is repeating
   1261     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1262         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1263         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1264         !checkIdentical(strsrch, start, *textoffset) ||
   1265         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1266 
   1267         (*textoffset) ++;
   1268         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1269         return FALSE;
   1270     }
   1271 
   1272     //Add breakiterator boundary check for primary strength search.
   1273     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1274         checkBreakBoundary(strsrch, &start, textoffset);
   1275     }
   1276 
   1277     // totally match, we will get rid of the ending ignorables.
   1278     strsrch->search->matchedIndex  = start;
   1279     strsrch->search->matchedLength = *textoffset - start;
   1280     return TRUE;
   1281 }
   1282 
   1283 /**
   1284 * Getting the previous base character offset, or the current offset if the
   1285 * current character is a base character
   1286 * @param text string
   1287 * @param textoffset one offset after the current character
   1288 * @return the offset of the next character after the base character or the first
   1289 *         composed character with accents
   1290 */
   1291 static
   1292 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1293                                                int32_t  textoffset)
   1294 {
   1295     if (textoffset > 0) {
   1296         for (;;) {
   1297             int32_t result = textoffset;
   1298             U16_BACK_1(text, 0, textoffset);
   1299             int32_t temp = textoffset;
   1300             uint16_t fcd = getFCD(text, &temp, result);
   1301             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1302                 if (fcd & LAST_BYTE_MASK_) {
   1303                     return textoffset;
   1304                 }
   1305                 return result;
   1306             }
   1307             if (textoffset == 0) {
   1308                 return 0;
   1309             }
   1310         }
   1311     }
   1312     return textoffset;
   1313 }
   1314 
   1315 /**
   1316 * Getting the indexes of the accents that are not blocked in the argument
   1317 * accent array
   1318 * @param accents array of accents in nfd terminated by a 0.
   1319 * @param accentsindex array of indexes of the accents that are not blocked
   1320 */
   1321 static
   1322 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1323 {
   1324     int32_t index     = 0;
   1325     int32_t     length    = u_strlen(accents);
   1326     UChar32     codepoint = 0;
   1327     int         cclass    = 0;
   1328     int         result    = 0;
   1329     int32_t temp;
   1330     while (index < length) {
   1331         temp = index;
   1332         U16_NEXT(accents, index, length, codepoint);
   1333         if (u_getCombiningClass(codepoint) != cclass) {
   1334             cclass        = u_getCombiningClass(codepoint);
   1335             accentsindex[result] = temp;
   1336             result ++;
   1337         }
   1338     }
   1339     accentsindex[result] = length;
   1340     return result;
   1341 }
   1342 
   1343 /**
   1344 * Appends 3 UChar arrays to a destination array.
   1345 * Creates a new array if we run out of space. The caller will have to
   1346 * manually deallocate the newly allocated array.
   1347 * Internal method, status assumed to be success, caller has to check status
   1348 * before calling this method. destination not to be NULL and has at least
   1349 * size destinationlength.
   1350 * @param destination target array
   1351 * @param destinationlength target array size, returning the appended length
   1352 * @param source1 null-terminated first array
   1353 * @param source2 second array
   1354 * @param source2length length of seond array
   1355 * @param source3 null-terminated third array
   1356 * @param status error status if any
   1357 * @return new destination array, destination if there was no new allocation
   1358 */
   1359 static
   1360 inline UChar * addToUCharArray(      UChar      *destination,
   1361                                      int32_t    *destinationlength,
   1362                                const UChar      *source1,
   1363                                const UChar      *source2,
   1364                                      int32_t     source2length,
   1365                                const UChar      *source3,
   1366                                      UErrorCode *status)
   1367 {
   1368     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1369     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1370     if (*destinationlength < source1length + source2length + source3length +
   1371                                                                            1)
   1372     {
   1373         destination = (UChar *)allocateMemory(
   1374           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1375           status);
   1376         // if error allocating memory, status will be
   1377         // U_MEMORY_ALLOCATION_ERROR
   1378         if (U_FAILURE(*status)) {
   1379             *destinationlength = 0;
   1380             return NULL;
   1381         }
   1382     }
   1383     if (source1length != 0) {
   1384         u_memcpy(destination, source1, source1length);
   1385     }
   1386     if (source2length != 0) {
   1387         uprv_memcpy(destination + source1length, source2,
   1388                     sizeof(UChar) * source2length);
   1389     }
   1390     if (source3length != 0) {
   1391         uprv_memcpy(destination + source1length + source2length, source3,
   1392                     sizeof(UChar) * source3length);
   1393     }
   1394     *destinationlength = source1length + source2length + source3length;
   1395     return destination;
   1396 }
   1397 
   1398 /**
   1399 * Running through a collation element iterator to see if the contents matches
   1400 * pattern in string search data
   1401 * @param strsrch string search data
   1402 * @param coleiter collation element iterator
   1403 * @return TRUE if a match if found, FALSE otherwise
   1404 */
   1405 static
   1406 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1407                                        UCollationElements *coleiter)
   1408 {
   1409     int         patternceindex = strsrch->pattern.cesLength;
   1410     int32_t    *patternce      = strsrch->pattern.ces;
   1411     UErrorCode  status = U_ZERO_ERROR;
   1412     while (patternceindex > 0) {
   1413         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1414         if (ce == UCOL_IGNORABLE) {
   1415             continue;
   1416         }
   1417         if (U_FAILURE(status) || ce != *patternce) {
   1418             return FALSE;
   1419         }
   1420         patternce ++;
   1421         patternceindex --;
   1422     }
   1423     return TRUE;
   1424 }
   1425 
   1426 /**
   1427 * Rearranges the front accents to try matching.
   1428 * Prefix accents in the text will be grouped according to their combining
   1429 * class and the groups will be mixed and matched to try find the perfect
   1430 * match with the pattern.
   1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1433 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1434 *         "\u0301\u0325".
   1435 * step 2: check if any of the generated substrings matches the pattern.
   1436 * Internal method, status is assumed to be success, caller has to check status
   1437 * before calling this method.
   1438 * @param strsrch string search match
   1439 * @param start first offset of the accents to start searching
   1440 * @param end start of the last accent set
   1441 * @param status output error status if any
   1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1443 *         offset of the match. Note this start includes all preceding accents.
   1444 */
   1445 static
   1446 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1447                                        int32_t    start,
   1448                                        int32_t    end,
   1449                                        UErrorCode    *status)
   1450 {
   1451     const UChar       *text       = strsrch->search->text;
   1452           int32_t      textlength = strsrch->search->textLength;
   1453           int32_t  tempstart  = start;
   1454 
   1455     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1456         // die... failed at a base character
   1457         return USEARCH_DONE;
   1458     }
   1459 
   1460     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1461     start = getPreviousBaseOffset(text, tempstart);
   1462 
   1463     UChar       accents[INITIAL_ARRAY_SIZE_];
   1464     // normalizing the offensive string
   1465     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1466                     INITIAL_ARRAY_SIZE_, status);
   1467     if (U_FAILURE(*status)) {
   1468         return USEARCH_DONE;
   1469     }
   1470 
   1471     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1472     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1473                                                                  accentsindex);
   1474     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1475     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1476     UCollationElements *coleiter   = strsrch->utilIter;
   1477     while (U_SUCCESS(*status) && count > 0) {
   1478         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1479         // copy the base characters
   1480         for (int k = 0; k < accentsindex[0]; k ++) {
   1481             *rearrange ++ = accents[k];
   1482         }
   1483         // forming all possible canonical rearrangement by dropping
   1484         // sets of accents
   1485         for (int i = 0; i <= accentsize - 1; i ++) {
   1486             int32_t mask = 1 << (accentsize - i - 1);
   1487             if (count & mask) {
   1488                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1489                     *rearrange ++ = accents[j];
   1490                 }
   1491             }
   1492         }
   1493         *rearrange = 0;
   1494         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1495         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1496                                            strsrch->canonicalPrefixAccents,
   1497                                            strsrch->search->text + offset,
   1498                                            end - offset,
   1499                                            strsrch->canonicalSuffixAccents,
   1500                                            status);
   1501 
   1502         // if status is a failure, ucol_setText does nothing.
   1503         // run the collator iterator through this match
   1504         ucol_setText(coleiter, match, matchsize, status);
   1505         if (U_SUCCESS(*status)) {
   1506             if (checkCollationMatch(strsrch, coleiter)) {
   1507                 if (match != buffer) {
   1508                     uprv_free(match);
   1509                 }
   1510                 return start;
   1511             }
   1512         }
   1513         count --;
   1514     }
   1515     return USEARCH_DONE;
   1516 }
   1517 
   1518 /**
   1519 * Gets the offset to the safe point in text before textoffset.
   1520 * ie. not the middle of a contraction, swappable characters or supplementary
   1521 * characters.
   1522 * @param collator collation sata
   1523 * @param text string to work with
   1524 * @param textoffset offset in string
   1525 * @param textlength length of text string
   1526 * @return offset to the previous safe character
   1527 */
   1528 static
   1529 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1530                                       const UChar       *text,
   1531                                             int32_t  textoffset)
   1532 {
   1533     int32_t result = textoffset; // first contraction character
   1534     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1535         result --;
   1536     }
   1537     if (result != 0) {
   1538         // the first contraction character is consider unsafe here
   1539         result --;
   1540     }
   1541     return result;
   1542 }
   1543 
   1544 /**
   1545 * Cleaning up after we passed the safe zone
   1546 * @param strsrch string search data
   1547 * @param safetext safe text array
   1548 * @param safebuffer safe text buffer
   1549 * @param coleiter collation element iterator for safe text
   1550 */
   1551 static
   1552 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1553                                   UChar         *safebuffer)
   1554 {
   1555     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1556     {
   1557        uprv_free(safetext);
   1558     }
   1559 }
   1560 
   1561 /**
   1562 * Take the rearranged end accents and tries matching. If match failed at
   1563 * a seperate preceding set of accents (seperated from the rearranged on by
   1564 * at least a base character) then we rearrange the preceding accents and
   1565 * tries matching again.
   1566 * We allow skipping of the ends of the accent set if the ces do not match.
   1567 * However if the failure is found before the accent set, it fails.
   1568 * Internal method, status assumed to be success, caller has to check status
   1569 * before calling this method.
   1570 * @param strsrch string search data
   1571 * @param textoffset of the start of the rearranged accent
   1572 * @param status output error status if any
   1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1574 *         offset of the match. Note this start includes all preceding accents.
   1575 */
   1576 static
   1577 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1578                                        int32_t    textoffset,
   1579                                        UErrorCode    *status)
   1580 {
   1581     const UChar              *text           = strsrch->search->text;
   1582     const UCollator          *collator       = strsrch->collator;
   1583           int32_t             safelength     = 0;
   1584           UChar              *safetext;
   1585           int32_t             safetextlength;
   1586           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1587           UCollationElements *coleiter       = strsrch->utilIter;
   1588           int32_t         safeoffset     = textoffset;
   1589 
   1590     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1591                                          collator)) {
   1592         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1593         safelength     = textoffset - safeoffset;
   1594         safetextlength = INITIAL_ARRAY_SIZE_;
   1595         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1596                                          text + safeoffset, safelength,
   1597                                          strsrch->canonicalSuffixAccents,
   1598                                          status);
   1599     }
   1600     else {
   1601         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1602         safetext       = strsrch->canonicalSuffixAccents;
   1603     }
   1604 
   1605     // if status is a failure, ucol_setText does nothing
   1606     ucol_setText(coleiter, safetext, safetextlength, status);
   1607     // status checked in loop below
   1608 
   1609     int32_t  *ce        = strsrch->pattern.ces;
   1610     int32_t   celength  = strsrch->pattern.cesLength;
   1611     int       ceindex   = celength - 1;
   1612     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1613 
   1614     while (ceindex >= 0) {
   1615         int32_t textce = ucol_previous(coleiter, status);
   1616         if (U_FAILURE(*status)) {
   1617             if (isSafe) {
   1618                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1619             }
   1620             return USEARCH_DONE;
   1621         }
   1622         if (textce == UCOL_NULLORDER) {
   1623             // check if we have passed the safe buffer
   1624             if (coleiter == strsrch->textIter) {
   1625                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1626                 return USEARCH_DONE;
   1627             }
   1628             cleanUpSafeText(strsrch, safetext, safebuffer);
   1629             safetext = safebuffer;
   1630             coleiter = strsrch->textIter;
   1631             setColEIterOffset(coleiter, safeoffset);
   1632             // status checked at the start of the loop
   1633             isSafe = FALSE;
   1634             continue;
   1635         }
   1636         textce = getCE(strsrch, textce);
   1637         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1638             // do the beginning stuff
   1639             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1640             if (isSafe && failedoffset >= safelength) {
   1641                 // alas... no hope. failed at rearranged accent set
   1642                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1643                 return USEARCH_DONE;
   1644             }
   1645             else {
   1646                 if (isSafe) {
   1647                     failedoffset += safeoffset;
   1648                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1649                 }
   1650 
   1651                 // try rearranging the front accents
   1652                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1653                                         failedoffset, textoffset, status);
   1654                 if (result != USEARCH_DONE) {
   1655                     // if status is a failure, ucol_setOffset does nothing
   1656                     setColEIterOffset(strsrch->textIter, result);
   1657                 }
   1658                 if (U_FAILURE(*status)) {
   1659                     return USEARCH_DONE;
   1660                 }
   1661                 return result;
   1662             }
   1663         }
   1664         if (textce == ce[ceindex]) {
   1665             ceindex --;
   1666         }
   1667     }
   1668     // set offset here
   1669     if (isSafe) {
   1670         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1671         // sets the text iterator here with the correct expansion and offset
   1672         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1673         cleanUpSafeText(strsrch, safetext, safebuffer);
   1674         if (result >= safelength) {
   1675             result = textoffset;
   1676         }
   1677         else {
   1678             result += safeoffset;
   1679         }
   1680         setColEIterOffset(strsrch->textIter, result);
   1681         strsrch->textIter->iteratordata_.toReturn =
   1682                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1683         return result;
   1684     }
   1685 
   1686     return ucol_getOffset(coleiter);
   1687 }
   1688 
   1689 /**
   1690 * Trying out the substring and sees if it can be a canonical match.
   1691 * This will try normalizing the end accents and arranging them into canonical
   1692 * equivalents and check their corresponding ces with the pattern ce.
   1693 * Suffix accents in the text will be grouped according to their combining
   1694 * class and the groups will be mixed and matched to try find the perfect
   1695 * match with the pattern.
   1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1698 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1699 *         "\u0301\u0325".
   1700 * step 2: check if any of the generated substrings matches the pattern.
   1701 * Internal method, status assumed to be success, caller has to check status
   1702 * before calling this method.
   1703 * @param strsrch string search data
   1704 * @param textoffset end offset in the collation element text that ends with
   1705 *                   the accents to be rearranged
   1706 * @param status error status if any
   1707 * @return TRUE if the match is valid, FALSE otherwise
   1708 */
   1709 static
   1710 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1711                            int32_t    textoffset,
   1712                            UErrorCode    *status)
   1713 {
   1714     const UChar       *text = strsrch->search->text;
   1715           int32_t  temp = textoffset;
   1716     U16_BACK_1(text, 0, temp);
   1717     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1718         UCollationElements *coleiter = strsrch->textIter;
   1719         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1720         if (strsrch->pattern.hasPrefixAccents) {
   1721             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1722                                                 status);
   1723             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1724                 setColEIterOffset(coleiter, offset);
   1725                 return TRUE;
   1726             }
   1727         }
   1728         return FALSE;
   1729     }
   1730 
   1731     if (!strsrch->pattern.hasSuffixAccents) {
   1732         return FALSE;
   1733     }
   1734 
   1735     UChar       accents[INITIAL_ARRAY_SIZE_];
   1736     // offset to the last base character in substring to search
   1737     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1738     // normalizing the offensive string
   1739     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1740                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1741     // status checked in loop below
   1742 
   1743     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1744     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1745 
   1746     // 2 power n - 1 plus the full set of accents
   1747     int32_t  count = (2 << (size - 1)) - 1;
   1748     while (U_SUCCESS(*status) && count > 0) {
   1749         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1750         // copy the base characters
   1751         for (int k = 0; k < accentsindex[0]; k ++) {
   1752             *rearrange ++ = accents[k];
   1753         }
   1754         // forming all possible canonical rearrangement by dropping
   1755         // sets of accents
   1756         for (int i = 0; i <= size - 1; i ++) {
   1757             int32_t mask = 1 << (size - i - 1);
   1758             if (count & mask) {
   1759                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1760                     *rearrange ++ = accents[j];
   1761                 }
   1762             }
   1763         }
   1764         *rearrange = 0;
   1765         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1766                                                         status);
   1767         if (offset != USEARCH_DONE) {
   1768             return TRUE; // match found
   1769         }
   1770         count --;
   1771     }
   1772     return FALSE;
   1773 }
   1774 
   1775 /**
   1776 * Gets the previous base character offset depending on the string search
   1777 * pattern data
   1778 * @param strsrch string search data
   1779 * @param textoffset current offset, current character
   1780 * @return the offset of the next character after this base character or itself
   1781 *         if it is a composed character with accents
   1782 */
   1783 static
   1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1785                                                       int32_t textoffset)
   1786 {
   1787     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1788         const UChar       *text = strsrch->search->text;
   1789               int32_t  offset = textoffset;
   1790         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1791                                                    SECOND_LAST_BYTE_SHIFT_) {
   1792             return getPreviousBaseOffset(text, textoffset);
   1793         }
   1794     }
   1795     return textoffset;
   1796 }
   1797 
   1798 /**
   1799 * Checks match for contraction.
   1800 * If the match ends with a partial contraction we fail.
   1801 * If the match starts too far off (because of backwards iteration) we try to
   1802 * chip off the extra characters
   1803 * Internal method, status assumed to be success, caller has to check status
   1804 * before calling this method.
   1805 * @param strsrch string search data
   1806 * @param start offset of potential match, to be modified if necessary
   1807 * @param end offset of potential match, to be modified if necessary
   1808 * @param status output error status if any
   1809 * @return TRUE if match passes the contraction test, FALSE otherwise
   1810 */
   1811 static
   1812 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1813                                          int32_t   *start,
   1814                                          int32_t   *end,
   1815                                          UErrorCode    *status)
   1816 {
   1817           UCollationElements *coleiter   = strsrch->textIter;
   1818           int32_t             textlength = strsrch->search->textLength;
   1819           int32_t         temp       = *start;
   1820     const UCollator          *collator   = strsrch->collator;
   1821     const UChar              *text       = strsrch->search->text;
   1822     // This part checks if either ends of the match contains potential
   1823     // contraction. If so we'll have to iterate through them
   1824     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1825         (*start + 1 < textlength
   1826          && ucol_unsafeCP(text[*start + 1], collator))) {
   1827         int32_t expansion  = getExpansionPrefix(coleiter);
   1828         UBool   expandflag = expansion > 0;
   1829         setColEIterOffset(coleiter, *start);
   1830         while (expansion > 0) {
   1831             // getting rid of the redundant ce, caused by setOffset.
   1832             // since backward contraction/expansion may have extra ces if we
   1833             // are in the normalization buffer, hasAccentsBeforeMatch would
   1834             // have taken care of it.
   1835             // E.g. the character \u01FA will have an expansion of 3, but if
   1836             // we are only looking for acute and ring \u030A and \u0301, we'll
   1837             // have to skip the first ce in the expansion buffer.
   1838             ucol_next(coleiter, status);
   1839             if (U_FAILURE(*status)) {
   1840                 return FALSE;
   1841             }
   1842             if (ucol_getOffset(coleiter) != temp) {
   1843                 *start = temp;
   1844                 temp  = ucol_getOffset(coleiter);
   1845             }
   1846             expansion --;
   1847         }
   1848 
   1849         int32_t  *patternce       = strsrch->pattern.ces;
   1850         int32_t   patterncelength = strsrch->pattern.cesLength;
   1851         int32_t   count           = 0;
   1852         int32_t   textlength      = strsrch->search->textLength;
   1853         while (count < patterncelength) {
   1854             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1855             // status checked below, note that if status is a failure
   1856             // ucol_next returns UCOL_NULLORDER
   1857             if (ce == UCOL_IGNORABLE) {
   1858                 continue;
   1859             }
   1860             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1861                 *start = temp;
   1862                 temp   = ucol_getOffset(coleiter);
   1863             }
   1864 
   1865             if (count == 0 && ce != patternce[0]) {
   1866                 // accents may have extra starting ces, this occurs when a
   1867                 // pure accent pattern is matched without rearrangement
   1868                 // text \u0325\u0300 and looking for \u0300
   1869                 int32_t expected = patternce[0];
   1870                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1871                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1872                     while (U_SUCCESS(*status) && ce != expected &&
   1873                            ce != UCOL_NULLORDER &&
   1874                            ucol_getOffset(coleiter) <= *end) {
   1875                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1876                     }
   1877                 }
   1878             }
   1879             if (U_FAILURE(*status) || ce != patternce[count]) {
   1880                 (*end) ++;
   1881                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1882                 return FALSE;
   1883             }
   1884             count ++;
   1885         }
   1886     }
   1887     return TRUE;
   1888 }
   1889 
   1890 /**
   1891 * Checks and sets the match information if found.
   1892 * Checks
   1893 * <ul>
   1894 * <li> the potential match does not repeat the previous match
   1895 * <li> boundaries are correct
   1896 * <li> potential match does not end in the middle of a contraction
   1897 * <li> identical matches
   1898 * <\ul>
   1899 * Otherwise the offset will be shifted to the next character.
   1900 * Internal method, status assumed to be success, caller has to check the
   1901 * status before calling this method.
   1902 * @param strsrch string search data
   1903 * @param textoffset offset in the collation element text. the returned value
   1904 *        will be the truncated end offset of the match or the new start
   1905 *        search offset.
   1906 * @param status output error status if any
   1907 * @return TRUE if the match is valid, FALSE otherwise
   1908 */
   1909 static
   1910 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1911                                      int32_t   *textoffset,
   1912                                      UErrorCode    *status)
   1913 {
   1914     // to ensure that the start and ends are not composite characters
   1915     UCollationElements *coleiter = strsrch->textIter;
   1916     // if we have a canonical accent match
   1917     if ((strsrch->pattern.hasSuffixAccents &&
   1918         strsrch->canonicalSuffixAccents[0]) ||
   1919         (strsrch->pattern.hasPrefixAccents &&
   1920         strsrch->canonicalPrefixAccents[0])) {
   1921         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1922                                                     strsrch,
   1923                                                     ucol_getOffset(coleiter));
   1924         strsrch->search->matchedLength = *textoffset -
   1925                                                 strsrch->search->matchedIndex;
   1926         return TRUE;
   1927     }
   1928 
   1929     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1930     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1931                                             status) || U_FAILURE(*status)) {
   1932         return FALSE;
   1933     }
   1934 
   1935     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1936     // this totally matches, however we need to check if it is repeating
   1937     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1938         !isBreakUnit(strsrch, start, *textoffset) ||
   1939         !checkIdentical(strsrch, start, *textoffset)) {
   1940         (*textoffset) ++;
   1941         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1942                                         strsrch->search->textLength);
   1943         return FALSE;
   1944     }
   1945 
   1946     strsrch->search->matchedIndex  = start;
   1947     strsrch->search->matchedLength = *textoffset - start;
   1948     return TRUE;
   1949 }
   1950 
   1951 /**
   1952 * Shifting the collation element iterator position forward to prepare for
   1953 * a preceding match. If the first character is a unsafe character, we'll only
   1954 * shift by 1 to capture contractions, normalization etc.
   1955 * Internal method, status assumed to be success, caller has to check status
   1956 * before calling this method.
   1957 * @param text strsrch string search data
   1958 * @param textoffset start text position to do search
   1959 * @param ce the text ce which failed the match.
   1960 * @param patternceindex index of the ce within the pattern ce buffer which
   1961 *        failed the match
   1962 * @return final offset
   1963 */
   1964 static
   1965 inline int32_t reverseShift(UStringSearch *strsrch,
   1966                                 int32_t    textoffset,
   1967                                 int32_t       ce,
   1968                                 int32_t        patternceindex)
   1969 {
   1970     if (strsrch->search->isOverlap) {
   1971         if (textoffset != strsrch->search->textLength) {
   1972             textoffset --;
   1973         }
   1974         else {
   1975             textoffset -= strsrch->pattern.defaultShiftSize;
   1976         }
   1977     }
   1978     else {
   1979         if (ce != UCOL_NULLORDER) {
   1980             int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
   1981 
   1982             // this is to adjust for characters in the middle of the substring
   1983             // for matching that failed.
   1984             int32_t adjust = patternceindex;
   1985             if (adjust > 1 && shift > adjust) {
   1986                 shift -= adjust - 1;
   1987             }
   1988             textoffset -= shift;
   1989         }
   1990         else {
   1991             textoffset -= strsrch->pattern.defaultShiftSize;
   1992         }
   1993     }
   1994     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   1995     return textoffset;
   1996 }
   1997 
   1998 /**
   1999 * Checks match for contraction.
   2000 * If the match starts with a partial contraction we fail.
   2001 * Internal method, status assumed to be success, caller has to check status
   2002 * before calling this method.
   2003 * @param strsrch string search data
   2004 * @param start offset of potential match, to be modified if necessary
   2005 * @param end offset of potential match, to be modified if necessary
   2006 * @param status output error status if any
   2007 * @return TRUE if match passes the contraction test, FALSE otherwise
   2008 */
   2009 static
   2010 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2011                                      int32_t   *start,
   2012                                      int32_t   *end, UErrorCode  *status)
   2013 {
   2014           UCollationElements *coleiter   = strsrch->textIter;
   2015           int32_t             textlength = strsrch->search->textLength;
   2016           int32_t             temp       = *end;
   2017     const UCollator          *collator   = strsrch->collator;
   2018     const UChar              *text       = strsrch->search->text;
   2019     // This part checks if either if the start of the match contains potential
   2020     // contraction. If so we'll have to iterate through them
   2021     // Since we used ucol_next while previously looking for the potential
   2022     // match, this guarantees that our end will not be a partial contraction,
   2023     // or a partial supplementary character.
   2024     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2025         int32_t expansion  = getExpansionSuffix(coleiter);
   2026         UBool   expandflag = expansion > 0;
   2027         setColEIterOffset(coleiter, *end);
   2028         while (U_SUCCESS(*status) && expansion > 0) {
   2029             // getting rid of the redundant ce
   2030             // since forward contraction/expansion may have extra ces
   2031             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2032             // would have taken care of it.
   2033             // E.g. the character \u01FA will have an expansion of 3, but if
   2034             // we are only looking for A ring A\u030A, we'll have to skip the
   2035             // last ce in the expansion buffer
   2036             ucol_previous(coleiter, status);
   2037             if (U_FAILURE(*status)) {
   2038                 return FALSE;
   2039             }
   2040             if (ucol_getOffset(coleiter) != temp) {
   2041                 *end = temp;
   2042                 temp  = ucol_getOffset(coleiter);
   2043             }
   2044             expansion --;
   2045         }
   2046 
   2047         int32_t  *patternce       = strsrch->pattern.ces;
   2048         int32_t   patterncelength = strsrch->pattern.cesLength;
   2049         int32_t   count           = patterncelength;
   2050         while (count > 0) {
   2051             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2052             // status checked below, note that if status is a failure
   2053             // ucol_previous returns UCOL_NULLORDER
   2054             if (ce == UCOL_IGNORABLE) {
   2055                 continue;
   2056             }
   2057             if (expandflag && count == 0 &&
   2058                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2059                 *end = temp;
   2060                 temp  = ucol_getOffset(coleiter);
   2061             }
   2062             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2063                 (*start) --;
   2064                 *start = getPreviousBaseOffset(text, *start);
   2065                 return FALSE;
   2066             }
   2067             count --;
   2068         }
   2069     }
   2070     return TRUE;
   2071 }
   2072 
   2073 /**
   2074 * Checks and sets the match information if found.
   2075 * Checks
   2076 * <ul>
   2077 * <li> the current match does not repeat the last match
   2078 * <li> boundaries are correct
   2079 * <li> exact matches has no extra accents
   2080 * <li> identical matches
   2081 * <\ul>
   2082 * Otherwise the offset will be shifted to the preceding character.
   2083 * Internal method, status assumed to be success, caller has to check status
   2084 * before calling this method.
   2085 * @param strsrch string search data
   2086 * @param collator
   2087 * @param coleiter collation element iterator
   2088 * @param text string
   2089 * @param textoffset offset in the collation element text. the returned value
   2090 *        will be the truncated start offset of the match or the new start
   2091 *        search offset.
   2092 * @param status output error status if any
   2093 * @return TRUE if the match is valid, FALSE otherwise
   2094 */
   2095 static
   2096 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2097                                      int32_t   *textoffset,
   2098                                      UErrorCode    *status)
   2099 {
   2100     // to ensure that the start and ends are not composite characters
   2101     int32_t end = ucol_getOffset(strsrch->textIter);
   2102     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2103         || U_FAILURE(*status)) {
   2104             return FALSE;
   2105     }
   2106 
   2107     // this totally matches, however we need to check if it is repeating
   2108     // the old match
   2109     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2110         !isBreakUnit(strsrch, *textoffset, end) ||
   2111         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2112         !checkIdentical(strsrch, *textoffset, end) ||
   2113         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2114         (*textoffset) --;
   2115         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2116                                             *textoffset);
   2117         return FALSE;
   2118     }
   2119 
   2120     //Add breakiterator boundary check for primary strength search.
   2121     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2122         checkBreakBoundary(strsrch, textoffset, &end);
   2123     }
   2124 
   2125     strsrch->search->matchedIndex = *textoffset;
   2126     strsrch->search->matchedLength = end - *textoffset;
   2127     return TRUE;
   2128 }
   2129 
   2130 /**
   2131 * Rearranges the end accents to try matching.
   2132 * Suffix accents in the text will be grouped according to their combining
   2133 * class and the groups will be mixed and matched to try find the perfect
   2134 * match with the pattern.
   2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2137 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2138 *         "\u0301\u0325".
   2139 * step 2: check if any of the generated substrings matches the pattern.
   2140 * Internal method, status assumed to be success, user has to check status
   2141 * before calling this method.
   2142 * @param strsrch string search match
   2143 * @param start offset of the first base character
   2144 * @param end start of the last accent set
   2145 * @param status only error status if any
   2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2147 *         offset of the match. Note this start includes all following accents.
   2148 */
   2149 static
   2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2151                                            int32_t    start,
   2152                                            int32_t    end,
   2153                                            UErrorCode    *status)
   2154 {
   2155     const UChar       *text       = strsrch->search->text;
   2156           int32_t  tempend    = end;
   2157 
   2158     U16_BACK_1(text, 0, tempend);
   2159     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2160                                                            LAST_BYTE_MASK_)) {
   2161         // die... failed at a base character
   2162         return USEARCH_DONE;
   2163     }
   2164     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2165 
   2166     if (U_SUCCESS(*status)) {
   2167         UChar       accents[INITIAL_ARRAY_SIZE_];
   2168         int32_t offset = getPreviousBaseOffset(text, end);
   2169         // normalizing the offensive string
   2170         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2171                         INITIAL_ARRAY_SIZE_, status);
   2172 
   2173         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2174         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2175                                                          accentsindex);
   2176         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2177         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2178         UCollationElements *coleiter = strsrch->utilIter;
   2179         while (U_SUCCESS(*status) && count > 0) {
   2180             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2181             // copy the base characters
   2182             for (int k = 0; k < accentsindex[0]; k ++) {
   2183                 *rearrange ++ = accents[k];
   2184             }
   2185             // forming all possible canonical rearrangement by dropping
   2186             // sets of accents
   2187             for (int i = 0; i <= accentsize - 1; i ++) {
   2188                 int32_t mask = 1 << (accentsize - i - 1);
   2189                 if (count & mask) {
   2190                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2191                         *rearrange ++ = accents[j];
   2192                     }
   2193                 }
   2194             }
   2195             *rearrange = 0;
   2196             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2197             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2198                                            strsrch->canonicalPrefixAccents,
   2199                                            strsrch->search->text + start,
   2200                                            offset - start,
   2201                                            strsrch->canonicalSuffixAccents,
   2202                                            status);
   2203 
   2204             // run the collator iterator through this match
   2205             // if status is a failure ucol_setText does nothing
   2206             ucol_setText(coleiter, match, matchsize, status);
   2207             if (U_SUCCESS(*status)) {
   2208                 if (checkCollationMatch(strsrch, coleiter)) {
   2209                     if (match != buffer) {
   2210                         uprv_free(match);
   2211                     }
   2212                     return end;
   2213                 }
   2214             }
   2215             count --;
   2216         }
   2217     }
   2218     return USEARCH_DONE;
   2219 }
   2220 
   2221 /**
   2222 * Take the rearranged start accents and tries matching. If match failed at
   2223 * a seperate following set of accents (seperated from the rearranged on by
   2224 * at least a base character) then we rearrange the preceding accents and
   2225 * tries matching again.
   2226 * We allow skipping of the ends of the accent set if the ces do not match.
   2227 * However if the failure is found before the accent set, it fails.
   2228 * Internal method, status assumed to be success, caller has to check status
   2229 * before calling this method.
   2230 * @param strsrch string search data
   2231 * @param textoffset of the ends of the rearranged accent
   2232 * @param status output error status if any
   2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2234 *         offset of the match. Note this start includes all following accents.
   2235 */
   2236 static
   2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2238                                            int32_t    textoffset,
   2239                                            UErrorCode    *status)
   2240 {
   2241     const UChar       *text       = strsrch->search->text;
   2242     const UCollator   *collator   = strsrch->collator;
   2243           int32_t      safelength = 0;
   2244           UChar       *safetext;
   2245           int32_t      safetextlength;
   2246           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2247           int32_t  safeoffset = textoffset;
   2248 
   2249     if (textoffset &&
   2250         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2251                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2252                                          ], collator)) {
   2253         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2254                                            strsrch->search->textLength);
   2255         safelength     = safeoffset - textoffset;
   2256         safetextlength = INITIAL_ARRAY_SIZE_;
   2257         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2258                                          strsrch->canonicalPrefixAccents,
   2259                                          text + textoffset, safelength,
   2260                                          NULL, status);
   2261     }
   2262     else {
   2263         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2264         safetext       = strsrch->canonicalPrefixAccents;
   2265     }
   2266 
   2267     UCollationElements *coleiter = strsrch->utilIter;
   2268      // if status is a failure, ucol_setText does nothing
   2269     ucol_setText(coleiter, safetext, safetextlength, status);
   2270     // status checked in loop below
   2271 
   2272     int32_t  *ce           = strsrch->pattern.ces;
   2273     int32_t   celength     = strsrch->pattern.cesLength;
   2274     int       ceindex      = 0;
   2275     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2276     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2277 
   2278     while (ceindex < celength) {
   2279         int32_t textce = ucol_next(coleiter, status);
   2280         if (U_FAILURE(*status)) {
   2281             if (isSafe) {
   2282                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2283             }
   2284             return USEARCH_DONE;
   2285         }
   2286         if (textce == UCOL_NULLORDER) {
   2287             // check if we have passed the safe buffer
   2288             if (coleiter == strsrch->textIter) {
   2289                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2290                 return USEARCH_DONE;
   2291             }
   2292             cleanUpSafeText(strsrch, safetext, safebuffer);
   2293             safetext = safebuffer;
   2294             coleiter = strsrch->textIter;
   2295             setColEIterOffset(coleiter, safeoffset);
   2296             // status checked at the start of the loop
   2297             isSafe = FALSE;
   2298             continue;
   2299         }
   2300         textce = getCE(strsrch, textce);
   2301         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2302             // do the beginning stuff
   2303             int32_t failedoffset = ucol_getOffset(coleiter);
   2304             if (isSafe && failedoffset <= prefixlength) {
   2305                 // alas... no hope. failed at rearranged accent set
   2306                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2307                 return USEARCH_DONE;
   2308             }
   2309             else {
   2310                 if (isSafe) {
   2311                     failedoffset = safeoffset - failedoffset;
   2312                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2313                 }
   2314 
   2315                 // try rearranging the end accents
   2316                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2317                                         textoffset, failedoffset, status);
   2318                 if (result != USEARCH_DONE) {
   2319                     // if status is a failure, ucol_setOffset does nothing
   2320                     setColEIterOffset(strsrch->textIter, result);
   2321                 }
   2322                 if (U_FAILURE(*status)) {
   2323                     return USEARCH_DONE;
   2324                 }
   2325                 return result;
   2326             }
   2327         }
   2328         if (textce == ce[ceindex]) {
   2329             ceindex ++;
   2330         }
   2331     }
   2332     // set offset here
   2333     if (isSafe) {
   2334         int32_t result      = ucol_getOffset(coleiter);
   2335         // sets the text iterator here with the correct expansion and offset
   2336         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2337         cleanUpSafeText(strsrch, safetext, safebuffer);
   2338         if (result <= prefixlength) {
   2339             result = textoffset;
   2340         }
   2341         else {
   2342             result = textoffset + (safeoffset - result);
   2343         }
   2344         setColEIterOffset(strsrch->textIter, result);
   2345         setExpansionSuffix(strsrch->textIter, leftoverces);
   2346         return result;
   2347     }
   2348 
   2349     return ucol_getOffset(coleiter);
   2350 }
   2351 
   2352 /**
   2353 * Trying out the substring and sees if it can be a canonical match.
   2354 * This will try normalizing the starting accents and arranging them into
   2355 * canonical equivalents and check their corresponding ces with the pattern ce.
   2356 * Prefix accents in the text will be grouped according to their combining
   2357 * class and the groups will be mixed and matched to try find the perfect
   2358 * match with the pattern.
   2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2361 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2362 *         "\u0301\u0325".
   2363 * step 2: check if any of the generated substrings matches the pattern.
   2364 * Internal method, status assumed to be success, caller has to check status
   2365 * before calling this method.
   2366 * @param strsrch string search data
   2367 * @param textoffset start offset in the collation element text that starts
   2368 *                   with the accents to be rearranged
   2369 * @param status output error status if any
   2370 * @return TRUE if the match is valid, FALSE otherwise
   2371 */
   2372 static
   2373 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2374                                int32_t    textoffset,
   2375                                UErrorCode    *status)
   2376 {
   2377     const UChar       *text       = strsrch->search->text;
   2378           int32_t  temp       = textoffset;
   2379           int32_t      textlength = strsrch->search->textLength;
   2380     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2381         UCollationElements *coleiter = strsrch->textIter;
   2382         int32_t         offset   = ucol_getOffset(coleiter);
   2383         if (strsrch->pattern.hasSuffixAccents) {
   2384             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2385                                                     offset, status);
   2386             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2387                 setColEIterOffset(coleiter, offset);
   2388                 return TRUE;
   2389             }
   2390         }
   2391         return FALSE;
   2392     }
   2393 
   2394     if (!strsrch->pattern.hasPrefixAccents) {
   2395         return FALSE;
   2396     }
   2397 
   2398     UChar       accents[INITIAL_ARRAY_SIZE_];
   2399     // offset to the last base character in substring to search
   2400     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2401     // normalizing the offensive string
   2402     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2403                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2404     // status checked in loop
   2405 
   2406     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2407     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2408 
   2409     // 2 power n - 1 plus the full set of accents
   2410     int32_t  count = (2 << (size - 1)) - 1;
   2411     while (U_SUCCESS(*status) && count > 0) {
   2412         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2413         // copy the base characters
   2414         for (int k = 0; k < accentsindex[0]; k ++) {
   2415             *rearrange ++ = accents[k];
   2416         }
   2417         // forming all possible canonical rearrangement by dropping
   2418         // sets of accents
   2419         for (int i = 0; i <= size - 1; i ++) {
   2420             int32_t mask = 1 << (size - i - 1);
   2421             if (count & mask) {
   2422                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2423                     *rearrange ++ = accents[j];
   2424                 }
   2425             }
   2426         }
   2427         *rearrange = 0;
   2428         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2429                                                           baseoffset, status);
   2430         if (offset != USEARCH_DONE) {
   2431             return TRUE; // match found
   2432         }
   2433         count --;
   2434     }
   2435     return FALSE;
   2436 }
   2437 
   2438 /**
   2439 * Checks match for contraction.
   2440 * If the match starts with a partial contraction we fail.
   2441 * Internal method, status assumed to be success, caller has to check status
   2442 * before calling this method.
   2443 * @param strsrch string search data
   2444 * @param start offset of potential match, to be modified if necessary
   2445 * @param end offset of potential match, to be modified if necessary
   2446 * @param status only error status if any
   2447 * @return TRUE if match passes the contraction test, FALSE otherwise
   2448 */
   2449 static
   2450 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2451                                      int32_t   *start,
   2452                                      int32_t   *end, UErrorCode  *status)
   2453 {
   2454           UCollationElements *coleiter   = strsrch->textIter;
   2455           int32_t             textlength = strsrch->search->textLength;
   2456           int32_t         temp       = *end;
   2457     const UCollator          *collator   = strsrch->collator;
   2458     const UChar              *text       = strsrch->search->text;
   2459     // This part checks if either if the start of the match contains potential
   2460     // contraction. If so we'll have to iterate through them
   2461     // Since we used ucol_next while previously looking for the potential
   2462     // match, this guarantees that our end will not be a partial contraction,
   2463     // or a partial supplementary character.
   2464     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2465         int32_t expansion  = getExpansionSuffix(coleiter);
   2466         UBool   expandflag = expansion > 0;
   2467         setColEIterOffset(coleiter, *end);
   2468         while (expansion > 0) {
   2469             // getting rid of the redundant ce
   2470             // since forward contraction/expansion may have extra ces
   2471             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2472             // would have taken care of it.
   2473             // E.g. the character \u01FA will have an expansion of 3, but if
   2474             // we are only looking for A ring A\u030A, we'll have to skip the
   2475             // last ce in the expansion buffer
   2476             ucol_previous(coleiter, status);
   2477             if (U_FAILURE(*status)) {
   2478                 return FALSE;
   2479             }
   2480             if (ucol_getOffset(coleiter) != temp) {
   2481                 *end = temp;
   2482                 temp  = ucol_getOffset(coleiter);
   2483             }
   2484             expansion --;
   2485         }
   2486 
   2487         int32_t  *patternce       = strsrch->pattern.ces;
   2488         int32_t   patterncelength = strsrch->pattern.cesLength;
   2489         int32_t   count           = patterncelength;
   2490         while (count > 0) {
   2491             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2492             // status checked below, note that if status is a failure
   2493             // ucol_previous returns UCOL_NULLORDER
   2494             if (ce == UCOL_IGNORABLE) {
   2495                 continue;
   2496             }
   2497             if (expandflag && count == 0 &&
   2498                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2499                 *end = temp;
   2500                 temp  = ucol_getOffset(coleiter);
   2501             }
   2502             if (count == patterncelength &&
   2503                 ce != patternce[patterncelength - 1]) {
   2504                 // accents may have extra starting ces, this occurs when a
   2505                 // pure accent pattern is matched without rearrangement
   2506                 int32_t    expected = patternce[patterncelength - 1];
   2507                 U16_BACK_1(text, 0, *end);
   2508                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2509                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2510                     while (U_SUCCESS(*status) && ce != expected &&
   2511                            ce != UCOL_NULLORDER &&
   2512                            ucol_getOffset(coleiter) <= *start) {
   2513                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2514                     }
   2515                 }
   2516             }
   2517             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2518                 (*start) --;
   2519                 *start = getPreviousBaseOffset(text, *start);
   2520                 return FALSE;
   2521             }
   2522             count --;
   2523         }
   2524     }
   2525     return TRUE;
   2526 }
   2527 
   2528 /**
   2529 * Checks and sets the match information if found.
   2530 * Checks
   2531 * <ul>
   2532 * <li> the potential match does not repeat the previous match
   2533 * <li> boundaries are correct
   2534 * <li> potential match does not end in the middle of a contraction
   2535 * <li> identical matches
   2536 * <\ul>
   2537 * Otherwise the offset will be shifted to the next character.
   2538 * Internal method, status assumed to be success, caller has to check status
   2539 * before calling this method.
   2540 * @param strsrch string search data
   2541 * @param textoffset offset in the collation element text. the returned value
   2542 *        will be the truncated start offset of the match or the new start
   2543 *        search offset.
   2544 * @param status only error status if any
   2545 * @return TRUE if the match is valid, FALSE otherwise
   2546 */
   2547 static
   2548 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2549                                          int32_t   *textoffset,
   2550                                          UErrorCode    *status)
   2551 {
   2552     // to ensure that the start and ends are not composite characters
   2553     UCollationElements *coleiter = strsrch->textIter;
   2554     // if we have a canonical accent match
   2555     if ((strsrch->pattern.hasSuffixAccents &&
   2556         strsrch->canonicalSuffixAccents[0]) ||
   2557         (strsrch->pattern.hasPrefixAccents &&
   2558         strsrch->canonicalPrefixAccents[0])) {
   2559         strsrch->search->matchedIndex  = *textoffset;
   2560         strsrch->search->matchedLength =
   2561             getNextUStringSearchBaseOffset(strsrch,
   2562                                       getColElemIterOffset(coleiter, FALSE))
   2563             - *textoffset;
   2564         return TRUE;
   2565     }
   2566 
   2567     int32_t end = ucol_getOffset(coleiter);
   2568     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2569                                                 status) ||
   2570          U_FAILURE(*status)) {
   2571         return FALSE;
   2572     }
   2573 
   2574     end = getNextUStringSearchBaseOffset(strsrch, end);
   2575     // this totally matches, however we need to check if it is repeating
   2576     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2577         !isBreakUnit(strsrch, *textoffset, end) ||
   2578         !checkIdentical(strsrch, *textoffset, end)) {
   2579         (*textoffset) --;
   2580         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2581                                             *textoffset);
   2582         return FALSE;
   2583     }
   2584 
   2585     strsrch->search->matchedIndex  = *textoffset;
   2586     strsrch->search->matchedLength = end - *textoffset;
   2587     return TRUE;
   2588 }
   2589 #endif // #if BOYER_MOORE
   2590 
   2591 // constructors and destructor -------------------------------------------
   2592 
   2593 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2594                                           int32_t         patternlength,
   2595                                     const UChar          *text,
   2596                                           int32_t         textlength,
   2597                                     const char           *locale,
   2598                                           UBreakIterator *breakiter,
   2599                                           UErrorCode     *status)
   2600 {
   2601     if (U_FAILURE(*status)) {
   2602         return NULL;
   2603     }
   2604 #if UCONFIG_NO_BREAK_ITERATION
   2605     if (breakiter != NULL) {
   2606         *status = U_UNSUPPORTED_ERROR;
   2607         return NULL;
   2608     }
   2609 #endif
   2610     if (locale) {
   2611         // ucol_open internally checks for status
   2612         UCollator     *collator = ucol_open(locale, status);
   2613         // pattern, text checks are done in usearch_openFromCollator
   2614         UStringSearch *result   = usearch_openFromCollator(pattern,
   2615                                               patternlength, text, textlength,
   2616                                               collator, breakiter, status);
   2617 
   2618         if (result == NULL || U_FAILURE(*status)) {
   2619             if (collator) {
   2620                 ucol_close(collator);
   2621             }
   2622             return NULL;
   2623         }
   2624         else {
   2625             result->ownCollator = TRUE;
   2626         }
   2627         return result;
   2628     }
   2629     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2630     return NULL;
   2631 }
   2632 
   2633 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2634                                   const UChar          *pattern,
   2635                                         int32_t         patternlength,
   2636                                   const UChar          *text,
   2637                                         int32_t         textlength,
   2638                                   const UCollator      *collator,
   2639                                         UBreakIterator *breakiter,
   2640                                         UErrorCode     *status)
   2641 {
   2642     if (U_FAILURE(*status)) {
   2643         return NULL;
   2644     }
   2645 #if UCONFIG_NO_BREAK_ITERATION
   2646     if (breakiter != NULL) {
   2647         *status = U_UNSUPPORTED_ERROR;
   2648         return NULL;
   2649     }
   2650 #endif
   2651     if (pattern == NULL || text == NULL || collator == NULL) {
   2652         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2653         return NULL;
   2654     }
   2655 
   2656     // string search does not really work when numeric collation is turned on
   2657     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2658         *status = U_UNSUPPORTED_ERROR;
   2659         return NULL;
   2660     }
   2661 
   2662     if (U_SUCCESS(*status)) {
   2663         initializeFCD(status);
   2664         if (U_FAILURE(*status)) {
   2665             return NULL;
   2666         }
   2667 
   2668         UStringSearch *result;
   2669         if (textlength == -1) {
   2670             textlength = u_strlen(text);
   2671         }
   2672         if (patternlength == -1) {
   2673             patternlength = u_strlen(pattern);
   2674         }
   2675         if (textlength <= 0 || patternlength <= 0) {
   2676             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2677             return NULL;
   2678         }
   2679 
   2680         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2681         if (result == NULL) {
   2682             *status = U_MEMORY_ALLOCATION_ERROR;
   2683             return NULL;
   2684         }
   2685 
   2686         result->collator    = collator;
   2687         result->strength    = ucol_getStrength(collator);
   2688         result->ceMask      = getMask(result->strength);
   2689         result->toShift     =
   2690              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2691                                                             UCOL_SHIFTED;
   2692         result->variableTop = ucol_getVariableTop(collator, status);
   2693 
   2694         result->nfd         = Normalizer2::getNFDInstance(*status);
   2695 
   2696         if (U_FAILURE(*status)) {
   2697             uprv_free(result);
   2698             return NULL;
   2699         }
   2700 
   2701         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2702         if (result->search == NULL) {
   2703             *status = U_MEMORY_ALLOCATION_ERROR;
   2704             uprv_free(result);
   2705             return NULL;
   2706         }
   2707 
   2708         result->search->text       = text;
   2709         result->search->textLength = textlength;
   2710 
   2711         result->pattern.text       = pattern;
   2712         result->pattern.textLength = patternlength;
   2713         result->pattern.ces         = NULL;
   2714         result->pattern.pces        = NULL;
   2715 
   2716         result->search->breakIter  = breakiter;
   2717 #if !UCONFIG_NO_BREAK_ITERATION
   2718         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2719         if (breakiter) {
   2720             ubrk_setText(breakiter, text, textlength, status);
   2721         }
   2722 #endif
   2723 
   2724         result->ownCollator           = FALSE;
   2725         result->search->matchedLength = 0;
   2726         result->search->matchedIndex  = USEARCH_DONE;
   2727         result->utilIter              = NULL;
   2728         result->textIter              = ucol_openElements(collator, text,
   2729                                                           textlength, status);
   2730         result->textProcessedIter     = NULL;
   2731         if (U_FAILURE(*status)) {
   2732             usearch_close(result);
   2733             return NULL;
   2734         }
   2735 
   2736         result->search->isOverlap          = FALSE;
   2737         result->search->isCanonicalMatch   = FALSE;
   2738         result->search->elementComparisonType = 0;
   2739         result->search->isForwardSearching = TRUE;
   2740         result->search->reset              = TRUE;
   2741 
   2742         initialize(result, status);
   2743 
   2744         if (U_FAILURE(*status)) {
   2745             usearch_close(result);
   2746             return NULL;
   2747         }
   2748 
   2749         return result;
   2750     }
   2751     return NULL;
   2752 }
   2753 
   2754 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2755 {
   2756     if (strsrch) {
   2757         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
   2758             strsrch->pattern.ces) {
   2759             uprv_free(strsrch->pattern.ces);
   2760         }
   2761 
   2762         if (strsrch->pattern.pces != NULL &&
   2763             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
   2764             uprv_free(strsrch->pattern.pces);
   2765         }
   2766 
   2767         delete strsrch->textProcessedIter;
   2768         ucol_closeElements(strsrch->textIter);
   2769         ucol_closeElements(strsrch->utilIter);
   2770 
   2771         if (strsrch->ownCollator && strsrch->collator) {
   2772             ucol_close((UCollator *)strsrch->collator);
   2773         }
   2774 
   2775 #if !UCONFIG_NO_BREAK_ITERATION
   2776         if (strsrch->search->internalBreakIter) {
   2777             ubrk_close(strsrch->search->internalBreakIter);
   2778         }
   2779 #endif
   2780 
   2781         uprv_free(strsrch->search);
   2782         uprv_free(strsrch);
   2783     }
   2784 }
   2785 
   2786 namespace {
   2787 
   2788 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
   2789     if (U_FAILURE(*status)) { return FALSE; }
   2790     if (strsrch->textProcessedIter == NULL) {
   2791         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
   2792         if (strsrch->textProcessedIter == NULL) {
   2793             *status = U_MEMORY_ALLOCATION_ERROR;
   2794             return FALSE;
   2795         }
   2796     } else {
   2797         strsrch->textProcessedIter->init(strsrch->textIter);
   2798     }
   2799     return TRUE;
   2800 }
   2801 
   2802 }
   2803 
   2804 // set and get methods --------------------------------------------------
   2805 
   2806 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2807                                         int32_t    position,
   2808                                         UErrorCode    *status)
   2809 {
   2810     if (U_SUCCESS(*status) && strsrch) {
   2811         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2812             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2813         }
   2814         else {
   2815             setColEIterOffset(strsrch->textIter, position);
   2816         }
   2817         strsrch->search->matchedIndex  = USEARCH_DONE;
   2818         strsrch->search->matchedLength = 0;
   2819         strsrch->search->reset         = FALSE;
   2820     }
   2821 }
   2822 
   2823 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2824 {
   2825     if (strsrch) {
   2826         int32_t result = ucol_getOffset(strsrch->textIter);
   2827         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2828             return USEARCH_DONE;
   2829         }
   2830         return result;
   2831     }
   2832     return USEARCH_DONE;
   2833 }
   2834 
   2835 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2836                                  USearchAttribute attribute,
   2837                                  USearchAttributeValue value,
   2838                                  UErrorCode *status)
   2839 {
   2840     if (U_SUCCESS(*status) && strsrch) {
   2841         switch (attribute)
   2842         {
   2843         case USEARCH_OVERLAP :
   2844             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2845             break;
   2846         case USEARCH_CANONICAL_MATCH :
   2847             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2848                                                                       FALSE);
   2849             break;
   2850         case USEARCH_ELEMENT_COMPARISON :
   2851             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2852                 strsrch->search->elementComparisonType = (int16_t)value;
   2853             } else {
   2854                 strsrch->search->elementComparisonType = 0;
   2855             }
   2856             break;
   2857         case USEARCH_ATTRIBUTE_COUNT :
   2858         default:
   2859             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2860         }
   2861     }
   2862     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2863         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2864     }
   2865 }
   2866 
   2867 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2868                                                 const UStringSearch *strsrch,
   2869                                                 USearchAttribute attribute)
   2870 {
   2871     if (strsrch) {
   2872         switch (attribute) {
   2873         case USEARCH_OVERLAP :
   2874             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2875                                                         USEARCH_OFF);
   2876         case USEARCH_CANONICAL_MATCH :
   2877             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2878                                                                USEARCH_OFF);
   2879         case USEARCH_ELEMENT_COMPARISON :
   2880             {
   2881                 int16_t value = strsrch->search->elementComparisonType;
   2882                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2883                     return (USearchAttributeValue)value;
   2884                 } else {
   2885                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
   2886                 }
   2887             }
   2888         case USEARCH_ATTRIBUTE_COUNT :
   2889             return USEARCH_DEFAULT;
   2890         }
   2891     }
   2892     return USEARCH_DEFAULT;
   2893 }
   2894 
   2895 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2896                                                 const UStringSearch *strsrch)
   2897 {
   2898     if (strsrch == NULL) {
   2899         return USEARCH_DONE;
   2900     }
   2901     return strsrch->search->matchedIndex;
   2902 }
   2903 
   2904 
   2905 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2906                                             UChar         *result,
   2907                                             int32_t        resultCapacity,
   2908                                             UErrorCode    *status)
   2909 {
   2910     if (U_FAILURE(*status)) {
   2911         return USEARCH_DONE;
   2912     }
   2913     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2914         result == NULL)) {
   2915         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2916         return USEARCH_DONE;
   2917     }
   2918 
   2919     int32_t     copylength = strsrch->search->matchedLength;
   2920     int32_t copyindex  = strsrch->search->matchedIndex;
   2921     if (copyindex == USEARCH_DONE) {
   2922         u_terminateUChars(result, resultCapacity, 0, status);
   2923         return USEARCH_DONE;
   2924     }
   2925 
   2926     if (resultCapacity < copylength) {
   2927         copylength = resultCapacity;
   2928     }
   2929     if (copylength > 0) {
   2930         uprv_memcpy(result, strsrch->search->text + copyindex,
   2931                     copylength * sizeof(UChar));
   2932     }
   2933     return u_terminateUChars(result, resultCapacity,
   2934                              strsrch->search->matchedLength, status);
   2935 }
   2936 
   2937 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2938                                               const UStringSearch *strsrch)
   2939 {
   2940     if (strsrch) {
   2941         return strsrch->search->matchedLength;
   2942     }
   2943     return USEARCH_DONE;
   2944 }
   2945 
   2946 #if !UCONFIG_NO_BREAK_ITERATION
   2947 
   2948 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2949                                                UBreakIterator *breakiter,
   2950                                                UErrorCode     *status)
   2951 {
   2952     if (U_SUCCESS(*status) && strsrch) {
   2953         strsrch->search->breakIter = breakiter;
   2954         if (breakiter) {
   2955             ubrk_setText(breakiter, strsrch->search->text,
   2956                          strsrch->search->textLength, status);
   2957         }
   2958     }
   2959 }
   2960 
   2961 U_CAPI const UBreakIterator* U_EXPORT2
   2962 usearch_getBreakIterator(const UStringSearch *strsrch)
   2963 {
   2964     if (strsrch) {
   2965         return strsrch->search->breakIter;
   2966     }
   2967     return NULL;
   2968 }
   2969 
   2970 #endif
   2971 
   2972 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2973                                       const UChar         *text,
   2974                                             int32_t        textlength,
   2975                                             UErrorCode    *status)
   2976 {
   2977     if (U_SUCCESS(*status)) {
   2978         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2979             textlength == 0) {
   2980             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2981         }
   2982         else {
   2983             if (textlength == -1) {
   2984                 textlength = u_strlen(text);
   2985             }
   2986             strsrch->search->text       = text;
   2987             strsrch->search->textLength = textlength;
   2988             ucol_setText(strsrch->textIter, text, textlength, status);
   2989             strsrch->search->matchedIndex  = USEARCH_DONE;
   2990             strsrch->search->matchedLength = 0;
   2991             strsrch->search->reset         = TRUE;
   2992 #if !UCONFIG_NO_BREAK_ITERATION
   2993             if (strsrch->search->breakIter != NULL) {
   2994                 ubrk_setText(strsrch->search->breakIter, text,
   2995                              textlength, status);
   2996             }
   2997             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2998 #endif
   2999         }
   3000     }
   3001 }
   3002 
   3003 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   3004                                                      int32_t       *length)
   3005 {
   3006     if (strsrch) {
   3007         *length = strsrch->search->textLength;
   3008         return strsrch->search->text;
   3009     }
   3010     return NULL;
   3011 }
   3012 
   3013 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3014                                           const UCollator     *collator,
   3015                                                 UErrorCode    *status)
   3016 {
   3017     if (U_SUCCESS(*status)) {
   3018         if (collator == NULL) {
   3019             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3020             return;
   3021         }
   3022 
   3023         if (strsrch) {
   3024             delete strsrch->textProcessedIter;
   3025             strsrch->textProcessedIter = NULL;
   3026             ucol_closeElements(strsrch->textIter);
   3027             ucol_closeElements(strsrch->utilIter);
   3028             strsrch->textIter = strsrch->utilIter = NULL;
   3029             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3030                 ucol_close((UCollator *)strsrch->collator);
   3031                 strsrch->ownCollator = FALSE;
   3032             }
   3033             strsrch->collator    = collator;
   3034             strsrch->strength    = ucol_getStrength(collator);
   3035             strsrch->ceMask      = getMask(strsrch->strength);
   3036 #if !UCONFIG_NO_BREAK_ITERATION
   3037             ubrk_close(strsrch->search->internalBreakIter);
   3038             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3039                                                      strsrch->search->text, strsrch->search->textLength, status);
   3040 #endif
   3041             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3042             strsrch->toShift     =
   3043                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3044                                                                 UCOL_SHIFTED;
   3045             // if status is a failure, ucol_getVariableTop returns 0
   3046             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3047             strsrch->textIter = ucol_openElements(collator,
   3048                                       strsrch->search->text,
   3049                                       strsrch->search->textLength,
   3050                                       status);
   3051             strsrch->utilIter = ucol_openElements(
   3052                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
   3053             // initialize() _after_ setting the iterators for the new collator.
   3054             initialize(strsrch, status);
   3055         }
   3056 
   3057         // **** are these calls needed?
   3058         // **** we call uprv_init_pce in initializePatternPCETable
   3059         // **** and the CEIBuffer constructor...
   3060 #if 0
   3061         uprv_init_pce(strsrch->textIter);
   3062         uprv_init_pce(strsrch->utilIter);
   3063 #endif
   3064     }
   3065 }
   3066 
   3067 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3068 {
   3069     if (strsrch) {
   3070         return (UCollator *)strsrch->collator;
   3071     }
   3072     return NULL;
   3073 }
   3074 
   3075 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3076                                          const UChar         *pattern,
   3077                                                int32_t        patternlength,
   3078                                                UErrorCode    *status)
   3079 {
   3080     if (U_SUCCESS(*status)) {
   3081         if (strsrch == NULL || pattern == NULL) {
   3082             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3083         }
   3084         else {
   3085             if (patternlength == -1) {
   3086                 patternlength = u_strlen(pattern);
   3087             }
   3088             if (patternlength == 0) {
   3089                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3090                 return;
   3091             }
   3092             strsrch->pattern.text       = pattern;
   3093             strsrch->pattern.textLength = patternlength;
   3094             initialize(strsrch, status);
   3095         }
   3096     }
   3097 }
   3098 
   3099 U_CAPI const UChar* U_EXPORT2
   3100 usearch_getPattern(const UStringSearch *strsrch,
   3101                    int32_t       *length)
   3102 {
   3103     if (strsrch) {
   3104         *length = strsrch->pattern.textLength;
   3105         return strsrch->pattern.text;
   3106     }
   3107     return NULL;
   3108 }
   3109 
   3110 // miscellanous methods --------------------------------------------------
   3111 
   3112 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3113                                            UErrorCode    *status)
   3114 {
   3115     if (strsrch && U_SUCCESS(*status)) {
   3116         strsrch->search->isForwardSearching = TRUE;
   3117         usearch_setOffset(strsrch, 0, status);
   3118         if (U_SUCCESS(*status)) {
   3119             return usearch_next(strsrch, status);
   3120         }
   3121     }
   3122     return USEARCH_DONE;
   3123 }
   3124 
   3125 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3126                                                int32_t    position,
   3127                                                UErrorCode    *status)
   3128 {
   3129     if (strsrch && U_SUCCESS(*status)) {
   3130         strsrch->search->isForwardSearching = TRUE;
   3131         // position checked in usearch_setOffset
   3132         usearch_setOffset(strsrch, position, status);
   3133         if (U_SUCCESS(*status)) {
   3134             return usearch_next(strsrch, status);
   3135         }
   3136     }
   3137     return USEARCH_DONE;
   3138 }
   3139 
   3140 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3141                                           UErrorCode    *status)
   3142 {
   3143     if (strsrch && U_SUCCESS(*status)) {
   3144         strsrch->search->isForwardSearching = FALSE;
   3145         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3146         if (U_SUCCESS(*status)) {
   3147             return usearch_previous(strsrch, status);
   3148         }
   3149     }
   3150     return USEARCH_DONE;
   3151 }
   3152 
   3153 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3154                                                int32_t    position,
   3155                                                UErrorCode    *status)
   3156 {
   3157     if (strsrch && U_SUCCESS(*status)) {
   3158         strsrch->search->isForwardSearching = FALSE;
   3159         // position checked in usearch_setOffset
   3160         usearch_setOffset(strsrch, position, status);
   3161         if (U_SUCCESS(*status)) {
   3162             return usearch_previous(strsrch, status);
   3163         }
   3164     }
   3165     return USEARCH_DONE;
   3166 }
   3167 
   3168 /**
   3169 * If a direction switch is required, we'll count the number of ces till the
   3170 * beginning of the collation element iterator and iterate forwards that
   3171 * number of times. This is so that we get to the correct point within the
   3172 * string to continue the search in. Imagine when we are in the middle of the
   3173 * normalization buffer when the change in direction is request. arrrgghh....
   3174 * After searching the offset within the collation element iterator will be
   3175 * shifted to the start of the match. If a match is not found, the offset would
   3176 * have been set to the end of the text string in the collation element
   3177 * iterator.
   3178 * Okay, here's my take on normalization buffer. The only time when there can
   3179 * be 2 matches within the same normalization is when the pattern is consists
   3180 * of all accents. But since the offset returned is from the text string, we
   3181 * should not confuse the caller by returning the second match within the
   3182 * same normalization buffer. If we do, the 2 results will have the same match
   3183 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3184 * fall within the same normalization buffer. Note this does not affect the
   3185 * results of matches spanning the text and the normalization buffer.
   3186 * The position to start searching is taken from the collation element
   3187 * iterator. Callers of this API would have to set the offset in the collation
   3188 * element iterator before using this method.
   3189 */
   3190 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3191                                           UErrorCode    *status)
   3192 {
   3193     if (U_SUCCESS(*status) && strsrch) {
   3194         // note offset is either equivalent to the start of the previous match
   3195         // or is set by the user
   3196         int32_t      offset       = usearch_getOffset(strsrch);
   3197         USearch     *search       = strsrch->search;
   3198         search->reset             = FALSE;
   3199         int32_t      textlength   = search->textLength;
   3200         if (search->isForwardSearching) {
   3201 #if BOYER_MOORE
   3202             if (offset == textlength
   3203                 || (!search->isOverlap &&
   3204                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3205                     (search->matchedIndex != USEARCH_DONE &&
   3206                      offset + search->matchedLength >= textlength)))) {
   3207                 // not enough characters to match
   3208                 setMatchNotFound(strsrch);
   3209                 return USEARCH_DONE;
   3210             }
   3211 #else
   3212             if (offset == textlength ||
   3213                 (! search->isOverlap &&
   3214                 (search->matchedIndex != USEARCH_DONE &&
   3215                 offset + search->matchedLength > textlength))) {
   3216                     // not enough characters to match
   3217                     setMatchNotFound(strsrch);
   3218                     return USEARCH_DONE;
   3219             }
   3220 #endif
   3221         }
   3222         else {
   3223             // switching direction.
   3224             // if matchedIndex == USEARCH_DONE, it means that either a
   3225             // setOffset has been called or that previous ran off the text
   3226             // string. the iterator would have been set to offset 0 if a
   3227             // match is not found.
   3228             search->isForwardSearching = TRUE;
   3229             if (search->matchedIndex != USEARCH_DONE) {
   3230                 // there's no need to set the collation element iterator
   3231                 // the next call to next will set the offset.
   3232                 return search->matchedIndex;
   3233             }
   3234         }
   3235 
   3236         if (U_SUCCESS(*status)) {
   3237             if (strsrch->pattern.cesLength == 0) {
   3238                 if (search->matchedIndex == USEARCH_DONE) {
   3239                     search->matchedIndex = offset;
   3240                 }
   3241                 else { // moves by codepoints
   3242                     U16_FWD_1(search->text, search->matchedIndex, textlength);
   3243                 }
   3244 
   3245                 search->matchedLength = 0;
   3246                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3247                 // status checked below
   3248                 if (search->matchedIndex == textlength) {
   3249                     search->matchedIndex = USEARCH_DONE;
   3250                 }
   3251             }
   3252             else {
   3253                 if (search->matchedLength > 0) {
   3254                     // if matchlength is 0 we are at the start of the iteration
   3255                     if (search->isOverlap) {
   3256                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3257                     }
   3258                     else {
   3259                         ucol_setOffset(strsrch->textIter,
   3260                                        offset + search->matchedLength, status);
   3261                     }
   3262                 }
   3263                 else {
   3264                     // for boundary check purposes. this will ensure that the
   3265                     // next match will not preceed the current offset
   3266                     // note search->matchedIndex will always be set to something
   3267                     // in the code
   3268                     search->matchedIndex = offset - 1;
   3269                 }
   3270 
   3271                 if (search->isCanonicalMatch) {
   3272                     // can't use exact here since extra accents are allowed.
   3273                     usearch_handleNextCanonical(strsrch, status);
   3274                 }
   3275                 else {
   3276                     usearch_handleNextExact(strsrch, status);
   3277                 }
   3278             }
   3279 
   3280             if (U_FAILURE(*status)) {
   3281                 return USEARCH_DONE;
   3282             }
   3283 
   3284 #if !BOYER_MOORE
   3285             if (search->matchedIndex == USEARCH_DONE) {
   3286                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3287             } else {
   3288                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3289             }
   3290 #endif
   3291 
   3292             return search->matchedIndex;
   3293         }
   3294     }
   3295     return USEARCH_DONE;
   3296 }
   3297 
   3298 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3299                                               UErrorCode *status)
   3300 {
   3301     if (U_SUCCESS(*status) && strsrch) {
   3302         int32_t offset;
   3303         USearch *search = strsrch->search;
   3304         if (search->reset) {
   3305             offset                     = search->textLength;
   3306             search->isForwardSearching = FALSE;
   3307             search->reset              = FALSE;
   3308             setColEIterOffset(strsrch->textIter, offset);
   3309         }
   3310         else {
   3311             offset = usearch_getOffset(strsrch);
   3312         }
   3313 
   3314         int32_t matchedindex = search->matchedIndex;
   3315         if (search->isForwardSearching == TRUE) {
   3316             // switching direction.
   3317             // if matchedIndex == USEARCH_DONE, it means that either a
   3318             // setOffset has been called or that next ran off the text
   3319             // string. the iterator would have been set to offset textLength if
   3320             // a match is not found.
   3321             search->isForwardSearching = FALSE;
   3322             if (matchedindex != USEARCH_DONE) {
   3323                 return matchedindex;
   3324             }
   3325         }
   3326         else {
   3327 #if BOYER_MOORE
   3328             if (offset == 0 || matchedindex == 0 ||
   3329                 (!search->isOverlap &&
   3330                     (offset < strsrch->pattern.defaultShiftSize ||
   3331                     (matchedindex != USEARCH_DONE &&
   3332                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3333                 // not enough characters to match
   3334                 setMatchNotFound(strsrch);
   3335                 return USEARCH_DONE;
   3336             }
   3337 #else
   3338             // Could check pattern length, but the
   3339             // linear search will do the right thing
   3340             if (offset == 0 || matchedindex == 0) {
   3341                 setMatchNotFound(strsrch);
   3342                 return USEARCH_DONE;
   3343             }
   3344 #endif
   3345         }
   3346 
   3347         if (U_SUCCESS(*status)) {
   3348             if (strsrch->pattern.cesLength == 0) {
   3349                 search->matchedIndex =
   3350                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3351                 if (search->matchedIndex == 0) {
   3352                     setMatchNotFound(strsrch);
   3353                     // status checked below
   3354                 }
   3355                 else { // move by codepoints
   3356                     U16_BACK_1(search->text, 0, search->matchedIndex);
   3357                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3358                     // status checked below
   3359                     search->matchedLength = 0;
   3360                 }
   3361             }
   3362             else {
   3363                 if (strsrch->search->isCanonicalMatch) {
   3364                     // can't use exact here since extra accents are allowed.
   3365                     usearch_handlePreviousCanonical(strsrch, status);
   3366                     // status checked below
   3367                 }
   3368                 else {
   3369                     usearch_handlePreviousExact(strsrch, status);
   3370                     // status checked below
   3371                 }
   3372             }
   3373 
   3374             if (U_FAILURE(*status)) {
   3375                 return USEARCH_DONE;
   3376             }
   3377 
   3378             return search->matchedIndex;
   3379         }
   3380     }
   3381     return USEARCH_DONE;
   3382 }
   3383 
   3384 
   3385 
   3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3387 {
   3388     /*
   3389     reset is setting the attributes that are already in
   3390     string search, hence all attributes in the collator should
   3391     be retrieved without any problems
   3392     */
   3393     if (strsrch) {
   3394         UErrorCode status            = U_ZERO_ERROR;
   3395         UBool      sameCollAttribute = TRUE;
   3396         uint32_t   ceMask;
   3397         UBool      shift;
   3398         uint32_t   varTop;
   3399 
   3400         // **** hack to deal w/ how processed CEs encode quaternary ****
   3401         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3402         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3403             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3404                 sameCollAttribute = FALSE;
   3405         }
   3406 
   3407         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3408         ceMask = getMask(strsrch->strength);
   3409         if (strsrch->ceMask != ceMask) {
   3410             strsrch->ceMask = ceMask;
   3411             sameCollAttribute = FALSE;
   3412         }
   3413 
   3414         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3415         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3416                                   &status) == UCOL_SHIFTED;
   3417         if (strsrch->toShift != shift) {
   3418             strsrch->toShift  = shift;
   3419             sameCollAttribute = FALSE;
   3420         }
   3421 
   3422         // if status is a failure, ucol_getVariableTop returns 0
   3423         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3424         if (strsrch->variableTop != varTop) {
   3425             strsrch->variableTop = varTop;
   3426             sameCollAttribute    = FALSE;
   3427         }
   3428         if (!sameCollAttribute) {
   3429             initialize(strsrch, &status);
   3430         }
   3431         ucol_setText(strsrch->textIter, strsrch->search->text,
   3432                               strsrch->search->textLength,
   3433                               &status);
   3434         strsrch->search->matchedLength      = 0;
   3435         strsrch->search->matchedIndex       = USEARCH_DONE;
   3436         strsrch->search->isOverlap          = FALSE;
   3437         strsrch->search->isCanonicalMatch   = FALSE;
   3438         strsrch->search->elementComparisonType = 0;
   3439         strsrch->search->isForwardSearching = TRUE;
   3440         strsrch->search->reset              = TRUE;
   3441     }
   3442 }
   3443 
   3444 //
   3445 //  CEI  Collation Element + source text index.
   3446 //       These structs are kept in the circular buffer.
   3447 //
   3448 struct  CEI {
   3449     int64_t ce;
   3450     int32_t lowIndex;
   3451     int32_t highIndex;
   3452 };
   3453 
   3454 U_NAMESPACE_BEGIN
   3455 
   3456 namespace {
   3457 //
   3458 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
   3459 //
   3460 #define   DEFAULT_CEBUFFER_SIZE 96
   3461 #define   CEBUFFER_EXTRA 32
   3462 // Some typical max values to make buffer size more reasonable for asymmetric search.
   3463 // #8694 is for a better long-term solution to allocation of this buffer.
   3464 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
   3465 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
   3466 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
   3467 struct CEIBuffer {
   3468     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3469     CEI                 *buf;
   3470     int32_t              bufSize;
   3471     int32_t              firstIx;
   3472     int32_t              limitIx;
   3473     UCollationElements  *ceIter;
   3474     UStringSearch       *strSearch;
   3475 
   3476 
   3477 
   3478                CEIBuffer(UStringSearch *ss, UErrorCode *status);
   3479                ~CEIBuffer();
   3480    const CEI   *get(int32_t index);
   3481    const CEI   *getPrevious(int32_t index);
   3482 };
   3483 
   3484 
   3485 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
   3486     buf = defBuf;
   3487     strSearch = ss;
   3488     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
   3489     if (ss->search->elementComparisonType != 0) {
   3490         const UChar * patText = ss->pattern.text;
   3491         if (patText) {
   3492             const UChar * patTextLimit = patText + ss->pattern.textLength;
   3493             while ( patText < patTextLimit ) {
   3494                 UChar c = *patText++;
   3495                 if (MIGHT_BE_JAMO_L(c)) {
   3496                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
   3497                 } else {
   3498                     // No check for surrogates, we might allocate slightly more buffer than necessary.
   3499                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
   3500                 }
   3501             }
   3502         }
   3503     }
   3504     ceIter    = ss->textIter;
   3505     firstIx = 0;
   3506     limitIx = 0;
   3507 
   3508     if (!initTextProcessedIter(ss, status)) { return; }
   3509 
   3510     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3511         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3512         if (buf == NULL) {
   3513             *status = U_MEMORY_ALLOCATION_ERROR;
   3514         }
   3515     }
   3516 }
   3517 
   3518 // TODO: add a reset or init function so that allocated
   3519 //       buffers can be retained & reused.
   3520 
   3521 CEIBuffer::~CEIBuffer() {
   3522     if (buf != defBuf) {
   3523         uprv_free(buf);
   3524     }
   3525 }
   3526 
   3527 
   3528 // Get the CE with the specified index.
   3529 //   Index must be in the range
   3530 //          n-history_size < index < n+1
   3531 //   where n is the largest index to have been fetched by some previous call to this function.
   3532 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3533 //
   3534 const CEI *CEIBuffer::get(int32_t index) {
   3535     int i = index % bufSize;
   3536 
   3537     if (index>=firstIx && index<limitIx) {
   3538         // The request was for an entry already in our buffer.
   3539         //  Just return it.
   3540         return &buf[i];
   3541     }
   3542 
   3543     // Caller is requesting a new, never accessed before, CE.
   3544     //   Verify that it is the next one in sequence, which is all
   3545     //   that is allowed.
   3546     if (index != limitIx) {
   3547         U_ASSERT(FALSE);
   3548 
   3549         return NULL;
   3550     }
   3551 
   3552     // Manage the circular CE buffer indexing
   3553     limitIx++;
   3554 
   3555     if (limitIx - firstIx >= bufSize) {
   3556         // The buffer is full, knock out the lowest-indexed entry.
   3557         firstIx++;
   3558     }
   3559 
   3560     UErrorCode status = U_ZERO_ERROR;
   3561 
   3562     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3563 
   3564     return &buf[i];
   3565 }
   3566 
   3567 // Get the CE with the specified index.
   3568 //   Index must be in the range
   3569 //          n-history_size < index < n+1
   3570 //   where n is the largest index to have been fetched by some previous call to this function.
   3571 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3572 //
   3573 const CEI *CEIBuffer::getPrevious(int32_t index) {
   3574     int i = index % bufSize;
   3575 
   3576     if (index>=firstIx && index<limitIx) {
   3577         // The request was for an entry already in our buffer.
   3578         //  Just return it.
   3579         return &buf[i];
   3580     }
   3581 
   3582     // Caller is requesting a new, never accessed before, CE.
   3583     //   Verify that it is the next one in sequence, which is all
   3584     //   that is allowed.
   3585     if (index != limitIx) {
   3586         U_ASSERT(FALSE);
   3587 
   3588         return NULL;
   3589     }
   3590 
   3591     // Manage the circular CE buffer indexing
   3592     limitIx++;
   3593 
   3594     if (limitIx - firstIx >= bufSize) {
   3595         // The buffer is full, knock out the lowest-indexed entry.
   3596         firstIx++;
   3597     }
   3598 
   3599     UErrorCode status = U_ZERO_ERROR;
   3600 
   3601     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
   3602 
   3603     return &buf[i];
   3604 }
   3605 
   3606 }
   3607 
   3608 U_NAMESPACE_END
   3609 
   3610 
   3611 // #define USEARCH_DEBUG
   3612 
   3613 #ifdef USEARCH_DEBUG
   3614 #include <stdio.h>
   3615 #include <stdlib.h>
   3616 #endif
   3617 
   3618 /*
   3619  * Find the next break boundary after startIndex. If the UStringSearch object
   3620  * has an external break iterator, use that. Otherwise use the internal character
   3621  * break iterator.
   3622  */
   3623 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3624 #if 0
   3625     const UChar *text = strsrch->search->text;
   3626     int32_t textLen   = strsrch->search->textLength;
   3627 
   3628     U_ASSERT(startIndex>=0);
   3629     U_ASSERT(startIndex<=textLen);
   3630 
   3631     if (startIndex >= textLen) {
   3632         return startIndex;
   3633     }
   3634 
   3635     UChar32  c;
   3636     int32_t  i = startIndex;
   3637     U16_NEXT(text, i, textLen, c);
   3638 
   3639     // If we are on a control character, stop without looking for combining marks.
   3640     //    Control characters do not combine.
   3641     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3642     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3643         return i;
   3644     }
   3645 
   3646     // The initial character was not a control, and can thus accept trailing
   3647     //   combining characters.  Advance over however many of them there are.
   3648     int32_t  indexOfLastCharChecked;
   3649     for (;;) {
   3650         indexOfLastCharChecked = i;
   3651         if (i>=textLen) {
   3652             break;
   3653         }
   3654         U16_NEXT(text, i, textLen, c);
   3655         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3656         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3657             break;
   3658         }
   3659     }
   3660     return indexOfLastCharChecked;
   3661 #elif !UCONFIG_NO_BREAK_ITERATION
   3662     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3663 
   3664     if (breakiterator == NULL) {
   3665         breakiterator = strsrch->search->internalBreakIter;
   3666     }
   3667 
   3668     if (breakiterator != NULL) {
   3669         return ubrk_following(breakiterator, startIndex);
   3670     }
   3671 
   3672     return startIndex;
   3673 #else
   3674     // **** or should we use the original code? ****
   3675     return startIndex;
   3676 #endif
   3677 
   3678 }
   3679 
   3680 /*
   3681  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3682  * has an external break iterator, test using that, otherwise test
   3683  * using the internal character break iterator.
   3684  */
   3685 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3686 #if 0
   3687     const UChar *text = strsrch->search->text;
   3688     int32_t textLen   = strsrch->search->textLength;
   3689 
   3690     U_ASSERT(index>=0);
   3691     U_ASSERT(index<=textLen);
   3692 
   3693     if (index>=textLen || index<=0) {
   3694         return TRUE;
   3695     }
   3696 
   3697     // If the character at the current index is not a GRAPHEME_EXTEND
   3698     //    then we can not be within a combining sequence.
   3699     UChar32  c;
   3700     U16_GET(text, 0, index, textLen, c);
   3701     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3702     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3703         return TRUE;
   3704     }
   3705 
   3706     // We are at a combining mark.  If the preceding character is anything
   3707     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3708     U16_PREV(text, 0, index, c);
   3709     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3710     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3711     return !combining;
   3712 #elif !UCONFIG_NO_BREAK_ITERATION
   3713     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3714 
   3715     if (breakiterator == NULL) {
   3716         breakiterator = strsrch->search->internalBreakIter;
   3717     }
   3718 
   3719     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
   3720 #else
   3721     // **** or use the original code? ****
   3722     return TRUE;
   3723 #endif
   3724 }
   3725 
   3726 #if 0
   3727 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3728 {
   3729 #if !UCONFIG_NO_BREAK_ITERATION
   3730     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3731 
   3732     if (breakiterator != NULL) {
   3733         int32_t startindex = ubrk_first(breakiterator);
   3734         int32_t endindex   = ubrk_last(breakiterator);
   3735 
   3736         // out-of-range indexes are never boundary positions
   3737         if (start < startindex || start > endindex ||
   3738             end < startindex || end > endindex) {
   3739             return FALSE;
   3740         }
   3741 
   3742         return ubrk_isBoundary(breakiterator, start) &&
   3743                ubrk_isBoundary(breakiterator, end);
   3744     }
   3745 #endif
   3746 
   3747     return TRUE;
   3748 }
   3749 #endif
   3750 
   3751 typedef enum {
   3752     U_CE_MATCH = -1,
   3753     U_CE_NO_MATCH = 0,
   3754     U_CE_SKIP_TARG,
   3755     U_CE_SKIP_PATN
   3756 } UCompareCEsResult;
   3757 #define U_CE_LEVEL2_BASE 0x00000005
   3758 #define U_CE_LEVEL3_BASE 0x00050000
   3759 
   3760 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
   3761     if (targCE == patCE) {
   3762         return U_CE_MATCH;
   3763     }
   3764     if (compareType == 0) {
   3765         return U_CE_NO_MATCH;
   3766     }
   3767 
   3768     int64_t targCEshifted = targCE >> 32;
   3769     int64_t patCEshifted = patCE >> 32;
   3770     int64_t mask;
   3771 
   3772     mask = 0xFFFF0000;
   3773     int32_t targLev1 = (int32_t)(targCEshifted & mask);
   3774     int32_t patLev1 = (int32_t)(patCEshifted & mask);
   3775     if ( targLev1 != patLev1 ) {
   3776         if ( targLev1 == 0 ) {
   3777             return U_CE_SKIP_TARG;
   3778         }
   3779         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3780             return U_CE_SKIP_PATN;
   3781         }
   3782         return U_CE_NO_MATCH;
   3783     }
   3784 
   3785     mask = 0x0000FFFF;
   3786     int32_t targLev2 = (int32_t)(targCEshifted & mask);
   3787     int32_t patLev2 = (int32_t)(patCEshifted & mask);
   3788     if ( targLev2 != patLev2 ) {
   3789         if ( targLev2 == 0 ) {
   3790             return U_CE_SKIP_TARG;
   3791         }
   3792         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3793             return U_CE_SKIP_PATN;
   3794         }
   3795         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
   3796             U_CE_MATCH: U_CE_NO_MATCH;
   3797     }
   3798 
   3799     mask = 0xFFFF0000;
   3800     int32_t targLev3 = (int32_t)(targCE & mask);
   3801     int32_t patLev3 = (int32_t)(patCE & mask);
   3802     if ( targLev3 != patLev3 ) {
   3803         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
   3804             U_CE_MATCH: U_CE_NO_MATCH;
   3805    }
   3806 
   3807     return U_CE_MATCH;
   3808 }
   3809 
   3810 #if BOYER_MOORE
   3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
   3812 #endif
   3813 
   3814 namespace {
   3815 
   3816 UChar32 codePointAt(const USearch &search, int32_t index) {
   3817     if (index < search.textLength) {
   3818         UChar32 c;
   3819         U16_NEXT(search.text, index, search.textLength, c);
   3820         return c;
   3821     }
   3822     return U_SENTINEL;
   3823 }
   3824 
   3825 UChar32 codePointBefore(const USearch &search, int32_t index) {
   3826     if (0 < index) {
   3827         UChar32 c;
   3828         U16_PREV(search.text, 0, index, c);
   3829         return c;
   3830     }
   3831     return U_SENTINEL;
   3832 }
   3833 
   3834 }  // namespace
   3835 
   3836 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3837                                        int32_t        startIdx,
   3838                                        int32_t        *matchStart,
   3839                                        int32_t        *matchLimit,
   3840                                        UErrorCode     *status)
   3841 {
   3842     if (U_FAILURE(*status)) {
   3843         return FALSE;
   3844     }
   3845 
   3846     // TODO:  reject search patterns beginning with a combining char.
   3847 
   3848 #ifdef USEARCH_DEBUG
   3849     if (getenv("USEARCH_DEBUG") != NULL) {
   3850         printf("Pattern CEs\n");
   3851         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
   3852             printf(" %8x", strsrch->pattern.ces[ii]);
   3853         }
   3854         printf("\n");
   3855     }
   3856 
   3857 #endif
   3858     // Input parameter sanity check.
   3859     //  TODO:  should input indicies clip to the text length
   3860     //         in the same way that UText does.
   3861     if(strsrch->pattern.cesLength == 0         ||
   3862        startIdx < 0                           ||
   3863        startIdx > strsrch->search->textLength ||
   3864        strsrch->pattern.ces == NULL) {
   3865            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3866            return FALSE;
   3867     }
   3868 
   3869     if (strsrch->pattern.pces == NULL) {
   3870         initializePatternPCETable(strsrch, status);
   3871     }
   3872 
   3873     ucol_setOffset(strsrch->textIter, startIdx, status);
   3874     CEIBuffer ceb(strsrch, status);
   3875 
   3876 
   3877     int32_t    targetIx = 0;
   3878     const CEI *targetCEI = NULL;
   3879     int32_t    patIx;
   3880     UBool      found;
   3881 
   3882     int32_t  mStart = -1;
   3883     int32_t  mLimit = -1;
   3884     int32_t  minLimit;
   3885     int32_t  maxLimit;
   3886 
   3887 
   3888 
   3889     // Outer loop moves over match starting positions in the
   3890     //      target CE space.
   3891     // Here we see the target as a sequence of collation elements, resulting from the following:
   3892     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   3893     //    (for example, digraphs such as IJ may be broken into two characters).
   3894     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   3895     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   3896     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   3897     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   3898     //    then the CE is deleted, so the following code sees only CEs that are relevant.
   3899     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   3900     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   3901     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   3902     //
   3903     for(targetIx=0; ; targetIx++)
   3904     {
   3905         found = TRUE;
   3906         //  Inner loop checks for a match beginning at each
   3907         //  position from the outer loop.
   3908         int32_t targetIxOffset = 0;
   3909         int64_t patCE = 0;
   3910         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
   3911         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   3912         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
   3913         const CEI *firstCEI = ceb.get(targetIx);
   3914         if (firstCEI == NULL) {
   3915             *status = U_INTERNAL_PROGRAM_ERROR;
   3916             found = FALSE;
   3917             break;
   3918         }
   3919 
   3920         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
   3921             patCE = strsrch->pattern.pces[patIx];
   3922             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
   3923             //  Compare CE from target string with CE from the pattern.
   3924             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   3925             //    which will fail the compare, below.
   3926             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   3927             if ( ceMatch == U_CE_NO_MATCH ) {
   3928                 found = FALSE;
   3929                 break;
   3930             } else if ( ceMatch > U_CE_NO_MATCH ) {
   3931                 if ( ceMatch == U_CE_SKIP_TARG ) {
   3932                     // redo with same patCE, next targCE
   3933                     patIx--;
   3934                     targetIxOffset++;
   3935                 } else { // ceMatch == U_CE_SKIP_PATN
   3936                     // redo with same targCE, next patCE
   3937                     targetIxOffset--;
   3938                 }
   3939             }
   3940         }
   3941         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
   3942 
   3943         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   3944             // No match at this targetIx.  Try again at the next.
   3945             continue;
   3946         }
   3947 
   3948         if (!found) {
   3949             // No match at all, we have run off the end of the target text.
   3950             break;
   3951         }
   3952 
   3953 
   3954         // We have found a match in CE space.
   3955         // Now determine the bounds in string index space.
   3956         //  There still is a chance of match failure if the CE range not correspond to
   3957         //     an acceptable character range.
   3958         //
   3959         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
   3960 
   3961         mStart   = firstCEI->lowIndex;
   3962         minLimit = lastCEI->lowIndex;
   3963 
   3964         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3965         //   extended to the end of input, and the match is good.
   3966 
   3967         // Look at the high and low indices of the CE following the match. If
   3968         // they are the same it means one of two things:
   3969         //    1. The match extended to the last CE from the target text, which is OK, or
   3970         //    2. The last CE that was part of the match is in an expansion that extends
   3971         //       to the first CE after the match. In this case, we reject the match.
   3972         const CEI *nextCEI = 0;
   3973         if (strsrch->search->elementComparisonType == 0) {
   3974             nextCEI  = ceb.get(targetIx + targetIxOffset);
   3975             maxLimit = nextCEI->lowIndex;
   3976             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3977                 found = FALSE;
   3978             }
   3979         } else {
   3980             for ( ; ; ++targetIxOffset ) {
   3981                 nextCEI = ceb.get(targetIx + targetIxOffset);
   3982                 maxLimit = nextCEI->lowIndex;
   3983                 // If we are at the end of the target too, match succeeds
   3984                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
   3985                     break;
   3986                 }
   3987                 // As long as the next CE has primary weight of 0,
   3988                 // it is part of the last target element matched by the pattern;
   3989                 // make sure it can be part of a match with the last patCE
   3990                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
   3991                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
   3992                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
   3993                         found = FALSE;
   3994                         break;
   3995                     }
   3996                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   3997                 // target element, but it has non-zero primary weight => match fails
   3998                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
   3999                     found = false;
   4000                     break;
   4001                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
   4002                 } else {
   4003                     break;
   4004                 }
   4005             }
   4006         }
   4007 
   4008 
   4009         // Check for the start of the match being within a combining sequence.
   4010         //   This can happen if the pattern itself begins with a combining char, and
   4011         //   the match found combining marks in the target text that were attached
   4012         //    to something else.
   4013         //   This type of match should be rejected for not completely consuming a
   4014         //   combining sequence.
   4015         if (!isBreakBoundary(strsrch, mStart)) {
   4016             found = FALSE;
   4017         }
   4018 
   4019         // Check for the start of the match being within an Collation Element Expansion,
   4020         //   meaning that the first char of the match is only partially matched.
   4021         //   With exapnsions, the first CE will report the index of the source
   4022         //   character, and all subsequent (expansions) CEs will report the source index of the
   4023         //    _following_ character.
   4024         int32_t secondIx = firstCEI->highIndex;
   4025         if (mStart == secondIx) {
   4026             found = FALSE;
   4027         }
   4028 
   4029         // Allow matches to end in the middle of a grapheme cluster if the following
   4030         // conditions are met; this is needed to make prefix search work properly in
   4031         // Indic, see #11750
   4032         // * the default breakIter is being used
   4033         // * the next collation element after this combining sequence
   4034         //   - has non-zero primary weight
   4035         //   - corresponds to a separate character following the one at end of the current match
   4036         //   (the second of these conditions, and perhaps both, may be redundant given the
   4037         //   subsequent check for normalization boundary; however they are likely much faster
   4038         //   tests in any case)
   4039         // * the match limit is a normalization boundary
   4040         UBool allowMidclusterMatch = FALSE;
   4041         if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
   4042             allowMidclusterMatch =
   4043                     strsrch->search->breakIter == NULL &&
   4044                     nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
   4045                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
   4046                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
   4047                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
   4048         }
   4049         // If those conditions are met, then:
   4050         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
   4051         //   the match limit may be backed off to a previous break boundary. This handles
   4052         //   cases in which mLimit includes target characters that are ignorable with current
   4053         //   settings (such as space) and which extend beyond the pattern match.
   4054         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
   4055         // * do NOT require that match limit be on a breakIter boundary
   4056 
   4057         //  Advance the match end position to the first acceptable match boundary.
   4058         //    This advances the index over any combining charcters.
   4059         mLimit = maxLimit;
   4060         if (minLimit < maxLimit) {
   4061             // When the last CE's low index is same with its high index, the CE is likely
   4062             // a part of expansion. In this case, the index is located just after the
   4063             // character corresponding to the CEs compared above. If the index is right
   4064             // at the break boundary, move the position to the next boundary will result
   4065             // incorrect match length when there are ignorable characters exist between
   4066             // the position and the next character produces CE(s). See ticket#8482.
   4067             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
   4068                 mLimit = minLimit;
   4069             } else {
   4070                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4071                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
   4072                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
   4073                 // (i.e. we back off mLimit to the previous breakIterator boundary).
   4074                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
   4075                     mLimit = nba;
   4076                 }
   4077             }
   4078         }
   4079 
   4080     #ifdef USEARCH_DEBUG
   4081         if (getenv("USEARCH_DEBUG") != NULL) {
   4082             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4083         }
   4084     #endif
   4085 
   4086         if (!allowMidclusterMatch) {
   4087             // If advancing to the end of a combining sequence in character indexing space
   4088             //   advanced us beyond the end of the match in CE space, reject this match.
   4089             if (mLimit > maxLimit) {
   4090                 found = FALSE;
   4091             }
   4092 
   4093             if (!isBreakBoundary(strsrch, mLimit)) {
   4094                 found = FALSE;
   4095             }
   4096         }
   4097 
   4098         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4099             found = FALSE;
   4100         }
   4101 
   4102         if (found) {
   4103             break;
   4104         }
   4105     }
   4106 
   4107     #ifdef USEARCH_DEBUG
   4108     if (getenv("USEARCH_DEBUG") != NULL) {
   4109         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4110         int32_t  lastToPrint = ceb.limitIx+2;
   4111         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4112             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4113         }
   4114         printf("\n%s\n", found? "match found" : "no match");
   4115     }
   4116     #endif
   4117 
   4118     // All Done.  Store back the match bounds to the caller.
   4119     //
   4120     if (found==FALSE) {
   4121         mLimit = -1;
   4122         mStart = -1;
   4123     }
   4124 
   4125     if (matchStart != NULL) {
   4126         *matchStart= mStart;
   4127     }
   4128 
   4129     if (matchLimit != NULL) {
   4130         *matchLimit = mLimit;
   4131     }
   4132 
   4133     return found;
   4134 }
   4135 
   4136 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   4137                                                 int32_t        startIdx,
   4138                                                 int32_t        *matchStart,
   4139                                                 int32_t        *matchLimit,
   4140                                                 UErrorCode     *status)
   4141 {
   4142     if (U_FAILURE(*status)) {
   4143         return FALSE;
   4144     }
   4145 
   4146     // TODO:  reject search patterns beginning with a combining char.
   4147 
   4148 #ifdef USEARCH_DEBUG
   4149     if (getenv("USEARCH_DEBUG") != NULL) {
   4150         printf("Pattern CEs\n");
   4151         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
   4152             printf(" %8x", strsrch->pattern.ces[ii]);
   4153         }
   4154         printf("\n");
   4155     }
   4156 
   4157 #endif
   4158     // Input parameter sanity check.
   4159     //  TODO:  should input indicies clip to the text length
   4160     //         in the same way that UText does.
   4161     if(strsrch->pattern.cesLength == 0         ||
   4162        startIdx < 0                           ||
   4163        startIdx > strsrch->search->textLength ||
   4164        strsrch->pattern.ces == NULL) {
   4165            *status = U_ILLEGAL_ARGUMENT_ERROR;
   4166            return FALSE;
   4167     }
   4168 
   4169     if (strsrch->pattern.pces == NULL) {
   4170         initializePatternPCETable(strsrch, status);
   4171     }
   4172 
   4173     CEIBuffer ceb(strsrch, status);
   4174     int32_t    targetIx = 0;
   4175 
   4176     /*
   4177      * Pre-load the buffer with the CE's for the grapheme
   4178      * after our starting position so that we're sure that
   4179      * we can look at the CE following the match when we
   4180      * check the match boundaries.
   4181      *
   4182      * This will also pre-fetch the first CE that we'll
   4183      * consider for the match.
   4184      */
   4185     if (startIdx < strsrch->search->textLength) {
   4186         UBreakIterator *bi = strsrch->search->internalBreakIter;
   4187         int32_t next = ubrk_following(bi, startIdx);
   4188 
   4189         ucol_setOffset(strsrch->textIter, next, status);
   4190 
   4191         for (targetIx = 0; ; targetIx += 1) {
   4192             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   4193                 break;
   4194             }
   4195         }
   4196     } else {
   4197         ucol_setOffset(strsrch->textIter, startIdx, status);
   4198     }
   4199 
   4200 
   4201     const CEI *targetCEI = NULL;
   4202     int32_t    patIx;
   4203     UBool      found;
   4204 
   4205     int32_t  limitIx = targetIx;
   4206     int32_t  mStart = -1;
   4207     int32_t  mLimit = -1;
   4208     int32_t  minLimit;
   4209     int32_t  maxLimit;
   4210 
   4211 
   4212 
   4213     // Outer loop moves over match starting positions in the
   4214     //      target CE space.
   4215     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   4216     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   4217     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   4218     // and the beginning of the base text.
   4219     for(targetIx = limitIx; ; targetIx += 1)
   4220     {
   4221         found = TRUE;
   4222         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
   4223         // (compared to the last CE fetched for the previous targetIx value) as we need to go
   4224         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
   4225         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4226         if (lastCEI == NULL) {
   4227             *status = U_INTERNAL_PROGRAM_ERROR;
   4228             found = FALSE;
   4229              break;
   4230         }
   4231         //  Inner loop checks for a match beginning at each
   4232         //  position from the outer loop.
   4233         int32_t targetIxOffset = 0;
   4234         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
   4235             int64_t patCE = strsrch->pattern.pces[patIx];
   4236 
   4237             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
   4238             //  Compare CE from target string with CE from the pattern.
   4239             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4240             //    which will fail the compare, below.
   4241             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   4242             if ( ceMatch == U_CE_NO_MATCH ) {
   4243                 found = FALSE;
   4244                 break;
   4245             } else if ( ceMatch > U_CE_NO_MATCH ) {
   4246                 if ( ceMatch == U_CE_SKIP_TARG ) {
   4247                     // redo with same patCE, next targCE
   4248                     patIx++;
   4249                     targetIxOffset++;
   4250                 } else { // ceMatch == U_CE_SKIP_PATN
   4251                     // redo with same targCE, next patCE
   4252                     targetIxOffset--;
   4253                 }
   4254             }
   4255         }
   4256 
   4257         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   4258             // No match at this targetIx.  Try again at the next.
   4259             continue;
   4260         }
   4261 
   4262         if (!found) {
   4263             // No match at all, we have run off the end of the target text.
   4264             break;
   4265         }
   4266 
   4267 
   4268         // We have found a match in CE space.
   4269         // Now determine the bounds in string index space.
   4270         //  There still is a chance of match failure if the CE range not correspond to
   4271         //     an acceptable character range.
   4272         //
   4273         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
   4274         mStart   = firstCEI->lowIndex;
   4275 
   4276         // Check for the start of the match being within a combining sequence.
   4277         //   This can happen if the pattern itself begins with a combining char, and
   4278         //   the match found combining marks in the target text that were attached
   4279         //    to something else.
   4280         //   This type of match should be rejected for not completely consuming a
   4281         //   combining sequence.
   4282         if (!isBreakBoundary(strsrch, mStart)) {
   4283             found = FALSE;
   4284         }
   4285 
   4286         // Look at the high index of the first CE in the match. If it's the same as the
   4287         // low index, the first CE in the match is in the middle of an expansion.
   4288         if (mStart == firstCEI->highIndex) {
   4289             found = FALSE;
   4290         }
   4291 
   4292 
   4293         minLimit = lastCEI->lowIndex;
   4294 
   4295         if (targetIx > 0) {
   4296             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4297             //   extended to the end of input, and the match is good.
   4298 
   4299             // Look at the high and low indices of the CE following the match. If
   4300             // they are the same it means one of two things:
   4301             //    1. The match extended to the last CE from the target text, which is OK, or
   4302             //    2. The last CE that was part of the match is in an expansion that extends
   4303             //       to the first CE after the match. In this case, we reject the match.
   4304             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
   4305 
   4306             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4307                 found = FALSE;
   4308             }
   4309 
   4310             mLimit = maxLimit = nextCEI->lowIndex;
   4311 
   4312             // Allow matches to end in the middle of a grapheme cluster if the following
   4313             // conditions are met; this is needed to make prefix search work properly in
   4314             // Indic, see #11750
   4315             // * the default breakIter is being used
   4316             // * the next collation element after this combining sequence
   4317             //   - has non-zero primary weight
   4318             //   - corresponds to a separate character following the one at end of the current match
   4319             //   (the second of these conditions, and perhaps both, may be redundant given the
   4320             //   subsequent check for normalization boundary; however they are likely much faster
   4321             //   tests in any case)
   4322             // * the match limit is a normalization boundary
   4323             UBool allowMidclusterMatch = FALSE;
   4324             if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
   4325                 allowMidclusterMatch =
   4326                         strsrch->search->breakIter == NULL &&
   4327                         nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
   4328                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
   4329                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
   4330                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
   4331             }
   4332             // If those conditions are met, then:
   4333             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
   4334             //   the match limit may be backed off to a previous break boundary. This handles
   4335             //   cases in which mLimit includes target characters that are ignorable with current
   4336             //   settings (such as space) and which extend beyond the pattern match.
   4337             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
   4338             // * do NOT require that match limit be on a breakIter boundary
   4339 
   4340             //  Advance the match end position to the first acceptable match boundary.
   4341             //    This advances the index over any combining characters.
   4342             if (minLimit < maxLimit) {
   4343                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4344                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
   4345                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
   4346                 // (i.e. we back off mLimit to the previous breakIterator boundary).
   4347                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
   4348                     mLimit = nba;
   4349                 }
   4350             }
   4351 
   4352             if (!allowMidclusterMatch) {
   4353                 // If advancing to the end of a combining sequence in character indexing space
   4354                 //   advanced us beyond the end of the match in CE space, reject this match.
   4355                 if (mLimit > maxLimit) {
   4356                     found = FALSE;
   4357                 }
   4358 
   4359                 // Make sure the end of the match is on a break boundary
   4360                 if (!isBreakBoundary(strsrch, mLimit)) {
   4361                     found = FALSE;
   4362                 }
   4363             }
   4364 
   4365         } else {
   4366             // No non-ignorable CEs after this point.
   4367             // The maximum position is detected by boundary after
   4368             // the last non-ignorable CE. Combining sequence
   4369             // across the start index will be truncated.
   4370             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4371             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
   4372         }
   4373 
   4374     #ifdef USEARCH_DEBUG
   4375         if (getenv("USEARCH_DEBUG") != NULL) {
   4376             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4377         }
   4378     #endif
   4379 
   4380 
   4381         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4382             found = FALSE;
   4383         }
   4384 
   4385         if (found) {
   4386             break;
   4387         }
   4388     }
   4389 
   4390     #ifdef USEARCH_DEBUG
   4391     if (getenv("USEARCH_DEBUG") != NULL) {
   4392         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4393         int32_t  lastToPrint = ceb.limitIx+2;
   4394         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4395             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4396         }
   4397         printf("\n%s\n", found? "match found" : "no match");
   4398     }
   4399     #endif
   4400 
   4401     // All Done.  Store back the match bounds to the caller.
   4402     //
   4403     if (found==FALSE) {
   4404         mLimit = -1;
   4405         mStart = -1;
   4406     }
   4407 
   4408     if (matchStart != NULL) {
   4409         *matchStart= mStart;
   4410     }
   4411 
   4412     if (matchLimit != NULL) {
   4413         *matchLimit = mLimit;
   4414     }
   4415 
   4416     return found;
   4417 }
   4418 
   4419 // internal use methods declared in usrchimp.h -----------------------------
   4420 
   4421 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4422 {
   4423     if (U_FAILURE(*status)) {
   4424         setMatchNotFound(strsrch);
   4425         return FALSE;
   4426     }
   4427 
   4428 #if BOYER_MOORE
   4429     UCollationElements *coleiter        = strsrch->textIter;
   4430     int32_t             textlength      = strsrch->search->textLength;
   4431     int32_t            *patternce       = strsrch->pattern.ces;
   4432     int32_t             patterncelength = strsrch->pattern.cesLength;
   4433     int32_t             textoffset      = ucol_getOffset(coleiter);
   4434 
   4435     // status used in setting coleiter offset, since offset is checked in
   4436     // shiftForward before setting the coleiter offset, status never
   4437     // a failure
   4438     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4439                               patterncelength);
   4440     while (textoffset <= textlength)
   4441     {
   4442         uint32_t    patternceindex = patterncelength - 1;
   4443         int32_t     targetce;
   4444         UBool       found          = FALSE;
   4445         int32_t    lastce          = UCOL_NULLORDER;
   4446 
   4447         setColEIterOffset(coleiter, textoffset);
   4448 
   4449         for (;;) {
   4450             // finding the last pattern ce match, imagine composite characters
   4451             // for example: search for pattern A in text \u00C0
   4452             // we'll have to skip \u0300 the grave first before we get to A
   4453             targetce = ucol_previous(coleiter, status);
   4454             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4455                 found = FALSE;
   4456                 break;
   4457             }
   4458             targetce = getCE(strsrch, targetce);
   4459             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4460                 // this is for the text \u0315\u0300 that requires
   4461                 // normalization and pattern \u0300, where \u0315 is ignorable
   4462                 continue;
   4463             }
   4464             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4465                 lastce = targetce;
   4466             }
   4467             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4468             if (targetce == patternce[patternceindex]) {
   4469                 // the first ce can be a contraction
   4470                 found = TRUE;
   4471                 break;
   4472             }
   4473             if (!hasExpansion(coleiter)) {
   4474                 found = FALSE;
   4475                 break;
   4476             }
   4477         }
   4478 
   4479         //targetce = lastce;
   4480 
   4481         while (found && patternceindex > 0) {
   4482             lastce = targetce;
   4483             targetce    = ucol_previous(coleiter, status);
   4484             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4485                 found = FALSE;
   4486                 break;
   4487             }
   4488             targetce    = getCE(strsrch, targetce);
   4489             if (targetce == UCOL_IGNORABLE) {
   4490                 continue;
   4491             }
   4492 
   4493             patternceindex --;
   4494             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4495             found = found && targetce == patternce[patternceindex];
   4496         }
   4497 
   4498         targetce = lastce;
   4499 
   4500         if (!found) {
   4501             if (U_FAILURE(*status)) {
   4502                 break;
   4503             }
   4504             textoffset = shiftForward(strsrch, textoffset, lastce,
   4505                                       patternceindex);
   4506             // status checked at loop.
   4507             patternceindex = patterncelength;
   4508             continue;
   4509         }
   4510 
   4511         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4512             // status checked in ucol_setOffset
   4513             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4514             return TRUE;
   4515         }
   4516     }
   4517     setMatchNotFound(strsrch);
   4518     return FALSE;
   4519 #else
   4520     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4521     int32_t start = -1;
   4522     int32_t end = -1;
   4523 
   4524     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4525         strsrch->search->matchedIndex  = start;
   4526         strsrch->search->matchedLength = end - start;
   4527         return TRUE;
   4528     } else {
   4529         setMatchNotFound(strsrch);
   4530         return FALSE;
   4531     }
   4532 #endif
   4533 }
   4534 
   4535 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4536 {
   4537     if (U_FAILURE(*status)) {
   4538         setMatchNotFound(strsrch);
   4539         return FALSE;
   4540     }
   4541 
   4542 #if BOYER_MOORE
   4543     UCollationElements *coleiter        = strsrch->textIter;
   4544     int32_t             textlength      = strsrch->search->textLength;
   4545     int32_t            *patternce       = strsrch->pattern.ces;
   4546     int32_t             patterncelength = strsrch->pattern.cesLength;
   4547     int32_t             textoffset      = ucol_getOffset(coleiter);
   4548     UBool               hasPatternAccents =
   4549        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4550 
   4551     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4552                               patterncelength);
   4553     strsrch->canonicalPrefixAccents[0] = 0;
   4554     strsrch->canonicalSuffixAccents[0] = 0;
   4555 
   4556     while (textoffset <= textlength)
   4557     {
   4558         int32_t     patternceindex = patterncelength - 1;
   4559         int32_t     targetce;
   4560         UBool       found          = FALSE;
   4561         int32_t     lastce         = UCOL_NULLORDER;
   4562 
   4563         setColEIterOffset(coleiter, textoffset);
   4564 
   4565         for (;;) {
   4566             // finding the last pattern ce match, imagine composite characters
   4567             // for example: search for pattern A in text \u00C0
   4568             // we'll have to skip \u0300 the grave first before we get to A
   4569             targetce = ucol_previous(coleiter, status);
   4570             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4571                 found = FALSE;
   4572                 break;
   4573             }
   4574             targetce = getCE(strsrch, targetce);
   4575             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4576                 lastce = targetce;
   4577             }
   4578             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4579             if (targetce == patternce[patternceindex]) {
   4580                 // the first ce can be a contraction
   4581                 found = TRUE;
   4582                 break;
   4583             }
   4584             if (!hasExpansion(coleiter)) {
   4585                 found = FALSE;
   4586                 break;
   4587             }
   4588         }
   4589 
   4590         while (found && patternceindex > 0) {
   4591             targetce    = ucol_previous(coleiter, status);
   4592             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4593                 found = FALSE;
   4594                 break;
   4595             }
   4596             targetce    = getCE(strsrch, targetce);
   4597             if (targetce == UCOL_IGNORABLE) {
   4598                 continue;
   4599             }
   4600 
   4601             patternceindex --;
   4602             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4603             found = found && targetce == patternce[patternceindex];
   4604         }
   4605 
   4606         // initializing the rearranged accent array
   4607         if (hasPatternAccents && !found) {
   4608             strsrch->canonicalPrefixAccents[0] = 0;
   4609             strsrch->canonicalSuffixAccents[0] = 0;
   4610             if (U_FAILURE(*status)) {
   4611                 break;
   4612             }
   4613             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4614         }
   4615 
   4616         if (!found) {
   4617             if (U_FAILURE(*status)) {
   4618                 break;
   4619             }
   4620             textoffset = shiftForward(strsrch, textoffset, lastce,
   4621                                       patternceindex);
   4622             // status checked at loop
   4623             patternceindex = patterncelength;
   4624             continue;
   4625         }
   4626 
   4627         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4628             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4629             return TRUE;
   4630         }
   4631     }
   4632     setMatchNotFound(strsrch);
   4633     return FALSE;
   4634 #else
   4635     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4636     int32_t start = -1;
   4637     int32_t end = -1;
   4638 
   4639     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4640         strsrch->search->matchedIndex  = start;
   4641         strsrch->search->matchedLength = end - start;
   4642         return TRUE;
   4643     } else {
   4644         setMatchNotFound(strsrch);
   4645         return FALSE;
   4646     }
   4647 #endif
   4648 }
   4649 
   4650 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4651 {
   4652     if (U_FAILURE(*status)) {
   4653         setMatchNotFound(strsrch);
   4654         return FALSE;
   4655     }
   4656 
   4657 #if BOYER_MOORE
   4658     UCollationElements *coleiter        = strsrch->textIter;
   4659     int32_t            *patternce       = strsrch->pattern.ces;
   4660     int32_t             patterncelength = strsrch->pattern.cesLength;
   4661     int32_t             textoffset      = ucol_getOffset(coleiter);
   4662 
   4663     // shifting it check for setting offset
   4664     // if setOffset is called previously or there was no previous match, we
   4665     // leave the offset as it is.
   4666     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4667         textoffset = strsrch->search->matchedIndex;
   4668     }
   4669 
   4670     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4671                               patterncelength);
   4672 
   4673     while (textoffset >= 0)
   4674     {
   4675         int32_t     patternceindex = 1;
   4676         int32_t     targetce;
   4677         UBool       found          = FALSE;
   4678         int32_t     firstce        = UCOL_NULLORDER;
   4679 
   4680         // if status is a failure, ucol_setOffset does nothing
   4681         setColEIterOffset(coleiter, textoffset);
   4682 
   4683         for (;;) {
   4684             // finding the first pattern ce match, imagine composite
   4685             // characters. for example: search for pattern \u0300 in text
   4686             // \u00C0, we'll have to skip A first before we get to
   4687             // \u0300 the grave accent
   4688             targetce = ucol_next(coleiter, status);
   4689             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4690                 found = FALSE;
   4691                 break;
   4692             }
   4693             targetce = getCE(strsrch, targetce);
   4694             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4695                 firstce = targetce;
   4696             }
   4697             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4698                 continue;
   4699             }
   4700             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4701             if (targetce == patternce[0]) {
   4702                 found = TRUE;
   4703                 break;
   4704             }
   4705             if (!hasExpansion(coleiter)) {
   4706                 // checking for accents in composite character
   4707                 found = FALSE;
   4708                 break;
   4709             }
   4710         }
   4711 
   4712         //targetce = firstce;
   4713 
   4714         while (found && (patternceindex < patterncelength)) {
   4715             firstce = targetce;
   4716             targetce    = ucol_next(coleiter, status);
   4717             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4718                 found = FALSE;
   4719                 break;
   4720             }
   4721             targetce    = getCE(strsrch, targetce);
   4722             if (targetce == UCOL_IGNORABLE) {
   4723                 continue;
   4724             }
   4725 
   4726             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4727             found = found && targetce == patternce[patternceindex];
   4728             patternceindex ++;
   4729         }
   4730 
   4731         targetce = firstce;
   4732 
   4733         if (!found) {
   4734             if (U_FAILURE(*status)) {
   4735                 break;
   4736             }
   4737 
   4738             textoffset = reverseShift(strsrch, textoffset, targetce,
   4739                                       patternceindex);
   4740             patternceindex = 0;
   4741             continue;
   4742         }
   4743 
   4744         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4745             setColEIterOffset(coleiter, textoffset);
   4746             return TRUE;
   4747         }
   4748     }
   4749     setMatchNotFound(strsrch);
   4750     return FALSE;
   4751 #else
   4752     int32_t textOffset;
   4753 
   4754     if (strsrch->search->isOverlap) {
   4755         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4756             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4757         } else {
   4758             // move the start position at the end of possible match
   4759             initializePatternPCETable(strsrch, status);
   4760             if (!initTextProcessedIter(strsrch, status)) {
   4761                 setMatchNotFound(strsrch);
   4762                 return FALSE;
   4763             }
   4764             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
   4765                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4766                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4767                     // at the end of the text
   4768                     break;
   4769                 }
   4770             }
   4771             if (U_FAILURE(*status)) {
   4772                 setMatchNotFound(strsrch);
   4773                 return FALSE;
   4774             }
   4775             textOffset = ucol_getOffset(strsrch->textIter);
   4776         }
   4777     } else {
   4778         textOffset = ucol_getOffset(strsrch->textIter);
   4779     }
   4780 
   4781     int32_t start = -1;
   4782     int32_t end = -1;
   4783 
   4784     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4785         strsrch->search->matchedIndex = start;
   4786         strsrch->search->matchedLength = end - start;
   4787         return TRUE;
   4788     } else {
   4789         setMatchNotFound(strsrch);
   4790         return FALSE;
   4791     }
   4792 #endif
   4793 }
   4794 
   4795 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4796                                       UErrorCode    *status)
   4797 {
   4798     if (U_FAILURE(*status)) {
   4799         setMatchNotFound(strsrch);
   4800         return FALSE;
   4801     }
   4802 
   4803 #if BOYER_MOORE
   4804     UCollationElements *coleiter        = strsrch->textIter;
   4805     int32_t            *patternce       = strsrch->pattern.ces;
   4806     int32_t             patterncelength = strsrch->pattern.cesLength;
   4807     int32_t             textoffset      = ucol_getOffset(coleiter);
   4808     UBool               hasPatternAccents =
   4809        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4810 
   4811     // shifting it check for setting offset
   4812     // if setOffset is called previously or there was no previous match, we
   4813     // leave the offset as it is.
   4814     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4815         textoffset = strsrch->search->matchedIndex;
   4816     }
   4817 
   4818     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4819                               patterncelength);
   4820     strsrch->canonicalPrefixAccents[0] = 0;
   4821     strsrch->canonicalSuffixAccents[0] = 0;
   4822 
   4823     while (textoffset >= 0)
   4824     {
   4825         int32_t     patternceindex = 1;
   4826         int32_t     targetce;
   4827         UBool       found          = FALSE;
   4828         int32_t     firstce        = UCOL_NULLORDER;
   4829 
   4830         setColEIterOffset(coleiter, textoffset);
   4831         for (;;) {
   4832             // finding the first pattern ce match, imagine composite
   4833             // characters. for example: search for pattern \u0300 in text
   4834             // \u00C0, we'll have to skip A first before we get to
   4835             // \u0300 the grave accent
   4836             targetce = ucol_next(coleiter, status);
   4837             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4838                 found = FALSE;
   4839                 break;
   4840             }
   4841             targetce = getCE(strsrch, targetce);
   4842             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4843                 firstce = targetce;
   4844             }
   4845 
   4846             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4847             if (targetce == patternce[0]) {
   4848                 // the first ce can be a contraction
   4849                 found = TRUE;
   4850                 break;
   4851             }
   4852             if (!hasExpansion(coleiter)) {
   4853                 // checking for accents in composite character
   4854                 found = FALSE;
   4855                 break;
   4856             }
   4857         }
   4858 
   4859         targetce = firstce;
   4860 
   4861         while (found && patternceindex < patterncelength) {
   4862             targetce    = ucol_next(coleiter, status);
   4863             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4864                 found = FALSE;
   4865                 break;
   4866             }
   4867             targetce = getCE(strsrch, targetce);
   4868             if (targetce == UCOL_IGNORABLE) {
   4869                 continue;
   4870             }
   4871 
   4872             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4873             found = found && targetce == patternce[patternceindex];
   4874             patternceindex ++;
   4875         }
   4876 
   4877         // initializing the rearranged accent array
   4878         if (hasPatternAccents && !found) {
   4879             strsrch->canonicalPrefixAccents[0] = 0;
   4880             strsrch->canonicalSuffixAccents[0] = 0;
   4881             if (U_FAILURE(*status)) {
   4882                 break;
   4883             }
   4884             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4885         }
   4886 
   4887         if (!found) {
   4888             if (U_FAILURE(*status)) {
   4889                 break;
   4890             }
   4891             textoffset = reverseShift(strsrch, textoffset, targetce,
   4892                                       patternceindex);
   4893             patternceindex = 0;
   4894             continue;
   4895         }
   4896 
   4897         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4898             setColEIterOffset(coleiter, textoffset);
   4899             return TRUE;
   4900         }
   4901     }
   4902     setMatchNotFound(strsrch);
   4903     return FALSE;
   4904 #else
   4905     int32_t textOffset;
   4906 
   4907     if (strsrch->search->isOverlap) {
   4908         if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4909             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
   4910         } else {
   4911             // move the start position at the end of possible match
   4912             initializePatternPCETable(strsrch, status);
   4913             if (!initTextProcessedIter(strsrch, status)) {
   4914                 setMatchNotFound(strsrch);
   4915                 return FALSE;
   4916             }
   4917             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
   4918                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
   4919                 if (pce == UCOL_PROCESSED_NULLORDER) {
   4920                     // at the end of the text
   4921                     break;
   4922                 }
   4923             }
   4924             if (U_FAILURE(*status)) {
   4925                 setMatchNotFound(strsrch);
   4926                 return FALSE;
   4927             }
   4928             textOffset = ucol_getOffset(strsrch->textIter);
   4929         }
   4930     } else {
   4931         textOffset = ucol_getOffset(strsrch->textIter);
   4932     }
   4933 
   4934     int32_t start = -1;
   4935     int32_t end = -1;
   4936 
   4937     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4938         strsrch->search->matchedIndex = start;
   4939         strsrch->search->matchedLength = end - start;
   4940         return TRUE;
   4941     } else {
   4942         setMatchNotFound(strsrch);
   4943         return FALSE;
   4944     }
   4945 #endif
   4946 }
   4947 
   4948 #endif /* #if !UCONFIG_NO_COLLATION */
   4949