Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2009 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  07/02/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/usearch.h"
     15 #include "unicode/ustring.h"
     16 #include "unicode/uchar.h"
     17 #include "unormimp.h"
     18 #include "ucol_imp.h"
     19 #include "usrchimp.h"
     20 #include "cmemory.h"
     21 #include "ucln_in.h"
     22 #include "uassert.h"
     23 
     24 U_NAMESPACE_USE
     25 
     26 // don't use Boyer-Moore
     27 #define BOYER_MOORE 0
     28 
     29 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     30 
     31 // internal definition ---------------------------------------------------
     32 
     33 #define LAST_BYTE_MASK_          0xFF
     34 #define SECOND_LAST_BYTE_SHIFT_  8
     35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     36 
     37 static const uint16_t *fcdTrieIndex = NULL;
     38 static UChar32 fcdHighStart = 0;
     39 
     40 // internal methods -------------------------------------------------
     41 
     42 /**
     43 * Fast collation element iterator setOffset.
     44 * This function does not check for bounds.
     45 * @param coleiter collation element iterator
     46 * @param offset to set
     47 */
     48 static
     49 inline void setColEIterOffset(UCollationElements *elems,
     50                       int32_t             offset)
     51 {
     52     collIterate *ci = &(elems->iteratordata_);
     53     ci->pos         = ci->string + offset;
     54     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
     55     if (ci->flags & UCOL_ITER_INNORMBUF) {
     56         ci->flags = ci->origFlags;
     57     }
     58     ci->fcdPosition = NULL;
     59 
     60 	ci->offsetReturn = NULL;
     61     ci->offsetStore  = ci->offsetBuffer;
     62 	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
     63 }
     64 
     65 /**
     66 * Getting the mask for collation strength
     67 * @param strength collation strength
     68 * @return collation element mask
     69 */
     70 static
     71 inline uint32_t getMask(UCollationStrength strength)
     72 {
     73     switch (strength)
     74     {
     75     case UCOL_PRIMARY:
     76         return UCOL_PRIMARYORDERMASK;
     77     case UCOL_SECONDARY:
     78         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     79     default:
     80         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     81                UCOL_PRIMARYORDERMASK;
     82     }
     83 }
     84 
     85 /**
     86 * This is to squeeze the 21bit ces into a 256 table
     87 * @param ce collation element
     88 * @return collapsed version of the collation element
     89 */
     90 static
     91 inline int hash(uint32_t ce)
     92 {
     93     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
     94     // well with the new collation where most of the latin 1 characters
     95     // are of the value xx000xxx. their hashes will most of the time be 0
     96     // to be discussed on the hash algo.
     97     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
     98 }
     99 
    100 U_CDECL_BEGIN
    101 static UBool U_CALLCONV
    102 usearch_cleanup(void) {
    103     fcdTrieIndex = NULL;
    104     return TRUE;
    105 }
    106 U_CDECL_END
    107 
    108 /**
    109 * Initializing the fcd tables.
    110 * Internal method, status assumed to be a success.
    111 * @param status output error if any, caller to check status before calling
    112 *               method, status assumed to be success when passed in.
    113 */
    114 static
    115 inline void initializeFCD(UErrorCode *status)
    116 {
    117     if (fcdTrieIndex == NULL) {
    118         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
    119         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    120     }
    121 }
    122 
    123 /**
    124 * Gets the fcd value for a character at the argument index.
    125 * This method takes into accounts of the supplementary characters.
    126 * @param str UTF16 string where character for fcd retrieval resides
    127 * @param offset position of the character whose fcd is to be retrieved, to be
    128 *               overwritten with the next character position, taking
    129 *               surrogate characters into consideration.
    130 * @param strlength length of the argument string
    131 * @return fcd value
    132 */
    133 static
    134 uint16_t getFCD(const UChar   *str, int32_t *offset,
    135                              int32_t  strlength)
    136 {
    137     const UChar *temp = str + *offset;
    138     uint16_t    result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);
    139     *offset = (int32_t)(temp - str);
    140     return result;
    141 }
    142 
    143 /**
    144 * Getting the modified collation elements taking into account the collation
    145 * attributes
    146 * @param strsrch string search data
    147 * @param sourcece
    148 * @return the modified collation element
    149 */
    150 static
    151 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    152 {
    153     // note for tertiary we can't use the collator->tertiaryMask, that
    154     // is a preprocessed mask that takes into account case options. since
    155     // we are only concerned with exact matches, we don't need that.
    156     sourcece &= strsrch->ceMask;
    157 
    158     if (strsrch->toShift) {
    159         // alternate handling here, since only the 16 most significant digits
    160         // is only used, we can safely do a compare without masking
    161         // if the ce is a variable, we mask and get only the primary values
    162         // no shifting to quartenary is required since all primary values
    163         // less than variabletop will need to be masked off anyway.
    164         if (strsrch->variableTop > sourcece) {
    165             if (strsrch->strength >= UCOL_QUATERNARY) {
    166                 sourcece &= UCOL_PRIMARYORDERMASK;
    167             }
    168             else {
    169                 sourcece = UCOL_IGNORABLE;
    170             }
    171         }
    172     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    173         sourcece = 0xFFFF;
    174     }
    175 
    176     return sourcece;
    177 }
    178 
    179 /**
    180 * Allocate a memory and returns NULL if it failed.
    181 * Internal method, status assumed to be a success.
    182 * @param size to allocate
    183 * @param status output error if any, caller to check status before calling
    184 *               method, status assumed to be success when passed in.
    185 * @return newly allocated array, NULL otherwise
    186 */
    187 static
    188 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    189 {
    190     uint32_t *result = (uint32_t *)uprv_malloc(size);
    191     if (result == NULL) {
    192         *status = U_MEMORY_ALLOCATION_ERROR;
    193     }
    194     return result;
    195 }
    196 
    197 /**
    198 * Adds a uint32_t value to a destination array.
    199 * Creates a new array if we run out of space. The caller will have to
    200 * manually deallocate the newly allocated array.
    201 * Internal method, status assumed to be success, caller has to check status
    202 * before calling this method. destination not to be NULL and has at least
    203 * size destinationlength.
    204 * @param destination target array
    205 * @param offset destination offset to add value
    206 * @param destinationlength target array size, return value for the new size
    207 * @param value to be added
    208 * @param increments incremental size expected
    209 * @param status output error if any, caller to check status before calling
    210 *               method, status assumed to be success when passed in.
    211 * @return new destination array, destination if there was no new allocation
    212 */
    213 static
    214 inline int32_t * addTouint32_tArray(int32_t    *destination,
    215                                     uint32_t    offset,
    216                                     uint32_t   *destinationlength,
    217                                     uint32_t    value,
    218                                     uint32_t    increments,
    219                                     UErrorCode *status)
    220 {
    221     uint32_t newlength = *destinationlength;
    222     if (offset + 1 == newlength) {
    223         newlength += increments;
    224         int32_t *temp = (int32_t *)allocateMemory(
    225                                          sizeof(int32_t) * newlength, status);
    226         if (U_FAILURE(*status)) {
    227             return NULL;
    228         }
    229         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
    230         *destinationlength = newlength;
    231         destination        = temp;
    232     }
    233     destination[offset] = value;
    234     return destination;
    235 }
    236 
    237 /**
    238 * Adds a uint64_t value to a destination array.
    239 * Creates a new array if we run out of space. The caller will have to
    240 * manually deallocate the newly allocated array.
    241 * Internal method, status assumed to be success, caller has to check status
    242 * before calling this method. destination not to be NULL and has at least
    243 * size destinationlength.
    244 * @param destination target array
    245 * @param offset destination offset to add value
    246 * @param destinationlength target array size, return value for the new size
    247 * @param value to be added
    248 * @param increments incremental size expected
    249 * @param status output error if any, caller to check status before calling
    250 *               method, status assumed to be success when passed in.
    251 * @return new destination array, destination if there was no new allocation
    252 */
    253 static
    254 inline int64_t * addTouint64_tArray(int64_t    *destination,
    255                                     uint32_t    offset,
    256                                     uint32_t   *destinationlength,
    257                                     uint64_t    value,
    258                                     uint32_t    increments,
    259                                     UErrorCode *status)
    260 {
    261     uint32_t newlength = *destinationlength;
    262     if (offset + 1 == newlength) {
    263         newlength += increments;
    264         int64_t *temp = (int64_t *)allocateMemory(
    265                                          sizeof(int64_t) * newlength, status);
    266 
    267         if (U_FAILURE(*status)) {
    268             return NULL;
    269         }
    270 
    271         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
    272         *destinationlength = newlength;
    273         destination        = temp;
    274     }
    275 
    276     destination[offset] = value;
    277 
    278     return destination;
    279 }
    280 
    281 /**
    282 * Initializing the ce table for a pattern.
    283 * Stores non-ignorable collation keys.
    284 * Table size will be estimated by the size of the pattern text. Table
    285 * expansion will be perform as we go along. Adding 1 to ensure that the table
    286 * size definitely increases.
    287 * Internal method, status assumed to be a success.
    288 * @param strsrch string search data
    289 * @param status output error if any, caller to check status before calling
    290 *               method, status assumed to be success when passed in.
    291 * @return total number of expansions
    292 */
    293 static
    294 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    295                                          UErrorCode    *status)
    296 {
    297     UPattern *pattern            = &(strsrch->pattern);
    298     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    299     int32_t  *cetable            = pattern->CEBuffer;
    300     uint32_t  patternlength      = pattern->textLength;
    301     UCollationElements *coleiter = strsrch->utilIter;
    302 
    303     if (coleiter == NULL) {
    304         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    305                                      patternlength, status);
    306         // status will be checked in ucol_next(..) later and if it is an
    307         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    308         // returned.
    309         strsrch->utilIter = coleiter;
    310     }
    311     else {
    312         uprv_init_collIterate(strsrch->collator, pattern->text,
    313                          pattern->textLength,
    314                          &coleiter->iteratordata_);
    315     }
    316 
    317     if (pattern->CE != cetable && pattern->CE) {
    318         uprv_free(pattern->CE);
    319     }
    320 
    321     uint16_t  offset      = 0;
    322     uint16_t  result      = 0;
    323     int32_t   ce;
    324 
    325     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    326            U_SUCCESS(*status)) {
    327         uint32_t newce = getCE(strsrch, ce);
    328         if (newce) {
    329             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    330                                   newce,
    331                                   patternlength - ucol_getOffset(coleiter) + 1,
    332                                   status);
    333             if (U_FAILURE(*status)) {
    334                 return 0;
    335             }
    336             offset ++;
    337             if (cetable != temp && cetable != pattern->CEBuffer) {
    338                 uprv_free(cetable);
    339             }
    340             cetable = temp;
    341         }
    342         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    343     }
    344 
    345     cetable[offset]   = 0;
    346     pattern->CE       = cetable;
    347     pattern->CELength = offset;
    348 
    349     return result;
    350 }
    351 
    352 /**
    353 * Initializing the pce table for a pattern.
    354 * Stores non-ignorable collation keys.
    355 * Table size will be estimated by the size of the pattern text. Table
    356 * expansion will be perform as we go along. Adding 1 to ensure that the table
    357 * size definitely increases.
    358 * Internal method, status assumed to be a success.
    359 * @param strsrch string search data
    360 * @param status output error if any, caller to check status before calling
    361 *               method, status assumed to be success when passed in.
    362 * @return total number of expansions
    363 */
    364 static
    365 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    366                                           UErrorCode    *status)
    367 {
    368     UPattern *pattern            = &(strsrch->pattern);
    369     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    370     int64_t  *pcetable           = pattern->PCEBuffer;
    371     uint32_t  patternlength      = pattern->textLength;
    372     UCollationElements *coleiter = strsrch->utilIter;
    373 
    374     if (coleiter == NULL) {
    375         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    376                                      patternlength, status);
    377         // status will be checked in ucol_next(..) later and if it is an
    378         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    379         // returned.
    380         strsrch->utilIter = coleiter;
    381     } else {
    382         uprv_init_collIterate(strsrch->collator, pattern->text,
    383                               pattern->textLength,
    384                               &coleiter->iteratordata_);
    385     }
    386 
    387     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
    388         uprv_free(pattern->PCE);
    389     }
    390 
    391     uint16_t  offset = 0;
    392     uint16_t  result = 0;
    393     int64_t   pce;
    394 
    395     uprv_init_pce(coleiter);
    396 
    397     // ** Should processed CEs be signed or unsigned?
    398     // ** (the rest of the code in this file seems to play fast-and-loose with
    399     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    400     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    401            U_SUCCESS(*status)) {
    402         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    403                               pce,
    404                               patternlength - ucol_getOffset(coleiter) + 1,
    405                               status);
    406 
    407         if (U_FAILURE(*status)) {
    408             return 0;
    409         }
    410 
    411         offset += 1;
    412 
    413         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
    414             uprv_free(pcetable);
    415         }
    416 
    417         pcetable = temp;
    418         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    419     }
    420 
    421     pcetable[offset]   = 0;
    422     pattern->PCE       = pcetable;
    423     pattern->PCELength = offset;
    424 
    425     return result;
    426 }
    427 
    428 /**
    429 * Initializes the pattern struct.
    430 * Internal method, status assumed to be success.
    431 * @param strsrch UStringSearch data storage
    432 * @param status output error if any, caller to check status before calling
    433 *               method, status assumed to be success when passed in.
    434 * @return expansionsize the total expansion size of the pattern
    435 */
    436 static
    437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    438 {
    439           UPattern   *pattern     = &(strsrch->pattern);
    440     const UChar      *patterntext = pattern->text;
    441           int32_t     length      = pattern->textLength;
    442           int32_t index       = 0;
    443 
    444     // Since the strength is primary, accents are ignored in the pattern.
    445     if (strsrch->strength == UCOL_PRIMARY) {
    446     	pattern->hasPrefixAccents = 0;
    447     	pattern->hasSuffixAccents = 0;
    448     } else {
    449 	    pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    450 	                                                     SECOND_LAST_BYTE_SHIFT_;
    451 	    index = length;
    452 	    UTF_BACK_1(patterntext, 0, index);
    453 	    pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    454 	                                                             LAST_BYTE_MASK_;
    455     }
    456 
    457     // ** HACK **
    458     if (strsrch->pattern.PCE != NULL) {
    459         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
    460             uprv_free(strsrch->pattern.PCE);
    461         }
    462 
    463         strsrch->pattern.PCE = NULL;
    464     }
    465 
    466     // since intializePattern is an internal method status is a success.
    467     return initializePatternCETable(strsrch, status);
    468 }
    469 
    470 /**
    471 * Initializing shift tables, with the default values.
    472 * If a corresponding default value is 0, the shift table is not set.
    473 * @param shift table for forwards shift
    474 * @param backshift table for backwards shift
    475 * @param cetable table containing pattern ce
    476 * @param cesize size of the pattern ces
    477 * @param expansionsize total size of the expansions
    478 * @param defaultforward the default forward value
    479 * @param defaultbackward the default backward value
    480 */
    481 static
    482 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    483                           int32_t  *cetable, int32_t cesize,
    484                           int16_t   expansionsize,
    485                           int16_t   defaultforward,
    486                           int16_t   defaultbackward)
    487 {
    488     // estimate the value to shift. to do that we estimate the smallest
    489     // number of characters to give the relevant ces, ie approximately
    490     // the number of ces minus their expansion, since expansions can come
    491     // from a character.
    492     int32_t count;
    493     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    494         shift[count] = defaultforward;
    495     }
    496     cesize --; // down to the last index
    497     for (count = 0; count < cesize; count ++) {
    498         // number of ces from right of array to the count
    499         int temp = defaultforward - count - 1;
    500         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
    501     }
    502     shift[hash(cetable[cesize])] = 1;
    503     // for ignorables we just shift by one. see test examples.
    504     shift[hash(0)] = 1;
    505 
    506     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    507         backshift[count] = defaultbackward;
    508     }
    509     for (count = cesize; count > 0; count --) {
    510         // the original value count does not seem to work
    511         backshift[hash(cetable[count])] = count > expansionsize ?
    512                                           (int16_t)(count - expansionsize) : 1;
    513     }
    514     backshift[hash(cetable[0])] = 1;
    515     backshift[hash(0)] = 1;
    516 }
    517 
    518 /**
    519 * Building of the pattern collation element list and the boyer moore strsrch
    520 * table.
    521 * The canonical match will only be performed after the default match fails.
    522 * For both cases we need to remember the size of the composed and decomposed
    523 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    524 * a number of characters in the text and tries to match the pattern from that
    525 * offset, the shift value can not be too large in case we miss some
    526 * characters. To choose a right shift size, we estimate the NFC form of the
    527 * and use its size as a shift guide. The NFC form should be the small
    528 * possible representation of the pattern. Anyways, we'll err on the smaller
    529 * shift size. Hence the calculation for minlength.
    530 * Canonical match will be performed slightly differently. We'll split the
    531 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    532 * the first and last base character (MS), the ending accents (EA). Matches
    533 * will be done on MS first, and only when we match MS then some processing
    534 * will be required for the prefix and end accents in order to determine if
    535 * they match PA and EA. Hence the default shift values
    536 * for the canonical match will take the size of either end's accent into
    537 * consideration. Forwards search will take the end accents into consideration
    538 * for the default shift values and the backwards search will take the prefix
    539 * accents into consideration.
    540 * If pattern has no non-ignorable ce, we return a illegal argument error.
    541 * Internal method, status assumed to be success.
    542 * @param strsrch UStringSearch data storage
    543 * @param status  for output errors if it occurs, status is assumed to be a
    544 *                success when it is passed in.
    545 */
    546 static
    547 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    548 {
    549     int16_t expandlength  = initializePattern(strsrch, status);
    550     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
    551         UPattern *pattern = &strsrch->pattern;
    552         int32_t   cesize  = pattern->CELength;
    553 
    554         int16_t minlength = cesize > expandlength
    555                             ? (int16_t)cesize - expandlength : 1;
    556         pattern->defaultShiftSize    = minlength;
    557         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
    558                       cesize, expandlength, minlength, minlength);
    559         return;
    560     }
    561     strsrch->pattern.defaultShiftSize = 0;
    562 }
    563 
    564 #if BOYER_MOORE
    565 /**
    566 * Check to make sure that the match length is at the end of the character by
    567 * using the breakiterator.
    568 * @param strsrch string search data
    569 * @param start target text start offset
    570 * @param end target text end offset
    571 */
    572 static
    573 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    574                                int32_t *end)
    575 {
    576 #if !UCONFIG_NO_BREAK_ITERATION
    577     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    578     if (breakiterator) {
    579 	    int32_t matchend = *end;
    580 	    //int32_t matchstart = *start;
    581 
    582 	    if (!ubrk_isBoundary(breakiterator, matchend)) {
    583 	    	*end = ubrk_following(breakiterator, matchend);
    584         }
    585 
    586 	    /* Check the start of the matched text to make sure it doesn't have any accents
    587 	     * before it.  This code may not be necessary and so it is commented out */
    588 	    /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    589 	    	*start = ubrk_preceding(breakiterator, matchstart);
    590 	    }*/
    591     }
    592 #endif
    593 }
    594 
    595 /**
    596 * Determine whether the target text in UStringSearch bounded by the offset
    597 * start and end is one or more whole units of text as
    598 * determined by the breakiterator in UStringSearch.
    599 * @param strsrch string search data
    600 * @param start target text start offset
    601 * @param end target text end offset
    602 */
    603 static
    604 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    605                                int32_t    end)
    606 {
    607 #if !UCONFIG_NO_BREAK_ITERATION
    608     UBreakIterator *breakiterator = strsrch->search->breakIter;
    609     //TODO: Add here.
    610     if (breakiterator) {
    611         int32_t startindex = ubrk_first(breakiterator);
    612         int32_t endindex   = ubrk_last(breakiterator);
    613 
    614         // out-of-range indexes are never boundary positions
    615         if (start < startindex || start > endindex ||
    616             end < startindex || end > endindex) {
    617             return FALSE;
    618         }
    619         // otherwise, we can use following() on the position before the
    620         // specified one and return true of the position we get back is the
    621         // one the user specified
    622         UBool result = (start == startindex ||
    623                 ubrk_following(breakiterator, start - 1) == start) &&
    624                (end == endindex ||
    625                 ubrk_following(breakiterator, end - 1) == end);
    626         if (result) {
    627             // iterates the individual ces
    628                   UCollationElements *coleiter  = strsrch->utilIter;
    629             const UChar              *text      = strsrch->search->text +
    630                                                                       start;
    631                   UErrorCode          status    = U_ZERO_ERROR;
    632             ucol_setText(coleiter, text, end - start, &status);
    633             for (int32_t count = 0; count < strsrch->pattern.CELength;
    634                  count ++) {
    635                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    636                 if (ce == UCOL_IGNORABLE) {
    637                     count --;
    638                     continue;
    639                 }
    640                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
    641                     return FALSE;
    642                 }
    643             }
    644             int32_t nextce = ucol_next(coleiter, &status);
    645             while (ucol_getOffset(coleiter) == (end - start)
    646                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    647                 nextce = ucol_next(coleiter, &status);
    648             }
    649             if (ucol_getOffset(coleiter) == (end - start)
    650                 && nextce != UCOL_NULLORDER) {
    651                 // extra collation elements at the end of the match
    652                 return FALSE;
    653             }
    654         }
    655         return result;
    656     }
    657 #endif
    658     return TRUE;
    659 }
    660 
    661 /**
    662 * Getting the next base character offset if current offset is an accent,
    663 * or the current offset if the current character contains a base character.
    664 * accents the following base character will be returned
    665 * @param text string
    666 * @param textoffset current offset
    667 * @param textlength length of text string
    668 * @return the next base character or the current offset
    669 *         if the current character is contains a base character.
    670 */
    671 static
    672 inline int32_t getNextBaseOffset(const UChar       *text,
    673                                            int32_t  textoffset,
    674                                            int32_t      textlength)
    675 {
    676     if (textoffset < textlength) {
    677         int32_t temp = textoffset;
    678         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    679             while (temp < textlength) {
    680                 int32_t result = temp;
    681                 if ((getFCD(text, &temp, textlength) >>
    682                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    683                     return result;
    684                 }
    685             }
    686             return textlength;
    687         }
    688     }
    689     return textoffset;
    690 }
    691 
    692 /**
    693 * Gets the next base character offset depending on the string search pattern
    694 * data
    695 * @param strsrch string search data
    696 * @param textoffset current offset, one offset away from the last character
    697 *                   to search for.
    698 * @return start index of the next base character or the current offset
    699 *         if the current character is contains a base character.
    700 */
    701 static
    702 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    703                                                   int32_t    textoffset)
    704 {
    705     int32_t textlength = strsrch->search->textLength;
    706     if (strsrch->pattern.hasSuffixAccents &&
    707         textoffset < textlength) {
    708               int32_t  temp       = textoffset;
    709         const UChar       *text       = strsrch->search->text;
    710         UTF_BACK_1(text, 0, temp);
    711         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    712             return getNextBaseOffset(text, textoffset, textlength);
    713         }
    714     }
    715     return textoffset;
    716 }
    717 
    718 /**
    719 * Shifting the collation element iterator position forward to prepare for
    720 * a following match. If the last character is a unsafe character, we'll only
    721 * shift by 1 to capture contractions, normalization etc.
    722 * Internal method, status assumed to be success.
    723 * @param text strsrch string search data
    724 * @param textoffset start text position to do search
    725 * @param ce the text ce which failed the match.
    726 * @param patternceindex index of the ce within the pattern ce buffer which
    727 *        failed the match
    728 * @return final offset
    729 */
    730 static
    731 inline int32_t shiftForward(UStringSearch *strsrch,
    732                                 int32_t    textoffset,
    733                                 int32_t       ce,
    734                                 int32_t        patternceindex)
    735 {
    736     UPattern *pattern = &(strsrch->pattern);
    737     if (ce != UCOL_NULLORDER) {
    738         int32_t shift = pattern->shift[hash(ce)];
    739         // this is to adjust for characters in the middle of the
    740         // substring for matching that failed.
    741         int32_t adjust = pattern->CELength - patternceindex;
    742         if (adjust > 1 && shift >= adjust) {
    743             shift -= adjust - 1;
    744         }
    745         textoffset += shift;
    746     }
    747     else {
    748         textoffset += pattern->defaultShiftSize;
    749     }
    750 
    751     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    752     // check for unsafe characters
    753     // * if it is the start or middle of a contraction: to be done after
    754     //   a initial match is found
    755     // * thai or lao base consonant character: similar to contraction
    756     // * high surrogate character: similar to contraction
    757     // * next character is a accent: shift to the next base character
    758     return textoffset;
    759 }
    760 #endif // #if BOYER_MOORE
    761 
    762 /**
    763 * sets match not found
    764 * @param strsrch string search data
    765 */
    766 static
    767 inline void setMatchNotFound(UStringSearch *strsrch)
    768 {
    769     // this method resets the match result regardless of the error status.
    770     strsrch->search->matchedIndex = USEARCH_DONE;
    771     strsrch->search->matchedLength = 0;
    772     if (strsrch->search->isForwardSearching) {
    773         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    774     }
    775     else {
    776         setColEIterOffset(strsrch->textIter, 0);
    777     }
    778 }
    779 
    780 #if BOYER_MOORE
    781 /**
    782 * Gets the offset to the next safe point in text.
    783 * ie. not the middle of a contraction, swappable characters or supplementary
    784 * characters.
    785 * @param collator collation sata
    786 * @param text string to work with
    787 * @param textoffset offset in string
    788 * @param textlength length of text string
    789 * @return offset to the next safe character
    790 */
    791 static
    792 inline int32_t getNextSafeOffset(const UCollator   *collator,
    793                                      const UChar       *text,
    794                                            int32_t  textoffset,
    795                                            int32_t      textlength)
    796 {
    797     int32_t result = textoffset; // first contraction character
    798     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    799         result ++;
    800     }
    801     return result;
    802 }
    803 
    804 /**
    805 * This checks for accents in the potential match started with a .
    806 * composite character.
    807 * This is really painful... we have to check that composite character do not
    808 * have any extra accents. We have to normalize the potential match and find
    809 * the immediate decomposed character before the match.
    810 * The first composite character would have been taken care of by the fcd
    811 * checks in checkForwardExactMatch.
    812 * This is the slow path after the fcd of the first character and
    813 * the last character has been checked by checkForwardExactMatch and we
    814 * determine that the potential match has extra non-ignorable preceding
    815 * ces.
    816 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    817 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    818 * Note here that accents checking are slow and cautioned in the API docs.
    819 * Internal method, status assumed to be a success, caller should check status
    820 * before calling this method
    821 * @param strsrch string search data
    822 * @param start index of the potential unfriendly composite character
    823 * @param end index of the potential unfriendly composite character
    824 * @param status output error status if any.
    825 * @return TRUE if there is non-ignorable accents before at the beginning
    826 *              of the match, FALSE otherwise.
    827 */
    828 
    829 static
    830 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    831                                    int32_t    end,
    832                                    UErrorCode    *status)
    833 {
    834     UBool result = FALSE;
    835     if (strsrch->pattern.hasPrefixAccents) {
    836               int32_t  length = end - start;
    837               int32_t  offset = 0;
    838         const UChar       *text   = strsrch->search->text + start;
    839 
    840         UTF_FWD_1(text, offset, length);
    841         // we are only concerned with the first composite character
    842         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    843             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    844                                                        text, 0, length);
    845             if (safeoffset != length) {
    846                 safeoffset ++;
    847             }
    848             UChar   *norm = NULL;
    849             UChar    buffer[INITIAL_ARRAY_SIZE_];
    850             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    851                                             buffer, INITIAL_ARRAY_SIZE_,
    852                                             status);
    853             if (U_FAILURE(*status)) {
    854                 return FALSE;
    855             }
    856             if (size >= INITIAL_ARRAY_SIZE_) {
    857                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    858                                                status);
    859                 // if allocation failed, status will be set to
    860                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    861                 // checks for it.
    862                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    863                                        size, status);
    864                 if (U_FAILURE(*status) && norm != NULL) {
    865                     uprv_free(norm);
    866                     return FALSE;
    867                 }
    868             }
    869             else {
    870                 norm = buffer;
    871             }
    872 
    873             UCollationElements *coleiter  = strsrch->utilIter;
    874             ucol_setText(coleiter, norm, size, status);
    875             uint32_t            firstce   = strsrch->pattern.CE[0];
    876             UBool               ignorable = TRUE;
    877             uint32_t            ce        = UCOL_IGNORABLE;
    878             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    879                 offset = ucol_getOffset(coleiter);
    880                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    881                     ignorable = FALSE;
    882                 }
    883                 ce = ucol_next(coleiter, status);
    884             }
    885             UChar32 codepoint;
    886             UTF_PREV_CHAR(norm, 0, offset, codepoint);
    887             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    888 
    889             if (norm != buffer) {
    890                 uprv_free(norm);
    891             }
    892         }
    893     }
    894 
    895     return result;
    896 }
    897 
    898 /**
    899 * Used by exact matches, checks if there are accents before the match.
    900 * This is really painful... we have to check that composite characters at
    901 * the start of the matches have to not have any extra accents.
    902 * We check the FCD of the character first, if it starts with an accent and
    903 * the first pattern ce does not match the first ce of the character, we bail.
    904 * Otherwise we try normalizing the first composite
    905 * character and find the immediate decomposed character before the match to
    906 * see if it is an non-ignorable accent.
    907 * Now normalizing the first composite character is enough because we ensure
    908 * that when the match is passed in here with extra beginning ces, the
    909 * first or last ce that match has to occur within the first character.
    910 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    911 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    912 * Note here that accents checking are slow and cautioned in the API docs.
    913 * @param strsrch string search data
    914 * @param start offset
    915 * @param end offset
    916 * @return TRUE if there are accents on either side of the match,
    917 *         FALSE otherwise
    918 */
    919 static
    920 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    921                                   int32_t    end)
    922 {
    923     if (strsrch->pattern.hasPrefixAccents) {
    924         UCollationElements *coleiter  = strsrch->textIter;
    925         UErrorCode          status    = U_ZERO_ERROR;
    926         // we have been iterating forwards previously
    927         uint32_t            ignorable = TRUE;
    928         int32_t             firstce   = strsrch->pattern.CE[0];
    929 
    930         setColEIterOffset(coleiter, start);
    931         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    932         if (U_FAILURE(status)) {
    933             return TRUE;
    934         }
    935         while (ce != firstce) {
    936             if (ce != UCOL_IGNORABLE) {
    937                 ignorable = FALSE;
    938             }
    939             ce = getCE(strsrch, ucol_next(coleiter, &status));
    940             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    941                 return TRUE;
    942             }
    943         }
    944         if (!ignorable && inNormBuf(coleiter)) {
    945             // within normalization buffer, discontiguous handled here
    946             return TRUE;
    947         }
    948 
    949         // within text
    950         int32_t temp = start;
    951         // original code
    952         // accent = (getFCD(strsrch->search->text, &temp,
    953         //                  strsrch->search->textLength)
    954         //            >> SECOND_LAST_BYTE_SHIFT_);
    955         // however this code does not work well with VC7 .net in release mode.
    956         // maybe the inlines for getFCD combined with shifting has bugs in
    957         // VC7. anyways this is a work around.
    958         UBool accent = getFCD(strsrch->search->text, &temp,
    959                               strsrch->search->textLength) > 0xFF;
    960         if (!accent) {
    961             return checkExtraMatchAccents(strsrch, start, end, &status);
    962         }
    963         if (!ignorable) {
    964             return TRUE;
    965         }
    966         if (start > 0) {
    967             temp = start;
    968             UTF_BACK_1(strsrch->search->text, 0, temp);
    969             if (getFCD(strsrch->search->text, &temp,
    970                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    971                 setColEIterOffset(coleiter, start);
    972                 ce = ucol_previous(coleiter, &status);
    973                 if (U_FAILURE(status) ||
    974                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    975                     return TRUE;
    976                 }
    977             }
    978         }
    979     }
    980 
    981     return FALSE;
    982 }
    983 
    984 /**
    985 * Used by exact matches, checks if there are accents bounding the match.
    986 * Note this is the initial boundary check. If the potential match
    987 * starts or ends with composite characters, the accents in those
    988 * characters will be determined later.
    989 * Not doing backwards iteration here, since discontiguos contraction for
    990 * backwards collation element iterator, use up too many characters.
    991 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
    992 * should fail since there is a acute at the end of \u01FA
    993 * Note here that accents checking are slow and cautioned in the API docs.
    994 * @param strsrch string search data
    995 * @param start offset of match
    996 * @param end end offset of the match
    997 * @return TRUE if there are accents on either side of the match,
    998 *         FALSE otherwise
    999 */
   1000 static
   1001 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
   1002                                  int32_t    end)
   1003 {
   1004     if (strsrch->pattern.hasSuffixAccents) {
   1005         const UChar       *text       = strsrch->search->text;
   1006               int32_t  temp       = end;
   1007               int32_t      textlength = strsrch->search->textLength;
   1008         UTF_BACK_1(text, 0, temp);
   1009         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1010             int32_t             firstce  = strsrch->pattern.CE[0];
   1011             UCollationElements *coleiter = strsrch->textIter;
   1012             UErrorCode          status   = U_ZERO_ERROR;
   1013 			int32_t ce;
   1014             setColEIterOffset(coleiter, start);
   1015             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1016                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1017                     return TRUE;
   1018                 }
   1019             }
   1020             int32_t count = 1;
   1021             while (count < strsrch->pattern.CELength) {
   1022                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1023                     == UCOL_IGNORABLE) {
   1024                     // Thai can give an ignorable here.
   1025                     count --;
   1026                 }
   1027                 if (U_FAILURE(status)) {
   1028                     return TRUE;
   1029                 }
   1030                 count ++;
   1031             }
   1032 
   1033 			ce = ucol_next(coleiter, &status);
   1034             if (U_FAILURE(status)) {
   1035                 return TRUE;
   1036             }
   1037             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1038             	ce = getCE(strsrch, ce);
   1039             }
   1040             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1041                 if (ucol_getOffset(coleiter) <= end) {
   1042                     return TRUE;
   1043                 }
   1044                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1045                     return TRUE;
   1046                 }
   1047             }
   1048         }
   1049     }
   1050     return FALSE;
   1051 }
   1052 #endif // #if BOYER_MOORE
   1053 
   1054 /**
   1055 * Checks if the offset runs out of the text string
   1056 * @param offset
   1057 * @param textlength of the text string
   1058 * @return TRUE if offset is out of bounds, FALSE otherwise
   1059 */
   1060 static
   1061 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1062 {
   1063     return offset < 0 || offset > textlength;
   1064 }
   1065 
   1066 /**
   1067 * Checks for identical match
   1068 * @param strsrch string search data
   1069 * @param start offset of possible match
   1070 * @param end offset of possible match
   1071 * @return TRUE if identical match is found
   1072 */
   1073 static
   1074 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1075                                   int32_t    end)
   1076 {
   1077     UChar t2[32], p2[32];
   1078     int32_t length = end - start;
   1079     if (strsrch->strength != UCOL_IDENTICAL) {
   1080         return TRUE;
   1081     }
   1082 
   1083     UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
   1084     int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
   1085                                        strsrch->search->text + start, length,
   1086                                        FALSE, 0, &status);
   1087     // use separate status2 in case of buffer overflow
   1088     if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
   1089                                         strsrch->pattern.text,
   1090                                         strsrch->pattern.textLength,
   1091                                         FALSE, 0, &status2)) {
   1092         return FALSE; // lengths are different
   1093     }
   1094 
   1095     // compare contents
   1096     UChar *text, *pattern;
   1097     if(U_SUCCESS(status)) {
   1098         text = t2;
   1099         pattern = p2;
   1100     } else if(status==U_BUFFER_OVERFLOW_ERROR) {
   1101         status = U_ZERO_ERROR;
   1102         // allocate one buffer for both decompositions
   1103         text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
   1104         // Check for allocation failure.
   1105         if (text == NULL) {
   1106         	return FALSE;
   1107         }
   1108         pattern = text + decomplength;
   1109         unorm_decompose(text, decomplength, strsrch->search->text + start,
   1110                         length, FALSE, 0, &status);
   1111         unorm_decompose(pattern, decomplength, strsrch->pattern.text,
   1112                         strsrch->pattern.textLength, FALSE, 0, &status);
   1113     } else {
   1114         // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
   1115         // and that we don't uprv_free() an undefined text pointer
   1116         text = pattern = t2;
   1117         decomplength = 0;
   1118     }
   1119     UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
   1120     if(text != t2) {
   1121         uprv_free(text);
   1122     }
   1123     // return FALSE if NFD failed
   1124     return U_SUCCESS(status) && result;
   1125 }
   1126 
   1127 #if BOYER_MOORE
   1128 /**
   1129 * Checks to see if the match is repeated
   1130 * @param strsrch string search data
   1131 * @param start new match start index
   1132 * @param end new match end index
   1133 * @return TRUE if the the match is repeated, FALSE otherwise
   1134 */
   1135 static
   1136 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1137                                 int32_t    start,
   1138                                 int32_t    end)
   1139 {
   1140     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1141     UBool       result;
   1142     if (lastmatchindex == USEARCH_DONE) {
   1143         return FALSE;
   1144     }
   1145     if (strsrch->search->isForwardSearching) {
   1146         result = start <= lastmatchindex;
   1147     }
   1148     else {
   1149         result = start >= lastmatchindex;
   1150     }
   1151     if (!result && !strsrch->search->isOverlap) {
   1152         if (strsrch->search->isForwardSearching) {
   1153             result = start < lastmatchindex + strsrch->search->matchedLength;
   1154         }
   1155         else {
   1156             result = end > lastmatchindex;
   1157         }
   1158     }
   1159     return result;
   1160 }
   1161 
   1162 /**
   1163 * Gets the collation element iterator's current offset.
   1164 * @param coleiter collation element iterator
   1165 * @param forwards flag TRUE if we are moving in th forwards direction
   1166 * @return current offset
   1167 */
   1168 static
   1169 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1170                                               UBool               forwards)
   1171 {
   1172     int32_t result = ucol_getOffset(coleiter);
   1173     // intricacies of the the backwards collation element iterator
   1174     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1175         result ++;
   1176     }
   1177     return result;
   1178 }
   1179 
   1180 /**
   1181 * Checks match for contraction.
   1182 * If the match ends with a partial contraction we fail.
   1183 * If the match starts too far off (because of backwards iteration) we try to
   1184 * chip off the extra characters depending on whether a breakiterator has
   1185 * been used.
   1186 * Internal method, error assumed to be success, caller has to check status
   1187 * before calling this method.
   1188 * @param strsrch string search data
   1189 * @param start offset of potential match, to be modified if necessary
   1190 * @param end offset of potential match, to be modified if necessary
   1191 * @param status output error status if any
   1192 * @return TRUE if match passes the contraction test, FALSE otherwise
   1193 */
   1194 
   1195 static
   1196 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1197                                      int32_t   *start,
   1198                                      int32_t   *end, UErrorCode  *status)
   1199 {
   1200           UCollationElements *coleiter   = strsrch->textIter;
   1201           int32_t             textlength = strsrch->search->textLength;
   1202           int32_t             temp       = *start;
   1203     const UCollator          *collator   = strsrch->collator;
   1204     const UChar              *text       = strsrch->search->text;
   1205     // This part checks if either ends of the match contains potential
   1206     // contraction. If so we'll have to iterate through them
   1207     // The start contraction needs to be checked since ucol_previous dumps
   1208     // all characters till the first safe character into the buffer.
   1209     // *start + 1 is used to test for the unsafe characters instead of *start
   1210     // because ucol_prev takes all unsafe characters till the first safe
   1211     // character ie *start. so by testing *start + 1, we can estimate if
   1212     // excess prefix characters has been included in the potential search
   1213     // results.
   1214     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1215         (*start + 1 < textlength
   1216          && ucol_unsafeCP(text[*start + 1], collator))) {
   1217         int32_t expansion  = getExpansionPrefix(coleiter);
   1218         UBool   expandflag = expansion > 0;
   1219         setColEIterOffset(coleiter, *start);
   1220         while (expansion > 0) {
   1221             // getting rid of the redundant ce, caused by setOffset.
   1222             // since backward contraction/expansion may have extra ces if we
   1223             // are in the normalization buffer, hasAccentsBeforeMatch would
   1224             // have taken care of it.
   1225             // E.g. the character \u01FA will have an expansion of 3, but if
   1226             // we are only looking for acute and ring \u030A and \u0301, we'll
   1227             // have to skip the first ce in the expansion buffer.
   1228             ucol_next(coleiter, status);
   1229             if (U_FAILURE(*status)) {
   1230                 return FALSE;
   1231             }
   1232             if (ucol_getOffset(coleiter) != temp) {
   1233                 *start = temp;
   1234                 temp  = ucol_getOffset(coleiter);
   1235             }
   1236             expansion --;
   1237         }
   1238 
   1239         int32_t  *patternce       = strsrch->pattern.CE;
   1240         int32_t   patterncelength = strsrch->pattern.CELength;
   1241         int32_t   count           = 0;
   1242         while (count < patterncelength) {
   1243             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1244             if (ce == UCOL_IGNORABLE) {
   1245                 continue;
   1246             }
   1247             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1248                 *start = temp;
   1249                 temp   = ucol_getOffset(coleiter);
   1250             }
   1251             if (U_FAILURE(*status) || ce != patternce[count]) {
   1252                 (*end) ++;
   1253                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1254                 return FALSE;
   1255             }
   1256             count ++;
   1257         }
   1258     }
   1259     return TRUE;
   1260 }
   1261 
   1262 /**
   1263 * Checks and sets the match information if found.
   1264 * Checks
   1265 * <ul>
   1266 * <li> the potential match does not repeat the previous match
   1267 * <li> boundaries are correct
   1268 * <li> exact matches has no extra accents
   1269 * <li> identical matchesb
   1270 * <li> potential match does not end in the middle of a contraction
   1271 * <\ul>
   1272 * Otherwise the offset will be shifted to the next character.
   1273 * Internal method, status assumed to be success, caller has to check status
   1274 * before calling this method.
   1275 * @param strsrch string search data
   1276 * @param textoffset offset in the collation element text. the returned value
   1277 *        will be the truncated end offset of the match or the new start
   1278 *        search offset.
   1279 * @param status output error status if any
   1280 * @return TRUE if the match is valid, FALSE otherwise
   1281 */
   1282 static
   1283 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1284                                  int32_t   *textoffset, UErrorCode *status)
   1285 {
   1286     UCollationElements *coleiter = strsrch->textIter;
   1287     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1288 
   1289     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1290         return FALSE;
   1291     }
   1292 
   1293     // this totally matches, however we need to check if it is repeating
   1294     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1295         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1296         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1297         !checkIdentical(strsrch, start, *textoffset) ||
   1298         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1299 
   1300         (*textoffset) ++;
   1301         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1302         return FALSE;
   1303     }
   1304 
   1305     //Add breakiterator boundary check for primary strength search.
   1306     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1307     	checkBreakBoundary(strsrch, &start, textoffset);
   1308     }
   1309 
   1310     // totally match, we will get rid of the ending ignorables.
   1311     strsrch->search->matchedIndex  = start;
   1312     strsrch->search->matchedLength = *textoffset - start;
   1313     return TRUE;
   1314 }
   1315 
   1316 /**
   1317 * Getting the previous base character offset, or the current offset if the
   1318 * current character is a base character
   1319 * @param text string
   1320 * @param textoffset one offset after the current character
   1321 * @return the offset of the next character after the base character or the first
   1322 *         composed character with accents
   1323 */
   1324 static
   1325 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1326                                                int32_t  textoffset)
   1327 {
   1328     if (textoffset > 0) {
   1329         for (;;) {
   1330             int32_t result = textoffset;
   1331             UTF_BACK_1(text, 0, textoffset);
   1332             int32_t temp = textoffset;
   1333             uint16_t fcd = getFCD(text, &temp, result);
   1334             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1335                 if (fcd & LAST_BYTE_MASK_) {
   1336                     return textoffset;
   1337                 }
   1338                 return result;
   1339             }
   1340             if (textoffset == 0) {
   1341                 return 0;
   1342             }
   1343         }
   1344     }
   1345     return textoffset;
   1346 }
   1347 
   1348 /**
   1349 * Getting the indexes of the accents that are not blocked in the argument
   1350 * accent array
   1351 * @param accents array of accents in nfd terminated by a 0.
   1352 * @param accentsindex array of indexes of the accents that are not blocked
   1353 */
   1354 static
   1355 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1356 {
   1357     int32_t index     = 0;
   1358     int32_t     length    = u_strlen(accents);
   1359     UChar32     codepoint = 0;
   1360     int         cclass    = 0;
   1361     int         result    = 0;
   1362     int32_t temp;
   1363     while (index < length) {
   1364         temp = index;
   1365         UTF_NEXT_CHAR(accents, index, length, codepoint);
   1366         if (u_getCombiningClass(codepoint) != cclass) {
   1367             cclass        = u_getCombiningClass(codepoint);
   1368             accentsindex[result] = temp;
   1369             result ++;
   1370         }
   1371     }
   1372     accentsindex[result] = length;
   1373     return result;
   1374 }
   1375 
   1376 /**
   1377 * Appends 3 UChar arrays to a destination array.
   1378 * Creates a new array if we run out of space. The caller will have to
   1379 * manually deallocate the newly allocated array.
   1380 * Internal method, status assumed to be success, caller has to check status
   1381 * before calling this method. destination not to be NULL and has at least
   1382 * size destinationlength.
   1383 * @param destination target array
   1384 * @param destinationlength target array size, returning the appended length
   1385 * @param source1 null-terminated first array
   1386 * @param source2 second array
   1387 * @param source2length length of seond array
   1388 * @param source3 null-terminated third array
   1389 * @param status error status if any
   1390 * @return new destination array, destination if there was no new allocation
   1391 */
   1392 static
   1393 inline UChar * addToUCharArray(      UChar      *destination,
   1394                                      int32_t    *destinationlength,
   1395                                const UChar      *source1,
   1396                                const UChar      *source2,
   1397                                      int32_t     source2length,
   1398                                const UChar      *source3,
   1399                                      UErrorCode *status)
   1400 {
   1401     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1402     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1403     if (*destinationlength < source1length + source2length + source3length +
   1404                                                                            1)
   1405     {
   1406         destination = (UChar *)allocateMemory(
   1407           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1408           status);
   1409         // if error allocating memory, status will be
   1410         // U_MEMORY_ALLOCATION_ERROR
   1411         if (U_FAILURE(*status)) {
   1412             *destinationlength = 0;
   1413             return NULL;
   1414         }
   1415     }
   1416     if (source1length != 0) {
   1417         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
   1418     }
   1419     if (source2length != 0) {
   1420         uprv_memcpy(destination + source1length, source2,
   1421                     sizeof(UChar) * source2length);
   1422     }
   1423     if (source3length != 0) {
   1424         uprv_memcpy(destination + source1length + source2length, source3,
   1425                     sizeof(UChar) * source3length);
   1426     }
   1427     *destinationlength = source1length + source2length + source3length;
   1428     return destination;
   1429 }
   1430 
   1431 /**
   1432 * Running through a collation element iterator to see if the contents matches
   1433 * pattern in string search data
   1434 * @param strsrch string search data
   1435 * @param coleiter collation element iterator
   1436 * @return TRUE if a match if found, FALSE otherwise
   1437 */
   1438 static
   1439 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1440                                        UCollationElements *coleiter)
   1441 {
   1442     int         patternceindex = strsrch->pattern.CELength;
   1443     int32_t    *patternce      = strsrch->pattern.CE;
   1444     UErrorCode  status = U_ZERO_ERROR;
   1445     while (patternceindex > 0) {
   1446         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1447         if (ce == UCOL_IGNORABLE) {
   1448             continue;
   1449         }
   1450         if (U_FAILURE(status) || ce != *patternce) {
   1451             return FALSE;
   1452         }
   1453         patternce ++;
   1454         patternceindex --;
   1455     }
   1456     return TRUE;
   1457 }
   1458 
   1459 /**
   1460 * Rearranges the front accents to try matching.
   1461 * Prefix accents in the text will be grouped according to their combining
   1462 * class and the groups will be mixed and matched to try find the perfect
   1463 * match with the pattern.
   1464 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1465 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1466 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1467 *         "\u0301\u0325".
   1468 * step 2: check if any of the generated substrings matches the pattern.
   1469 * Internal method, status is assumed to be success, caller has to check status
   1470 * before calling this method.
   1471 * @param strsrch string search match
   1472 * @param start first offset of the accents to start searching
   1473 * @param end start of the last accent set
   1474 * @param status output error status if any
   1475 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1476 *         offset of the match. Note this start includes all preceding accents.
   1477 */
   1478 static
   1479 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1480                                        int32_t    start,
   1481                                        int32_t    end,
   1482                                        UErrorCode    *status)
   1483 {
   1484     const UChar       *text       = strsrch->search->text;
   1485           int32_t      textlength = strsrch->search->textLength;
   1486           int32_t  tempstart  = start;
   1487 
   1488     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1489         // die... failed at a base character
   1490         return USEARCH_DONE;
   1491     }
   1492 
   1493     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1494     start = getPreviousBaseOffset(text, tempstart);
   1495 
   1496     UChar       accents[INITIAL_ARRAY_SIZE_];
   1497     // normalizing the offensive string
   1498     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1499                     INITIAL_ARRAY_SIZE_, status);
   1500     if (U_FAILURE(*status)) {
   1501         return USEARCH_DONE;
   1502     }
   1503 
   1504     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1505     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1506                                                                  accentsindex);
   1507     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1508     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1509     UCollationElements *coleiter   = strsrch->utilIter;
   1510     while (U_SUCCESS(*status) && count > 0) {
   1511         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1512         // copy the base characters
   1513         for (int k = 0; k < accentsindex[0]; k ++) {
   1514             *rearrange ++ = accents[k];
   1515         }
   1516         // forming all possible canonical rearrangement by dropping
   1517         // sets of accents
   1518         for (int i = 0; i <= accentsize - 1; i ++) {
   1519             int32_t mask = 1 << (accentsize - i - 1);
   1520             if (count & mask) {
   1521                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1522                     *rearrange ++ = accents[j];
   1523                 }
   1524             }
   1525         }
   1526         *rearrange = 0;
   1527         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1528         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1529                                            strsrch->canonicalPrefixAccents,
   1530                                            strsrch->search->text + offset,
   1531                                            end - offset,
   1532                                            strsrch->canonicalSuffixAccents,
   1533                                            status);
   1534 
   1535         // if status is a failure, ucol_setText does nothing.
   1536         // run the collator iterator through this match
   1537         ucol_setText(coleiter, match, matchsize, status);
   1538         if (U_SUCCESS(*status)) {
   1539             if (checkCollationMatch(strsrch, coleiter)) {
   1540                 if (match != buffer) {
   1541                     uprv_free(match);
   1542                 }
   1543                 return start;
   1544             }
   1545         }
   1546         count --;
   1547     }
   1548     return USEARCH_DONE;
   1549 }
   1550 
   1551 /**
   1552 * Gets the offset to the safe point in text before textoffset.
   1553 * ie. not the middle of a contraction, swappable characters or supplementary
   1554 * characters.
   1555 * @param collator collation sata
   1556 * @param text string to work with
   1557 * @param textoffset offset in string
   1558 * @param textlength length of text string
   1559 * @return offset to the previous safe character
   1560 */
   1561 static
   1562 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1563                                       const UChar       *text,
   1564                                             int32_t  textoffset)
   1565 {
   1566     int32_t result = textoffset; // first contraction character
   1567     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1568         result --;
   1569     }
   1570     if (result != 0) {
   1571         // the first contraction character is consider unsafe here
   1572         result --;
   1573     }
   1574     return result;
   1575 }
   1576 
   1577 /**
   1578 * Cleaning up after we passed the safe zone
   1579 * @param strsrch string search data
   1580 * @param safetext safe text array
   1581 * @param safebuffer safe text buffer
   1582 * @param coleiter collation element iterator for safe text
   1583 */
   1584 static
   1585 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1586                                   UChar         *safebuffer)
   1587 {
   1588     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1589     {
   1590        uprv_free(safetext);
   1591     }
   1592 }
   1593 
   1594 /**
   1595 * Take the rearranged end accents and tries matching. If match failed at
   1596 * a seperate preceding set of accents (seperated from the rearranged on by
   1597 * at least a base character) then we rearrange the preceding accents and
   1598 * tries matching again.
   1599 * We allow skipping of the ends of the accent set if the ces do not match.
   1600 * However if the failure is found before the accent set, it fails.
   1601 * Internal method, status assumed to be success, caller has to check status
   1602 * before calling this method.
   1603 * @param strsrch string search data
   1604 * @param textoffset of the start of the rearranged accent
   1605 * @param status output error status if any
   1606 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1607 *         offset of the match. Note this start includes all preceding accents.
   1608 */
   1609 static
   1610 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1611                                        int32_t    textoffset,
   1612                                        UErrorCode    *status)
   1613 {
   1614     const UChar              *text           = strsrch->search->text;
   1615     const UCollator          *collator       = strsrch->collator;
   1616           int32_t             safelength     = 0;
   1617           UChar              *safetext;
   1618           int32_t             safetextlength;
   1619           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1620           UCollationElements *coleiter       = strsrch->utilIter;
   1621           int32_t         safeoffset     = textoffset;
   1622 
   1623     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1624                                          collator)) {
   1625         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1626         safelength     = textoffset - safeoffset;
   1627         safetextlength = INITIAL_ARRAY_SIZE_;
   1628         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1629                                          text + safeoffset, safelength,
   1630                                          strsrch->canonicalSuffixAccents,
   1631                                          status);
   1632     }
   1633     else {
   1634         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1635         safetext       = strsrch->canonicalSuffixAccents;
   1636     }
   1637 
   1638     // if status is a failure, ucol_setText does nothing
   1639     ucol_setText(coleiter, safetext, safetextlength, status);
   1640     // status checked in loop below
   1641 
   1642     int32_t  *ce        = strsrch->pattern.CE;
   1643     int32_t   celength  = strsrch->pattern.CELength;
   1644     int       ceindex   = celength - 1;
   1645     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1646 
   1647     while (ceindex >= 0) {
   1648         int32_t textce = ucol_previous(coleiter, status);
   1649         if (U_FAILURE(*status)) {
   1650             if (isSafe) {
   1651                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1652             }
   1653             return USEARCH_DONE;
   1654         }
   1655         if (textce == UCOL_NULLORDER) {
   1656             // check if we have passed the safe buffer
   1657             if (coleiter == strsrch->textIter) {
   1658                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1659                 return USEARCH_DONE;
   1660             }
   1661             cleanUpSafeText(strsrch, safetext, safebuffer);
   1662             safetext = safebuffer;
   1663             coleiter = strsrch->textIter;
   1664             setColEIterOffset(coleiter, safeoffset);
   1665             // status checked at the start of the loop
   1666             isSafe = FALSE;
   1667             continue;
   1668         }
   1669         textce = getCE(strsrch, textce);
   1670         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1671             // do the beginning stuff
   1672             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1673             if (isSafe && failedoffset >= safelength) {
   1674                 // alas... no hope. failed at rearranged accent set
   1675                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1676                 return USEARCH_DONE;
   1677             }
   1678             else {
   1679                 if (isSafe) {
   1680                     failedoffset += safeoffset;
   1681                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1682                 }
   1683 
   1684                 // try rearranging the front accents
   1685                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1686                                         failedoffset, textoffset, status);
   1687                 if (result != USEARCH_DONE) {
   1688                     // if status is a failure, ucol_setOffset does nothing
   1689                     setColEIterOffset(strsrch->textIter, result);
   1690                 }
   1691                 if (U_FAILURE(*status)) {
   1692                     return USEARCH_DONE;
   1693                 }
   1694                 return result;
   1695             }
   1696         }
   1697         if (textce == ce[ceindex]) {
   1698             ceindex --;
   1699         }
   1700     }
   1701     // set offset here
   1702     if (isSafe) {
   1703         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1704         // sets the text iterator here with the correct expansion and offset
   1705         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1706         cleanUpSafeText(strsrch, safetext, safebuffer);
   1707         if (result >= safelength) {
   1708             result = textoffset;
   1709         }
   1710         else {
   1711             result += safeoffset;
   1712         }
   1713         setColEIterOffset(strsrch->textIter, result);
   1714         strsrch->textIter->iteratordata_.toReturn =
   1715                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1716         return result;
   1717     }
   1718 
   1719     return ucol_getOffset(coleiter);
   1720 }
   1721 
   1722 /**
   1723 * Trying out the substring and sees if it can be a canonical match.
   1724 * This will try normalizing the end accents and arranging them into canonical
   1725 * equivalents and check their corresponding ces with the pattern ce.
   1726 * Suffix accents in the text will be grouped according to their combining
   1727 * class and the groups will be mixed and matched to try find the perfect
   1728 * match with the pattern.
   1729 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1730 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1731 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1732 *         "\u0301\u0325".
   1733 * step 2: check if any of the generated substrings matches the pattern.
   1734 * Internal method, status assumed to be success, caller has to check status
   1735 * before calling this method.
   1736 * @param strsrch string search data
   1737 * @param textoffset end offset in the collation element text that ends with
   1738 *                   the accents to be rearranged
   1739 * @param status error status if any
   1740 * @return TRUE if the match is valid, FALSE otherwise
   1741 */
   1742 static
   1743 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1744                            int32_t    textoffset,
   1745                            UErrorCode    *status)
   1746 {
   1747     const UChar       *text = strsrch->search->text;
   1748           int32_t  temp = textoffset;
   1749     UTF_BACK_1(text, 0, temp);
   1750     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1751         UCollationElements *coleiter = strsrch->textIter;
   1752         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1753         if (strsrch->pattern.hasPrefixAccents) {
   1754             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1755                                                 status);
   1756             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1757                 setColEIterOffset(coleiter, offset);
   1758                 return TRUE;
   1759             }
   1760         }
   1761         return FALSE;
   1762     }
   1763 
   1764     if (!strsrch->pattern.hasSuffixAccents) {
   1765         return FALSE;
   1766     }
   1767 
   1768     UChar       accents[INITIAL_ARRAY_SIZE_];
   1769     // offset to the last base character in substring to search
   1770     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1771     // normalizing the offensive string
   1772     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1773                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1774     // status checked in loop below
   1775 
   1776     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1777     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1778 
   1779     // 2 power n - 1 plus the full set of accents
   1780     int32_t  count = (2 << (size - 1)) - 1;
   1781     while (U_SUCCESS(*status) && count > 0) {
   1782         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1783         // copy the base characters
   1784         for (int k = 0; k < accentsindex[0]; k ++) {
   1785             *rearrange ++ = accents[k];
   1786         }
   1787         // forming all possible canonical rearrangement by dropping
   1788         // sets of accents
   1789         for (int i = 0; i <= size - 1; i ++) {
   1790             int32_t mask = 1 << (size - i - 1);
   1791             if (count & mask) {
   1792                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1793                     *rearrange ++ = accents[j];
   1794                 }
   1795             }
   1796         }
   1797         *rearrange = 0;
   1798         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1799                                                         status);
   1800         if (offset != USEARCH_DONE) {
   1801             return TRUE; // match found
   1802         }
   1803         count --;
   1804     }
   1805     return FALSE;
   1806 }
   1807 
   1808 /**
   1809 * Gets the previous base character offset depending on the string search
   1810 * pattern data
   1811 * @param strsrch string search data
   1812 * @param textoffset current offset, current character
   1813 * @return the offset of the next character after this base character or itself
   1814 *         if it is a composed character with accents
   1815 */
   1816 static
   1817 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1818                                                       int32_t textoffset)
   1819 {
   1820     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1821         const UChar       *text = strsrch->search->text;
   1822               int32_t  offset = textoffset;
   1823         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1824                                                    SECOND_LAST_BYTE_SHIFT_) {
   1825             return getPreviousBaseOffset(text, textoffset);
   1826         }
   1827     }
   1828     return textoffset;
   1829 }
   1830 
   1831 /**
   1832 * Checks match for contraction.
   1833 * If the match ends with a partial contraction we fail.
   1834 * If the match starts too far off (because of backwards iteration) we try to
   1835 * chip off the extra characters
   1836 * Internal method, status assumed to be success, caller has to check status
   1837 * before calling this method.
   1838 * @param strsrch string search data
   1839 * @param start offset of potential match, to be modified if necessary
   1840 * @param end offset of potential match, to be modified if necessary
   1841 * @param status output error status if any
   1842 * @return TRUE if match passes the contraction test, FALSE otherwise
   1843 */
   1844 static
   1845 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1846                                          int32_t   *start,
   1847                                          int32_t   *end,
   1848                                          UErrorCode    *status)
   1849 {
   1850           UCollationElements *coleiter   = strsrch->textIter;
   1851           int32_t             textlength = strsrch->search->textLength;
   1852           int32_t         temp       = *start;
   1853     const UCollator          *collator   = strsrch->collator;
   1854     const UChar              *text       = strsrch->search->text;
   1855     // This part checks if either ends of the match contains potential
   1856     // contraction. If so we'll have to iterate through them
   1857     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1858         (*start + 1 < textlength
   1859          && ucol_unsafeCP(text[*start + 1], collator))) {
   1860         int32_t expansion  = getExpansionPrefix(coleiter);
   1861         UBool   expandflag = expansion > 0;
   1862         setColEIterOffset(coleiter, *start);
   1863         while (expansion > 0) {
   1864             // getting rid of the redundant ce, caused by setOffset.
   1865             // since backward contraction/expansion may have extra ces if we
   1866             // are in the normalization buffer, hasAccentsBeforeMatch would
   1867             // have taken care of it.
   1868             // E.g. the character \u01FA will have an expansion of 3, but if
   1869             // we are only looking for acute and ring \u030A and \u0301, we'll
   1870             // have to skip the first ce in the expansion buffer.
   1871             ucol_next(coleiter, status);
   1872             if (U_FAILURE(*status)) {
   1873                 return FALSE;
   1874             }
   1875             if (ucol_getOffset(coleiter) != temp) {
   1876                 *start = temp;
   1877                 temp  = ucol_getOffset(coleiter);
   1878             }
   1879             expansion --;
   1880         }
   1881 
   1882         int32_t  *patternce       = strsrch->pattern.CE;
   1883         int32_t   patterncelength = strsrch->pattern.CELength;
   1884         int32_t   count           = 0;
   1885         int32_t   textlength      = strsrch->search->textLength;
   1886         while (count < patterncelength) {
   1887             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1888             // status checked below, note that if status is a failure
   1889             // ucol_next returns UCOL_NULLORDER
   1890             if (ce == UCOL_IGNORABLE) {
   1891                 continue;
   1892             }
   1893             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1894                 *start = temp;
   1895                 temp   = ucol_getOffset(coleiter);
   1896             }
   1897 
   1898             if (count == 0 && ce != patternce[0]) {
   1899                 // accents may have extra starting ces, this occurs when a
   1900                 // pure accent pattern is matched without rearrangement
   1901                 // text \u0325\u0300 and looking for \u0300
   1902                 int32_t expected = patternce[0];
   1903                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1904                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1905                     while (U_SUCCESS(*status) && ce != expected &&
   1906                            ce != UCOL_NULLORDER &&
   1907                            ucol_getOffset(coleiter) <= *end) {
   1908                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1909                     }
   1910                 }
   1911             }
   1912             if (U_FAILURE(*status) || ce != patternce[count]) {
   1913                 (*end) ++;
   1914                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1915                 return FALSE;
   1916             }
   1917             count ++;
   1918         }
   1919     }
   1920     return TRUE;
   1921 }
   1922 
   1923 /**
   1924 * Checks and sets the match information if found.
   1925 * Checks
   1926 * <ul>
   1927 * <li> the potential match does not repeat the previous match
   1928 * <li> boundaries are correct
   1929 * <li> potential match does not end in the middle of a contraction
   1930 * <li> identical matches
   1931 * <\ul>
   1932 * Otherwise the offset will be shifted to the next character.
   1933 * Internal method, status assumed to be success, caller has to check the
   1934 * status before calling this method.
   1935 * @param strsrch string search data
   1936 * @param textoffset offset in the collation element text. the returned value
   1937 *        will be the truncated end offset of the match or the new start
   1938 *        search offset.
   1939 * @param status output error status if any
   1940 * @return TRUE if the match is valid, FALSE otherwise
   1941 */
   1942 static
   1943 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1944                                      int32_t   *textoffset,
   1945                                      UErrorCode    *status)
   1946 {
   1947     // to ensure that the start and ends are not composite characters
   1948     UCollationElements *coleiter = strsrch->textIter;
   1949     // if we have a canonical accent match
   1950     if ((strsrch->pattern.hasSuffixAccents &&
   1951         strsrch->canonicalSuffixAccents[0]) ||
   1952         (strsrch->pattern.hasPrefixAccents &&
   1953         strsrch->canonicalPrefixAccents[0])) {
   1954         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1955                                                     strsrch,
   1956                                                     ucol_getOffset(coleiter));
   1957         strsrch->search->matchedLength = *textoffset -
   1958                                                 strsrch->search->matchedIndex;
   1959         return TRUE;
   1960     }
   1961 
   1962     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1963     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1964                                             status) || U_FAILURE(*status)) {
   1965         return FALSE;
   1966     }
   1967 
   1968     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1969     // this totally matches, however we need to check if it is repeating
   1970     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1971         !isBreakUnit(strsrch, start, *textoffset) ||
   1972         !checkIdentical(strsrch, start, *textoffset)) {
   1973         (*textoffset) ++;
   1974         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1975                                         strsrch->search->textLength);
   1976         return FALSE;
   1977     }
   1978 
   1979     strsrch->search->matchedIndex  = start;
   1980     strsrch->search->matchedLength = *textoffset - start;
   1981     return TRUE;
   1982 }
   1983 
   1984 /**
   1985 * Shifting the collation element iterator position forward to prepare for
   1986 * a preceding match. If the first character is a unsafe character, we'll only
   1987 * shift by 1 to capture contractions, normalization etc.
   1988 * Internal method, status assumed to be success, caller has to check status
   1989 * before calling this method.
   1990 * @param text strsrch string search data
   1991 * @param textoffset start text position to do search
   1992 * @param ce the text ce which failed the match.
   1993 * @param patternceindex index of the ce within the pattern ce buffer which
   1994 *        failed the match
   1995 * @return final offset
   1996 */
   1997 static
   1998 inline int32_t reverseShift(UStringSearch *strsrch,
   1999                                 int32_t    textoffset,
   2000                                 int32_t       ce,
   2001                                 int32_t        patternceindex)
   2002 {
   2003     if (strsrch->search->isOverlap) {
   2004         if (textoffset != strsrch->search->textLength) {
   2005             textoffset --;
   2006         }
   2007         else {
   2008             textoffset -= strsrch->pattern.defaultShiftSize;
   2009         }
   2010     }
   2011     else {
   2012         if (ce != UCOL_NULLORDER) {
   2013             int32_t shift = strsrch->pattern.backShift[hash(ce)];
   2014 
   2015             // this is to adjust for characters in the middle of the substring
   2016             // for matching that failed.
   2017             int32_t adjust = patternceindex;
   2018             if (adjust > 1 && shift > adjust) {
   2019                 shift -= adjust - 1;
   2020             }
   2021             textoffset -= shift;
   2022         }
   2023         else {
   2024             textoffset -= strsrch->pattern.defaultShiftSize;
   2025         }
   2026     }
   2027     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   2028     return textoffset;
   2029 }
   2030 
   2031 /**
   2032 * Checks match for contraction.
   2033 * If the match starts with a partial contraction we fail.
   2034 * Internal method, status assumed to be success, caller has to check status
   2035 * before calling this method.
   2036 * @param strsrch string search data
   2037 * @param start offset of potential match, to be modified if necessary
   2038 * @param end offset of potential match, to be modified if necessary
   2039 * @param status output error status if any
   2040 * @return TRUE if match passes the contraction test, FALSE otherwise
   2041 */
   2042 static
   2043 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2044                                      int32_t   *start,
   2045                                      int32_t   *end, UErrorCode  *status)
   2046 {
   2047           UCollationElements *coleiter   = strsrch->textIter;
   2048           int32_t             textlength = strsrch->search->textLength;
   2049           int32_t             temp       = *end;
   2050     const UCollator          *collator   = strsrch->collator;
   2051     const UChar              *text       = strsrch->search->text;
   2052     // This part checks if either if the start of the match contains potential
   2053     // contraction. If so we'll have to iterate through them
   2054     // Since we used ucol_next while previously looking for the potential
   2055     // match, this guarantees that our end will not be a partial contraction,
   2056     // or a partial supplementary character.
   2057     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2058         int32_t expansion  = getExpansionSuffix(coleiter);
   2059         UBool   expandflag = expansion > 0;
   2060         setColEIterOffset(coleiter, *end);
   2061         while (U_SUCCESS(*status) && expansion > 0) {
   2062             // getting rid of the redundant ce
   2063             // since forward contraction/expansion may have extra ces
   2064             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2065             // would have taken care of it.
   2066             // E.g. the character \u01FA will have an expansion of 3, but if
   2067             // we are only looking for A ring A\u030A, we'll have to skip the
   2068             // last ce in the expansion buffer
   2069             ucol_previous(coleiter, status);
   2070             if (U_FAILURE(*status)) {
   2071                 return FALSE;
   2072             }
   2073             if (ucol_getOffset(coleiter) != temp) {
   2074                 *end = temp;
   2075                 temp  = ucol_getOffset(coleiter);
   2076             }
   2077             expansion --;
   2078         }
   2079 
   2080         int32_t  *patternce       = strsrch->pattern.CE;
   2081         int32_t   patterncelength = strsrch->pattern.CELength;
   2082         int32_t   count           = patterncelength;
   2083         while (count > 0) {
   2084             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2085             // status checked below, note that if status is a failure
   2086             // ucol_previous returns UCOL_NULLORDER
   2087             if (ce == UCOL_IGNORABLE) {
   2088                 continue;
   2089             }
   2090             if (expandflag && count == 0 &&
   2091                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2092                 *end = temp;
   2093                 temp  = ucol_getOffset(coleiter);
   2094             }
   2095             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2096                 (*start) --;
   2097                 *start = getPreviousBaseOffset(text, *start);
   2098                 return FALSE;
   2099             }
   2100             count --;
   2101         }
   2102     }
   2103     return TRUE;
   2104 }
   2105 
   2106 /**
   2107 * Checks and sets the match information if found.
   2108 * Checks
   2109 * <ul>
   2110 * <li> the current match does not repeat the last match
   2111 * <li> boundaries are correct
   2112 * <li> exact matches has no extra accents
   2113 * <li> identical matches
   2114 * <\ul>
   2115 * Otherwise the offset will be shifted to the preceding character.
   2116 * Internal method, status assumed to be success, caller has to check status
   2117 * before calling this method.
   2118 * @param strsrch string search data
   2119 * @param collator
   2120 * @param coleiter collation element iterator
   2121 * @param text string
   2122 * @param textoffset offset in the collation element text. the returned value
   2123 *        will be the truncated start offset of the match or the new start
   2124 *        search offset.
   2125 * @param status output error status if any
   2126 * @return TRUE if the match is valid, FALSE otherwise
   2127 */
   2128 static
   2129 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2130                                      int32_t   *textoffset,
   2131                                      UErrorCode    *status)
   2132 {
   2133     // to ensure that the start and ends are not composite characters
   2134     int32_t end = ucol_getOffset(strsrch->textIter);
   2135     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2136         || U_FAILURE(*status)) {
   2137             return FALSE;
   2138     }
   2139 
   2140     // this totally matches, however we need to check if it is repeating
   2141     // the old match
   2142     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2143         !isBreakUnit(strsrch, *textoffset, end) ||
   2144         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2145         !checkIdentical(strsrch, *textoffset, end) ||
   2146         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2147         (*textoffset) --;
   2148         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2149                                             *textoffset);
   2150         return FALSE;
   2151     }
   2152 
   2153     //Add breakiterator boundary check for primary strength search.
   2154     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2155     	checkBreakBoundary(strsrch, textoffset, &end);
   2156     }
   2157 
   2158     strsrch->search->matchedIndex = *textoffset;
   2159     strsrch->search->matchedLength = end - *textoffset;
   2160     return TRUE;
   2161 }
   2162 
   2163 /**
   2164 * Rearranges the end accents to try matching.
   2165 * Suffix accents in the text will be grouped according to their combining
   2166 * class and the groups will be mixed and matched to try find the perfect
   2167 * match with the pattern.
   2168 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2169 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2170 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2171 *         "\u0301\u0325".
   2172 * step 2: check if any of the generated substrings matches the pattern.
   2173 * Internal method, status assumed to be success, user has to check status
   2174 * before calling this method.
   2175 * @param strsrch string search match
   2176 * @param start offset of the first base character
   2177 * @param end start of the last accent set
   2178 * @param status only error status if any
   2179 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2180 *         offset of the match. Note this start includes all following accents.
   2181 */
   2182 static
   2183 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2184                                            int32_t    start,
   2185                                            int32_t    end,
   2186                                            UErrorCode    *status)
   2187 {
   2188     const UChar       *text       = strsrch->search->text;
   2189           int32_t  tempend    = end;
   2190 
   2191     UTF_BACK_1(text, 0, tempend);
   2192     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2193                                                            LAST_BYTE_MASK_)) {
   2194         // die... failed at a base character
   2195         return USEARCH_DONE;
   2196     }
   2197     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2198 
   2199     if (U_SUCCESS(*status)) {
   2200         UChar       accents[INITIAL_ARRAY_SIZE_];
   2201         int32_t offset = getPreviousBaseOffset(text, end);
   2202         // normalizing the offensive string
   2203         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2204                         INITIAL_ARRAY_SIZE_, status);
   2205 
   2206         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2207         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2208                                                          accentsindex);
   2209         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2210         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2211         UCollationElements *coleiter = strsrch->utilIter;
   2212         while (U_SUCCESS(*status) && count > 0) {
   2213             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2214             // copy the base characters
   2215             for (int k = 0; k < accentsindex[0]; k ++) {
   2216                 *rearrange ++ = accents[k];
   2217             }
   2218             // forming all possible canonical rearrangement by dropping
   2219             // sets of accents
   2220             for (int i = 0; i <= accentsize - 1; i ++) {
   2221                 int32_t mask = 1 << (accentsize - i - 1);
   2222                 if (count & mask) {
   2223                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2224                         *rearrange ++ = accents[j];
   2225                     }
   2226                 }
   2227             }
   2228             *rearrange = 0;
   2229             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2230             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2231                                            strsrch->canonicalPrefixAccents,
   2232                                            strsrch->search->text + start,
   2233                                            offset - start,
   2234                                            strsrch->canonicalSuffixAccents,
   2235                                            status);
   2236 
   2237             // run the collator iterator through this match
   2238             // if status is a failure ucol_setText does nothing
   2239             ucol_setText(coleiter, match, matchsize, status);
   2240             if (U_SUCCESS(*status)) {
   2241                 if (checkCollationMatch(strsrch, coleiter)) {
   2242                     if (match != buffer) {
   2243                         uprv_free(match);
   2244                     }
   2245                     return end;
   2246                 }
   2247             }
   2248             count --;
   2249         }
   2250     }
   2251     return USEARCH_DONE;
   2252 }
   2253 
   2254 /**
   2255 * Take the rearranged start accents and tries matching. If match failed at
   2256 * a seperate following set of accents (seperated from the rearranged on by
   2257 * at least a base character) then we rearrange the preceding accents and
   2258 * tries matching again.
   2259 * We allow skipping of the ends of the accent set if the ces do not match.
   2260 * However if the failure is found before the accent set, it fails.
   2261 * Internal method, status assumed to be success, caller has to check status
   2262 * before calling this method.
   2263 * @param strsrch string search data
   2264 * @param textoffset of the ends of the rearranged accent
   2265 * @param status output error status if any
   2266 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2267 *         offset of the match. Note this start includes all following accents.
   2268 */
   2269 static
   2270 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2271                                            int32_t    textoffset,
   2272                                            UErrorCode    *status)
   2273 {
   2274     const UChar       *text       = strsrch->search->text;
   2275     const UCollator   *collator   = strsrch->collator;
   2276           int32_t      safelength = 0;
   2277           UChar       *safetext;
   2278           int32_t      safetextlength;
   2279           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2280           int32_t  safeoffset = textoffset;
   2281 
   2282     if (textoffset &&
   2283         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2284                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2285                                          ], collator)) {
   2286         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2287                                            strsrch->search->textLength);
   2288         safelength     = safeoffset - textoffset;
   2289         safetextlength = INITIAL_ARRAY_SIZE_;
   2290         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2291                                          strsrch->canonicalPrefixAccents,
   2292                                          text + textoffset, safelength,
   2293                                          NULL, status);
   2294     }
   2295     else {
   2296         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2297         safetext       = strsrch->canonicalPrefixAccents;
   2298     }
   2299 
   2300     UCollationElements *coleiter = strsrch->utilIter;
   2301      // if status is a failure, ucol_setText does nothing
   2302     ucol_setText(coleiter, safetext, safetextlength, status);
   2303     // status checked in loop below
   2304 
   2305     int32_t  *ce           = strsrch->pattern.CE;
   2306     int32_t   celength     = strsrch->pattern.CELength;
   2307     int       ceindex      = 0;
   2308     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2309     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2310 
   2311     while (ceindex < celength) {
   2312         int32_t textce = ucol_next(coleiter, status);
   2313         if (U_FAILURE(*status)) {
   2314             if (isSafe) {
   2315                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2316             }
   2317             return USEARCH_DONE;
   2318         }
   2319         if (textce == UCOL_NULLORDER) {
   2320             // check if we have passed the safe buffer
   2321             if (coleiter == strsrch->textIter) {
   2322                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2323                 return USEARCH_DONE;
   2324             }
   2325             cleanUpSafeText(strsrch, safetext, safebuffer);
   2326             safetext = safebuffer;
   2327             coleiter = strsrch->textIter;
   2328             setColEIterOffset(coleiter, safeoffset);
   2329             // status checked at the start of the loop
   2330             isSafe = FALSE;
   2331             continue;
   2332         }
   2333         textce = getCE(strsrch, textce);
   2334         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2335             // do the beginning stuff
   2336             int32_t failedoffset = ucol_getOffset(coleiter);
   2337             if (isSafe && failedoffset <= prefixlength) {
   2338                 // alas... no hope. failed at rearranged accent set
   2339                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2340                 return USEARCH_DONE;
   2341             }
   2342             else {
   2343                 if (isSafe) {
   2344                     failedoffset = safeoffset - failedoffset;
   2345                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2346                 }
   2347 
   2348                 // try rearranging the end accents
   2349                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2350                                         textoffset, failedoffset, status);
   2351                 if (result != USEARCH_DONE) {
   2352                     // if status is a failure, ucol_setOffset does nothing
   2353                     setColEIterOffset(strsrch->textIter, result);
   2354                 }
   2355                 if (U_FAILURE(*status)) {
   2356                     return USEARCH_DONE;
   2357                 }
   2358                 return result;
   2359             }
   2360         }
   2361         if (textce == ce[ceindex]) {
   2362             ceindex ++;
   2363         }
   2364     }
   2365     // set offset here
   2366     if (isSafe) {
   2367         int32_t result      = ucol_getOffset(coleiter);
   2368         // sets the text iterator here with the correct expansion and offset
   2369         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2370         cleanUpSafeText(strsrch, safetext, safebuffer);
   2371         if (result <= prefixlength) {
   2372             result = textoffset;
   2373         }
   2374         else {
   2375             result = textoffset + (safeoffset - result);
   2376         }
   2377         setColEIterOffset(strsrch->textIter, result);
   2378         setExpansionSuffix(strsrch->textIter, leftoverces);
   2379         return result;
   2380     }
   2381 
   2382     return ucol_getOffset(coleiter);
   2383 }
   2384 
   2385 /**
   2386 * Trying out the substring and sees if it can be a canonical match.
   2387 * This will try normalizing the starting accents and arranging them into
   2388 * canonical equivalents and check their corresponding ces with the pattern ce.
   2389 * Prefix accents in the text will be grouped according to their combining
   2390 * class and the groups will be mixed and matched to try find the perfect
   2391 * match with the pattern.
   2392 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2393 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2394 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2395 *         "\u0301\u0325".
   2396 * step 2: check if any of the generated substrings matches the pattern.
   2397 * Internal method, status assumed to be success, caller has to check status
   2398 * before calling this method.
   2399 * @param strsrch string search data
   2400 * @param textoffset start offset in the collation element text that starts
   2401 *                   with the accents to be rearranged
   2402 * @param status output error status if any
   2403 * @return TRUE if the match is valid, FALSE otherwise
   2404 */
   2405 static
   2406 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2407                                int32_t    textoffset,
   2408                                UErrorCode    *status)
   2409 {
   2410     const UChar       *text       = strsrch->search->text;
   2411           int32_t  temp       = textoffset;
   2412           int32_t      textlength = strsrch->search->textLength;
   2413     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2414         UCollationElements *coleiter = strsrch->textIter;
   2415         int32_t         offset   = ucol_getOffset(coleiter);
   2416         if (strsrch->pattern.hasSuffixAccents) {
   2417             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2418                                                     offset, status);
   2419             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2420                 setColEIterOffset(coleiter, offset);
   2421                 return TRUE;
   2422             }
   2423         }
   2424         return FALSE;
   2425     }
   2426 
   2427     if (!strsrch->pattern.hasPrefixAccents) {
   2428         return FALSE;
   2429     }
   2430 
   2431     UChar       accents[INITIAL_ARRAY_SIZE_];
   2432     // offset to the last base character in substring to search
   2433     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2434     // normalizing the offensive string
   2435     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2436                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2437     // status checked in loop
   2438 
   2439     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2440     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2441 
   2442     // 2 power n - 1 plus the full set of accents
   2443     int32_t  count = (2 << (size - 1)) - 1;
   2444     while (U_SUCCESS(*status) && count > 0) {
   2445         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2446         // copy the base characters
   2447         for (int k = 0; k < accentsindex[0]; k ++) {
   2448             *rearrange ++ = accents[k];
   2449         }
   2450         // forming all possible canonical rearrangement by dropping
   2451         // sets of accents
   2452         for (int i = 0; i <= size - 1; i ++) {
   2453             int32_t mask = 1 << (size - i - 1);
   2454             if (count & mask) {
   2455                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2456                     *rearrange ++ = accents[j];
   2457                 }
   2458             }
   2459         }
   2460         *rearrange = 0;
   2461         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2462                                                           baseoffset, status);
   2463         if (offset != USEARCH_DONE) {
   2464             return TRUE; // match found
   2465         }
   2466         count --;
   2467     }
   2468     return FALSE;
   2469 }
   2470 
   2471 /**
   2472 * Checks match for contraction.
   2473 * If the match starts with a partial contraction we fail.
   2474 * Internal method, status assumed to be success, caller has to check status
   2475 * before calling this method.
   2476 * @param strsrch string search data
   2477 * @param start offset of potential match, to be modified if necessary
   2478 * @param end offset of potential match, to be modified if necessary
   2479 * @param status only error status if any
   2480 * @return TRUE if match passes the contraction test, FALSE otherwise
   2481 */
   2482 static
   2483 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2484                                      int32_t   *start,
   2485                                      int32_t   *end, UErrorCode  *status)
   2486 {
   2487           UCollationElements *coleiter   = strsrch->textIter;
   2488           int32_t             textlength = strsrch->search->textLength;
   2489           int32_t         temp       = *end;
   2490     const UCollator          *collator   = strsrch->collator;
   2491     const UChar              *text       = strsrch->search->text;
   2492     // This part checks if either if the start of the match contains potential
   2493     // contraction. If so we'll have to iterate through them
   2494     // Since we used ucol_next while previously looking for the potential
   2495     // match, this guarantees that our end will not be a partial contraction,
   2496     // or a partial supplementary character.
   2497     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2498         int32_t expansion  = getExpansionSuffix(coleiter);
   2499         UBool   expandflag = expansion > 0;
   2500         setColEIterOffset(coleiter, *end);
   2501         while (expansion > 0) {
   2502             // getting rid of the redundant ce
   2503             // since forward contraction/expansion may have extra ces
   2504             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2505             // would have taken care of it.
   2506             // E.g. the character \u01FA will have an expansion of 3, but if
   2507             // we are only looking for A ring A\u030A, we'll have to skip the
   2508             // last ce in the expansion buffer
   2509             ucol_previous(coleiter, status);
   2510             if (U_FAILURE(*status)) {
   2511                 return FALSE;
   2512             }
   2513             if (ucol_getOffset(coleiter) != temp) {
   2514                 *end = temp;
   2515                 temp  = ucol_getOffset(coleiter);
   2516             }
   2517             expansion --;
   2518         }
   2519 
   2520         int32_t  *patternce       = strsrch->pattern.CE;
   2521         int32_t   patterncelength = strsrch->pattern.CELength;
   2522         int32_t   count           = patterncelength;
   2523         while (count > 0) {
   2524             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2525             // status checked below, note that if status is a failure
   2526             // ucol_previous returns UCOL_NULLORDER
   2527             if (ce == UCOL_IGNORABLE) {
   2528                 continue;
   2529             }
   2530             if (expandflag && count == 0 &&
   2531                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2532                 *end = temp;
   2533                 temp  = ucol_getOffset(coleiter);
   2534             }
   2535             if (count == patterncelength &&
   2536                 ce != patternce[patterncelength - 1]) {
   2537                 // accents may have extra starting ces, this occurs when a
   2538                 // pure accent pattern is matched without rearrangement
   2539                 int32_t    expected = patternce[patterncelength - 1];
   2540                 UTF_BACK_1(text, 0, *end);
   2541                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2542                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2543                     while (U_SUCCESS(*status) && ce != expected &&
   2544                            ce != UCOL_NULLORDER &&
   2545                            ucol_getOffset(coleiter) <= *start) {
   2546                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2547                     }
   2548                 }
   2549             }
   2550             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2551                 (*start) --;
   2552                 *start = getPreviousBaseOffset(text, *start);
   2553                 return FALSE;
   2554             }
   2555             count --;
   2556         }
   2557     }
   2558     return TRUE;
   2559 }
   2560 
   2561 /**
   2562 * Checks and sets the match information if found.
   2563 * Checks
   2564 * <ul>
   2565 * <li> the potential match does not repeat the previous match
   2566 * <li> boundaries are correct
   2567 * <li> potential match does not end in the middle of a contraction
   2568 * <li> identical matches
   2569 * <\ul>
   2570 * Otherwise the offset will be shifted to the next character.
   2571 * Internal method, status assumed to be success, caller has to check status
   2572 * before calling this method.
   2573 * @param strsrch string search data
   2574 * @param textoffset offset in the collation element text. the returned value
   2575 *        will be the truncated start offset of the match or the new start
   2576 *        search offset.
   2577 * @param status only error status if any
   2578 * @return TRUE if the match is valid, FALSE otherwise
   2579 */
   2580 static
   2581 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2582                                          int32_t   *textoffset,
   2583                                          UErrorCode    *status)
   2584 {
   2585     // to ensure that the start and ends are not composite characters
   2586     UCollationElements *coleiter = strsrch->textIter;
   2587     // if we have a canonical accent match
   2588     if ((strsrch->pattern.hasSuffixAccents &&
   2589         strsrch->canonicalSuffixAccents[0]) ||
   2590         (strsrch->pattern.hasPrefixAccents &&
   2591         strsrch->canonicalPrefixAccents[0])) {
   2592         strsrch->search->matchedIndex  = *textoffset;
   2593         strsrch->search->matchedLength =
   2594             getNextUStringSearchBaseOffset(strsrch,
   2595                                       getColElemIterOffset(coleiter, FALSE))
   2596             - *textoffset;
   2597         return TRUE;
   2598     }
   2599 
   2600     int32_t end = ucol_getOffset(coleiter);
   2601     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2602                                                 status) ||
   2603          U_FAILURE(*status)) {
   2604         return FALSE;
   2605     }
   2606 
   2607     end = getNextUStringSearchBaseOffset(strsrch, end);
   2608     // this totally matches, however we need to check if it is repeating
   2609     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2610         !isBreakUnit(strsrch, *textoffset, end) ||
   2611         !checkIdentical(strsrch, *textoffset, end)) {
   2612         (*textoffset) --;
   2613         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2614                                             *textoffset);
   2615         return FALSE;
   2616     }
   2617 
   2618     strsrch->search->matchedIndex  = *textoffset;
   2619     strsrch->search->matchedLength = end - *textoffset;
   2620     return TRUE;
   2621 }
   2622 #endif // #if BOYER_MOORE
   2623 
   2624 // constructors and destructor -------------------------------------------
   2625 
   2626 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2627                                           int32_t         patternlength,
   2628                                     const UChar          *text,
   2629                                           int32_t         textlength,
   2630                                     const char           *locale,
   2631                                           UBreakIterator *breakiter,
   2632                                           UErrorCode     *status)
   2633 {
   2634     if (U_FAILURE(*status)) {
   2635         return NULL;
   2636     }
   2637 #if UCONFIG_NO_BREAK_ITERATION
   2638     if (breakiter != NULL) {
   2639         *status = U_UNSUPPORTED_ERROR;
   2640         return NULL;
   2641     }
   2642 #endif
   2643     if (locale) {
   2644         // ucol_open internally checks for status
   2645         UCollator     *collator = ucol_open(locale, status);
   2646         // pattern, text checks are done in usearch_openFromCollator
   2647         UStringSearch *result   = usearch_openFromCollator(pattern,
   2648                                               patternlength, text, textlength,
   2649                                               collator, breakiter, status);
   2650 
   2651         if (result == NULL || U_FAILURE(*status)) {
   2652             if (collator) {
   2653                 ucol_close(collator);
   2654             }
   2655             return NULL;
   2656         }
   2657         else {
   2658             result->ownCollator = TRUE;
   2659         }
   2660         return result;
   2661     }
   2662     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2663     return NULL;
   2664 }
   2665 
   2666 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2667                                   const UChar          *pattern,
   2668                                         int32_t         patternlength,
   2669                                   const UChar          *text,
   2670                                         int32_t         textlength,
   2671                                   const UCollator      *collator,
   2672                                         UBreakIterator *breakiter,
   2673                                         UErrorCode     *status)
   2674 {
   2675     if (U_FAILURE(*status)) {
   2676         return NULL;
   2677     }
   2678 #if UCONFIG_NO_BREAK_ITERATION
   2679     if (breakiter != NULL) {
   2680         *status = U_UNSUPPORTED_ERROR;
   2681         return NULL;
   2682     }
   2683 #endif
   2684     if (pattern == NULL || text == NULL || collator == NULL) {
   2685         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2686         return NULL;
   2687     }
   2688 
   2689     // string search does not really work when numeric collation is turned on
   2690     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2691         *status = U_UNSUPPORTED_ERROR;
   2692         return NULL;
   2693     }
   2694 
   2695     if (U_SUCCESS(*status)) {
   2696         initializeFCD(status);
   2697         if (U_FAILURE(*status)) {
   2698             return NULL;
   2699         }
   2700 
   2701         UStringSearch *result;
   2702         if (textlength == -1) {
   2703             textlength = u_strlen(text);
   2704         }
   2705         if (patternlength == -1) {
   2706             patternlength = u_strlen(pattern);
   2707         }
   2708         if (textlength <= 0 || patternlength <= 0) {
   2709             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2710             return NULL;
   2711         }
   2712 
   2713         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2714         if (result == NULL) {
   2715             *status = U_MEMORY_ALLOCATION_ERROR;
   2716             return NULL;
   2717         }
   2718 
   2719         result->collator    = collator;
   2720         result->strength    = ucol_getStrength(collator);
   2721         result->ceMask      = getMask(result->strength);
   2722         result->toShift     =
   2723              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2724                                                             UCOL_SHIFTED;
   2725         result->variableTop = ucol_getVariableTop(collator, status);
   2726 
   2727         if (U_FAILURE(*status)) {
   2728             uprv_free(result);
   2729             return NULL;
   2730         }
   2731 
   2732         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2733         if (result->search == NULL) {
   2734             *status = U_MEMORY_ALLOCATION_ERROR;
   2735             uprv_free(result);
   2736             return NULL;
   2737         }
   2738 
   2739         result->search->text       = text;
   2740         result->search->textLength = textlength;
   2741 
   2742         result->pattern.text       = pattern;
   2743         result->pattern.textLength = patternlength;
   2744         result->pattern.CE         = NULL;
   2745         result->pattern.PCE        = NULL;
   2746 
   2747         result->search->breakIter  = breakiter;
   2748 #if !UCONFIG_NO_BREAK_ITERATION
   2749         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2750         if (breakiter) {
   2751         	ubrk_setText(breakiter, text, textlength, status);
   2752         }
   2753 #endif
   2754 
   2755         result->ownCollator           = FALSE;
   2756         result->search->matchedLength = 0;
   2757         result->search->matchedIndex  = USEARCH_DONE;
   2758         result->utilIter              = NULL;
   2759         result->textIter              = ucol_openElements(collator, text,
   2760                                                           textlength, status);
   2761         if (U_FAILURE(*status)) {
   2762             usearch_close(result);
   2763             return NULL;
   2764         }
   2765 
   2766         result->search->isOverlap          = FALSE;
   2767         result->search->isCanonicalMatch   = FALSE;
   2768         result->search->isForwardSearching = TRUE;
   2769         result->search->reset              = TRUE;
   2770 
   2771         initialize(result, status);
   2772 
   2773         if (U_FAILURE(*status)) {
   2774             usearch_close(result);
   2775             return NULL;
   2776         }
   2777 
   2778         return result;
   2779     }
   2780     return NULL;
   2781 }
   2782 
   2783 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2784 {
   2785     if (strsrch) {
   2786         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
   2787             strsrch->pattern.CE) {
   2788             uprv_free(strsrch->pattern.CE);
   2789         }
   2790 
   2791         if (strsrch->pattern.PCE != NULL &&
   2792             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
   2793             uprv_free(strsrch->pattern.PCE);
   2794         }
   2795 
   2796         ucol_closeElements(strsrch->textIter);
   2797         ucol_closeElements(strsrch->utilIter);
   2798 
   2799         if (strsrch->ownCollator && strsrch->collator) {
   2800             ucol_close((UCollator *)strsrch->collator);
   2801         }
   2802 
   2803 #if !UCONFIG_NO_BREAK_ITERATION
   2804         if (strsrch->search->internalBreakIter) {
   2805         	ubrk_close(strsrch->search->internalBreakIter);
   2806         }
   2807 #endif
   2808 
   2809         uprv_free(strsrch->search);
   2810         uprv_free(strsrch);
   2811     }
   2812 }
   2813 
   2814 // set and get methods --------------------------------------------------
   2815 
   2816 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2817                                         int32_t    position,
   2818                                         UErrorCode    *status)
   2819 {
   2820     if (U_SUCCESS(*status) && strsrch) {
   2821         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2822             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2823         }
   2824         else {
   2825             setColEIterOffset(strsrch->textIter, position);
   2826         }
   2827         strsrch->search->matchedIndex  = USEARCH_DONE;
   2828         strsrch->search->matchedLength = 0;
   2829         strsrch->search->reset         = FALSE;
   2830     }
   2831 }
   2832 
   2833 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2834 {
   2835     if (strsrch) {
   2836         int32_t result = ucol_getOffset(strsrch->textIter);
   2837         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2838             return USEARCH_DONE;
   2839         }
   2840         return result;
   2841     }
   2842     return USEARCH_DONE;
   2843 }
   2844 
   2845 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2846                                  USearchAttribute attribute,
   2847                                  USearchAttributeValue value,
   2848                                  UErrorCode *status)
   2849 {
   2850     if (U_SUCCESS(*status) && strsrch) {
   2851         switch (attribute)
   2852         {
   2853         case USEARCH_OVERLAP :
   2854             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2855             break;
   2856         case USEARCH_CANONICAL_MATCH :
   2857             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2858                                                                       FALSE);
   2859             break;
   2860         case USEARCH_ATTRIBUTE_COUNT :
   2861         default:
   2862             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2863         }
   2864     }
   2865     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2866         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2867     }
   2868 }
   2869 
   2870 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2871                                                 const UStringSearch *strsrch,
   2872                                                 USearchAttribute attribute)
   2873 {
   2874     if (strsrch) {
   2875         switch (attribute) {
   2876         case USEARCH_OVERLAP :
   2877             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2878                                                         USEARCH_OFF);
   2879         case USEARCH_CANONICAL_MATCH :
   2880             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2881                                                                USEARCH_OFF);
   2882         case USEARCH_ATTRIBUTE_COUNT :
   2883             return USEARCH_DEFAULT;
   2884         }
   2885     }
   2886     return USEARCH_DEFAULT;
   2887 }
   2888 
   2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2890                                                 const UStringSearch *strsrch)
   2891 {
   2892     if (strsrch == NULL) {
   2893         return USEARCH_DONE;
   2894     }
   2895     return strsrch->search->matchedIndex;
   2896 }
   2897 
   2898 
   2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2900                                             UChar         *result,
   2901                                             int32_t        resultCapacity,
   2902                                             UErrorCode    *status)
   2903 {
   2904     if (U_FAILURE(*status)) {
   2905         return USEARCH_DONE;
   2906     }
   2907     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2908         result == NULL)) {
   2909         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2910         return USEARCH_DONE;
   2911     }
   2912 
   2913     int32_t     copylength = strsrch->search->matchedLength;
   2914     int32_t copyindex  = strsrch->search->matchedIndex;
   2915     if (copyindex == USEARCH_DONE) {
   2916         u_terminateUChars(result, resultCapacity, 0, status);
   2917         return USEARCH_DONE;
   2918     }
   2919 
   2920     if (resultCapacity < copylength) {
   2921         copylength = resultCapacity;
   2922     }
   2923     if (copylength > 0) {
   2924         uprv_memcpy(result, strsrch->search->text + copyindex,
   2925                     copylength * sizeof(UChar));
   2926     }
   2927     return u_terminateUChars(result, resultCapacity,
   2928                              strsrch->search->matchedLength, status);
   2929 }
   2930 
   2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2932                                               const UStringSearch *strsrch)
   2933 {
   2934     if (strsrch) {
   2935         return strsrch->search->matchedLength;
   2936     }
   2937     return USEARCH_DONE;
   2938 }
   2939 
   2940 #if !UCONFIG_NO_BREAK_ITERATION
   2941 
   2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2943                                                UBreakIterator *breakiter,
   2944                                                UErrorCode     *status)
   2945 {
   2946     if (U_SUCCESS(*status) && strsrch) {
   2947     	strsrch->search->breakIter = breakiter;
   2948         if (breakiter) {
   2949             ubrk_setText(breakiter, strsrch->search->text,
   2950                          strsrch->search->textLength, status);
   2951         }
   2952     }
   2953 }
   2954 
   2955 U_CAPI const UBreakIterator* U_EXPORT2
   2956 usearch_getBreakIterator(const UStringSearch *strsrch)
   2957 {
   2958     if (strsrch) {
   2959         return strsrch->search->breakIter;
   2960     }
   2961     return NULL;
   2962 }
   2963 
   2964 #endif
   2965 
   2966 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2967                                       const UChar         *text,
   2968                                             int32_t        textlength,
   2969                                             UErrorCode    *status)
   2970 {
   2971     if (U_SUCCESS(*status)) {
   2972         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2973             textlength == 0) {
   2974             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2975         }
   2976         else {
   2977             if (textlength == -1) {
   2978                 textlength = u_strlen(text);
   2979             }
   2980             strsrch->search->text       = text;
   2981             strsrch->search->textLength = textlength;
   2982             ucol_setText(strsrch->textIter, text, textlength, status);
   2983             strsrch->search->matchedIndex  = USEARCH_DONE;
   2984             strsrch->search->matchedLength = 0;
   2985             strsrch->search->reset         = TRUE;
   2986 #if !UCONFIG_NO_BREAK_ITERATION
   2987             if (strsrch->search->breakIter != NULL) {
   2988                 ubrk_setText(strsrch->search->breakIter, text,
   2989                              textlength, status);
   2990             }
   2991             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2992 #endif
   2993         }
   2994     }
   2995 }
   2996 
   2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   2998                                                      int32_t       *length)
   2999 {
   3000     if (strsrch) {
   3001         *length = strsrch->search->textLength;
   3002         return strsrch->search->text;
   3003     }
   3004     return NULL;
   3005 }
   3006 
   3007 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3008                                           const UCollator     *collator,
   3009                                                 UErrorCode    *status)
   3010 {
   3011     if (U_SUCCESS(*status)) {
   3012         if (collator == NULL) {
   3013             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3014             return;
   3015         }
   3016 
   3017         if (strsrch) {
   3018             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3019                 ucol_close((UCollator *)strsrch->collator);
   3020                 strsrch->ownCollator = FALSE;
   3021             }
   3022             strsrch->collator    = collator;
   3023             strsrch->strength    = ucol_getStrength(collator);
   3024             strsrch->ceMask      = getMask(strsrch->strength);
   3025 #if !UCONFIG_NO_BREAK_ITERATION
   3026         	ubrk_close(strsrch->search->internalBreakIter);
   3027         	strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3028         											 strsrch->search->text, strsrch->search->textLength, status);
   3029 #endif
   3030             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3031             strsrch->toShift     =
   3032                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3033                                                                 UCOL_SHIFTED;
   3034             // if status is a failure, ucol_getVariableTop returns 0
   3035             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3036             if (U_SUCCESS(*status)) {
   3037                 initialize(strsrch, status);
   3038                 if (U_SUCCESS(*status)) {
   3039                     /* free offset buffer to avoid memory leak before initializing. */
   3040                     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
   3041                     uprv_init_collIterate(collator, strsrch->search->text,
   3042                                           strsrch->search->textLength,
   3043                                           &(strsrch->textIter->iteratordata_));
   3044                     strsrch->utilIter->iteratordata_.coll = collator;
   3045                 }
   3046             }
   3047         }
   3048 
   3049         // **** are these calls needed?
   3050         // **** we call uprv_init_pce in initializePatternPCETable
   3051         // **** and the CEBuffer constructor...
   3052 #if 0
   3053         uprv_init_pce(strsrch->textIter);
   3054         uprv_init_pce(strsrch->utilIter);
   3055 #endif
   3056     }
   3057 }
   3058 
   3059 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3060 {
   3061     if (strsrch) {
   3062         return (UCollator *)strsrch->collator;
   3063     }
   3064     return NULL;
   3065 }
   3066 
   3067 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3068                                          const UChar         *pattern,
   3069                                                int32_t        patternlength,
   3070                                                UErrorCode    *status)
   3071 {
   3072     if (U_SUCCESS(*status)) {
   3073         if (strsrch == NULL || pattern == NULL) {
   3074             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3075         }
   3076         else {
   3077             if (patternlength == -1) {
   3078                 patternlength = u_strlen(pattern);
   3079             }
   3080             if (patternlength == 0) {
   3081                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3082                 return;
   3083             }
   3084             strsrch->pattern.text       = pattern;
   3085             strsrch->pattern.textLength = patternlength;
   3086             initialize(strsrch, status);
   3087         }
   3088     }
   3089 }
   3090 
   3091 U_CAPI const UChar* U_EXPORT2
   3092 usearch_getPattern(const UStringSearch *strsrch,
   3093                    int32_t       *length)
   3094 {
   3095     if (strsrch) {
   3096         *length = strsrch->pattern.textLength;
   3097         return strsrch->pattern.text;
   3098     }
   3099     return NULL;
   3100 }
   3101 
   3102 // miscellanous methods --------------------------------------------------
   3103 
   3104 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3105                                            UErrorCode    *status)
   3106 {
   3107     if (strsrch && U_SUCCESS(*status)) {
   3108         strsrch->search->isForwardSearching = TRUE;
   3109         usearch_setOffset(strsrch, 0, status);
   3110         if (U_SUCCESS(*status)) {
   3111             return usearch_next(strsrch, status);
   3112         }
   3113     }
   3114     return USEARCH_DONE;
   3115 }
   3116 
   3117 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3118                                                int32_t    position,
   3119                                                UErrorCode    *status)
   3120 {
   3121     if (strsrch && U_SUCCESS(*status)) {
   3122         strsrch->search->isForwardSearching = TRUE;
   3123         // position checked in usearch_setOffset
   3124         usearch_setOffset(strsrch, position, status);
   3125         if (U_SUCCESS(*status)) {
   3126             return usearch_next(strsrch, status);
   3127         }
   3128     }
   3129     return USEARCH_DONE;
   3130 }
   3131 
   3132 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3133                                           UErrorCode    *status)
   3134 {
   3135     if (strsrch && U_SUCCESS(*status)) {
   3136         strsrch->search->isForwardSearching = FALSE;
   3137         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3138         if (U_SUCCESS(*status)) {
   3139             return usearch_previous(strsrch, status);
   3140         }
   3141     }
   3142     return USEARCH_DONE;
   3143 }
   3144 
   3145 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3146                                                int32_t    position,
   3147                                                UErrorCode    *status)
   3148 {
   3149     if (strsrch && U_SUCCESS(*status)) {
   3150         strsrch->search->isForwardSearching = FALSE;
   3151         // position checked in usearch_setOffset
   3152         usearch_setOffset(strsrch, position, status);
   3153         if (U_SUCCESS(*status)) {
   3154             return usearch_previous(strsrch, status);
   3155         }
   3156     }
   3157     return USEARCH_DONE;
   3158 }
   3159 
   3160 /**
   3161 * If a direction switch is required, we'll count the number of ces till the
   3162 * beginning of the collation element iterator and iterate forwards that
   3163 * number of times. This is so that we get to the correct point within the
   3164 * string to continue the search in. Imagine when we are in the middle of the
   3165 * normalization buffer when the change in direction is request. arrrgghh....
   3166 * After searching the offset within the collation element iterator will be
   3167 * shifted to the start of the match. If a match is not found, the offset would
   3168 * have been set to the end of the text string in the collation element
   3169 * iterator.
   3170 * Okay, here's my take on normalization buffer. The only time when there can
   3171 * be 2 matches within the same normalization is when the pattern is consists
   3172 * of all accents. But since the offset returned is from the text string, we
   3173 * should not confuse the caller by returning the second match within the
   3174 * same normalization buffer. If we do, the 2 results will have the same match
   3175 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3176 * fall within the same normalization buffer. Note this does not affect the
   3177 * results of matches spanning the text and the normalization buffer.
   3178 * The position to start searching is taken from the collation element
   3179 * iterator. Callers of this API would have to set the offset in the collation
   3180 * element iterator before using this method.
   3181 */
   3182 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3183                                           UErrorCode    *status)
   3184 {
   3185     if (U_SUCCESS(*status) && strsrch) {
   3186         // note offset is either equivalent to the start of the previous match
   3187         // or is set by the user
   3188         int32_t      offset       = usearch_getOffset(strsrch);
   3189         USearch     *search       = strsrch->search;
   3190         search->reset             = FALSE;
   3191         int32_t      textlength   = search->textLength;
   3192         if (search->isForwardSearching) {
   3193 #if BOYER_MOORE
   3194             if (offset == textlength
   3195                 || (!search->isOverlap &&
   3196                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3197                     (search->matchedIndex != USEARCH_DONE &&
   3198                      offset + search->matchedLength >= textlength)))) {
   3199                 // not enough characters to match
   3200                 setMatchNotFound(strsrch);
   3201                 return USEARCH_DONE;
   3202             }
   3203 #else
   3204             if (offset == textlength ||
   3205                 (! search->isOverlap &&
   3206                 (search->matchedIndex != USEARCH_DONE &&
   3207                 offset + search->matchedLength > textlength))) {
   3208                     // not enough characters to match
   3209                     setMatchNotFound(strsrch);
   3210                     return USEARCH_DONE;
   3211             }
   3212 #endif
   3213         }
   3214         else {
   3215             // switching direction.
   3216             // if matchedIndex == USEARCH_DONE, it means that either a
   3217             // setOffset has been called or that previous ran off the text
   3218             // string. the iterator would have been set to offset 0 if a
   3219             // match is not found.
   3220             search->isForwardSearching = TRUE;
   3221             if (search->matchedIndex != USEARCH_DONE) {
   3222                 // there's no need to set the collation element iterator
   3223                 // the next call to next will set the offset.
   3224                 return search->matchedIndex;
   3225             }
   3226         }
   3227 
   3228         if (U_SUCCESS(*status)) {
   3229             if (strsrch->pattern.CELength == 0) {
   3230                 if (search->matchedIndex == USEARCH_DONE) {
   3231                     search->matchedIndex = offset;
   3232                 }
   3233                 else { // moves by codepoints
   3234                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
   3235                 }
   3236 
   3237                 search->matchedLength = 0;
   3238                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3239                 // status checked below
   3240                 if (search->matchedIndex == textlength) {
   3241                     search->matchedIndex = USEARCH_DONE;
   3242                 }
   3243             }
   3244             else {
   3245                 if (search->matchedLength > 0) {
   3246                     // if matchlength is 0 we are at the start of the iteration
   3247                     if (search->isOverlap) {
   3248                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3249                     }
   3250                     else {
   3251                         ucol_setOffset(strsrch->textIter,
   3252                                        offset + search->matchedLength, status);
   3253                     }
   3254                 }
   3255                 else {
   3256                     // for boundary check purposes. this will ensure that the
   3257                     // next match will not preceed the current offset
   3258                     // note search->matchedIndex will always be set to something
   3259                     // in the code
   3260                     search->matchedIndex = offset - 1;
   3261                 }
   3262 
   3263                 if (search->isCanonicalMatch) {
   3264                     // can't use exact here since extra accents are allowed.
   3265                     usearch_handleNextCanonical(strsrch, status);
   3266                 }
   3267                 else {
   3268                     usearch_handleNextExact(strsrch, status);
   3269                 }
   3270             }
   3271 
   3272             if (U_FAILURE(*status)) {
   3273                 return USEARCH_DONE;
   3274             }
   3275 
   3276 #if !BOYER_MOORE
   3277             if (search->matchedIndex == USEARCH_DONE) {
   3278                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3279             } else {
   3280                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3281             }
   3282 #endif
   3283 
   3284             return search->matchedIndex;
   3285         }
   3286     }
   3287     return USEARCH_DONE;
   3288 }
   3289 
   3290 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3291                                               UErrorCode *status)
   3292 {
   3293     if (U_SUCCESS(*status) && strsrch) {
   3294         int32_t offset;
   3295         USearch *search = strsrch->search;
   3296         if (search->reset) {
   3297             offset                     = search->textLength;
   3298             search->isForwardSearching = FALSE;
   3299             search->reset              = FALSE;
   3300             setColEIterOffset(strsrch->textIter, offset);
   3301         }
   3302         else {
   3303             offset = usearch_getOffset(strsrch);
   3304         }
   3305 
   3306         int32_t matchedindex = search->matchedIndex;
   3307         if (search->isForwardSearching == TRUE) {
   3308             // switching direction.
   3309             // if matchedIndex == USEARCH_DONE, it means that either a
   3310             // setOffset has been called or that next ran off the text
   3311             // string. the iterator would have been set to offset textLength if
   3312             // a match is not found.
   3313             search->isForwardSearching = FALSE;
   3314             if (matchedindex != USEARCH_DONE) {
   3315                 return matchedindex;
   3316             }
   3317         }
   3318         else {
   3319 #if BOYER_MOORE
   3320             if (offset == 0 || matchedindex == 0 ||
   3321                 (!search->isOverlap &&
   3322                     (offset < strsrch->pattern.defaultShiftSize ||
   3323                     (matchedindex != USEARCH_DONE &&
   3324                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3325                 // not enough characters to match
   3326                 setMatchNotFound(strsrch);
   3327                 return USEARCH_DONE;
   3328             }
   3329 #else
   3330             // Could check pattern length, but the
   3331             // linear search will do the right thing
   3332             if (offset == 0 || matchedindex == 0) {
   3333                 setMatchNotFound(strsrch);
   3334                 return USEARCH_DONE;
   3335             }
   3336 #endif
   3337         }
   3338 
   3339         if (U_SUCCESS(*status)) {
   3340             if (strsrch->pattern.CELength == 0) {
   3341                 search->matchedIndex =
   3342                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3343                 if (search->matchedIndex == 0) {
   3344                     setMatchNotFound(strsrch);
   3345                     // status checked below
   3346                 }
   3347                 else { // move by codepoints
   3348                     UTF_BACK_1(search->text, 0, search->matchedIndex);
   3349                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3350                     // status checked below
   3351                     search->matchedLength = 0;
   3352                 }
   3353             }
   3354             else {
   3355 #if !BOYER_MOORE
   3356                 if (search->matchedIndex != USEARCH_DONE) {
   3357                     if (search->isOverlap) {
   3358                         ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);
   3359                     }
   3360                 }
   3361 #endif
   3362 
   3363                 if (strsrch->search->isCanonicalMatch) {
   3364                     // can't use exact here since extra accents are allowed.
   3365                     usearch_handlePreviousCanonical(strsrch, status);
   3366                     // status checked below
   3367                 }
   3368                 else {
   3369                     usearch_handlePreviousExact(strsrch, status);
   3370                     // status checked below
   3371                 }
   3372             }
   3373 
   3374             if (U_FAILURE(*status)) {
   3375                 return USEARCH_DONE;
   3376             }
   3377 
   3378             return search->matchedIndex;
   3379         }
   3380     }
   3381     return USEARCH_DONE;
   3382 }
   3383 
   3384 
   3385 
   3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3387 {
   3388     /*
   3389     reset is setting the attributes that are already in
   3390     string search, hence all attributes in the collator should
   3391     be retrieved without any problems
   3392     */
   3393     if (strsrch) {
   3394         UErrorCode status            = U_ZERO_ERROR;
   3395         UBool      sameCollAttribute = TRUE;
   3396         uint32_t   ceMask;
   3397         UBool      shift;
   3398         uint32_t   varTop;
   3399 
   3400         // **** hack to deal w/ how processed CEs encode quaternary ****
   3401         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3402         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3403             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3404                 sameCollAttribute = FALSE;
   3405         }
   3406 
   3407         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3408         ceMask = getMask(strsrch->strength);
   3409         if (strsrch->ceMask != ceMask) {
   3410             strsrch->ceMask = ceMask;
   3411             sameCollAttribute = FALSE;
   3412         }
   3413 
   3414         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3415         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3416                                   &status) == UCOL_SHIFTED;
   3417         if (strsrch->toShift != shift) {
   3418             strsrch->toShift  = shift;
   3419             sameCollAttribute = FALSE;
   3420         }
   3421 
   3422         // if status is a failure, ucol_getVariableTop returns 0
   3423         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3424         if (strsrch->variableTop != varTop) {
   3425             strsrch->variableTop = varTop;
   3426             sameCollAttribute    = FALSE;
   3427         }
   3428         if (!sameCollAttribute) {
   3429             initialize(strsrch, &status);
   3430         }
   3431         /* free offset buffer to avoid memory leak before initializing. */
   3432         ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
   3433         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
   3434                               strsrch->search->textLength,
   3435                               &(strsrch->textIter->iteratordata_));
   3436         strsrch->search->matchedLength      = 0;
   3437         strsrch->search->matchedIndex       = USEARCH_DONE;
   3438         strsrch->search->isOverlap          = FALSE;
   3439         strsrch->search->isCanonicalMatch   = FALSE;
   3440         strsrch->search->isForwardSearching = TRUE;
   3441         strsrch->search->reset              = TRUE;
   3442     }
   3443 }
   3444 
   3445 //
   3446 //  CEI  Collation Element + source text index.
   3447 //       These structs are kept in the circular buffer.
   3448 //
   3449 struct  CEI {
   3450     int64_t ce;
   3451     int32_t lowIndex;
   3452     int32_t highIndex;
   3453 };
   3454 
   3455 U_NAMESPACE_BEGIN
   3456 
   3457 
   3458 //
   3459 //  CEBuffer   A circular buffer of CEs from the text being searched.
   3460 //
   3461 #define   DEFAULT_CEBUFFER_SIZE 50
   3462 struct CEBuffer {
   3463     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3464     CEI                 *buf;
   3465     int32_t              bufSize;
   3466     int32_t              firstIx;
   3467     int32_t              limitIx;
   3468     UCollationElements  *ceIter;
   3469     UStringSearch       *strSearch;
   3470 
   3471 
   3472 
   3473                CEBuffer(UStringSearch *ss, UErrorCode *status);
   3474                ~CEBuffer();
   3475    const CEI   *get(int32_t index);
   3476    const CEI   *getPrevious(int32_t index);
   3477 };
   3478 
   3479 
   3480 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
   3481     buf = defBuf;
   3482     strSearch = ss;
   3483     bufSize = ss->pattern.CELength+10;
   3484     ceIter    = ss->textIter;
   3485     firstIx = 0;
   3486     limitIx = 0;
   3487 
   3488     uprv_init_pce(ceIter);
   3489 
   3490     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3491         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3492         if (buf == NULL) {
   3493             *status = U_MEMORY_ALLOCATION_ERROR;
   3494         }
   3495     }
   3496 }
   3497 
   3498 // TODO: add a reset or init function so that allocated
   3499 //       buffers can be retained & reused.
   3500 
   3501 CEBuffer::~CEBuffer() {
   3502     if (buf != defBuf) {
   3503         uprv_free(buf);
   3504     }
   3505 }
   3506 
   3507 
   3508 // Get the CE with the specified index.
   3509 //   Index must be in the range
   3510 //          n-history_size < index < n+1
   3511 //   where n is the largest index to have been fetched by some previous call to this function.
   3512 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3513 //
   3514 const CEI *CEBuffer::get(int32_t index) {
   3515     int i = index % bufSize;
   3516 
   3517     if (index>=firstIx && index<limitIx) {
   3518         // The request was for an entry already in our buffer.
   3519         //  Just return it.
   3520         return &buf[i];
   3521     }
   3522 
   3523     // Caller is requesting a new, never accessed before, CE.
   3524     //   Verify that it is the next one in sequence, which is all
   3525     //   that is allowed.
   3526     if (index != limitIx) {
   3527         U_ASSERT(FALSE);
   3528 
   3529         return NULL;
   3530     }
   3531 
   3532     // Manage the circular CE buffer indexing
   3533     limitIx++;
   3534 
   3535     if (limitIx - firstIx >= bufSize) {
   3536         // The buffer is full, knock out the lowest-indexed entry.
   3537         firstIx++;
   3538     }
   3539 
   3540     UErrorCode status = U_ZERO_ERROR;
   3541 
   3542     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
   3543 
   3544     return &buf[i];
   3545 }
   3546 
   3547 // Get the CE with the specified index.
   3548 //   Index must be in the range
   3549 //          n-history_size < index < n+1
   3550 //   where n is the largest index to have been fetched by some previous call to this function.
   3551 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3552 //
   3553 const CEI *CEBuffer::getPrevious(int32_t index) {
   3554     int i = index % bufSize;
   3555 
   3556     if (index>=firstIx && index<limitIx) {
   3557         // The request was for an entry already in our buffer.
   3558         //  Just return it.
   3559         return &buf[i];
   3560     }
   3561 
   3562     // Caller is requesting a new, never accessed before, CE.
   3563     //   Verify that it is the next one in sequence, which is all
   3564     //   that is allowed.
   3565     if (index != limitIx) {
   3566         U_ASSERT(FALSE);
   3567 
   3568         return NULL;
   3569     }
   3570 
   3571     // Manage the circular CE buffer indexing
   3572     limitIx++;
   3573 
   3574     if (limitIx - firstIx >= bufSize) {
   3575         // The buffer is full, knock out the lowest-indexed entry.
   3576         firstIx++;
   3577     }
   3578 
   3579     UErrorCode status = U_ZERO_ERROR;
   3580 
   3581     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
   3582 
   3583     return &buf[i];
   3584 }
   3585 
   3586 U_NAMESPACE_END
   3587 
   3588 
   3589 // #define USEARCH_DEBUG
   3590 
   3591 #ifdef USEARCH_DEBUG
   3592 #include <stdio.h>
   3593 #include <stdlib.h>
   3594 #endif
   3595 
   3596 /*
   3597  * Find the next break boundary after startIndex. If the UStringSearch object
   3598  * has an external break iterator, use that. Otherwise use the internal character
   3599  * break iterator.
   3600  */
   3601 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3602 #if 0
   3603     const UChar *text = strsrch->search->text;
   3604     int32_t textLen   = strsrch->search->textLength;
   3605 
   3606     U_ASSERT(startIndex>=0);
   3607     U_ASSERT(startIndex<=textLen);
   3608 
   3609     if (startIndex >= textLen) {
   3610         return startIndex;
   3611     }
   3612 
   3613     UChar32  c;
   3614     int32_t  i = startIndex;
   3615     U16_NEXT(text, i, textLen, c);
   3616 
   3617     // If we are on a control character, stop without looking for combining marks.
   3618     //    Control characters do not combine.
   3619     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3620     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3621         return i;
   3622     }
   3623 
   3624     // The initial character was not a control, and can thus accept trailing
   3625     //   combining characters.  Advance over however many of them there are.
   3626     int32_t  indexOfLastCharChecked;
   3627     for (;;) {
   3628         indexOfLastCharChecked = i;
   3629         if (i>=textLen) {
   3630             break;
   3631         }
   3632         U16_NEXT(text, i, textLen, c);
   3633         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3634         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3635             break;
   3636         }
   3637     }
   3638     return indexOfLastCharChecked;
   3639 #elif !UCONFIG_NO_BREAK_ITERATION
   3640     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3641 
   3642     if (breakiterator == NULL) {
   3643         breakiterator = strsrch->search->internalBreakIter;
   3644     }
   3645 
   3646     if (breakiterator != NULL) {
   3647     	return ubrk_following(breakiterator, startIndex);
   3648     }
   3649 
   3650     return startIndex;
   3651 #else
   3652     // **** or should we use the original code? ****
   3653     return startIndex;
   3654 #endif
   3655 
   3656 }
   3657 
   3658 /*
   3659  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3660  * has an external break iterator, test using that, otherwise test
   3661  * using the internal character break iterator.
   3662  */
   3663 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3664 #if 0
   3665     const UChar *text = strsrch->search->text;
   3666     int32_t textLen   = strsrch->search->textLength;
   3667 
   3668     U_ASSERT(index>=0);
   3669     U_ASSERT(index<=textLen);
   3670 
   3671     if (index>=textLen || index<=0) {
   3672         return FALSE;
   3673     }
   3674 
   3675     // If the character at the current index is not a GRAPHEME_EXTEND
   3676     //    then we can not be within a combining sequence.
   3677     UChar32  c;
   3678     U16_GET(text, 0, index, textLen, c);
   3679     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3680     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3681         return FALSE;
   3682     }
   3683 
   3684     // We are at a combining mark.  If the preceding character is anything
   3685     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3686     U16_PREV(text, 0, index, c);
   3687     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3688     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3689     return combining;
   3690 #elif !UCONFIG_NO_BREAK_ITERATION
   3691     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3692 
   3693     if (breakiterator == NULL) {
   3694         breakiterator = strsrch->search->internalBreakIter;
   3695     }
   3696 
   3697     return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
   3698 #else
   3699     // **** or use the original code? ****
   3700     return FALSE;
   3701 #endif
   3702 }
   3703 
   3704 #if 0
   3705 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3706 {
   3707 #if !UCONFIG_NO_BREAK_ITERATION
   3708     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3709 
   3710     if (breakiterator != NULL) {
   3711         int32_t startindex = ubrk_first(breakiterator);
   3712         int32_t endindex   = ubrk_last(breakiterator);
   3713 
   3714         // out-of-range indexes are never boundary positions
   3715         if (start < startindex || start > endindex ||
   3716             end < startindex || end > endindex) {
   3717             return FALSE;
   3718         }
   3719 
   3720         return ubrk_isBoundary(breakiterator, start) &&
   3721                ubrk_isBoundary(breakiterator, end);
   3722     }
   3723 #endif
   3724 
   3725     return TRUE;
   3726 }
   3727 #endif
   3728 
   3729 
   3730 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3731                                        int32_t        startIdx,
   3732                                        int32_t        *matchStart,
   3733                                        int32_t        *matchLimit,
   3734                                        UErrorCode     *status)
   3735 {
   3736     if (U_FAILURE(*status)) {
   3737         return FALSE;
   3738     }
   3739 
   3740     // TODO:  reject search patterns beginning with a combining char.
   3741 
   3742 #ifdef USEARCH_DEBUG
   3743     if (getenv("USEARCH_DEBUG") != NULL) {
   3744         printf("Pattern CEs\n");
   3745         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   3746             printf(" %8x", strsrch->pattern.CE[ii]);
   3747         }
   3748         printf("\n");
   3749     }
   3750 
   3751 #endif
   3752     // Input parameter sanity check.
   3753     //  TODO:  should input indicies clip to the text length
   3754     //         in the same way that UText does.
   3755     if(strsrch->pattern.CELength == 0         ||
   3756        startIdx < 0                           ||
   3757        startIdx > strsrch->search->textLength ||
   3758        strsrch->pattern.CE == NULL) {
   3759            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3760            return FALSE;
   3761     }
   3762 
   3763     if (strsrch->pattern.PCE == NULL) {
   3764         initializePatternPCETable(strsrch, status);
   3765     }
   3766 
   3767     ucol_setOffset(strsrch->textIter, startIdx, status);
   3768     CEBuffer ceb(strsrch, status);
   3769 
   3770 
   3771     int32_t    targetIx = 0;
   3772     const CEI *targetCEI;
   3773     int32_t    patIx;
   3774     UBool      found;
   3775 
   3776     int32_t  mStart = -1;
   3777     int32_t  mLimit = -1;
   3778     int32_t  minLimit;
   3779     int32_t  maxLimit;
   3780 
   3781 
   3782 
   3783     // Outer loop moves over match starting positions in the
   3784     //      target CE space.
   3785     for(targetIx=0; ; targetIx++)
   3786     {
   3787         found = TRUE;
   3788         //  Inner loop checks for a match beginning at each
   3789         //  position from the outer loop.
   3790         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
   3791             int64_t patCE = strsrch->pattern.PCE[patIx];
   3792             targetCEI = ceb.get(targetIx+patIx);
   3793             //  Compare CE from target string with CE from the pattern.
   3794             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   3795             //    which will fail the compare, below.
   3796             if (targetCEI->ce != patCE) {
   3797                 found = FALSE;
   3798                 break;
   3799             }
   3800         }
   3801 
   3802         if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3803             // No match at this targetIx.  Try again at the next.
   3804             continue;
   3805         }
   3806 
   3807         if (!found) {
   3808             // No match at all, we have run off the end of the target text.
   3809             break;
   3810         }
   3811 
   3812 
   3813         // We have found a match in CE space.
   3814         // Now determine the bounds in string index space.
   3815         //  There still is a chance of match failure if the CE range not correspond to
   3816         //     an acceptable character range.
   3817         //
   3818         const CEI *firstCEI = ceb.get(targetIx);
   3819         const CEI *lastCEI  = ceb.get(targetIx + strsrch->pattern.PCELength - 1);
   3820         const CEI *nextCEI  = ceb.get(targetIx + strsrch->pattern.PCELength);
   3821 
   3822         mStart   = firstCEI->lowIndex;
   3823         minLimit = lastCEI->lowIndex;
   3824         maxLimit = nextCEI->lowIndex;
   3825 
   3826         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3827         //   extended to the end of input, and the match is good.
   3828 
   3829         // Look at the high and low indices of the CE following the match. If
   3830         // they are the same it means one of two things:
   3831         //    1. The match extended to the last CE from the target text, which is OK, or
   3832         //    2. The last CE that was part of the match is in an expansion that extends
   3833         //       to the first CE after the match. In this case, we reject the match.
   3834         if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3835             found = FALSE;
   3836         }
   3837 
   3838 
   3839         // Check for the start of the match being within a combining sequence.
   3840         //   This can happen if the pattern itself begins with a combining char, and
   3841         //   the match found combining marks in the target text that were attached
   3842         //    to something else.
   3843         //   This type of match should be rejected for not completely consuming a
   3844         //   combining sequence.
   3845         if (isBreakBoundary(strsrch, mStart)) {
   3846             found = FALSE;
   3847         }
   3848 
   3849         // Check for the start of the match being within an Collation Element Expansion,
   3850         //   meaning that the first char of the match is only partially matched.
   3851         //   With exapnsions, the first CE will report the index of the source
   3852         //   character, and all subsequent (expansions) CEs will report the source index of the
   3853         //    _following_ character.
   3854         int32_t secondIx = firstCEI->highIndex;
   3855         if (mStart == secondIx) {
   3856             found = FALSE;
   3857         }
   3858 
   3859         //  Advance the match end position to the first acceptable match boundary.
   3860         //    This advances the index over any combining charcters.
   3861         mLimit = maxLimit;
   3862         if (minLimit < maxLimit) {
   3863             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   3864 
   3865             if (nba >= lastCEI->highIndex) {
   3866                 mLimit = nba;
   3867             }
   3868         }
   3869 
   3870     #ifdef USEARCH_DEBUG
   3871         if (getenv("USEARCH_DEBUG") != NULL) {
   3872             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   3873         }
   3874     #endif
   3875 
   3876         // If advancing to the end of a combining sequence in character indexing space
   3877         //   advanced us beyond the end of the match in CE space, reject this match.
   3878         if (mLimit > maxLimit) {
   3879             found = FALSE;
   3880         }
   3881 
   3882         if (isBreakBoundary(strsrch, mLimit)) {
   3883             found = FALSE;
   3884         }
   3885 
   3886         if (! checkIdentical(strsrch, mStart, mLimit)) {
   3887             found = FALSE;
   3888         }
   3889 
   3890         if (found) {
   3891             break;
   3892         }
   3893     }
   3894 
   3895     #ifdef USEARCH_DEBUG
   3896     if (getenv("USEARCH_DEBUG") != NULL) {
   3897         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   3898         int32_t  lastToPrint = ceb.limitIx+2;
   3899         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   3900             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   3901         }
   3902         printf("\n%s\n", found? "match found" : "no match");
   3903     }
   3904     #endif
   3905 
   3906     // All Done.  Store back the match bounds to the caller.
   3907     //
   3908     if (found==FALSE) {
   3909         mLimit = -1;
   3910         mStart = -1;
   3911     }
   3912 
   3913     if (matchStart != NULL) {
   3914         *matchStart= mStart;
   3915     }
   3916 
   3917     if (matchLimit != NULL) {
   3918         *matchLimit = mLimit;
   3919     }
   3920 
   3921     return found;
   3922 }
   3923 
   3924 
   3925 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   3926                                                 int32_t        startIdx,
   3927                                                 int32_t        *matchStart,
   3928                                                 int32_t        *matchLimit,
   3929                                                 UErrorCode     *status)
   3930 {
   3931     if (U_FAILURE(*status)) {
   3932         return FALSE;
   3933     }
   3934 
   3935     // TODO:  reject search patterns beginning with a combining char.
   3936 
   3937 #ifdef USEARCH_DEBUG
   3938     if (getenv("USEARCH_DEBUG") != NULL) {
   3939         printf("Pattern CEs\n");
   3940         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   3941             printf(" %8x", strsrch->pattern.CE[ii]);
   3942         }
   3943         printf("\n");
   3944     }
   3945 
   3946 #endif
   3947     // Input parameter sanity check.
   3948     //  TODO:  should input indicies clip to the text length
   3949     //         in the same way that UText does.
   3950     if(strsrch->pattern.CELength == 0         ||
   3951        startIdx < 0                           ||
   3952        startIdx > strsrch->search->textLength ||
   3953        strsrch->pattern.CE == NULL) {
   3954            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3955            return FALSE;
   3956     }
   3957 
   3958     if (strsrch->pattern.PCE == NULL) {
   3959         initializePatternPCETable(strsrch, status);
   3960     }
   3961 
   3962     CEBuffer ceb(strsrch, status);
   3963     int32_t    targetIx = 0;
   3964 
   3965     /*
   3966      * Pre-load the buffer with the CE's for the grapheme
   3967      * after our starting position so that we're sure that
   3968      * we can look at the CE following the match when we
   3969      * check the match boundaries.
   3970      *
   3971      * This will also pre-fetch the first CE that we'll
   3972      * consider for the match.
   3973      */
   3974     if (startIdx < strsrch->search->textLength) {
   3975         UBreakIterator *bi = strsrch->search->internalBreakIter;
   3976         int32_t next = ubrk_following(bi, startIdx);
   3977 
   3978         ucol_setOffset(strsrch->textIter, next, status);
   3979 
   3980         for (targetIx = 0; ; targetIx += 1) {
   3981             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   3982                 break;
   3983             }
   3984         }
   3985     } else {
   3986         ucol_setOffset(strsrch->textIter, startIdx, status);
   3987     }
   3988 
   3989 
   3990    const CEI  *targetCEI;
   3991     int32_t    patIx;
   3992     UBool      found;
   3993 
   3994     int32_t  limitIx = targetIx;
   3995     int32_t  mStart = -1;
   3996     int32_t  mLimit = -1;
   3997     int32_t  minLimit;
   3998     int32_t  maxLimit;
   3999 
   4000 
   4001 
   4002     // Outer loop moves over match starting positions in the
   4003     //      target CE space.
   4004     for(targetIx = limitIx; ; targetIx += 1)
   4005     {
   4006         found = TRUE;
   4007         //  Inner loop checks for a match beginning at each
   4008         //  position from the outer loop.
   4009         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
   4010             int64_t patCE = strsrch->pattern.PCE[patIx];
   4011 
   4012             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx);
   4013             //  Compare CE from target string with CE from the pattern.
   4014             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4015             //    which will fail the compare, below.
   4016             if (targetCEI->ce != patCE) {
   4017                 found = FALSE;
   4018                 break;
   4019             }
   4020         }
   4021 
   4022         if (!found && targetCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4023             // No match at this targetIx.  Try again at the next.
   4024             continue;
   4025         }
   4026 
   4027         if (!found) {
   4028             // No match at all, we have run off the end of the target text.
   4029             break;
   4030         }
   4031 
   4032 
   4033         // We have found a match in CE space.
   4034         // Now determine the bounds in string index space.
   4035         //  There still is a chance of match failure if the CE range not correspond to
   4036         //     an acceptable character range.
   4037         //
   4038         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1);
   4039         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4040         const CEI *nextCEI  = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
   4041 
   4042         mStart   = firstCEI->lowIndex;
   4043         minLimit = lastCEI->lowIndex;
   4044         maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;
   4045 
   4046         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4047         //   extended to the end of input, and the match is good.
   4048 
   4049         // Look at the high and low indices of the CE following the match. If
   4050         // they are the same it means one of two things:
   4051         //    1. The match extended to the last CE from the target text, which is OK, or
   4052         //    2. The last CE that was part of the match is in an expansion that extends
   4053         //       to the first CE after the match. In this case, we reject the match.
   4054         if (targetIx >= 1) {
   4055             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4056                 found = FALSE;
   4057             }
   4058         }
   4059 
   4060 
   4061         // Check for the start of the match being within a combining sequence.
   4062         //   This can happen if the pattern itself begins with a combining char, and
   4063         //   the match found combining marks in the target text that were attached
   4064         //    to something else.
   4065         //   This type of match should be rejected for not completely consuming a
   4066         //   combining sequence.
   4067         if (isBreakBoundary(strsrch, mStart)) {
   4068             found = FALSE;
   4069         }
   4070 
   4071         // Look at the high index of the first CE in the match. If it's the same as the
   4072         // low index, the first CE in the match is in the middle of an expansion.
   4073         if (mStart == firstCEI->highIndex) {
   4074             found = FALSE;
   4075         }
   4076 
   4077         //  Advance the match end position to the first acceptable match boundary.
   4078         //    This advances the index over any combining charcters.
   4079         mLimit = maxLimit;
   4080         if (/*targetIx > 0 &&*/ minLimit < maxLimit) {
   4081             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4082 
   4083             if (nba >= lastCEI->highIndex) {
   4084                 mLimit = nba;
   4085             }
   4086         }
   4087 
   4088     #ifdef USEARCH_DEBUG
   4089         if (getenv("USEARCH_DEBUG") != NULL) {
   4090             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4091         }
   4092     #endif
   4093 
   4094         // If advancing to the end of a combining sequence in character indexing space
   4095         //   advanced us beyond the end of the match in CE space, reject this match.
   4096         if (mLimit > maxLimit) {
   4097             found = FALSE;
   4098         }
   4099 
   4100         // Make sure the end of the match is on a break boundary
   4101         if (isBreakBoundary(strsrch, mLimit)) {
   4102             found = FALSE;
   4103         }
   4104 
   4105         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4106             found = FALSE;
   4107         }
   4108 
   4109         if (found) {
   4110             break;
   4111         }
   4112     }
   4113 
   4114     #ifdef USEARCH_DEBUG
   4115     if (getenv("USEARCH_DEBUG") != NULL) {
   4116         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4117         int32_t  lastToPrint = ceb.limitIx+2;
   4118         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4119             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4120         }
   4121         printf("\n%s\n", found? "match found" : "no match");
   4122     }
   4123     #endif
   4124 
   4125     // All Done.  Store back the match bounds to the caller.
   4126     //
   4127     if (found==FALSE) {
   4128         mLimit = -1;
   4129         mStart = -1;
   4130     }
   4131 
   4132     if (matchStart != NULL) {
   4133         *matchStart= mStart;
   4134     }
   4135 
   4136     if (matchLimit != NULL) {
   4137         *matchLimit = mLimit;
   4138     }
   4139 
   4140     return found;
   4141 }
   4142 
   4143 
   4144 
   4145 
   4146 // internal use methods declared in usrchimp.h -----------------------------
   4147 
   4148 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4149 {
   4150     if (U_FAILURE(*status)) {
   4151         setMatchNotFound(strsrch);
   4152         return FALSE;
   4153     }
   4154 
   4155 #if BOYER_MOORE
   4156     UCollationElements *coleiter        = strsrch->textIter;
   4157     int32_t             textlength      = strsrch->search->textLength;
   4158     int32_t            *patternce       = strsrch->pattern.CE;
   4159     int32_t             patterncelength = strsrch->pattern.CELength;
   4160     int32_t             textoffset      = ucol_getOffset(coleiter);
   4161 
   4162     // status used in setting coleiter offset, since offset is checked in
   4163     // shiftForward before setting the coleiter offset, status never
   4164     // a failure
   4165     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4166                               patterncelength);
   4167     while (textoffset <= textlength)
   4168     {
   4169         uint32_t    patternceindex = patterncelength - 1;
   4170         int32_t     targetce;
   4171         UBool       found          = FALSE;
   4172         int32_t    lastce          = UCOL_NULLORDER;
   4173 
   4174         setColEIterOffset(coleiter, textoffset);
   4175 
   4176         for (;;) {
   4177             // finding the last pattern ce match, imagine composite characters
   4178             // for example: search for pattern A in text \u00C0
   4179             // we'll have to skip \u0300 the grave first before we get to A
   4180             targetce = ucol_previous(coleiter, status);
   4181             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4182                 found = FALSE;
   4183                 break;
   4184             }
   4185             targetce = getCE(strsrch, targetce);
   4186             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4187                 // this is for the text \u0315\u0300 that requires
   4188                 // normalization and pattern \u0300, where \u0315 is ignorable
   4189                 continue;
   4190             }
   4191             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4192                 lastce = targetce;
   4193             }
   4194             if (targetce == patternce[patternceindex]) {
   4195                 // the first ce can be a contraction
   4196                 found = TRUE;
   4197                 break;
   4198             }
   4199             if (!hasExpansion(coleiter)) {
   4200                 found = FALSE;
   4201                 break;
   4202             }
   4203         }
   4204 
   4205         //targetce = lastce;
   4206 
   4207         while (found && patternceindex > 0) {
   4208         	lastce = targetce;
   4209             targetce    = ucol_previous(coleiter, status);
   4210             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4211                 found = FALSE;
   4212                 break;
   4213             }
   4214             targetce    = getCE(strsrch, targetce);
   4215             if (targetce == UCOL_IGNORABLE) {
   4216                 continue;
   4217             }
   4218 
   4219             patternceindex --;
   4220             found = found && targetce == patternce[patternceindex];
   4221         }
   4222 
   4223         targetce = lastce;
   4224 
   4225         if (!found) {
   4226             if (U_FAILURE(*status)) {
   4227                 break;
   4228             }
   4229             textoffset = shiftForward(strsrch, textoffset, lastce,
   4230                                       patternceindex);
   4231             // status checked at loop.
   4232             patternceindex = patterncelength;
   4233             continue;
   4234         }
   4235 
   4236         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4237             // status checked in ucol_setOffset
   4238             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4239             return TRUE;
   4240         }
   4241     }
   4242     setMatchNotFound(strsrch);
   4243     return FALSE;
   4244 #else
   4245     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4246     int32_t start = -1;
   4247     int32_t end = -1;
   4248 
   4249     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4250         strsrch->search->matchedIndex  = start;
   4251         strsrch->search->matchedLength = end - start;
   4252         return TRUE;
   4253     } else {
   4254         setMatchNotFound(strsrch);
   4255         return FALSE;
   4256     }
   4257 #endif
   4258 }
   4259 
   4260 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4261 {
   4262     if (U_FAILURE(*status)) {
   4263         setMatchNotFound(strsrch);
   4264         return FALSE;
   4265     }
   4266 
   4267 #if BOYER_MOORE
   4268     UCollationElements *coleiter        = strsrch->textIter;
   4269     int32_t             textlength      = strsrch->search->textLength;
   4270     int32_t            *patternce       = strsrch->pattern.CE;
   4271     int32_t             patterncelength = strsrch->pattern.CELength;
   4272     int32_t             textoffset      = ucol_getOffset(coleiter);
   4273     UBool               hasPatternAccents =
   4274        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4275 
   4276     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4277                               patterncelength);
   4278     strsrch->canonicalPrefixAccents[0] = 0;
   4279     strsrch->canonicalSuffixAccents[0] = 0;
   4280 
   4281     while (textoffset <= textlength)
   4282     {
   4283         int32_t     patternceindex = patterncelength - 1;
   4284         int32_t     targetce;
   4285         UBool       found          = FALSE;
   4286         int32_t     lastce         = UCOL_NULLORDER;
   4287 
   4288         setColEIterOffset(coleiter, textoffset);
   4289 
   4290         for (;;) {
   4291             // finding the last pattern ce match, imagine composite characters
   4292             // for example: search for pattern A in text \u00C0
   4293             // we'll have to skip \u0300 the grave first before we get to A
   4294             targetce = ucol_previous(coleiter, status);
   4295             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4296                 found = FALSE;
   4297                 break;
   4298             }
   4299             targetce = getCE(strsrch, targetce);
   4300             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4301                 lastce = targetce;
   4302             }
   4303             if (targetce == patternce[patternceindex]) {
   4304                 // the first ce can be a contraction
   4305                 found = TRUE;
   4306                 break;
   4307             }
   4308             if (!hasExpansion(coleiter)) {
   4309                 found = FALSE;
   4310                 break;
   4311             }
   4312         }
   4313 
   4314         while (found && patternceindex > 0) {
   4315             targetce    = ucol_previous(coleiter, status);
   4316             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4317                 found = FALSE;
   4318                 break;
   4319             }
   4320             targetce    = getCE(strsrch, targetce);
   4321             if (targetce == UCOL_IGNORABLE) {
   4322                 continue;
   4323             }
   4324 
   4325             patternceindex --;
   4326             found = found && targetce == patternce[patternceindex];
   4327         }
   4328 
   4329         // initializing the rearranged accent array
   4330         if (hasPatternAccents && !found) {
   4331             strsrch->canonicalPrefixAccents[0] = 0;
   4332             strsrch->canonicalSuffixAccents[0] = 0;
   4333             if (U_FAILURE(*status)) {
   4334                 break;
   4335             }
   4336             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4337         }
   4338 
   4339         if (!found) {
   4340             if (U_FAILURE(*status)) {
   4341                 break;
   4342             }
   4343             textoffset = shiftForward(strsrch, textoffset, lastce,
   4344                                       patternceindex);
   4345             // status checked at loop
   4346             patternceindex = patterncelength;
   4347             continue;
   4348         }
   4349 
   4350         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4351             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4352             return TRUE;
   4353         }
   4354     }
   4355     setMatchNotFound(strsrch);
   4356     return FALSE;
   4357 #else
   4358     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4359     int32_t start = -1;
   4360     int32_t end = -1;
   4361 
   4362     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4363         strsrch->search->matchedIndex  = start;
   4364         strsrch->search->matchedLength = end - start;
   4365         return TRUE;
   4366     } else {
   4367         setMatchNotFound(strsrch);
   4368         return FALSE;
   4369     }
   4370 #endif
   4371 }
   4372 
   4373 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4374 {
   4375     if (U_FAILURE(*status)) {
   4376         setMatchNotFound(strsrch);
   4377         return FALSE;
   4378     }
   4379 
   4380 #if BOYER_MOORE
   4381     UCollationElements *coleiter        = strsrch->textIter;
   4382     int32_t            *patternce       = strsrch->pattern.CE;
   4383     int32_t             patterncelength = strsrch->pattern.CELength;
   4384     int32_t             textoffset      = ucol_getOffset(coleiter);
   4385 
   4386     // shifting it check for setting offset
   4387     // if setOffset is called previously or there was no previous match, we
   4388     // leave the offset as it is.
   4389     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4390         textoffset = strsrch->search->matchedIndex;
   4391     }
   4392 
   4393     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4394                               patterncelength);
   4395 
   4396     while (textoffset >= 0)
   4397     {
   4398         int32_t     patternceindex = 1;
   4399         int32_t     targetce;
   4400         UBool       found          = FALSE;
   4401         int32_t     firstce        = UCOL_NULLORDER;
   4402 
   4403         // if status is a failure, ucol_setOffset does nothing
   4404         setColEIterOffset(coleiter, textoffset);
   4405 
   4406         for (;;) {
   4407             // finding the first pattern ce match, imagine composite
   4408             // characters. for example: search for pattern \u0300 in text
   4409             // \u00C0, we'll have to skip A first before we get to
   4410             // \u0300 the grave accent
   4411             targetce = ucol_next(coleiter, status);
   4412             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4413                 found = FALSE;
   4414                 break;
   4415             }
   4416             targetce = getCE(strsrch, targetce);
   4417             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4418                 firstce = targetce;
   4419             }
   4420             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4421                 continue;
   4422             }
   4423             if (targetce == patternce[0]) {
   4424                 found = TRUE;
   4425                 break;
   4426             }
   4427             if (!hasExpansion(coleiter)) {
   4428                 // checking for accents in composite character
   4429                 found = FALSE;
   4430                 break;
   4431             }
   4432         }
   4433 
   4434         //targetce = firstce;
   4435 
   4436         while (found && (patternceindex < patterncelength)) {
   4437         	firstce = targetce;
   4438             targetce    = ucol_next(coleiter, status);
   4439             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4440                 found = FALSE;
   4441                 break;
   4442             }
   4443             targetce    = getCE(strsrch, targetce);
   4444             if (targetce == UCOL_IGNORABLE) {
   4445                 continue;
   4446             }
   4447 
   4448             found = found && targetce == patternce[patternceindex];
   4449             patternceindex ++;
   4450         }
   4451 
   4452         targetce = firstce;
   4453 
   4454         if (!found) {
   4455             if (U_FAILURE(*status)) {
   4456                 break;
   4457             }
   4458 
   4459             textoffset = reverseShift(strsrch, textoffset, targetce,
   4460                                       patternceindex);
   4461             patternceindex = 0;
   4462             continue;
   4463         }
   4464 
   4465         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4466             setColEIterOffset(coleiter, textoffset);
   4467             return TRUE;
   4468         }
   4469     }
   4470     setMatchNotFound(strsrch);
   4471     return FALSE;
   4472 #else
   4473     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4474     int32_t start = -1;
   4475     int32_t end = -1;
   4476 
   4477     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4478         strsrch->search->matchedIndex = start;
   4479         strsrch->search->matchedLength = end - start;
   4480         return TRUE;
   4481     } else {
   4482         setMatchNotFound(strsrch);
   4483         return FALSE;
   4484     }
   4485 #endif
   4486 }
   4487 
   4488 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4489                                       UErrorCode    *status)
   4490 {
   4491     if (U_FAILURE(*status)) {
   4492         setMatchNotFound(strsrch);
   4493         return FALSE;
   4494     }
   4495 
   4496 #if BOYER_MOORE
   4497     UCollationElements *coleiter        = strsrch->textIter;
   4498     int32_t            *patternce       = strsrch->pattern.CE;
   4499     int32_t             patterncelength = strsrch->pattern.CELength;
   4500     int32_t             textoffset      = ucol_getOffset(coleiter);
   4501     UBool               hasPatternAccents =
   4502        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4503 
   4504     // shifting it check for setting offset
   4505     // if setOffset is called previously or there was no previous match, we
   4506     // leave the offset as it is.
   4507     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4508         textoffset = strsrch->search->matchedIndex;
   4509     }
   4510 
   4511     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4512                               patterncelength);
   4513     strsrch->canonicalPrefixAccents[0] = 0;
   4514     strsrch->canonicalSuffixAccents[0] = 0;
   4515 
   4516     while (textoffset >= 0)
   4517     {
   4518         int32_t     patternceindex = 1;
   4519         int32_t     targetce;
   4520         UBool       found          = FALSE;
   4521         int32_t     firstce        = UCOL_NULLORDER;
   4522 
   4523         setColEIterOffset(coleiter, textoffset);
   4524         for (;;) {
   4525             // finding the first pattern ce match, imagine composite
   4526             // characters. for example: search for pattern \u0300 in text
   4527             // \u00C0, we'll have to skip A first before we get to
   4528             // \u0300 the grave accent
   4529             targetce = ucol_next(coleiter, status);
   4530             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4531                 found = FALSE;
   4532                 break;
   4533             }
   4534             targetce = getCE(strsrch, targetce);
   4535             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4536                 firstce = targetce;
   4537             }
   4538 
   4539             if (targetce == patternce[0]) {
   4540                 // the first ce can be a contraction
   4541                 found = TRUE;
   4542                 break;
   4543             }
   4544             if (!hasExpansion(coleiter)) {
   4545                 // checking for accents in composite character
   4546                 found = FALSE;
   4547                 break;
   4548             }
   4549         }
   4550 
   4551         targetce = firstce;
   4552 
   4553         while (found && patternceindex < patterncelength) {
   4554             targetce    = ucol_next(coleiter, status);
   4555             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4556                 found = FALSE;
   4557                 break;
   4558             }
   4559             targetce = getCE(strsrch, targetce);
   4560             if (targetce == UCOL_IGNORABLE) {
   4561                 continue;
   4562             }
   4563 
   4564             found = found && targetce == patternce[patternceindex];
   4565             patternceindex ++;
   4566         }
   4567 
   4568         // initializing the rearranged accent array
   4569         if (hasPatternAccents && !found) {
   4570             strsrch->canonicalPrefixAccents[0] = 0;
   4571             strsrch->canonicalSuffixAccents[0] = 0;
   4572             if (U_FAILURE(*status)) {
   4573                 break;
   4574             }
   4575             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4576         }
   4577 
   4578         if (!found) {
   4579             if (U_FAILURE(*status)) {
   4580                 break;
   4581             }
   4582             textoffset = reverseShift(strsrch, textoffset, targetce,
   4583                                       patternceindex);
   4584             patternceindex = 0;
   4585             continue;
   4586         }
   4587 
   4588         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4589             setColEIterOffset(coleiter, textoffset);
   4590             return TRUE;
   4591         }
   4592     }
   4593     setMatchNotFound(strsrch);
   4594     return FALSE;
   4595 #else
   4596     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4597     int32_t start = -1;
   4598     int32_t end = -1;
   4599 
   4600     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4601         strsrch->search->matchedIndex = start;
   4602         strsrch->search->matchedLength = end - start;
   4603         return TRUE;
   4604     } else {
   4605         setMatchNotFound(strsrch);
   4606         return FALSE;
   4607     }
   4608 #endif
   4609 }
   4610 
   4611 #endif /* #if !UCONFIG_NO_COLLATION */
   4612