Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2001-2010 IBM and others. All rights reserved.
      4 **********************************************************************
      5 *   Date        Name        Description
      6 *  07/02/2001   synwee      Creation.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/usearch.h"
     15 #include "unicode/ustring.h"
     16 #include "unicode/uchar.h"
     17 #include "normalizer2impl.h"
     18 #include "ucol_imp.h"
     19 #include "usrchimp.h"
     20 #include "cmemory.h"
     21 #include "ucln_in.h"
     22 #include "uassert.h"
     23 #include "ustr_imp.h"
     24 
     25 U_NAMESPACE_USE
     26 
     27 // don't use Boyer-Moore
     28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
     29 #define BOYER_MOORE 0
     30 
     31 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     32 
     33 // internal definition ---------------------------------------------------
     34 
     35 #define LAST_BYTE_MASK_          0xFF
     36 #define SECOND_LAST_BYTE_SHIFT_  8
     37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
     38 
     39 static const uint16_t *fcdTrieIndex = NULL;
     40 static UChar32 fcdHighStart = 0;
     41 
     42 // internal methods -------------------------------------------------
     43 
     44 /**
     45 * Fast collation element iterator setOffset.
     46 * This function does not check for bounds.
     47 * @param coleiter collation element iterator
     48 * @param offset to set
     49 */
     50 static
     51 inline void setColEIterOffset(UCollationElements *elems,
     52                       int32_t             offset)
     53 {
     54     collIterate *ci = &(elems->iteratordata_);
     55     ci->pos         = ci->string + offset;
     56     ci->CEpos       = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
     57     if (ci->flags & UCOL_ITER_INNORMBUF) {
     58         ci->flags = ci->origFlags;
     59     }
     60     ci->fcdPosition = NULL;
     61 
     62 	ci->offsetReturn = NULL;
     63     ci->offsetStore  = ci->offsetBuffer;
     64 	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
     65 }
     66 
     67 /**
     68 * Getting the mask for collation strength
     69 * @param strength collation strength
     70 * @return collation element mask
     71 */
     72 static
     73 inline uint32_t getMask(UCollationStrength strength)
     74 {
     75     switch (strength)
     76     {
     77     case UCOL_PRIMARY:
     78         return UCOL_PRIMARYORDERMASK;
     79     case UCOL_SECONDARY:
     80         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
     81     default:
     82         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
     83                UCOL_PRIMARYORDERMASK;
     84     }
     85 }
     86 
     87 /**
     88 * This is to squeeze the 21bit ces into a 256 table
     89 * @param ce collation element
     90 * @return collapsed version of the collation element
     91 */
     92 static
     93 inline int hash(uint32_t ce)
     94 {
     95     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
     96     // well with the new collation where most of the latin 1 characters
     97     // are of the value xx000xxx. their hashes will most of the time be 0
     98     // to be discussed on the hash algo.
     99     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
    100 }
    101 
    102 U_CDECL_BEGIN
    103 static UBool U_CALLCONV
    104 usearch_cleanup(void) {
    105     fcdTrieIndex = NULL;
    106     return TRUE;
    107 }
    108 U_CDECL_END
    109 
    110 /**
    111 * Initializing the fcd tables.
    112 * Internal method, status assumed to be a success.
    113 * @param status output error if any, caller to check status before calling
    114 *               method, status assumed to be success when passed in.
    115 */
    116 static
    117 inline void initializeFCD(UErrorCode *status)
    118 {
    119     if (fcdTrieIndex == NULL) {
    120         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
    121         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
    122     }
    123 }
    124 
    125 /**
    126 * Gets the fcd value for a character at the argument index.
    127 * This method takes into accounts of the supplementary characters.
    128 * @param str UTF16 string where character for fcd retrieval resides
    129 * @param offset position of the character whose fcd is to be retrieved, to be
    130 *               overwritten with the next character position, taking
    131 *               surrogate characters into consideration.
    132 * @param strlength length of the argument string
    133 * @return fcd value
    134 */
    135 static
    136 uint16_t getFCD(const UChar   *str, int32_t *offset,
    137                              int32_t  strlength)
    138 {
    139     const UChar *temp = str + *offset;
    140     uint16_t    result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);
    141     *offset = (int32_t)(temp - str);
    142     return result;
    143 }
    144 
    145 /**
    146 * Getting the modified collation elements taking into account the collation
    147 * attributes
    148 * @param strsrch string search data
    149 * @param sourcece
    150 * @return the modified collation element
    151 */
    152 static
    153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
    154 {
    155     // note for tertiary we can't use the collator->tertiaryMask, that
    156     // is a preprocessed mask that takes into account case options. since
    157     // we are only concerned with exact matches, we don't need that.
    158     sourcece &= strsrch->ceMask;
    159 
    160     if (strsrch->toShift) {
    161         // alternate handling here, since only the 16 most significant digits
    162         // is only used, we can safely do a compare without masking
    163         // if the ce is a variable, we mask and get only the primary values
    164         // no shifting to quartenary is required since all primary values
    165         // less than variabletop will need to be masked off anyway.
    166         if (strsrch->variableTop > sourcece) {
    167             if (strsrch->strength >= UCOL_QUATERNARY) {
    168                 sourcece &= UCOL_PRIMARYORDERMASK;
    169             }
    170             else {
    171                 sourcece = UCOL_IGNORABLE;
    172             }
    173         }
    174     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
    175         sourcece = 0xFFFF;
    176     }
    177 
    178     return sourcece;
    179 }
    180 
    181 /**
    182 * Allocate a memory and returns NULL if it failed.
    183 * Internal method, status assumed to be a success.
    184 * @param size to allocate
    185 * @param status output error if any, caller to check status before calling
    186 *               method, status assumed to be success when passed in.
    187 * @return newly allocated array, NULL otherwise
    188 */
    189 static
    190 inline void * allocateMemory(uint32_t size, UErrorCode *status)
    191 {
    192     uint32_t *result = (uint32_t *)uprv_malloc(size);
    193     if (result == NULL) {
    194         *status = U_MEMORY_ALLOCATION_ERROR;
    195     }
    196     return result;
    197 }
    198 
    199 /**
    200 * Adds a uint32_t value to a destination array.
    201 * Creates a new array if we run out of space. The caller will have to
    202 * manually deallocate the newly allocated array.
    203 * Internal method, status assumed to be success, caller has to check status
    204 * before calling this method. destination not to be NULL and has at least
    205 * size destinationlength.
    206 * @param destination target array
    207 * @param offset destination offset to add value
    208 * @param destinationlength target array size, return value for the new size
    209 * @param value to be added
    210 * @param increments incremental size expected
    211 * @param status output error if any, caller to check status before calling
    212 *               method, status assumed to be success when passed in.
    213 * @return new destination array, destination if there was no new allocation
    214 */
    215 static
    216 inline int32_t * addTouint32_tArray(int32_t    *destination,
    217                                     uint32_t    offset,
    218                                     uint32_t   *destinationlength,
    219                                     uint32_t    value,
    220                                     uint32_t    increments,
    221                                     UErrorCode *status)
    222 {
    223     uint32_t newlength = *destinationlength;
    224     if (offset + 1 == newlength) {
    225         newlength += increments;
    226         int32_t *temp = (int32_t *)allocateMemory(
    227                                          sizeof(int32_t) * newlength, status);
    228         if (U_FAILURE(*status)) {
    229             return NULL;
    230         }
    231         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
    232         *destinationlength = newlength;
    233         destination        = temp;
    234     }
    235     destination[offset] = value;
    236     return destination;
    237 }
    238 
    239 /**
    240 * Adds a uint64_t value to a destination array.
    241 * Creates a new array if we run out of space. The caller will have to
    242 * manually deallocate the newly allocated array.
    243 * Internal method, status assumed to be success, caller has to check status
    244 * before calling this method. destination not to be NULL and has at least
    245 * size destinationlength.
    246 * @param destination target array
    247 * @param offset destination offset to add value
    248 * @param destinationlength target array size, return value for the new size
    249 * @param value to be added
    250 * @param increments incremental size expected
    251 * @param status output error if any, caller to check status before calling
    252 *               method, status assumed to be success when passed in.
    253 * @return new destination array, destination if there was no new allocation
    254 */
    255 static
    256 inline int64_t * addTouint64_tArray(int64_t    *destination,
    257                                     uint32_t    offset,
    258                                     uint32_t   *destinationlength,
    259                                     uint64_t    value,
    260                                     uint32_t    increments,
    261                                     UErrorCode *status)
    262 {
    263     uint32_t newlength = *destinationlength;
    264     if (offset + 1 == newlength) {
    265         newlength += increments;
    266         int64_t *temp = (int64_t *)allocateMemory(
    267                                          sizeof(int64_t) * newlength, status);
    268 
    269         if (U_FAILURE(*status)) {
    270             return NULL;
    271         }
    272 
    273         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
    274         *destinationlength = newlength;
    275         destination        = temp;
    276     }
    277 
    278     destination[offset] = value;
    279 
    280     return destination;
    281 }
    282 
    283 /**
    284 * Initializing the ce table for a pattern.
    285 * Stores non-ignorable collation keys.
    286 * Table size will be estimated by the size of the pattern text. Table
    287 * expansion will be perform as we go along. Adding 1 to ensure that the table
    288 * size definitely increases.
    289 * Internal method, status assumed to be a success.
    290 * @param strsrch string search data
    291 * @param status output error if any, caller to check status before calling
    292 *               method, status assumed to be success when passed in.
    293 * @return total number of expansions
    294 */
    295 static
    296 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
    297                                          UErrorCode    *status)
    298 {
    299     UPattern *pattern            = &(strsrch->pattern);
    300     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
    301     int32_t  *cetable            = pattern->CEBuffer;
    302     uint32_t  patternlength      = pattern->textLength;
    303     UCollationElements *coleiter = strsrch->utilIter;
    304 
    305     if (coleiter == NULL) {
    306         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    307                                      patternlength, status);
    308         // status will be checked in ucol_next(..) later and if it is an
    309         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    310         // returned.
    311         strsrch->utilIter = coleiter;
    312     }
    313     else {
    314         uprv_init_collIterate(strsrch->collator, pattern->text,
    315                          pattern->textLength,
    316                          &coleiter->iteratordata_,
    317                          status);
    318     }
    319     if(U_FAILURE(*status)) {
    320         return 0;
    321     }
    322 
    323     if (pattern->CE != cetable && pattern->CE) {
    324         uprv_free(pattern->CE);
    325     }
    326 
    327     uint16_t  offset      = 0;
    328     uint16_t  result      = 0;
    329     int32_t   ce;
    330 
    331     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
    332            U_SUCCESS(*status)) {
    333         uint32_t newce = getCE(strsrch, ce);
    334         if (newce) {
    335             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
    336                                   newce,
    337                                   patternlength - ucol_getOffset(coleiter) + 1,
    338                                   status);
    339             if (U_FAILURE(*status)) {
    340                 return 0;
    341             }
    342             offset ++;
    343             if (cetable != temp && cetable != pattern->CEBuffer) {
    344                 uprv_free(cetable);
    345             }
    346             cetable = temp;
    347         }
    348         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    349     }
    350 
    351     cetable[offset]   = 0;
    352     pattern->CE       = cetable;
    353     pattern->CELength = offset;
    354 
    355     return result;
    356 }
    357 
    358 /**
    359 * Initializing the pce table for a pattern.
    360 * Stores non-ignorable collation keys.
    361 * Table size will be estimated by the size of the pattern text. Table
    362 * expansion will be perform as we go along. Adding 1 to ensure that the table
    363 * size definitely increases.
    364 * Internal method, status assumed to be a success.
    365 * @param strsrch string search data
    366 * @param status output error if any, caller to check status before calling
    367 *               method, status assumed to be success when passed in.
    368 * @return total number of expansions
    369 */
    370 static
    371 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
    372                                           UErrorCode    *status)
    373 {
    374     UPattern *pattern            = &(strsrch->pattern);
    375     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
    376     int64_t  *pcetable           = pattern->PCEBuffer;
    377     uint32_t  patternlength      = pattern->textLength;
    378     UCollationElements *coleiter = strsrch->utilIter;
    379 
    380     if (coleiter == NULL) {
    381         coleiter = ucol_openElements(strsrch->collator, pattern->text,
    382                                      patternlength, status);
    383         // status will be checked in ucol_next(..) later and if it is an
    384         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
    385         // returned.
    386         strsrch->utilIter = coleiter;
    387     } else {
    388         uprv_init_collIterate(strsrch->collator, pattern->text,
    389                               pattern->textLength,
    390                               &coleiter->iteratordata_,
    391                               status);
    392     }
    393     if(U_FAILURE(*status)) {
    394         return 0;
    395     }
    396 
    397     if (pattern->PCE != pcetable && pattern->PCE != NULL) {
    398         uprv_free(pattern->PCE);
    399     }
    400 
    401     uint16_t  offset = 0;
    402     uint16_t  result = 0;
    403     int64_t   pce;
    404 
    405     uprv_init_pce(coleiter);
    406 
    407     // ** Should processed CEs be signed or unsigned?
    408     // ** (the rest of the code in this file seems to play fast-and-loose with
    409     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
    410     while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
    411            U_SUCCESS(*status)) {
    412         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
    413                               pce,
    414                               patternlength - ucol_getOffset(coleiter) + 1,
    415                               status);
    416 
    417         if (U_FAILURE(*status)) {
    418             return 0;
    419         }
    420 
    421         offset += 1;
    422 
    423         if (pcetable != temp && pcetable != pattern->PCEBuffer) {
    424             uprv_free(pcetable);
    425         }
    426 
    427         pcetable = temp;
    428         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
    429     }
    430 
    431     pcetable[offset]   = 0;
    432     pattern->PCE       = pcetable;
    433     pattern->PCELength = offset;
    434 
    435     return result;
    436 }
    437 
    438 /**
    439 * Initializes the pattern struct.
    440 * Internal method, status assumed to be success.
    441 * @param strsrch UStringSearch data storage
    442 * @param status output error if any, caller to check status before calling
    443 *               method, status assumed to be success when passed in.
    444 * @return expansionsize the total expansion size of the pattern
    445 */
    446 static
    447 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
    448 {
    449           UPattern   *pattern     = &(strsrch->pattern);
    450     const UChar      *patterntext = pattern->text;
    451           int32_t     length      = pattern->textLength;
    452           int32_t index       = 0;
    453 
    454     // Since the strength is primary, accents are ignored in the pattern.
    455     if (strsrch->strength == UCOL_PRIMARY) {
    456     	pattern->hasPrefixAccents = 0;
    457     	pattern->hasSuffixAccents = 0;
    458     } else {
    459 	    pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
    460 	                                                     SECOND_LAST_BYTE_SHIFT_;
    461 	    index = length;
    462 	    UTF_BACK_1(patterntext, 0, index);
    463 	    pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
    464 	                                                             LAST_BYTE_MASK_;
    465     }
    466 
    467     // ** HACK **
    468     if (strsrch->pattern.PCE != NULL) {
    469         if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
    470             uprv_free(strsrch->pattern.PCE);
    471         }
    472 
    473         strsrch->pattern.PCE = NULL;
    474     }
    475 
    476     // since intializePattern is an internal method status is a success.
    477     return initializePatternCETable(strsrch, status);
    478 }
    479 
    480 /**
    481 * Initializing shift tables, with the default values.
    482 * If a corresponding default value is 0, the shift table is not set.
    483 * @param shift table for forwards shift
    484 * @param backshift table for backwards shift
    485 * @param cetable table containing pattern ce
    486 * @param cesize size of the pattern ces
    487 * @param expansionsize total size of the expansions
    488 * @param defaultforward the default forward value
    489 * @param defaultbackward the default backward value
    490 */
    491 static
    492 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
    493                           int32_t  *cetable, int32_t cesize,
    494                           int16_t   expansionsize,
    495                           int16_t   defaultforward,
    496                           int16_t   defaultbackward)
    497 {
    498     // estimate the value to shift. to do that we estimate the smallest
    499     // number of characters to give the relevant ces, ie approximately
    500     // the number of ces minus their expansion, since expansions can come
    501     // from a character.
    502     int32_t count;
    503     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    504         shift[count] = defaultforward;
    505     }
    506     cesize --; // down to the last index
    507     for (count = 0; count < cesize; count ++) {
    508         // number of ces from right of array to the count
    509         int temp = defaultforward - count - 1;
    510         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
    511     }
    512     shift[hash(cetable[cesize])] = 1;
    513     // for ignorables we just shift by one. see test examples.
    514     shift[hash(0)] = 1;
    515 
    516     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
    517         backshift[count] = defaultbackward;
    518     }
    519     for (count = cesize; count > 0; count --) {
    520         // the original value count does not seem to work
    521         backshift[hash(cetable[count])] = count > expansionsize ?
    522                                           (int16_t)(count - expansionsize) : 1;
    523     }
    524     backshift[hash(cetable[0])] = 1;
    525     backshift[hash(0)] = 1;
    526 }
    527 
    528 /**
    529 * Building of the pattern collation element list and the boyer moore strsrch
    530 * table.
    531 * The canonical match will only be performed after the default match fails.
    532 * For both cases we need to remember the size of the composed and decomposed
    533 * versions of the string. Since the Boyer-Moore shift calculations shifts by
    534 * a number of characters in the text and tries to match the pattern from that
    535 * offset, the shift value can not be too large in case we miss some
    536 * characters. To choose a right shift size, we estimate the NFC form of the
    537 * and use its size as a shift guide. The NFC form should be the small
    538 * possible representation of the pattern. Anyways, we'll err on the smaller
    539 * shift size. Hence the calculation for minlength.
    540 * Canonical match will be performed slightly differently. We'll split the
    541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
    542 * the first and last base character (MS), the ending accents (EA). Matches
    543 * will be done on MS first, and only when we match MS then some processing
    544 * will be required for the prefix and end accents in order to determine if
    545 * they match PA and EA. Hence the default shift values
    546 * for the canonical match will take the size of either end's accent into
    547 * consideration. Forwards search will take the end accents into consideration
    548 * for the default shift values and the backwards search will take the prefix
    549 * accents into consideration.
    550 * If pattern has no non-ignorable ce, we return a illegal argument error.
    551 * Internal method, status assumed to be success.
    552 * @param strsrch UStringSearch data storage
    553 * @param status  for output errors if it occurs, status is assumed to be a
    554 *                success when it is passed in.
    555 */
    556 static
    557 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
    558 {
    559     int16_t expandlength  = initializePattern(strsrch, status);
    560     if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
    561         UPattern *pattern = &strsrch->pattern;
    562         int32_t   cesize  = pattern->CELength;
    563 
    564         int16_t minlength = cesize > expandlength
    565                             ? (int16_t)cesize - expandlength : 1;
    566         pattern->defaultShiftSize    = minlength;
    567         setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
    568                       cesize, expandlength, minlength, minlength);
    569         return;
    570     }
    571     strsrch->pattern.defaultShiftSize = 0;
    572 }
    573 
    574 #if BOYER_MOORE
    575 /**
    576 * Check to make sure that the match length is at the end of the character by
    577 * using the breakiterator.
    578 * @param strsrch string search data
    579 * @param start target text start offset
    580 * @param end target text end offset
    581 */
    582 static
    583 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
    584                                int32_t *end)
    585 {
    586 #if !UCONFIG_NO_BREAK_ITERATION
    587     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
    588     if (breakiterator) {
    589 	    int32_t matchend = *end;
    590 	    //int32_t matchstart = *start;
    591 
    592 	    if (!ubrk_isBoundary(breakiterator, matchend)) {
    593 	    	*end = ubrk_following(breakiterator, matchend);
    594         }
    595 
    596 	    /* Check the start of the matched text to make sure it doesn't have any accents
    597 	     * before it.  This code may not be necessary and so it is commented out */
    598 	    /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
    599 	    	*start = ubrk_preceding(breakiterator, matchstart);
    600 	    }*/
    601     }
    602 #endif
    603 }
    604 
    605 /**
    606 * Determine whether the target text in UStringSearch bounded by the offset
    607 * start and end is one or more whole units of text as
    608 * determined by the breakiterator in UStringSearch.
    609 * @param strsrch string search data
    610 * @param start target text start offset
    611 * @param end target text end offset
    612 */
    613 static
    614 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
    615                                int32_t    end)
    616 {
    617 #if !UCONFIG_NO_BREAK_ITERATION
    618     UBreakIterator *breakiterator = strsrch->search->breakIter;
    619     //TODO: Add here.
    620     if (breakiterator) {
    621         int32_t startindex = ubrk_first(breakiterator);
    622         int32_t endindex   = ubrk_last(breakiterator);
    623 
    624         // out-of-range indexes are never boundary positions
    625         if (start < startindex || start > endindex ||
    626             end < startindex || end > endindex) {
    627             return FALSE;
    628         }
    629         // otherwise, we can use following() on the position before the
    630         // specified one and return true of the position we get back is the
    631         // one the user specified
    632         UBool result = (start == startindex ||
    633                 ubrk_following(breakiterator, start - 1) == start) &&
    634                (end == endindex ||
    635                 ubrk_following(breakiterator, end - 1) == end);
    636         if (result) {
    637             // iterates the individual ces
    638                   UCollationElements *coleiter  = strsrch->utilIter;
    639             const UChar              *text      = strsrch->search->text +
    640                                                                       start;
    641                   UErrorCode          status    = U_ZERO_ERROR;
    642             ucol_setText(coleiter, text, end - start, &status);
    643             for (int32_t count = 0; count < strsrch->pattern.CELength;
    644                  count ++) {
    645                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
    646                 if (ce == UCOL_IGNORABLE) {
    647                     count --;
    648                     continue;
    649                 }
    650                 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
    651                     return FALSE;
    652                 }
    653             }
    654             int32_t nextce = ucol_next(coleiter, &status);
    655             while (ucol_getOffset(coleiter) == (end - start)
    656                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
    657                 nextce = ucol_next(coleiter, &status);
    658             }
    659             if (ucol_getOffset(coleiter) == (end - start)
    660                 && nextce != UCOL_NULLORDER) {
    661                 // extra collation elements at the end of the match
    662                 return FALSE;
    663             }
    664         }
    665         return result;
    666     }
    667 #endif
    668     return TRUE;
    669 }
    670 
    671 /**
    672 * Getting the next base character offset if current offset is an accent,
    673 * or the current offset if the current character contains a base character.
    674 * accents the following base character will be returned
    675 * @param text string
    676 * @param textoffset current offset
    677 * @param textlength length of text string
    678 * @return the next base character or the current offset
    679 *         if the current character is contains a base character.
    680 */
    681 static
    682 inline int32_t getNextBaseOffset(const UChar       *text,
    683                                            int32_t  textoffset,
    684                                            int32_t      textlength)
    685 {
    686     if (textoffset < textlength) {
    687         int32_t temp = textoffset;
    688         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
    689             while (temp < textlength) {
    690                 int32_t result = temp;
    691                 if ((getFCD(text, &temp, textlength) >>
    692                      SECOND_LAST_BYTE_SHIFT_) == 0) {
    693                     return result;
    694                 }
    695             }
    696             return textlength;
    697         }
    698     }
    699     return textoffset;
    700 }
    701 
    702 /**
    703 * Gets the next base character offset depending on the string search pattern
    704 * data
    705 * @param strsrch string search data
    706 * @param textoffset current offset, one offset away from the last character
    707 *                   to search for.
    708 * @return start index of the next base character or the current offset
    709 *         if the current character is contains a base character.
    710 */
    711 static
    712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
    713                                                   int32_t    textoffset)
    714 {
    715     int32_t textlength = strsrch->search->textLength;
    716     if (strsrch->pattern.hasSuffixAccents &&
    717         textoffset < textlength) {
    718               int32_t  temp       = textoffset;
    719         const UChar       *text       = strsrch->search->text;
    720         UTF_BACK_1(text, 0, temp);
    721         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
    722             return getNextBaseOffset(text, textoffset, textlength);
    723         }
    724     }
    725     return textoffset;
    726 }
    727 
    728 /**
    729 * Shifting the collation element iterator position forward to prepare for
    730 * a following match. If the last character is a unsafe character, we'll only
    731 * shift by 1 to capture contractions, normalization etc.
    732 * Internal method, status assumed to be success.
    733 * @param text strsrch string search data
    734 * @param textoffset start text position to do search
    735 * @param ce the text ce which failed the match.
    736 * @param patternceindex index of the ce within the pattern ce buffer which
    737 *        failed the match
    738 * @return final offset
    739 */
    740 static
    741 inline int32_t shiftForward(UStringSearch *strsrch,
    742                                 int32_t    textoffset,
    743                                 int32_t       ce,
    744                                 int32_t        patternceindex)
    745 {
    746     UPattern *pattern = &(strsrch->pattern);
    747     if (ce != UCOL_NULLORDER) {
    748         int32_t shift = pattern->shift[hash(ce)];
    749         // this is to adjust for characters in the middle of the
    750         // substring for matching that failed.
    751         int32_t adjust = pattern->CELength - patternceindex;
    752         if (adjust > 1 && shift >= adjust) {
    753             shift -= adjust - 1;
    754         }
    755         textoffset += shift;
    756     }
    757     else {
    758         textoffset += pattern->defaultShiftSize;
    759     }
    760 
    761     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
    762     // check for unsafe characters
    763     // * if it is the start or middle of a contraction: to be done after
    764     //   a initial match is found
    765     // * thai or lao base consonant character: similar to contraction
    766     // * high surrogate character: similar to contraction
    767     // * next character is a accent: shift to the next base character
    768     return textoffset;
    769 }
    770 #endif // #if BOYER_MOORE
    771 
    772 /**
    773 * sets match not found
    774 * @param strsrch string search data
    775 */
    776 static
    777 inline void setMatchNotFound(UStringSearch *strsrch)
    778 {
    779     // this method resets the match result regardless of the error status.
    780     strsrch->search->matchedIndex = USEARCH_DONE;
    781     strsrch->search->matchedLength = 0;
    782     if (strsrch->search->isForwardSearching) {
    783         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
    784     }
    785     else {
    786         setColEIterOffset(strsrch->textIter, 0);
    787     }
    788 }
    789 
    790 #if BOYER_MOORE
    791 /**
    792 * Gets the offset to the next safe point in text.
    793 * ie. not the middle of a contraction, swappable characters or supplementary
    794 * characters.
    795 * @param collator collation sata
    796 * @param text string to work with
    797 * @param textoffset offset in string
    798 * @param textlength length of text string
    799 * @return offset to the next safe character
    800 */
    801 static
    802 inline int32_t getNextSafeOffset(const UCollator   *collator,
    803                                      const UChar       *text,
    804                                            int32_t  textoffset,
    805                                            int32_t      textlength)
    806 {
    807     int32_t result = textoffset; // first contraction character
    808     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
    809         result ++;
    810     }
    811     return result;
    812 }
    813 
    814 /**
    815 * This checks for accents in the potential match started with a .
    816 * composite character.
    817 * This is really painful... we have to check that composite character do not
    818 * have any extra accents. We have to normalize the potential match and find
    819 * the immediate decomposed character before the match.
    820 * The first composite character would have been taken care of by the fcd
    821 * checks in checkForwardExactMatch.
    822 * This is the slow path after the fcd of the first character and
    823 * the last character has been checked by checkForwardExactMatch and we
    824 * determine that the potential match has extra non-ignorable preceding
    825 * ces.
    826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    827 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    828 * Note here that accents checking are slow and cautioned in the API docs.
    829 * Internal method, status assumed to be a success, caller should check status
    830 * before calling this method
    831 * @param strsrch string search data
    832 * @param start index of the potential unfriendly composite character
    833 * @param end index of the potential unfriendly composite character
    834 * @param status output error status if any.
    835 * @return TRUE if there is non-ignorable accents before at the beginning
    836 *              of the match, FALSE otherwise.
    837 */
    838 
    839 static
    840 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
    841                                    int32_t    end,
    842                                    UErrorCode    *status)
    843 {
    844     UBool result = FALSE;
    845     if (strsrch->pattern.hasPrefixAccents) {
    846               int32_t  length = end - start;
    847               int32_t  offset = 0;
    848         const UChar       *text   = strsrch->search->text + start;
    849 
    850         UTF_FWD_1(text, offset, length);
    851         // we are only concerned with the first composite character
    852         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
    853             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
    854                                                        text, 0, length);
    855             if (safeoffset != length) {
    856                 safeoffset ++;
    857             }
    858             UChar   *norm = NULL;
    859             UChar    buffer[INITIAL_ARRAY_SIZE_];
    860             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
    861                                             buffer, INITIAL_ARRAY_SIZE_,
    862                                             status);
    863             if (U_FAILURE(*status)) {
    864                 return FALSE;
    865             }
    866             if (size >= INITIAL_ARRAY_SIZE_) {
    867                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
    868                                                status);
    869                 // if allocation failed, status will be set to
    870                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
    871                 // checks for it.
    872                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
    873                                        size, status);
    874                 if (U_FAILURE(*status) && norm != NULL) {
    875                     uprv_free(norm);
    876                     return FALSE;
    877                 }
    878             }
    879             else {
    880                 norm = buffer;
    881             }
    882 
    883             UCollationElements *coleiter  = strsrch->utilIter;
    884             ucol_setText(coleiter, norm, size, status);
    885             uint32_t            firstce   = strsrch->pattern.CE[0];
    886             UBool               ignorable = TRUE;
    887             uint32_t            ce        = UCOL_IGNORABLE;
    888             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
    889                 offset = ucol_getOffset(coleiter);
    890                 if (ce != firstce && ce != UCOL_IGNORABLE) {
    891                     ignorable = FALSE;
    892                 }
    893                 ce = ucol_next(coleiter, status);
    894             }
    895             UChar32 codepoint;
    896             UTF_PREV_CHAR(norm, 0, offset, codepoint);
    897             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
    898 
    899             if (norm != buffer) {
    900                 uprv_free(norm);
    901             }
    902         }
    903     }
    904 
    905     return result;
    906 }
    907 
    908 /**
    909 * Used by exact matches, checks if there are accents before the match.
    910 * This is really painful... we have to check that composite characters at
    911 * the start of the matches have to not have any extra accents.
    912 * We check the FCD of the character first, if it starts with an accent and
    913 * the first pattern ce does not match the first ce of the character, we bail.
    914 * Otherwise we try normalizing the first composite
    915 * character and find the immediate decomposed character before the match to
    916 * see if it is an non-ignorable accent.
    917 * Now normalizing the first composite character is enough because we ensure
    918 * that when the match is passed in here with extra beginning ces, the
    919 * first or last ce that match has to occur within the first character.
    920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
    921 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
    922 * Note here that accents checking are slow and cautioned in the API docs.
    923 * @param strsrch string search data
    924 * @param start offset
    925 * @param end offset
    926 * @return TRUE if there are accents on either side of the match,
    927 *         FALSE otherwise
    928 */
    929 static
    930 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
    931                                   int32_t    end)
    932 {
    933     if (strsrch->pattern.hasPrefixAccents) {
    934         UCollationElements *coleiter  = strsrch->textIter;
    935         UErrorCode          status    = U_ZERO_ERROR;
    936         // we have been iterating forwards previously
    937         uint32_t            ignorable = TRUE;
    938         int32_t             firstce   = strsrch->pattern.CE[0];
    939 
    940         setColEIterOffset(coleiter, start);
    941         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
    942         if (U_FAILURE(status)) {
    943             return TRUE;
    944         }
    945         while (ce != firstce) {
    946             if (ce != UCOL_IGNORABLE) {
    947                 ignorable = FALSE;
    948             }
    949             ce = getCE(strsrch, ucol_next(coleiter, &status));
    950             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
    951                 return TRUE;
    952             }
    953         }
    954         if (!ignorable && inNormBuf(coleiter)) {
    955             // within normalization buffer, discontiguous handled here
    956             return TRUE;
    957         }
    958 
    959         // within text
    960         int32_t temp = start;
    961         // original code
    962         // accent = (getFCD(strsrch->search->text, &temp,
    963         //                  strsrch->search->textLength)
    964         //            >> SECOND_LAST_BYTE_SHIFT_);
    965         // however this code does not work well with VC7 .net in release mode.
    966         // maybe the inlines for getFCD combined with shifting has bugs in
    967         // VC7. anyways this is a work around.
    968         UBool accent = getFCD(strsrch->search->text, &temp,
    969                               strsrch->search->textLength) > 0xFF;
    970         if (!accent) {
    971             return checkExtraMatchAccents(strsrch, start, end, &status);
    972         }
    973         if (!ignorable) {
    974             return TRUE;
    975         }
    976         if (start > 0) {
    977             temp = start;
    978             UTF_BACK_1(strsrch->search->text, 0, temp);
    979             if (getFCD(strsrch->search->text, &temp,
    980                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
    981                 setColEIterOffset(coleiter, start);
    982                 ce = ucol_previous(coleiter, &status);
    983                 if (U_FAILURE(status) ||
    984                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
    985                     return TRUE;
    986                 }
    987             }
    988         }
    989     }
    990 
    991     return FALSE;
    992 }
    993 
    994 /**
    995 * Used by exact matches, checks if there are accents bounding the match.
    996 * Note this is the initial boundary check. If the potential match
    997 * starts or ends with composite characters, the accents in those
    998 * characters will be determined later.
    999 * Not doing backwards iteration here, since discontiguos contraction for
   1000 * backwards collation element iterator, use up too many characters.
   1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
   1002 * should fail since there is a acute at the end of \u01FA
   1003 * Note here that accents checking are slow and cautioned in the API docs.
   1004 * @param strsrch string search data
   1005 * @param start offset of match
   1006 * @param end end offset of the match
   1007 * @return TRUE if there are accents on either side of the match,
   1008 *         FALSE otherwise
   1009 */
   1010 static
   1011 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
   1012                                  int32_t    end)
   1013 {
   1014     if (strsrch->pattern.hasSuffixAccents) {
   1015         const UChar       *text       = strsrch->search->text;
   1016               int32_t  temp       = end;
   1017               int32_t      textlength = strsrch->search->textLength;
   1018         UTF_BACK_1(text, 0, temp);
   1019         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
   1020             int32_t             firstce  = strsrch->pattern.CE[0];
   1021             UCollationElements *coleiter = strsrch->textIter;
   1022             UErrorCode          status   = U_ZERO_ERROR;
   1023 			int32_t ce;
   1024             setColEIterOffset(coleiter, start);
   1025             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
   1026                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
   1027                     return TRUE;
   1028                 }
   1029             }
   1030             int32_t count = 1;
   1031             while (count < strsrch->pattern.CELength) {
   1032                 if (getCE(strsrch, ucol_next(coleiter, &status))
   1033                     == UCOL_IGNORABLE) {
   1034                     // Thai can give an ignorable here.
   1035                     count --;
   1036                 }
   1037                 if (U_FAILURE(status)) {
   1038                     return TRUE;
   1039                 }
   1040                 count ++;
   1041             }
   1042 
   1043 			ce = ucol_next(coleiter, &status);
   1044             if (U_FAILURE(status)) {
   1045                 return TRUE;
   1046             }
   1047             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1048             	ce = getCE(strsrch, ce);
   1049             }
   1050             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
   1051                 if (ucol_getOffset(coleiter) <= end) {
   1052                     return TRUE;
   1053                 }
   1054                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
   1055                     return TRUE;
   1056                 }
   1057             }
   1058         }
   1059     }
   1060     return FALSE;
   1061 }
   1062 #endif // #if BOYER_MOORE
   1063 
   1064 /**
   1065 * Checks if the offset runs out of the text string
   1066 * @param offset
   1067 * @param textlength of the text string
   1068 * @return TRUE if offset is out of bounds, FALSE otherwise
   1069 */
   1070 static
   1071 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
   1072 {
   1073     return offset < 0 || offset > textlength;
   1074 }
   1075 
   1076 /**
   1077 * Checks for identical match
   1078 * @param strsrch string search data
   1079 * @param start offset of possible match
   1080 * @param end offset of possible match
   1081 * @return TRUE if identical match is found
   1082 */
   1083 static
   1084 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
   1085                                   int32_t    end)
   1086 {
   1087     if (strsrch->strength != UCOL_IDENTICAL) {
   1088         return TRUE;
   1089     }
   1090 
   1091     // Note: We could use Normalizer::compare() or similar, but for short strings
   1092     // which may not be in FCD it might be faster to just NFD them.
   1093     UErrorCode status = U_ZERO_ERROR;
   1094     UnicodeString t2, p2;
   1095     strsrch->nfd->normalize(
   1096         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
   1097     strsrch->nfd->normalize(
   1098         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
   1099     // return FALSE if NFD failed
   1100     return U_SUCCESS(status) && t2 == p2;
   1101 }
   1102 
   1103 #if BOYER_MOORE
   1104 /**
   1105 * Checks to see if the match is repeated
   1106 * @param strsrch string search data
   1107 * @param start new match start index
   1108 * @param end new match end index
   1109 * @return TRUE if the the match is repeated, FALSE otherwise
   1110 */
   1111 static
   1112 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
   1113                                 int32_t    start,
   1114                                 int32_t    end)
   1115 {
   1116     int32_t lastmatchindex = strsrch->search->matchedIndex;
   1117     UBool       result;
   1118     if (lastmatchindex == USEARCH_DONE) {
   1119         return FALSE;
   1120     }
   1121     if (strsrch->search->isForwardSearching) {
   1122         result = start <= lastmatchindex;
   1123     }
   1124     else {
   1125         result = start >= lastmatchindex;
   1126     }
   1127     if (!result && !strsrch->search->isOverlap) {
   1128         if (strsrch->search->isForwardSearching) {
   1129             result = start < lastmatchindex + strsrch->search->matchedLength;
   1130         }
   1131         else {
   1132             result = end > lastmatchindex;
   1133         }
   1134     }
   1135     return result;
   1136 }
   1137 
   1138 /**
   1139 * Gets the collation element iterator's current offset.
   1140 * @param coleiter collation element iterator
   1141 * @param forwards flag TRUE if we are moving in th forwards direction
   1142 * @return current offset
   1143 */
   1144 static
   1145 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
   1146                                               UBool               forwards)
   1147 {
   1148     int32_t result = ucol_getOffset(coleiter);
   1149     // intricacies of the the backwards collation element iterator
   1150     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
   1151         result ++;
   1152     }
   1153     return result;
   1154 }
   1155 
   1156 /**
   1157 * Checks match for contraction.
   1158 * If the match ends with a partial contraction we fail.
   1159 * If the match starts too far off (because of backwards iteration) we try to
   1160 * chip off the extra characters depending on whether a breakiterator has
   1161 * been used.
   1162 * Internal method, error assumed to be success, caller has to check status
   1163 * before calling this method.
   1164 * @param strsrch string search data
   1165 * @param start offset of potential match, to be modified if necessary
   1166 * @param end offset of potential match, to be modified if necessary
   1167 * @param status output error status if any
   1168 * @return TRUE if match passes the contraction test, FALSE otherwise
   1169 */
   1170 
   1171 static
   1172 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
   1173                                      int32_t   *start,
   1174                                      int32_t   *end, UErrorCode  *status)
   1175 {
   1176           UCollationElements *coleiter   = strsrch->textIter;
   1177           int32_t             textlength = strsrch->search->textLength;
   1178           int32_t             temp       = *start;
   1179     const UCollator          *collator   = strsrch->collator;
   1180     const UChar              *text       = strsrch->search->text;
   1181     // This part checks if either ends of the match contains potential
   1182     // contraction. If so we'll have to iterate through them
   1183     // The start contraction needs to be checked since ucol_previous dumps
   1184     // all characters till the first safe character into the buffer.
   1185     // *start + 1 is used to test for the unsafe characters instead of *start
   1186     // because ucol_prev takes all unsafe characters till the first safe
   1187     // character ie *start. so by testing *start + 1, we can estimate if
   1188     // excess prefix characters has been included in the potential search
   1189     // results.
   1190     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1191         (*start + 1 < textlength
   1192          && ucol_unsafeCP(text[*start + 1], collator))) {
   1193         int32_t expansion  = getExpansionPrefix(coleiter);
   1194         UBool   expandflag = expansion > 0;
   1195         setColEIterOffset(coleiter, *start);
   1196         while (expansion > 0) {
   1197             // getting rid of the redundant ce, caused by setOffset.
   1198             // since backward contraction/expansion may have extra ces if we
   1199             // are in the normalization buffer, hasAccentsBeforeMatch would
   1200             // have taken care of it.
   1201             // E.g. the character \u01FA will have an expansion of 3, but if
   1202             // we are only looking for acute and ring \u030A and \u0301, we'll
   1203             // have to skip the first ce in the expansion buffer.
   1204             ucol_next(coleiter, status);
   1205             if (U_FAILURE(*status)) {
   1206                 return FALSE;
   1207             }
   1208             if (ucol_getOffset(coleiter) != temp) {
   1209                 *start = temp;
   1210                 temp  = ucol_getOffset(coleiter);
   1211             }
   1212             expansion --;
   1213         }
   1214 
   1215         int32_t  *patternce       = strsrch->pattern.CE;
   1216         int32_t   patterncelength = strsrch->pattern.CELength;
   1217         int32_t   count           = 0;
   1218         while (count < patterncelength) {
   1219             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1220             if (ce == UCOL_IGNORABLE) {
   1221                 continue;
   1222             }
   1223             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1224                 *start = temp;
   1225                 temp   = ucol_getOffset(coleiter);
   1226             }
   1227             if (U_FAILURE(*status) || ce != patternce[count]) {
   1228                 (*end) ++;
   1229                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1230                 return FALSE;
   1231             }
   1232             count ++;
   1233         }
   1234     }
   1235     return TRUE;
   1236 }
   1237 
   1238 /**
   1239 * Checks and sets the match information if found.
   1240 * Checks
   1241 * <ul>
   1242 * <li> the potential match does not repeat the previous match
   1243 * <li> boundaries are correct
   1244 * <li> exact matches has no extra accents
   1245 * <li> identical matchesb
   1246 * <li> potential match does not end in the middle of a contraction
   1247 * <\ul>
   1248 * Otherwise the offset will be shifted to the next character.
   1249 * Internal method, status assumed to be success, caller has to check status
   1250 * before calling this method.
   1251 * @param strsrch string search data
   1252 * @param textoffset offset in the collation element text. the returned value
   1253 *        will be the truncated end offset of the match or the new start
   1254 *        search offset.
   1255 * @param status output error status if any
   1256 * @return TRUE if the match is valid, FALSE otherwise
   1257 */
   1258 static
   1259 inline UBool checkNextExactMatch(UStringSearch *strsrch,
   1260                                  int32_t   *textoffset, UErrorCode *status)
   1261 {
   1262     UCollationElements *coleiter = strsrch->textIter;
   1263     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
   1264 
   1265     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
   1266         return FALSE;
   1267     }
   1268 
   1269     // this totally matches, however we need to check if it is repeating
   1270     if (!isBreakUnit(strsrch, start, *textoffset) ||
   1271         checkRepeatedMatch(strsrch, start, *textoffset) ||
   1272         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
   1273         !checkIdentical(strsrch, start, *textoffset) ||
   1274         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
   1275 
   1276         (*textoffset) ++;
   1277         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
   1278         return FALSE;
   1279     }
   1280 
   1281     //Add breakiterator boundary check for primary strength search.
   1282     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   1283     	checkBreakBoundary(strsrch, &start, textoffset);
   1284     }
   1285 
   1286     // totally match, we will get rid of the ending ignorables.
   1287     strsrch->search->matchedIndex  = start;
   1288     strsrch->search->matchedLength = *textoffset - start;
   1289     return TRUE;
   1290 }
   1291 
   1292 /**
   1293 * Getting the previous base character offset, or the current offset if the
   1294 * current character is a base character
   1295 * @param text string
   1296 * @param textoffset one offset after the current character
   1297 * @return the offset of the next character after the base character or the first
   1298 *         composed character with accents
   1299 */
   1300 static
   1301 inline int32_t getPreviousBaseOffset(const UChar       *text,
   1302                                                int32_t  textoffset)
   1303 {
   1304     if (textoffset > 0) {
   1305         for (;;) {
   1306             int32_t result = textoffset;
   1307             UTF_BACK_1(text, 0, textoffset);
   1308             int32_t temp = textoffset;
   1309             uint16_t fcd = getFCD(text, &temp, result);
   1310             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   1311                 if (fcd & LAST_BYTE_MASK_) {
   1312                     return textoffset;
   1313                 }
   1314                 return result;
   1315             }
   1316             if (textoffset == 0) {
   1317                 return 0;
   1318             }
   1319         }
   1320     }
   1321     return textoffset;
   1322 }
   1323 
   1324 /**
   1325 * Getting the indexes of the accents that are not blocked in the argument
   1326 * accent array
   1327 * @param accents array of accents in nfd terminated by a 0.
   1328 * @param accentsindex array of indexes of the accents that are not blocked
   1329 */
   1330 static
   1331 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
   1332 {
   1333     int32_t index     = 0;
   1334     int32_t     length    = u_strlen(accents);
   1335     UChar32     codepoint = 0;
   1336     int         cclass    = 0;
   1337     int         result    = 0;
   1338     int32_t temp;
   1339     while (index < length) {
   1340         temp = index;
   1341         UTF_NEXT_CHAR(accents, index, length, codepoint);
   1342         if (u_getCombiningClass(codepoint) != cclass) {
   1343             cclass        = u_getCombiningClass(codepoint);
   1344             accentsindex[result] = temp;
   1345             result ++;
   1346         }
   1347     }
   1348     accentsindex[result] = length;
   1349     return result;
   1350 }
   1351 
   1352 /**
   1353 * Appends 3 UChar arrays to a destination array.
   1354 * Creates a new array if we run out of space. The caller will have to
   1355 * manually deallocate the newly allocated array.
   1356 * Internal method, status assumed to be success, caller has to check status
   1357 * before calling this method. destination not to be NULL and has at least
   1358 * size destinationlength.
   1359 * @param destination target array
   1360 * @param destinationlength target array size, returning the appended length
   1361 * @param source1 null-terminated first array
   1362 * @param source2 second array
   1363 * @param source2length length of seond array
   1364 * @param source3 null-terminated third array
   1365 * @param status error status if any
   1366 * @return new destination array, destination if there was no new allocation
   1367 */
   1368 static
   1369 inline UChar * addToUCharArray(      UChar      *destination,
   1370                                      int32_t    *destinationlength,
   1371                                const UChar      *source1,
   1372                                const UChar      *source2,
   1373                                      int32_t     source2length,
   1374                                const UChar      *source3,
   1375                                      UErrorCode *status)
   1376 {
   1377     int32_t source1length = source1 ? u_strlen(source1) : 0;
   1378     int32_t source3length = source3 ? u_strlen(source3) : 0;
   1379     if (*destinationlength < source1length + source2length + source3length +
   1380                                                                            1)
   1381     {
   1382         destination = (UChar *)allocateMemory(
   1383           (source1length + source2length + source3length + 1) * sizeof(UChar),
   1384           status);
   1385         // if error allocating memory, status will be
   1386         // U_MEMORY_ALLOCATION_ERROR
   1387         if (U_FAILURE(*status)) {
   1388             *destinationlength = 0;
   1389             return NULL;
   1390         }
   1391     }
   1392     if (source1length != 0) {
   1393         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
   1394     }
   1395     if (source2length != 0) {
   1396         uprv_memcpy(destination + source1length, source2,
   1397                     sizeof(UChar) * source2length);
   1398     }
   1399     if (source3length != 0) {
   1400         uprv_memcpy(destination + source1length + source2length, source3,
   1401                     sizeof(UChar) * source3length);
   1402     }
   1403     *destinationlength = source1length + source2length + source3length;
   1404     return destination;
   1405 }
   1406 
   1407 /**
   1408 * Running through a collation element iterator to see if the contents matches
   1409 * pattern in string search data
   1410 * @param strsrch string search data
   1411 * @param coleiter collation element iterator
   1412 * @return TRUE if a match if found, FALSE otherwise
   1413 */
   1414 static
   1415 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
   1416                                        UCollationElements *coleiter)
   1417 {
   1418     int         patternceindex = strsrch->pattern.CELength;
   1419     int32_t    *patternce      = strsrch->pattern.CE;
   1420     UErrorCode  status = U_ZERO_ERROR;
   1421     while (patternceindex > 0) {
   1422         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
   1423         if (ce == UCOL_IGNORABLE) {
   1424             continue;
   1425         }
   1426         if (U_FAILURE(status) || ce != *patternce) {
   1427             return FALSE;
   1428         }
   1429         patternce ++;
   1430         patternceindex --;
   1431     }
   1432     return TRUE;
   1433 }
   1434 
   1435 /**
   1436 * Rearranges the front accents to try matching.
   1437 * Prefix accents in the text will be grouped according to their combining
   1438 * class and the groups will be mixed and matched to try find the perfect
   1439 * match with the pattern.
   1440 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1441 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1442 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1443 *         "\u0301\u0325".
   1444 * step 2: check if any of the generated substrings matches the pattern.
   1445 * Internal method, status is assumed to be success, caller has to check status
   1446 * before calling this method.
   1447 * @param strsrch string search match
   1448 * @param start first offset of the accents to start searching
   1449 * @param end start of the last accent set
   1450 * @param status output error status if any
   1451 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1452 *         offset of the match. Note this start includes all preceding accents.
   1453 */
   1454 static
   1455 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
   1456                                        int32_t    start,
   1457                                        int32_t    end,
   1458                                        UErrorCode    *status)
   1459 {
   1460     const UChar       *text       = strsrch->search->text;
   1461           int32_t      textlength = strsrch->search->textLength;
   1462           int32_t  tempstart  = start;
   1463 
   1464     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
   1465         // die... failed at a base character
   1466         return USEARCH_DONE;
   1467     }
   1468 
   1469     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
   1470     start = getPreviousBaseOffset(text, tempstart);
   1471 
   1472     UChar       accents[INITIAL_ARRAY_SIZE_];
   1473     // normalizing the offensive string
   1474     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
   1475                     INITIAL_ARRAY_SIZE_, status);
   1476     if (U_FAILURE(*status)) {
   1477         return USEARCH_DONE;
   1478     }
   1479 
   1480     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   1481     int32_t         accentsize = getUnblockedAccentIndex(accents,
   1482                                                                  accentsindex);
   1483     int32_t         count      = (2 << (accentsize - 1)) - 1;
   1484     UChar               buffer[INITIAL_ARRAY_SIZE_];
   1485     UCollationElements *coleiter   = strsrch->utilIter;
   1486     while (U_SUCCESS(*status) && count > 0) {
   1487         UChar *rearrange = strsrch->canonicalPrefixAccents;
   1488         // copy the base characters
   1489         for (int k = 0; k < accentsindex[0]; k ++) {
   1490             *rearrange ++ = accents[k];
   1491         }
   1492         // forming all possible canonical rearrangement by dropping
   1493         // sets of accents
   1494         for (int i = 0; i <= accentsize - 1; i ++) {
   1495             int32_t mask = 1 << (accentsize - i - 1);
   1496             if (count & mask) {
   1497                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1498                     *rearrange ++ = accents[j];
   1499                 }
   1500             }
   1501         }
   1502         *rearrange = 0;
   1503         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   1504         UChar   *match     = addToUCharArray(buffer, &matchsize,
   1505                                            strsrch->canonicalPrefixAccents,
   1506                                            strsrch->search->text + offset,
   1507                                            end - offset,
   1508                                            strsrch->canonicalSuffixAccents,
   1509                                            status);
   1510 
   1511         // if status is a failure, ucol_setText does nothing.
   1512         // run the collator iterator through this match
   1513         ucol_setText(coleiter, match, matchsize, status);
   1514         if (U_SUCCESS(*status)) {
   1515             if (checkCollationMatch(strsrch, coleiter)) {
   1516                 if (match != buffer) {
   1517                     uprv_free(match);
   1518                 }
   1519                 return start;
   1520             }
   1521         }
   1522         count --;
   1523     }
   1524     return USEARCH_DONE;
   1525 }
   1526 
   1527 /**
   1528 * Gets the offset to the safe point in text before textoffset.
   1529 * ie. not the middle of a contraction, swappable characters or supplementary
   1530 * characters.
   1531 * @param collator collation sata
   1532 * @param text string to work with
   1533 * @param textoffset offset in string
   1534 * @param textlength length of text string
   1535 * @return offset to the previous safe character
   1536 */
   1537 static
   1538 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
   1539                                       const UChar       *text,
   1540                                             int32_t  textoffset)
   1541 {
   1542     int32_t result = textoffset; // first contraction character
   1543     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
   1544         result --;
   1545     }
   1546     if (result != 0) {
   1547         // the first contraction character is consider unsafe here
   1548         result --;
   1549     }
   1550     return result;
   1551 }
   1552 
   1553 /**
   1554 * Cleaning up after we passed the safe zone
   1555 * @param strsrch string search data
   1556 * @param safetext safe text array
   1557 * @param safebuffer safe text buffer
   1558 * @param coleiter collation element iterator for safe text
   1559 */
   1560 static
   1561 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
   1562                                   UChar         *safebuffer)
   1563 {
   1564     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
   1565     {
   1566        uprv_free(safetext);
   1567     }
   1568 }
   1569 
   1570 /**
   1571 * Take the rearranged end accents and tries matching. If match failed at
   1572 * a seperate preceding set of accents (seperated from the rearranged on by
   1573 * at least a base character) then we rearrange the preceding accents and
   1574 * tries matching again.
   1575 * We allow skipping of the ends of the accent set if the ces do not match.
   1576 * However if the failure is found before the accent set, it fails.
   1577 * Internal method, status assumed to be success, caller has to check status
   1578 * before calling this method.
   1579 * @param strsrch string search data
   1580 * @param textoffset of the start of the rearranged accent
   1581 * @param status output error status if any
   1582 * @return USEARCH_DONE if a match is not found, otherwise return the starting
   1583 *         offset of the match. Note this start includes all preceding accents.
   1584 */
   1585 static
   1586 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
   1587                                        int32_t    textoffset,
   1588                                        UErrorCode    *status)
   1589 {
   1590     const UChar              *text           = strsrch->search->text;
   1591     const UCollator          *collator       = strsrch->collator;
   1592           int32_t             safelength     = 0;
   1593           UChar              *safetext;
   1594           int32_t             safetextlength;
   1595           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
   1596           UCollationElements *coleiter       = strsrch->utilIter;
   1597           int32_t         safeoffset     = textoffset;
   1598 
   1599     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
   1600                                          collator)) {
   1601         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
   1602         safelength     = textoffset - safeoffset;
   1603         safetextlength = INITIAL_ARRAY_SIZE_;
   1604         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
   1605                                          text + safeoffset, safelength,
   1606                                          strsrch->canonicalSuffixAccents,
   1607                                          status);
   1608     }
   1609     else {
   1610         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
   1611         safetext       = strsrch->canonicalSuffixAccents;
   1612     }
   1613 
   1614     // if status is a failure, ucol_setText does nothing
   1615     ucol_setText(coleiter, safetext, safetextlength, status);
   1616     // status checked in loop below
   1617 
   1618     int32_t  *ce        = strsrch->pattern.CE;
   1619     int32_t   celength  = strsrch->pattern.CELength;
   1620     int       ceindex   = celength - 1;
   1621     UBool     isSafe    = TRUE; // indication flag for position in safe zone
   1622 
   1623     while (ceindex >= 0) {
   1624         int32_t textce = ucol_previous(coleiter, status);
   1625         if (U_FAILURE(*status)) {
   1626             if (isSafe) {
   1627                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1628             }
   1629             return USEARCH_DONE;
   1630         }
   1631         if (textce == UCOL_NULLORDER) {
   1632             // check if we have passed the safe buffer
   1633             if (coleiter == strsrch->textIter) {
   1634                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1635                 return USEARCH_DONE;
   1636             }
   1637             cleanUpSafeText(strsrch, safetext, safebuffer);
   1638             safetext = safebuffer;
   1639             coleiter = strsrch->textIter;
   1640             setColEIterOffset(coleiter, safeoffset);
   1641             // status checked at the start of the loop
   1642             isSafe = FALSE;
   1643             continue;
   1644         }
   1645         textce = getCE(strsrch, textce);
   1646         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   1647             // do the beginning stuff
   1648             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
   1649             if (isSafe && failedoffset >= safelength) {
   1650                 // alas... no hope. failed at rearranged accent set
   1651                 cleanUpSafeText(strsrch, safetext, safebuffer);
   1652                 return USEARCH_DONE;
   1653             }
   1654             else {
   1655                 if (isSafe) {
   1656                     failedoffset += safeoffset;
   1657                     cleanUpSafeText(strsrch, safetext, safebuffer);
   1658                 }
   1659 
   1660                 // try rearranging the front accents
   1661                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
   1662                                         failedoffset, textoffset, status);
   1663                 if (result != USEARCH_DONE) {
   1664                     // if status is a failure, ucol_setOffset does nothing
   1665                     setColEIterOffset(strsrch->textIter, result);
   1666                 }
   1667                 if (U_FAILURE(*status)) {
   1668                     return USEARCH_DONE;
   1669                 }
   1670                 return result;
   1671             }
   1672         }
   1673         if (textce == ce[ceindex]) {
   1674             ceindex --;
   1675         }
   1676     }
   1677     // set offset here
   1678     if (isSafe) {
   1679         int32_t result     = getColElemIterOffset(coleiter, FALSE);
   1680         // sets the text iterator here with the correct expansion and offset
   1681         int32_t    leftoverces = getExpansionPrefix(coleiter);
   1682         cleanUpSafeText(strsrch, safetext, safebuffer);
   1683         if (result >= safelength) {
   1684             result = textoffset;
   1685         }
   1686         else {
   1687             result += safeoffset;
   1688         }
   1689         setColEIterOffset(strsrch->textIter, result);
   1690         strsrch->textIter->iteratordata_.toReturn =
   1691                        setExpansionPrefix(strsrch->textIter, leftoverces);
   1692         return result;
   1693     }
   1694 
   1695     return ucol_getOffset(coleiter);
   1696 }
   1697 
   1698 /**
   1699 * Trying out the substring and sees if it can be a canonical match.
   1700 * This will try normalizing the end accents and arranging them into canonical
   1701 * equivalents and check their corresponding ces with the pattern ce.
   1702 * Suffix accents in the text will be grouped according to their combining
   1703 * class and the groups will be mixed and matched to try find the perfect
   1704 * match with the pattern.
   1705 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   1706 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   1707 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   1708 *         "\u0301\u0325".
   1709 * step 2: check if any of the generated substrings matches the pattern.
   1710 * Internal method, status assumed to be success, caller has to check status
   1711 * before calling this method.
   1712 * @param strsrch string search data
   1713 * @param textoffset end offset in the collation element text that ends with
   1714 *                   the accents to be rearranged
   1715 * @param status error status if any
   1716 * @return TRUE if the match is valid, FALSE otherwise
   1717 */
   1718 static
   1719 UBool doNextCanonicalMatch(UStringSearch *strsrch,
   1720                            int32_t    textoffset,
   1721                            UErrorCode    *status)
   1722 {
   1723     const UChar       *text = strsrch->search->text;
   1724           int32_t  temp = textoffset;
   1725     UTF_BACK_1(text, 0, temp);
   1726     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
   1727         UCollationElements *coleiter = strsrch->textIter;
   1728         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
   1729         if (strsrch->pattern.hasPrefixAccents) {
   1730             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
   1731                                                 status);
   1732             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   1733                 setColEIterOffset(coleiter, offset);
   1734                 return TRUE;
   1735             }
   1736         }
   1737         return FALSE;
   1738     }
   1739 
   1740     if (!strsrch->pattern.hasSuffixAccents) {
   1741         return FALSE;
   1742     }
   1743 
   1744     UChar       accents[INITIAL_ARRAY_SIZE_];
   1745     // offset to the last base character in substring to search
   1746     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
   1747     // normalizing the offensive string
   1748     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
   1749                                0, accents, INITIAL_ARRAY_SIZE_, status);
   1750     // status checked in loop below
   1751 
   1752     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   1753     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   1754 
   1755     // 2 power n - 1 plus the full set of accents
   1756     int32_t  count = (2 << (size - 1)) - 1;
   1757     while (U_SUCCESS(*status) && count > 0) {
   1758         UChar *rearrange = strsrch->canonicalSuffixAccents;
   1759         // copy the base characters
   1760         for (int k = 0; k < accentsindex[0]; k ++) {
   1761             *rearrange ++ = accents[k];
   1762         }
   1763         // forming all possible canonical rearrangement by dropping
   1764         // sets of accents
   1765         for (int i = 0; i <= size - 1; i ++) {
   1766             int32_t mask = 1 << (size - i - 1);
   1767             if (count & mask) {
   1768                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   1769                     *rearrange ++ = accents[j];
   1770                 }
   1771             }
   1772         }
   1773         *rearrange = 0;
   1774         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
   1775                                                         status);
   1776         if (offset != USEARCH_DONE) {
   1777             return TRUE; // match found
   1778         }
   1779         count --;
   1780     }
   1781     return FALSE;
   1782 }
   1783 
   1784 /**
   1785 * Gets the previous base character offset depending on the string search
   1786 * pattern data
   1787 * @param strsrch string search data
   1788 * @param textoffset current offset, current character
   1789 * @return the offset of the next character after this base character or itself
   1790 *         if it is a composed character with accents
   1791 */
   1792 static
   1793 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
   1794                                                       int32_t textoffset)
   1795 {
   1796     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
   1797         const UChar       *text = strsrch->search->text;
   1798               int32_t  offset = textoffset;
   1799         if (getFCD(text, &offset, strsrch->search->textLength) >>
   1800                                                    SECOND_LAST_BYTE_SHIFT_) {
   1801             return getPreviousBaseOffset(text, textoffset);
   1802         }
   1803     }
   1804     return textoffset;
   1805 }
   1806 
   1807 /**
   1808 * Checks match for contraction.
   1809 * If the match ends with a partial contraction we fail.
   1810 * If the match starts too far off (because of backwards iteration) we try to
   1811 * chip off the extra characters
   1812 * Internal method, status assumed to be success, caller has to check status
   1813 * before calling this method.
   1814 * @param strsrch string search data
   1815 * @param start offset of potential match, to be modified if necessary
   1816 * @param end offset of potential match, to be modified if necessary
   1817 * @param status output error status if any
   1818 * @return TRUE if match passes the contraction test, FALSE otherwise
   1819 */
   1820 static
   1821 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
   1822                                          int32_t   *start,
   1823                                          int32_t   *end,
   1824                                          UErrorCode    *status)
   1825 {
   1826           UCollationElements *coleiter   = strsrch->textIter;
   1827           int32_t             textlength = strsrch->search->textLength;
   1828           int32_t         temp       = *start;
   1829     const UCollator          *collator   = strsrch->collator;
   1830     const UChar              *text       = strsrch->search->text;
   1831     // This part checks if either ends of the match contains potential
   1832     // contraction. If so we'll have to iterate through them
   1833     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
   1834         (*start + 1 < textlength
   1835          && ucol_unsafeCP(text[*start + 1], collator))) {
   1836         int32_t expansion  = getExpansionPrefix(coleiter);
   1837         UBool   expandflag = expansion > 0;
   1838         setColEIterOffset(coleiter, *start);
   1839         while (expansion > 0) {
   1840             // getting rid of the redundant ce, caused by setOffset.
   1841             // since backward contraction/expansion may have extra ces if we
   1842             // are in the normalization buffer, hasAccentsBeforeMatch would
   1843             // have taken care of it.
   1844             // E.g. the character \u01FA will have an expansion of 3, but if
   1845             // we are only looking for acute and ring \u030A and \u0301, we'll
   1846             // have to skip the first ce in the expansion buffer.
   1847             ucol_next(coleiter, status);
   1848             if (U_FAILURE(*status)) {
   1849                 return FALSE;
   1850             }
   1851             if (ucol_getOffset(coleiter) != temp) {
   1852                 *start = temp;
   1853                 temp  = ucol_getOffset(coleiter);
   1854             }
   1855             expansion --;
   1856         }
   1857 
   1858         int32_t  *patternce       = strsrch->pattern.CE;
   1859         int32_t   patterncelength = strsrch->pattern.CELength;
   1860         int32_t   count           = 0;
   1861         int32_t   textlength      = strsrch->search->textLength;
   1862         while (count < patterncelength) {
   1863             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
   1864             // status checked below, note that if status is a failure
   1865             // ucol_next returns UCOL_NULLORDER
   1866             if (ce == UCOL_IGNORABLE) {
   1867                 continue;
   1868             }
   1869             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
   1870                 *start = temp;
   1871                 temp   = ucol_getOffset(coleiter);
   1872             }
   1873 
   1874             if (count == 0 && ce != patternce[0]) {
   1875                 // accents may have extra starting ces, this occurs when a
   1876                 // pure accent pattern is matched without rearrangement
   1877                 // text \u0325\u0300 and looking for \u0300
   1878                 int32_t expected = patternce[0];
   1879                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
   1880                     ce = getCE(strsrch, ucol_next(coleiter, status));
   1881                     while (U_SUCCESS(*status) && ce != expected &&
   1882                            ce != UCOL_NULLORDER &&
   1883                            ucol_getOffset(coleiter) <= *end) {
   1884                         ce = getCE(strsrch, ucol_next(coleiter, status));
   1885                     }
   1886                 }
   1887             }
   1888             if (U_FAILURE(*status) || ce != patternce[count]) {
   1889                 (*end) ++;
   1890                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
   1891                 return FALSE;
   1892             }
   1893             count ++;
   1894         }
   1895     }
   1896     return TRUE;
   1897 }
   1898 
   1899 /**
   1900 * Checks and sets the match information if found.
   1901 * Checks
   1902 * <ul>
   1903 * <li> the potential match does not repeat the previous match
   1904 * <li> boundaries are correct
   1905 * <li> potential match does not end in the middle of a contraction
   1906 * <li> identical matches
   1907 * <\ul>
   1908 * Otherwise the offset will be shifted to the next character.
   1909 * Internal method, status assumed to be success, caller has to check the
   1910 * status before calling this method.
   1911 * @param strsrch string search data
   1912 * @param textoffset offset in the collation element text. the returned value
   1913 *        will be the truncated end offset of the match or the new start
   1914 *        search offset.
   1915 * @param status output error status if any
   1916 * @return TRUE if the match is valid, FALSE otherwise
   1917 */
   1918 static
   1919 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
   1920                                      int32_t   *textoffset,
   1921                                      UErrorCode    *status)
   1922 {
   1923     // to ensure that the start and ends are not composite characters
   1924     UCollationElements *coleiter = strsrch->textIter;
   1925     // if we have a canonical accent match
   1926     if ((strsrch->pattern.hasSuffixAccents &&
   1927         strsrch->canonicalSuffixAccents[0]) ||
   1928         (strsrch->pattern.hasPrefixAccents &&
   1929         strsrch->canonicalPrefixAccents[0])) {
   1930         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
   1931                                                     strsrch,
   1932                                                     ucol_getOffset(coleiter));
   1933         strsrch->search->matchedLength = *textoffset -
   1934                                                 strsrch->search->matchedIndex;
   1935         return TRUE;
   1936     }
   1937 
   1938     int32_t start = getColElemIterOffset(coleiter, FALSE);
   1939     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
   1940                                             status) || U_FAILURE(*status)) {
   1941         return FALSE;
   1942     }
   1943 
   1944     start = getPreviousUStringSearchBaseOffset(strsrch, start);
   1945     // this totally matches, however we need to check if it is repeating
   1946     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
   1947         !isBreakUnit(strsrch, start, *textoffset) ||
   1948         !checkIdentical(strsrch, start, *textoffset)) {
   1949         (*textoffset) ++;
   1950         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
   1951                                         strsrch->search->textLength);
   1952         return FALSE;
   1953     }
   1954 
   1955     strsrch->search->matchedIndex  = start;
   1956     strsrch->search->matchedLength = *textoffset - start;
   1957     return TRUE;
   1958 }
   1959 
   1960 /**
   1961 * Shifting the collation element iterator position forward to prepare for
   1962 * a preceding match. If the first character is a unsafe character, we'll only
   1963 * shift by 1 to capture contractions, normalization etc.
   1964 * Internal method, status assumed to be success, caller has to check status
   1965 * before calling this method.
   1966 * @param text strsrch string search data
   1967 * @param textoffset start text position to do search
   1968 * @param ce the text ce which failed the match.
   1969 * @param patternceindex index of the ce within the pattern ce buffer which
   1970 *        failed the match
   1971 * @return final offset
   1972 */
   1973 static
   1974 inline int32_t reverseShift(UStringSearch *strsrch,
   1975                                 int32_t    textoffset,
   1976                                 int32_t       ce,
   1977                                 int32_t        patternceindex)
   1978 {
   1979     if (strsrch->search->isOverlap) {
   1980         if (textoffset != strsrch->search->textLength) {
   1981             textoffset --;
   1982         }
   1983         else {
   1984             textoffset -= strsrch->pattern.defaultShiftSize;
   1985         }
   1986     }
   1987     else {
   1988         if (ce != UCOL_NULLORDER) {
   1989             int32_t shift = strsrch->pattern.backShift[hash(ce)];
   1990 
   1991             // this is to adjust for characters in the middle of the substring
   1992             // for matching that failed.
   1993             int32_t adjust = patternceindex;
   1994             if (adjust > 1 && shift > adjust) {
   1995                 shift -= adjust - 1;
   1996             }
   1997             textoffset -= shift;
   1998         }
   1999         else {
   2000             textoffset -= strsrch->pattern.defaultShiftSize;
   2001         }
   2002     }
   2003     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
   2004     return textoffset;
   2005 }
   2006 
   2007 /**
   2008 * Checks match for contraction.
   2009 * If the match starts with a partial contraction we fail.
   2010 * Internal method, status assumed to be success, caller has to check status
   2011 * before calling this method.
   2012 * @param strsrch string search data
   2013 * @param start offset of potential match, to be modified if necessary
   2014 * @param end offset of potential match, to be modified if necessary
   2015 * @param status output error status if any
   2016 * @return TRUE if match passes the contraction test, FALSE otherwise
   2017 */
   2018 static
   2019 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
   2020                                      int32_t   *start,
   2021                                      int32_t   *end, UErrorCode  *status)
   2022 {
   2023           UCollationElements *coleiter   = strsrch->textIter;
   2024           int32_t             textlength = strsrch->search->textLength;
   2025           int32_t             temp       = *end;
   2026     const UCollator          *collator   = strsrch->collator;
   2027     const UChar              *text       = strsrch->search->text;
   2028     // This part checks if either if the start of the match contains potential
   2029     // contraction. If so we'll have to iterate through them
   2030     // Since we used ucol_next while previously looking for the potential
   2031     // match, this guarantees that our end will not be a partial contraction,
   2032     // or a partial supplementary character.
   2033     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2034         int32_t expansion  = getExpansionSuffix(coleiter);
   2035         UBool   expandflag = expansion > 0;
   2036         setColEIterOffset(coleiter, *end);
   2037         while (U_SUCCESS(*status) && expansion > 0) {
   2038             // getting rid of the redundant ce
   2039             // since forward contraction/expansion may have extra ces
   2040             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2041             // would have taken care of it.
   2042             // E.g. the character \u01FA will have an expansion of 3, but if
   2043             // we are only looking for A ring A\u030A, we'll have to skip the
   2044             // last ce in the expansion buffer
   2045             ucol_previous(coleiter, status);
   2046             if (U_FAILURE(*status)) {
   2047                 return FALSE;
   2048             }
   2049             if (ucol_getOffset(coleiter) != temp) {
   2050                 *end = temp;
   2051                 temp  = ucol_getOffset(coleiter);
   2052             }
   2053             expansion --;
   2054         }
   2055 
   2056         int32_t  *patternce       = strsrch->pattern.CE;
   2057         int32_t   patterncelength = strsrch->pattern.CELength;
   2058         int32_t   count           = patterncelength;
   2059         while (count > 0) {
   2060             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2061             // status checked below, note that if status is a failure
   2062             // ucol_previous returns UCOL_NULLORDER
   2063             if (ce == UCOL_IGNORABLE) {
   2064                 continue;
   2065             }
   2066             if (expandflag && count == 0 &&
   2067                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2068                 *end = temp;
   2069                 temp  = ucol_getOffset(coleiter);
   2070             }
   2071             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2072                 (*start) --;
   2073                 *start = getPreviousBaseOffset(text, *start);
   2074                 return FALSE;
   2075             }
   2076             count --;
   2077         }
   2078     }
   2079     return TRUE;
   2080 }
   2081 
   2082 /**
   2083 * Checks and sets the match information if found.
   2084 * Checks
   2085 * <ul>
   2086 * <li> the current match does not repeat the last match
   2087 * <li> boundaries are correct
   2088 * <li> exact matches has no extra accents
   2089 * <li> identical matches
   2090 * <\ul>
   2091 * Otherwise the offset will be shifted to the preceding character.
   2092 * Internal method, status assumed to be success, caller has to check status
   2093 * before calling this method.
   2094 * @param strsrch string search data
   2095 * @param collator
   2096 * @param coleiter collation element iterator
   2097 * @param text string
   2098 * @param textoffset offset in the collation element text. the returned value
   2099 *        will be the truncated start offset of the match or the new start
   2100 *        search offset.
   2101 * @param status output error status if any
   2102 * @return TRUE if the match is valid, FALSE otherwise
   2103 */
   2104 static
   2105 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
   2106                                      int32_t   *textoffset,
   2107                                      UErrorCode    *status)
   2108 {
   2109     // to ensure that the start and ends are not composite characters
   2110     int32_t end = ucol_getOffset(strsrch->textIter);
   2111     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
   2112         || U_FAILURE(*status)) {
   2113             return FALSE;
   2114     }
   2115 
   2116     // this totally matches, however we need to check if it is repeating
   2117     // the old match
   2118     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2119         !isBreakUnit(strsrch, *textoffset, end) ||
   2120         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
   2121         !checkIdentical(strsrch, *textoffset, end) ||
   2122         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
   2123         (*textoffset) --;
   2124         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2125                                             *textoffset);
   2126         return FALSE;
   2127     }
   2128 
   2129     //Add breakiterator boundary check for primary strength search.
   2130     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
   2131     	checkBreakBoundary(strsrch, textoffset, &end);
   2132     }
   2133 
   2134     strsrch->search->matchedIndex = *textoffset;
   2135     strsrch->search->matchedLength = end - *textoffset;
   2136     return TRUE;
   2137 }
   2138 
   2139 /**
   2140 * Rearranges the end accents to try matching.
   2141 * Suffix accents in the text will be grouped according to their combining
   2142 * class and the groups will be mixed and matched to try find the perfect
   2143 * match with the pattern.
   2144 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2145 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2146 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2147 *         "\u0301\u0325".
   2148 * step 2: check if any of the generated substrings matches the pattern.
   2149 * Internal method, status assumed to be success, user has to check status
   2150 * before calling this method.
   2151 * @param strsrch string search match
   2152 * @param start offset of the first base character
   2153 * @param end start of the last accent set
   2154 * @param status only error status if any
   2155 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2156 *         offset of the match. Note this start includes all following accents.
   2157 */
   2158 static
   2159 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
   2160                                            int32_t    start,
   2161                                            int32_t    end,
   2162                                            UErrorCode    *status)
   2163 {
   2164     const UChar       *text       = strsrch->search->text;
   2165           int32_t  tempend    = end;
   2166 
   2167     UTF_BACK_1(text, 0, tempend);
   2168     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
   2169                                                            LAST_BYTE_MASK_)) {
   2170         // die... failed at a base character
   2171         return USEARCH_DONE;
   2172     }
   2173     end = getNextBaseOffset(text, end, strsrch->search->textLength);
   2174 
   2175     if (U_SUCCESS(*status)) {
   2176         UChar       accents[INITIAL_ARRAY_SIZE_];
   2177         int32_t offset = getPreviousBaseOffset(text, end);
   2178         // normalizing the offensive string
   2179         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
   2180                         INITIAL_ARRAY_SIZE_, status);
   2181 
   2182         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
   2183         int32_t         accentsize = getUnblockedAccentIndex(accents,
   2184                                                          accentsindex);
   2185         int32_t         count      = (2 << (accentsize - 1)) - 1;
   2186         UChar               buffer[INITIAL_ARRAY_SIZE_];
   2187         UCollationElements *coleiter = strsrch->utilIter;
   2188         while (U_SUCCESS(*status) && count > 0) {
   2189             UChar *rearrange = strsrch->canonicalSuffixAccents;
   2190             // copy the base characters
   2191             for (int k = 0; k < accentsindex[0]; k ++) {
   2192                 *rearrange ++ = accents[k];
   2193             }
   2194             // forming all possible canonical rearrangement by dropping
   2195             // sets of accents
   2196             for (int i = 0; i <= accentsize - 1; i ++) {
   2197                 int32_t mask = 1 << (accentsize - i - 1);
   2198                 if (count & mask) {
   2199                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2200                         *rearrange ++ = accents[j];
   2201                     }
   2202                 }
   2203             }
   2204             *rearrange = 0;
   2205             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
   2206             UChar   *match     = addToUCharArray(buffer, &matchsize,
   2207                                            strsrch->canonicalPrefixAccents,
   2208                                            strsrch->search->text + start,
   2209                                            offset - start,
   2210                                            strsrch->canonicalSuffixAccents,
   2211                                            status);
   2212 
   2213             // run the collator iterator through this match
   2214             // if status is a failure ucol_setText does nothing
   2215             ucol_setText(coleiter, match, matchsize, status);
   2216             if (U_SUCCESS(*status)) {
   2217                 if (checkCollationMatch(strsrch, coleiter)) {
   2218                     if (match != buffer) {
   2219                         uprv_free(match);
   2220                     }
   2221                     return end;
   2222                 }
   2223             }
   2224             count --;
   2225         }
   2226     }
   2227     return USEARCH_DONE;
   2228 }
   2229 
   2230 /**
   2231 * Take the rearranged start accents and tries matching. If match failed at
   2232 * a seperate following set of accents (seperated from the rearranged on by
   2233 * at least a base character) then we rearrange the preceding accents and
   2234 * tries matching again.
   2235 * We allow skipping of the ends of the accent set if the ces do not match.
   2236 * However if the failure is found before the accent set, it fails.
   2237 * Internal method, status assumed to be success, caller has to check status
   2238 * before calling this method.
   2239 * @param strsrch string search data
   2240 * @param textoffset of the ends of the rearranged accent
   2241 * @param status output error status if any
   2242 * @return USEARCH_DONE if a match is not found, otherwise return the ending
   2243 *         offset of the match. Note this start includes all following accents.
   2244 */
   2245 static
   2246 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
   2247                                            int32_t    textoffset,
   2248                                            UErrorCode    *status)
   2249 {
   2250     const UChar       *text       = strsrch->search->text;
   2251     const UCollator   *collator   = strsrch->collator;
   2252           int32_t      safelength = 0;
   2253           UChar       *safetext;
   2254           int32_t      safetextlength;
   2255           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
   2256           int32_t  safeoffset = textoffset;
   2257 
   2258     if (textoffset &&
   2259         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
   2260                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
   2261                                          ], collator)) {
   2262         safeoffset     = getNextSafeOffset(collator, text, textoffset,
   2263                                            strsrch->search->textLength);
   2264         safelength     = safeoffset - textoffset;
   2265         safetextlength = INITIAL_ARRAY_SIZE_;
   2266         safetext       = addToUCharArray(safebuffer, &safetextlength,
   2267                                          strsrch->canonicalPrefixAccents,
   2268                                          text + textoffset, safelength,
   2269                                          NULL, status);
   2270     }
   2271     else {
   2272         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
   2273         safetext       = strsrch->canonicalPrefixAccents;
   2274     }
   2275 
   2276     UCollationElements *coleiter = strsrch->utilIter;
   2277      // if status is a failure, ucol_setText does nothing
   2278     ucol_setText(coleiter, safetext, safetextlength, status);
   2279     // status checked in loop below
   2280 
   2281     int32_t  *ce           = strsrch->pattern.CE;
   2282     int32_t   celength     = strsrch->pattern.CELength;
   2283     int       ceindex      = 0;
   2284     UBool     isSafe       = TRUE; // safe zone indication flag for position
   2285     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
   2286 
   2287     while (ceindex < celength) {
   2288         int32_t textce = ucol_next(coleiter, status);
   2289         if (U_FAILURE(*status)) {
   2290             if (isSafe) {
   2291                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2292             }
   2293             return USEARCH_DONE;
   2294         }
   2295         if (textce == UCOL_NULLORDER) {
   2296             // check if we have passed the safe buffer
   2297             if (coleiter == strsrch->textIter) {
   2298                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2299                 return USEARCH_DONE;
   2300             }
   2301             cleanUpSafeText(strsrch, safetext, safebuffer);
   2302             safetext = safebuffer;
   2303             coleiter = strsrch->textIter;
   2304             setColEIterOffset(coleiter, safeoffset);
   2305             // status checked at the start of the loop
   2306             isSafe = FALSE;
   2307             continue;
   2308         }
   2309         textce = getCE(strsrch, textce);
   2310         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
   2311             // do the beginning stuff
   2312             int32_t failedoffset = ucol_getOffset(coleiter);
   2313             if (isSafe && failedoffset <= prefixlength) {
   2314                 // alas... no hope. failed at rearranged accent set
   2315                 cleanUpSafeText(strsrch, safetext, safebuffer);
   2316                 return USEARCH_DONE;
   2317             }
   2318             else {
   2319                 if (isSafe) {
   2320                     failedoffset = safeoffset - failedoffset;
   2321                     cleanUpSafeText(strsrch, safetext, safebuffer);
   2322                 }
   2323 
   2324                 // try rearranging the end accents
   2325                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
   2326                                         textoffset, failedoffset, status);
   2327                 if (result != USEARCH_DONE) {
   2328                     // if status is a failure, ucol_setOffset does nothing
   2329                     setColEIterOffset(strsrch->textIter, result);
   2330                 }
   2331                 if (U_FAILURE(*status)) {
   2332                     return USEARCH_DONE;
   2333                 }
   2334                 return result;
   2335             }
   2336         }
   2337         if (textce == ce[ceindex]) {
   2338             ceindex ++;
   2339         }
   2340     }
   2341     // set offset here
   2342     if (isSafe) {
   2343         int32_t result      = ucol_getOffset(coleiter);
   2344         // sets the text iterator here with the correct expansion and offset
   2345         int32_t     leftoverces = getExpansionSuffix(coleiter);
   2346         cleanUpSafeText(strsrch, safetext, safebuffer);
   2347         if (result <= prefixlength) {
   2348             result = textoffset;
   2349         }
   2350         else {
   2351             result = textoffset + (safeoffset - result);
   2352         }
   2353         setColEIterOffset(strsrch->textIter, result);
   2354         setExpansionSuffix(strsrch->textIter, leftoverces);
   2355         return result;
   2356     }
   2357 
   2358     return ucol_getOffset(coleiter);
   2359 }
   2360 
   2361 /**
   2362 * Trying out the substring and sees if it can be a canonical match.
   2363 * This will try normalizing the starting accents and arranging them into
   2364 * canonical equivalents and check their corresponding ces with the pattern ce.
   2365 * Prefix accents in the text will be grouped according to their combining
   2366 * class and the groups will be mixed and matched to try find the perfect
   2367 * match with the pattern.
   2368 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
   2369 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
   2370 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
   2371 *         "\u0301\u0325".
   2372 * step 2: check if any of the generated substrings matches the pattern.
   2373 * Internal method, status assumed to be success, caller has to check status
   2374 * before calling this method.
   2375 * @param strsrch string search data
   2376 * @param textoffset start offset in the collation element text that starts
   2377 *                   with the accents to be rearranged
   2378 * @param status output error status if any
   2379 * @return TRUE if the match is valid, FALSE otherwise
   2380 */
   2381 static
   2382 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
   2383                                int32_t    textoffset,
   2384                                UErrorCode    *status)
   2385 {
   2386     const UChar       *text       = strsrch->search->text;
   2387           int32_t  temp       = textoffset;
   2388           int32_t      textlength = strsrch->search->textLength;
   2389     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
   2390         UCollationElements *coleiter = strsrch->textIter;
   2391         int32_t         offset   = ucol_getOffset(coleiter);
   2392         if (strsrch->pattern.hasSuffixAccents) {
   2393             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
   2394                                                     offset, status);
   2395             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
   2396                 setColEIterOffset(coleiter, offset);
   2397                 return TRUE;
   2398             }
   2399         }
   2400         return FALSE;
   2401     }
   2402 
   2403     if (!strsrch->pattern.hasPrefixAccents) {
   2404         return FALSE;
   2405     }
   2406 
   2407     UChar       accents[INITIAL_ARRAY_SIZE_];
   2408     // offset to the last base character in substring to search
   2409     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
   2410     // normalizing the offensive string
   2411     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
   2412                                0, accents, INITIAL_ARRAY_SIZE_, status);
   2413     // status checked in loop
   2414 
   2415     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
   2416     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
   2417 
   2418     // 2 power n - 1 plus the full set of accents
   2419     int32_t  count = (2 << (size - 1)) - 1;
   2420     while (U_SUCCESS(*status) && count > 0) {
   2421         UChar *rearrange = strsrch->canonicalPrefixAccents;
   2422         // copy the base characters
   2423         for (int k = 0; k < accentsindex[0]; k ++) {
   2424             *rearrange ++ = accents[k];
   2425         }
   2426         // forming all possible canonical rearrangement by dropping
   2427         // sets of accents
   2428         for (int i = 0; i <= size - 1; i ++) {
   2429             int32_t mask = 1 << (size - i - 1);
   2430             if (count & mask) {
   2431                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
   2432                     *rearrange ++ = accents[j];
   2433                 }
   2434             }
   2435         }
   2436         *rearrange = 0;
   2437         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
   2438                                                           baseoffset, status);
   2439         if (offset != USEARCH_DONE) {
   2440             return TRUE; // match found
   2441         }
   2442         count --;
   2443     }
   2444     return FALSE;
   2445 }
   2446 
   2447 /**
   2448 * Checks match for contraction.
   2449 * If the match starts with a partial contraction we fail.
   2450 * Internal method, status assumed to be success, caller has to check status
   2451 * before calling this method.
   2452 * @param strsrch string search data
   2453 * @param start offset of potential match, to be modified if necessary
   2454 * @param end offset of potential match, to be modified if necessary
   2455 * @param status only error status if any
   2456 * @return TRUE if match passes the contraction test, FALSE otherwise
   2457 */
   2458 static
   2459 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
   2460                                      int32_t   *start,
   2461                                      int32_t   *end, UErrorCode  *status)
   2462 {
   2463           UCollationElements *coleiter   = strsrch->textIter;
   2464           int32_t             textlength = strsrch->search->textLength;
   2465           int32_t         temp       = *end;
   2466     const UCollator          *collator   = strsrch->collator;
   2467     const UChar              *text       = strsrch->search->text;
   2468     // This part checks if either if the start of the match contains potential
   2469     // contraction. If so we'll have to iterate through them
   2470     // Since we used ucol_next while previously looking for the potential
   2471     // match, this guarantees that our end will not be a partial contraction,
   2472     // or a partial supplementary character.
   2473     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
   2474         int32_t expansion  = getExpansionSuffix(coleiter);
   2475         UBool   expandflag = expansion > 0;
   2476         setColEIterOffset(coleiter, *end);
   2477         while (expansion > 0) {
   2478             // getting rid of the redundant ce
   2479             // since forward contraction/expansion may have extra ces
   2480             // if we are in the normalization buffer, hasAccentsBeforeMatch
   2481             // would have taken care of it.
   2482             // E.g. the character \u01FA will have an expansion of 3, but if
   2483             // we are only looking for A ring A\u030A, we'll have to skip the
   2484             // last ce in the expansion buffer
   2485             ucol_previous(coleiter, status);
   2486             if (U_FAILURE(*status)) {
   2487                 return FALSE;
   2488             }
   2489             if (ucol_getOffset(coleiter) != temp) {
   2490                 *end = temp;
   2491                 temp  = ucol_getOffset(coleiter);
   2492             }
   2493             expansion --;
   2494         }
   2495 
   2496         int32_t  *patternce       = strsrch->pattern.CE;
   2497         int32_t   patterncelength = strsrch->pattern.CELength;
   2498         int32_t   count           = patterncelength;
   2499         while (count > 0) {
   2500             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
   2501             // status checked below, note that if status is a failure
   2502             // ucol_previous returns UCOL_NULLORDER
   2503             if (ce == UCOL_IGNORABLE) {
   2504                 continue;
   2505             }
   2506             if (expandflag && count == 0 &&
   2507                 getColElemIterOffset(coleiter, FALSE) != temp) {
   2508                 *end = temp;
   2509                 temp  = ucol_getOffset(coleiter);
   2510             }
   2511             if (count == patterncelength &&
   2512                 ce != patternce[patterncelength - 1]) {
   2513                 // accents may have extra starting ces, this occurs when a
   2514                 // pure accent pattern is matched without rearrangement
   2515                 int32_t    expected = patternce[patterncelength - 1];
   2516                 UTF_BACK_1(text, 0, *end);
   2517                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
   2518                     ce = getCE(strsrch, ucol_previous(coleiter, status));
   2519                     while (U_SUCCESS(*status) && ce != expected &&
   2520                            ce != UCOL_NULLORDER &&
   2521                            ucol_getOffset(coleiter) <= *start) {
   2522                         ce = getCE(strsrch, ucol_previous(coleiter, status));
   2523                     }
   2524                 }
   2525             }
   2526             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
   2527                 (*start) --;
   2528                 *start = getPreviousBaseOffset(text, *start);
   2529                 return FALSE;
   2530             }
   2531             count --;
   2532         }
   2533     }
   2534     return TRUE;
   2535 }
   2536 
   2537 /**
   2538 * Checks and sets the match information if found.
   2539 * Checks
   2540 * <ul>
   2541 * <li> the potential match does not repeat the previous match
   2542 * <li> boundaries are correct
   2543 * <li> potential match does not end in the middle of a contraction
   2544 * <li> identical matches
   2545 * <\ul>
   2546 * Otherwise the offset will be shifted to the next character.
   2547 * Internal method, status assumed to be success, caller has to check status
   2548 * before calling this method.
   2549 * @param strsrch string search data
   2550 * @param textoffset offset in the collation element text. the returned value
   2551 *        will be the truncated start offset of the match or the new start
   2552 *        search offset.
   2553 * @param status only error status if any
   2554 * @return TRUE if the match is valid, FALSE otherwise
   2555 */
   2556 static
   2557 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
   2558                                          int32_t   *textoffset,
   2559                                          UErrorCode    *status)
   2560 {
   2561     // to ensure that the start and ends are not composite characters
   2562     UCollationElements *coleiter = strsrch->textIter;
   2563     // if we have a canonical accent match
   2564     if ((strsrch->pattern.hasSuffixAccents &&
   2565         strsrch->canonicalSuffixAccents[0]) ||
   2566         (strsrch->pattern.hasPrefixAccents &&
   2567         strsrch->canonicalPrefixAccents[0])) {
   2568         strsrch->search->matchedIndex  = *textoffset;
   2569         strsrch->search->matchedLength =
   2570             getNextUStringSearchBaseOffset(strsrch,
   2571                                       getColElemIterOffset(coleiter, FALSE))
   2572             - *textoffset;
   2573         return TRUE;
   2574     }
   2575 
   2576     int32_t end = ucol_getOffset(coleiter);
   2577     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
   2578                                                 status) ||
   2579          U_FAILURE(*status)) {
   2580         return FALSE;
   2581     }
   2582 
   2583     end = getNextUStringSearchBaseOffset(strsrch, end);
   2584     // this totally matches, however we need to check if it is repeating
   2585     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
   2586         !isBreakUnit(strsrch, *textoffset, end) ||
   2587         !checkIdentical(strsrch, *textoffset, end)) {
   2588         (*textoffset) --;
   2589         *textoffset = getPreviousBaseOffset(strsrch->search->text,
   2590                                             *textoffset);
   2591         return FALSE;
   2592     }
   2593 
   2594     strsrch->search->matchedIndex  = *textoffset;
   2595     strsrch->search->matchedLength = end - *textoffset;
   2596     return TRUE;
   2597 }
   2598 #endif // #if BOYER_MOORE
   2599 
   2600 // constructors and destructor -------------------------------------------
   2601 
   2602 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
   2603                                           int32_t         patternlength,
   2604                                     const UChar          *text,
   2605                                           int32_t         textlength,
   2606                                     const char           *locale,
   2607                                           UBreakIterator *breakiter,
   2608                                           UErrorCode     *status)
   2609 {
   2610     if (U_FAILURE(*status)) {
   2611         return NULL;
   2612     }
   2613 #if UCONFIG_NO_BREAK_ITERATION
   2614     if (breakiter != NULL) {
   2615         *status = U_UNSUPPORTED_ERROR;
   2616         return NULL;
   2617     }
   2618 #endif
   2619     if (locale) {
   2620         // ucol_open internally checks for status
   2621         UCollator     *collator = ucol_open(locale, status);
   2622         // pattern, text checks are done in usearch_openFromCollator
   2623         UStringSearch *result   = usearch_openFromCollator(pattern,
   2624                                               patternlength, text, textlength,
   2625                                               collator, breakiter, status);
   2626 
   2627         if (result == NULL || U_FAILURE(*status)) {
   2628             if (collator) {
   2629                 ucol_close(collator);
   2630             }
   2631             return NULL;
   2632         }
   2633         else {
   2634             result->ownCollator = TRUE;
   2635         }
   2636         return result;
   2637     }
   2638     *status = U_ILLEGAL_ARGUMENT_ERROR;
   2639     return NULL;
   2640 }
   2641 
   2642 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
   2643                                   const UChar          *pattern,
   2644                                         int32_t         patternlength,
   2645                                   const UChar          *text,
   2646                                         int32_t         textlength,
   2647                                   const UCollator      *collator,
   2648                                         UBreakIterator *breakiter,
   2649                                         UErrorCode     *status)
   2650 {
   2651     if (U_FAILURE(*status)) {
   2652         return NULL;
   2653     }
   2654 #if UCONFIG_NO_BREAK_ITERATION
   2655     if (breakiter != NULL) {
   2656         *status = U_UNSUPPORTED_ERROR;
   2657         return NULL;
   2658     }
   2659 #endif
   2660     if (pattern == NULL || text == NULL || collator == NULL) {
   2661         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2662         return NULL;
   2663     }
   2664 
   2665     // string search does not really work when numeric collation is turned on
   2666     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
   2667         *status = U_UNSUPPORTED_ERROR;
   2668         return NULL;
   2669     }
   2670 
   2671     if (U_SUCCESS(*status)) {
   2672         initializeFCD(status);
   2673         if (U_FAILURE(*status)) {
   2674             return NULL;
   2675         }
   2676 
   2677         UStringSearch *result;
   2678         if (textlength == -1) {
   2679             textlength = u_strlen(text);
   2680         }
   2681         if (patternlength == -1) {
   2682             patternlength = u_strlen(pattern);
   2683         }
   2684         if (textlength <= 0 || patternlength <= 0) {
   2685             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2686             return NULL;
   2687         }
   2688 
   2689         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
   2690         if (result == NULL) {
   2691             *status = U_MEMORY_ALLOCATION_ERROR;
   2692             return NULL;
   2693         }
   2694 
   2695         result->collator    = collator;
   2696         result->strength    = ucol_getStrength(collator);
   2697         result->ceMask      = getMask(result->strength);
   2698         result->toShift     =
   2699              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   2700                                                             UCOL_SHIFTED;
   2701         result->variableTop = ucol_getVariableTop(collator, status);
   2702 
   2703         result->nfd         = Normalizer2Factory::getNFDInstance(*status);
   2704 
   2705         if (U_FAILURE(*status)) {
   2706             uprv_free(result);
   2707             return NULL;
   2708         }
   2709 
   2710         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
   2711         if (result->search == NULL) {
   2712             *status = U_MEMORY_ALLOCATION_ERROR;
   2713             uprv_free(result);
   2714             return NULL;
   2715         }
   2716 
   2717         result->search->text       = text;
   2718         result->search->textLength = textlength;
   2719 
   2720         result->pattern.text       = pattern;
   2721         result->pattern.textLength = patternlength;
   2722         result->pattern.CE         = NULL;
   2723         result->pattern.PCE        = NULL;
   2724 
   2725         result->search->breakIter  = breakiter;
   2726 #if !UCONFIG_NO_BREAK_ITERATION
   2727         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
   2728         if (breakiter) {
   2729         	ubrk_setText(breakiter, text, textlength, status);
   2730         }
   2731 #endif
   2732 
   2733         result->ownCollator           = FALSE;
   2734         result->search->matchedLength = 0;
   2735         result->search->matchedIndex  = USEARCH_DONE;
   2736         result->utilIter              = NULL;
   2737         result->textIter              = ucol_openElements(collator, text,
   2738                                                           textlength, status);
   2739         if (U_FAILURE(*status)) {
   2740             usearch_close(result);
   2741             return NULL;
   2742         }
   2743 
   2744         result->search->isOverlap          = FALSE;
   2745         result->search->isCanonicalMatch   = FALSE;
   2746         result->search->elementComparisonType = 0;
   2747         result->search->isForwardSearching = TRUE;
   2748         result->search->reset              = TRUE;
   2749 
   2750         initialize(result, status);
   2751 
   2752         if (U_FAILURE(*status)) {
   2753             usearch_close(result);
   2754             return NULL;
   2755         }
   2756 
   2757         return result;
   2758     }
   2759     return NULL;
   2760 }
   2761 
   2762 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
   2763 {
   2764     if (strsrch) {
   2765         if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
   2766             strsrch->pattern.CE) {
   2767             uprv_free(strsrch->pattern.CE);
   2768         }
   2769 
   2770         if (strsrch->pattern.PCE != NULL &&
   2771             strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
   2772             uprv_free(strsrch->pattern.PCE);
   2773         }
   2774 
   2775         ucol_closeElements(strsrch->textIter);
   2776         ucol_closeElements(strsrch->utilIter);
   2777 
   2778         if (strsrch->ownCollator && strsrch->collator) {
   2779             ucol_close((UCollator *)strsrch->collator);
   2780         }
   2781 
   2782 #if !UCONFIG_NO_BREAK_ITERATION
   2783         if (strsrch->search->internalBreakIter) {
   2784         	ubrk_close(strsrch->search->internalBreakIter);
   2785         }
   2786 #endif
   2787 
   2788         uprv_free(strsrch->search);
   2789         uprv_free(strsrch);
   2790     }
   2791 }
   2792 
   2793 // set and get methods --------------------------------------------------
   2794 
   2795 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
   2796                                         int32_t    position,
   2797                                         UErrorCode    *status)
   2798 {
   2799     if (U_SUCCESS(*status) && strsrch) {
   2800         if (isOutOfBounds(strsrch->search->textLength, position)) {
   2801             *status = U_INDEX_OUTOFBOUNDS_ERROR;
   2802         }
   2803         else {
   2804             setColEIterOffset(strsrch->textIter, position);
   2805         }
   2806         strsrch->search->matchedIndex  = USEARCH_DONE;
   2807         strsrch->search->matchedLength = 0;
   2808         strsrch->search->reset         = FALSE;
   2809     }
   2810 }
   2811 
   2812 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
   2813 {
   2814     if (strsrch) {
   2815         int32_t result = ucol_getOffset(strsrch->textIter);
   2816         if (isOutOfBounds(strsrch->search->textLength, result)) {
   2817             return USEARCH_DONE;
   2818         }
   2819         return result;
   2820     }
   2821     return USEARCH_DONE;
   2822 }
   2823 
   2824 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
   2825                                  USearchAttribute attribute,
   2826                                  USearchAttributeValue value,
   2827                                  UErrorCode *status)
   2828 {
   2829     if (U_SUCCESS(*status) && strsrch) {
   2830         switch (attribute)
   2831         {
   2832         case USEARCH_OVERLAP :
   2833             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
   2834             break;
   2835         case USEARCH_CANONICAL_MATCH :
   2836             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
   2837                                                                       FALSE);
   2838             break;
   2839         case USEARCH_ELEMENT_COMPARISON :
   2840             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2841                 strsrch->search->elementComparisonType = (int16_t)value;
   2842             } else {
   2843                 strsrch->search->elementComparisonType = 0;
   2844             }
   2845             break;
   2846         case USEARCH_ATTRIBUTE_COUNT :
   2847         default:
   2848             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2849         }
   2850     }
   2851     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
   2852         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2853     }
   2854 }
   2855 
   2856 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
   2857                                                 const UStringSearch *strsrch,
   2858                                                 USearchAttribute attribute)
   2859 {
   2860     if (strsrch) {
   2861         switch (attribute) {
   2862         case USEARCH_OVERLAP :
   2863             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
   2864                                                         USEARCH_OFF);
   2865         case USEARCH_CANONICAL_MATCH :
   2866             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
   2867                                                                USEARCH_OFF);
   2868         case USEARCH_ELEMENT_COMPARISON :
   2869             {
   2870                 int16_t value = strsrch->search->elementComparisonType;
   2871                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
   2872                     return (USearchAttributeValue)value;
   2873                 } else {
   2874                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
   2875                 }
   2876             }
   2877         case USEARCH_ATTRIBUTE_COUNT :
   2878             return USEARCH_DEFAULT;
   2879         }
   2880     }
   2881     return USEARCH_DEFAULT;
   2882 }
   2883 
   2884 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
   2885                                                 const UStringSearch *strsrch)
   2886 {
   2887     if (strsrch == NULL) {
   2888         return USEARCH_DONE;
   2889     }
   2890     return strsrch->search->matchedIndex;
   2891 }
   2892 
   2893 
   2894 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
   2895                                             UChar         *result,
   2896                                             int32_t        resultCapacity,
   2897                                             UErrorCode    *status)
   2898 {
   2899     if (U_FAILURE(*status)) {
   2900         return USEARCH_DONE;
   2901     }
   2902     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
   2903         result == NULL)) {
   2904         *status = U_ILLEGAL_ARGUMENT_ERROR;
   2905         return USEARCH_DONE;
   2906     }
   2907 
   2908     int32_t     copylength = strsrch->search->matchedLength;
   2909     int32_t copyindex  = strsrch->search->matchedIndex;
   2910     if (copyindex == USEARCH_DONE) {
   2911         u_terminateUChars(result, resultCapacity, 0, status);
   2912         return USEARCH_DONE;
   2913     }
   2914 
   2915     if (resultCapacity < copylength) {
   2916         copylength = resultCapacity;
   2917     }
   2918     if (copylength > 0) {
   2919         uprv_memcpy(result, strsrch->search->text + copyindex,
   2920                     copylength * sizeof(UChar));
   2921     }
   2922     return u_terminateUChars(result, resultCapacity,
   2923                              strsrch->search->matchedLength, status);
   2924 }
   2925 
   2926 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
   2927                                               const UStringSearch *strsrch)
   2928 {
   2929     if (strsrch) {
   2930         return strsrch->search->matchedLength;
   2931     }
   2932     return USEARCH_DONE;
   2933 }
   2934 
   2935 #if !UCONFIG_NO_BREAK_ITERATION
   2936 
   2937 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
   2938                                                UBreakIterator *breakiter,
   2939                                                UErrorCode     *status)
   2940 {
   2941     if (U_SUCCESS(*status) && strsrch) {
   2942     	strsrch->search->breakIter = breakiter;
   2943         if (breakiter) {
   2944             ubrk_setText(breakiter, strsrch->search->text,
   2945                          strsrch->search->textLength, status);
   2946         }
   2947     }
   2948 }
   2949 
   2950 U_CAPI const UBreakIterator* U_EXPORT2
   2951 usearch_getBreakIterator(const UStringSearch *strsrch)
   2952 {
   2953     if (strsrch) {
   2954         return strsrch->search->breakIter;
   2955     }
   2956     return NULL;
   2957 }
   2958 
   2959 #endif
   2960 
   2961 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
   2962                                       const UChar         *text,
   2963                                             int32_t        textlength,
   2964                                             UErrorCode    *status)
   2965 {
   2966     if (U_SUCCESS(*status)) {
   2967         if (strsrch == NULL || text == NULL || textlength < -1 ||
   2968             textlength == 0) {
   2969             *status = U_ILLEGAL_ARGUMENT_ERROR;
   2970         }
   2971         else {
   2972             if (textlength == -1) {
   2973                 textlength = u_strlen(text);
   2974             }
   2975             strsrch->search->text       = text;
   2976             strsrch->search->textLength = textlength;
   2977             ucol_setText(strsrch->textIter, text, textlength, status);
   2978             strsrch->search->matchedIndex  = USEARCH_DONE;
   2979             strsrch->search->matchedLength = 0;
   2980             strsrch->search->reset         = TRUE;
   2981 #if !UCONFIG_NO_BREAK_ITERATION
   2982             if (strsrch->search->breakIter != NULL) {
   2983                 ubrk_setText(strsrch->search->breakIter, text,
   2984                              textlength, status);
   2985             }
   2986             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
   2987 #endif
   2988         }
   2989     }
   2990 }
   2991 
   2992 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
   2993                                                      int32_t       *length)
   2994 {
   2995     if (strsrch) {
   2996         *length = strsrch->search->textLength;
   2997         return strsrch->search->text;
   2998     }
   2999     return NULL;
   3000 }
   3001 
   3002 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
   3003                                           const UCollator     *collator,
   3004                                                 UErrorCode    *status)
   3005 {
   3006     if (U_SUCCESS(*status)) {
   3007         if (collator == NULL) {
   3008             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3009             return;
   3010         }
   3011 
   3012         if (strsrch) {
   3013             if (strsrch->ownCollator && (strsrch->collator != collator)) {
   3014                 ucol_close((UCollator *)strsrch->collator);
   3015                 strsrch->ownCollator = FALSE;
   3016             }
   3017             strsrch->collator    = collator;
   3018             strsrch->strength    = ucol_getStrength(collator);
   3019             strsrch->ceMask      = getMask(strsrch->strength);
   3020 #if !UCONFIG_NO_BREAK_ITERATION
   3021         	ubrk_close(strsrch->search->internalBreakIter);
   3022         	strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
   3023         											 strsrch->search->text, strsrch->search->textLength, status);
   3024 #endif
   3025             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3026             strsrch->toShift     =
   3027                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
   3028                                                                 UCOL_SHIFTED;
   3029             // if status is a failure, ucol_getVariableTop returns 0
   3030             strsrch->variableTop = ucol_getVariableTop(collator, status);
   3031             if (U_SUCCESS(*status)) {
   3032                 initialize(strsrch, status);
   3033                 if (U_SUCCESS(*status)) {
   3034                     /* free offset buffer to avoid memory leak before initializing. */
   3035                     ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
   3036                     uprv_init_collIterate(collator, strsrch->search->text,
   3037                                           strsrch->search->textLength,
   3038                                           &(strsrch->textIter->iteratordata_),
   3039                                           status);
   3040                     strsrch->utilIter->iteratordata_.coll = collator;
   3041                 }
   3042             }
   3043         }
   3044 
   3045         // **** are these calls needed?
   3046         // **** we call uprv_init_pce in initializePatternPCETable
   3047         // **** and the CEBuffer constructor...
   3048 #if 0
   3049         uprv_init_pce(strsrch->textIter);
   3050         uprv_init_pce(strsrch->utilIter);
   3051 #endif
   3052     }
   3053 }
   3054 
   3055 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
   3056 {
   3057     if (strsrch) {
   3058         return (UCollator *)strsrch->collator;
   3059     }
   3060     return NULL;
   3061 }
   3062 
   3063 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
   3064                                          const UChar         *pattern,
   3065                                                int32_t        patternlength,
   3066                                                UErrorCode    *status)
   3067 {
   3068     if (U_SUCCESS(*status)) {
   3069         if (strsrch == NULL || pattern == NULL) {
   3070             *status = U_ILLEGAL_ARGUMENT_ERROR;
   3071         }
   3072         else {
   3073             if (patternlength == -1) {
   3074                 patternlength = u_strlen(pattern);
   3075             }
   3076             if (patternlength == 0) {
   3077                 *status = U_ILLEGAL_ARGUMENT_ERROR;
   3078                 return;
   3079             }
   3080             strsrch->pattern.text       = pattern;
   3081             strsrch->pattern.textLength = patternlength;
   3082             initialize(strsrch, status);
   3083         }
   3084     }
   3085 }
   3086 
   3087 U_CAPI const UChar* U_EXPORT2
   3088 usearch_getPattern(const UStringSearch *strsrch,
   3089                    int32_t       *length)
   3090 {
   3091     if (strsrch) {
   3092         *length = strsrch->pattern.textLength;
   3093         return strsrch->pattern.text;
   3094     }
   3095     return NULL;
   3096 }
   3097 
   3098 // miscellanous methods --------------------------------------------------
   3099 
   3100 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
   3101                                            UErrorCode    *status)
   3102 {
   3103     if (strsrch && U_SUCCESS(*status)) {
   3104         strsrch->search->isForwardSearching = TRUE;
   3105         usearch_setOffset(strsrch, 0, status);
   3106         if (U_SUCCESS(*status)) {
   3107             return usearch_next(strsrch, status);
   3108         }
   3109     }
   3110     return USEARCH_DONE;
   3111 }
   3112 
   3113 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
   3114                                                int32_t    position,
   3115                                                UErrorCode    *status)
   3116 {
   3117     if (strsrch && U_SUCCESS(*status)) {
   3118         strsrch->search->isForwardSearching = TRUE;
   3119         // position checked in usearch_setOffset
   3120         usearch_setOffset(strsrch, position, status);
   3121         if (U_SUCCESS(*status)) {
   3122             return usearch_next(strsrch, status);
   3123         }
   3124     }
   3125     return USEARCH_DONE;
   3126 }
   3127 
   3128 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
   3129                                           UErrorCode    *status)
   3130 {
   3131     if (strsrch && U_SUCCESS(*status)) {
   3132         strsrch->search->isForwardSearching = FALSE;
   3133         usearch_setOffset(strsrch, strsrch->search->textLength, status);
   3134         if (U_SUCCESS(*status)) {
   3135             return usearch_previous(strsrch, status);
   3136         }
   3137     }
   3138     return USEARCH_DONE;
   3139 }
   3140 
   3141 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
   3142                                                int32_t    position,
   3143                                                UErrorCode    *status)
   3144 {
   3145     if (strsrch && U_SUCCESS(*status)) {
   3146         strsrch->search->isForwardSearching = FALSE;
   3147         // position checked in usearch_setOffset
   3148         usearch_setOffset(strsrch, position, status);
   3149         if (U_SUCCESS(*status)) {
   3150             return usearch_previous(strsrch, status);
   3151         }
   3152     }
   3153     return USEARCH_DONE;
   3154 }
   3155 
   3156 /**
   3157 * If a direction switch is required, we'll count the number of ces till the
   3158 * beginning of the collation element iterator and iterate forwards that
   3159 * number of times. This is so that we get to the correct point within the
   3160 * string to continue the search in. Imagine when we are in the middle of the
   3161 * normalization buffer when the change in direction is request. arrrgghh....
   3162 * After searching the offset within the collation element iterator will be
   3163 * shifted to the start of the match. If a match is not found, the offset would
   3164 * have been set to the end of the text string in the collation element
   3165 * iterator.
   3166 * Okay, here's my take on normalization buffer. The only time when there can
   3167 * be 2 matches within the same normalization is when the pattern is consists
   3168 * of all accents. But since the offset returned is from the text string, we
   3169 * should not confuse the caller by returning the second match within the
   3170 * same normalization buffer. If we do, the 2 results will have the same match
   3171 * offsets, and that'll be confusing. I'll return the next match that doesn't
   3172 * fall within the same normalization buffer. Note this does not affect the
   3173 * results of matches spanning the text and the normalization buffer.
   3174 * The position to start searching is taken from the collation element
   3175 * iterator. Callers of this API would have to set the offset in the collation
   3176 * element iterator before using this method.
   3177 */
   3178 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
   3179                                           UErrorCode    *status)
   3180 {
   3181     if (U_SUCCESS(*status) && strsrch) {
   3182         // note offset is either equivalent to the start of the previous match
   3183         // or is set by the user
   3184         int32_t      offset       = usearch_getOffset(strsrch);
   3185         USearch     *search       = strsrch->search;
   3186         search->reset             = FALSE;
   3187         int32_t      textlength   = search->textLength;
   3188         if (search->isForwardSearching) {
   3189 #if BOYER_MOORE
   3190             if (offset == textlength
   3191                 || (!search->isOverlap &&
   3192                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
   3193                     (search->matchedIndex != USEARCH_DONE &&
   3194                      offset + search->matchedLength >= textlength)))) {
   3195                 // not enough characters to match
   3196                 setMatchNotFound(strsrch);
   3197                 return USEARCH_DONE;
   3198             }
   3199 #else
   3200             if (offset == textlength ||
   3201                 (! search->isOverlap &&
   3202                 (search->matchedIndex != USEARCH_DONE &&
   3203                 offset + search->matchedLength > textlength))) {
   3204                     // not enough characters to match
   3205                     setMatchNotFound(strsrch);
   3206                     return USEARCH_DONE;
   3207             }
   3208 #endif
   3209         }
   3210         else {
   3211             // switching direction.
   3212             // if matchedIndex == USEARCH_DONE, it means that either a
   3213             // setOffset has been called or that previous ran off the text
   3214             // string. the iterator would have been set to offset 0 if a
   3215             // match is not found.
   3216             search->isForwardSearching = TRUE;
   3217             if (search->matchedIndex != USEARCH_DONE) {
   3218                 // there's no need to set the collation element iterator
   3219                 // the next call to next will set the offset.
   3220                 return search->matchedIndex;
   3221             }
   3222         }
   3223 
   3224         if (U_SUCCESS(*status)) {
   3225             if (strsrch->pattern.CELength == 0) {
   3226                 if (search->matchedIndex == USEARCH_DONE) {
   3227                     search->matchedIndex = offset;
   3228                 }
   3229                 else { // moves by codepoints
   3230                     UTF_FWD_1(search->text, search->matchedIndex, textlength);
   3231                 }
   3232 
   3233                 search->matchedLength = 0;
   3234                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3235                 // status checked below
   3236                 if (search->matchedIndex == textlength) {
   3237                     search->matchedIndex = USEARCH_DONE;
   3238                 }
   3239             }
   3240             else {
   3241                 if (search->matchedLength > 0) {
   3242                     // if matchlength is 0 we are at the start of the iteration
   3243                     if (search->isOverlap) {
   3244                         ucol_setOffset(strsrch->textIter, offset + 1, status);
   3245                     }
   3246                     else {
   3247                         ucol_setOffset(strsrch->textIter,
   3248                                        offset + search->matchedLength, status);
   3249                     }
   3250                 }
   3251                 else {
   3252                     // for boundary check purposes. this will ensure that the
   3253                     // next match will not preceed the current offset
   3254                     // note search->matchedIndex will always be set to something
   3255                     // in the code
   3256                     search->matchedIndex = offset - 1;
   3257                 }
   3258 
   3259                 if (search->isCanonicalMatch) {
   3260                     // can't use exact here since extra accents are allowed.
   3261                     usearch_handleNextCanonical(strsrch, status);
   3262                 }
   3263                 else {
   3264                     usearch_handleNextExact(strsrch, status);
   3265                 }
   3266             }
   3267 
   3268             if (U_FAILURE(*status)) {
   3269                 return USEARCH_DONE;
   3270             }
   3271 
   3272 #if !BOYER_MOORE
   3273             if (search->matchedIndex == USEARCH_DONE) {
   3274                 ucol_setOffset(strsrch->textIter, search->textLength, status);
   3275             } else {
   3276                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
   3277             }
   3278 #endif
   3279 
   3280             return search->matchedIndex;
   3281         }
   3282     }
   3283     return USEARCH_DONE;
   3284 }
   3285 
   3286 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
   3287                                               UErrorCode *status)
   3288 {
   3289     if (U_SUCCESS(*status) && strsrch) {
   3290         int32_t offset;
   3291         USearch *search = strsrch->search;
   3292         if (search->reset) {
   3293             offset                     = search->textLength;
   3294             search->isForwardSearching = FALSE;
   3295             search->reset              = FALSE;
   3296             setColEIterOffset(strsrch->textIter, offset);
   3297         }
   3298         else {
   3299             offset = usearch_getOffset(strsrch);
   3300         }
   3301 
   3302         int32_t matchedindex = search->matchedIndex;
   3303         if (search->isForwardSearching == TRUE) {
   3304             // switching direction.
   3305             // if matchedIndex == USEARCH_DONE, it means that either a
   3306             // setOffset has been called or that next ran off the text
   3307             // string. the iterator would have been set to offset textLength if
   3308             // a match is not found.
   3309             search->isForwardSearching = FALSE;
   3310             if (matchedindex != USEARCH_DONE) {
   3311                 return matchedindex;
   3312             }
   3313         }
   3314         else {
   3315 #if BOYER_MOORE
   3316             if (offset == 0 || matchedindex == 0 ||
   3317                 (!search->isOverlap &&
   3318                     (offset < strsrch->pattern.defaultShiftSize ||
   3319                     (matchedindex != USEARCH_DONE &&
   3320                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
   3321                 // not enough characters to match
   3322                 setMatchNotFound(strsrch);
   3323                 return USEARCH_DONE;
   3324             }
   3325 #else
   3326             // Could check pattern length, but the
   3327             // linear search will do the right thing
   3328             if (offset == 0 || matchedindex == 0) {
   3329                 setMatchNotFound(strsrch);
   3330                 return USEARCH_DONE;
   3331             }
   3332 #endif
   3333         }
   3334 
   3335         if (U_SUCCESS(*status)) {
   3336             if (strsrch->pattern.CELength == 0) {
   3337                 search->matchedIndex =
   3338                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
   3339                 if (search->matchedIndex == 0) {
   3340                     setMatchNotFound(strsrch);
   3341                     // status checked below
   3342                 }
   3343                 else { // move by codepoints
   3344                     UTF_BACK_1(search->text, 0, search->matchedIndex);
   3345                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
   3346                     // status checked below
   3347                     search->matchedLength = 0;
   3348                 }
   3349             }
   3350             else {
   3351 #if !BOYER_MOORE
   3352                 if (search->matchedIndex != USEARCH_DONE) {
   3353                     if (search->isOverlap) {
   3354                         ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);
   3355                     }
   3356                 }
   3357 #endif
   3358 
   3359                 if (strsrch->search->isCanonicalMatch) {
   3360                     // can't use exact here since extra accents are allowed.
   3361                     usearch_handlePreviousCanonical(strsrch, status);
   3362                     // status checked below
   3363                 }
   3364                 else {
   3365                     usearch_handlePreviousExact(strsrch, status);
   3366                     // status checked below
   3367                 }
   3368             }
   3369 
   3370             if (U_FAILURE(*status)) {
   3371                 return USEARCH_DONE;
   3372             }
   3373 
   3374             return search->matchedIndex;
   3375         }
   3376     }
   3377     return USEARCH_DONE;
   3378 }
   3379 
   3380 
   3381 
   3382 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
   3383 {
   3384     /*
   3385     reset is setting the attributes that are already in
   3386     string search, hence all attributes in the collator should
   3387     be retrieved without any problems
   3388     */
   3389     if (strsrch) {
   3390         UErrorCode status            = U_ZERO_ERROR;
   3391         UBool      sameCollAttribute = TRUE;
   3392         uint32_t   ceMask;
   3393         UBool      shift;
   3394         uint32_t   varTop;
   3395 
   3396         // **** hack to deal w/ how processed CEs encode quaternary ****
   3397         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
   3398         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
   3399             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
   3400                 sameCollAttribute = FALSE;
   3401         }
   3402 
   3403         strsrch->strength    = ucol_getStrength(strsrch->collator);
   3404         ceMask = getMask(strsrch->strength);
   3405         if (strsrch->ceMask != ceMask) {
   3406             strsrch->ceMask = ceMask;
   3407             sameCollAttribute = FALSE;
   3408         }
   3409 
   3410         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
   3411         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
   3412                                   &status) == UCOL_SHIFTED;
   3413         if (strsrch->toShift != shift) {
   3414             strsrch->toShift  = shift;
   3415             sameCollAttribute = FALSE;
   3416         }
   3417 
   3418         // if status is a failure, ucol_getVariableTop returns 0
   3419         varTop = ucol_getVariableTop(strsrch->collator, &status);
   3420         if (strsrch->variableTop != varTop) {
   3421             strsrch->variableTop = varTop;
   3422             sameCollAttribute    = FALSE;
   3423         }
   3424         if (!sameCollAttribute) {
   3425             initialize(strsrch, &status);
   3426         }
   3427         /* free offset buffer to avoid memory leak before initializing. */
   3428         ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
   3429         uprv_init_collIterate(strsrch->collator, strsrch->search->text,
   3430                               strsrch->search->textLength,
   3431                               &(strsrch->textIter->iteratordata_),
   3432                               &status);
   3433         strsrch->search->matchedLength      = 0;
   3434         strsrch->search->matchedIndex       = USEARCH_DONE;
   3435         strsrch->search->isOverlap          = FALSE;
   3436         strsrch->search->isCanonicalMatch   = FALSE;
   3437         strsrch->search->elementComparisonType = 0;
   3438         strsrch->search->isForwardSearching = TRUE;
   3439         strsrch->search->reset              = TRUE;
   3440     }
   3441 }
   3442 
   3443 //
   3444 //  CEI  Collation Element + source text index.
   3445 //       These structs are kept in the circular buffer.
   3446 //
   3447 struct  CEI {
   3448     int64_t ce;
   3449     int32_t lowIndex;
   3450     int32_t highIndex;
   3451 };
   3452 
   3453 U_NAMESPACE_BEGIN
   3454 
   3455 
   3456 //
   3457 //  CEBuffer   A circular buffer of CEs from the text being searched.
   3458 //
   3459 #define   DEFAULT_CEBUFFER_SIZE 50
   3460 struct CEBuffer {
   3461     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
   3462     CEI                 *buf;
   3463     int32_t              bufSize;
   3464     int32_t              firstIx;
   3465     int32_t              limitIx;
   3466     UCollationElements  *ceIter;
   3467     UStringSearch       *strSearch;
   3468 
   3469 
   3470 
   3471                CEBuffer(UStringSearch *ss, UErrorCode *status);
   3472                ~CEBuffer();
   3473    const CEI   *get(int32_t index);
   3474    const CEI   *getPrevious(int32_t index);
   3475 };
   3476 
   3477 
   3478 CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
   3479     buf = defBuf;
   3480     strSearch = ss;
   3481     bufSize = ss->pattern.CELength+10;
   3482     ceIter    = ss->textIter;
   3483     firstIx = 0;
   3484     limitIx = 0;
   3485 
   3486     uprv_init_pce(ceIter);
   3487 
   3488     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
   3489         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
   3490         if (buf == NULL) {
   3491             *status = U_MEMORY_ALLOCATION_ERROR;
   3492         }
   3493     }
   3494 }
   3495 
   3496 // TODO: add a reset or init function so that allocated
   3497 //       buffers can be retained & reused.
   3498 
   3499 CEBuffer::~CEBuffer() {
   3500     if (buf != defBuf) {
   3501         uprv_free(buf);
   3502     }
   3503 }
   3504 
   3505 
   3506 // Get the CE with the specified index.
   3507 //   Index must be in the range
   3508 //          n-history_size < index < n+1
   3509 //   where n is the largest index to have been fetched by some previous call to this function.
   3510 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3511 //
   3512 const CEI *CEBuffer::get(int32_t index) {
   3513     int i = index % bufSize;
   3514 
   3515     if (index>=firstIx && index<limitIx) {
   3516         // The request was for an entry already in our buffer.
   3517         //  Just return it.
   3518         return &buf[i];
   3519     }
   3520 
   3521     // Caller is requesting a new, never accessed before, CE.
   3522     //   Verify that it is the next one in sequence, which is all
   3523     //   that is allowed.
   3524     if (index != limitIx) {
   3525         U_ASSERT(FALSE);
   3526 
   3527         return NULL;
   3528     }
   3529 
   3530     // Manage the circular CE buffer indexing
   3531     limitIx++;
   3532 
   3533     if (limitIx - firstIx >= bufSize) {
   3534         // The buffer is full, knock out the lowest-indexed entry.
   3535         firstIx++;
   3536     }
   3537 
   3538     UErrorCode status = U_ZERO_ERROR;
   3539 
   3540     buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
   3541 
   3542     return &buf[i];
   3543 }
   3544 
   3545 // Get the CE with the specified index.
   3546 //   Index must be in the range
   3547 //          n-history_size < index < n+1
   3548 //   where n is the largest index to have been fetched by some previous call to this function.
   3549 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
   3550 //
   3551 const CEI *CEBuffer::getPrevious(int32_t index) {
   3552     int i = index % bufSize;
   3553 
   3554     if (index>=firstIx && index<limitIx) {
   3555         // The request was for an entry already in our buffer.
   3556         //  Just return it.
   3557         return &buf[i];
   3558     }
   3559 
   3560     // Caller is requesting a new, never accessed before, CE.
   3561     //   Verify that it is the next one in sequence, which is all
   3562     //   that is allowed.
   3563     if (index != limitIx) {
   3564         U_ASSERT(FALSE);
   3565 
   3566         return NULL;
   3567     }
   3568 
   3569     // Manage the circular CE buffer indexing
   3570     limitIx++;
   3571 
   3572     if (limitIx - firstIx >= bufSize) {
   3573         // The buffer is full, knock out the lowest-indexed entry.
   3574         firstIx++;
   3575     }
   3576 
   3577     UErrorCode status = U_ZERO_ERROR;
   3578 
   3579     buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
   3580 
   3581     return &buf[i];
   3582 }
   3583 
   3584 U_NAMESPACE_END
   3585 
   3586 
   3587 // #define USEARCH_DEBUG
   3588 
   3589 #ifdef USEARCH_DEBUG
   3590 #include <stdio.h>
   3591 #include <stdlib.h>
   3592 #endif
   3593 
   3594 /*
   3595  * Find the next break boundary after startIndex. If the UStringSearch object
   3596  * has an external break iterator, use that. Otherwise use the internal character
   3597  * break iterator.
   3598  */
   3599 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
   3600 #if 0
   3601     const UChar *text = strsrch->search->text;
   3602     int32_t textLen   = strsrch->search->textLength;
   3603 
   3604     U_ASSERT(startIndex>=0);
   3605     U_ASSERT(startIndex<=textLen);
   3606 
   3607     if (startIndex >= textLen) {
   3608         return startIndex;
   3609     }
   3610 
   3611     UChar32  c;
   3612     int32_t  i = startIndex;
   3613     U16_NEXT(text, i, textLen, c);
   3614 
   3615     // If we are on a control character, stop without looking for combining marks.
   3616     //    Control characters do not combine.
   3617     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3618     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
   3619         return i;
   3620     }
   3621 
   3622     // The initial character was not a control, and can thus accept trailing
   3623     //   combining characters.  Advance over however many of them there are.
   3624     int32_t  indexOfLastCharChecked;
   3625     for (;;) {
   3626         indexOfLastCharChecked = i;
   3627         if (i>=textLen) {
   3628             break;
   3629         }
   3630         U16_NEXT(text, i, textLen, c);
   3631         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3632         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3633             break;
   3634         }
   3635     }
   3636     return indexOfLastCharChecked;
   3637 #elif !UCONFIG_NO_BREAK_ITERATION
   3638     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3639 
   3640     if (breakiterator == NULL) {
   3641         breakiterator = strsrch->search->internalBreakIter;
   3642     }
   3643 
   3644     if (breakiterator != NULL) {
   3645     	return ubrk_following(breakiterator, startIndex);
   3646     }
   3647 
   3648     return startIndex;
   3649 #else
   3650     // **** or should we use the original code? ****
   3651     return startIndex;
   3652 #endif
   3653 
   3654 }
   3655 
   3656 /*
   3657  * Returns TRUE if index is on a break boundary. If the UStringSearch
   3658  * has an external break iterator, test using that, otherwise test
   3659  * using the internal character break iterator.
   3660  */
   3661 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
   3662 #if 0
   3663     const UChar *text = strsrch->search->text;
   3664     int32_t textLen   = strsrch->search->textLength;
   3665 
   3666     U_ASSERT(index>=0);
   3667     U_ASSERT(index<=textLen);
   3668 
   3669     if (index>=textLen || index<=0) {
   3670         return FALSE;
   3671     }
   3672 
   3673     // If the character at the current index is not a GRAPHEME_EXTEND
   3674     //    then we can not be within a combining sequence.
   3675     UChar32  c;
   3676     U16_GET(text, 0, index, textLen, c);
   3677     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3678     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
   3679         return FALSE;
   3680     }
   3681 
   3682     // We are at a combining mark.  If the preceding character is anything
   3683     //   except a CONTROL, CR or LF, we are in a combining sequence.
   3684     U16_PREV(text, 0, index, c);
   3685     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
   3686     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
   3687     return combining;
   3688 #elif !UCONFIG_NO_BREAK_ITERATION
   3689     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3690 
   3691     if (breakiterator == NULL) {
   3692         breakiterator = strsrch->search->internalBreakIter;
   3693     }
   3694 
   3695     return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
   3696 #else
   3697     // **** or use the original code? ****
   3698     return FALSE;
   3699 #endif
   3700 }
   3701 
   3702 #if 0
   3703 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
   3704 {
   3705 #if !UCONFIG_NO_BREAK_ITERATION
   3706     UBreakIterator *breakiterator = strsrch->search->breakIter;
   3707 
   3708     if (breakiterator != NULL) {
   3709         int32_t startindex = ubrk_first(breakiterator);
   3710         int32_t endindex   = ubrk_last(breakiterator);
   3711 
   3712         // out-of-range indexes are never boundary positions
   3713         if (start < startindex || start > endindex ||
   3714             end < startindex || end > endindex) {
   3715             return FALSE;
   3716         }
   3717 
   3718         return ubrk_isBoundary(breakiterator, start) &&
   3719                ubrk_isBoundary(breakiterator, end);
   3720     }
   3721 #endif
   3722 
   3723     return TRUE;
   3724 }
   3725 #endif
   3726 
   3727 typedef enum {
   3728     U_CE_MATCH = -1,
   3729     U_CE_NO_MATCH = 0,
   3730     U_CE_SKIP_TARG,
   3731     U_CE_SKIP_PATN
   3732 } UCompareCEsResult;
   3733 #define U_CE_LEVEL2_BASE 0x00000005
   3734 #define U_CE_LEVEL3_BASE 0x00050000
   3735 
   3736 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
   3737     if (targCE == patCE) {
   3738         return U_CE_MATCH;
   3739     }
   3740     if (compareType == 0) {
   3741         return U_CE_NO_MATCH;
   3742     }
   3743 
   3744     int64_t targCEshifted = targCE >> 32;
   3745     int64_t patCEshifted = patCE >> 32;
   3746     int64_t mask;
   3747 
   3748     mask = 0xFFFF0000;
   3749     int32_t targLev1 = (int32_t)(targCEshifted & mask);
   3750     int32_t patLev1 = (int32_t)(patCEshifted & mask);
   3751     if ( targLev1 != patLev1 ) {
   3752         if ( targLev1 == 0 ) {
   3753             return U_CE_SKIP_TARG;
   3754         }
   3755         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3756             return U_CE_SKIP_PATN;
   3757         }
   3758         return U_CE_NO_MATCH;
   3759     }
   3760 
   3761     mask = 0x0000FFFF;
   3762     int32_t targLev2 = (int32_t)(targCEshifted & mask);
   3763     int32_t patLev2 = (int32_t)(patCEshifted & mask);
   3764     if ( targLev2 != patLev2 ) {
   3765         if ( targLev2 == 0 ) {
   3766             return U_CE_SKIP_TARG;
   3767         }
   3768         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
   3769             return U_CE_SKIP_PATN;
   3770         }
   3771         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
   3772             U_CE_MATCH: U_CE_NO_MATCH;
   3773     }
   3774 
   3775     mask = 0xFFFF0000;
   3776     int32_t targLev3 = (int32_t)(targCE & mask);
   3777     int32_t patLev3 = (int32_t)(patCE & mask);
   3778     if ( targLev3 != patLev3 ) {
   3779         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
   3780             U_CE_MATCH: U_CE_NO_MATCH;
   3781    }
   3782 
   3783     return U_CE_MATCH;
   3784 }
   3785 
   3786 #if BOYER_MOORE
   3787 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
   3788 #endif
   3789 
   3790 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
   3791                                        int32_t        startIdx,
   3792                                        int32_t        *matchStart,
   3793                                        int32_t        *matchLimit,
   3794                                        UErrorCode     *status)
   3795 {
   3796     if (U_FAILURE(*status)) {
   3797         return FALSE;
   3798     }
   3799 
   3800     // TODO:  reject search patterns beginning with a combining char.
   3801 
   3802 #ifdef USEARCH_DEBUG
   3803     if (getenv("USEARCH_DEBUG") != NULL) {
   3804         printf("Pattern CEs\n");
   3805         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   3806             printf(" %8x", strsrch->pattern.CE[ii]);
   3807         }
   3808         printf("\n");
   3809     }
   3810 
   3811 #endif
   3812     // Input parameter sanity check.
   3813     //  TODO:  should input indicies clip to the text length
   3814     //         in the same way that UText does.
   3815     if(strsrch->pattern.CELength == 0         ||
   3816        startIdx < 0                           ||
   3817        startIdx > strsrch->search->textLength ||
   3818        strsrch->pattern.CE == NULL) {
   3819            *status = U_ILLEGAL_ARGUMENT_ERROR;
   3820            return FALSE;
   3821     }
   3822 
   3823     if (strsrch->pattern.PCE == NULL) {
   3824         initializePatternPCETable(strsrch, status);
   3825     }
   3826 
   3827     ucol_setOffset(strsrch->textIter, startIdx, status);
   3828     CEBuffer ceb(strsrch, status);
   3829 
   3830 
   3831     int32_t    targetIx = 0;
   3832     const CEI *targetCEI = NULL;
   3833     int32_t    patIx;
   3834     UBool      found;
   3835 
   3836     int32_t  mStart = -1;
   3837     int32_t  mLimit = -1;
   3838     int32_t  minLimit;
   3839     int32_t  maxLimit;
   3840 
   3841 
   3842 
   3843     // Outer loop moves over match starting positions in the
   3844     //      target CE space.
   3845     // Here we see the target as a sequence of collation elements, resulting from the following:
   3846     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
   3847     //    (for example, digraphs such as IJ may be broken into two characters).
   3848     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
   3849     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
   3850     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
   3851     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
   3852     //    then the CE is deleted, so the following code sees only CEs that are relevant.
   3853     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
   3854     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
   3855     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
   3856     //
   3857     for(targetIx=0; ; targetIx++)
   3858     {
   3859         found = TRUE;
   3860         //  Inner loop checks for a match beginning at each
   3861         //  position from the outer loop.
   3862         int32_t targetIxOffset = 0;
   3863         int64_t patCE = 0;
   3864         for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
   3865             patCE = strsrch->pattern.PCE[patIx];
   3866             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
   3867             //  Compare CE from target string with CE from the pattern.
   3868             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
   3869             //    which will fail the compare, below.
   3870             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   3871             if ( ceMatch == U_CE_NO_MATCH ) {
   3872                 found = FALSE;
   3873                 break;
   3874             } else if ( ceMatch > U_CE_NO_MATCH ) {
   3875                 if ( ceMatch == U_CE_SKIP_TARG ) {
   3876                     // redo with same patCE, next targCE
   3877                     patIx--;
   3878                     targetIxOffset++;
   3879                 } else { // ceMatch == U_CE_SKIP_PATN
   3880                     // redo with same targCE, next patCE
   3881                     targetIxOffset--;
   3882                 }
   3883             }
   3884         }
   3885         targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
   3886 
   3887         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   3888             // No match at this targetIx.  Try again at the next.
   3889             continue;
   3890         }
   3891 
   3892         if (!found) {
   3893             // No match at all, we have run off the end of the target text.
   3894             break;
   3895         }
   3896 
   3897 
   3898         // We have found a match in CE space.
   3899         // Now determine the bounds in string index space.
   3900         //  There still is a chance of match failure if the CE range not correspond to
   3901         //     an acceptable character range.
   3902         //
   3903         const CEI *firstCEI = ceb.get(targetIx);
   3904         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
   3905 
   3906         mStart   = firstCEI->lowIndex;
   3907         minLimit = lastCEI->lowIndex;
   3908 
   3909         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   3910         //   extended to the end of input, and the match is good.
   3911 
   3912         // Look at the high and low indices of the CE following the match. If
   3913         // they are the same it means one of two things:
   3914         //    1. The match extended to the last CE from the target text, which is OK, or
   3915         //    2. The last CE that was part of the match is in an expansion that extends
   3916         //       to the first CE after the match. In this case, we reject the match.
   3917         if (strsrch->search->elementComparisonType == 0) {
   3918             const CEI *nextCEI  = ceb.get(targetIx + targetIxOffset);
   3919             maxLimit = nextCEI->lowIndex;
   3920             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   3921                 found = FALSE;
   3922             }
   3923         } else {
   3924             const CEI *nextCEI;
   3925             for ( ; ; ++targetIxOffset ) {
   3926                 nextCEI = ceb.get(targetIx + targetIxOffset);
   3927                 maxLimit = nextCEI->lowIndex;
   3928 				// If we are at the end of the target too, match succeeds
   3929                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
   3930                     break;
   3931                 }
   3932                 // As long as the next CE has primary weight of 0,
   3933                 // it is part of the last target element matched by the pattern;
   3934                 // make sure it can be part of a match with the last patCE
   3935                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
   3936                 	UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
   3937                 	if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
   3938                 		found = FALSE;
   3939                 		break;
   3940                 	}
   3941                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
   3942                 // target element, but it has non-zero primary weight => match fails
   3943                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
   3944                 	found = false;
   3945                 	break;
   3946                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
   3947                 } else {
   3948                 	break;
   3949                 }
   3950             }
   3951         }
   3952 
   3953 
   3954         // Check for the start of the match being within a combining sequence.
   3955         //   This can happen if the pattern itself begins with a combining char, and
   3956         //   the match found combining marks in the target text that were attached
   3957         //    to something else.
   3958         //   This type of match should be rejected for not completely consuming a
   3959         //   combining sequence.
   3960         if (isBreakBoundary(strsrch, mStart)) {
   3961             found = FALSE;
   3962         }
   3963 
   3964         // Check for the start of the match being within an Collation Element Expansion,
   3965         //   meaning that the first char of the match is only partially matched.
   3966         //   With exapnsions, the first CE will report the index of the source
   3967         //   character, and all subsequent (expansions) CEs will report the source index of the
   3968         //    _following_ character.
   3969         int32_t secondIx = firstCEI->highIndex;
   3970         if (mStart == secondIx) {
   3971             found = FALSE;
   3972         }
   3973 
   3974         //  Advance the match end position to the first acceptable match boundary.
   3975         //    This advances the index over any combining charcters.
   3976         mLimit = maxLimit;
   3977         if (minLimit < maxLimit) {
   3978             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   3979 
   3980             if (nba >= lastCEI->highIndex) {
   3981                 mLimit = nba;
   3982             }
   3983         }
   3984 
   3985     #ifdef USEARCH_DEBUG
   3986         if (getenv("USEARCH_DEBUG") != NULL) {
   3987             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   3988         }
   3989     #endif
   3990 
   3991         // If advancing to the end of a combining sequence in character indexing space
   3992         //   advanced us beyond the end of the match in CE space, reject this match.
   3993         if (mLimit > maxLimit) {
   3994             found = FALSE;
   3995         }
   3996 
   3997         if (isBreakBoundary(strsrch, mLimit)) {
   3998             found = FALSE;
   3999         }
   4000 
   4001         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4002             found = FALSE;
   4003         }
   4004 
   4005         if (found) {
   4006             break;
   4007         }
   4008     }
   4009 
   4010     #ifdef USEARCH_DEBUG
   4011     if (getenv("USEARCH_DEBUG") != NULL) {
   4012         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4013         int32_t  lastToPrint = ceb.limitIx+2;
   4014         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4015             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4016         }
   4017         printf("\n%s\n", found? "match found" : "no match");
   4018     }
   4019     #endif
   4020 
   4021     // All Done.  Store back the match bounds to the caller.
   4022     //
   4023     if (found==FALSE) {
   4024         mLimit = -1;
   4025         mStart = -1;
   4026     }
   4027 
   4028     if (matchStart != NULL) {
   4029         *matchStart= mStart;
   4030     }
   4031 
   4032     if (matchLimit != NULL) {
   4033         *matchLimit = mLimit;
   4034     }
   4035 
   4036     return found;
   4037 }
   4038 
   4039 
   4040 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
   4041                                                 int32_t        startIdx,
   4042                                                 int32_t        *matchStart,
   4043                                                 int32_t        *matchLimit,
   4044                                                 UErrorCode     *status)
   4045 {
   4046     if (U_FAILURE(*status)) {
   4047         return FALSE;
   4048     }
   4049 
   4050     // TODO:  reject search patterns beginning with a combining char.
   4051 
   4052 #ifdef USEARCH_DEBUG
   4053     if (getenv("USEARCH_DEBUG") != NULL) {
   4054         printf("Pattern CEs\n");
   4055         for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
   4056             printf(" %8x", strsrch->pattern.CE[ii]);
   4057         }
   4058         printf("\n");
   4059     }
   4060 
   4061 #endif
   4062     // Input parameter sanity check.
   4063     //  TODO:  should input indicies clip to the text length
   4064     //         in the same way that UText does.
   4065     if(strsrch->pattern.CELength == 0         ||
   4066        startIdx < 0                           ||
   4067        startIdx > strsrch->search->textLength ||
   4068        strsrch->pattern.CE == NULL) {
   4069            *status = U_ILLEGAL_ARGUMENT_ERROR;
   4070            return FALSE;
   4071     }
   4072 
   4073     if (strsrch->pattern.PCE == NULL) {
   4074         initializePatternPCETable(strsrch, status);
   4075     }
   4076 
   4077     CEBuffer ceb(strsrch, status);
   4078     int32_t    targetIx = 0;
   4079 
   4080     /*
   4081      * Pre-load the buffer with the CE's for the grapheme
   4082      * after our starting position so that we're sure that
   4083      * we can look at the CE following the match when we
   4084      * check the match boundaries.
   4085      *
   4086      * This will also pre-fetch the first CE that we'll
   4087      * consider for the match.
   4088      */
   4089     if (startIdx < strsrch->search->textLength) {
   4090         UBreakIterator *bi = strsrch->search->internalBreakIter;
   4091         int32_t next = ubrk_following(bi, startIdx);
   4092 
   4093         ucol_setOffset(strsrch->textIter, next, status);
   4094 
   4095         for (targetIx = 0; ; targetIx += 1) {
   4096             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
   4097                 break;
   4098             }
   4099         }
   4100     } else {
   4101         ucol_setOffset(strsrch->textIter, startIdx, status);
   4102     }
   4103 
   4104 
   4105     const CEI *targetCEI = NULL;
   4106     int32_t    patIx;
   4107     UBool      found;
   4108 
   4109     int32_t  limitIx = targetIx;
   4110     int32_t  mStart = -1;
   4111     int32_t  mLimit = -1;
   4112     int32_t  minLimit;
   4113     int32_t  maxLimit;
   4114 
   4115 
   4116 
   4117     // Outer loop moves over match starting positions in the
   4118     //      target CE space.
   4119     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
   4120     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
   4121     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
   4122     // and the beginning of the base text.
   4123     for(targetIx = limitIx; ; targetIx += 1)
   4124     {
   4125         found = TRUE;
   4126         //  Inner loop checks for a match beginning at each
   4127         //  position from the outer loop.
   4128         int32_t targetIxOffset = 0;
   4129         for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
   4130             int64_t patCE = strsrch->pattern.PCE[patIx];
   4131 
   4132             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
   4133             //  Compare CE from target string with CE from the pattern.
   4134             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
   4135             //    which will fail the compare, below.
   4136             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
   4137             if ( ceMatch == U_CE_NO_MATCH ) {
   4138                 found = FALSE;
   4139                 break;
   4140             } else if ( ceMatch > U_CE_NO_MATCH ) {
   4141                 if ( ceMatch == U_CE_SKIP_TARG ) {
   4142                     // redo with same patCE, next targCE
   4143                     patIx++;
   4144                     targetIxOffset++;
   4145                 } else { // ceMatch == U_CE_SKIP_PATN
   4146                     // redo with same targCE, next patCE
   4147                     targetIxOffset--;
   4148                 }
   4149             }
   4150         }
   4151 
   4152         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
   4153             // No match at this targetIx.  Try again at the next.
   4154             continue;
   4155         }
   4156 
   4157         if (!found) {
   4158             // No match at all, we have run off the end of the target text.
   4159             break;
   4160         }
   4161 
   4162 
   4163         // We have found a match in CE space.
   4164         // Now determine the bounds in string index space.
   4165         //  There still is a chance of match failure if the CE range not correspond to
   4166         //     an acceptable character range.
   4167         //
   4168         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
   4169         const CEI *lastCEI  = ceb.getPrevious(targetIx);
   4170         const CEI *nextCEI  = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
   4171 
   4172         mStart   = firstCEI->lowIndex;
   4173         minLimit = lastCEI->lowIndex;
   4174         maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;
   4175 
   4176         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
   4177         //   extended to the end of input, and the match is good.
   4178 
   4179         // Look at the high and low indices of the CE following the match. If
   4180         // they are the same it means one of two things:
   4181         //    1. The match extended to the last CE from the target text, which is OK, or
   4182         //    2. The last CE that was part of the match is in an expansion that extends
   4183         //       to the first CE after the match. In this case, we reject the match.
   4184         if (targetIx >= 1) {
   4185             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
   4186                 found = FALSE;
   4187             }
   4188         }
   4189 
   4190 
   4191         // Check for the start of the match being within a combining sequence.
   4192         //   This can happen if the pattern itself begins with a combining char, and
   4193         //   the match found combining marks in the target text that were attached
   4194         //    to something else.
   4195         //   This type of match should be rejected for not completely consuming a
   4196         //   combining sequence.
   4197         if (isBreakBoundary(strsrch, mStart)) {
   4198             found = FALSE;
   4199         }
   4200 
   4201         // Look at the high index of the first CE in the match. If it's the same as the
   4202         // low index, the first CE in the match is in the middle of an expansion.
   4203         if (mStart == firstCEI->highIndex) {
   4204             found = FALSE;
   4205         }
   4206 
   4207         //  Advance the match end position to the first acceptable match boundary.
   4208         //    This advances the index over any combining charcters.
   4209         mLimit = maxLimit;
   4210         if (/*targetIx > 0 &&*/ minLimit < maxLimit) {
   4211             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
   4212 
   4213             if (nba >= lastCEI->highIndex) {
   4214                 mLimit = nba;
   4215             }
   4216         }
   4217 
   4218     #ifdef USEARCH_DEBUG
   4219         if (getenv("USEARCH_DEBUG") != NULL) {
   4220             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
   4221         }
   4222     #endif
   4223 
   4224         // If advancing to the end of a combining sequence in character indexing space
   4225         //   advanced us beyond the end of the match in CE space, reject this match.
   4226         if (mLimit > maxLimit) {
   4227             found = FALSE;
   4228         }
   4229 
   4230         // Make sure the end of the match is on a break boundary
   4231         if (isBreakBoundary(strsrch, mLimit)) {
   4232             found = FALSE;
   4233         }
   4234 
   4235         if (! checkIdentical(strsrch, mStart, mLimit)) {
   4236             found = FALSE;
   4237         }
   4238 
   4239         if (found) {
   4240             break;
   4241         }
   4242     }
   4243 
   4244     #ifdef USEARCH_DEBUG
   4245     if (getenv("USEARCH_DEBUG") != NULL) {
   4246         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
   4247         int32_t  lastToPrint = ceb.limitIx+2;
   4248         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
   4249             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
   4250         }
   4251         printf("\n%s\n", found? "match found" : "no match");
   4252     }
   4253     #endif
   4254 
   4255     // All Done.  Store back the match bounds to the caller.
   4256     //
   4257     if (found==FALSE) {
   4258         mLimit = -1;
   4259         mStart = -1;
   4260     }
   4261 
   4262     if (matchStart != NULL) {
   4263         *matchStart= mStart;
   4264     }
   4265 
   4266     if (matchLimit != NULL) {
   4267         *matchLimit = mLimit;
   4268     }
   4269 
   4270     return found;
   4271 }
   4272 
   4273 
   4274 
   4275 
   4276 // internal use methods declared in usrchimp.h -----------------------------
   4277 
   4278 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
   4279 {
   4280     if (U_FAILURE(*status)) {
   4281         setMatchNotFound(strsrch);
   4282         return FALSE;
   4283     }
   4284 
   4285 #if BOYER_MOORE
   4286     UCollationElements *coleiter        = strsrch->textIter;
   4287     int32_t             textlength      = strsrch->search->textLength;
   4288     int32_t            *patternce       = strsrch->pattern.CE;
   4289     int32_t             patterncelength = strsrch->pattern.CELength;
   4290     int32_t             textoffset      = ucol_getOffset(coleiter);
   4291 
   4292     // status used in setting coleiter offset, since offset is checked in
   4293     // shiftForward before setting the coleiter offset, status never
   4294     // a failure
   4295     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4296                               patterncelength);
   4297     while (textoffset <= textlength)
   4298     {
   4299         uint32_t    patternceindex = patterncelength - 1;
   4300         int32_t     targetce;
   4301         UBool       found          = FALSE;
   4302         int32_t    lastce          = UCOL_NULLORDER;
   4303 
   4304         setColEIterOffset(coleiter, textoffset);
   4305 
   4306         for (;;) {
   4307             // finding the last pattern ce match, imagine composite characters
   4308             // for example: search for pattern A in text \u00C0
   4309             // we'll have to skip \u0300 the grave first before we get to A
   4310             targetce = ucol_previous(coleiter, status);
   4311             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4312                 found = FALSE;
   4313                 break;
   4314             }
   4315             targetce = getCE(strsrch, targetce);
   4316             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
   4317                 // this is for the text \u0315\u0300 that requires
   4318                 // normalization and pattern \u0300, where \u0315 is ignorable
   4319                 continue;
   4320             }
   4321             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4322                 lastce = targetce;
   4323             }
   4324             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4325             if (targetce == patternce[patternceindex]) {
   4326                 // the first ce can be a contraction
   4327                 found = TRUE;
   4328                 break;
   4329             }
   4330             if (!hasExpansion(coleiter)) {
   4331                 found = FALSE;
   4332                 break;
   4333             }
   4334         }
   4335 
   4336         //targetce = lastce;
   4337 
   4338         while (found && patternceindex > 0) {
   4339         	lastce = targetce;
   4340             targetce    = ucol_previous(coleiter, status);
   4341             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4342                 found = FALSE;
   4343                 break;
   4344             }
   4345             targetce    = getCE(strsrch, targetce);
   4346             if (targetce == UCOL_IGNORABLE) {
   4347                 continue;
   4348             }
   4349 
   4350             patternceindex --;
   4351             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4352             found = found && targetce == patternce[patternceindex];
   4353         }
   4354 
   4355         targetce = lastce;
   4356 
   4357         if (!found) {
   4358             if (U_FAILURE(*status)) {
   4359                 break;
   4360             }
   4361             textoffset = shiftForward(strsrch, textoffset, lastce,
   4362                                       patternceindex);
   4363             // status checked at loop.
   4364             patternceindex = patterncelength;
   4365             continue;
   4366         }
   4367 
   4368         if (checkNextExactMatch(strsrch, &textoffset, status)) {
   4369             // status checked in ucol_setOffset
   4370             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4371             return TRUE;
   4372         }
   4373     }
   4374     setMatchNotFound(strsrch);
   4375     return FALSE;
   4376 #else
   4377     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4378     int32_t start = -1;
   4379     int32_t end = -1;
   4380 
   4381     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4382         strsrch->search->matchedIndex  = start;
   4383         strsrch->search->matchedLength = end - start;
   4384         return TRUE;
   4385     } else {
   4386         setMatchNotFound(strsrch);
   4387         return FALSE;
   4388     }
   4389 #endif
   4390 }
   4391 
   4392 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
   4393 {
   4394     if (U_FAILURE(*status)) {
   4395         setMatchNotFound(strsrch);
   4396         return FALSE;
   4397     }
   4398 
   4399 #if BOYER_MOORE
   4400     UCollationElements *coleiter        = strsrch->textIter;
   4401     int32_t             textlength      = strsrch->search->textLength;
   4402     int32_t            *patternce       = strsrch->pattern.CE;
   4403     int32_t             patterncelength = strsrch->pattern.CELength;
   4404     int32_t             textoffset      = ucol_getOffset(coleiter);
   4405     UBool               hasPatternAccents =
   4406        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4407 
   4408     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
   4409                               patterncelength);
   4410     strsrch->canonicalPrefixAccents[0] = 0;
   4411     strsrch->canonicalSuffixAccents[0] = 0;
   4412 
   4413     while (textoffset <= textlength)
   4414     {
   4415         int32_t     patternceindex = patterncelength - 1;
   4416         int32_t     targetce;
   4417         UBool       found          = FALSE;
   4418         int32_t     lastce         = UCOL_NULLORDER;
   4419 
   4420         setColEIterOffset(coleiter, textoffset);
   4421 
   4422         for (;;) {
   4423             // finding the last pattern ce match, imagine composite characters
   4424             // for example: search for pattern A in text \u00C0
   4425             // we'll have to skip \u0300 the grave first before we get to A
   4426             targetce = ucol_previous(coleiter, status);
   4427             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4428                 found = FALSE;
   4429                 break;
   4430             }
   4431             targetce = getCE(strsrch, targetce);
   4432             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
   4433                 lastce = targetce;
   4434             }
   4435             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4436             if (targetce == patternce[patternceindex]) {
   4437                 // the first ce can be a contraction
   4438                 found = TRUE;
   4439                 break;
   4440             }
   4441             if (!hasExpansion(coleiter)) {
   4442                 found = FALSE;
   4443                 break;
   4444             }
   4445         }
   4446 
   4447         while (found && patternceindex > 0) {
   4448             targetce    = ucol_previous(coleiter, status);
   4449             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4450                 found = FALSE;
   4451                 break;
   4452             }
   4453             targetce    = getCE(strsrch, targetce);
   4454             if (targetce == UCOL_IGNORABLE) {
   4455                 continue;
   4456             }
   4457 
   4458             patternceindex --;
   4459             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4460             found = found && targetce == patternce[patternceindex];
   4461         }
   4462 
   4463         // initializing the rearranged accent array
   4464         if (hasPatternAccents && !found) {
   4465             strsrch->canonicalPrefixAccents[0] = 0;
   4466             strsrch->canonicalSuffixAccents[0] = 0;
   4467             if (U_FAILURE(*status)) {
   4468                 break;
   4469             }
   4470             found = doNextCanonicalMatch(strsrch, textoffset, status);
   4471         }
   4472 
   4473         if (!found) {
   4474             if (U_FAILURE(*status)) {
   4475                 break;
   4476             }
   4477             textoffset = shiftForward(strsrch, textoffset, lastce,
   4478                                       patternceindex);
   4479             // status checked at loop
   4480             patternceindex = patterncelength;
   4481             continue;
   4482         }
   4483 
   4484         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
   4485             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
   4486             return TRUE;
   4487         }
   4488     }
   4489     setMatchNotFound(strsrch);
   4490     return FALSE;
   4491 #else
   4492     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4493     int32_t start = -1;
   4494     int32_t end = -1;
   4495 
   4496     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
   4497         strsrch->search->matchedIndex  = start;
   4498         strsrch->search->matchedLength = end - start;
   4499         return TRUE;
   4500     } else {
   4501         setMatchNotFound(strsrch);
   4502         return FALSE;
   4503     }
   4504 #endif
   4505 }
   4506 
   4507 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
   4508 {
   4509     if (U_FAILURE(*status)) {
   4510         setMatchNotFound(strsrch);
   4511         return FALSE;
   4512     }
   4513 
   4514 #if BOYER_MOORE
   4515     UCollationElements *coleiter        = strsrch->textIter;
   4516     int32_t            *patternce       = strsrch->pattern.CE;
   4517     int32_t             patterncelength = strsrch->pattern.CELength;
   4518     int32_t             textoffset      = ucol_getOffset(coleiter);
   4519 
   4520     // shifting it check for setting offset
   4521     // if setOffset is called previously or there was no previous match, we
   4522     // leave the offset as it is.
   4523     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4524         textoffset = strsrch->search->matchedIndex;
   4525     }
   4526 
   4527     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4528                               patterncelength);
   4529 
   4530     while (textoffset >= 0)
   4531     {
   4532         int32_t     patternceindex = 1;
   4533         int32_t     targetce;
   4534         UBool       found          = FALSE;
   4535         int32_t     firstce        = UCOL_NULLORDER;
   4536 
   4537         // if status is a failure, ucol_setOffset does nothing
   4538         setColEIterOffset(coleiter, textoffset);
   4539 
   4540         for (;;) {
   4541             // finding the first pattern ce match, imagine composite
   4542             // characters. for example: search for pattern \u0300 in text
   4543             // \u00C0, we'll have to skip A first before we get to
   4544             // \u0300 the grave accent
   4545             targetce = ucol_next(coleiter, status);
   4546             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4547                 found = FALSE;
   4548                 break;
   4549             }
   4550             targetce = getCE(strsrch, targetce);
   4551             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4552                 firstce = targetce;
   4553             }
   4554             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
   4555                 continue;
   4556             }
   4557             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4558             if (targetce == patternce[0]) {
   4559                 found = TRUE;
   4560                 break;
   4561             }
   4562             if (!hasExpansion(coleiter)) {
   4563                 // checking for accents in composite character
   4564                 found = FALSE;
   4565                 break;
   4566             }
   4567         }
   4568 
   4569         //targetce = firstce;
   4570 
   4571         while (found && (patternceindex < patterncelength)) {
   4572         	firstce = targetce;
   4573             targetce    = ucol_next(coleiter, status);
   4574             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4575                 found = FALSE;
   4576                 break;
   4577             }
   4578             targetce    = getCE(strsrch, targetce);
   4579             if (targetce == UCOL_IGNORABLE) {
   4580                 continue;
   4581             }
   4582 
   4583             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4584             found = found && targetce == patternce[patternceindex];
   4585             patternceindex ++;
   4586         }
   4587 
   4588         targetce = firstce;
   4589 
   4590         if (!found) {
   4591             if (U_FAILURE(*status)) {
   4592                 break;
   4593             }
   4594 
   4595             textoffset = reverseShift(strsrch, textoffset, targetce,
   4596                                       patternceindex);
   4597             patternceindex = 0;
   4598             continue;
   4599         }
   4600 
   4601         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
   4602             setColEIterOffset(coleiter, textoffset);
   4603             return TRUE;
   4604         }
   4605     }
   4606     setMatchNotFound(strsrch);
   4607     return FALSE;
   4608 #else
   4609     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4610     int32_t start = -1;
   4611     int32_t end = -1;
   4612 
   4613     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4614         strsrch->search->matchedIndex = start;
   4615         strsrch->search->matchedLength = end - start;
   4616         return TRUE;
   4617     } else {
   4618         setMatchNotFound(strsrch);
   4619         return FALSE;
   4620     }
   4621 #endif
   4622 }
   4623 
   4624 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
   4625                                       UErrorCode    *status)
   4626 {
   4627     if (U_FAILURE(*status)) {
   4628         setMatchNotFound(strsrch);
   4629         return FALSE;
   4630     }
   4631 
   4632 #if BOYER_MOORE
   4633     UCollationElements *coleiter        = strsrch->textIter;
   4634     int32_t            *patternce       = strsrch->pattern.CE;
   4635     int32_t             patterncelength = strsrch->pattern.CELength;
   4636     int32_t             textoffset      = ucol_getOffset(coleiter);
   4637     UBool               hasPatternAccents =
   4638        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
   4639 
   4640     // shifting it check for setting offset
   4641     // if setOffset is called previously or there was no previous match, we
   4642     // leave the offset as it is.
   4643     if (strsrch->search->matchedIndex != USEARCH_DONE) {
   4644         textoffset = strsrch->search->matchedIndex;
   4645     }
   4646 
   4647     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
   4648                               patterncelength);
   4649     strsrch->canonicalPrefixAccents[0] = 0;
   4650     strsrch->canonicalSuffixAccents[0] = 0;
   4651 
   4652     while (textoffset >= 0)
   4653     {
   4654         int32_t     patternceindex = 1;
   4655         int32_t     targetce;
   4656         UBool       found          = FALSE;
   4657         int32_t     firstce        = UCOL_NULLORDER;
   4658 
   4659         setColEIterOffset(coleiter, textoffset);
   4660         for (;;) {
   4661             // finding the first pattern ce match, imagine composite
   4662             // characters. for example: search for pattern \u0300 in text
   4663             // \u00C0, we'll have to skip A first before we get to
   4664             // \u0300 the grave accent
   4665             targetce = ucol_next(coleiter, status);
   4666             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4667                 found = FALSE;
   4668                 break;
   4669             }
   4670             targetce = getCE(strsrch, targetce);
   4671             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
   4672                 firstce = targetce;
   4673             }
   4674 
   4675             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4676             if (targetce == patternce[0]) {
   4677                 // the first ce can be a contraction
   4678                 found = TRUE;
   4679                 break;
   4680             }
   4681             if (!hasExpansion(coleiter)) {
   4682                 // checking for accents in composite character
   4683                 found = FALSE;
   4684                 break;
   4685             }
   4686         }
   4687 
   4688         targetce = firstce;
   4689 
   4690         while (found && patternceindex < patterncelength) {
   4691             targetce    = ucol_next(coleiter, status);
   4692             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
   4693                 found = FALSE;
   4694                 break;
   4695             }
   4696             targetce = getCE(strsrch, targetce);
   4697             if (targetce == UCOL_IGNORABLE) {
   4698                 continue;
   4699             }
   4700 
   4701             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
   4702             found = found && targetce == patternce[patternceindex];
   4703             patternceindex ++;
   4704         }
   4705 
   4706         // initializing the rearranged accent array
   4707         if (hasPatternAccents && !found) {
   4708             strsrch->canonicalPrefixAccents[0] = 0;
   4709             strsrch->canonicalSuffixAccents[0] = 0;
   4710             if (U_FAILURE(*status)) {
   4711                 break;
   4712             }
   4713             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
   4714         }
   4715 
   4716         if (!found) {
   4717             if (U_FAILURE(*status)) {
   4718                 break;
   4719             }
   4720             textoffset = reverseShift(strsrch, textoffset, targetce,
   4721                                       patternceindex);
   4722             patternceindex = 0;
   4723             continue;
   4724         }
   4725 
   4726         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
   4727             setColEIterOffset(coleiter, textoffset);
   4728             return TRUE;
   4729         }
   4730     }
   4731     setMatchNotFound(strsrch);
   4732     return FALSE;
   4733 #else
   4734     int32_t textOffset = ucol_getOffset(strsrch->textIter);
   4735     int32_t start = -1;
   4736     int32_t end = -1;
   4737 
   4738     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
   4739         strsrch->search->matchedIndex = start;
   4740         strsrch->search->matchedLength = end - start;
   4741         return TRUE;
   4742     } else {
   4743         setMatchNotFound(strsrch);
   4744         return FALSE;
   4745     }
   4746 #endif
   4747 }
   4748 
   4749 #endif /* #if !UCONFIG_NO_COLLATION */
   4750