Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2009, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/usearch.h"
     15 
     16 #include "cmemory.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/coleitr.h"
     20 #include "unicode/ucoleitr.h"
     21 
     22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     23 
     24 #include "unicode/uniset.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "uhash.h"
     29 #include "ucol_imp.h"
     30 #include "unormimp.h"
     31 
     32 #include "unicode/colldata.h"
     33 #include "unicode/bmsearch.h"
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     40 
     41 
     42 struct CEI
     43 {
     44     uint32_t order;
     45     int32_t  lowOffset;
     46     int32_t  highOffset;
     47 };
     48 
     49 class Target : public UMemory
     50 {
     51 public:
     52     Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
     53     ~Target();
     54 
     55     void setTargetString(const UnicodeString *target);
     56 
     57     const CEI *nextCE(int32_t offset);
     58     const CEI *prevCE(int32_t offset);
     59 
     60     int32_t stringLength();
     61     UChar charAt(int32_t offset);
     62 
     63     UBool isBreakBoundary(int32_t offset);
     64     int32_t nextBreakBoundary(int32_t offset);
     65     int32_t nextSafeBoundary(int32_t offset);
     66 
     67     UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
     68 
     69     void setOffset(int32_t offset);
     70     void setLast(int32_t last);
     71     int32_t getOffset();
     72 
     73 private:
     74     CEI *ceb;
     75     int32_t bufferSize;
     76     int32_t bufferMin;
     77     int32_t bufferMax;
     78 
     79     uint32_t strengthMask;
     80     UCollationStrength strength;
     81     uint32_t variableTop;
     82     UBool toShift;
     83     UCollator *coll;
     84 
     85     const UnicodeString *targetString;
     86     const UChar *targetBuffer;
     87     int32_t targetLength;
     88 
     89     UCollationElements *elements;
     90     UBreakIterator *charBreakIterator;
     91 };
     92 
     93 Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
     94     : bufferSize(0), bufferMin(0), bufferMax(0),
     95       strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
     96       targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
     97 {
     98     strength = ucol_getStrength(coll);
     99     toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
    100     variableTop = ucol_getVariableTop(coll, &status);
    101 
    102     // find the largest expansion
    103     uint8_t maxExpansion = 0;
    104     for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
    105         if (*expansion > maxExpansion) {
    106             maxExpansion = *expansion;
    107         }
    108     }
    109 
    110     // room for an extra character on each end, plus 4 for safety
    111     bufferSize = patternLength + (2 * maxExpansion) + 4;
    112 
    113     ceb = NEW_ARRAY(CEI, bufferSize);
    114 
    115     if (ceb == NULL) {
    116         status = U_MEMORY_ALLOCATION_ERROR;
    117         return;
    118     }
    119 
    120     if (target != NULL) {
    121         setTargetString(target);
    122     }
    123 
    124     switch (strength)
    125     {
    126     default:
    127         strengthMask |= UCOL_TERTIARYORDERMASK;
    128         /* fall through */
    129 
    130     case UCOL_SECONDARY:
    131         strengthMask |= UCOL_SECONDARYORDERMASK;
    132         /* fall through */
    133 
    134     case UCOL_PRIMARY:
    135         strengthMask |= UCOL_PRIMARYORDERMASK;
    136     }
    137 }
    138 
    139 Target::~Target()
    140 {
    141     ubrk_close(charBreakIterator);
    142     ucol_closeElements(elements);
    143 
    144     DELETE_ARRAY(ceb);
    145 }
    146 
    147 void Target::setTargetString(const UnicodeString *target)
    148 {
    149     if (charBreakIterator != NULL) {
    150         ubrk_close(charBreakIterator);
    151         ucol_closeElements(elements);
    152     }
    153 
    154     targetString = target;
    155 
    156     if (targetString != NULL) {
    157         UErrorCode status = U_ZERO_ERROR;
    158 
    159         targetBuffer = targetString->getBuffer();
    160         targetLength = targetString->length();
    161 
    162         elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
    163         ucol_forceHanImplicit(elements, &status);
    164 
    165         charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
    166                                       targetBuffer, targetLength, &status);
    167     } else {
    168         targetBuffer = NULL;
    169         targetLength = 0;
    170     }
    171 }
    172 
    173 const CEI *Target::nextCE(int32_t offset)
    174 {
    175     UErrorCode status = U_ZERO_ERROR;
    176     int32_t low = -1, high = -1;
    177     uint32_t order;
    178     UBool cont = FALSE;
    179 
    180     if (offset >= bufferMin && offset < bufferMax) {
    181         return &ceb[offset];
    182     }
    183 
    184     if (bufferMax >= bufferSize || offset != bufferMax) {
    185         return NULL;
    186     }
    187 
    188     do {
    189         low   = ucol_getOffset(elements);
    190         order = ucol_next(elements, &status);
    191         high  = ucol_getOffset(elements);
    192 
    193         if (order == (uint32_t)UCOL_NULLORDER) {
    194           //high = low = -1;
    195             break;
    196         }
    197 
    198         cont = isContinuation(order);
    199         order &= strengthMask;
    200 
    201         if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
    202             if (strength >= UCOL_QUATERNARY) {
    203                 order &= UCOL_PRIMARYORDERMASK;
    204             } else {
    205                 order = UCOL_IGNORABLE;
    206             }
    207         }
    208     } while (order == UCOL_IGNORABLE);
    209 
    210     if (cont) {
    211         order |= UCOL_CONTINUATION_MARKER;
    212     }
    213 
    214     ceb[offset].order = order;
    215     ceb[offset].lowOffset = low;
    216     ceb[offset].highOffset = high;
    217 
    218     bufferMax += 1;
    219 
    220     return &ceb[offset];
    221 }
    222 
    223 const CEI *Target::prevCE(int32_t offset)
    224 {
    225     UErrorCode status = U_ZERO_ERROR;
    226     int32_t low = -1, high = -1;
    227     uint32_t order;
    228     UBool cont = FALSE;
    229 
    230     if (offset >= bufferMin && offset < bufferMax) {
    231         return &ceb[offset];
    232     }
    233 
    234     if (bufferMax >= bufferSize || offset != bufferMax) {
    235         return NULL;
    236     }
    237 
    238     do {
    239         high  = ucol_getOffset(elements);
    240         order = ucol_previous(elements, &status);
    241         low   = ucol_getOffset(elements);
    242 
    243         if (order == (uint32_t)UCOL_NULLORDER) {
    244             break;
    245         }
    246 
    247         cont = isContinuation(order);
    248         order &= strengthMask;
    249 
    250         if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
    251             if (strength >= UCOL_QUATERNARY) {
    252                 order &= UCOL_PRIMARYORDERMASK;
    253             } else {
    254                 order = UCOL_IGNORABLE;
    255             }
    256         }
    257     } while (order == UCOL_IGNORABLE);
    258 
    259     bufferMax += 1;
    260 
    261     if (cont) {
    262         order |= UCOL_CONTINUATION_MARKER;
    263     }
    264 
    265     ceb[offset].order       = order;
    266     ceb[offset].lowOffset   = low;
    267     ceb[offset].highOffset = high;
    268 
    269     return &ceb[offset];
    270 }
    271 
    272 int32_t Target::stringLength()
    273 {
    274     if (targetString != NULL) {
    275         return targetLength;
    276     }
    277 
    278     return 0;
    279 }
    280 
    281 UChar Target::charAt(int32_t offset)
    282 {
    283     if (targetString != NULL) {
    284         return targetBuffer[offset];
    285     }
    286 
    287     return 0x0000;
    288 }
    289 
    290 void Target::setOffset(int32_t offset)
    291 {
    292     UErrorCode status = U_ZERO_ERROR;
    293 
    294     bufferMin = 0;
    295     bufferMax = 0;
    296 
    297     ucol_setOffset(elements, offset, &status);
    298 }
    299 
    300 void Target::setLast(int32_t last)
    301 {
    302     UErrorCode status = U_ZERO_ERROR;
    303 
    304     bufferMin = 0;
    305     bufferMax = 1;
    306 
    307     ceb[0].order      = UCOL_NULLORDER;
    308     ceb[0].lowOffset  = last;
    309     ceb[0].highOffset = last;
    310 
    311     ucol_setOffset(elements, last, &status);
    312 }
    313 
    314 int32_t Target::getOffset()
    315 {
    316     return ucol_getOffset(elements);
    317 }
    318 
    319 UBool Target::isBreakBoundary(int32_t offset)
    320 {
    321     return ubrk_isBoundary(charBreakIterator, offset);
    322 }
    323 
    324 int32_t Target::nextBreakBoundary(int32_t offset)
    325 {
    326     return ubrk_following(charBreakIterator, offset);
    327 }
    328 
    329 int32_t Target::nextSafeBoundary(int32_t offset)
    330 {
    331     while (offset < targetLength) {
    332       //UChar ch = charAt(offset);
    333         UChar ch = targetBuffer[offset];
    334 
    335         if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
    336             return offset;
    337         }
    338 
    339         offset += 1;
    340     }
    341 
    342     return targetLength;
    343 }
    344 
    345 UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
    346 {
    347     if (strength < UCOL_IDENTICAL) {
    348         return TRUE;
    349     }
    350 
    351     UChar t2[32], p2[32];
    352     const UChar *pBuffer = pattern.getBuffer();
    353     int32_t pLength = pattern.length();
    354     int32_t length = end - start;
    355 
    356     UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
    357 
    358     int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2),
    359                                        targetBuffer + start, length,
    360                                        FALSE, 0, &status);
    361 
    362     // use separate status2 in case of buffer overflow
    363     if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2),
    364                                         pBuffer, pLength,
    365                                         FALSE, 0, &status2)) {
    366         return FALSE; // lengths are different
    367     }
    368 
    369     // compare contents
    370     UChar *text, *pat;
    371 
    372     if(U_SUCCESS(status)) {
    373         text = t2;
    374         pat = p2;
    375     } else if(status == U_BUFFER_OVERFLOW_ERROR) {
    376         status = U_ZERO_ERROR;
    377 
    378         // allocate one buffer for both decompositions
    379         text = NEW_ARRAY(UChar, decomplength * 2);
    380 
    381         // Check for allocation failure.
    382         if (text == NULL) {
    383         	return FALSE;
    384         }
    385 
    386         pat = text + decomplength;
    387 
    388         unorm_decompose(text, decomplength, targetBuffer + start,
    389                         length, FALSE, 0, &status);
    390 
    391         unorm_decompose(pat, decomplength, pBuffer,
    392                         pLength, FALSE, 0, &status);
    393     } else {
    394         // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
    395         // and that we don't uprv_free() an undefined text pointer
    396         text = pat = t2;
    397         decomplength = 0;
    398     }
    399 
    400     UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0);
    401 
    402     if(text != t2) {
    403         DELETE_ARRAY(text);
    404     }
    405 
    406     // return FALSE if NFD failed
    407     return U_SUCCESS(status) && result;
    408 }
    409 
    410 #define HASH_TABLE_SIZE 257
    411 
    412 class BadCharacterTable : public UMemory
    413 {
    414 public:
    415     BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
    416     ~BadCharacterTable();
    417 
    418     int32_t operator[](uint32_t ce) const;
    419     int32_t getMaxSkip() const;
    420     int32_t minLengthInChars(int32_t index);
    421 
    422 private:
    423     static int32_t hash(uint32_t ce);
    424 
    425     int32_t maxSkip;
    426     int32_t badCharacterTable[HASH_TABLE_SIZE];
    427 
    428     int32_t *minLengthCache;
    429 };
    430 
    431 BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    432     : minLengthCache(NULL)
    433 {
    434     int32_t plen = patternCEs.size();
    435 
    436     // **** need a better way to deal with this ****
    437     if (U_FAILURE(status) || plen == 0) {
    438         return;
    439     }
    440 
    441     int32_t *history = NEW_ARRAY(int32_t, plen);
    442 
    443     if (history == NULL) {
    444         status = U_MEMORY_ALLOCATION_ERROR;
    445         return;
    446     }
    447 
    448     for (int32_t i = 0; i < plen; i += 1) {
    449         history[i] = -1;
    450     }
    451 
    452     minLengthCache = NEW_ARRAY(int32_t, plen + 1);
    453 
    454     if (minLengthCache == NULL) {
    455         DELETE_ARRAY(history);
    456         status = U_MEMORY_ALLOCATION_ERROR;
    457         return;
    458     }
    459 
    460     maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
    461 
    462     for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
    463         badCharacterTable[j] = maxSkip;
    464     }
    465 
    466     for(int32_t p = 1; p < plen; p += 1) {
    467         minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
    468 
    469         // Make sure this entry is not bigger than the previous one.
    470         // Otherwise, we might skip too far in some cases.
    471         if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
    472             minLengthCache[p] = minLengthCache[p - 1];
    473         }
    474     }
    475 
    476     minLengthCache[plen] = 0;
    477 
    478     for(int32_t p = 0; p < plen - 1; p += 1) {
    479         badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    480     }
    481 
    482     DELETE_ARRAY(history);
    483 }
    484 
    485 BadCharacterTable::~BadCharacterTable()
    486 {
    487     DELETE_ARRAY(minLengthCache);
    488 }
    489 
    490 int32_t BadCharacterTable::operator[](uint32_t ce) const
    491 {
    492     return badCharacterTable[hash(ce)];
    493 }
    494 
    495 int32_t BadCharacterTable::getMaxSkip() const
    496 {
    497     return maxSkip;
    498 }
    499 
    500 int32_t BadCharacterTable::minLengthInChars(int32_t index)
    501 {
    502     return minLengthCache[index];
    503 }
    504 
    505 int32_t BadCharacterTable::hash(uint32_t ce)
    506 {
    507     return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
    508 }
    509 
    510 class GoodSuffixTable : public UMemory
    511 {
    512 public:
    513     GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
    514     ~GoodSuffixTable();
    515 
    516     int32_t operator[](int32_t offset) const;
    517 
    518 private:
    519     int32_t *goodSuffixTable;
    520 };
    521 
    522 GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
    523     : goodSuffixTable(NULL)
    524 {
    525     int32_t patlen = patternCEs.size();
    526 
    527     // **** need a better way to deal with this ****
    528     if (U_FAILURE(status) || patlen <= 0) {
    529         return;
    530     }
    531 
    532     int32_t *suff  = NEW_ARRAY(int32_t, patlen);
    533     int32_t start = patlen - 1, end = - 1;
    534     int32_t maxSkip = badCharacterTable.getMaxSkip();
    535 
    536     if (suff == NULL) {
    537         status = U_MEMORY_ALLOCATION_ERROR;
    538         return;
    539     }
    540 
    541     // initialze suff
    542     suff[patlen - 1] = patlen;
    543 
    544     for (int32_t i = patlen - 2; i >= 0; i -= 1) {
    545         // (i > start) means we're inside the last suffix match we found
    546         // ((patlen - 1) - end) is how far the end of that match is from end of pattern
    547         // (i - start) is how far we are from start of that match
    548         // (i + (patlen - 1) - end) is index of same character at end of pattern
    549         // so if any suffix match at that character doesn't extend beyond the last match,
    550         // it's the suffix for this character as well
    551         if (i > start && suff[i + patlen - 1 - end] < i - start) {
    552             suff[i] = suff[i + patlen - 1 - end];
    553         } else {
    554             start = end = i;
    555 
    556             int32_t s = patlen;
    557 
    558             while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
    559                 start -= 1;
    560             }
    561 
    562             suff[i] = end - start;
    563         }
    564     }
    565 
    566     // now build goodSuffixTable
    567     goodSuffixTable  = NEW_ARRAY(int32_t, patlen);
    568 
    569     if (goodSuffixTable == NULL) {
    570         DELETE_ARRAY(suff);
    571         status = U_MEMORY_ALLOCATION_ERROR;
    572         return;
    573     }
    574 
    575 
    576     // initialize entries to minLengthInChars of the pattern
    577     for (int32_t i = 0; i < patlen; i += 1) {
    578         goodSuffixTable[i] = maxSkip;
    579     }
    580 
    581     int32_t prefix = 0;
    582 
    583     for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
    584         if (suff[i] == i + 1) {
    585             // this matching suffix is a prefix of the pattern
    586             int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
    587 
    588             // for any mis-match before this suffix, we should skip
    589             // so that the front of the pattern (i.e. the prefix)
    590             // lines up with the front of the suffix.
    591             // (patlen - 1 - i) is the start of the suffix
    592             while (prefix < patlen - 1 - i) {
    593                 // value of maxSkip means never set...
    594                 if (goodSuffixTable[prefix] == maxSkip) {
    595                     goodSuffixTable[prefix] = prefixSkip;
    596                 }
    597 
    598                 prefix += 1;
    599             }
    600         }
    601     }
    602 
    603     for (int32_t i = 0; i < patlen - 1; i += 1) {
    604         goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
    605     }
    606 
    607     DELETE_ARRAY(suff);
    608 }
    609 
    610 GoodSuffixTable::~GoodSuffixTable()
    611 {
    612     DELETE_ARRAY(goodSuffixTable);
    613 }
    614 
    615 int32_t GoodSuffixTable::operator[](int32_t offset) const
    616 {
    617     return goodSuffixTable[offset];
    618 }
    619 
    620 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
    621 
    622 
    623 UBool BoyerMooreSearch::empty()
    624 {
    625     return patCEs->size() <= 0;
    626 }
    627 
    628 CollData *BoyerMooreSearch::getData()
    629 {
    630     return data;
    631 }
    632 
    633 CEList *BoyerMooreSearch::getPatternCEs()
    634 {
    635     return patCEs;
    636 }
    637 
    638 BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
    639 {
    640     return badCharacterTable;
    641 }
    642 
    643 GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
    644 {
    645     return goodSuffixTable;
    646 }
    647 
    648 BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
    649                                    UErrorCode &status)
    650     : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
    651 {
    652 
    653     if (U_FAILURE(status)) {
    654         return;
    655     }
    656 
    657     UCollator *collator = data->getCollator();
    658 
    659     patCEs = new CEList(collator, patternString, status);
    660 
    661     if (patCEs == NULL || U_FAILURE(status)) {
    662         return;
    663     }
    664 
    665     badCharacterTable = new BadCharacterTable(*patCEs, data, status);
    666 
    667     if (badCharacterTable == NULL || U_FAILURE(status)) {
    668         return;
    669     }
    670 
    671     goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
    672 
    673     if (targetString != NULL) {
    674         target = new Target(collator, targetString, patCEs->size(), status);
    675     }
    676 }
    677 
    678 BoyerMooreSearch::~BoyerMooreSearch()
    679 {
    680     delete target;
    681     delete goodSuffixTable;
    682     delete badCharacterTable;
    683     delete patCEs;
    684 }
    685 
    686 void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
    687 {
    688     if (U_FAILURE(status)) {
    689         return;
    690     }
    691 
    692     if (target == NULL) {
    693         target = new Target(data->getCollator(), targetString, patCEs->size(), status);
    694     } else {
    695         target->setTargetString(targetString);
    696     }
    697 }
    698 
    699 // **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
    700 /*
    701  * TODO:
    702  *  * deal with trailing (and leading?) ignorables.
    703  *  * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
    704  */
    705 UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
    706 {
    707     /*UCollator *coll =*/ data->getCollator();
    708     int32_t plen = patCEs->size();
    709     int32_t tlen = target->stringLength();
    710     int32_t maxSkip = badCharacterTable->getMaxSkip();
    711     int32_t tOffset = offset + maxSkip;
    712 
    713     if (plen <= 0) {
    714         // Searching for a zero length pattern always fails.
    715         start = end = -1;
    716         return FALSE;
    717     }
    718 
    719     while (tOffset <= tlen) {
    720         int32_t pIndex = plen - 1;
    721         int32_t tIndex = 0;
    722         int32_t lIndex = 0;
    723 
    724         if (tOffset < tlen) {
    725             // **** we really want to skip ahead enough to  ****
    726             // **** be sure we get at least 1 non-ignorable ****
    727             // **** CE after the end of the pattern.        ****
    728             int32_t next = target->nextSafeBoundary(tOffset + 1);
    729 
    730             target->setOffset(next);
    731 
    732             for (lIndex = 0; ; lIndex += 1) {
    733                 const CEI *cei = target->prevCE(lIndex);
    734                 int32_t low = cei->lowOffset;
    735                 int32_t high = cei->highOffset;
    736 
    737                 if (high == 0 || (low < high && low <= tOffset)) {
    738                     if (low < tOffset) {
    739                         while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
    740                             lIndex -= 1;
    741                         }
    742 
    743                         if (high > tOffset) {
    744                             tOffset = high;
    745                         }
    746                     }
    747 
    748                     break;
    749                 }
    750             }
    751         } else {
    752             target->setLast(tOffset);
    753             lIndex = 0;
    754         }
    755 
    756         tIndex = ++lIndex;
    757 
    758         // Iterate backward until we hit the beginning of the pattern
    759         while (pIndex >= 0) {
    760             uint32_t pce = (*patCEs)[pIndex];
    761             const CEI *tcei = target->prevCE(tIndex++);
    762 
    763 
    764             if (tcei->order != pce) {
    765                 // There is a mismatch at this position.  Decide how far
    766                 // over to shift the pattern, then try again.
    767 
    768                 int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
    769 #ifdef EXTRA_CAUTIOUS
    770                 int32_t old = tOffset;
    771 #endif
    772 
    773                 tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
    774 
    775                 if (gsOffset > tOffset) {
    776                     tOffset = gsOffset;
    777                 }
    778 
    779 #ifdef EXTRA_CAUTIOUS
    780                 // Make sure we don't skip backwards...
    781                 if (tOffset <= old) {
    782                     tOffset = old + 1;
    783                 }
    784 #endif
    785 
    786                 break;
    787             }
    788 
    789             pIndex -= 1;
    790         }
    791 
    792         if (pIndex < 0) {
    793             // We made it back to the beginning of the pattern,
    794             // which means we matched it all.  Return the location.
    795             const CEI firstCEI = *target->prevCE(tIndex - 1);
    796             const CEI lastCEI  = *target->prevCE(lIndex);
    797             int32_t mStart   = firstCEI.lowOffset;
    798             int32_t minLimit = lastCEI.lowOffset;
    799             int32_t maxLimit = lastCEI.highOffset;
    800             int32_t mLimit;
    801             UBool found = TRUE;
    802 
    803             target->setOffset(/*tOffset*/maxLimit);
    804 
    805             const CEI nextCEI = *target->nextCE(0);
    806 
    807             if (nextCEI.lowOffset > maxLimit) {
    808                 maxLimit = nextCEI.lowOffset;
    809             }
    810 
    811             if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
    812                 found = FALSE;
    813             }
    814 
    815             if (! target->isBreakBoundary(mStart)) {
    816                 found = FALSE;
    817             }
    818 
    819             if (firstCEI.lowOffset == firstCEI.highOffset) {
    820                 found = FALSE;
    821             }
    822 
    823             mLimit = maxLimit;
    824             if (minLimit < maxLimit) {
    825                 int32_t nbb = target->nextBreakBoundary(minLimit);
    826 
    827                 if (nbb >= lastCEI.highOffset) {
    828                     mLimit = nbb;
    829                 }
    830             }
    831 
    832             if (mLimit > maxLimit) {
    833                 found = FALSE;
    834             }
    835 
    836             if (! target->isBreakBoundary(mLimit)) {
    837                 found = FALSE;
    838             }
    839 
    840             if (! target->isIdentical(pattern, mStart, mLimit)) {
    841                 found = FALSE;
    842             }
    843 
    844             if (found) {
    845                 start = mStart;
    846                 end   = mLimit;
    847 
    848                 return TRUE;
    849             }
    850 
    851             tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
    852         }
    853         // Otherwise, we're here because of a mismatch, so keep going....
    854     }
    855 
    856     // no match
    857    start = -1;
    858    end = -1;
    859    return FALSE;
    860 }
    861 
    862 U_NAMESPACE_END
    863 
    864 #endif // #if !UCONFIG_NO_COLLATION
    865