Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2010, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/usearch.h"
     15 
     16 #include "cmemory.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/coleitr.h"
     20 #include "unicode/ucoleitr.h"
     21 
     22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     23 
     24 #include "unicode/uniset.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "uhash.h"
     29 #include "ucol_imp.h"
     30 #include "normalizer2impl.h"
     31 
     32 #include "unicode/colldata.h"
     33 #include "unicode/bmsearch.h"
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     40 
     41 
     42 struct CEI
     43 {
     44     uint32_t order;
     45     int32_t  lowOffset;
     46     int32_t  highOffset;
     47 };
     48 
     49 class Target : public UMemory
     50 {
     51 public:
     52     Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
     53     ~Target();
     54 
     55     void setTargetString(const UnicodeString *target);
     56 
     57     const CEI *nextCE(int32_t offset);
     58     const CEI *prevCE(int32_t offset);
     59 
     60     int32_t stringLength();
     61     UChar charAt(int32_t offset);
     62 
     63     UBool isBreakBoundary(int32_t offset);
     64     int32_t nextBreakBoundary(int32_t offset);
     65     int32_t nextSafeBoundary(int32_t offset);
     66 
     67     UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
     68 
     69     void setOffset(int32_t offset);
     70     void setLast(int32_t last);
     71     int32_t getOffset();
     72 
     73 private:
     74     CEI *ceb;
     75     int32_t bufferSize;
     76     int32_t bufferMin;
     77     int32_t bufferMax;
     78 
     79     uint32_t strengthMask;
     80     UCollationStrength strength;
     81     uint32_t variableTop;
     82     UBool toShift;
     83     UCollator *coll;
     84     const Normalizer2 &nfd;
     85 
     86     const UnicodeString *targetString;
     87     const UChar *targetBuffer;
     88     int32_t targetLength;
     89 
     90     UCollationElements *elements;
     91     UBreakIterator *charBreakIterator;
     92 };
     93 
     94 Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
     95     : bufferSize(0), bufferMin(0), bufferMax(0),
     96       strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
     97       nfd(*Normalizer2Factory::getNFDInstance(status)),
     98       targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
     99 {
    100     strength = ucol_getStrength(coll);
    101     toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
    102     variableTop = ucol_getVariableTop(coll, &status);
    103 
    104     // find the largest expansion
    105     uint8_t maxExpansion = 0;
    106     for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
    107         if (*expansion > maxExpansion) {
    108             maxExpansion = *expansion;
    109         }
    110     }
    111 
    112     // room for an extra character on each end, plus 4 for safety
    113     bufferSize = patternLength + (2 * maxExpansion) + 4;
    114 
    115     ceb = NEW_ARRAY(CEI, bufferSize);
    116 
    117     if (ceb == NULL) {
    118         status = U_MEMORY_ALLOCATION_ERROR;
    119         return;
    120     }
    121 
    122     if (target != NULL) {
    123         setTargetString(target);
    124     }
    125 
    126     switch (strength)
    127     {
    128     default:
    129         strengthMask |= UCOL_TERTIARYORDERMASK;
    130         /* fall through */
    131 
    132     case UCOL_SECONDARY:
    133         strengthMask |= UCOL_SECONDARYORDERMASK;
    134         /* fall through */
    135 
    136     case UCOL_PRIMARY:
    137         strengthMask |= UCOL_PRIMARYORDERMASK;
    138     }
    139 }
    140 
    141 Target::~Target()
    142 {
    143     ubrk_close(charBreakIterator);
    144     ucol_closeElements(elements);
    145 
    146     DELETE_ARRAY(ceb);
    147 }
    148 
    149 void Target::setTargetString(const UnicodeString *target)
    150 {
    151     if (charBreakIterator != NULL) {
    152         ubrk_close(charBreakIterator);
    153         ucol_closeElements(elements);
    154     }
    155 
    156     targetString = target;
    157 
    158     if (targetString != NULL) {
    159         UErrorCode status = U_ZERO_ERROR;
    160 
    161         targetBuffer = targetString->getBuffer();
    162         targetLength = targetString->length();
    163 
    164         elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
    165         ucol_forceHanImplicit(elements, &status);
    166 
    167         charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
    168                                       targetBuffer, targetLength, &status);
    169     } else {
    170         targetBuffer = NULL;
    171         targetLength = 0;
    172     }
    173 }
    174 
    175 const CEI *Target::nextCE(int32_t offset)
    176 {
    177     UErrorCode status = U_ZERO_ERROR;
    178     int32_t low = -1, high = -1;
    179     uint32_t order;
    180     UBool cont = FALSE;
    181 
    182     if (offset >= bufferMin && offset < bufferMax) {
    183         return &ceb[offset];
    184     }
    185 
    186     if (bufferMax >= bufferSize || offset != bufferMax) {
    187         return NULL;
    188     }
    189 
    190     do {
    191         low   = ucol_getOffset(elements);
    192         order = ucol_next(elements, &status);
    193         high  = ucol_getOffset(elements);
    194 
    195         if (order == (uint32_t)UCOL_NULLORDER) {
    196           //high = low = -1;
    197             break;
    198         }
    199 
    200         cont = isContinuation(order);
    201         order &= strengthMask;
    202 
    203         if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
    204             if (strength >= UCOL_QUATERNARY) {
    205                 order &= UCOL_PRIMARYORDERMASK;
    206             } else {
    207                 order = UCOL_IGNORABLE;
    208             }
    209         }
    210     } while (order == UCOL_IGNORABLE);
    211 
    212     if (cont) {
    213         order |= UCOL_CONTINUATION_MARKER;
    214     }
    215 
    216     ceb[offset].order = order;
    217     ceb[offset].lowOffset = low;
    218     ceb[offset].highOffset = high;
    219 
    220     bufferMax += 1;
    221 
    222     return &ceb[offset];
    223 }
    224 
    225 const CEI *Target::prevCE(int32_t offset)
    226 {
    227     UErrorCode status = U_ZERO_ERROR;
    228     int32_t low = -1, high = -1;
    229     uint32_t order;
    230     UBool cont = FALSE;
    231 
    232     if (offset >= bufferMin && offset < bufferMax) {
    233         return &ceb[offset];
    234     }
    235 
    236     if (bufferMax >= bufferSize || offset != bufferMax) {
    237         return NULL;
    238     }
    239 
    240     do {
    241         high  = ucol_getOffset(elements);
    242         order = ucol_previous(elements, &status);
    243         low   = ucol_getOffset(elements);
    244 
    245         if (order == (uint32_t)UCOL_NULLORDER) {
    246             break;
    247         }
    248 
    249         cont = isContinuation(order);
    250         order &= strengthMask;
    251 
    252         if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
    253             if (strength >= UCOL_QUATERNARY) {
    254                 order &= UCOL_PRIMARYORDERMASK;
    255             } else {
    256                 order = UCOL_IGNORABLE;
    257             }
    258         }
    259     } while (order == UCOL_IGNORABLE);
    260 
    261     bufferMax += 1;
    262 
    263     if (cont) {
    264         order |= UCOL_CONTINUATION_MARKER;
    265     }
    266 
    267     ceb[offset].order       = order;
    268     ceb[offset].lowOffset   = low;
    269     ceb[offset].highOffset = high;
    270 
    271     return &ceb[offset];
    272 }
    273 
    274 int32_t Target::stringLength()
    275 {
    276     if (targetString != NULL) {
    277         return targetLength;
    278     }
    279 
    280     return 0;
    281 }
    282 
    283 UChar Target::charAt(int32_t offset)
    284 {
    285     if (targetString != NULL) {
    286         return targetBuffer[offset];
    287     }
    288 
    289     return 0x0000;
    290 }
    291 
    292 void Target::setOffset(int32_t offset)
    293 {
    294     UErrorCode status = U_ZERO_ERROR;
    295 
    296     bufferMin = 0;
    297     bufferMax = 0;
    298 
    299     ucol_setOffset(elements, offset, &status);
    300 }
    301 
    302 void Target::setLast(int32_t last)
    303 {
    304     UErrorCode status = U_ZERO_ERROR;
    305 
    306     bufferMin = 0;
    307     bufferMax = 1;
    308 
    309     ceb[0].order      = UCOL_NULLORDER;
    310     ceb[0].lowOffset  = last;
    311     ceb[0].highOffset = last;
    312 
    313     ucol_setOffset(elements, last, &status);
    314 }
    315 
    316 int32_t Target::getOffset()
    317 {
    318     return ucol_getOffset(elements);
    319 }
    320 
    321 UBool Target::isBreakBoundary(int32_t offset)
    322 {
    323     return ubrk_isBoundary(charBreakIterator, offset);
    324 }
    325 
    326 int32_t Target::nextBreakBoundary(int32_t offset)
    327 {
    328     return ubrk_following(charBreakIterator, offset);
    329 }
    330 
    331 int32_t Target::nextSafeBoundary(int32_t offset)
    332 {
    333     while (offset < targetLength) {
    334       //UChar ch = charAt(offset);
    335         UChar ch = targetBuffer[offset];
    336 
    337         if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
    338             return offset;
    339         }
    340 
    341         offset += 1;
    342     }
    343 
    344     return targetLength;
    345 }
    346 
    347 UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
    348 {
    349     if (strength < UCOL_IDENTICAL) {
    350         return TRUE;
    351     }
    352 
    353     // Note: We could use Normalizer::compare() or similar, but for short strings
    354     // which may not be in FCD it might be faster to just NFD them.
    355     UErrorCode status = U_ZERO_ERROR;
    356     UnicodeString t2, p2;
    357     nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
    358     nfd.normalize(pattern, p2, status);
    359     // return FALSE if NFD failed
    360     return U_SUCCESS(status) && t2 == p2;
    361 }
    362 
    363 #define HASH_TABLE_SIZE 257
    364 
    365 class BadCharacterTable : public UMemory
    366 {
    367 public:
    368     BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
    369     ~BadCharacterTable();
    370 
    371     int32_t operator[](uint32_t ce) const;
    372     int32_t getMaxSkip() const;
    373     int32_t minLengthInChars(int32_t index);
    374 
    375 private:
    376     static int32_t hash(uint32_t ce);
    377 
    378     int32_t maxSkip;
    379     int32_t badCharacterTable[HASH_TABLE_SIZE];
    380 
    381     int32_t *minLengthCache;
    382 };
    383 
    384 BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
    385     : minLengthCache(NULL)
    386 {
    387     int32_t plen = patternCEs.size();
    388 
    389     // **** need a better way to deal with this ****
    390     if (U_FAILURE(status) || plen == 0) {
    391         return;
    392     }
    393 
    394     int32_t *history = NEW_ARRAY(int32_t, plen);
    395 
    396     if (history == NULL) {
    397         status = U_MEMORY_ALLOCATION_ERROR;
    398         return;
    399     }
    400 
    401     for (int32_t i = 0; i < plen; i += 1) {
    402         history[i] = -1;
    403     }
    404 
    405     minLengthCache = NEW_ARRAY(int32_t, plen + 1);
    406 
    407     if (minLengthCache == NULL) {
    408         DELETE_ARRAY(history);
    409         status = U_MEMORY_ALLOCATION_ERROR;
    410         return;
    411     }
    412 
    413     maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
    414 
    415     for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
    416         badCharacterTable[j] = maxSkip;
    417     }
    418 
    419     for(int32_t p = 1; p < plen; p += 1) {
    420         minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
    421 
    422         // Make sure this entry is not bigger than the previous one.
    423         // Otherwise, we might skip too far in some cases.
    424         if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
    425             minLengthCache[p] = minLengthCache[p - 1];
    426         }
    427     }
    428 
    429     minLengthCache[plen] = 0;
    430 
    431     for(int32_t p = 0; p < plen - 1; p += 1) {
    432         badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
    433     }
    434 
    435     DELETE_ARRAY(history);
    436 }
    437 
    438 BadCharacterTable::~BadCharacterTable()
    439 {
    440     DELETE_ARRAY(minLengthCache);
    441 }
    442 
    443 int32_t BadCharacterTable::operator[](uint32_t ce) const
    444 {
    445     return badCharacterTable[hash(ce)];
    446 }
    447 
    448 int32_t BadCharacterTable::getMaxSkip() const
    449 {
    450     return maxSkip;
    451 }
    452 
    453 int32_t BadCharacterTable::minLengthInChars(int32_t index)
    454 {
    455     return minLengthCache[index];
    456 }
    457 
    458 int32_t BadCharacterTable::hash(uint32_t ce)
    459 {
    460     return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
    461 }
    462 
    463 class GoodSuffixTable : public UMemory
    464 {
    465 public:
    466     GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
    467     ~GoodSuffixTable();
    468 
    469     int32_t operator[](int32_t offset) const;
    470 
    471 private:
    472     int32_t *goodSuffixTable;
    473 };
    474 
    475 GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
    476     : goodSuffixTable(NULL)
    477 {
    478     int32_t patlen = patternCEs.size();
    479 
    480     // **** need a better way to deal with this ****
    481     if (U_FAILURE(status) || patlen <= 0) {
    482         return;
    483     }
    484 
    485     int32_t *suff  = NEW_ARRAY(int32_t, patlen);
    486     int32_t start = patlen - 1, end = - 1;
    487     int32_t maxSkip = badCharacterTable.getMaxSkip();
    488 
    489     if (suff == NULL) {
    490         status = U_MEMORY_ALLOCATION_ERROR;
    491         return;
    492     }
    493 
    494     // initialze suff
    495     suff[patlen - 1] = patlen;
    496 
    497     for (int32_t i = patlen - 2; i >= 0; i -= 1) {
    498         // (i > start) means we're inside the last suffix match we found
    499         // ((patlen - 1) - end) is how far the end of that match is from end of pattern
    500         // (i - start) is how far we are from start of that match
    501         // (i + (patlen - 1) - end) is index of same character at end of pattern
    502         // so if any suffix match at that character doesn't extend beyond the last match,
    503         // it's the suffix for this character as well
    504         if (i > start && suff[i + patlen - 1 - end] < i - start) {
    505             suff[i] = suff[i + patlen - 1 - end];
    506         } else {
    507             start = end = i;
    508 
    509             int32_t s = patlen;
    510 
    511             while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
    512                 start -= 1;
    513             }
    514 
    515             suff[i] = end - start;
    516         }
    517     }
    518 
    519     // now build goodSuffixTable
    520     goodSuffixTable  = NEW_ARRAY(int32_t, patlen);
    521 
    522     if (goodSuffixTable == NULL) {
    523         DELETE_ARRAY(suff);
    524         status = U_MEMORY_ALLOCATION_ERROR;
    525         return;
    526     }
    527 
    528 
    529     // initialize entries to minLengthInChars of the pattern
    530     for (int32_t i = 0; i < patlen; i += 1) {
    531         goodSuffixTable[i] = maxSkip;
    532     }
    533 
    534     int32_t prefix = 0;
    535 
    536     for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
    537         if (suff[i] == i + 1) {
    538             // this matching suffix is a prefix of the pattern
    539             int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
    540 
    541             // for any mis-match before this suffix, we should skip
    542             // so that the front of the pattern (i.e. the prefix)
    543             // lines up with the front of the suffix.
    544             // (patlen - 1 - i) is the start of the suffix
    545             while (prefix < patlen - 1 - i) {
    546                 // value of maxSkip means never set...
    547                 if (goodSuffixTable[prefix] == maxSkip) {
    548                     goodSuffixTable[prefix] = prefixSkip;
    549                 }
    550 
    551                 prefix += 1;
    552             }
    553         }
    554     }
    555 
    556     for (int32_t i = 0; i < patlen - 1; i += 1) {
    557         goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
    558     }
    559 
    560     DELETE_ARRAY(suff);
    561 }
    562 
    563 GoodSuffixTable::~GoodSuffixTable()
    564 {
    565     DELETE_ARRAY(goodSuffixTable);
    566 }
    567 
    568 int32_t GoodSuffixTable::operator[](int32_t offset) const
    569 {
    570     return goodSuffixTable[offset];
    571 }
    572 
    573 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
    574 
    575 
    576 UBool BoyerMooreSearch::empty()
    577 {
    578     return patCEs->size() <= 0;
    579 }
    580 
    581 CollData *BoyerMooreSearch::getData()
    582 {
    583     return data;
    584 }
    585 
    586 CEList *BoyerMooreSearch::getPatternCEs()
    587 {
    588     return patCEs;
    589 }
    590 
    591 BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
    592 {
    593     return badCharacterTable;
    594 }
    595 
    596 GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
    597 {
    598     return goodSuffixTable;
    599 }
    600 
    601 BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
    602                                    UErrorCode &status)
    603     : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
    604 {
    605 
    606     if (U_FAILURE(status)) {
    607         return;
    608     }
    609 
    610     UCollator *collator = data->getCollator();
    611 
    612     patCEs = new CEList(collator, patternString, status);
    613 
    614     if (patCEs == NULL || U_FAILURE(status)) {
    615         return;
    616     }
    617 
    618     badCharacterTable = new BadCharacterTable(*patCEs, data, status);
    619 
    620     if (badCharacterTable == NULL || U_FAILURE(status)) {
    621         return;
    622     }
    623 
    624     goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
    625 
    626     if (targetString != NULL) {
    627         target = new Target(collator, targetString, patCEs->size(), status);
    628     }
    629 }
    630 
    631 BoyerMooreSearch::~BoyerMooreSearch()
    632 {
    633     delete target;
    634     delete goodSuffixTable;
    635     delete badCharacterTable;
    636     delete patCEs;
    637 }
    638 
    639 void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
    640 {
    641     if (U_FAILURE(status)) {
    642         return;
    643     }
    644 
    645     if (target == NULL) {
    646         target = new Target(data->getCollator(), targetString, patCEs->size(), status);
    647     } else {
    648         target->setTargetString(targetString);
    649     }
    650 }
    651 
    652 // **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
    653 /*
    654  * TODO:
    655  *  * deal with trailing (and leading?) ignorables.
    656  *  * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
    657  */
    658 UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
    659 {
    660     /*UCollator *coll =*/ data->getCollator();
    661     int32_t plen = patCEs->size();
    662     int32_t tlen = target->stringLength();
    663     int32_t maxSkip = badCharacterTable->getMaxSkip();
    664     int32_t tOffset = offset + maxSkip;
    665 
    666     if (plen <= 0) {
    667         // Searching for a zero length pattern always fails.
    668         start = end = -1;
    669         return FALSE;
    670     }
    671 
    672     while (tOffset <= tlen) {
    673         int32_t pIndex = plen - 1;
    674         int32_t tIndex = 0;
    675         int32_t lIndex = 0;
    676 
    677         if (tOffset < tlen) {
    678             // **** we really want to skip ahead enough to  ****
    679             // **** be sure we get at least 1 non-ignorable ****
    680             // **** CE after the end of the pattern.        ****
    681             int32_t next = target->nextSafeBoundary(tOffset + 1);
    682 
    683             target->setOffset(next);
    684 
    685             for (lIndex = 0; ; lIndex += 1) {
    686                 const CEI *cei = target->prevCE(lIndex);
    687                 int32_t low = cei->lowOffset;
    688                 int32_t high = cei->highOffset;
    689 
    690                 if (high == 0 || (low < high && low <= tOffset)) {
    691                     if (low < tOffset) {
    692                         while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
    693                             lIndex -= 1;
    694                         }
    695 
    696                         if (high > tOffset) {
    697                             tOffset = high;
    698                         }
    699                     }
    700 
    701                     break;
    702                 }
    703             }
    704         } else {
    705             target->setLast(tOffset);
    706             lIndex = 0;
    707         }
    708 
    709         tIndex = ++lIndex;
    710 
    711         // Iterate backward until we hit the beginning of the pattern
    712         while (pIndex >= 0) {
    713             uint32_t pce = (*patCEs)[pIndex];
    714             const CEI *tcei = target->prevCE(tIndex++);
    715 
    716 
    717             if (tcei->order != pce) {
    718                 // There is a mismatch at this position.  Decide how far
    719                 // over to shift the pattern, then try again.
    720 
    721                 int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
    722 #ifdef EXTRA_CAUTIOUS
    723                 int32_t old = tOffset;
    724 #endif
    725 
    726                 tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
    727 
    728                 if (gsOffset > tOffset) {
    729                     tOffset = gsOffset;
    730                 }
    731 
    732 #ifdef EXTRA_CAUTIOUS
    733                 // Make sure we don't skip backwards...
    734                 if (tOffset <= old) {
    735                     tOffset = old + 1;
    736                 }
    737 #endif
    738 
    739                 break;
    740             }
    741 
    742             pIndex -= 1;
    743         }
    744 
    745         if (pIndex < 0) {
    746             // We made it back to the beginning of the pattern,
    747             // which means we matched it all.  Return the location.
    748             const CEI firstCEI = *target->prevCE(tIndex - 1);
    749             const CEI lastCEI  = *target->prevCE(lIndex);
    750             int32_t mStart   = firstCEI.lowOffset;
    751             int32_t minLimit = lastCEI.lowOffset;
    752             int32_t maxLimit = lastCEI.highOffset;
    753             int32_t mLimit;
    754             UBool found = TRUE;
    755 
    756             target->setOffset(/*tOffset*/maxLimit);
    757 
    758             const CEI nextCEI = *target->nextCE(0);
    759 
    760             if (nextCEI.lowOffset > maxLimit) {
    761                 maxLimit = nextCEI.lowOffset;
    762             }
    763 
    764             if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
    765                 found = FALSE;
    766             }
    767 
    768             if (! target->isBreakBoundary(mStart)) {
    769                 found = FALSE;
    770             }
    771 
    772             if (firstCEI.lowOffset == firstCEI.highOffset) {
    773                 found = FALSE;
    774             }
    775 
    776             mLimit = maxLimit;
    777             if (minLimit < maxLimit) {
    778                 int32_t nbb = target->nextBreakBoundary(minLimit);
    779 
    780                 if (nbb >= lastCEI.highOffset) {
    781                     mLimit = nbb;
    782                 }
    783             }
    784 
    785             if (mLimit > maxLimit) {
    786                 found = FALSE;
    787             }
    788 
    789             if (! target->isBreakBoundary(mLimit)) {
    790                 found = FALSE;
    791             }
    792 
    793             if (! target->isIdentical(pattern, mStart, mLimit)) {
    794                 found = FALSE;
    795             }
    796 
    797             if (found) {
    798                 start = mStart;
    799                 end   = mLimit;
    800 
    801                 return TRUE;
    802             }
    803 
    804             tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
    805         }
    806         // Otherwise, we're here because of a mismatch, so keep going....
    807     }
    808 
    809     // no match
    810    start = -1;
    811    end = -1;
    812    return FALSE;
    813 }
    814 
    815 U_NAMESPACE_END
    816 
    817 #endif // #if !UCONFIG_NO_COLLATION
    818