Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationiterator.cpp
      9 *
     10 * created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "utypeinfo.h"  // for 'typeid' to work
     15 
     16 #include "unicode/utypes.h"
     17 
     18 #if !UCONFIG_NO_COLLATION
     19 
     20 #include "unicode/ucharstrie.h"
     21 #include "unicode/ustringtrie.h"
     22 #include "charstr.h"
     23 #include "cmemory.h"
     24 #include "collation.h"
     25 #include "collationdata.h"
     26 #include "collationfcd.h"
     27 #include "collationiterator.h"
     28 #include "normalizer2impl.h"
     29 #include "uassert.h"
     30 #include "uvectr32.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 CollationIterator::CEBuffer::~CEBuffer() {}
     35 
     36 UBool
     37 CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode) {
     38     int32_t capacity = buffer.getCapacity();
     39     if((length + appCap) <= capacity) { return TRUE; }
     40     if(U_FAILURE(errorCode)) { return FALSE; }
     41     do {
     42         if(capacity < 1000) {
     43             capacity *= 4;
     44         } else {
     45             capacity *= 2;
     46         }
     47     } while(capacity < (length + appCap));
     48     int64_t *p = buffer.resize(capacity, length);
     49     if(p == NULL) {
     50         errorCode = U_MEMORY_ALLOCATION_ERROR;
     51         return FALSE;
     52     }
     53     return TRUE;
     54 }
     55 
     56 // State of combining marks skipped in discontiguous contraction.
     57 // We create a state object on first use and keep it around deactivated between uses.
     58 class SkippedState : public UMemory {
     59 public:
     60     // Born active but empty.
     61     SkippedState() : pos(0), skipLengthAtMatch(0) {}
     62     void clear() {
     63         oldBuffer.remove();
     64         pos = 0;
     65         // The newBuffer is reset by setFirstSkipped().
     66     }
     67 
     68     UBool isEmpty() const { return oldBuffer.isEmpty(); }
     69 
     70     UBool hasNext() const { return pos < oldBuffer.length(); }
     71 
     72     // Requires hasNext().
     73     UChar32 next() {
     74         UChar32 c = oldBuffer.char32At(pos);
     75         pos += U16_LENGTH(c);
     76         return c;
     77     }
     78 
     79     // Accounts for one more input code point read beyond the end of the marks buffer.
     80     void incBeyond() {
     81         U_ASSERT(!hasNext());
     82         ++pos;
     83     }
     84 
     85     // Goes backward through the skipped-marks buffer.
     86     // Returns the number of code points read beyond the skipped marks
     87     // that need to be backtracked through normal input.
     88     int32_t backwardNumCodePoints(int32_t n) {
     89         int32_t length = oldBuffer.length();
     90         int32_t beyond = pos - length;
     91         if(beyond > 0) {
     92             if(beyond >= n) {
     93                 // Not back far enough to re-enter the oldBuffer.
     94                 pos -= n;
     95                 return n;
     96             } else {
     97                 // Back out all beyond-oldBuffer code points and re-enter the buffer.
     98                 pos = oldBuffer.moveIndex32(length, beyond - n);
     99                 return beyond;
    100             }
    101         } else {
    102             // Go backwards from inside the oldBuffer.
    103             pos = oldBuffer.moveIndex32(pos, -n);
    104             return 0;
    105         }
    106     }
    107 
    108     void setFirstSkipped(UChar32 c) {
    109         skipLengthAtMatch = 0;
    110         newBuffer.setTo(c);
    111     }
    112 
    113     void skip(UChar32 c) {
    114         newBuffer.append(c);
    115     }
    116 
    117     void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
    118 
    119     // Replaces the characters we consumed with the newly skipped ones.
    120     void replaceMatch() {
    121         // Note: UnicodeString.replace() pins pos to at most length().
    122         oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);
    123         pos = 0;
    124     }
    125 
    126     void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); }
    127     void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); }
    128 
    129 private:
    130     // Combining marks skipped in previous discontiguous-contraction matching.
    131     // After that discontiguous contraction was completed, we start reading them from here.
    132     UnicodeString oldBuffer;
    133     // Combining marks newly skipped in current discontiguous-contraction matching.
    134     // These might have been read from the normal text or from the oldBuffer.
    135     UnicodeString newBuffer;
    136     // Reading index in oldBuffer,
    137     // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
    138     int32_t pos;
    139     // newBuffer.length() at the time of the last matching character.
    140     // When a partial match fails, we back out skipped and partial-matching input characters.
    141     int32_t skipLengthAtMatch;
    142     // We save the trie state before we attempt to match a character,
    143     // so that we can skip it and try the next one.
    144     UCharsTrie::State state;
    145 };
    146 
    147 CollationIterator::CollationIterator(const CollationIterator &other)
    148         : UObject(other),
    149           trie(other.trie),
    150           data(other.data),
    151           cesIndex(other.cesIndex),
    152           skipped(NULL),
    153           numCpFwd(other.numCpFwd),
    154           isNumeric(other.isNumeric) {
    155     UErrorCode errorCode = U_ZERO_ERROR;
    156     int32_t length = other.ceBuffer.length;
    157     if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) {
    158         for(int32_t i = 0; i < length; ++i) {
    159             ceBuffer.set(i, other.ceBuffer.get(i));
    160         }
    161         ceBuffer.length = length;
    162     } else {
    163         cesIndex = 0;
    164     }
    165 }
    166 
    167 CollationIterator::~CollationIterator() {
    168     delete skipped;
    169 }
    170 
    171 UBool
    172 CollationIterator::operator==(const CollationIterator &other) const {
    173     // Subclasses: Call this method and then add more specific checks.
    174     // Compare the iterator state but not the collation data (trie & data fields):
    175     // Assume that the caller compares the data.
    176     // Ignore skipped since that should be unused between calls to nextCE().
    177     // (It only stays around to avoid another memory allocation.)
    178     if(!(typeid(*this) == typeid(other) &&
    179             ceBuffer.length == other.ceBuffer.length &&
    180             cesIndex == other.cesIndex &&
    181             numCpFwd == other.numCpFwd &&
    182             isNumeric == other.isNumeric)) {
    183         return FALSE;
    184     }
    185     for(int32_t i = 0; i < ceBuffer.length; ++i) {
    186         if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return FALSE; }
    187     }
    188     return TRUE;
    189 }
    190 
    191 void
    192 CollationIterator::reset() {
    193     cesIndex = ceBuffer.length = 0;
    194     if(skipped != NULL) { skipped->clear(); }
    195 }
    196 
    197 int32_t
    198 CollationIterator::fetchCEs(UErrorCode &errorCode) {
    199     while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) {
    200         // No need to loop for each expansion CE.
    201         cesIndex = ceBuffer.length;
    202     }
    203     return ceBuffer.length;
    204 }
    205 
    206 uint32_t
    207 CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    208     c = nextCodePoint(errorCode);
    209     return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);
    210 }
    211 
    212 UChar
    213 CollationIterator::handleGetTrailSurrogate() {
    214     return 0;
    215 }
    216 
    217 UBool
    218 CollationIterator::foundNULTerminator() {
    219     return FALSE;
    220 }
    221 
    222 UBool
    223 CollationIterator::forbidSurrogateCodePoints() const {
    224     return FALSE;
    225 }
    226 
    227 uint32_t
    228 CollationIterator::getDataCE32(UChar32 c) const {
    229     return data->getCE32(c);
    230 }
    231 
    232 uint32_t
    233 CollationIterator::getCE32FromBuilderData(uint32_t /*ce32*/, UErrorCode &errorCode) {
    234     if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
    235     return 0;
    236 }
    237 
    238 int64_t
    239 CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
    240                                   UErrorCode &errorCode) {
    241     --ceBuffer.length;  // Undo ceBuffer.incLength().
    242     appendCEsFromCE32(d, c, ce32, TRUE, errorCode);
    243     if(U_SUCCESS(errorCode)) {
    244         return ceBuffer.get(cesIndex++);
    245     } else {
    246         return Collation::NO_CE_PRIMARY;
    247     }
    248 }
    249 
    250 void
    251 CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
    252                                      UBool forward, UErrorCode &errorCode) {
    253     while(Collation::isSpecialCE32(ce32)) {
    254         switch(Collation::tagFromCE32(ce32)) {
    255         case Collation::FALLBACK_TAG:
    256         case Collation::RESERVED_TAG_3:
    257             if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
    258             return;
    259         case Collation::LONG_PRIMARY_TAG:
    260             ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);
    261             return;
    262         case Collation::LONG_SECONDARY_TAG:
    263             ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode);
    264             return;
    265         case Collation::LATIN_EXPANSION_TAG:
    266             if(ceBuffer.ensureAppendCapacity(2, errorCode)) {
    267                 ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32));
    268                 ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce32));
    269                 ceBuffer.length += 2;
    270             }
    271             return;
    272         case Collation::EXPANSION32_TAG: {
    273             const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);
    274             int32_t length = Collation::lengthFromCE32(ce32);
    275             if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
    276                 do {
    277                     ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));
    278                 } while(--length > 0);
    279             }
    280             return;
    281         }
    282         case Collation::EXPANSION_TAG: {
    283             const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);
    284             int32_t length = Collation::lengthFromCE32(ce32);
    285             if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
    286                 do {
    287                     ceBuffer.appendUnsafe(*ces++);
    288                 } while(--length > 0);
    289             }
    290             return;
    291         }
    292         case Collation::BUILDER_DATA_TAG:
    293             ce32 = getCE32FromBuilderData(ce32, errorCode);
    294             if(U_FAILURE(errorCode)) { return; }
    295             if(ce32 == Collation::FALLBACK_CE32) {
    296                 d = data->base;
    297                 ce32 = d->getCE32(c);
    298             }
    299             break;
    300         case Collation::PREFIX_TAG:
    301             if(forward) { backwardNumCodePoints(1, errorCode); }
    302             ce32 = getCE32FromPrefix(d, ce32, errorCode);
    303             if(forward) { forwardNumCodePoints(1, errorCode); }
    304             break;
    305         case Collation::CONTRACTION_TAG: {
    306             const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
    307             uint32_t defaultCE32 = CollationData::readCE32(p);  // Default if no suffix match.
    308             if(!forward) {
    309                 // Backward contractions are handled by previousCEUnsafe().
    310                 // c has contractions but they were not found.
    311                 ce32 = defaultCE32;
    312                 break;
    313             }
    314             UChar32 nextCp;
    315             if(skipped == NULL && numCpFwd < 0) {
    316                 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
    317                 // avoiding the function call and the nextSkippedCodePoint() overhead.
    318                 nextCp = nextCodePoint(errorCode);
    319                 if(nextCp < 0) {
    320                     // No more text.
    321                     ce32 = defaultCE32;
    322                     break;
    323                 } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
    324                         !CollationFCD::mayHaveLccc(nextCp)) {
    325                     // All contraction suffixes start with characters with lccc!=0
    326                     // but the next code point has lccc==0.
    327                     backwardNumCodePoints(1, errorCode);
    328                     ce32 = defaultCE32;
    329                     break;
    330                 }
    331             } else {
    332                 nextCp = nextSkippedCodePoint(errorCode);
    333                 if(nextCp < 0) {
    334                     // No more text.
    335                     ce32 = defaultCE32;
    336                     break;
    337                 } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
    338                         !CollationFCD::mayHaveLccc(nextCp)) {
    339                     // All contraction suffixes start with characters with lccc!=0
    340                     // but the next code point has lccc==0.
    341                     backwardNumSkipped(1, errorCode);
    342                     ce32 = defaultCE32;
    343                     break;
    344                 }
    345             }
    346             ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);
    347             if(ce32 == Collation::NO_CE32) {
    348                 // CEs from a discontiguous contraction plus the skipped combining marks
    349                 // have been appended already.
    350                 return;
    351             }
    352             break;
    353         }
    354         case Collation::DIGIT_TAG:
    355             if(isNumeric) {
    356                 appendNumericCEs(ce32, forward, errorCode);
    357                 return;
    358             } else {
    359                 // Fetch the non-numeric-collation CE32 and continue.
    360                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
    361                 break;
    362             }
    363         case Collation::U0000_TAG:
    364             U_ASSERT(c == 0);
    365             if(forward && foundNULTerminator()) {
    366                 // Handle NUL-termination. (Not needed in Java.)
    367                 ceBuffer.append(Collation::NO_CE, errorCode);
    368                 return;
    369             } else {
    370                 // Fetch the normal ce32 for U+0000 and continue.
    371                 ce32 = d->ce32s[0];
    372                 break;
    373             }
    374         case Collation::HANGUL_TAG: {
    375             const uint32_t *jamoCE32s = d->jamoCE32s;
    376             c -= Hangul::HANGUL_BASE;
    377             UChar32 t = c % Hangul::JAMO_T_COUNT;
    378             c /= Hangul::JAMO_T_COUNT;
    379             UChar32 v = c % Hangul::JAMO_V_COUNT;
    380             c /= Hangul::JAMO_V_COUNT;
    381             if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) {
    382                 // None of the Jamo CE32s are isSpecialCE32().
    383                 // Avoid recursive function calls and per-Jamo tests.
    384                 if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) {
    385                     ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32s[c]));
    386                     ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamoCE32s[19 + v]));
    387                     ceBuffer.length += 2;
    388                     if(t != 0) {
    389                         ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));
    390                     }
    391                 }
    392                 return;
    393             } else {
    394                 // We should not need to compute each Jamo code point.
    395                 // In particular, there should be no offset or implicit ce32.
    396                 appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCode);
    397                 appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, errorCode);
    398                 if(t == 0) { return; }
    399                 // offset 39 = 19 + 21 - 1:
    400                 // 19 = JAMO_L_COUNT
    401                 // 21 = JAMO_T_COUNT
    402                 // -1 = omit t==0
    403                 ce32 = jamoCE32s[39 + t];
    404                 c = U_SENTINEL;
    405                 break;
    406             }
    407         }
    408         case Collation::LEAD_SURROGATE_TAG: {
    409             U_ASSERT(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
    410             U_ASSERT(U16_IS_LEAD(c));
    411             UChar trail;
    412             if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) {
    413                 c = U16_GET_SUPPLEMENTARY(c, trail);
    414                 ce32 &= Collation::LEAD_TYPE_MASK;
    415                 if(ce32 == Collation::LEAD_ALL_UNASSIGNED) {
    416                     ce32 = Collation::UNASSIGNED_CE32;  // unassigned-implicit
    417                 } else if(ce32 == Collation::LEAD_ALL_FALLBACK ||
    418                         (ce32 = d->getCE32FromSupplementary(c)) == Collation::FALLBACK_CE32) {
    419                     // fall back to the base data
    420                     d = d->base;
    421                     ce32 = d->getCE32FromSupplementary(c);
    422                 }
    423             } else {
    424                 // c is an unpaired surrogate.
    425                 ce32 = Collation::UNASSIGNED_CE32;
    426             }
    427             break;
    428         }
    429         case Collation::OFFSET_TAG:
    430             U_ASSERT(c >= 0);
    431             ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);
    432             return;
    433         case Collation::IMPLICIT_TAG:
    434             U_ASSERT(c >= 0);
    435             if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) {
    436                 ce32 = Collation::FFFD_CE32;
    437                 break;
    438             } else {
    439                 ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCode);
    440                 return;
    441             }
    442         }
    443     }
    444     ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);
    445 }
    446 
    447 uint32_t
    448 CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,
    449                                      UErrorCode &errorCode) {
    450     const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
    451     ce32 = CollationData::readCE32(p);  // Default if no prefix match.
    452     p += 2;
    453     // Number of code points read before the original code point.
    454     int32_t lookBehind = 0;
    455     UCharsTrie prefixes(p);
    456     for(;;) {
    457         UChar32 c = previousCodePoint(errorCode);
    458         if(c < 0) { break; }
    459         ++lookBehind;
    460         UStringTrieResult match = prefixes.nextForCodePoint(c);
    461         if(USTRINGTRIE_HAS_VALUE(match)) {
    462             ce32 = (uint32_t)prefixes.getValue();
    463         }
    464         if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
    465     }
    466     forwardNumCodePoints(lookBehind, errorCode);
    467     return ce32;
    468 }
    469 
    470 UChar32
    471 CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) {
    472     if(skipped != NULL && skipped->hasNext()) { return skipped->next(); }
    473     if(numCpFwd == 0) { return U_SENTINEL; }
    474     UChar32 c = nextCodePoint(errorCode);
    475     if(skipped != NULL && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); }
    476     if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
    477     return c;
    478 }
    479 
    480 void
    481 CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) {
    482     if(skipped != NULL && !skipped->isEmpty()) {
    483         n = skipped->backwardNumCodePoints(n);
    484     }
    485     backwardNumCodePoints(n, errorCode);
    486     if(numCpFwd >= 0) { numCpFwd += n; }
    487 }
    488 
    489 uint32_t
    490 CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t contractionCE32,
    491                                            const UChar *p, uint32_t ce32, UChar32 c,
    492                                            UErrorCode &errorCode) {
    493     // c: next code point after the original one
    494 
    495     // Number of code points read beyond the original code point.
    496     // Needed for discontiguous contraction matching.
    497     int32_t lookAhead = 1;
    498     // Number of code points read since the last match (initially only c).
    499     int32_t sinceMatch = 1;
    500     // Normally we only need a contiguous match,
    501     // and therefore need not remember the suffixes state from before a mismatch for retrying.
    502     // If we are already processing skipped combining marks, then we do track the state.
    503     UCharsTrie suffixes(p);
    504     if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
    505     UStringTrieResult match = suffixes.firstForCodePoint(c);
    506     for(;;) {
    507         UChar32 nextCp;
    508         if(USTRINGTRIE_HAS_VALUE(match)) {
    509             ce32 = (uint32_t)suffixes.getValue();
    510             if(!USTRINGTRIE_HAS_NEXT(match) || (c = nextSkippedCodePoint(errorCode)) < 0) {
    511                 return ce32;
    512             }
    513             if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
    514             sinceMatch = 1;
    515         } else if(match == USTRINGTRIE_NO_MATCH || (nextCp = nextSkippedCodePoint(errorCode)) < 0) {
    516             // No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no further text.
    517             // Back up if necessary, and try a discontiguous contraction.
    518             if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&
    519                     // Discontiguous contraction matching extends an existing match.
    520                     // If there is no match yet, then there is nothing to do.
    521                     ((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
    522                         sinceMatch < lookAhead)) {
    523                 // The last character of at least one suffix has lccc!=0,
    524                 // allowing for discontiguous contractions.
    525                 // UCA S2.1.1 only processes non-starters immediately following
    526                 // "a match in the table" (sinceMatch=1).
    527                 if(sinceMatch > 1) {
    528                     // Return to the state after the last match.
    529                     // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
    530                     backwardNumSkipped(sinceMatch, errorCode);
    531                     c = nextSkippedCodePoint(errorCode);
    532                     lookAhead -= sinceMatch - 1;
    533                     sinceMatch = 1;
    534                 }
    535                 if(d->getFCD16(c) > 0xff) {
    536                     return nextCE32FromDiscontiguousContraction(
    537                         d, suffixes, ce32, lookAhead, c, errorCode);
    538                 }
    539             }
    540             break;
    541         } else {
    542             // Continue after partial match (USTRINGTRIE_NO_VALUE) for c.
    543             // It does not have a result value, therefore it is not itself "a match in the table".
    544             // If a partially-matched c has ccc!=0 then
    545             // it might be skipped in discontiguous contraction.
    546             c = nextCp;
    547             ++sinceMatch;
    548         }
    549         ++lookAhead;
    550         match = suffixes.nextForCodePoint(c);
    551     }
    552     backwardNumSkipped(sinceMatch, errorCode);
    553     return ce32;
    554 }
    555 
    556 uint32_t
    557 CollationIterator::nextCE32FromDiscontiguousContraction(
    558         const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
    559         int32_t lookAhead, UChar32 c,
    560         UErrorCode &errorCode) {
    561     if(U_FAILURE(errorCode)) { return 0; }
    562 
    563     // UCA section 3.3.2 Contractions:
    564     // Contractions that end with non-starter characters
    565     // are known as discontiguous contractions.
    566     // ... discontiguous contractions must be detected in input text
    567     // whenever the final sequence of non-starter characters could be rearranged
    568     // so as to make a contiguous matching sequence that is canonically equivalent.
    569 
    570     // UCA: http://www.unicode.org/reports/tr10/#S2.1
    571     // S2.1 Find the longest initial substring S at each point that has a match in the table.
    572     // S2.1.1 If there are any non-starters following S, process each non-starter C.
    573     // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
    574     //     Note: A non-starter in a string is called blocked
    575     //     if there is another non-starter of the same canonical combining class or zero
    576     //     between it and the last character of canonical combining class 0.
    577     // S2.1.3 If there is a match, replace S by S + C, and remove C.
    578 
    579     // First: Is a discontiguous contraction even possible?
    580     uint16_t fcd16 = d->getFCD16(c);
    581     U_ASSERT(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
    582     UChar32 nextCp = nextSkippedCodePoint(errorCode);
    583     if(nextCp < 0) {
    584         // No further text.
    585         backwardNumSkipped(1, errorCode);
    586         return ce32;
    587     }
    588     ++lookAhead;
    589     uint8_t prevCC = (uint8_t)fcd16;
    590     fcd16 = d->getFCD16(nextCp);
    591     if(fcd16 <= 0xff) {
    592         // The next code point after c is a starter (S2.1.1 "process each non-starter").
    593         backwardNumSkipped(2, errorCode);
    594         return ce32;
    595     }
    596 
    597     // We have read and matched (lookAhead-2) code points,
    598     // read non-matching c and peeked ahead at nextCp.
    599     // Return to the state before the mismatch and continue matching with nextCp.
    600     if(skipped == NULL || skipped->isEmpty()) {
    601         if(skipped == NULL) {
    602             skipped = new SkippedState();
    603             if(skipped == NULL) {
    604                 errorCode = U_MEMORY_ALLOCATION_ERROR;
    605                 return 0;
    606             }
    607         }
    608         suffixes.reset();
    609         if(lookAhead > 2) {
    610             // Replay the partial match so far.
    611             backwardNumCodePoints(lookAhead, errorCode);
    612             suffixes.firstForCodePoint(nextCodePoint(errorCode));
    613             for(int32_t i = 3; i < lookAhead; ++i) {
    614                 suffixes.nextForCodePoint(nextCodePoint(errorCode));
    615             }
    616             // Skip c (which did not match) and nextCp (which we will try now).
    617             forwardNumCodePoints(2, errorCode);
    618         }
    619         skipped->saveTrieState(suffixes);
    620     } else {
    621         // Reset to the trie state before the failed match of c.
    622         skipped->resetToTrieState(suffixes);
    623     }
    624 
    625     skipped->setFirstSkipped(c);
    626     // Number of code points read since the last match (at this point: c and nextCp).
    627     int32_t sinceMatch = 2;
    628     c = nextCp;
    629     for(;;) {
    630         UStringTrieResult match;
    631         // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
    632         if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextForCodePoint(c))) {
    633             // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
    634             // Keep prevCC unchanged.
    635             ce32 = (uint32_t)suffixes.getValue();
    636             sinceMatch = 0;
    637             skipped->recordMatch();
    638             if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
    639             skipped->saveTrieState(suffixes);
    640         } else {
    641             // No match for "S + C", skip C.
    642             skipped->skip(c);
    643             skipped->resetToTrieState(suffixes);
    644             prevCC = (uint8_t)fcd16;
    645         }
    646         if((c = nextSkippedCodePoint(errorCode)) < 0) { break; }
    647         ++sinceMatch;
    648         fcd16 = d->getFCD16(c);
    649         if(fcd16 <= 0xff) {
    650             // The next code point after c is a starter (S2.1.1 "process each non-starter").
    651             break;
    652         }
    653     }
    654     backwardNumSkipped(sinceMatch, errorCode);
    655     UBool isTopDiscontiguous = skipped->isEmpty();
    656     skipped->replaceMatch();
    657     if(isTopDiscontiguous && !skipped->isEmpty()) {
    658         // We did get a match after skipping one or more combining marks,
    659         // and we are not in a recursive discontiguous contraction.
    660         // Append CEs from the contraction ce32
    661         // and then from the combining marks that we skipped before the match.
    662         c = U_SENTINEL;
    663         for(;;) {
    664             appendCEsFromCE32(d, c, ce32, TRUE, errorCode);
    665             // Fetch CE32s for skipped combining marks from the normal data, with fallback,
    666             // rather than from the CollationData where we found the contraction.
    667             if(!skipped->hasNext()) { break; }
    668             c = skipped->next();
    669             ce32 = getDataCE32(c);
    670             if(ce32 == Collation::FALLBACK_CE32) {
    671                 d = data->base;
    672                 ce32 = d->getCE32(c);
    673             } else {
    674                 d = data;
    675             }
    676             // Note: A nested discontiguous-contraction match
    677             // replaces consumed combining marks with newly skipped ones
    678             // and resets the reading position to the beginning.
    679         }
    680         skipped->clear();
    681         ce32 = Collation::NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
    682     }
    683     return ce32;
    684 }
    685 
    686 void
    687 CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode) {
    688     // Collect digits.
    689     CharString digits;
    690     if(forward) {
    691         for(;;) {
    692             char digit = Collation::digitFromCE32(ce32);
    693             digits.append(digit, errorCode);
    694             if(numCpFwd == 0) { break; }
    695             UChar32 c = nextCodePoint(errorCode);
    696             if(c < 0) { break; }
    697             ce32 = data->getCE32(c);
    698             if(ce32 == Collation::FALLBACK_CE32) {
    699                 ce32 = data->base->getCE32(c);
    700             }
    701             if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
    702                 backwardNumCodePoints(1, errorCode);
    703                 break;
    704             }
    705             if(numCpFwd > 0) { --numCpFwd; }
    706         }
    707     } else {
    708         for(;;) {
    709             char digit = Collation::digitFromCE32(ce32);
    710             digits.append(digit, errorCode);
    711             UChar32 c = previousCodePoint(errorCode);
    712             if(c < 0) { break; }
    713             ce32 = data->getCE32(c);
    714             if(ce32 == Collation::FALLBACK_CE32) {
    715                 ce32 = data->base->getCE32(c);
    716             }
    717             if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
    718                 forwardNumCodePoints(1, errorCode);
    719                 break;
    720             }
    721         }
    722         // Reverse the digit string.
    723         char *p = digits.data();
    724         char *q = p + digits.length() - 1;
    725         while(p < q) {
    726             char digit = *p;
    727             *p++ = *q;
    728             *q-- = digit;
    729         }
    730     }
    731     if(U_FAILURE(errorCode)) { return; }
    732     int32_t pos = 0;
    733     do {
    734         // Skip leading zeros.
    735         while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; }
    736         // Write a sequence of CEs for at most 254 digits at a time.
    737         int32_t segmentLength = digits.length() - pos;
    738         if(segmentLength > 254) { segmentLength = 254; }
    739         appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);
    740         pos += segmentLength;
    741     } while(U_SUCCESS(errorCode) && pos < digits.length());
    742 }
    743 
    744 void
    745 CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode) {
    746     U_ASSERT(1 <= length && length <= 254);
    747     U_ASSERT(length == 1 || digits[0] != 0);
    748     uint32_t numericPrimary = data->numericPrimary;
    749     // Note: We use primary byte values 2..255: digits are not compressible.
    750     if(length <= 7) {
    751         // Very dense encoding for small numbers.
    752         int32_t value = digits[0];
    753         for(int32_t i = 1; i < length; ++i) {
    754             value = value * 10 + digits[i];
    755         }
    756         // Primary weight second byte values:
    757         //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
    758         //     40 byte values  76..115 for medium numbers in three-byte primary weights.
    759         //     16 byte values 116..131 for large numbers in four-byte primary weights.
    760         //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
    761         int32_t firstByte = 2;
    762         int32_t numBytes = 74;
    763         if(value < numBytes) {
    764             // Two-byte primary for 0..73, good for day & month numbers etc.
    765             uint32_t primary = numericPrimary | ((firstByte + value) << 16);
    766             ceBuffer.append(Collation::makeCE(primary), errorCode);
    767             return;
    768         }
    769         value -= numBytes;
    770         firstByte += numBytes;
    771         numBytes = 40;
    772         if(value < numBytes * 254) {
    773             // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
    774             uint32_t primary = numericPrimary |
    775                 ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
    776             ceBuffer.append(Collation::makeCE(primary), errorCode);
    777             return;
    778         }
    779         value -= numBytes * 254;
    780         firstByte += numBytes;
    781         numBytes = 16;
    782         if(value < numBytes * 254 * 254) {
    783             // Four-byte primary for 10234..1042489=10234+16*254*254-1.
    784             uint32_t primary = numericPrimary | (2 + value % 254);
    785             value /= 254;
    786             primary |= (2 + value % 254) << 8;
    787             value /= 254;
    788             primary |= (firstByte + value % 254) << 16;
    789             ceBuffer.append(Collation::makeCE(primary), errorCode);
    790             return;
    791         }
    792         // original value > 1042489
    793     }
    794     U_ASSERT(length >= 7);
    795 
    796     // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
    797     // then we generate primary bytes with those pairs.
    798     // Omit trailing 00 pairs.
    799     // Decrement the value for the last pair.
    800 
    801     // Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.
    802     int32_t numPairs = (length + 1) / 2;
    803     uint32_t primary = numericPrimary | ((132 - 4 + numPairs) << 16);
    804     // Find the length without trailing 00 pairs.
    805     while(digits[length - 1] == 0 && digits[length - 2] == 0) {
    806         length -= 2;
    807     }
    808     // Read the first pair.
    809     uint32_t pair;
    810     int32_t pos;
    811     if(length & 1) {
    812         // Only "half a pair" if we have an odd number of digits.
    813         pair = digits[0];
    814         pos = 1;
    815     } else {
    816         pair = digits[0] * 10 + digits[1];
    817         pos = 2;
    818     }
    819     pair = 11 + 2 * pair;
    820     // Add the pairs of digits between pos and length.
    821     int32_t shift = 8;
    822     while(pos < length) {
    823         if(shift == 0) {
    824             // Every three pairs/bytes we need to store a 4-byte-primary CE
    825             // and start with a new CE with the '0' primary lead byte.
    826             primary |= pair;
    827             ceBuffer.append(Collation::makeCE(primary), errorCode);
    828             primary = numericPrimary;
    829             shift = 16;
    830         } else {
    831             primary |= pair << shift;
    832             shift -= 8;
    833         }
    834         pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);
    835         pos += 2;
    836     }
    837     primary |= (pair - 1) << shift;
    838     ceBuffer.append(Collation::makeCE(primary), errorCode);
    839 }
    840 
    841 int64_t
    842 CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) {
    843     if(ceBuffer.length > 0) {
    844         // Return the previous buffered CE.
    845         return ceBuffer.get(--ceBuffer.length);
    846     }
    847     offsets.removeAllElements();
    848     int32_t limitOffset = getOffset();
    849     UChar32 c = previousCodePoint(errorCode);
    850     if(c < 0) { return Collation::NO_CE; }
    851     if(data->isUnsafeBackward(c, isNumeric)) {
    852         return previousCEUnsafe(c, offsets, errorCode);
    853     }
    854     // Simple, safe-backwards iteration:
    855     // Get a CE going backwards, handle prefixes but no contractions.
    856     uint32_t ce32 = data->getCE32(c);
    857     const CollationData *d;
    858     if(ce32 == Collation::FALLBACK_CE32) {
    859         d = data->base;
    860         ce32 = d->getCE32(c);
    861     } else {
    862         d = data;
    863     }
    864     if(Collation::isSimpleOrLongCE32(ce32)) {
    865         return Collation::ceFromCE32(ce32);
    866     }
    867     appendCEsFromCE32(d, c, ce32, FALSE, errorCode);
    868     if(U_SUCCESS(errorCode)) {
    869         if(ceBuffer.length > 1) {
    870             offsets.addElement(getOffset(), errorCode);
    871             // For an expansion, the offset of each non-initial CE is the limit offset,
    872             // consistent with forward iteration.
    873             while(offsets.size() <= ceBuffer.length) {
    874                 offsets.addElement(limitOffset, errorCode);
    875             };
    876         }
    877         return ceBuffer.get(--ceBuffer.length);
    878     } else {
    879         return Collation::NO_CE_PRIMARY;
    880     }
    881 }
    882 
    883 int64_t
    884 CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode) {
    885     // We just move through the input counting safe and unsafe code points
    886     // without collecting the unsafe-backward substring into a buffer and
    887     // switching to it.
    888     // This is to keep the logic simple. Otherwise we would have to handle
    889     // prefix matching going before the backward buffer, switching
    890     // to iteration and back, etc.
    891     // In the most important case of iterating over a normal string,
    892     // reading from the string itself is already maximally fast.
    893     // The only drawback there is that after getting the CEs we always
    894     // skip backward to the safe character rather than switching out
    895     // of a backwardBuffer.
    896     // But this should not be the common case for previousCE(),
    897     // and correctness and maintainability are more important than
    898     // complex optimizations.
    899     // Find the first safe character before c.
    900     int32_t numBackward = 1;
    901     while((c = previousCodePoint(errorCode)) >= 0) {
    902         ++numBackward;
    903         if(!data->isUnsafeBackward(c, isNumeric)) {
    904             break;
    905         }
    906     }
    907     // Set the forward iteration limit.
    908     // Note: This counts code points.
    909     // We cannot enforce a limit in the middle of a surrogate pair or similar.
    910     numCpFwd = numBackward;
    911     // Reset the forward iterator.
    912     cesIndex = 0;
    913     U_ASSERT(ceBuffer.length == 0);
    914     // Go forward and collect the CEs.
    915     int32_t offset = getOffset();
    916     while(numCpFwd > 0) {
    917         // nextCE() normally reads one code point.
    918         // Contraction matching and digit specials read more and check numCpFwd.
    919         --numCpFwd;
    920         // Append one or more CEs to the ceBuffer.
    921         (void)nextCE(errorCode);
    922         U_ASSERT(U_FAILURE(errorCode) || ceBuffer.get(ceBuffer.length - 1) != Collation::NO_CE);
    923         // No need to loop for getting each expansion CE from nextCE().
    924         cesIndex = ceBuffer.length;
    925         // However, we need to write an offset for each CE.
    926         // This is for CollationElementIterator::getOffset() to return
    927         // intermediate offsets from the unsafe-backwards segment.
    928         U_ASSERT(offsets.size() < ceBuffer.length);
    929         offsets.addElement(offset, errorCode);
    930         // For an expansion, the offset of each non-initial CE is the limit offset,
    931         // consistent with forward iteration.
    932         offset = getOffset();
    933         while(offsets.size() < ceBuffer.length) {
    934             offsets.addElement(offset, errorCode);
    935         };
    936     }
    937     U_ASSERT(offsets.size() == ceBuffer.length);
    938     // End offset corresponding to just after the unsafe-backwards segment.
    939     offsets.addElement(offset, errorCode);
    940     // Reset the forward iteration limit
    941     // and move backward to before the segment for which we fetched CEs.
    942     numCpFwd = -1;
    943     backwardNumCodePoints(numBackward, errorCode);
    944     // Use the collected CEs and return the last one.
    945     cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
    946     if(U_SUCCESS(errorCode)) {
    947         return ceBuffer.get(--ceBuffer.length);
    948     } else {
    949         return Collation::NO_CE_PRIMARY;
    950     }
    951 }
    952 
    953 U_NAMESPACE_END
    954 
    955 #endif  // !UCONFIG_NO_COLLATION
    956