Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2010-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * utf16collationiterator.cpp
      7 *
      8 * created on: 2010oct27
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "charstr.h"
     17 #include "cmemory.h"
     18 #include "collation.h"
     19 #include "collationdata.h"
     20 #include "collationfcd.h"
     21 #include "collationiterator.h"
     22 #include "normalizer2impl.h"
     23 #include "uassert.h"
     24 #include "utf16collationiterator.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
     29                                                const UChar *newText)
     30         : CollationIterator(other),
     31           start(newText),
     32           pos(newText + (other.pos - other.start)),
     33           limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) {
     34 }
     35 
     36 UTF16CollationIterator::~UTF16CollationIterator() {}
     37 
     38 UBool
     39 UTF16CollationIterator::operator==(const CollationIterator &other) const {
     40     if(!CollationIterator::operator==(other)) { return FALSE; }
     41     const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
     42     // Compare the iterator state but not the text: Assume that the caller does that.
     43     return (pos - start) == (o.pos - o.start);
     44 }
     45 
     46 void
     47 UTF16CollationIterator::resetToOffset(int32_t newOffset) {
     48     reset();
     49     pos = start + newOffset;
     50 }
     51 
     52 int32_t
     53 UTF16CollationIterator::getOffset() const {
     54     return (int32_t)(pos - start);
     55 }
     56 
     57 uint32_t
     58 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     59     if(pos == limit) {
     60         c = U_SENTINEL;
     61         return Collation::FALLBACK_CE32;
     62     }
     63     c = *pos++;
     64     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     65 }
     66 
     67 UChar
     68 UTF16CollationIterator::handleGetTrailSurrogate() {
     69     if(pos == limit) { return 0; }
     70     UChar trail;
     71     if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
     72     return trail;
     73 }
     74 
     75 UBool
     76 UTF16CollationIterator::foundNULTerminator() {
     77     if(limit == NULL) {
     78         limit = --pos;
     79         return TRUE;
     80     } else {
     81         return FALSE;
     82     }
     83 }
     84 
     85 UChar32
     86 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     87     if(pos == limit) {
     88         return U_SENTINEL;
     89     }
     90     UChar32 c = *pos;
     91     if(c == 0 && limit == NULL) {
     92         limit = pos;
     93         return U_SENTINEL;
     94     }
     95     ++pos;
     96     UChar trail;
     97     if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
     98         ++pos;
     99         return U16_GET_SUPPLEMENTARY(c, trail);
    100     } else {
    101         return c;
    102     }
    103 }
    104 
    105 UChar32
    106 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    107     if(pos == start) {
    108         return U_SENTINEL;
    109     }
    110     UChar32 c = *--pos;
    111     UChar lead;
    112     if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
    113         --pos;
    114         return U16_GET_SUPPLEMENTARY(lead, c);
    115     } else {
    116         return c;
    117     }
    118 }
    119 
    120 void
    121 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    122     while(num > 0 && pos != limit) {
    123         UChar32 c = *pos;
    124         if(c == 0 && limit == NULL) {
    125             limit = pos;
    126             break;
    127         }
    128         ++pos;
    129         --num;
    130         if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
    131             ++pos;
    132         }
    133     }
    134 }
    135 
    136 void
    137 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    138     while(num > 0 && pos != start) {
    139         UChar32 c = *--pos;
    140         --num;
    141         if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
    142             --pos;
    143         }
    144     }
    145 }
    146 
    147 // FCDUTF16CollationIterator ----------------------------------------------- ***
    148 
    149 FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
    150                                                      const UChar *newText)
    151         : UTF16CollationIterator(other),
    152           rawStart(newText),
    153           segmentStart(newText + (other.segmentStart - other.rawStart)),
    154           segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)),
    155           rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)),
    156           nfcImpl(other.nfcImpl),
    157           normalized(other.normalized),
    158           checkDir(other.checkDir) {
    159     if(checkDir != 0 || other.start == other.segmentStart) {
    160         start = newText + (other.start - other.rawStart);
    161         pos = newText + (other.pos - other.rawStart);
    162         limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart);
    163     } else {
    164         start = normalized.getBuffer();
    165         pos = start + (other.pos - other.start);
    166         limit = start + normalized.length();
    167     }
    168 }
    169 
    170 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
    171 
    172 UBool
    173 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
    174     // Skip the UTF16CollationIterator and call its parent.
    175     if(!CollationIterator::operator==(other)) { return FALSE; }
    176     const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
    177     // Compare the iterator state but not the text: Assume that the caller does that.
    178     if(checkDir != o.checkDir) { return FALSE; }
    179     if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
    180     if(checkDir != 0 || start == segmentStart) {
    181         return (pos - rawStart) == (o.pos - o.rawStart);
    182     } else {
    183         return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
    184                 (pos - start) == (o.pos - o.start);
    185     }
    186 }
    187 
    188 void
    189 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
    190     reset();
    191     start = segmentStart = pos = rawStart + newOffset;
    192     limit = rawLimit;
    193     checkDir = 1;
    194 }
    195 
    196 int32_t
    197 FCDUTF16CollationIterator::getOffset() const {
    198     if(checkDir != 0 || start == segmentStart) {
    199         return (int32_t)(pos - rawStart);
    200     } else if(pos == start) {
    201         return (int32_t)(segmentStart - rawStart);
    202     } else {
    203         return (int32_t)(segmentLimit - rawStart);
    204     }
    205 }
    206 
    207 uint32_t
    208 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    209     for(;;) {
    210         if(checkDir > 0) {
    211             if(pos == limit) {
    212                 c = U_SENTINEL;
    213                 return Collation::FALLBACK_CE32;
    214             }
    215             c = *pos++;
    216             if(CollationFCD::hasTccc(c)) {
    217                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    218                         (pos != limit && CollationFCD::hasLccc(*pos))) {
    219                     --pos;
    220                     if(!nextSegment(errorCode)) {
    221                         c = U_SENTINEL;
    222                         return Collation::FALLBACK_CE32;
    223                     }
    224                     c = *pos++;
    225                 }
    226             }
    227             break;
    228         } else if(checkDir == 0 && pos != limit) {
    229             c = *pos++;
    230             break;
    231         } else {
    232             switchToForward();
    233         }
    234     }
    235     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    236 }
    237 
    238 UBool
    239 FCDUTF16CollationIterator::foundNULTerminator() {
    240     if(limit == NULL) {
    241         limit = rawLimit = --pos;
    242         return TRUE;
    243     } else {
    244         return FALSE;
    245     }
    246 }
    247 
    248 UChar32
    249 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    250     UChar32 c;
    251     for(;;) {
    252         if(checkDir > 0) {
    253             if(pos == limit) {
    254                 return U_SENTINEL;
    255             }
    256             c = *pos++;
    257             if(CollationFCD::hasTccc(c)) {
    258                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    259                         (pos != limit && CollationFCD::hasLccc(*pos))) {
    260                     --pos;
    261                     if(!nextSegment(errorCode)) {
    262                         return U_SENTINEL;
    263                     }
    264                     c = *pos++;
    265                 }
    266             } else if(c == 0 && limit == NULL) {
    267                 limit = rawLimit = --pos;
    268                 return U_SENTINEL;
    269             }
    270             break;
    271         } else if(checkDir == 0 && pos != limit) {
    272             c = *pos++;
    273             break;
    274         } else {
    275             switchToForward();
    276         }
    277     }
    278     UChar trail;
    279     if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
    280         ++pos;
    281         return U16_GET_SUPPLEMENTARY(c, trail);
    282     } else {
    283         return c;
    284     }
    285 }
    286 
    287 UChar32
    288 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    289     UChar32 c;
    290     for(;;) {
    291         if(checkDir < 0) {
    292             if(pos == start) {
    293                 return U_SENTINEL;
    294             }
    295             c = *--pos;
    296             if(CollationFCD::hasLccc(c)) {
    297                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    298                         (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
    299                     ++pos;
    300                     if(!previousSegment(errorCode)) {
    301                         return U_SENTINEL;
    302                     }
    303                     c = *--pos;
    304                 }
    305             }
    306             break;
    307         } else if(checkDir == 0 && pos != start) {
    308             c = *--pos;
    309             break;
    310         } else {
    311             switchToBackward();
    312         }
    313     }
    314     UChar lead;
    315     if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
    316         --pos;
    317         return U16_GET_SUPPLEMENTARY(lead, c);
    318     } else {
    319         return c;
    320     }
    321 }
    322 
    323 void
    324 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    325     // Specify the class to avoid a virtual-function indirection.
    326     // In Java, we would declare this class final.
    327     while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
    328         --num;
    329     }
    330 }
    331 
    332 void
    333 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    334     // Specify the class to avoid a virtual-function indirection.
    335     // In Java, we would declare this class final.
    336     while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
    337         --num;
    338     }
    339 }
    340 
    341 void
    342 FCDUTF16CollationIterator::switchToForward() {
    343     U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
    344     if(checkDir < 0) {
    345         // Turn around from backward checking.
    346         start = segmentStart = pos;
    347         if(pos == segmentLimit) {
    348             limit = rawLimit;
    349             checkDir = 1;  // Check forward.
    350         } else {  // pos < segmentLimit
    351             checkDir = 0;  // Stay in FCD segment.
    352         }
    353     } else {
    354         // Reached the end of the FCD segment.
    355         if(start == segmentStart) {
    356             // The input text segment is FCD, extend it forward.
    357         } else {
    358             // The input text segment needed to be normalized.
    359             // Switch to checking forward from it.
    360             pos = start = segmentStart = segmentLimit;
    361             // Note: If this segment is at the end of the input text,
    362             // then it might help to return FALSE to indicate that, so that
    363             // we do not have to re-check and normalize when we turn around and go backwards.
    364             // However, that would complicate the call sites for an optimization of an unusual case.
    365         }
    366         limit = rawLimit;
    367         checkDir = 1;
    368     }
    369 }
    370 
    371 UBool
    372 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
    373     if(U_FAILURE(errorCode)) { return FALSE; }
    374     U_ASSERT(checkDir > 0 && pos != limit);
    375     // The input text [segmentStart..pos[ passes the FCD check.
    376     const UChar *p = pos;
    377     uint8_t prevCC = 0;
    378     for(;;) {
    379         // Fetch the next character's fcd16 value.
    380         const UChar *q = p;
    381         uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
    382         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
    383         if(leadCC == 0 && q != pos) {
    384             // FCD boundary before the [q, p[ character.
    385             limit = segmentLimit = q;
    386             break;
    387         }
    388         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    389             // Fails FCD check. Find the next FCD boundary and normalize.
    390             do {
    391                 q = p;
    392             } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
    393             if(!normalize(pos, q, errorCode)) { return FALSE; }
    394             pos = start;
    395             break;
    396         }
    397         prevCC = (uint8_t)fcd16;
    398         if(p == rawLimit || prevCC == 0) {
    399             // FCD boundary after the last character.
    400             limit = segmentLimit = p;
    401             break;
    402         }
    403     }
    404     U_ASSERT(pos != limit);
    405     checkDir = 0;
    406     return TRUE;
    407 }
    408 
    409 void
    410 FCDUTF16CollationIterator::switchToBackward() {
    411     U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
    412     if(checkDir > 0) {
    413         // Turn around from forward checking.
    414         limit = segmentLimit = pos;
    415         if(pos == segmentStart) {
    416             start = rawStart;
    417             checkDir = -1;  // Check backward.
    418         } else {  // pos > segmentStart
    419             checkDir = 0;  // Stay in FCD segment.
    420         }
    421     } else {
    422         // Reached the start of the FCD segment.
    423         if(start == segmentStart) {
    424             // The input text segment is FCD, extend it backward.
    425         } else {
    426             // The input text segment needed to be normalized.
    427             // Switch to checking backward from it.
    428             pos = limit = segmentLimit = segmentStart;
    429         }
    430         start = rawStart;
    431         checkDir = -1;
    432     }
    433 }
    434 
    435 UBool
    436 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
    437     if(U_FAILURE(errorCode)) { return FALSE; }
    438     U_ASSERT(checkDir < 0 && pos != start);
    439     // The input text [pos..segmentLimit[ passes the FCD check.
    440     const UChar *p = pos;
    441     uint8_t nextCC = 0;
    442     for(;;) {
    443         // Fetch the previous character's fcd16 value.
    444         const UChar *q = p;
    445         uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
    446         uint8_t trailCC = (uint8_t)fcd16;
    447         if(trailCC == 0 && q != pos) {
    448             // FCD boundary after the [p, q[ character.
    449             start = segmentStart = q;
    450             break;
    451         }
    452         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    453                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    454             // Fails FCD check. Find the previous FCD boundary and normalize.
    455             do {
    456                 q = p;
    457             } while(fcd16 > 0xff && p != rawStart &&
    458                     (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
    459             if(!normalize(q, pos, errorCode)) { return FALSE; }
    460             pos = limit;
    461             break;
    462         }
    463         nextCC = (uint8_t)(fcd16 >> 8);
    464         if(p == rawStart || nextCC == 0) {
    465             // FCD boundary before the following character.
    466             start = segmentStart = p;
    467             break;
    468         }
    469     }
    470     U_ASSERT(pos != start);
    471     checkDir = 0;
    472     return TRUE;
    473 }
    474 
    475 UBool
    476 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
    477     // NFD without argument checking.
    478     U_ASSERT(U_SUCCESS(errorCode));
    479     nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
    480     if(U_FAILURE(errorCode)) { return FALSE; }
    481     // Switch collation processing into the FCD buffer
    482     // with the result of normalizing [segmentStart, segmentLimit[.
    483     segmentStart = from;
    484     segmentLimit = to;
    485     start = normalized.getBuffer();
    486     limit = start + normalized.length();
    487     return TRUE;
    488 }
    489 
    490 U_NAMESPACE_END
    491 
    492 #endif  // !UCONFIG_NO_COLLATION
    493