Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * utf16collationiterator.cpp
      9 *
     10 * created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "charstr.h"
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationdata.h"
     22 #include "collationfcd.h"
     23 #include "collationiterator.h"
     24 #include "normalizer2impl.h"
     25 #include "uassert.h"
     26 #include "utf16collationiterator.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
     31                                                const UChar *newText)
     32         : CollationIterator(other),
     33           start(newText),
     34           pos(newText + (other.pos - other.start)),
     35           limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) {
     36 }
     37 
     38 UTF16CollationIterator::~UTF16CollationIterator() {}
     39 
     40 UBool
     41 UTF16CollationIterator::operator==(const CollationIterator &other) const {
     42     if(!CollationIterator::operator==(other)) { return FALSE; }
     43     const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
     44     // Compare the iterator state but not the text: Assume that the caller does that.
     45     return (pos - start) == (o.pos - o.start);
     46 }
     47 
     48 void
     49 UTF16CollationIterator::resetToOffset(int32_t newOffset) {
     50     reset();
     51     pos = start + newOffset;
     52 }
     53 
     54 int32_t
     55 UTF16CollationIterator::getOffset() const {
     56     return (int32_t)(pos - start);
     57 }
     58 
     59 uint32_t
     60 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     61     if(pos == limit) {
     62         c = U_SENTINEL;
     63         return Collation::FALLBACK_CE32;
     64     }
     65     c = *pos++;
     66     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     67 }
     68 
     69 UChar
     70 UTF16CollationIterator::handleGetTrailSurrogate() {
     71     if(pos == limit) { return 0; }
     72     UChar trail;
     73     if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
     74     return trail;
     75 }
     76 
     77 UBool
     78 UTF16CollationIterator::foundNULTerminator() {
     79     if(limit == NULL) {
     80         limit = --pos;
     81         return TRUE;
     82     } else {
     83         return FALSE;
     84     }
     85 }
     86 
     87 UChar32
     88 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     89     if(pos == limit) {
     90         return U_SENTINEL;
     91     }
     92     UChar32 c = *pos;
     93     if(c == 0 && limit == NULL) {
     94         limit = pos;
     95         return U_SENTINEL;
     96     }
     97     ++pos;
     98     UChar trail;
     99     if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
    100         ++pos;
    101         return U16_GET_SUPPLEMENTARY(c, trail);
    102     } else {
    103         return c;
    104     }
    105 }
    106 
    107 UChar32
    108 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    109     if(pos == start) {
    110         return U_SENTINEL;
    111     }
    112     UChar32 c = *--pos;
    113     UChar lead;
    114     if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
    115         --pos;
    116         return U16_GET_SUPPLEMENTARY(lead, c);
    117     } else {
    118         return c;
    119     }
    120 }
    121 
    122 void
    123 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    124     while(num > 0 && pos != limit) {
    125         UChar32 c = *pos;
    126         if(c == 0 && limit == NULL) {
    127             limit = pos;
    128             break;
    129         }
    130         ++pos;
    131         --num;
    132         if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
    133             ++pos;
    134         }
    135     }
    136 }
    137 
    138 void
    139 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    140     while(num > 0 && pos != start) {
    141         UChar32 c = *--pos;
    142         --num;
    143         if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
    144             --pos;
    145         }
    146     }
    147 }
    148 
    149 // FCDUTF16CollationIterator ----------------------------------------------- ***
    150 
    151 FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
    152                                                      const UChar *newText)
    153         : UTF16CollationIterator(other),
    154           rawStart(newText),
    155           segmentStart(newText + (other.segmentStart - other.rawStart)),
    156           segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)),
    157           rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)),
    158           nfcImpl(other.nfcImpl),
    159           normalized(other.normalized),
    160           checkDir(other.checkDir) {
    161     if(checkDir != 0 || other.start == other.segmentStart) {
    162         start = newText + (other.start - other.rawStart);
    163         pos = newText + (other.pos - other.rawStart);
    164         limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart);
    165     } else {
    166         start = normalized.getBuffer();
    167         pos = start + (other.pos - other.start);
    168         limit = start + normalized.length();
    169     }
    170 }
    171 
    172 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
    173 
    174 UBool
    175 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
    176     // Skip the UTF16CollationIterator and call its parent.
    177     if(!CollationIterator::operator==(other)) { return FALSE; }
    178     const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
    179     // Compare the iterator state but not the text: Assume that the caller does that.
    180     if(checkDir != o.checkDir) { return FALSE; }
    181     if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
    182     if(checkDir != 0 || start == segmentStart) {
    183         return (pos - rawStart) == (o.pos - o.rawStart);
    184     } else {
    185         return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
    186                 (pos - start) == (o.pos - o.start);
    187     }
    188 }
    189 
    190 void
    191 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
    192     reset();
    193     start = segmentStart = pos = rawStart + newOffset;
    194     limit = rawLimit;
    195     checkDir = 1;
    196 }
    197 
    198 int32_t
    199 FCDUTF16CollationIterator::getOffset() const {
    200     if(checkDir != 0 || start == segmentStart) {
    201         return (int32_t)(pos - rawStart);
    202     } else if(pos == start) {
    203         return (int32_t)(segmentStart - rawStart);
    204     } else {
    205         return (int32_t)(segmentLimit - rawStart);
    206     }
    207 }
    208 
    209 uint32_t
    210 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    211     for(;;) {
    212         if(checkDir > 0) {
    213             if(pos == limit) {
    214                 c = U_SENTINEL;
    215                 return Collation::FALLBACK_CE32;
    216             }
    217             c = *pos++;
    218             if(CollationFCD::hasTccc(c)) {
    219                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    220                         (pos != limit && CollationFCD::hasLccc(*pos))) {
    221                     --pos;
    222                     if(!nextSegment(errorCode)) {
    223                         c = U_SENTINEL;
    224                         return Collation::FALLBACK_CE32;
    225                     }
    226                     c = *pos++;
    227                 }
    228             }
    229             break;
    230         } else if(checkDir == 0 && pos != limit) {
    231             c = *pos++;
    232             break;
    233         } else {
    234             switchToForward();
    235         }
    236     }
    237     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    238 }
    239 
    240 UBool
    241 FCDUTF16CollationIterator::foundNULTerminator() {
    242     if(limit == NULL) {
    243         limit = rawLimit = --pos;
    244         return TRUE;
    245     } else {
    246         return FALSE;
    247     }
    248 }
    249 
    250 UChar32
    251 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    252     UChar32 c;
    253     for(;;) {
    254         if(checkDir > 0) {
    255             if(pos == limit) {
    256                 return U_SENTINEL;
    257             }
    258             c = *pos++;
    259             if(CollationFCD::hasTccc(c)) {
    260                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    261                         (pos != limit && CollationFCD::hasLccc(*pos))) {
    262                     --pos;
    263                     if(!nextSegment(errorCode)) {
    264                         return U_SENTINEL;
    265                     }
    266                     c = *pos++;
    267                 }
    268             } else if(c == 0 && limit == NULL) {
    269                 limit = rawLimit = --pos;
    270                 return U_SENTINEL;
    271             }
    272             break;
    273         } else if(checkDir == 0 && pos != limit) {
    274             c = *pos++;
    275             break;
    276         } else {
    277             switchToForward();
    278         }
    279     }
    280     UChar trail;
    281     if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
    282         ++pos;
    283         return U16_GET_SUPPLEMENTARY(c, trail);
    284     } else {
    285         return c;
    286     }
    287 }
    288 
    289 UChar32
    290 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    291     UChar32 c;
    292     for(;;) {
    293         if(checkDir < 0) {
    294             if(pos == start) {
    295                 return U_SENTINEL;
    296             }
    297             c = *--pos;
    298             if(CollationFCD::hasLccc(c)) {
    299                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    300                         (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
    301                     ++pos;
    302                     if(!previousSegment(errorCode)) {
    303                         return U_SENTINEL;
    304                     }
    305                     c = *--pos;
    306                 }
    307             }
    308             break;
    309         } else if(checkDir == 0 && pos != start) {
    310             c = *--pos;
    311             break;
    312         } else {
    313             switchToBackward();
    314         }
    315     }
    316     UChar lead;
    317     if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
    318         --pos;
    319         return U16_GET_SUPPLEMENTARY(lead, c);
    320     } else {
    321         return c;
    322     }
    323 }
    324 
    325 void
    326 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    327     // Specify the class to avoid a virtual-function indirection.
    328     // In Java, we would declare this class final.
    329     while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
    330         --num;
    331     }
    332 }
    333 
    334 void
    335 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    336     // Specify the class to avoid a virtual-function indirection.
    337     // In Java, we would declare this class final.
    338     while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
    339         --num;
    340     }
    341 }
    342 
    343 void
    344 FCDUTF16CollationIterator::switchToForward() {
    345     U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
    346     if(checkDir < 0) {
    347         // Turn around from backward checking.
    348         start = segmentStart = pos;
    349         if(pos == segmentLimit) {
    350             limit = rawLimit;
    351             checkDir = 1;  // Check forward.
    352         } else {  // pos < segmentLimit
    353             checkDir = 0;  // Stay in FCD segment.
    354         }
    355     } else {
    356         // Reached the end of the FCD segment.
    357         if(start == segmentStart) {
    358             // The input text segment is FCD, extend it forward.
    359         } else {
    360             // The input text segment needed to be normalized.
    361             // Switch to checking forward from it.
    362             pos = start = segmentStart = segmentLimit;
    363             // Note: If this segment is at the end of the input text,
    364             // then it might help to return FALSE to indicate that, so that
    365             // we do not have to re-check and normalize when we turn around and go backwards.
    366             // However, that would complicate the call sites for an optimization of an unusual case.
    367         }
    368         limit = rawLimit;
    369         checkDir = 1;
    370     }
    371 }
    372 
    373 UBool
    374 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
    375     if(U_FAILURE(errorCode)) { return FALSE; }
    376     U_ASSERT(checkDir > 0 && pos != limit);
    377     // The input text [segmentStart..pos[ passes the FCD check.
    378     const UChar *p = pos;
    379     uint8_t prevCC = 0;
    380     for(;;) {
    381         // Fetch the next character's fcd16 value.
    382         const UChar *q = p;
    383         uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
    384         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
    385         if(leadCC == 0 && q != pos) {
    386             // FCD boundary before the [q, p[ character.
    387             limit = segmentLimit = q;
    388             break;
    389         }
    390         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    391             // Fails FCD check. Find the next FCD boundary and normalize.
    392             do {
    393                 q = p;
    394             } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
    395             if(!normalize(pos, q, errorCode)) { return FALSE; }
    396             pos = start;
    397             break;
    398         }
    399         prevCC = (uint8_t)fcd16;
    400         if(p == rawLimit || prevCC == 0) {
    401             // FCD boundary after the last character.
    402             limit = segmentLimit = p;
    403             break;
    404         }
    405     }
    406     U_ASSERT(pos != limit);
    407     checkDir = 0;
    408     return TRUE;
    409 }
    410 
    411 void
    412 FCDUTF16CollationIterator::switchToBackward() {
    413     U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
    414     if(checkDir > 0) {
    415         // Turn around from forward checking.
    416         limit = segmentLimit = pos;
    417         if(pos == segmentStart) {
    418             start = rawStart;
    419             checkDir = -1;  // Check backward.
    420         } else {  // pos > segmentStart
    421             checkDir = 0;  // Stay in FCD segment.
    422         }
    423     } else {
    424         // Reached the start of the FCD segment.
    425         if(start == segmentStart) {
    426             // The input text segment is FCD, extend it backward.
    427         } else {
    428             // The input text segment needed to be normalized.
    429             // Switch to checking backward from it.
    430             pos = limit = segmentLimit = segmentStart;
    431         }
    432         start = rawStart;
    433         checkDir = -1;
    434     }
    435 }
    436 
    437 UBool
    438 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
    439     if(U_FAILURE(errorCode)) { return FALSE; }
    440     U_ASSERT(checkDir < 0 && pos != start);
    441     // The input text [pos..segmentLimit[ passes the FCD check.
    442     const UChar *p = pos;
    443     uint8_t nextCC = 0;
    444     for(;;) {
    445         // Fetch the previous character's fcd16 value.
    446         const UChar *q = p;
    447         uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
    448         uint8_t trailCC = (uint8_t)fcd16;
    449         if(trailCC == 0 && q != pos) {
    450             // FCD boundary after the [p, q[ character.
    451             start = segmentStart = q;
    452             break;
    453         }
    454         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    455                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    456             // Fails FCD check. Find the previous FCD boundary and normalize.
    457             do {
    458                 q = p;
    459             } while(fcd16 > 0xff && p != rawStart &&
    460                     (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
    461             if(!normalize(q, pos, errorCode)) { return FALSE; }
    462             pos = limit;
    463             break;
    464         }
    465         nextCC = (uint8_t)(fcd16 >> 8);
    466         if(p == rawStart || nextCC == 0) {
    467             // FCD boundary before the following character.
    468             start = segmentStart = p;
    469             break;
    470         }
    471     }
    472     U_ASSERT(pos != start);
    473     checkDir = 0;
    474     return TRUE;
    475 }
    476 
    477 UBool
    478 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
    479     // NFD without argument checking.
    480     U_ASSERT(U_SUCCESS(errorCode));
    481     nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
    482     if(U_FAILURE(errorCode)) { return FALSE; }
    483     // Switch collation processing into the FCD buffer
    484     // with the result of normalizing [segmentStart, segmentLimit[.
    485     segmentStart = from;
    486     segmentLimit = to;
    487     start = normalized.getBuffer();
    488     limit = start + normalized.length();
    489     return TRUE;
    490 }
    491 
    492 U_NAMESPACE_END
    493 
    494 #endif  // !UCONFIG_NO_COLLATION
    495