Home | History | Annotate | Download | only in i18n
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * uitercollationiterator.cpp
      9 *
     10 * created on: 2012sep23 (from utf16collationiterator.cpp)
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/uiter.h"
     19 #include "charstr.h"
     20 #include "cmemory.h"
     21 #include "collation.h"
     22 #include "collationdata.h"
     23 #include "collationfcd.h"
     24 #include "collationiterator.h"
     25 #include "normalizer2impl.h"
     26 #include "uassert.h"
     27 #include "uitercollationiterator.h"
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 UIterCollationIterator::~UIterCollationIterator() {}
     32 
     33 void
     34 UIterCollationIterator::resetToOffset(int32_t newOffset) {
     35     reset();
     36     iter.move(&iter, newOffset, UITER_START);
     37 }
     38 
     39 int32_t
     40 UIterCollationIterator::getOffset() const {
     41     return iter.getIndex(&iter, UITER_CURRENT);
     42 }
     43 
     44 uint32_t
     45 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     46     c = iter.next(&iter);
     47     if(c < 0) {
     48         return Collation::FALLBACK_CE32;
     49     }
     50     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     51 }
     52 
     53 UChar
     54 UIterCollationIterator::handleGetTrailSurrogate() {
     55     UChar32 trail = iter.next(&iter);
     56     if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
     57     return (UChar)trail;
     58 }
     59 
     60 UChar32
     61 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     62     return uiter_next32(&iter);
     63 }
     64 
     65 UChar32
     66 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
     67     return uiter_previous32(&iter);
     68 }
     69 
     70 void
     71 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
     72     while(num > 0 && (uiter_next32(&iter)) >= 0) {
     73         --num;
     74     }
     75 }
     76 
     77 void
     78 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
     79     while(num > 0 && (uiter_previous32(&iter)) >= 0) {
     80         --num;
     81     }
     82 }
     83 
     84 // FCDUIterCollationIterator ----------------------------------------------- ***
     85 
     86 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
     87 
     88 void
     89 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
     90     UIterCollationIterator::resetToOffset(newOffset);
     91     start = newOffset;
     92     state = ITER_CHECK_FWD;
     93 }
     94 
     95 int32_t
     96 FCDUIterCollationIterator::getOffset() const {
     97     if(state <= ITER_CHECK_BWD) {
     98         return iter.getIndex(&iter, UITER_CURRENT);
     99     } else if(state == ITER_IN_FCD_SEGMENT) {
    100         return pos;
    101     } else if(pos == 0) {
    102         return start;
    103     } else {
    104         return limit;
    105     }
    106 }
    107 
    108 uint32_t
    109 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    110     for(;;) {
    111         if(state == ITER_CHECK_FWD) {
    112             c = iter.next(&iter);
    113             if(c < 0) {
    114                 return Collation::FALLBACK_CE32;
    115             }
    116             if(CollationFCD::hasTccc(c)) {
    117                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    118                         CollationFCD::hasLccc(iter.current(&iter))) {
    119                     iter.previous(&iter);
    120                     if(!nextSegment(errorCode)) {
    121                         c = U_SENTINEL;
    122                         return Collation::FALLBACK_CE32;
    123                     }
    124                     continue;
    125                 }
    126             }
    127             break;
    128         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
    129             c = iter.next(&iter);
    130             ++pos;
    131             U_ASSERT(c >= 0);
    132             break;
    133         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
    134             c = normalized[pos++];
    135             break;
    136         } else {
    137             switchToForward();
    138         }
    139     }
    140     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    141 }
    142 
    143 UChar
    144 FCDUIterCollationIterator::handleGetTrailSurrogate() {
    145     if(state <= ITER_IN_FCD_SEGMENT) {
    146         UChar32 trail = iter.next(&iter);
    147         if(U16_IS_TRAIL(trail)) {
    148             if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
    149         } else if(trail >= 0) {
    150             iter.previous(&iter);
    151         }
    152         return (UChar)trail;
    153     } else {
    154         U_ASSERT(pos < normalized.length());
    155         UChar trail;
    156         if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
    157         return trail;
    158     }
    159 }
    160 
    161 UChar32
    162 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    163     UChar32 c;
    164     for(;;) {
    165         if(state == ITER_CHECK_FWD) {
    166             c = iter.next(&iter);
    167             if(c < 0) {
    168                 return c;
    169             }
    170             if(CollationFCD::hasTccc(c)) {
    171                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    172                         CollationFCD::hasLccc(iter.current(&iter))) {
    173                     iter.previous(&iter);
    174                     if(!nextSegment(errorCode)) {
    175                         return U_SENTINEL;
    176                     }
    177                     continue;
    178                 }
    179             }
    180             if(U16_IS_LEAD(c)) {
    181                 UChar32 trail = iter.next(&iter);
    182                 if(U16_IS_TRAIL(trail)) {
    183                     return U16_GET_SUPPLEMENTARY(c, trail);
    184                 } else if(trail >= 0) {
    185                     iter.previous(&iter);
    186                 }
    187             }
    188             return c;
    189         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
    190             c = uiter_next32(&iter);
    191             pos += U16_LENGTH(c);
    192             U_ASSERT(c >= 0);
    193             return c;
    194         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
    195             c = normalized.char32At(pos);
    196             pos += U16_LENGTH(c);
    197             return c;
    198         } else {
    199             switchToForward();
    200         }
    201     }
    202 }
    203 
    204 UChar32
    205 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    206     UChar32 c;
    207     for(;;) {
    208         if(state == ITER_CHECK_BWD) {
    209             c = iter.previous(&iter);
    210             if(c < 0) {
    211                 start = pos = 0;
    212                 state = ITER_IN_FCD_SEGMENT;
    213                 return U_SENTINEL;
    214             }
    215             if(CollationFCD::hasLccc(c)) {
    216                 UChar32 prev = U_SENTINEL;
    217                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    218                         CollationFCD::hasTccc(prev = iter.previous(&iter))) {
    219                     iter.next(&iter);
    220                     if(prev >= 0) {
    221                         iter.next(&iter);
    222                     }
    223                     if(!previousSegment(errorCode)) {
    224                         return U_SENTINEL;
    225                     }
    226                     continue;
    227                 }
    228                 // hasLccc(trail)=true for all trail surrogates
    229                 if(U16_IS_TRAIL(c)) {
    230                     if(prev < 0) {
    231                         prev = iter.previous(&iter);
    232                     }
    233                     if(U16_IS_LEAD(prev)) {
    234                         return U16_GET_SUPPLEMENTARY(prev, c);
    235                     }
    236                 }
    237                 if(prev >= 0) {
    238                     iter.next(&iter);
    239                 }
    240             }
    241             return c;
    242         } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
    243             c = uiter_previous32(&iter);
    244             pos -= U16_LENGTH(c);
    245             U_ASSERT(c >= 0);
    246             return c;
    247         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
    248             c = normalized.char32At(pos - 1);
    249             pos -= U16_LENGTH(c);
    250             return c;
    251         } else {
    252             switchToBackward();
    253         }
    254     }
    255 }
    256 
    257 void
    258 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    259     // Specify the class to avoid a virtual-function indirection.
    260     // In Java, we would declare this class final.
    261     while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
    262         --num;
    263     }
    264 }
    265 
    266 void
    267 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    268     // Specify the class to avoid a virtual-function indirection.
    269     // In Java, we would declare this class final.
    270     while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
    271         --num;
    272     }
    273 }
    274 
    275 void
    276 FCDUIterCollationIterator::switchToForward() {
    277     U_ASSERT(state == ITER_CHECK_BWD ||
    278              (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
    279              (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
    280     if(state == ITER_CHECK_BWD) {
    281         // Turn around from backward checking.
    282         start = pos = iter.getIndex(&iter, UITER_CURRENT);
    283         if(pos == limit) {
    284             state = ITER_CHECK_FWD;  // Check forward.
    285         } else {  // pos < limit
    286             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    287         }
    288     } else {
    289         // Reached the end of the FCD segment.
    290         if(state == ITER_IN_FCD_SEGMENT) {
    291             // The input text segment is FCD, extend it forward.
    292         } else {
    293             // The input text segment needed to be normalized.
    294             // Switch to checking forward from it.
    295             if(state == IN_NORM_ITER_AT_START) {
    296                 iter.move(&iter, limit - start, UITER_CURRENT);
    297             }
    298             start = limit;
    299         }
    300         state = ITER_CHECK_FWD;
    301     }
    302 }
    303 
    304 UBool
    305 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
    306     if(U_FAILURE(errorCode)) { return FALSE; }
    307     U_ASSERT(state == ITER_CHECK_FWD);
    308     // The input text [start..(iter index)[ passes the FCD check.
    309     pos = iter.getIndex(&iter, UITER_CURRENT);
    310     // Collect the characters being checked, in case they need to be normalized.
    311     UnicodeString s;
    312     uint8_t prevCC = 0;
    313     for(;;) {
    314         // Fetch the next character and its fcd16 value.
    315         UChar32 c = uiter_next32(&iter);
    316         if(c < 0) { break; }
    317         uint16_t fcd16 = nfcImpl.getFCD16(c);
    318         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
    319         if(leadCC == 0 && !s.isEmpty()) {
    320             // FCD boundary before this character.
    321             uiter_previous32(&iter);
    322             break;
    323         }
    324         s.append(c);
    325         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    326             // Fails FCD check. Find the next FCD boundary and normalize.
    327             for(;;) {
    328                 c = uiter_next32(&iter);
    329                 if(c < 0) { break; }
    330                 if(nfcImpl.getFCD16(c) <= 0xff) {
    331                     uiter_previous32(&iter);
    332                     break;
    333                 }
    334                 s.append(c);
    335             }
    336             if(!normalize(s, errorCode)) { return FALSE; }
    337             start = pos;
    338             limit = pos + s.length();
    339             state = IN_NORM_ITER_AT_LIMIT;
    340             pos = 0;
    341             return TRUE;
    342         }
    343         prevCC = (uint8_t)fcd16;
    344         if(prevCC == 0) {
    345             // FCD boundary after the last character.
    346             break;
    347         }
    348     }
    349     limit = pos + s.length();
    350     U_ASSERT(pos != limit);
    351     iter.move(&iter, -s.length(), UITER_CURRENT);
    352     state = ITER_IN_FCD_SEGMENT;
    353     return TRUE;
    354 }
    355 
    356 void
    357 FCDUIterCollationIterator::switchToBackward() {
    358     U_ASSERT(state == ITER_CHECK_FWD ||
    359              (state == ITER_IN_FCD_SEGMENT && pos == start) ||
    360              (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
    361     if(state == ITER_CHECK_FWD) {
    362         // Turn around from forward checking.
    363         limit = pos = iter.getIndex(&iter, UITER_CURRENT);
    364         if(pos == start) {
    365             state = ITER_CHECK_BWD;  // Check backward.
    366         } else {  // pos > start
    367             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    368         }
    369     } else {
    370         // Reached the start of the FCD segment.
    371         if(state == ITER_IN_FCD_SEGMENT) {
    372             // The input text segment is FCD, extend it backward.
    373         } else {
    374             // The input text segment needed to be normalized.
    375             // Switch to checking backward from it.
    376             if(state == IN_NORM_ITER_AT_LIMIT) {
    377                 iter.move(&iter, start - limit, UITER_CURRENT);
    378             }
    379             limit = start;
    380         }
    381         state = ITER_CHECK_BWD;
    382     }
    383 }
    384 
    385 UBool
    386 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
    387     if(U_FAILURE(errorCode)) { return FALSE; }
    388     U_ASSERT(state == ITER_CHECK_BWD);
    389     // The input text [(iter index)..limit[ passes the FCD check.
    390     pos = iter.getIndex(&iter, UITER_CURRENT);
    391     // Collect the characters being checked, in case they need to be normalized.
    392     UnicodeString s;
    393     uint8_t nextCC = 0;
    394     for(;;) {
    395         // Fetch the previous character and its fcd16 value.
    396         UChar32 c = uiter_previous32(&iter);
    397         if(c < 0) { break; }
    398         uint16_t fcd16 = nfcImpl.getFCD16(c);
    399         uint8_t trailCC = (uint8_t)fcd16;
    400         if(trailCC == 0 && !s.isEmpty()) {
    401             // FCD boundary after this character.
    402             uiter_next32(&iter);
    403             break;
    404         }
    405         s.append(c);
    406         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    407                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    408             // Fails FCD check. Find the previous FCD boundary and normalize.
    409             while(fcd16 > 0xff) {
    410                 c = uiter_previous32(&iter);
    411                 if(c < 0) { break; }
    412                 fcd16 = nfcImpl.getFCD16(c);
    413                 if(fcd16 == 0) {
    414                     (void)uiter_next32(&iter);
    415                     break;
    416                 }
    417                 s.append(c);
    418             }
    419             s.reverse();
    420             if(!normalize(s, errorCode)) { return FALSE; }
    421             limit = pos;
    422             start = pos - s.length();
    423             state = IN_NORM_ITER_AT_START;
    424             pos = normalized.length();
    425             return TRUE;
    426         }
    427         nextCC = (uint8_t)(fcd16 >> 8);
    428         if(nextCC == 0) {
    429             // FCD boundary before the following character.
    430             break;
    431         }
    432     }
    433     start = pos - s.length();
    434     U_ASSERT(pos != start);
    435     iter.move(&iter, s.length(), UITER_CURRENT);
    436     state = ITER_IN_FCD_SEGMENT;
    437     return TRUE;
    438 }
    439 
    440 UBool
    441 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    442     // NFD without argument checking.
    443     U_ASSERT(U_SUCCESS(errorCode));
    444     nfcImpl.decompose(s, normalized, errorCode);
    445     return U_SUCCESS(errorCode);
    446 }
    447 
    448 U_NAMESPACE_END
    449 
    450 #endif  // !UCONFIG_NO_COLLATION
    451