Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * uitercollationiterator.cpp
      7 *
      8 * created on: 2012sep23 (from utf16collationiterator.cpp)
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/uiter.h"
     17 #include "charstr.h"
     18 #include "cmemory.h"
     19 #include "collation.h"
     20 #include "collationdata.h"
     21 #include "collationfcd.h"
     22 #include "collationiterator.h"
     23 #include "normalizer2impl.h"
     24 #include "uassert.h"
     25 #include "uitercollationiterator.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 UIterCollationIterator::~UIterCollationIterator() {}
     30 
     31 void
     32 UIterCollationIterator::resetToOffset(int32_t newOffset) {
     33     reset();
     34     iter.move(&iter, newOffset, UITER_START);
     35 }
     36 
     37 int32_t
     38 UIterCollationIterator::getOffset() const {
     39     return iter.getIndex(&iter, UITER_CURRENT);
     40 }
     41 
     42 uint32_t
     43 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     44     c = iter.next(&iter);
     45     if(c < 0) {
     46         return Collation::FALLBACK_CE32;
     47     }
     48     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     49 }
     50 
     51 UChar
     52 UIterCollationIterator::handleGetTrailSurrogate() {
     53     UChar32 trail = iter.next(&iter);
     54     if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
     55     return (UChar)trail;
     56 }
     57 
     58 UChar32
     59 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     60     return uiter_next32(&iter);
     61 }
     62 
     63 UChar32
     64 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
     65     return uiter_previous32(&iter);
     66 }
     67 
     68 void
     69 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
     70     while(num > 0 && (uiter_next32(&iter)) >= 0) {
     71         --num;
     72     }
     73 }
     74 
     75 void
     76 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
     77     while(num > 0 && (uiter_previous32(&iter)) >= 0) {
     78         --num;
     79     }
     80 }
     81 
     82 // FCDUIterCollationIterator ----------------------------------------------- ***
     83 
     84 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
     85 
     86 void
     87 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
     88     UIterCollationIterator::resetToOffset(newOffset);
     89     start = newOffset;
     90     state = ITER_CHECK_FWD;
     91 }
     92 
     93 int32_t
     94 FCDUIterCollationIterator::getOffset() const {
     95     if(state <= ITER_CHECK_BWD) {
     96         return iter.getIndex(&iter, UITER_CURRENT);
     97     } else if(state == ITER_IN_FCD_SEGMENT) {
     98         return pos;
     99     } else if(pos == 0) {
    100         return start;
    101     } else {
    102         return limit;
    103     }
    104 }
    105 
    106 uint32_t
    107 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    108     for(;;) {
    109         if(state == ITER_CHECK_FWD) {
    110             c = iter.next(&iter);
    111             if(c < 0) {
    112                 return Collation::FALLBACK_CE32;
    113             }
    114             if(CollationFCD::hasTccc(c)) {
    115                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    116                         CollationFCD::hasLccc(iter.current(&iter))) {
    117                     iter.previous(&iter);
    118                     if(!nextSegment(errorCode)) {
    119                         c = U_SENTINEL;
    120                         return Collation::FALLBACK_CE32;
    121                     }
    122                     continue;
    123                 }
    124             }
    125             break;
    126         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
    127             c = iter.next(&iter);
    128             ++pos;
    129             U_ASSERT(c >= 0);
    130             break;
    131         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
    132             c = normalized[pos++];
    133             break;
    134         } else {
    135             switchToForward();
    136         }
    137     }
    138     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    139 }
    140 
    141 UChar
    142 FCDUIterCollationIterator::handleGetTrailSurrogate() {
    143     if(state <= ITER_IN_FCD_SEGMENT) {
    144         UChar32 trail = iter.next(&iter);
    145         if(U16_IS_TRAIL(trail)) {
    146             if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
    147         } else if(trail >= 0) {
    148             iter.previous(&iter);
    149         }
    150         return (UChar)trail;
    151     } else {
    152         U_ASSERT(pos < normalized.length());
    153         UChar trail;
    154         if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
    155         return trail;
    156     }
    157 }
    158 
    159 UChar32
    160 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    161     UChar32 c;
    162     for(;;) {
    163         if(state == ITER_CHECK_FWD) {
    164             c = iter.next(&iter);
    165             if(c < 0) {
    166                 return c;
    167             }
    168             if(CollationFCD::hasTccc(c)) {
    169                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    170                         CollationFCD::hasLccc(iter.current(&iter))) {
    171                     iter.previous(&iter);
    172                     if(!nextSegment(errorCode)) {
    173                         return U_SENTINEL;
    174                     }
    175                     continue;
    176                 }
    177             }
    178             if(U16_IS_LEAD(c)) {
    179                 UChar32 trail = iter.next(&iter);
    180                 if(U16_IS_TRAIL(trail)) {
    181                     return U16_GET_SUPPLEMENTARY(c, trail);
    182                 } else if(trail >= 0) {
    183                     iter.previous(&iter);
    184                 }
    185             }
    186             return c;
    187         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
    188             c = uiter_next32(&iter);
    189             pos += U16_LENGTH(c);
    190             U_ASSERT(c >= 0);
    191             return c;
    192         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
    193             c = normalized.char32At(pos);
    194             pos += U16_LENGTH(c);
    195             return c;
    196         } else {
    197             switchToForward();
    198         }
    199     }
    200 }
    201 
    202 UChar32
    203 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    204     UChar32 c;
    205     for(;;) {
    206         if(state == ITER_CHECK_BWD) {
    207             c = iter.previous(&iter);
    208             if(c < 0) {
    209                 start = pos = 0;
    210                 state = ITER_IN_FCD_SEGMENT;
    211                 return U_SENTINEL;
    212             }
    213             if(CollationFCD::hasLccc(c)) {
    214                 UChar32 prev = U_SENTINEL;
    215                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    216                         CollationFCD::hasTccc(prev = iter.previous(&iter))) {
    217                     iter.next(&iter);
    218                     if(prev >= 0) {
    219                         iter.next(&iter);
    220                     }
    221                     if(!previousSegment(errorCode)) {
    222                         return U_SENTINEL;
    223                     }
    224                     continue;
    225                 }
    226                 // hasLccc(trail)=true for all trail surrogates
    227                 if(U16_IS_TRAIL(c)) {
    228                     if(prev < 0) {
    229                         prev = iter.previous(&iter);
    230                     }
    231                     if(U16_IS_LEAD(prev)) {
    232                         return U16_GET_SUPPLEMENTARY(prev, c);
    233                     }
    234                 }
    235                 if(prev >= 0) {
    236                     iter.next(&iter);
    237                 }
    238             }
    239             return c;
    240         } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
    241             c = uiter_previous32(&iter);
    242             pos -= U16_LENGTH(c);
    243             U_ASSERT(c >= 0);
    244             return c;
    245         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
    246             c = normalized.char32At(pos - 1);
    247             pos -= U16_LENGTH(c);
    248             return c;
    249         } else {
    250             switchToBackward();
    251         }
    252     }
    253 }
    254 
    255 void
    256 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    257     // Specify the class to avoid a virtual-function indirection.
    258     // In Java, we would declare this class final.
    259     while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
    260         --num;
    261     }
    262 }
    263 
    264 void
    265 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    266     // Specify the class to avoid a virtual-function indirection.
    267     // In Java, we would declare this class final.
    268     while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
    269         --num;
    270     }
    271 }
    272 
    273 void
    274 FCDUIterCollationIterator::switchToForward() {
    275     U_ASSERT(state == ITER_CHECK_BWD ||
    276              (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
    277              (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
    278     if(state == ITER_CHECK_BWD) {
    279         // Turn around from backward checking.
    280         start = pos = iter.getIndex(&iter, UITER_CURRENT);
    281         if(pos == limit) {
    282             state = ITER_CHECK_FWD;  // Check forward.
    283         } else {  // pos < limit
    284             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    285         }
    286     } else {
    287         // Reached the end of the FCD segment.
    288         if(state == ITER_IN_FCD_SEGMENT) {
    289             // The input text segment is FCD, extend it forward.
    290         } else {
    291             // The input text segment needed to be normalized.
    292             // Switch to checking forward from it.
    293             if(state == IN_NORM_ITER_AT_START) {
    294                 iter.move(&iter, limit - start, UITER_CURRENT);
    295             }
    296             start = limit;
    297         }
    298         state = ITER_CHECK_FWD;
    299     }
    300 }
    301 
    302 UBool
    303 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
    304     if(U_FAILURE(errorCode)) { return FALSE; }
    305     U_ASSERT(state == ITER_CHECK_FWD);
    306     // The input text [start..(iter index)[ passes the FCD check.
    307     pos = iter.getIndex(&iter, UITER_CURRENT);
    308     // Collect the characters being checked, in case they need to be normalized.
    309     UnicodeString s;
    310     uint8_t prevCC = 0;
    311     for(;;) {
    312         // Fetch the next character and its fcd16 value.
    313         UChar32 c = uiter_next32(&iter);
    314         if(c < 0) { break; }
    315         uint16_t fcd16 = nfcImpl.getFCD16(c);
    316         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
    317         if(leadCC == 0 && !s.isEmpty()) {
    318             // FCD boundary before this character.
    319             uiter_previous32(&iter);
    320             break;
    321         }
    322         s.append(c);
    323         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    324             // Fails FCD check. Find the next FCD boundary and normalize.
    325             for(;;) {
    326                 c = uiter_next32(&iter);
    327                 if(c < 0) { break; }
    328                 if(nfcImpl.getFCD16(c) <= 0xff) {
    329                     uiter_previous32(&iter);
    330                     break;
    331                 }
    332                 s.append(c);
    333             }
    334             if(!normalize(s, errorCode)) { return FALSE; }
    335             start = pos;
    336             limit = pos + s.length();
    337             state = IN_NORM_ITER_AT_LIMIT;
    338             pos = 0;
    339             return TRUE;
    340         }
    341         prevCC = (uint8_t)fcd16;
    342         if(prevCC == 0) {
    343             // FCD boundary after the last character.
    344             break;
    345         }
    346     }
    347     limit = pos + s.length();
    348     U_ASSERT(pos != limit);
    349     iter.move(&iter, -s.length(), UITER_CURRENT);
    350     state = ITER_IN_FCD_SEGMENT;
    351     return TRUE;
    352 }
    353 
    354 void
    355 FCDUIterCollationIterator::switchToBackward() {
    356     U_ASSERT(state == ITER_CHECK_FWD ||
    357              (state == ITER_IN_FCD_SEGMENT && pos == start) ||
    358              (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
    359     if(state == ITER_CHECK_FWD) {
    360         // Turn around from forward checking.
    361         limit = pos = iter.getIndex(&iter, UITER_CURRENT);
    362         if(pos == start) {
    363             state = ITER_CHECK_BWD;  // Check backward.
    364         } else {  // pos > start
    365             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    366         }
    367     } else {
    368         // Reached the start of the FCD segment.
    369         if(state == ITER_IN_FCD_SEGMENT) {
    370             // The input text segment is FCD, extend it backward.
    371         } else {
    372             // The input text segment needed to be normalized.
    373             // Switch to checking backward from it.
    374             if(state == IN_NORM_ITER_AT_LIMIT) {
    375                 iter.move(&iter, start - limit, UITER_CURRENT);
    376             }
    377             limit = start;
    378         }
    379         state = ITER_CHECK_BWD;
    380     }
    381 }
    382 
    383 UBool
    384 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
    385     if(U_FAILURE(errorCode)) { return FALSE; }
    386     U_ASSERT(state == ITER_CHECK_BWD);
    387     // The input text [(iter index)..limit[ passes the FCD check.
    388     pos = iter.getIndex(&iter, UITER_CURRENT);
    389     // Collect the characters being checked, in case they need to be normalized.
    390     UnicodeString s;
    391     uint8_t nextCC = 0;
    392     for(;;) {
    393         // Fetch the previous character and its fcd16 value.
    394         UChar32 c = uiter_previous32(&iter);
    395         if(c < 0) { break; }
    396         uint16_t fcd16 = nfcImpl.getFCD16(c);
    397         uint8_t trailCC = (uint8_t)fcd16;
    398         if(trailCC == 0 && !s.isEmpty()) {
    399             // FCD boundary after this character.
    400             uiter_next32(&iter);
    401             break;
    402         }
    403         s.append(c);
    404         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    405                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    406             // Fails FCD check. Find the previous FCD boundary and normalize.
    407             while(fcd16 > 0xff) {
    408                 c = uiter_previous32(&iter);
    409                 if(c < 0) { break; }
    410                 fcd16 = nfcImpl.getFCD16(c);
    411                 if(fcd16 == 0) {
    412                     (void)uiter_next32(&iter);
    413                     break;
    414                 }
    415                 s.append(c);
    416             }
    417             s.reverse();
    418             if(!normalize(s, errorCode)) { return FALSE; }
    419             limit = pos;
    420             start = pos - s.length();
    421             state = IN_NORM_ITER_AT_START;
    422             pos = normalized.length();
    423             return TRUE;
    424         }
    425         nextCC = (uint8_t)(fcd16 >> 8);
    426         if(nextCC == 0) {
    427             // FCD boundary before the following character.
    428             break;
    429         }
    430     }
    431     start = pos - s.length();
    432     U_ASSERT(pos != start);
    433     iter.move(&iter, s.length(), UITER_CURRENT);
    434     state = ITER_IN_FCD_SEGMENT;
    435     return TRUE;
    436 }
    437 
    438 UBool
    439 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    440     // NFD without argument checking.
    441     U_ASSERT(U_SUCCESS(errorCode));
    442     nfcImpl.decompose(s, normalized, errorCode);
    443     return U_SUCCESS(errorCode);
    444 }
    445 
    446 U_NAMESPACE_END
    447 
    448 #endif  // !UCONFIG_NO_COLLATION
    449