Home | History | Annotate | Download | only in coll
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
      9 *
     10 * C++ version created on: 2012sep23 (from utf16collationiterator.h)
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import com.ibm.icu.impl.Normalizer2Impl;
     17 import com.ibm.icu.text.UCharacterIterator;
     18 
     19 /**
     20  * Incrementally checks the input text for FCD and normalizes where necessary.
     21  */
     22 public final class FCDIterCollationIterator extends IterCollationIterator {
     23     public FCDIterCollationIterator(CollationData data, boolean numeric,
     24             UCharacterIterator ui, int startIndex) {
     25         super(data, numeric, ui);
     26         state = State.ITER_CHECK_FWD;
     27         start = startIndex;
     28         nfcImpl = data.nfcImpl;
     29     }
     30 
     31     @Override
     32     public void resetToOffset(int newOffset) {
     33         super.resetToOffset(newOffset);
     34         start = newOffset;
     35         state = State.ITER_CHECK_FWD;
     36     }
     37 
     38     @Override
     39     public int getOffset() {
     40         if(state.compareTo(State.ITER_CHECK_BWD) <= 0) {
     41             return iter.getIndex();
     42         } else if(state == State.ITER_IN_FCD_SEGMENT) {
     43             return pos;
     44         } else if(pos == 0) {
     45             return start;
     46         } else {
     47             return limit;
     48         }
     49     }
     50 
     51     @Override
     52     public int nextCodePoint() {
     53         int c;
     54         for(;;) {
     55             if(state == State.ITER_CHECK_FWD) {
     56                 c = iter.next();
     57                 if(c < 0) {
     58                     return c;
     59                 }
     60                 if(CollationFCD.hasTccc(c)) {
     61                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
     62                             CollationFCD.hasLccc(iter.current())) {
     63                         iter.previous();
     64                         if(!nextSegment()) {
     65                             return Collation.SENTINEL_CP;
     66                         }
     67                         continue;
     68                     }
     69                 }
     70                 if(isLeadSurrogate(c)) {
     71                     int trail = iter.next();
     72                     if(isTrailSurrogate(trail)) {
     73                         return Character.toCodePoint((char)c, (char)trail);
     74                     } else if(trail >= 0) {
     75                         iter.previous();
     76                     }
     77                 }
     78                 return c;
     79             } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
     80                 c = iter.nextCodePoint();
     81                 pos += Character.charCount(c);
     82                 assert(c >= 0);
     83                 return c;
     84             } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
     85                     pos != normalized.length()) {
     86                 c = normalized.codePointAt(pos);
     87                 pos += Character.charCount(c);
     88                 return c;
     89             } else {
     90                 switchToForward();
     91             }
     92         }
     93     }
     94 
     95     @Override
     96     public int previousCodePoint() {
     97         int c;
     98         for(;;) {
     99             if(state == State.ITER_CHECK_BWD) {
    100                 c = iter.previous();
    101                 if(c < 0) {
    102                     start = pos = 0;
    103                     state = State.ITER_IN_FCD_SEGMENT;
    104                     return Collation.SENTINEL_CP;
    105                 }
    106                 if(CollationFCD.hasLccc(c)) {
    107                     int prev = Collation.SENTINEL_CP;
    108                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
    109                             CollationFCD.hasTccc(prev = iter.previous())) {
    110                         iter.next();
    111                         if(prev >= 0) {
    112                             iter.next();
    113                         }
    114                         if(!previousSegment()) {
    115                             return Collation.SENTINEL_CP;
    116                         }
    117                         continue;
    118                     }
    119                     // hasLccc(trail)=true for all trail surrogates
    120                     if(isTrailSurrogate(c)) {
    121                         if(prev < 0) {
    122                             prev = iter.previous();
    123                         }
    124                         if(isLeadSurrogate(prev)) {
    125                             return Character.toCodePoint((char)prev, (char)c);
    126                         }
    127                     }
    128                     if(prev >= 0) {
    129                         iter.next();
    130                     }
    131                 }
    132                 return c;
    133             } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) {
    134                 c = iter.previousCodePoint();
    135                 pos -= Character.charCount(c);
    136                 assert(c >= 0);
    137                 return c;
    138             } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
    139                 c = normalized.codePointBefore(pos);
    140                 pos -= Character.charCount(c);
    141                 return c;
    142             } else {
    143                 switchToBackward();
    144             }
    145         }
    146     }
    147 
    148     @Override
    149     protected long handleNextCE32() {
    150         int c;
    151         for(;;) {
    152             if(state == State.ITER_CHECK_FWD) {
    153                 c = iter.next();
    154                 if(c < 0) {
    155                     return NO_CP_AND_CE32;
    156                 }
    157                 if(CollationFCD.hasTccc(c)) {
    158                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
    159                             CollationFCD.hasLccc(iter.current())) {
    160                         iter.previous();
    161                         if(!nextSegment()) {
    162                             c = Collation.SENTINEL_CP;
    163                             return Collation.FALLBACK_CE32;
    164                         }
    165                         continue;
    166                     }
    167                 }
    168                 break;
    169             } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
    170                 c = iter.next();
    171                 ++pos;
    172                 assert(c >= 0);
    173                 break;
    174             } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 &&
    175                     pos != normalized.length()) {
    176                 c = normalized.charAt(pos++);
    177                 break;
    178             } else {
    179                 switchToForward();
    180             }
    181         }
    182         return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c));
    183     }
    184 
    185     @Override
    186     protected char handleGetTrailSurrogate() {
    187         if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
    188             int trail = iter.next();
    189             if(isTrailSurrogate(trail)) {
    190                 if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; }
    191             } else if(trail >= 0) {
    192                 iter.previous();
    193             }
    194             return (char)trail;
    195         } else {
    196             assert(pos < normalized.length());
    197             char trail;
    198             if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; }
    199             return trail;
    200         }
    201     }
    202 
    203     @Override
    204     protected void forwardNumCodePoints(int num) {
    205         // Specify the class to avoid a virtual-function indirection.
    206         // In Java, we would declare this class final.
    207         while(num > 0 && nextCodePoint() >= 0) {
    208             --num;
    209         }
    210     }
    211 
    212     @Override
    213     protected void backwardNumCodePoints(int num) {
    214         // Specify the class to avoid a virtual-function indirection.
    215         // In Java, we would declare this class final.
    216         while(num > 0 && previousCodePoint() >= 0) {
    217             --num;
    218         }
    219     }
    220 
    221     /**
    222      * Switches to forward checking if possible.
    223      */
    224     private void switchToForward() {
    225         assert(state == State.ITER_CHECK_BWD ||
    226                 (state == State.ITER_IN_FCD_SEGMENT && pos == limit) ||
    227                 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length()));
    228         if(state == State.ITER_CHECK_BWD) {
    229             // Turn around from backward checking.
    230             start = pos = iter.getIndex();
    231             if(pos == limit) {
    232                 state = State.ITER_CHECK_FWD;  // Check forward.
    233             } else {  // pos < limit
    234                 state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    235             }
    236         } else {
    237             // Reached the end of the FCD segment.
    238             if(state == State.ITER_IN_FCD_SEGMENT) {
    239                 // The input text segment is FCD, extend it forward.
    240             } else {
    241                 // The input text segment needed to be normalized.
    242                 // Switch to checking forward from it.
    243                 if(state == State.IN_NORM_ITER_AT_START) {
    244                     iter.moveIndex(limit - start);
    245                 }
    246                 start = limit;
    247             }
    248             state = State.ITER_CHECK_FWD;
    249         }
    250     }
    251 
    252     /**
    253      * Extends the FCD text segment forward or normalizes around pos.
    254      * @return true if success
    255      */
    256     private boolean nextSegment() {
    257         assert(state == State.ITER_CHECK_FWD);
    258         // The input text [start..(iter index)[ passes the FCD check.
    259         pos = iter.getIndex();
    260         // Collect the characters being checked, in case they need to be normalized.
    261         if(s == null) {
    262             s = new StringBuilder();
    263         } else {
    264             s.setLength(0);
    265         }
    266         int prevCC = 0;
    267         for(;;) {
    268             // Fetch the next character and its fcd16 value.
    269             int c = iter.nextCodePoint();
    270             if(c < 0) { break; }
    271             int fcd16 = nfcImpl.getFCD16(c);
    272             int leadCC = fcd16 >> 8;
    273             if(leadCC == 0 && s.length() != 0) {
    274                 // FCD boundary before this character.
    275                 iter.previousCodePoint();
    276                 break;
    277             }
    278             s.appendCodePoint(c);
    279             if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
    280                 // Fails FCD check. Find the next FCD boundary and normalize.
    281                 for(;;) {
    282                     c = iter.nextCodePoint();
    283                     if(c < 0) { break; }
    284                     if(nfcImpl.getFCD16(c) <= 0xff) {
    285                         iter.previousCodePoint();
    286                         break;
    287                     }
    288                     s.appendCodePoint(c);
    289                 }
    290                 normalize(s);
    291                 start = pos;
    292                 limit = pos + s.length();
    293                 state = State.IN_NORM_ITER_AT_LIMIT;
    294                 pos = 0;
    295                 return true;
    296             }
    297             prevCC = fcd16 & 0xff;
    298             if(prevCC == 0) {
    299                 // FCD boundary after the last character.
    300                 break;
    301             }
    302         }
    303         limit = pos + s.length();
    304         assert(pos != limit);
    305         iter.moveIndex(-s.length());
    306         state = State.ITER_IN_FCD_SEGMENT;
    307         return true;
    308     }
    309 
    310     /**
    311      * Switches to backward checking.
    312      */
    313     private void switchToBackward() {
    314         assert(state == State.ITER_CHECK_FWD ||
    315                 (state == State.ITER_IN_FCD_SEGMENT && pos == start) ||
    316                 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
    317         if(state == State.ITER_CHECK_FWD) {
    318             // Turn around from forward checking.
    319             limit = pos = iter.getIndex();
    320             if(pos == start) {
    321                 state = State.ITER_CHECK_BWD;  // Check backward.
    322             } else {  // pos > start
    323                 state = State.ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    324             }
    325         } else {
    326             // Reached the start of the FCD segment.
    327             if(state == State.ITER_IN_FCD_SEGMENT) {
    328                 // The input text segment is FCD, extend it backward.
    329             } else {
    330                 // The input text segment needed to be normalized.
    331                 // Switch to checking backward from it.
    332                 if(state == State.IN_NORM_ITER_AT_LIMIT) {
    333                     iter.moveIndex(start - limit);
    334                 }
    335                 limit = start;
    336             }
    337             state = State.ITER_CHECK_BWD;
    338         }
    339     }
    340 
    341     /**
    342      * Extends the FCD text segment backward or normalizes around pos.
    343      * @return true if success
    344      */
    345     private boolean previousSegment() {
    346         assert(state == State.ITER_CHECK_BWD);
    347         // The input text [(iter index)..limit[ passes the FCD check.
    348         pos = iter.getIndex();
    349         // Collect the characters being checked, in case they need to be normalized.
    350         if(s == null) {
    351             s = new StringBuilder();
    352         } else {
    353             s.setLength(0);
    354         }
    355         int nextCC = 0;
    356         for(;;) {
    357             // Fetch the previous character and its fcd16 value.
    358             int c = iter.previousCodePoint();
    359             if(c < 0) { break; }
    360             int fcd16 = nfcImpl.getFCD16(c);
    361             int trailCC = fcd16 & 0xff;
    362             if(trailCC == 0 && s.length() != 0) {
    363                 // FCD boundary after this character.
    364                 iter.nextCodePoint();
    365                 break;
    366             }
    367             s.appendCodePoint(c);
    368             if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    369                                 CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
    370                 // Fails FCD check. Find the previous FCD boundary and normalize.
    371                 while(fcd16 > 0xff) {
    372                     c = iter.previousCodePoint();
    373                     if(c < 0) { break; }
    374                     fcd16 = nfcImpl.getFCD16(c);
    375                     if(fcd16 == 0) {
    376                         iter.nextCodePoint();
    377                         break;
    378                     }
    379                     s.appendCodePoint(c);
    380                 }
    381                 s.reverse();
    382                 normalize(s);
    383                 limit = pos;
    384                 start = pos - s.length();
    385                 state = State.IN_NORM_ITER_AT_START;
    386                 pos = normalized.length();
    387                 return true;
    388             }
    389             nextCC = fcd16 >> 8;
    390             if(nextCC == 0) {
    391                 // FCD boundary before the following character.
    392                 break;
    393             }
    394         }
    395         start = pos - s.length();
    396         assert(pos != start);
    397         iter.moveIndex(s.length());
    398         state = State.ITER_IN_FCD_SEGMENT;
    399         return true;
    400     }
    401 
    402     private void normalize(CharSequence s) {
    403         if(normalized == null) {
    404             normalized = new StringBuilder();
    405         }
    406         // NFD without argument checking.
    407         nfcImpl.decompose(s, normalized);
    408     }
    409 
    410     private enum State {
    411         /**
    412          * The input text [start..(iter index)[ passes the FCD check.
    413          * Moving forward checks incrementally.
    414          * pos & limit are undefined.
    415          */
    416         ITER_CHECK_FWD,
    417         /**
    418          * The input text [(iter index)..limit[ passes the FCD check.
    419          * Moving backward checks incrementally.
    420          * start & pos are undefined.
    421          */
    422         ITER_CHECK_BWD,
    423         /**
    424          * The input text [start..limit[ passes the FCD check.
    425          * pos tracks the current text index.
    426          */
    427         ITER_IN_FCD_SEGMENT,
    428         /**
    429          * The input text [start..limit[ failed the FCD check and was normalized.
    430          * pos tracks the current index in the normalized string.
    431          * The text iterator is at the limit index.
    432          */
    433         IN_NORM_ITER_AT_LIMIT,
    434         /**
    435          * The input text [start..limit[ failed the FCD check and was normalized.
    436          * pos tracks the current index in the normalized string.
    437          * The text iterator is at the start index.
    438          */
    439         IN_NORM_ITER_AT_START
    440     }
    441 
    442     private State state;
    443 
    444     private int start;
    445     private int pos;
    446     private int limit;
    447 
    448     private final Normalizer2Impl nfcImpl;
    449     private StringBuilder s;
    450     private StringBuilder normalized;
    451 }
    452