// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp
*
* C++ version created on: 2010oct27
* created by: Markus W. Scherer
*/
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import com.ibm.icu.impl.Normalizer2Impl;
     17 
     18 /**
     19  * Incrementally checks the input text for FCD and normalizes where necessary.
     20  */
     21 public final class FCDUTF16CollationIterator extends UTF16CollationIterator {
     22     /**
     23      * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}.
     24      */
     25     public FCDUTF16CollationIterator(CollationData d) {
     26         super(d);
     27         nfcImpl = d.nfcImpl;
     28     }
     29 
     30     public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) {
     31         super(data, numeric, s, p);
     32         rawSeq = s;
     33         segmentStart = p;
     34         rawLimit = s.length();
     35         nfcImpl = data.nfcImpl;
     36         checkDir = 1;
     37     }
     38 
     39     @Override
     40     public boolean equals(Object other) {
     41         // Skip the UTF16CollationIterator and call its parent.
     42         if (!(other instanceof CollationIterator)
     43             || !((CollationIterator)this).equals(other)
     44             || !(other instanceof FCDUTF16CollationIterator))
     45         {
     46             return false;
     47         }
     48         FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other;
     49         // Compare the iterator state but not the text: Assume that the caller does that.
     50         if (checkDir != o.checkDir) {
     51             return false;
     52         }
     53         if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) {
     54             return false;
     55         }
     56         if (checkDir != 0 || seq == rawSeq) {
     57             return (pos - rawStart) == (o.pos - /*o.*/ rawStart);
     58         }
     59         else {
     60             return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) &&
     61                     (pos - start) == (o.pos - o.start);
     62         }
     63     }
     64 
     65     @Override
     66     public int hashCode() {
     67         assert false : "hashCode not designed";
     68         return 42; // any arbitrary constant will do
     69     }
     70 
     71     @Override
     72     public void resetToOffset(int newOffset) {
     73         reset();
     74         seq = rawSeq;
     75         start = segmentStart = pos = rawStart + newOffset;
     76         limit = rawLimit;
     77         checkDir = 1;
     78     }
     79 
     80     @Override
     81     public int getOffset() {
     82         if(checkDir != 0 || seq == rawSeq) {
     83             return pos - rawStart;
     84         } else if(pos == start) {
     85             return segmentStart - rawStart;
     86         } else {
     87             return segmentLimit - rawStart;
     88         }
     89     }
     90 
     91     @Override
     92     public void setText(boolean numeric, CharSequence s, int p) {
     93         super.setText(numeric, s, p);
     94         rawSeq = s;
     95         segmentStart = p;
     96         rawLimit = limit = s.length();
     97         checkDir = 1;
     98     }
     99 
    100     @Override
    101     public int nextCodePoint() {
    102         char c;
    103         for(;;) {
    104             if(checkDir > 0) {
    105                 if(pos == limit) {
    106                     return Collation.SENTINEL_CP;
    107                 }
    108                 c = seq.charAt(pos++);
    109                 if(CollationFCD.hasTccc(c)) {
    110                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
    111                             (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
    112                         --pos;
    113                         nextSegment();
    114                         c = seq.charAt(pos++);
    115                     }
    116                 }
    117                 break;
    118             } else if(checkDir == 0 && pos != limit) {
    119                 c = seq.charAt(pos++);
    120                 break;
    121             } else {
    122                 switchToForward();
    123             }
    124         }
    125         char trail;
    126         if(Character.isHighSurrogate(c) && pos != limit &&
    127                 Character.isLowSurrogate(trail = seq.charAt(pos))) {
    128             ++pos;
    129             return Character.toCodePoint(c, trail);
    130         } else {
    131             return c;
    132         }
    133     }
    134 
    135     @Override
    136     public int previousCodePoint() {
    137         char c;
    138         for(;;) {
    139             if(checkDir < 0) {
    140                 if(pos == start) {
    141                     return Collation.SENTINEL_CP;
    142                 }
    143                 c = seq.charAt(--pos);
    144                 if(CollationFCD.hasLccc(c)) {
    145                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
    146                             (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) {
    147                         ++pos;
    148                         previousSegment();
    149                         c = seq.charAt(--pos);
    150                     }
    151                 }
    152                 break;
    153             } else if(checkDir == 0 && pos != start) {
    154                 c = seq.charAt(--pos);
    155                 break;
    156             } else {
    157                 switchToBackward();
    158             }
    159         }
    160         char lead;
    161         if(Character.isLowSurrogate(c) && pos != start &&
    162                 Character.isHighSurrogate(lead = seq.charAt(pos - 1))) {
    163             --pos;
    164             return Character.toCodePoint(lead, c);
    165         } else {
    166             return c;
    167         }
    168     }
    169 
    170     @Override
    171     protected long handleNextCE32() {
    172         char c;
    173         for(;;) {
    174             if(checkDir > 0) {
    175                 if(pos == limit) {
    176                     return NO_CP_AND_CE32;
    177                 }
    178                 c = seq.charAt(pos++);
    179                 if(CollationFCD.hasTccc(c)) {
    180                     if(CollationFCD.maybeTibetanCompositeVowel(c) ||
    181                             (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) {
    182                         --pos;
    183                         nextSegment();
    184                         c = seq.charAt(pos++);
    185                     }
    186                 }
    187                 break;
    188             } else if(checkDir == 0 && pos != limit) {
    189                 c = seq.charAt(pos++);
    190                 break;
    191             } else {
    192                 switchToForward();
    193             }
    194         }
    195         return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c));
    196     }
    197 
    198     /* boolean foundNULTerminator(); */
    199 
    200     @Override
    201     protected void forwardNumCodePoints(int num) {
    202         // Specify the class to avoid a virtual-function indirection.
    203         // In Java, we would declare this class final.
    204         while(num > 0 && nextCodePoint() >= 0) {
    205             --num;
    206         }
    207     }
    208 
    209     @Override
    210     protected void backwardNumCodePoints(int num) {
    211         // Specify the class to avoid a virtual-function indirection.
    212         // In Java, we would declare this class final.
    213         while(num > 0 && previousCodePoint() >= 0) {
    214             --num;
    215         }
    216     }
    217 
    218     /**
    219      * Switches to forward checking if possible.
    220      * To be called when checkDir < 0 || (checkDir == 0 && pos == limit).
    221      * Returns with checkDir > 0 || (checkDir == 0 && pos != limit).
    222      */
    223     private void switchToForward() {
    224         assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit));
    225         if(checkDir < 0) {
    226             // Turn around from backward checking.
    227             start = segmentStart = pos;
    228             if(pos == segmentLimit) {
    229                 limit = rawLimit;
    230                 checkDir = 1;  // Check forward.
    231             } else {  // pos < segmentLimit
    232                 checkDir = 0;  // Stay in FCD segment.
    233             }
    234         } else {
    235             // Reached the end of the FCD segment.
    236             if(seq == rawSeq) {
    237                 // The input text segment is FCD, extend it forward.
    238             } else {
    239                 // The input text segment needed to be normalized.
    240                 // Switch to checking forward from it.
    241                 seq = rawSeq;
    242                 pos = start = segmentStart = segmentLimit;
    243                 // Note: If this segment is at the end of the input text,
    244                 // then it might help to return false to indicate that, so that
    245                 // we do not have to re-check and normalize when we turn around and go backwards.
    246                 // However, that would complicate the call sites for an optimization of an unusual case.
    247             }
    248             limit = rawLimit;
    249             checkDir = 1;
    250         }
    251     }
    252 
    253     /**
    254      * Extend the FCD text segment forward or normalize around pos.
    255      * To be called when checkDir > 0 && pos != limit.
    256      * Returns with checkDir == 0 and pos != limit.
    257      */
    258     private void nextSegment() {
    259         assert(checkDir > 0 && seq == rawSeq && pos != limit);
    260         // The input text [segmentStart..pos[ passes the FCD check.
    261         int p = pos;
    262         int prevCC = 0;
    263         for(;;) {
    264             // Fetch the next character's fcd16 value.
    265             int q = p;
    266             int c = Character.codePointAt(seq, p);
    267             p += Character.charCount(c);
    268             int fcd16 = nfcImpl.getFCD16(c);
    269             int leadCC = fcd16 >> 8;
    270             if(leadCC == 0 && q != pos) {
    271                 // FCD boundary before the [q, p[ character.
    272                 limit = segmentLimit = q;
    273                 break;
    274             }
    275             if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
    276                 // Fails FCD check. Find the next FCD boundary and normalize.
    277                 do {
    278                     q = p;
    279                     if(p == rawLimit) { break; }
    280                     c = Character.codePointAt(seq, p);
    281                     p += Character.charCount(c);
    282                 } while(nfcImpl.getFCD16(c) > 0xff);
    283                 normalize(pos, q);
    284                 pos = start;
    285                 break;
    286             }
    287             prevCC = fcd16 & 0xff;
    288             if(p == rawLimit || prevCC == 0) {
    289                 // FCD boundary after the last character.
    290                 limit = segmentLimit = p;
    291                 break;
    292             }
    293         }
    294         assert(pos != limit);
    295         checkDir = 0;
    296     }
    297 
    298     /**
    299      * Switches to backward checking.
    300      * To be called when checkDir > 0 || (checkDir == 0 && pos == start).
    301      * Returns with checkDir < 0 || (checkDir == 0 && pos != start).
    302      */
    303     private void switchToBackward() {
    304         assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start));
    305         if(checkDir > 0) {
    306             // Turn around from forward checking.
    307             limit = segmentLimit = pos;
    308             if(pos == segmentStart) {
    309                 start = rawStart;
    310                 checkDir = -1;  // Check backward.
    311             } else {  // pos > segmentStart
    312                 checkDir = 0;  // Stay in FCD segment.
    313             }
    314         } else {
    315             // Reached the start of the FCD segment.
    316             if(seq == rawSeq) {
    317                 // The input text segment is FCD, extend it backward.
    318             } else {
    319                 // The input text segment needed to be normalized.
    320                 // Switch to checking backward from it.
    321                 seq = rawSeq;
    322                 pos = limit = segmentLimit = segmentStart;
    323             }
    324             start = rawStart;
    325             checkDir = -1;
    326         }
    327     }
    328 
    329     /**
    330      * Extend the FCD text segment backward or normalize around pos.
    331      * To be called when checkDir < 0 && pos != start.
    332      * Returns with checkDir == 0 and pos != start.
    333      */
    334     private void previousSegment() {
    335         assert(checkDir < 0 && seq == rawSeq && pos != start);
    336         // The input text [pos..segmentLimit[ passes the FCD check.
    337         int p = pos;
    338         int nextCC = 0;
    339         for(;;) {
    340             // Fetch the previous character's fcd16 value.
    341             int q = p;
    342             int c = Character.codePointBefore(seq, p);
    343             p -= Character.charCount(c);
    344             int fcd16 = nfcImpl.getFCD16(c);
    345             int trailCC = fcd16 & 0xff;
    346             if(trailCC == 0 && q != pos) {
    347                 // FCD boundary after the [p, q[ character.
    348                 start = segmentStart = q;
    349                 break;
    350             }
    351             if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    352                                 CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
    353                 // Fails FCD check. Find the previous FCD boundary and normalize.
    354                 do {
    355                     q = p;
    356                     if(fcd16 <= 0xff || p == rawStart) { break; }
    357                     c = Character.codePointBefore(seq, p);
    358                     p -= Character.charCount(c);
    359                 } while((fcd16 = nfcImpl.getFCD16(c)) != 0);
    360                 normalize(q, pos);
    361                 pos = limit;
    362                 break;
    363             }
    364             nextCC = fcd16 >> 8;
    365             if(p == rawStart || nextCC == 0) {
    366                 // FCD boundary before the following character.
    367                 start = segmentStart = p;
    368                 break;
    369             }
    370         }
    371         assert(pos != start);
    372         checkDir = 0;
    373     }
    374 
    375     private void normalize(int from, int to) {
    376         if(normalized == null) {
    377             normalized = new StringBuilder();
    378         }
    379         // NFD without argument checking.
    380         nfcImpl.decompose(rawSeq, from, to, normalized, to - from);
    381         // Switch collation processing into the FCD buffer
    382         // with the result of normalizing [segmentStart, segmentLimit[.
    383         segmentStart = from;
    384         segmentLimit = to;
    385         seq = normalized;
    386         start = 0;
    387         limit = start + normalized.length();
    388     }
    389 
    390     // Text pointers: The input text is rawSeq[rawStart, rawLimit[.
    391     // (In C++, these are const UChar * pointers.
    392     // In Java, we use CharSequence rawSeq and the parent class' seq
    393     // together with int indexes.)
    394     //
    395     // checkDir > 0:
    396     //
    397     // The input text rawSeq[segmentStart..pos[ passes the FCD check.
    398     // Moving forward checks incrementally.
    399     // segmentLimit is undefined. seq == rawSeq. limit == rawLimit.
    400     //
    401     // checkDir < 0:
    402     // The input text rawSeq[pos..segmentLimit[ passes the FCD check.
    403     // Moving backward checks incrementally.
    404     // segmentStart is undefined. seq == rawSeq. start == rawStart.
    405     //
    406     // checkDir == 0:
    407     //
    408     // The input text rawSeq[segmentStart..segmentLimit[ is being processed.
    409     // These pointers are at FCD boundaries.
    410     // Either this text segment already passes the FCD check
    411     // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit,
    412     // or the current segment had to be normalized so that
    413     // rawSeq[segmentStart..segmentLimit[ turned into the normalized string,
    414     // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length().
    415     private CharSequence rawSeq;
    416     private static final int rawStart = 0;
    417     private int segmentStart;
    418     private int segmentLimit;
    419     private int rawLimit;
    420 
    421     private final Normalizer2Impl nfcImpl;
    422     private StringBuilder normalized;
    423     // Direction of incremental FCD check. See comments before rawStart.
    424     private int checkDir;
    425 }
    426