Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2010-2014, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * CollationIterator.java, ported from collationiterator.h/.cpp
     10 *
     11 * C++ version created on: 2010oct27
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import android.icu.impl.Normalizer2Impl.Hangul;
     18 import android.icu.impl.Trie2_32;
     19 import android.icu.util.BytesTrie;
     20 import android.icu.util.CharsTrie;
     21 import android.icu.util.ICUException;
     22 
     23 /**
     24  * Collation element iterator and abstract character iterator.
     25  *
     26  * When a method returns a code point value, it must be in 0..10FFFF,
     27  * except it can be negative as a sentinel value.
     28  * @hide Only a subset of ICU is exposed in Android
     29  */
     30 public abstract class CollationIterator {
     31     private static final class CEBuffer {
     32         /** Large enough for CEs of most short strings. */
     33         private static final int INITIAL_CAPACITY = 40;
     34 
     35         CEBuffer() {}
     36 
     37         void append(long ce) {
     38             if(length >= INITIAL_CAPACITY) {
     39                 ensureAppendCapacity(1);
     40             }
     41             buffer[length++] = ce;
     42         }
     43 
     44         void appendUnsafe(long ce) {
     45             buffer[length++] = ce;
     46         }
     47 
     48         void ensureAppendCapacity(int appCap) {
     49             int capacity = buffer.length;
     50             if((length + appCap) <= capacity) { return; }
     51             do {
     52                 if(capacity < 1000) {
     53                     capacity *= 4;
     54                 } else {
     55                     capacity *= 2;
     56                 }
     57             } while(capacity < (length + appCap));
     58             long[] newBuffer = new long[capacity];
     59             System.arraycopy(buffer, 0, newBuffer, 0, length);
     60             buffer = newBuffer;
     61         }
     62 
     63         void incLength() {
     64             // Use INITIAL_CAPACITY for a very simple fastpath.
     65             // (Rather than buffer.getCapacity().)
     66             if(length >= INITIAL_CAPACITY) {
     67                 ensureAppendCapacity(1);
     68             }
     69             ++length;
     70         }
     71 
     72         long set(int i, long ce) {
     73             return buffer[i] = ce;
     74         }
     75         long get(int i) { return buffer[i]; }
     76 
     77         long[] getCEs() { return buffer; }
     78 
     79         int length = 0;
     80 
     81         private long[] buffer = new long[INITIAL_CAPACITY];
     82     }
     83 
     84     // State of combining marks skipped in discontiguous contraction.
     85     // We create a state object on first use and keep it around deactivated between uses.
     86     private static final class SkippedState {
     87         // Born active but empty.
     88         SkippedState() {}
     89         void clear() {
     90             oldBuffer.setLength(0);
     91             pos = 0;
     92             // The newBuffer is reset by setFirstSkipped().
     93         }
     94 
     95         boolean isEmpty() { return oldBuffer.length() == 0; }
     96 
     97         boolean hasNext() { return pos < oldBuffer.length(); }
     98 
     99         // Requires hasNext().
    100         int next() {
    101             int c = oldBuffer.codePointAt(pos);
    102             pos += Character.charCount(c);
    103             return c;
    104         }
    105 
    106         // Accounts for one more input code point read beyond the end of the marks buffer.
    107         void incBeyond() {
    108             assert(!hasNext());
    109             ++pos;
    110         }
    111 
    112         // Goes backward through the skipped-marks buffer.
    113         // Returns the number of code points read beyond the skipped marks
    114         // that need to be backtracked through normal input.
    115         int backwardNumCodePoints(int n) {
    116             int length = oldBuffer.length();
    117             int beyond = pos - length;
    118             if(beyond > 0) {
    119                 if(beyond >= n) {
    120                     // Not back far enough to re-enter the oldBuffer.
    121                     pos -= n;
    122                     return n;
    123                 } else {
    124                     // Back out all beyond-oldBuffer code points and re-enter the buffer.
    125                     pos = oldBuffer.offsetByCodePoints(length, beyond - n);
    126                     return beyond;
    127                 }
    128             } else {
    129                 // Go backwards from inside the oldBuffer.
    130                 pos = oldBuffer.offsetByCodePoints(pos, -n);
    131                 return 0;
    132             }
    133         }
    134 
    135         void setFirstSkipped(int c) {
    136             skipLengthAtMatch = 0;
    137             newBuffer.setLength(0);
    138             newBuffer.appendCodePoint(c);
    139         }
    140 
    141         void skip(int c) {
    142             newBuffer.appendCodePoint(c);
    143         }
    144 
    145         void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
    146 
    147         // Replaces the characters we consumed with the newly skipped ones.
    148         void replaceMatch() {
    149             // Note: UnicodeString.replace() pins pos to at most length().
    150             int oldLength = oldBuffer.length();
    151             if(pos > oldLength) { pos = oldLength; }
    152             oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
    153             pos = 0;
    154         }
    155 
    156         void saveTrieState(CharsTrie trie) { trie.saveState(state); }
    157         void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
    158 
    159         // Combining marks skipped in previous discontiguous-contraction matching.
    160         // After that discontiguous contraction was completed, we start reading them from here.
    161         private final StringBuilder oldBuffer = new StringBuilder();
    162         // Combining marks newly skipped in current discontiguous-contraction matching.
    163         // These might have been read from the normal text or from the oldBuffer.
    164         private final StringBuilder newBuffer = new StringBuilder();
    165         // Reading index in oldBuffer,
    166         // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
    167         private int pos;
    168         // newBuffer.length() at the time of the last matching character.
    169         // When a partial match fails, we back out skipped and partial-matching input characters.
    170         private int skipLengthAtMatch;
    171         // We save the trie state before we attempt to match a character,
    172         // so that we can skip it and try the next one.
    173         private CharsTrie.State state = new CharsTrie.State();
    174     };
    175 
    176     /**
    177      * Partially constructs the iterator.
    178      * In Java, we cache partially constructed iterators
    179      * and finish their setup when starting to work on text
    180      * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
    181      * This avoids memory allocations for iterators that remain unused.
    182      *
    183      * <p>In C++, there is only one constructor, and iterators are
    184      * stack-allocated as needed.
    185      */
    186     public CollationIterator(CollationData d) {
    187         trie = d.trie;
    188         data = d;
    189         numCpFwd = -1;
    190         isNumeric = false;
    191         ceBuffer = null;
    192     }
    193 
    194     public CollationIterator(CollationData d, boolean numeric) {
    195         trie = d.trie;
    196         data = d;
    197         numCpFwd = -1;
    198         isNumeric = numeric;
    199         ceBuffer = new CEBuffer();
    200     }
    201 
    202     @Override
    203     public boolean equals(Object other) {
    204         // Subclasses: Call this method and then add more specific checks.
    205         // Compare the iterator state but not the collation data (trie & data fields):
    206         // Assume that the caller compares the data.
    207         // Ignore skipped since that should be unused between calls to nextCE().
    208         // (It only stays around to avoid another memory allocation.)
    209         if(other == null) { return false; }
    210         if(!this.getClass().equals(other.getClass())) { return false; }
    211         CollationIterator o = (CollationIterator)other;
    212         if(!(ceBuffer.length == o.ceBuffer.length &&
    213                 cesIndex == o.cesIndex &&
    214                 numCpFwd == o.numCpFwd &&
    215                 isNumeric == o.isNumeric)) {
    216             return false;
    217         }
    218         for(int i = 0; i < ceBuffer.length; ++i) {
    219             if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
    220         }
    221         return true;
    222     }
    223 
    224     @Override
    225     public int hashCode() {
    226         // Dummy return to prevent compile warnings.
    227         return 0;
    228     }
    229 
    230     /**
    231      * Resets the iterator state and sets the position to the specified offset.
    232      * Subclasses must implement, and must call the parent class method,
    233      * or CollationIterator.reset().
    234      */
    235     public abstract void resetToOffset(int newOffset);
    236 
    237     public abstract int getOffset();
    238 
    239     /**
    240      * Returns the next collation element.
    241      */
    242     public final long nextCE() {
    243         if(cesIndex < ceBuffer.length) {
    244             // Return the next buffered CE.
    245             return ceBuffer.get(cesIndex++);
    246         }
    247         assert cesIndex == ceBuffer.length;
    248         ceBuffer.incLength();
    249         long cAndCE32 = handleNextCE32();
    250         int c = (int)(cAndCE32 >> 32);
    251         int ce32 = (int)cAndCE32;
    252         int t = ce32 & 0xff;
    253         if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
    254             // Normal CE from the main data.
    255             // Forced-inline of ceFromSimpleCE32(ce32).
    256             return ceBuffer.set(cesIndex++,
    257                     ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
    258         }
    259         CollationData d;
    260         // The compiler should be able to optimize the previous and the following
    261         // comparisons of t with the same constant.
    262         if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
    263             if(c < 0) {
    264                 return ceBuffer.set(cesIndex++, Collation.NO_CE);
    265             }
    266             d = data.base;
    267             ce32 = d.getCE32(c);
    268             t = ce32 & 0xff;
    269             if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
    270                 // Normal CE from the base data.
    271                 return ceBuffer.set(cesIndex++,
    272                         ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
    273             }
    274         } else {
    275             d = data;
    276         }
    277         if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
    278             // Forced-inline of ceFromLongPrimaryCE32(ce32).
    279             return ceBuffer.set(cesIndex++,
    280                     ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
    281         }
    282         return nextCEFromCE32(d, c, ce32);
    283     }
    284 
    285     /**
    286      * Fetches all CEs.
    287      * @return getCEsLength()
    288      */
    289     public final int fetchCEs() {
    290         while(nextCE() != Collation.NO_CE) {
    291             // No need to loop for each expansion CE.
    292             cesIndex = ceBuffer.length;
    293         }
    294         return ceBuffer.length;
    295     }
    296 
    297     /**
    298      * Overwrites the current CE (the last one returned by nextCE()).
    299      */
    300     final void setCurrentCE(long ce) {
    301         assert cesIndex > 0;
    302         ceBuffer.set(cesIndex - 1, ce);
    303     }
    304 
    305     /**
    306      * Returns the previous collation element.
    307      */
    308     public final long previousCE(UVector32 offsets) {
    309         if(ceBuffer.length > 0) {
    310             // Return the previous buffered CE.
    311             return ceBuffer.get(--ceBuffer.length);
    312         }
    313         offsets.removeAllElements();
    314         int limitOffset = getOffset();
    315         int c = previousCodePoint();
    316         if(c < 0) { return Collation.NO_CE; }
    317         if(data.isUnsafeBackward(c, isNumeric)) {
    318             return previousCEUnsafe(c, offsets);
    319         }
    320         // Simple, safe-backwards iteration:
    321         // Get a CE going backwards, handle prefixes but no contractions.
    322         int ce32 = data.getCE32(c);
    323         CollationData d;
    324         if(ce32 == Collation.FALLBACK_CE32) {
    325             d = data.base;
    326             ce32 = d.getCE32(c);
    327         } else {
    328             d = data;
    329         }
    330         if(Collation.isSimpleOrLongCE32(ce32)) {
    331             return Collation.ceFromCE32(ce32);
    332         }
    333         appendCEsFromCE32(d, c, ce32, false);
    334         if(ceBuffer.length > 1) {
    335             offsets.addElement(getOffset());
    336             // For an expansion, the offset of each non-initial CE is the limit offset,
    337             // consistent with forward iteration.
    338             while(offsets.size() <= ceBuffer.length) {
    339                 offsets.addElement(limitOffset);
    340             };
    341         }
    342         return ceBuffer.get(--ceBuffer.length);
    343     }
    344 
    345     public final int getCEsLength() {
    346         return ceBuffer.length;
    347     }
    348 
    349     public final long getCE(int i) {
    350         return ceBuffer.get(i);
    351     }
    352 
    353     public final long[] getCEs() {
    354         return ceBuffer.getCEs();
    355     }
    356 
    357     final void clearCEs() {
    358         cesIndex = ceBuffer.length = 0;
    359     }
    360 
    361     public final void clearCEsIfNoneRemaining() {
    362         if(cesIndex == ceBuffer.length) { clearCEs(); }
    363     }
    364 
    365     /**
    366      * Returns the next code point (with post-increment).
    367      * Public for identical-level comparison and for testing.
    368      */
    369     public abstract int nextCodePoint();
    370 
    371     /**
    372      * Returns the previous code point (with pre-decrement).
    373      * Public for identical-level comparison and for testing.
    374      */
    375     public abstract int previousCodePoint();
    376 
    377     protected final void reset() {
    378         cesIndex = ceBuffer.length = 0;
    379         if(skipped != null) { skipped.clear(); }
    380     }
    381     /**
    382      * Resets the state as well as the numeric setting,
    383      * and completes the initialization.
    384      * Only exists in Java where we reset cached CollationIterator instances
    385      * rather than stack-allocating temporary ones.
    386      * (See also the constructor comments.)
    387      */
    388     protected final void reset(boolean numeric) {
    389         if(ceBuffer == null) {
    390             ceBuffer = new CEBuffer();
    391         }
    392         reset();
    393         isNumeric = numeric;
    394     }
    395 
    396     /**
    397      * Returns the next code point and its local CE32 value.
    398      * Returns Collation.FALLBACK_CE32 at the end of the text (c<0)
    399      * or when c's CE32 value is to be looked up in the base data (fallback).
    400      *
    401      * The code point is used for fallbacks, context and implicit weights.
    402      * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
    403      *
    404      * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
    405      */
    406     protected long handleNextCE32() {
    407         int c = nextCodePoint();
    408         if(c < 0) { return NO_CP_AND_CE32; }
    409         return makeCodePointAndCE32Pair(c, data.getCE32(c));
    410     }
    411     protected long makeCodePointAndCE32Pair(int c, int ce32) {
    412         return ((long)c << 32) | (ce32 & 0xffffffffL);
    413     }
    414     protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
    415 
    416     /**
    417      * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
    418      * Returns the trail surrogate in that case and advances past it,
    419      * if a trail surrogate follows the lead surrogate.
    420      * Otherwise returns any other code unit and does not advance.
    421      */
    422     protected char handleGetTrailSurrogate() {
    423         return 0;
    424     }
    425 
    426     /**
    427      * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
    428      * (Not needed in Java.)
    429      */
    430     /*protected boolean foundNULTerminator() {
    431         return false;
    432     }*/
    433 
    434     /**
    435      * @return false if surrogate code points U+D800..U+DFFF
    436      *         map to their own implicit primary weights (for UTF-16),
    437      *         or true if they map to CE(U+FFFD) (for UTF-8)
    438      */
    439     protected boolean forbidSurrogateCodePoints() {
    440         return false;
    441     }
    442 
    443     protected abstract void forwardNumCodePoints(int num);
    444 
    445     protected abstract void backwardNumCodePoints(int num);
    446 
    447     /**
    448      * Returns the CE32 from the data trie.
    449      * Normally the same as data.getCE32(), but overridden in the builder.
    450      * Call this only when the faster data.getCE32() cannot be used.
    451      */
    452     protected int getDataCE32(int c) {
    453         return data.getCE32(c);
    454     }
    455 
    456     protected int getCE32FromBuilderData(int ce32) {
    457         throw new ICUException("internal program error: should be unreachable");
    458     }
    459 
    460     protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
    461                            boolean forward) {
    462         while(Collation.isSpecialCE32(ce32)) {
    463             switch(Collation.tagFromCE32(ce32)) {
    464             case Collation.FALLBACK_TAG:
    465             case Collation.RESERVED_TAG_3:
    466                 throw new ICUException("internal program error: should be unreachable");
    467             case Collation.LONG_PRIMARY_TAG:
    468                 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
    469                 return;
    470             case Collation.LONG_SECONDARY_TAG:
    471                 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
    472                 return;
    473             case Collation.LATIN_EXPANSION_TAG:
    474                 ceBuffer.ensureAppendCapacity(2);
    475                 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
    476                 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
    477                 ceBuffer.length += 2;
    478                 return;
    479             case Collation.EXPANSION32_TAG: {
    480                 int index = Collation.indexFromCE32(ce32);
    481                 int length = Collation.lengthFromCE32(ce32);
    482                 ceBuffer.ensureAppendCapacity(length);
    483                 do {
    484                     ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
    485                 } while(--length > 0);
    486                 return;
    487             }
    488             case Collation.EXPANSION_TAG: {
    489                 int index = Collation.indexFromCE32(ce32);
    490                 int length = Collation.lengthFromCE32(ce32);
    491                 ceBuffer.ensureAppendCapacity(length);
    492                 do {
    493                     ceBuffer.appendUnsafe(d.ces[index++]);
    494                 } while(--length > 0);
    495                 return;
    496             }
    497             case Collation.BUILDER_DATA_TAG:
    498                 ce32 = getCE32FromBuilderData(ce32);
    499                 if(ce32 == Collation.FALLBACK_CE32) {
    500                     d = data.base;
    501                     ce32 = d.getCE32(c);
    502                 }
    503                 break;
    504             case Collation.PREFIX_TAG:
    505                 if(forward) { backwardNumCodePoints(1); }
    506                 ce32 = getCE32FromPrefix(d, ce32);
    507                 if(forward) { forwardNumCodePoints(1); }
    508                 break;
    509             case Collation.CONTRACTION_TAG: {
    510                 int index = Collation.indexFromCE32(ce32);
    511                 int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
    512                 if(!forward) {
    513                     // Backward contractions are handled by previousCEUnsafe().
    514                     // c has contractions but they were not found.
    515                     ce32 = defaultCE32;
    516                     break;
    517                 }
    518                 int nextCp;
    519                 if(skipped == null && numCpFwd < 0) {
    520                     // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
    521                     // avoiding the function call and the nextSkippedCodePoint() overhead.
    522                     nextCp = nextCodePoint();
    523                     if(nextCp < 0) {
    524                         // No more text.
    525                         ce32 = defaultCE32;
    526                         break;
    527                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
    528                             !CollationFCD.mayHaveLccc(nextCp)) {
    529                         // All contraction suffixes start with characters with lccc!=0
    530                         // but the next code point has lccc==0.
    531                         backwardNumCodePoints(1);
    532                         ce32 = defaultCE32;
    533                         break;
    534                     }
    535                 } else {
    536                     nextCp = nextSkippedCodePoint();
    537                     if(nextCp < 0) {
    538                         // No more text.
    539                         ce32 = defaultCE32;
    540                         break;
    541                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
    542                             !CollationFCD.mayHaveLccc(nextCp)) {
    543                         // All contraction suffixes start with characters with lccc!=0
    544                         // but the next code point has lccc==0.
    545                         backwardNumSkipped(1);
    546                         ce32 = defaultCE32;
    547                         break;
    548                     }
    549                 }
    550                 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
    551                 if(ce32 == Collation.NO_CE32) {
    552                     // CEs from a discontiguous contraction plus the skipped combining marks
    553                     // have been appended already.
    554                     return;
    555                 }
    556                 break;
    557             }
    558             case Collation.DIGIT_TAG:
    559                 if(isNumeric) {
    560                     appendNumericCEs(ce32, forward);
    561                     return;
    562                 } else {
    563                     // Fetch the non-numeric-collation CE32 and continue.
    564                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
    565                     break;
    566                 }
    567             case Collation.U0000_TAG:
    568                 assert(c == 0);
    569                 // NUL-terminated input not supported in Java.
    570                 // Fetch the normal ce32 for U+0000 and continue.
    571                 ce32 = d.ce32s[0];
    572                 break;
    573             case Collation.HANGUL_TAG: {
    574                 int[] jamoCE32s = d.jamoCE32s;
    575                 c -= Hangul.HANGUL_BASE;
    576                 int t = c % Hangul.JAMO_T_COUNT;
    577                 c /= Hangul.JAMO_T_COUNT;
    578                 int v = c % Hangul.JAMO_V_COUNT;
    579                 c /= Hangul.JAMO_V_COUNT;
    580                 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
    581                     // None of the Jamo CE32s are isSpecialCE32().
    582                     // Avoid recursive function calls and per-Jamo tests.
    583                     ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
    584                     ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
    585                     ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
    586                     ceBuffer.length += 2;
    587                     if(t != 0) {
    588                         ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
    589                     }
    590                     return;
    591                 } else {
    592                     // We should not need to compute each Jamo code point.
    593                     // In particular, there should be no offset or implicit ce32.
    594                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
    595                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
    596                     if(t == 0) { return; }
    597                     // offset 39 = 19 + 21 - 1:
    598                     // 19 = JAMO_L_COUNT
    599                     // 21 = JAMO_T_COUNT
    600                     // -1 = omit t==0
    601                     ce32 = jamoCE32s[39 + t];
    602                     c = Collation.SENTINEL_CP;
    603                     break;
    604                 }
    605             }
    606             case Collation.LEAD_SURROGATE_TAG: {
    607                 assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
    608                 assert(isLeadSurrogate(c));
    609                 char trail;
    610                 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
    611                     c = Character.toCodePoint((char)c, trail);
    612                     ce32 &= Collation.LEAD_TYPE_MASK;
    613                     if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
    614                         ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
    615                     } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
    616                             (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
    617                         // fall back to the base data
    618                         d = d.base;
    619                         ce32 = d.getCE32FromSupplementary(c);
    620                     }
    621                 } else {
    622                     // c is an unpaired surrogate.
    623                     ce32 = Collation.UNASSIGNED_CE32;
    624                 }
    625                 break;
    626             }
    627             case Collation.OFFSET_TAG:
    628                 assert(c >= 0);
    629                 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
    630                 return;
    631             case Collation.IMPLICIT_TAG:
    632                 assert(c >= 0);
    633                 if(isSurrogate(c) && forbidSurrogateCodePoints()) {
    634                     ce32 = Collation.FFFD_CE32;
    635                     break;
    636                 } else {
    637                     ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
    638                     return;
    639                 }
    640             }
    641         }
    642         ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
    643     }
    644 
    645     // TODO: Propose widening the UTF16 method.
    646     private static final boolean isSurrogate(int c) {
    647         return (c & 0xfffff800) == 0xd800;
    648     }
    649 
    650     // TODO: Propose widening the UTF16 method.
    651     protected static final boolean isLeadSurrogate(int c) {
    652         return (c & 0xfffffc00) == 0xd800;
    653     }
    654 
    655     // TODO: Propose widening the UTF16 method.
    656     protected static final boolean isTrailSurrogate(int c) {
    657         return (c & 0xfffffc00) == 0xdc00;
    658     }
    659 
    660     // Main lookup trie of the data object.
    661     protected final Trie2_32 trie;
    662     protected final CollationData data;
    663 
    664     private final long nextCEFromCE32(CollationData d, int c, int ce32) {
    665         --ceBuffer.length;  // Undo ceBuffer.incLength().
    666         appendCEsFromCE32(d, c, ce32, true);
    667         return ceBuffer.get(cesIndex++);
    668     }
    669 
    670     private final int getCE32FromPrefix(CollationData d, int ce32) {
    671         int index = Collation.indexFromCE32(ce32);
    672         ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
    673         index += 2;
    674         // Number of code points read before the original code point.
    675         int lookBehind = 0;
    676         CharsTrie prefixes = new CharsTrie(d.contexts, index);
    677         for(;;) {
    678             int c = previousCodePoint();
    679             if(c < 0) { break; }
    680             ++lookBehind;
    681             BytesTrie.Result match = prefixes.nextForCodePoint(c);
    682             if(match.hasValue()) {
    683                 ce32 = prefixes.getValue();
    684             }
    685             if(!match.hasNext()) { break; }
    686         }
    687         forwardNumCodePoints(lookBehind);
    688         return ce32;
    689     }
    690 
    691     private final int nextSkippedCodePoint() {
    692         if(skipped != null && skipped.hasNext()) { return skipped.next(); }
    693         if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
    694         int c = nextCodePoint();
    695         if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
    696         if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
    697         return c;
    698     }
    699 
    700     private final void backwardNumSkipped(int n) {
    701         if(skipped != null && !skipped.isEmpty()) {
    702             n = skipped.backwardNumCodePoints(n);
    703         }
    704         backwardNumCodePoints(n);
    705         if(numCpFwd >= 0) { numCpFwd += n; }
    706     }
    707 
    708     private final int nextCE32FromContraction(
    709             CollationData d, int contractionCE32,
    710             CharSequence trieChars, int trieOffset, int ce32, int c) {
    711         // c: next code point after the original one
    712 
    713         // Number of code points read beyond the original code point.
    714         // Needed for discontiguous contraction matching.
    715         int lookAhead = 1;
    716         // Number of code points read since the last match (initially only c).
    717         int sinceMatch = 1;
    718         // Normally we only need a contiguous match,
    719         // and therefore need not remember the suffixes state from before a mismatch for retrying.
    720         // If we are already processing skipped combining marks, then we do track the state.
    721         CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
    722         if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
    723         BytesTrie.Result match = suffixes.firstForCodePoint(c);
    724         for(;;) {
    725             int nextCp;
    726             if(match.hasValue()) {
    727                 ce32 = suffixes.getValue();
    728                 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
    729                     return ce32;
    730                 }
    731                 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
    732                 sinceMatch = 1;
    733             } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
    734                 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
    735                 // Back up if necessary, and try a discontiguous contraction.
    736                 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
    737                         // Discontiguous contraction matching extends an existing match.
    738                         // If there is no match yet, then there is nothing to do.
    739                         ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
    740                             sinceMatch < lookAhead)) {
    741                     // The last character of at least one suffix has lccc!=0,
    742                     // allowing for discontiguous contractions.
    743                     // UCA S2.1.1 only processes non-starters immediately following
    744                     // "a match in the table" (sinceMatch=1).
    745                     if(sinceMatch > 1) {
    746                         // Return to the state after the last match.
    747                         // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
    748                         backwardNumSkipped(sinceMatch);
    749                         c = nextSkippedCodePoint();
    750                         lookAhead -= sinceMatch - 1;
    751                         sinceMatch = 1;
    752                     }
    753                     if(d.getFCD16(c) > 0xff) {
    754                         return nextCE32FromDiscontiguousContraction(
    755                             d, suffixes, ce32, lookAhead, c);
    756                     }
    757                 }
    758                 break;
    759             } else {
    760                 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
    761                 // It does not have a result value, therefore it is not itself "a match in the table".
    762                 // If a partially-matched c has ccc!=0 then
    763                 // it might be skipped in discontiguous contraction.
    764                 c = nextCp;
    765                 ++sinceMatch;
    766             }
    767             ++lookAhead;
    768             match = suffixes.nextForCodePoint(c);
    769         }
    770         backwardNumSkipped(sinceMatch);
    771         return ce32;
    772     }
    773 
    774     private final int nextCE32FromDiscontiguousContraction(
    775             CollationData d, CharsTrie suffixes, int ce32,
    776             int lookAhead, int c) {
    777         // UCA section 3.3.2 Contractions:
    778         // Contractions that end with non-starter characters
    779         // are known as discontiguous contractions.
    780         // ... discontiguous contractions must be detected in input text
    781         // whenever the final sequence of non-starter characters could be rearranged
    782         // so as to make a contiguous matching sequence that is canonically equivalent.
    783 
    784         // UCA: http://www.unicode.org/reports/tr10/#S2.1
    785         // S2.1 Find the longest initial substring S at each point that has a match in the table.
    786         // S2.1.1 If there are any non-starters following S, process each non-starter C.
    787         // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
    788         //     Note: A non-starter in a string is called blocked
    789         //     if there is another non-starter of the same canonical combining class or zero
    790         //     between it and the last character of canonical combining class 0.
    791         // S2.1.3 If there is a match, replace S by S + C, and remove C.
    792 
    793         // First: Is a discontiguous contraction even possible?
    794         int fcd16 = d.getFCD16(c);
    795         assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
    796         int nextCp = nextSkippedCodePoint();
    797         if(nextCp < 0) {
    798             // No further text.
    799             backwardNumSkipped(1);
    800             return ce32;
    801         }
    802         ++lookAhead;
    803         int prevCC = fcd16 & 0xff;
    804         fcd16 = d.getFCD16(nextCp);
    805         if(fcd16 <= 0xff) {
    806             // The next code point after c is a starter (S2.1.1 "process each non-starter").
    807             backwardNumSkipped(2);
    808             return ce32;
    809         }
    810 
    811         // We have read and matched (lookAhead-2) code points,
    812         // read non-matching c and peeked ahead at nextCp.
    813         // Return to the state before the mismatch and continue matching with nextCp.
    814         if(skipped == null || skipped.isEmpty()) {
    815             if(skipped == null) {
    816                 skipped = new SkippedState();
    817             }
    818             suffixes.reset();
    819             if(lookAhead > 2) {
    820                 // Replay the partial match so far.
    821                 backwardNumCodePoints(lookAhead);
    822                 suffixes.firstForCodePoint(nextCodePoint());
    823                 for(int i = 3; i < lookAhead; ++i) {
    824                     suffixes.nextForCodePoint(nextCodePoint());
    825                 }
    826                 // Skip c (which did not match) and nextCp (which we will try now).
    827                 forwardNumCodePoints(2);
    828             }
    829             skipped.saveTrieState(suffixes);
    830         } else {
    831             // Reset to the trie state before the failed match of c.
    832             skipped.resetToTrieState(suffixes);
    833         }
    834 
    835         skipped.setFirstSkipped(c);
    836         // Number of code points read since the last match (at this point: c and nextCp).
    837         int sinceMatch = 2;
    838         c = nextCp;
    839         for(;;) {
    840             BytesTrie.Result match;
    841             // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
    842             if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
    843                 // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
    844                 // Keep prevCC unchanged.
    845                 ce32 = suffixes.getValue();
    846                 sinceMatch = 0;
    847                 skipped.recordMatch();
    848                 if(!match.hasNext()) { break; }
    849                 skipped.saveTrieState(suffixes);
    850             } else {
    851                 // No match for "S + C", skip C.
    852                 skipped.skip(c);
    853                 skipped.resetToTrieState(suffixes);
    854                 prevCC = fcd16 & 0xff;
    855             }
    856             if((c = nextSkippedCodePoint()) < 0) { break; }
    857             ++sinceMatch;
    858             fcd16 = d.getFCD16(c);
    859             if(fcd16 <= 0xff) {
    860                 // The next code point after c is a starter (S2.1.1 "process each non-starter").
    861                 break;
    862             }
    863         }
    864         backwardNumSkipped(sinceMatch);
    865         boolean isTopDiscontiguous = skipped.isEmpty();
    866         skipped.replaceMatch();
    867         if(isTopDiscontiguous && !skipped.isEmpty()) {
    868             // We did get a match after skipping one or more combining marks,
    869             // and we are not in a recursive discontiguous contraction.
    870             // Append CEs from the contraction ce32
    871             // and then from the combining marks that we skipped before the match.
    872             c = Collation.SENTINEL_CP;
    873             for(;;) {
    874                 appendCEsFromCE32(d, c, ce32, true);
    875                 // Fetch CE32s for skipped combining marks from the normal data, with fallback,
    876                 // rather than from the CollationData where we found the contraction.
    877                 if(!skipped.hasNext()) { break; }
    878                 c = skipped.next();
    879                 ce32 = getDataCE32(c);
    880                 if(ce32 == Collation.FALLBACK_CE32) {
    881                     d = data.base;
    882                     ce32 = d.getCE32(c);
    883                 } else {
    884                     d = data;
    885                 }
    886                 // Note: A nested discontiguous-contraction match
    887                 // replaces consumed combining marks with newly skipped ones
    888                 // and resets the reading position to the beginning.
    889             }
    890             skipped.clear();
    891             ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
    892         }
    893         return ce32;
    894     }
    895 
    896     /**
    897      * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
    898      */
    899     private final long previousCEUnsafe(int c, UVector32 offsets) {
    900         // We just move through the input counting safe and unsafe code points
    901         // without collecting the unsafe-backward substring into a buffer and
    902         // switching to it.
    903         // This is to keep the logic simple. Otherwise we would have to handle
    904         // prefix matching going before the backward buffer, switching
    905         // to iteration and back, etc.
    906         // In the most important case of iterating over a normal string,
    907         // reading from the string itself is already maximally fast.
    908         // The only drawback there is that after getting the CEs we always
    909         // skip backward to the safe character rather than switching out
    910         // of a backwardBuffer.
    911         // But this should not be the common case for previousCE(),
    912         // and correctness and maintainability are more important than
    913         // complex optimizations.
    914         // Find the first safe character before c.
    915         int numBackward = 1;
    916         while((c = previousCodePoint()) >= 0) {
    917             ++numBackward;
    918             if(!data.isUnsafeBackward(c, isNumeric)) {
    919                 break;
    920             }
    921         }
    922         // Set the forward iteration limit.
    923         // Note: This counts code points.
    924         // We cannot enforce a limit in the middle of a surrogate pair or similar.
    925         numCpFwd = numBackward;
    926         // Reset the forward iterator.
    927         cesIndex = 0;
    928         assert(ceBuffer.length == 0);
    929         // Go forward and collect the CEs.
    930         int offset = getOffset();
    931         while(numCpFwd > 0) {
    932             // nextCE() normally reads one code point.
    933             // Contraction matching and digit specials read more and check numCpFwd.
    934             --numCpFwd;
    935             // Append one or more CEs to the ceBuffer.
    936             nextCE();
    937             assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
    938             // No need to loop for getting each expansion CE from nextCE().
    939             cesIndex = ceBuffer.length;
    940             // However, we need to write an offset for each CE.
    941             // This is for CollationElementIterator.getOffset() to return
    942             // intermediate offsets from the unsafe-backwards segment.
    943             assert(offsets.size() < ceBuffer.length);
    944             offsets.addElement(offset);
    945             // For an expansion, the offset of each non-initial CE is the limit offset,
    946             // consistent with forward iteration.
    947             offset = getOffset();
    948             while(offsets.size() < ceBuffer.length) {
    949                 offsets.addElement(offset);
    950             };
    951         }
    952         assert(offsets.size() == ceBuffer.length);
    953         // End offset corresponding to just after the unsafe-backwards segment.
    954         offsets.addElement(offset);
    955         // Reset the forward iteration limit
    956         // and move backward to before the segment for which we fetched CEs.
    957         numCpFwd = -1;
    958         backwardNumCodePoints(numBackward);
    959         // Use the collected CEs and return the last one.
    960         cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
    961         return ceBuffer.get(--ceBuffer.length);
    962     }
    963 
    964     /**
    965      * Turns a string of digits (bytes 0..9)
    966      * into a sequence of CEs that will sort in numeric order.
    967      *
    968      * Starts from this ce32's digit value and consumes the following/preceding digits.
    969      * The digits string must not be empty and must not have leading zeros.
    970      */
    971     private final void appendNumericCEs(int ce32, boolean forward) {
    972         // Collect digits.
    973         // TODO: Use some kind of a byte buffer? We only store values 0..9.
    974         StringBuilder digits = new StringBuilder();
    975         if(forward) {
    976             for(;;) {
    977                 char digit = Collation.digitFromCE32(ce32);
    978                 digits.append(digit);
    979                 if(numCpFwd == 0) { break; }
    980                 int c = nextCodePoint();
    981                 if(c < 0) { break; }
    982                 ce32 = data.getCE32(c);
    983                 if(ce32 == Collation.FALLBACK_CE32) {
    984                     ce32 = data.base.getCE32(c);
    985                 }
    986                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
    987                     backwardNumCodePoints(1);
    988                     break;
    989                 }
    990                 if(numCpFwd > 0) { --numCpFwd; }
    991             }
    992         } else {
    993             for(;;) {
    994                 char digit = Collation.digitFromCE32(ce32);
    995                 digits.append(digit);
    996                 int c = previousCodePoint();
    997                 if(c < 0) { break; }
    998                 ce32 = data.getCE32(c);
    999                 if(ce32 == Collation.FALLBACK_CE32) {
   1000                     ce32 = data.base.getCE32(c);
   1001                 }
   1002                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
   1003                     forwardNumCodePoints(1);
   1004                     break;
   1005                 }
   1006             }
   1007             // Reverse the digit string.
   1008             digits.reverse();
   1009         }
   1010         int pos = 0;
   1011         do {
   1012             // Skip leading zeros.
   1013             while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
   1014             // Write a sequence of CEs for at most 254 digits at a time.
   1015             int segmentLength = digits.length() - pos;
   1016             if(segmentLength > 254) { segmentLength = 254; }
   1017             appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
   1018             pos += segmentLength;
   1019         } while(pos < digits.length());
   1020     }
   1021 
   1022     /**
   1023      * Turns 1..254 digits into a sequence of CEs.
   1024      * Called by appendNumericCEs() for each segment of at most 254 digits.
   1025      */
   1026     private final void appendNumericSegmentCEs(CharSequence digits) {
   1027         int length = digits.length();
   1028         assert(1 <= length && length <= 254);
   1029         assert(length == 1 || digits.charAt(0) != 0);
   1030         long numericPrimary = data.numericPrimary;
   1031         // Note: We use primary byte values 2..255: digits are not compressible.
   1032         if(length <= 7) {
   1033             // Very dense encoding for small numbers.
   1034             int value = digits.charAt(0);
   1035             for(int i = 1; i < length; ++i) {
   1036                 value = value * 10 + digits.charAt(i);
   1037             }
   1038             // Primary weight second byte values:
   1039             //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
   1040             //     40 byte values  76..115 for medium numbers in three-byte primary weights.
   1041             //     16 byte values 116..131 for large numbers in four-byte primary weights.
   1042             //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
   1043             int firstByte = 2;
   1044             int numBytes = 74;
   1045             if(value < numBytes) {
   1046                 // Two-byte primary for 0..73, good for day & month numbers etc.
   1047                 long primary = numericPrimary | ((firstByte + value) << 16);
   1048                 ceBuffer.append(Collation.makeCE(primary));
   1049                 return;
   1050             }
   1051             value -= numBytes;
   1052             firstByte += numBytes;
   1053             numBytes = 40;
   1054             if(value < numBytes * 254) {
   1055                 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
   1056                 long primary = numericPrimary |
   1057                     ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
   1058                 ceBuffer.append(Collation.makeCE(primary));
   1059                 return;
   1060             }
   1061             value -= numBytes * 254;
   1062             firstByte += numBytes;
   1063             numBytes = 16;
   1064             if(value < numBytes * 254 * 254) {
   1065                 // Four-byte primary for 10234..1042489=10234+16*254*254-1.
   1066                 long primary = numericPrimary | (2 + value % 254);
   1067                 value /= 254;
   1068                 primary |= (2 + value % 254) << 8;
   1069                 value /= 254;
   1070                 primary |= (firstByte + value % 254) << 16;
   1071                 ceBuffer.append(Collation.makeCE(primary));
   1072                 return;
   1073             }
   1074             // original value > 1042489
   1075         }
   1076         assert(length >= 7);
   1077 
   1078         // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
   1079         // then we generate primary bytes with those pairs.
   1080         // Omit trailing 00 pairs.
   1081         // Decrement the value for the last pair.
   1082 
   1083         // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
   1084         int numPairs = (length + 1) / 2;
   1085         long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
   1086         // Find the length without trailing 00 pairs.
   1087         while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
   1088             length -= 2;
   1089         }
   1090         // Read the first pair.
   1091         int pair;
   1092         int pos;
   1093         if((length & 1) != 0) {
   1094             // Only "half a pair" if we have an odd number of digits.
   1095             pair = digits.charAt(0);
   1096             pos = 1;
   1097         } else {
   1098             pair = digits.charAt(0) * 10 + digits.charAt(1);
   1099             pos = 2;
   1100         }
   1101         pair = 11 + 2 * pair;
   1102         // Add the pairs of digits between pos and length.
   1103         int shift = 8;
   1104         while(pos < length) {
   1105             if(shift == 0) {
   1106                 // Every three pairs/bytes we need to store a 4-byte-primary CE
   1107                 // and start with a new CE with the '0' primary lead byte.
   1108                 primary |= pair;
   1109                 ceBuffer.append(Collation.makeCE(primary));
   1110                 primary = numericPrimary;
   1111                 shift = 16;
   1112             } else {
   1113                 primary |= pair << shift;
   1114                 shift -= 8;
   1115             }
   1116             pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
   1117             pos += 2;
   1118         }
   1119         primary |= (pair - 1) << shift;
   1120         ceBuffer.append(Collation.makeCE(primary));
   1121     }
   1122 
   1123     private CEBuffer ceBuffer;
   1124     private int cesIndex;
   1125 
   1126     private SkippedState skipped;
   1127 
   1128     // Number of code points to read forward, or -1.
   1129     // Used as a forward iteration limit in previousCEUnsafe().
   1130     private int numCpFwd;
   1131     // Numeric collation (CollationSettings.NUMERIC).
   1132     private boolean isNumeric;
   1133 }
   1134