Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * CollationIterator.java, ported from collationiterator.h/.cpp
      9 *
     10 * C++ version created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import com.ibm.icu.impl.Normalizer2Impl.Hangul;
     17 import com.ibm.icu.impl.Trie2_32;
     18 import com.ibm.icu.util.BytesTrie;
     19 import com.ibm.icu.util.CharsTrie;
     20 import com.ibm.icu.util.ICUException;
     21 
     22 /**
     23  * Collation element iterator and abstract character iterator.
     24  *
     25  * When a method returns a code point value, it must be in 0..10FFFF,
     26  * except it can be negative as a sentinel value.
     27  */
     28 public abstract class CollationIterator {
     29     private static final class CEBuffer {
     30         /** Large enough for CEs of most short strings. */
     31         private static final int INITIAL_CAPACITY = 40;
     32 
     33         CEBuffer() {}
     34 
     35         void append(long ce) {
     36             if(length >= INITIAL_CAPACITY) {
     37                 ensureAppendCapacity(1);
     38             }
     39             buffer[length++] = ce;
     40         }
     41 
     42         void appendUnsafe(long ce) {
     43             buffer[length++] = ce;
     44         }
     45 
     46         void ensureAppendCapacity(int appCap) {
     47             int capacity = buffer.length;
     48             if((length + appCap) <= capacity) { return; }
     49             do {
     50                 if(capacity < 1000) {
     51                     capacity *= 4;
     52                 } else {
     53                     capacity *= 2;
     54                 }
     55             } while(capacity < (length + appCap));
     56             long[] newBuffer = new long[capacity];
     57             System.arraycopy(buffer, 0, newBuffer, 0, length);
     58             buffer = newBuffer;
     59         }
     60 
     61         void incLength() {
     62             // Use INITIAL_CAPACITY for a very simple fastpath.
     63             // (Rather than buffer.getCapacity().)
     64             if(length >= INITIAL_CAPACITY) {
     65                 ensureAppendCapacity(1);
     66             }
     67             ++length;
     68         }
     69 
     70         long set(int i, long ce) {
     71             return buffer[i] = ce;
     72         }
     73         long get(int i) { return buffer[i]; }
     74 
     75         long[] getCEs() { return buffer; }
     76 
     77         int length = 0;
     78 
     79         private long[] buffer = new long[INITIAL_CAPACITY];
     80     }
     81 
     82     // State of combining marks skipped in discontiguous contraction.
     83     // We create a state object on first use and keep it around deactivated between uses.
     84     private static final class SkippedState {
     85         // Born active but empty.
     86         SkippedState() {}
     87         void clear() {
     88             oldBuffer.setLength(0);
     89             pos = 0;
     90             // The newBuffer is reset by setFirstSkipped().
     91         }
     92 
     93         boolean isEmpty() { return oldBuffer.length() == 0; }
     94 
     95         boolean hasNext() { return pos < oldBuffer.length(); }
     96 
     97         // Requires hasNext().
     98         int next() {
     99             int c = oldBuffer.codePointAt(pos);
    100             pos += Character.charCount(c);
    101             return c;
    102         }
    103 
    104         // Accounts for one more input code point read beyond the end of the marks buffer.
    105         void incBeyond() {
    106             assert(!hasNext());
    107             ++pos;
    108         }
    109 
    110         // Goes backward through the skipped-marks buffer.
    111         // Returns the number of code points read beyond the skipped marks
    112         // that need to be backtracked through normal input.
    113         int backwardNumCodePoints(int n) {
    114             int length = oldBuffer.length();
    115             int beyond = pos - length;
    116             if(beyond > 0) {
    117                 if(beyond >= n) {
    118                     // Not back far enough to re-enter the oldBuffer.
    119                     pos -= n;
    120                     return n;
    121                 } else {
    122                     // Back out all beyond-oldBuffer code points and re-enter the buffer.
    123                     pos = oldBuffer.offsetByCodePoints(length, beyond - n);
    124                     return beyond;
    125                 }
    126             } else {
    127                 // Go backwards from inside the oldBuffer.
    128                 pos = oldBuffer.offsetByCodePoints(pos, -n);
    129                 return 0;
    130             }
    131         }
    132 
    133         void setFirstSkipped(int c) {
    134             skipLengthAtMatch = 0;
    135             newBuffer.setLength(0);
    136             newBuffer.appendCodePoint(c);
    137         }
    138 
    139         void skip(int c) {
    140             newBuffer.appendCodePoint(c);
    141         }
    142 
    143         void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
    144 
    145         // Replaces the characters we consumed with the newly skipped ones.
    146         void replaceMatch() {
    147             // Note: UnicodeString.replace() pins pos to at most length().
    148             int oldLength = oldBuffer.length();
    149             if(pos > oldLength) { pos = oldLength; }
    150             oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
    151             pos = 0;
    152         }
    153 
    154         void saveTrieState(CharsTrie trie) { trie.saveState(state); }
    155         void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
    156 
    157         // Combining marks skipped in previous discontiguous-contraction matching.
    158         // After that discontiguous contraction was completed, we start reading them from here.
    159         private final StringBuilder oldBuffer = new StringBuilder();
    160         // Combining marks newly skipped in current discontiguous-contraction matching.
    161         // These might have been read from the normal text or from the oldBuffer.
    162         private final StringBuilder newBuffer = new StringBuilder();
    163         // Reading index in oldBuffer,
    164         // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
    165         private int pos;
    166         // newBuffer.length() at the time of the last matching character.
    167         // When a partial match fails, we back out skipped and partial-matching input characters.
    168         private int skipLengthAtMatch;
    169         // We save the trie state before we attempt to match a character,
    170         // so that we can skip it and try the next one.
    171         private CharsTrie.State state = new CharsTrie.State();
    172     };
    173 
    174     /**
    175      * Partially constructs the iterator.
    176      * In Java, we cache partially constructed iterators
    177      * and finish their setup when starting to work on text
    178      * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
    179      * This avoids memory allocations for iterators that remain unused.
    180      *
    181      * <p>In C++, there is only one constructor, and iterators are
    182      * stack-allocated as needed.
    183      */
    184     public CollationIterator(CollationData d) {
    185         trie = d.trie;
    186         data = d;
    187         numCpFwd = -1;
    188         isNumeric = false;
    189         ceBuffer = null;
    190     }
    191 
    192     public CollationIterator(CollationData d, boolean numeric) {
    193         trie = d.trie;
    194         data = d;
    195         numCpFwd = -1;
    196         isNumeric = numeric;
    197         ceBuffer = new CEBuffer();
    198     }
    199 
    200     @Override
    201     public boolean equals(Object other) {
    202         // Subclasses: Call this method and then add more specific checks.
    203         // Compare the iterator state but not the collation data (trie & data fields):
    204         // Assume that the caller compares the data.
    205         // Ignore skipped since that should be unused between calls to nextCE().
    206         // (It only stays around to avoid another memory allocation.)
    207         if(other == null) { return false; }
    208         if(!this.getClass().equals(other.getClass())) { return false; }
    209         CollationIterator o = (CollationIterator)other;
    210         if(!(ceBuffer.length == o.ceBuffer.length &&
    211                 cesIndex == o.cesIndex &&
    212                 numCpFwd == o.numCpFwd &&
    213                 isNumeric == o.isNumeric)) {
    214             return false;
    215         }
    216         for(int i = 0; i < ceBuffer.length; ++i) {
    217             if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
    218         }
    219         return true;
    220     }
    221 
    222     @Override
    223     public int hashCode() {
    224         // Dummy return to prevent compile warnings.
    225         return 0;
    226     }
    227 
    228     /**
    229      * Resets the iterator state and sets the position to the specified offset.
    230      * Subclasses must implement, and must call the parent class method,
    231      * or CollationIterator.reset().
    232      */
    233     public abstract void resetToOffset(int newOffset);
    234 
    235     public abstract int getOffset();
    236 
    237     /**
    238      * Returns the next collation element.
    239      */
    240     public final long nextCE() {
    241         if(cesIndex < ceBuffer.length) {
    242             // Return the next buffered CE.
    243             return ceBuffer.get(cesIndex++);
    244         }
    245         assert cesIndex == ceBuffer.length;
    246         ceBuffer.incLength();
    247         long cAndCE32 = handleNextCE32();
    248         int c = (int)(cAndCE32 >> 32);
    249         int ce32 = (int)cAndCE32;
    250         int t = ce32 & 0xff;
    251         if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
    252             // Normal CE from the main data.
    253             // Forced-inline of ceFromSimpleCE32(ce32).
    254             return ceBuffer.set(cesIndex++,
    255                     ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
    256         }
    257         CollationData d;
    258         // The compiler should be able to optimize the previous and the following
    259         // comparisons of t with the same constant.
    260         if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
    261             if(c < 0) {
    262                 return ceBuffer.set(cesIndex++, Collation.NO_CE);
    263             }
    264             d = data.base;
    265             ce32 = d.getCE32(c);
    266             t = ce32 & 0xff;
    267             if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
    268                 // Normal CE from the base data.
    269                 return ceBuffer.set(cesIndex++,
    270                         ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
    271             }
    272         } else {
    273             d = data;
    274         }
    275         if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
    276             // Forced-inline of ceFromLongPrimaryCE32(ce32).
    277             return ceBuffer.set(cesIndex++,
    278                     ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
    279         }
    280         return nextCEFromCE32(d, c, ce32);
    281     }
    282 
    283     /**
    284      * Fetches all CEs.
    285      * @return getCEsLength()
    286      */
    287     public final int fetchCEs() {
    288         while(nextCE() != Collation.NO_CE) {
    289             // No need to loop for each expansion CE.
    290             cesIndex = ceBuffer.length;
    291         }
    292         return ceBuffer.length;
    293     }
    294 
    295     /**
    296      * Overwrites the current CE (the last one returned by nextCE()).
    297      */
    298     final void setCurrentCE(long ce) {
    299         assert cesIndex > 0;
    300         ceBuffer.set(cesIndex - 1, ce);
    301     }
    302 
    303     /**
    304      * Returns the previous collation element.
    305      */
    306     public final long previousCE(UVector32 offsets) {
    307         if(ceBuffer.length > 0) {
    308             // Return the previous buffered CE.
    309             return ceBuffer.get(--ceBuffer.length);
    310         }
    311         offsets.removeAllElements();
    312         int limitOffset = getOffset();
    313         int c = previousCodePoint();
    314         if(c < 0) { return Collation.NO_CE; }
    315         if(data.isUnsafeBackward(c, isNumeric)) {
    316             return previousCEUnsafe(c, offsets);
    317         }
    318         // Simple, safe-backwards iteration:
    319         // Get a CE going backwards, handle prefixes but no contractions.
    320         int ce32 = data.getCE32(c);
    321         CollationData d;
    322         if(ce32 == Collation.FALLBACK_CE32) {
    323             d = data.base;
    324             ce32 = d.getCE32(c);
    325         } else {
    326             d = data;
    327         }
    328         if(Collation.isSimpleOrLongCE32(ce32)) {
    329             return Collation.ceFromCE32(ce32);
    330         }
    331         appendCEsFromCE32(d, c, ce32, false);
    332         if(ceBuffer.length > 1) {
    333             offsets.addElement(getOffset());
    334             // For an expansion, the offset of each non-initial CE is the limit offset,
    335             // consistent with forward iteration.
    336             while(offsets.size() <= ceBuffer.length) {
    337                 offsets.addElement(limitOffset);
    338             };
    339         }
    340         return ceBuffer.get(--ceBuffer.length);
    341     }
    342 
    343     public final int getCEsLength() {
    344         return ceBuffer.length;
    345     }
    346 
    347     public final long getCE(int i) {
    348         return ceBuffer.get(i);
    349     }
    350 
    351     public final long[] getCEs() {
    352         return ceBuffer.getCEs();
    353     }
    354 
    355     final void clearCEs() {
    356         cesIndex = ceBuffer.length = 0;
    357     }
    358 
    359     public final void clearCEsIfNoneRemaining() {
    360         if(cesIndex == ceBuffer.length) { clearCEs(); }
    361     }
    362 
    363     /**
    364      * Returns the next code point (with post-increment).
    365      * Public for identical-level comparison and for testing.
    366      */
    367     public abstract int nextCodePoint();
    368 
    369     /**
    370      * Returns the previous code point (with pre-decrement).
    371      * Public for identical-level comparison and for testing.
    372      */
    373     public abstract int previousCodePoint();
    374 
    375     protected final void reset() {
    376         cesIndex = ceBuffer.length = 0;
    377         if(skipped != null) { skipped.clear(); }
    378     }
    379     /**
    380      * Resets the state as well as the numeric setting,
    381      * and completes the initialization.
    382      * Only exists in Java where we reset cached CollationIterator instances
    383      * rather than stack-allocating temporary ones.
    384      * (See also the constructor comments.)
    385      */
    386     protected final void reset(boolean numeric) {
    387         if(ceBuffer == null) {
    388             ceBuffer = new CEBuffer();
    389         }
    390         reset();
    391         isNumeric = numeric;
    392     }
    393 
    394     /**
    395      * Returns the next code point and its local CE32 value.
    396      * Returns Collation.FALLBACK_CE32 at the end of the text (c<0)
    397      * or when c's CE32 value is to be looked up in the base data (fallback).
    398      *
    399      * The code point is used for fallbacks, context and implicit weights.
    400      * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
    401      *
    402      * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
    403      */
    404     protected long handleNextCE32() {
    405         int c = nextCodePoint();
    406         if(c < 0) { return NO_CP_AND_CE32; }
    407         return makeCodePointAndCE32Pair(c, data.getCE32(c));
    408     }
    409     protected long makeCodePointAndCE32Pair(int c, int ce32) {
    410         return ((long)c << 32) | (ce32 & 0xffffffffL);
    411     }
    412     protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
    413 
    414     /**
    415      * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
    416      * Returns the trail surrogate in that case and advances past it,
    417      * if a trail surrogate follows the lead surrogate.
    418      * Otherwise returns any other code unit and does not advance.
    419      */
    420     protected char handleGetTrailSurrogate() {
    421         return 0;
    422     }
    423 
    424     /**
    425      * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
    426      * (Not needed in Java.)
    427      */
    428     /*protected boolean foundNULTerminator() {
    429         return false;
    430     }*/
    431 
    432     /**
    433      * @return false if surrogate code points U+D800..U+DFFF
    434      *         map to their own implicit primary weights (for UTF-16),
    435      *         or true if they map to CE(U+FFFD) (for UTF-8)
    436      */
    437     protected boolean forbidSurrogateCodePoints() {
    438         return false;
    439     }
    440 
    441     protected abstract void forwardNumCodePoints(int num);
    442 
    443     protected abstract void backwardNumCodePoints(int num);
    444 
    445     /**
    446      * Returns the CE32 from the data trie.
    447      * Normally the same as data.getCE32(), but overridden in the builder.
    448      * Call this only when the faster data.getCE32() cannot be used.
    449      */
    450     protected int getDataCE32(int c) {
    451         return data.getCE32(c);
    452     }
    453 
    454     protected int getCE32FromBuilderData(int ce32) {
    455         throw new ICUException("internal program error: should be unreachable");
    456     }
    457 
    458     protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
    459                            boolean forward) {
    460         while(Collation.isSpecialCE32(ce32)) {
    461             switch(Collation.tagFromCE32(ce32)) {
    462             case Collation.FALLBACK_TAG:
    463             case Collation.RESERVED_TAG_3:
    464                 throw new ICUException("internal program error: should be unreachable");
    465             case Collation.LONG_PRIMARY_TAG:
    466                 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
    467                 return;
    468             case Collation.LONG_SECONDARY_TAG:
    469                 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
    470                 return;
    471             case Collation.LATIN_EXPANSION_TAG:
    472                 ceBuffer.ensureAppendCapacity(2);
    473                 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
    474                 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
    475                 ceBuffer.length += 2;
    476                 return;
    477             case Collation.EXPANSION32_TAG: {
    478                 int index = Collation.indexFromCE32(ce32);
    479                 int length = Collation.lengthFromCE32(ce32);
    480                 ceBuffer.ensureAppendCapacity(length);
    481                 do {
    482                     ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
    483                 } while(--length > 0);
    484                 return;
    485             }
    486             case Collation.EXPANSION_TAG: {
    487                 int index = Collation.indexFromCE32(ce32);
    488                 int length = Collation.lengthFromCE32(ce32);
    489                 ceBuffer.ensureAppendCapacity(length);
    490                 do {
    491                     ceBuffer.appendUnsafe(d.ces[index++]);
    492                 } while(--length > 0);
    493                 return;
    494             }
    495             case Collation.BUILDER_DATA_TAG:
    496                 ce32 = getCE32FromBuilderData(ce32);
    497                 if(ce32 == Collation.FALLBACK_CE32) {
    498                     d = data.base;
    499                     ce32 = d.getCE32(c);
    500                 }
    501                 break;
    502             case Collation.PREFIX_TAG:
    503                 if(forward) { backwardNumCodePoints(1); }
    504                 ce32 = getCE32FromPrefix(d, ce32);
    505                 if(forward) { forwardNumCodePoints(1); }
    506                 break;
    507             case Collation.CONTRACTION_TAG: {
    508                 int index = Collation.indexFromCE32(ce32);
    509                 int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
    510                 if(!forward) {
    511                     // Backward contractions are handled by previousCEUnsafe().
    512                     // c has contractions but they were not found.
    513                     ce32 = defaultCE32;
    514                     break;
    515                 }
    516                 int nextCp;
    517                 if(skipped == null && numCpFwd < 0) {
    518                     // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
    519                     // avoiding the function call and the nextSkippedCodePoint() overhead.
    520                     nextCp = nextCodePoint();
    521                     if(nextCp < 0) {
    522                         // No more text.
    523                         ce32 = defaultCE32;
    524                         break;
    525                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
    526                             !CollationFCD.mayHaveLccc(nextCp)) {
    527                         // All contraction suffixes start with characters with lccc!=0
    528                         // but the next code point has lccc==0.
    529                         backwardNumCodePoints(1);
    530                         ce32 = defaultCE32;
    531                         break;
    532                     }
    533                 } else {
    534                     nextCp = nextSkippedCodePoint();
    535                     if(nextCp < 0) {
    536                         // No more text.
    537                         ce32 = defaultCE32;
    538                         break;
    539                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
    540                             !CollationFCD.mayHaveLccc(nextCp)) {
    541                         // All contraction suffixes start with characters with lccc!=0
    542                         // but the next code point has lccc==0.
    543                         backwardNumSkipped(1);
    544                         ce32 = defaultCE32;
    545                         break;
    546                     }
    547                 }
    548                 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
    549                 if(ce32 == Collation.NO_CE32) {
    550                     // CEs from a discontiguous contraction plus the skipped combining marks
    551                     // have been appended already.
    552                     return;
    553                 }
    554                 break;
    555             }
    556             case Collation.DIGIT_TAG:
    557                 if(isNumeric) {
    558                     appendNumericCEs(ce32, forward);
    559                     return;
    560                 } else {
    561                     // Fetch the non-numeric-collation CE32 and continue.
    562                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
    563                     break;
    564                 }
    565             case Collation.U0000_TAG:
    566                 assert(c == 0);
    567                 // NUL-terminated input not supported in Java.
    568                 // Fetch the normal ce32 for U+0000 and continue.
    569                 ce32 = d.ce32s[0];
    570                 break;
    571             case Collation.HANGUL_TAG: {
    572                 int[] jamoCE32s = d.jamoCE32s;
    573                 c -= Hangul.HANGUL_BASE;
    574                 int t = c % Hangul.JAMO_T_COUNT;
    575                 c /= Hangul.JAMO_T_COUNT;
    576                 int v = c % Hangul.JAMO_V_COUNT;
    577                 c /= Hangul.JAMO_V_COUNT;
    578                 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
    579                     // None of the Jamo CE32s are isSpecialCE32().
    580                     // Avoid recursive function calls and per-Jamo tests.
    581                     ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
    582                     ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
    583                     ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
    584                     ceBuffer.length += 2;
    585                     if(t != 0) {
    586                         ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
    587                     }
    588                     return;
    589                 } else {
    590                     // We should not need to compute each Jamo code point.
    591                     // In particular, there should be no offset or implicit ce32.
    592                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
    593                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
    594                     if(t == 0) { return; }
    595                     // offset 39 = 19 + 21 - 1:
    596                     // 19 = JAMO_L_COUNT
    597                     // 21 = JAMO_T_COUNT
    598                     // -1 = omit t==0
    599                     ce32 = jamoCE32s[39 + t];
    600                     c = Collation.SENTINEL_CP;
    601                     break;
    602                 }
    603             }
    604             case Collation.LEAD_SURROGATE_TAG: {
    605                 assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
    606                 assert(isLeadSurrogate(c));
    607                 char trail;
    608                 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
    609                     c = Character.toCodePoint((char)c, trail);
    610                     ce32 &= Collation.LEAD_TYPE_MASK;
    611                     if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
    612                         ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
    613                     } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
    614                             (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
    615                         // fall back to the base data
    616                         d = d.base;
    617                         ce32 = d.getCE32FromSupplementary(c);
    618                     }
    619                 } else {
    620                     // c is an unpaired surrogate.
    621                     ce32 = Collation.UNASSIGNED_CE32;
    622                 }
    623                 break;
    624             }
    625             case Collation.OFFSET_TAG:
    626                 assert(c >= 0);
    627                 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
    628                 return;
    629             case Collation.IMPLICIT_TAG:
    630                 assert(c >= 0);
    631                 if(isSurrogate(c) && forbidSurrogateCodePoints()) {
    632                     ce32 = Collation.FFFD_CE32;
    633                     break;
    634                 } else {
    635                     ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
    636                     return;
    637                 }
    638             }
    639         }
    640         ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
    641     }
    642 
    643     // TODO: Propose widening the UTF16 method.
    644     private static final boolean isSurrogate(int c) {
    645         return (c & 0xfffff800) == 0xd800;
    646     }
    647 
    648     // TODO: Propose widening the UTF16 method.
    649     protected static final boolean isLeadSurrogate(int c) {
    650         return (c & 0xfffffc00) == 0xd800;
    651     }
    652 
    653     // TODO: Propose widening the UTF16 method.
    654     protected static final boolean isTrailSurrogate(int c) {
    655         return (c & 0xfffffc00) == 0xdc00;
    656     }
    657 
    658     // Main lookup trie of the data object.
    659     protected final Trie2_32 trie;
    660     protected final CollationData data;
    661 
    662     private final long nextCEFromCE32(CollationData d, int c, int ce32) {
    663         --ceBuffer.length;  // Undo ceBuffer.incLength().
    664         appendCEsFromCE32(d, c, ce32, true);
    665         return ceBuffer.get(cesIndex++);
    666     }
    667 
    668     private final int getCE32FromPrefix(CollationData d, int ce32) {
    669         int index = Collation.indexFromCE32(ce32);
    670         ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
    671         index += 2;
    672         // Number of code points read before the original code point.
    673         int lookBehind = 0;
    674         CharsTrie prefixes = new CharsTrie(d.contexts, index);
    675         for(;;) {
    676             int c = previousCodePoint();
    677             if(c < 0) { break; }
    678             ++lookBehind;
    679             BytesTrie.Result match = prefixes.nextForCodePoint(c);
    680             if(match.hasValue()) {
    681                 ce32 = prefixes.getValue();
    682             }
    683             if(!match.hasNext()) { break; }
    684         }
    685         forwardNumCodePoints(lookBehind);
    686         return ce32;
    687     }
    688 
    689     private final int nextSkippedCodePoint() {
    690         if(skipped != null && skipped.hasNext()) { return skipped.next(); }
    691         if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
    692         int c = nextCodePoint();
    693         if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
    694         if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
    695         return c;
    696     }
    697 
    698     private final void backwardNumSkipped(int n) {
    699         if(skipped != null && !skipped.isEmpty()) {
    700             n = skipped.backwardNumCodePoints(n);
    701         }
    702         backwardNumCodePoints(n);
    703         if(numCpFwd >= 0) { numCpFwd += n; }
    704     }
    705 
    706     private final int nextCE32FromContraction(
    707             CollationData d, int contractionCE32,
    708             CharSequence trieChars, int trieOffset, int ce32, int c) {
    709         // c: next code point after the original one
    710 
    711         // Number of code points read beyond the original code point.
    712         // Needed for discontiguous contraction matching.
    713         int lookAhead = 1;
    714         // Number of code points read since the last match (initially only c).
    715         int sinceMatch = 1;
    716         // Normally we only need a contiguous match,
    717         // and therefore need not remember the suffixes state from before a mismatch for retrying.
    718         // If we are already processing skipped combining marks, then we do track the state.
    719         CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
    720         if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
    721         BytesTrie.Result match = suffixes.firstForCodePoint(c);
    722         for(;;) {
    723             int nextCp;
    724             if(match.hasValue()) {
    725                 ce32 = suffixes.getValue();
    726                 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
    727                     return ce32;
    728                 }
    729                 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
    730                 sinceMatch = 1;
    731             } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
    732                 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
    733                 // Back up if necessary, and try a discontiguous contraction.
    734                 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
    735                         // Discontiguous contraction matching extends an existing match.
    736                         // If there is no match yet, then there is nothing to do.
    737                         ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
    738                             sinceMatch < lookAhead)) {
    739                     // The last character of at least one suffix has lccc!=0,
    740                     // allowing for discontiguous contractions.
    741                     // UCA S2.1.1 only processes non-starters immediately following
    742                     // "a match in the table" (sinceMatch=1).
    743                     if(sinceMatch > 1) {
    744                         // Return to the state after the last match.
    745                         // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
    746                         backwardNumSkipped(sinceMatch);
    747                         c = nextSkippedCodePoint();
    748                         lookAhead -= sinceMatch - 1;
    749                         sinceMatch = 1;
    750                     }
    751                     if(d.getFCD16(c) > 0xff) {
    752                         return nextCE32FromDiscontiguousContraction(
    753                             d, suffixes, ce32, lookAhead, c);
    754                     }
    755                 }
    756                 break;
    757             } else {
    758                 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
    759                 // It does not have a result value, therefore it is not itself "a match in the table".
    760                 // If a partially-matched c has ccc!=0 then
    761                 // it might be skipped in discontiguous contraction.
    762                 c = nextCp;
    763                 ++sinceMatch;
    764             }
    765             ++lookAhead;
    766             match = suffixes.nextForCodePoint(c);
    767         }
    768         backwardNumSkipped(sinceMatch);
    769         return ce32;
    770     }
    771 
    772     private final int nextCE32FromDiscontiguousContraction(
    773             CollationData d, CharsTrie suffixes, int ce32,
    774             int lookAhead, int c) {
    775         // UCA section 3.3.2 Contractions:
    776         // Contractions that end with non-starter characters
    777         // are known as discontiguous contractions.
    778         // ... discontiguous contractions must be detected in input text
    779         // whenever the final sequence of non-starter characters could be rearranged
    780         // so as to make a contiguous matching sequence that is canonically equivalent.
    781 
    782         // UCA: http://www.unicode.org/reports/tr10/#S2.1
    783         // S2.1 Find the longest initial substring S at each point that has a match in the table.
    784         // S2.1.1 If there are any non-starters following S, process each non-starter C.
    785         // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
    786         //     Note: A non-starter in a string is called blocked
    787         //     if there is another non-starter of the same canonical combining class or zero
    788         //     between it and the last character of canonical combining class 0.
    789         // S2.1.3 If there is a match, replace S by S + C, and remove C.
    790 
    791         // First: Is a discontiguous contraction even possible?
    792         int fcd16 = d.getFCD16(c);
    793         assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
    794         int nextCp = nextSkippedCodePoint();
    795         if(nextCp < 0) {
    796             // No further text.
    797             backwardNumSkipped(1);
    798             return ce32;
    799         }
    800         ++lookAhead;
    801         int prevCC = fcd16 & 0xff;
    802         fcd16 = d.getFCD16(nextCp);
    803         if(fcd16 <= 0xff) {
    804             // The next code point after c is a starter (S2.1.1 "process each non-starter").
    805             backwardNumSkipped(2);
    806             return ce32;
    807         }
    808 
    809         // We have read and matched (lookAhead-2) code points,
    810         // read non-matching c and peeked ahead at nextCp.
    811         // Return to the state before the mismatch and continue matching with nextCp.
    812         if(skipped == null || skipped.isEmpty()) {
    813             if(skipped == null) {
    814                 skipped = new SkippedState();
    815             }
    816             suffixes.reset();
    817             if(lookAhead > 2) {
    818                 // Replay the partial match so far.
    819                 backwardNumCodePoints(lookAhead);
    820                 suffixes.firstForCodePoint(nextCodePoint());
    821                 for(int i = 3; i < lookAhead; ++i) {
    822                     suffixes.nextForCodePoint(nextCodePoint());
    823                 }
    824                 // Skip c (which did not match) and nextCp (which we will try now).
    825                 forwardNumCodePoints(2);
    826             }
    827             skipped.saveTrieState(suffixes);
    828         } else {
    829             // Reset to the trie state before the failed match of c.
    830             skipped.resetToTrieState(suffixes);
    831         }
    832 
    833         skipped.setFirstSkipped(c);
    834         // Number of code points read since the last match (at this point: c and nextCp).
    835         int sinceMatch = 2;
    836         c = nextCp;
    837         for(;;) {
    838             BytesTrie.Result match;
    839             // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
    840             if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
    841                 // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
    842                 // Keep prevCC unchanged.
    843                 ce32 = suffixes.getValue();
    844                 sinceMatch = 0;
    845                 skipped.recordMatch();
    846                 if(!match.hasNext()) { break; }
    847                 skipped.saveTrieState(suffixes);
    848             } else {
    849                 // No match for "S + C", skip C.
    850                 skipped.skip(c);
    851                 skipped.resetToTrieState(suffixes);
    852                 prevCC = fcd16 & 0xff;
    853             }
    854             if((c = nextSkippedCodePoint()) < 0) { break; }
    855             ++sinceMatch;
    856             fcd16 = d.getFCD16(c);
    857             if(fcd16 <= 0xff) {
    858                 // The next code point after c is a starter (S2.1.1 "process each non-starter").
    859                 break;
    860             }
    861         }
    862         backwardNumSkipped(sinceMatch);
    863         boolean isTopDiscontiguous = skipped.isEmpty();
    864         skipped.replaceMatch();
    865         if(isTopDiscontiguous && !skipped.isEmpty()) {
    866             // We did get a match after skipping one or more combining marks,
    867             // and we are not in a recursive discontiguous contraction.
    868             // Append CEs from the contraction ce32
    869             // and then from the combining marks that we skipped before the match.
    870             c = Collation.SENTINEL_CP;
    871             for(;;) {
    872                 appendCEsFromCE32(d, c, ce32, true);
    873                 // Fetch CE32s for skipped combining marks from the normal data, with fallback,
    874                 // rather than from the CollationData where we found the contraction.
    875                 if(!skipped.hasNext()) { break; }
    876                 c = skipped.next();
    877                 ce32 = getDataCE32(c);
    878                 if(ce32 == Collation.FALLBACK_CE32) {
    879                     d = data.base;
    880                     ce32 = d.getCE32(c);
    881                 } else {
    882                     d = data;
    883                 }
    884                 // Note: A nested discontiguous-contraction match
    885                 // replaces consumed combining marks with newly skipped ones
    886                 // and resets the reading position to the beginning.
    887             }
    888             skipped.clear();
    889             ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
    890         }
    891         return ce32;
    892     }
    893 
    894     /**
    895      * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
    896      */
    897     private final long previousCEUnsafe(int c, UVector32 offsets) {
    898         // We just move through the input counting safe and unsafe code points
    899         // without collecting the unsafe-backward substring into a buffer and
    900         // switching to it.
    901         // This is to keep the logic simple. Otherwise we would have to handle
    902         // prefix matching going before the backward buffer, switching
    903         // to iteration and back, etc.
    904         // In the most important case of iterating over a normal string,
    905         // reading from the string itself is already maximally fast.
    906         // The only drawback there is that after getting the CEs we always
    907         // skip backward to the safe character rather than switching out
    908         // of a backwardBuffer.
    909         // But this should not be the common case for previousCE(),
    910         // and correctness and maintainability are more important than
    911         // complex optimizations.
    912         // Find the first safe character before c.
    913         int numBackward = 1;
    914         while((c = previousCodePoint()) >= 0) {
    915             ++numBackward;
    916             if(!data.isUnsafeBackward(c, isNumeric)) {
    917                 break;
    918             }
    919         }
    920         // Set the forward iteration limit.
    921         // Note: This counts code points.
    922         // We cannot enforce a limit in the middle of a surrogate pair or similar.
    923         numCpFwd = numBackward;
    924         // Reset the forward iterator.
    925         cesIndex = 0;
    926         assert(ceBuffer.length == 0);
    927         // Go forward and collect the CEs.
    928         int offset = getOffset();
    929         while(numCpFwd > 0) {
    930             // nextCE() normally reads one code point.
    931             // Contraction matching and digit specials read more and check numCpFwd.
    932             --numCpFwd;
    933             // Append one or more CEs to the ceBuffer.
    934             nextCE();
    935             assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
    936             // No need to loop for getting each expansion CE from nextCE().
    937             cesIndex = ceBuffer.length;
    938             // However, we need to write an offset for each CE.
    939             // This is for CollationElementIterator.getOffset() to return
    940             // intermediate offsets from the unsafe-backwards segment.
    941             assert(offsets.size() < ceBuffer.length);
    942             offsets.addElement(offset);
    943             // For an expansion, the offset of each non-initial CE is the limit offset,
    944             // consistent with forward iteration.
    945             offset = getOffset();
    946             while(offsets.size() < ceBuffer.length) {
    947                 offsets.addElement(offset);
    948             };
    949         }
    950         assert(offsets.size() == ceBuffer.length);
    951         // End offset corresponding to just after the unsafe-backwards segment.
    952         offsets.addElement(offset);
    953         // Reset the forward iteration limit
    954         // and move backward to before the segment for which we fetched CEs.
    955         numCpFwd = -1;
    956         backwardNumCodePoints(numBackward);
    957         // Use the collected CEs and return the last one.
    958         cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
    959         return ceBuffer.get(--ceBuffer.length);
    960     }
    961 
    962     /**
    963      * Turns a string of digits (bytes 0..9)
    964      * into a sequence of CEs that will sort in numeric order.
    965      *
    966      * Starts from this ce32's digit value and consumes the following/preceding digits.
    967      * The digits string must not be empty and must not have leading zeros.
    968      */
    969     private final void appendNumericCEs(int ce32, boolean forward) {
    970         // Collect digits.
    971         // TODO: Use some kind of a byte buffer? We only store values 0..9.
    972         StringBuilder digits = new StringBuilder();
    973         if(forward) {
    974             for(;;) {
    975                 char digit = Collation.digitFromCE32(ce32);
    976                 digits.append(digit);
    977                 if(numCpFwd == 0) { break; }
    978                 int c = nextCodePoint();
    979                 if(c < 0) { break; }
    980                 ce32 = data.getCE32(c);
    981                 if(ce32 == Collation.FALLBACK_CE32) {
    982                     ce32 = data.base.getCE32(c);
    983                 }
    984                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
    985                     backwardNumCodePoints(1);
    986                     break;
    987                 }
    988                 if(numCpFwd > 0) { --numCpFwd; }
    989             }
    990         } else {
    991             for(;;) {
    992                 char digit = Collation.digitFromCE32(ce32);
    993                 digits.append(digit);
    994                 int c = previousCodePoint();
    995                 if(c < 0) { break; }
    996                 ce32 = data.getCE32(c);
    997                 if(ce32 == Collation.FALLBACK_CE32) {
    998                     ce32 = data.base.getCE32(c);
    999                 }
   1000                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
   1001                     forwardNumCodePoints(1);
   1002                     break;
   1003                 }
   1004             }
   1005             // Reverse the digit string.
   1006             digits.reverse();
   1007         }
   1008         int pos = 0;
   1009         do {
   1010             // Skip leading zeros.
   1011             while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
   1012             // Write a sequence of CEs for at most 254 digits at a time.
   1013             int segmentLength = digits.length() - pos;
   1014             if(segmentLength > 254) { segmentLength = 254; }
   1015             appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
   1016             pos += segmentLength;
   1017         } while(pos < digits.length());
   1018     }
   1019 
   1020     /**
   1021      * Turns 1..254 digits into a sequence of CEs.
   1022      * Called by appendNumericCEs() for each segment of at most 254 digits.
   1023      */
   1024     private final void appendNumericSegmentCEs(CharSequence digits) {
   1025         int length = digits.length();
   1026         assert(1 <= length && length <= 254);
   1027         assert(length == 1 || digits.charAt(0) != 0);
   1028         long numericPrimary = data.numericPrimary;
   1029         // Note: We use primary byte values 2..255: digits are not compressible.
   1030         if(length <= 7) {
   1031             // Very dense encoding for small numbers.
   1032             int value = digits.charAt(0);
   1033             for(int i = 1; i < length; ++i) {
   1034                 value = value * 10 + digits.charAt(i);
   1035             }
   1036             // Primary weight second byte values:
   1037             //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
   1038             //     40 byte values  76..115 for medium numbers in three-byte primary weights.
   1039             //     16 byte values 116..131 for large numbers in four-byte primary weights.
   1040             //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
   1041             int firstByte = 2;
   1042             int numBytes = 74;
   1043             if(value < numBytes) {
   1044                 // Two-byte primary for 0..73, good for day & month numbers etc.
   1045                 long primary = numericPrimary | ((firstByte + value) << 16);
   1046                 ceBuffer.append(Collation.makeCE(primary));
   1047                 return;
   1048             }
   1049             value -= numBytes;
   1050             firstByte += numBytes;
   1051             numBytes = 40;
   1052             if(value < numBytes * 254) {
   1053                 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
   1054                 long primary = numericPrimary |
   1055                     ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
   1056                 ceBuffer.append(Collation.makeCE(primary));
   1057                 return;
   1058             }
   1059             value -= numBytes * 254;
   1060             firstByte += numBytes;
   1061             numBytes = 16;
   1062             if(value < numBytes * 254 * 254) {
   1063                 // Four-byte primary for 10234..1042489=10234+16*254*254-1.
   1064                 long primary = numericPrimary | (2 + value % 254);
   1065                 value /= 254;
   1066                 primary |= (2 + value % 254) << 8;
   1067                 value /= 254;
   1068                 primary |= (firstByte + value % 254) << 16;
   1069                 ceBuffer.append(Collation.makeCE(primary));
   1070                 return;
   1071             }
   1072             // original value > 1042489
   1073         }
   1074         assert(length >= 7);
   1075 
   1076         // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
   1077         // then we generate primary bytes with those pairs.
   1078         // Omit trailing 00 pairs.
   1079         // Decrement the value for the last pair.
   1080 
   1081         // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
   1082         int numPairs = (length + 1) / 2;
   1083         long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
   1084         // Find the length without trailing 00 pairs.
   1085         while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
   1086             length -= 2;
   1087         }
   1088         // Read the first pair.
   1089         int pair;
   1090         int pos;
   1091         if((length & 1) != 0) {
   1092             // Only "half a pair" if we have an odd number of digits.
   1093             pair = digits.charAt(0);
   1094             pos = 1;
   1095         } else {
   1096             pair = digits.charAt(0) * 10 + digits.charAt(1);
   1097             pos = 2;
   1098         }
   1099         pair = 11 + 2 * pair;
   1100         // Add the pairs of digits between pos and length.
   1101         int shift = 8;
   1102         while(pos < length) {
   1103             if(shift == 0) {
   1104                 // Every three pairs/bytes we need to store a 4-byte-primary CE
   1105                 // and start with a new CE with the '0' primary lead byte.
   1106                 primary |= pair;
   1107                 ceBuffer.append(Collation.makeCE(primary));
   1108                 primary = numericPrimary;
   1109                 shift = 16;
   1110             } else {
   1111                 primary |= pair << shift;
   1112                 shift -= 8;
   1113             }
   1114             pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
   1115             pos += 2;
   1116         }
   1117         primary |= (pair - 1) << shift;
   1118         ceBuffer.append(Collation.makeCE(primary));
   1119     }
   1120 
   1121     private CEBuffer ceBuffer;
   1122     private int cesIndex;
   1123 
   1124     private SkippedState skipped;
   1125 
   1126     // Number of code points to read forward, or -1.
   1127     // Used as a forward iteration limit in previousCEUnsafe().
   1128     private int numCpFwd;
   1129     // Numeric collation (CollationSettings.NUMERIC).
   1130     private boolean isNumeric;
   1131 }
   1132