Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2012-2015, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
     10 *
     11 * C++ version created on: 2012apr01
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import java.util.ArrayList;
     18 import java.util.Arrays;
     19 import java.util.Iterator;
     20 
     21 import android.icu.impl.Norm2AllModes;
     22 import android.icu.impl.Normalizer2Impl;
     23 import android.icu.impl.Normalizer2Impl.Hangul;
     24 import android.icu.impl.Trie2;
     25 import android.icu.impl.Trie2Writable;
     26 import android.icu.lang.UCharacter;
     27 import android.icu.text.UnicodeSet;
     28 import android.icu.text.UnicodeSetIterator;
     29 import android.icu.util.CharsTrie;
     30 import android.icu.util.CharsTrieBuilder;
     31 import android.icu.util.StringTrieBuilder;
     32 
     33 /**
     34  * Low-level CollationData builder.
     35  * Takes (character, CE) pairs and builds them into runtime data structures.
     36  * Supports characters with context prefixes and contraction suffixes.
     37  */
     38 final class CollationDataBuilder {  // not final in C++
     39     /**
     40      * Collation element modifier. Interface class for a modifier
     41      * that changes a tailoring builder's temporary CEs to final CEs.
     42      * Called for every non-special CE32 and every expansion CE.
     43      */
     44     interface CEModifier {
     45         /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */
     46         long modifyCE32(int ce32);
     47         /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */
     48         long modifyCE(long ce);
     49     }
     50 
     51     CollationDataBuilder() {
     52         nfcImpl = Norm2AllModes.getNFCInstance().impl;
     53         base = null;
     54         baseSettings = null;
     55         trie = null;
     56         ce32s = new UVector32();
     57         ce64s = new UVector64();
     58         conditionalCE32s = new ArrayList<ConditionalCE32>();
     59         modified = false;
     60         fastLatinEnabled = false;
     61         fastLatinBuilder = null;
     62         collIter = null;
     63         // Reserve the first CE32 for U+0000.
     64         ce32s.addElement(0);
     65     }
     66 
     67     void initForTailoring(CollationData b) {
     68         if(trie != null) {
     69             throw new IllegalStateException("attempt to reuse a CollationDataBuilder");
     70         }
     71         if(b == null) {
     72             throw new IllegalArgumentException("null CollationData");
     73         }
     74         base = b;
     75 
     76         // For a tailoring, the default is to fall back to the base.
     77         trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32);
     78 
     79         // Set the Latin-1 letters block so that it is allocated first in the data array,
     80         // to try to improve locality of reference when sorting Latin-1 text.
     81         // Do not use utrie2_setRange32() since that will not actually allocate blocks
     82         // that are filled with the default value.
     83         // ASCII (0..7F) is already preallocated anyway.
     84         for(int c = 0xc0; c <= 0xff; ++c) {
     85             trie.set(c, Collation.FALLBACK_CE32);
     86         }
     87 
     88         // Hangul syllables are not tailorable (except via tailoring Jamos).
     89         // Always set the Hangul tag to help performance.
     90         // Do this here, rather than in buildMappings(),
     91         // so that we see the HANGUL_TAG in various assertions.
     92         int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
     93         trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true);
     94 
     95         // Copy the set contents but don't copy/clone the set as a whole because
     96         // that would copy the isFrozen state too.
     97         unsafeBackwardSet.addAll(b.unsafeBackwardSet);
     98     }
     99 
    100     boolean isCompressibleLeadByte(int b) {
    101         return base.isCompressibleLeadByte(b);
    102     }
    103 
    104     boolean isCompressiblePrimary(long p) {
    105         return isCompressibleLeadByte((int)p >>> 24);
    106     }
    107 
    108     /**
    109      * @return true if this builder has mappings (e.g., add() has been called)
    110      */
    111     boolean hasMappings() { return modified; }
    112 
    113     /**
    114      * @return true if c has CEs in this builder
    115      */
    116     boolean isAssigned(int c) {
    117         return Collation.isAssignedCE32(trie.get(c));
    118     }
    119 
    120     void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
    121         int ce32 = encodeCEs(ces, cesLength);
    122         addCE32(prefix, s, ce32);
    123     }
    124 
    125     /**
    126      * Encodes the ces as either the returned ce32 by itself,
    127      * or by storing an expansion, with the returned ce32 referring to that.
    128      *
    129      * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
    130      */
    131     int encodeCEs(long ces[], int cesLength) {
    132         if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) {
    133             throw new IllegalArgumentException("mapping to too many CEs");
    134         }
    135         if(!isMutable()) {
    136             throw new IllegalStateException("attempt to add mappings after build()");
    137         }
    138         if(cesLength == 0) {
    139             // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
    140             // Do this here so that callers need not do it.
    141             return encodeOneCEAsCE32(0);
    142         } else if(cesLength == 1) {
    143             return encodeOneCE(ces[0]);
    144         } else if(cesLength == 2) {
    145             // Try to encode two CEs as one CE32.
    146             long ce0 = ces[0];
    147             long ce1 = ces[1];
    148             long p0 = ce0 >>> 32;
    149             if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE &&
    150                     (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE &&
    151                     p0 != 0) {
    152                 // Latin mini expansion
    153                 return
    154                     (int)p0 |
    155                     (((int)ce0 & 0xff00) << 8) |
    156                     (((int)ce1 >> 16) & 0xff00) |
    157                     Collation.SPECIAL_CE32_LOW_BYTE |
    158                     Collation.LATIN_EXPANSION_TAG;
    159             }
    160         }
    161         // Try to encode two or more CEs as CE32s.
    162         int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH];  // TODO: instance field?
    163         for(int i = 0;; ++i) {
    164             if(i == cesLength) {
    165                 return encodeExpansion32(newCE32s, 0, cesLength);
    166             }
    167             int ce32 = encodeOneCEAsCE32(ces[i]);
    168             if(ce32 == Collation.NO_CE32) { break; }
    169             newCE32s[i] = ce32;
    170         }
    171         return encodeExpansion(ces, 0, cesLength);
    172     }
    173 
    174     void addCE32(CharSequence prefix, CharSequence s, int ce32) {
    175         if(s.length() == 0) {
    176             throw new IllegalArgumentException("mapping from empty string");
    177         }
    178         if(!isMutable()) {
    179             throw new IllegalStateException("attempt to add mappings after build()");
    180         }
    181         int c = Character.codePointAt(s, 0);
    182         int cLength = Character.charCount(c);
    183         int oldCE32 = trie.get(c);
    184         boolean hasContext = prefix.length() != 0|| s.length() > cLength;
    185         if(oldCE32 == Collation.FALLBACK_CE32) {
    186             // First tailoring for c.
    187             // If c has contextual base mappings or if we add a contextual mapping,
    188             // then copy the base mappings.
    189             // Otherwise we just override the base mapping.
    190             int baseCE32 = base.getFinalCE32(base.getCE32(c));
    191             if(hasContext || Collation.ce32HasContext(baseCE32)) {
    192                 oldCE32 = copyFromBaseCE32(c, baseCE32, true);
    193                 trie.set(c, oldCE32);
    194             }
    195         }
    196         if(!hasContext) {
    197             // No prefix, no contraction.
    198             if(!isBuilderContextCE32(oldCE32)) {
    199                 trie.set(c, ce32);
    200             } else {
    201                 ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32);
    202                 cond.builtCE32 = Collation.NO_CE32;
    203                 cond.ce32 = ce32;
    204             }
    205         } else {
    206             ConditionalCE32 cond;
    207             if(!isBuilderContextCE32(oldCE32)) {
    208                 // Replace the simple oldCE32 with a builder context CE32
    209                 // pointing to a new ConditionalCE32 list head.
    210                 int index = addConditionalCE32("\0", oldCE32);
    211                 int contextCE32 = makeBuilderContextCE32(index);
    212                 trie.set(c, contextCE32);
    213                 contextChars.add(c);
    214                 cond = getConditionalCE32(index);
    215             } else {
    216                 cond = getConditionalCE32ForCE32(oldCE32);
    217                 cond.builtCE32 = Collation.NO_CE32;
    218             }
    219             CharSequence suffix = s.subSequence(cLength, s.length());
    220             String context = new StringBuilder().append((char)prefix.length()).
    221                     append(prefix).append(suffix).toString();
    222             unsafeBackwardSet.addAll(suffix);
    223             for(;;) {
    224                 // invariant: context > cond.context
    225                 int next = cond.next;
    226                 if(next < 0) {
    227                     // Append a new ConditionalCE32 after cond.
    228                     int index = addConditionalCE32(context, ce32);
    229                     cond.next = index;
    230                     break;
    231                 }
    232                 ConditionalCE32 nextCond = getConditionalCE32(next);
    233                 int cmp = context.compareTo(nextCond.context);
    234                 if(cmp < 0) {
    235                     // Insert a new ConditionalCE32 between cond and nextCond.
    236                     int index = addConditionalCE32(context, ce32);
    237                     cond.next = index;
    238                     getConditionalCE32(index).next = next;
    239                     break;
    240                 } else if(cmp == 0) {
    241                     // Same context as before, overwrite its ce32.
    242                     nextCond.ce32 = ce32;
    243                     break;
    244                 }
    245                 cond = nextCond;
    246             }
    247         }
    248         modified = true;
    249     }
    250 
    251     /**
    252      * Copies all mappings from the src builder, with modifications.
    253      * This builder here must not be built yet, and should be empty.
    254      */
    255     void copyFrom(CollationDataBuilder src, CEModifier modifier) {
    256         if(!isMutable()) {
    257             throw new IllegalStateException("attempt to copyFrom() after build()");
    258         }
    259         CopyHelper helper = new CopyHelper(src, this, modifier);
    260         Iterator<Trie2.Range> trieIterator = src.trie.iterator();
    261         Trie2.Range range;
    262         while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
    263             enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper);
    264         }
    265         // Update the contextChars and the unsafeBackwardSet while copying,
    266         // in case a character had conditional mappings in the source builder
    267         // and they were removed later.
    268         modified |= src.modified;
    269     }
    270 
    271     void optimize(UnicodeSet set) {
    272         if(set.isEmpty()) { return; }
    273         UnicodeSetIterator iter = new UnicodeSetIterator(set);
    274         while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
    275             int c = iter.codepoint;
    276             int ce32 = trie.get(c);
    277             if(ce32 == Collation.FALLBACK_CE32) {
    278                 ce32 = base.getFinalCE32(base.getCE32(c));
    279                 ce32 = copyFromBaseCE32(c, ce32, true);
    280                 trie.set(c, ce32);
    281             }
    282         }
    283         modified = true;
    284     }
    285 
    286     void suppressContractions(UnicodeSet set) {
    287         if(set.isEmpty()) { return; }
    288         UnicodeSetIterator iter = new UnicodeSetIterator(set);
    289         while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
    290             int c = iter.codepoint;
    291             int ce32 = trie.get(c);
    292             if(ce32 == Collation.FALLBACK_CE32) {
    293                 ce32 = base.getFinalCE32(base.getCE32(c));
    294                 if(Collation.ce32HasContext(ce32)) {
    295                     ce32 = copyFromBaseCE32(c, ce32, false /* without context */);
    296                     trie.set(c, ce32);
    297                 }
    298             } else if(isBuilderContextCE32(ce32)) {
    299                 ce32 = getConditionalCE32ForCE32(ce32).ce32;
    300                 // Simply abandon the list of ConditionalCE32.
    301                 // The caller will copy this builder in the end,
    302                 // eliminating unreachable data.
    303                 trie.set(c, ce32);
    304                 contextChars.remove(c);
    305             }
    306         }
    307         modified = true;
    308     }
    309 
    310     void enableFastLatin() { fastLatinEnabled = true; }
    311     void build(CollationData data) {
    312         buildMappings(data);
    313         if(base != null) {
    314             data.numericPrimary = base.numericPrimary;
    315             data.compressibleBytes = base.compressibleBytes;
    316             data.numScripts = base.numScripts;
    317             data.scriptsIndex = base.scriptsIndex;
    318             data.scriptStarts = base.scriptStarts;
    319         }
    320         buildFastLatinTable(data);
    321     }
    322 
    323     /**
    324      * Looks up CEs for s and appends them to the ces array.
    325      * Does not handle normalization: s should be in FCD form.
    326      *
    327      * Does not write completely ignorable CEs.
    328      * Does not write beyond Collation.MAX_EXPANSION_LENGTH.
    329      *
    330      * @return incremented cesLength
    331      */
    332     int getCEs(CharSequence s, long ces[], int cesLength) {
    333         return getCEs(s, 0, ces, cesLength);
    334     }
    335 
    336     int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
    337         int prefixLength = prefix.length();
    338         if(prefixLength == 0) {
    339             return getCEs(s, 0, ces, cesLength);
    340         } else {
    341             return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength);
    342         }
    343     }
    344 
    345     /**
    346      * Build-time context and CE32 for a code point.
    347      * If a code point has contextual mappings, then the default (no-context) mapping
    348      * and all conditional mappings are stored in a singly-linked list
    349      * of ConditionalCE32, sorted by context strings.
    350      *
    351      * Context strings sort by prefix length, then by prefix, then by contraction suffix.
    352      * Context strings must be unique and in ascending order.
    353      */
    354     private static final class ConditionalCE32 {
    355         ConditionalCE32(String ct, int ce) {
    356             context = ct;
    357             ce32 = ce;
    358             defaultCE32 = Collation.NO_CE32;
    359             builtCE32 = Collation.NO_CE32;
    360             next = -1;
    361         }
    362 
    363         boolean hasContext() { return context.length() > 1; }
    364         int prefixLength() { return context.charAt(0); }
    365 
    366         /**
    367          * "\0" for the first entry for any code point, with its default CE32.
    368          *
    369          * Otherwise one unit with the length of the prefix string,
    370          * then the prefix string, then the contraction suffix.
    371          */
    372         String context;
    373         /**
    374          * CE32 for the code point and its context.
    375          * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
    376          */
    377         int ce32;
    378         /**
    379          * Default CE32 for all contexts with this same prefix.
    380          * Initially NO_CE32. Set only while building runtime data structures,
    381          * and only on one of the nodes of a sub-list with the same prefix.
    382          */
    383         int defaultCE32;
    384         /**
    385          * CE32 for the built contexts.
    386          * When fetching CEs from the builder, the contexts are built into their runtime form
    387          * so that the normal collation implementation can process them.
    388          * The result is cached in the list head. It is reset when the contexts are modified.
    389          */
    390         int builtCE32;
    391         /**
    392          * Index of the next ConditionalCE32.
    393          * Negative for the end of the list.
    394          */
    395         int next;
    396     }
    397 
    398     protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) {
    399         int i = Collation.indexFromCE32(ce32);
    400         long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i);
    401         long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
    402         return Collation.makeLongPrimaryCE32(p);
    403     }
    404 
    405     protected int addCE(long ce) {
    406         int length = ce64s.size();
    407         for(int i = 0; i < length; ++i) {
    408             if(ce == ce64s.elementAti(i)) { return i; }
    409         }
    410         ce64s.addElement(ce);
    411         return length;
    412     }
    413 
    414     protected int addCE32(int ce32) {
    415         int length = ce32s.size();
    416         for(int i = 0; i < length; ++i) {
    417             if(ce32 == ce32s.elementAti(i)) { return i; }
    418         }
    419         ce32s.addElement(ce32);
    420         return length;
    421     }
    422 
    423     protected int addConditionalCE32(String context, int ce32) {
    424         assert(context.length() != 0);
    425         int index = conditionalCE32s.size();
    426         if(index > Collation.MAX_INDEX) {
    427             throw new IndexOutOfBoundsException("too many context-sensitive mappings");
    428             // BufferOverflowException is a better fit
    429             // but cannot be constructed with a message string.
    430         }
    431         ConditionalCE32 cond = new ConditionalCE32(context, ce32);
    432         conditionalCE32s.add(cond);
    433         return index;
    434     }
    435 
    436     protected ConditionalCE32 getConditionalCE32(int index) {
    437         return conditionalCE32s.get(index);
    438     }
    439     protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) {
    440         return getConditionalCE32(Collation.indexFromCE32(ce32));
    441     }
    442 
    443     protected static int makeBuilderContextCE32(int index) {
    444         return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index);
    445     }
    446     protected static boolean isBuilderContextCE32(int ce32) {
    447         return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG);
    448     }
    449 
    450     protected static int encodeOneCEAsCE32(long ce) {
    451         long p = ce >>> 32;
    452         int lower32 = (int)ce;
    453         int t = lower32 & 0xffff;
    454         assert((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
    455         if((ce & 0xffff00ff00ffL) == 0) {
    456             // normal form ppppsstt
    457             return (int)p | (lower32 >>> 16) | (t >> 8);
    458         } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) {
    459             // long-primary form ppppppC1
    460             return Collation.makeLongPrimaryCE32(p);
    461         } else if(p == 0 && (t & 0xff) == 0) {
    462             // long-secondary form ssssttC2
    463             return Collation.makeLongSecondaryCE32(lower32);
    464         }
    465         return Collation.NO_CE32;
    466     }
    467 
    468     protected int encodeOneCE(long ce) {
    469         // Try to encode one CE as one CE32.
    470         int ce32 = encodeOneCEAsCE32(ce);
    471         if(ce32 != Collation.NO_CE32) { return ce32; }
    472         int index = addCE(ce);
    473         if(index > Collation.MAX_INDEX) {
    474             throw new IndexOutOfBoundsException("too many mappings");
    475             // BufferOverflowException is a better fit
    476             // but cannot be constructed with a message string.
    477         }
    478         return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1);
    479     }
    480 
    481     protected int encodeExpansion(long ces[], int start, int length) {
    482         // See if this sequence of CEs has already been stored.
    483         long first = ces[start];
    484         int ce64sMax = ce64s.size() - length;
    485         for(int i = 0; i <= ce64sMax; ++i) {
    486             if(first == ce64s.elementAti(i)) {
    487                 if(i > Collation.MAX_INDEX) {
    488                     throw new IndexOutOfBoundsException("too many mappings");
    489                     // BufferOverflowException is a better fit
    490                     // but cannot be constructed with a message string.
    491                 }
    492                 for(int j = 1;; ++j) {
    493                     if(j == length) {
    494                         return Collation.makeCE32FromTagIndexAndLength(
    495                                 Collation.EXPANSION_TAG, i, length);
    496                     }
    497                     if(ce64s.elementAti(i + j) != ces[start + j]) { break; }
    498                 }
    499             }
    500         }
    501         // Store the new sequence.
    502         int i = ce64s.size();
    503         if(i > Collation.MAX_INDEX) {
    504             throw new IndexOutOfBoundsException("too many mappings");
    505             // BufferOverflowException is a better fit
    506             // but cannot be constructed with a message string.
    507         }
    508         for(int j = 0; j < length; ++j) {
    509             ce64s.addElement(ces[start + j]);
    510         }
    511         return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length);
    512     }
    513 
    514     protected int encodeExpansion32(int newCE32s[], int start, int length) {
    515         // See if this sequence of CE32s has already been stored.
    516         int first = newCE32s[start];
    517         int ce32sMax = ce32s.size() - length;
    518         for(int i = 0; i <= ce32sMax; ++i) {
    519             if(first == ce32s.elementAti(i)) {
    520                 if(i > Collation.MAX_INDEX) {
    521                     throw new IndexOutOfBoundsException("too many mappings");
    522                     // BufferOverflowException is a better fit
    523                     // but cannot be constructed with a message string.
    524                 }
    525                 for(int j = 1;; ++j) {
    526                     if(j == length) {
    527                         return Collation.makeCE32FromTagIndexAndLength(
    528                                 Collation.EXPANSION32_TAG, i, length);
    529                     }
    530                     if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; }
    531                 }
    532             }
    533         }
    534         // Store the new sequence.
    535         int i = ce32s.size();
    536         if(i > Collation.MAX_INDEX) {
    537             throw new IndexOutOfBoundsException("too many mappings");
    538             // BufferOverflowException is a better fit
    539             // but cannot be constructed with a message string.
    540         }
    541         for(int j = 0; j < length; ++j) {
    542             ce32s.addElement(newCE32s[start + j]);
    543         }
    544         return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length);
    545     }
    546 
    547     protected int copyFromBaseCE32(int c, int ce32, boolean withContext) {
    548         if(!Collation.isSpecialCE32(ce32)) { return ce32; }
    549         switch(Collation.tagFromCE32(ce32)) {
    550         case Collation.LONG_PRIMARY_TAG:
    551         case Collation.LONG_SECONDARY_TAG:
    552         case Collation.LATIN_EXPANSION_TAG:
    553             // copy as is
    554             break;
    555         case Collation.EXPANSION32_TAG: {
    556             int index = Collation.indexFromCE32(ce32);
    557             int length = Collation.lengthFromCE32(ce32);
    558             ce32 = encodeExpansion32(base.ce32s, index, length);
    559             break;
    560         }
    561         case Collation.EXPANSION_TAG: {
    562             int index = Collation.indexFromCE32(ce32);
    563             int length = Collation.lengthFromCE32(ce32);
    564             ce32 = encodeExpansion(base.ces, index, length);
    565             break;
    566         }
    567         case Collation.PREFIX_TAG: {
    568             // Flatten prefixes and nested suffixes (contractions)
    569             // into a linear list of ConditionalCE32.
    570             int trieIndex = Collation.indexFromCE32(ce32);
    571             ce32 = base.getCE32FromContexts(trieIndex);  // Default if no prefix match.
    572             if(!withContext) {
    573                 return copyFromBaseCE32(c, ce32, false);
    574             }
    575             ConditionalCE32 head = new ConditionalCE32("", 0);
    576             StringBuilder context = new StringBuilder("\0");
    577             int index;
    578             if(Collation.isContractionCE32(ce32)) {
    579                 index = copyContractionsFromBaseCE32(context, c, ce32, head);
    580             } else {
    581                 ce32 = copyFromBaseCE32(c, ce32, true);
    582                 head.next = index = addConditionalCE32(context.toString(), ce32);
    583             }
    584             ConditionalCE32 cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
    585             CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
    586             while(prefixes.hasNext()) {
    587                 CharsTrie.Entry entry = prefixes.next();
    588                 context.setLength(0);
    589                 context.append(entry.chars).reverse().insert(0, (char)entry.chars.length());
    590                 ce32 = entry.value;
    591                 if(Collation.isContractionCE32(ce32)) {
    592                     index = copyContractionsFromBaseCE32(context, c, ce32, cond);
    593                 } else {
    594                     ce32 = copyFromBaseCE32(c, ce32, true);
    595                     cond.next = index = addConditionalCE32(context.toString(), ce32);
    596                 }
    597                 cond = getConditionalCE32(index);
    598             }
    599             ce32 = makeBuilderContextCE32(head.next);
    600             contextChars.add(c);
    601             break;
    602         }
    603         case Collation.CONTRACTION_TAG: {
    604             if(!withContext) {
    605                 int index = Collation.indexFromCE32(ce32);
    606                 ce32 = base.getCE32FromContexts(index);  // Default if no suffix match.
    607                 return copyFromBaseCE32(c, ce32, false);
    608             }
    609             ConditionalCE32 head = new ConditionalCE32("", 0);
    610             StringBuilder context = new StringBuilder("\0");
    611             copyContractionsFromBaseCE32(context, c, ce32, head);
    612             ce32 = makeBuilderContextCE32(head.next);
    613             contextChars.add(c);
    614             break;
    615         }
    616         case Collation.HANGUL_TAG:
    617             throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables.");
    618         case Collation.OFFSET_TAG:
    619             ce32 = getCE32FromOffsetCE32(true, c, ce32);
    620             break;
    621         case Collation.IMPLICIT_TAG:
    622             ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c));
    623             break;
    624         default:
    625             throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " +
    626                     "requires ce32 == base.getFinalCE32(ce32)");
    627         }
    628         return ce32;
    629     }
    630 
    631     /**
    632      * Copies base contractions to a list of ConditionalCE32.
    633      * Sets cond.next to the index of the first new item
    634      * and returns the index of the last new item.
    635      */
    636     protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32,
    637             ConditionalCE32 cond) {
    638         int trieIndex = Collation.indexFromCE32(ce32);
    639         int index;
    640         if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    641             // No match on the single code point.
    642             // We are underneath a prefix, and the default mapping is just
    643             // a fallback to the mappings for a shorter prefix.
    644             assert(context.length() > 1);
    645             index = -1;
    646         } else {
    647             ce32 = base.getCE32FromContexts(trieIndex);  // Default if no suffix match.
    648             assert(!Collation.isContractionCE32(ce32));
    649             ce32 = copyFromBaseCE32(c, ce32, true);
    650             cond.next = index = addConditionalCE32(context.toString(), ce32);
    651             cond = getConditionalCE32(index);
    652         }
    653 
    654         int suffixStart = context.length();
    655         CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
    656         while(suffixes.hasNext()) {
    657             CharsTrie.Entry entry = suffixes.next();
    658             context.append(entry.chars);
    659             ce32 = copyFromBaseCE32(c, entry.value, true);
    660             cond.next = index = addConditionalCE32(context.toString(), ce32);
    661             // No need to update the unsafeBackwardSet because the tailoring set
    662             // is already a copy of the base set.
    663             cond = getConditionalCE32(index);
    664             context.setLength(suffixStart);
    665         }
    666         assert(index >= 0);
    667         return index;
    668     }
    669 
    670     private static final class CopyHelper {
    671         CopyHelper(CollationDataBuilder s, CollationDataBuilder d,
    672                   CollationDataBuilder.CEModifier m) {
    673             src = s;
    674             dest = d;
    675             modifier = m;
    676         }
    677 
    678         void copyRangeCE32(int start, int end, int ce32) {
    679             ce32 = copyCE32(ce32);
    680             dest.trie.setRange(start, end, ce32, true);
    681             if(CollationDataBuilder.isBuilderContextCE32(ce32)) {
    682                 dest.contextChars.add(start, end);
    683             }
    684         }
    685 
    686         int copyCE32(int ce32) {
    687             if(!Collation.isSpecialCE32(ce32)) {
    688                 long ce = modifier.modifyCE32(ce32);
    689                 if(ce != Collation.NO_CE) {
    690                     ce32 = dest.encodeOneCE(ce);
    691                 }
    692             } else {
    693                 int tag = Collation.tagFromCE32(ce32);
    694                 if(tag == Collation.EXPANSION32_TAG) {
    695                     int[] srcCE32s = src.ce32s.getBuffer();
    696                     int srcIndex = Collation.indexFromCE32(ce32);
    697                     int length = Collation.lengthFromCE32(ce32);
    698                     // Inspect the source CE32s. Just copy them if none are modified.
    699                     // Otherwise copy to modifiedCEs, with modifications.
    700                     boolean isModified = false;
    701                     for(int i = 0; i < length; ++i) {
    702                         ce32 = srcCE32s[srcIndex + i];
    703                         long ce;
    704                         if(Collation.isSpecialCE32(ce32) ||
    705                                 (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) {
    706                             if(isModified) {
    707                                 modifiedCEs[i] = Collation.ceFromCE32(ce32);
    708                             }
    709                         } else {
    710                             if(!isModified) {
    711                                 for(int j = 0; j < i; ++j) {
    712                                     modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]);
    713                                 }
    714                                 isModified = true;
    715                             }
    716                             modifiedCEs[i] = ce;
    717                         }
    718                     }
    719                     if(isModified) {
    720                         ce32 = dest.encodeCEs(modifiedCEs, length);
    721                     } else {
    722                         ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length);
    723                     }
    724                 } else if(tag == Collation.EXPANSION_TAG) {
    725                     long[] srcCEs = src.ce64s.getBuffer();
    726                     int srcIndex = Collation.indexFromCE32(ce32);
    727                     int length = Collation.lengthFromCE32(ce32);
    728                     // Inspect the source CEs. Just copy them if none are modified.
    729                     // Otherwise copy to modifiedCEs, with modifications.
    730                     boolean isModified = false;
    731                     for(int i = 0; i < length; ++i) {
    732                         long srcCE = srcCEs[srcIndex + i];
    733                         long ce = modifier.modifyCE(srcCE);
    734                         if(ce == Collation.NO_CE) {
    735                             if(isModified) {
    736                                 modifiedCEs[i] = srcCE;
    737                             }
    738                         } else {
    739                             if(!isModified) {
    740                                 for(int j = 0; j < i; ++j) {
    741                                     modifiedCEs[j] = srcCEs[srcIndex + j];
    742                                 }
    743                                 isModified = true;
    744                             }
    745                             modifiedCEs[i] = ce;
    746                         }
    747                     }
    748                     if(isModified) {
    749                         ce32 = dest.encodeCEs(modifiedCEs, length);
    750                     } else {
    751                         ce32 = dest.encodeExpansion(srcCEs, srcIndex, length);
    752                     }
    753                 } else if(tag == Collation.BUILDER_DATA_TAG) {
    754                     // Copy the list of ConditionalCE32.
    755                     ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32);
    756                     assert(!cond.hasContext());
    757                     int destIndex = dest.addConditionalCE32(
    758                             cond.context, copyCE32(cond.ce32));
    759                     ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex);
    760                     while(cond.next >= 0) {
    761                         cond = src.getConditionalCE32(cond.next);
    762                         ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex);
    763                         destIndex = dest.addConditionalCE32(
    764                                 cond.context, copyCE32(cond.ce32));
    765                         int suffixStart = cond.prefixLength() + 1;
    766                         dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart));
    767                         prevDestCond.next = destIndex;
    768                     }
    769                 } else {
    770                     // Just copy long CEs and Latin mini expansions (and other expected values) as is,
    771                     // assuming that the modifier would not modify them.
    772                     assert(tag == Collation.LONG_PRIMARY_TAG ||
    773                             tag == Collation.LONG_SECONDARY_TAG ||
    774                             tag == Collation.LATIN_EXPANSION_TAG ||
    775                             tag == Collation.HANGUL_TAG);
    776                 }
    777             }
    778             return ce32;
    779         }
    780 
    781         CollationDataBuilder src;
    782         CollationDataBuilder dest;
    783         CollationDataBuilder.CEModifier modifier;
    784         long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH];
    785     }
    786 
    787     private static void
    788     enumRangeForCopy(int start, int end, int value, CopyHelper helper) {
    789         if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) {
    790             helper.copyRangeCE32(start, end, value);
    791         }
    792     }
    793 
    794     protected boolean getJamoCE32s(int jamoCE32s[]) {
    795         boolean anyJamoAssigned = base == null;  // always set jamoCE32s in the base data
    796         boolean needToCopyFromBase = false;
    797         for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
    798             int jamo = jamoCpFromIndex(j);
    799             boolean fromBase = false;
    800             int ce32 = trie.get(jamo);
    801             anyJamoAssigned |= Collation.isAssignedCE32(ce32);
    802             // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
    803             // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
    804             if(ce32 == Collation.FALLBACK_CE32) {
    805                 fromBase = true;
    806                 ce32 = base.getCE32(jamo);
    807             }
    808             if(Collation.isSpecialCE32(ce32)) {
    809                 switch(Collation.tagFromCE32(ce32)) {
    810                 case Collation.LONG_PRIMARY_TAG:
    811                 case Collation.LONG_SECONDARY_TAG:
    812                 case Collation.LATIN_EXPANSION_TAG:
    813                     // Copy the ce32 as-is.
    814                     break;
    815                 case Collation.EXPANSION32_TAG:
    816                 case Collation.EXPANSION_TAG:
    817                 case Collation.PREFIX_TAG:
    818                 case Collation.CONTRACTION_TAG:
    819                     if(fromBase) {
    820                         // Defer copying until we know if anyJamoAssigned.
    821                         ce32 = Collation.FALLBACK_CE32;
    822                         needToCopyFromBase = true;
    823                     }
    824                     break;
    825                 case Collation.IMPLICIT_TAG:
    826                     // An unassigned Jamo should only occur in tests with incomplete bases.
    827                     assert(fromBase);
    828                     ce32 = Collation.FALLBACK_CE32;
    829                     needToCopyFromBase = true;
    830                     break;
    831                 case Collation.OFFSET_TAG:
    832                     ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
    833                     break;
    834                 case Collation.FALLBACK_TAG:
    835                 case Collation.RESERVED_TAG_3:
    836                 case Collation.BUILDER_DATA_TAG:
    837                 case Collation.DIGIT_TAG:
    838                 case Collation.U0000_TAG:
    839                 case Collation.HANGUL_TAG:
    840                 case Collation.LEAD_SURROGATE_TAG:
    841                     throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32));
    842                 }
    843             }
    844             jamoCE32s[j] = ce32;
    845         }
    846         if(anyJamoAssigned && needToCopyFromBase) {
    847             for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {
    848                 if(jamoCE32s[j] == Collation.FALLBACK_CE32) {
    849                     int jamo = jamoCpFromIndex(j);
    850                     jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo),
    851                                                     /*withContext=*/ true);
    852                 }
    853             }
    854         }
    855         return anyJamoAssigned;
    856     }
    857 
    858     protected void setDigitTags() {
    859         UnicodeSet digits = new UnicodeSet("[:Nd:]");
    860         UnicodeSetIterator iter = new UnicodeSetIterator(digits);
    861         while(iter.next()) {
    862             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
    863             int c = iter.codepoint;
    864             int ce32 = trie.get(c);
    865             if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) {
    866                 int index = addCE32(ce32);
    867                 if(index > Collation.MAX_INDEX) {
    868                     throw new IndexOutOfBoundsException("too many mappings");
    869                     // BufferOverflowException is a better fit
    870                     // but cannot be constructed with a message string.
    871                 }
    872                 ce32 = Collation.makeCE32FromTagIndexAndLength(
    873                         Collation.DIGIT_TAG, index, UCharacter.digit(c));  // u_charDigitValue(c)
    874                 trie.set(c, ce32);
    875             }
    876         }
    877     }
    878 
    879     protected void setLeadSurrogates() {
    880         for(char lead = 0xd800; lead < 0xdc00; ++lead) {
    881             int leadValue = -1;
    882             // utrie2_enumForLeadSurrogate(trie, lead, null, , &value);
    883             Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead);
    884             while(trieIterator.hasNext()) {
    885                 Trie2.Range range = trieIterator.next();
    886                 // The rest of this loop is equivalent to C++ enumRangeLeadValue().
    887                 int value = range.value;
    888                 if(value == Collation.UNASSIGNED_CE32) {
    889                     value = Collation.LEAD_ALL_UNASSIGNED;
    890                 } else if(value == Collation.FALLBACK_CE32) {
    891                     value = Collation.LEAD_ALL_FALLBACK;
    892                 } else {
    893                     leadValue = Collation.LEAD_MIXED;
    894                     break;
    895                 }
    896                 if(leadValue < 0) {
    897                     leadValue = value;
    898                 } else if(leadValue != value) {
    899                     leadValue = Collation.LEAD_MIXED;
    900                     break;
    901                 }
    902             }
    903             trie.setForLeadSurrogateCodeUnit(lead,
    904                     Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue);
    905         }
    906     }
    907 
    908     protected void buildMappings(CollationData data) {
    909         if(!isMutable()) {
    910             throw new IllegalStateException("attempt to build() after build()");
    911         }
    912 
    913         buildContexts();
    914 
    915         int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
    916         int jamoIndex = -1;
    917         if(getJamoCE32s(jamoCE32s)) {
    918             jamoIndex = ce32s.size();
    919             for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
    920                 ce32s.addElement(jamoCE32s[i]);
    921             }
    922             // Small optimization: Use a bit in the Hangul ce32
    923             // to indicate that none of the Jamo CE32s are isSpecialCE32()
    924             // (as it should be in the root collator).
    925             // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
    926             // In order to still have good trie compression and keep this code simple,
    927             // we only set this flag if a whole block of 588 Hangul syllables starting with
    928             // a common leading consonant (Jamo L) has this property.
    929             boolean isAnyJamoVTSpecial = false;
    930             for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
    931                 if(Collation.isSpecialCE32(jamoCE32s[i])) {
    932                     isAnyJamoVTSpecial = true;
    933                     break;
    934                 }
    935             }
    936             int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
    937             int c = Hangul.HANGUL_BASE;
    938             for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
    939                 int ce32 = hangulCE32;
    940                 if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) {
    941                     ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO;
    942                 }
    943                 int limit = c + Hangul.JAMO_VT_COUNT;
    944                 trie.setRange(c, limit - 1, ce32, true);
    945                 c = limit;
    946             }
    947         } else {
    948             // Copy the Hangul CE32s from the base in blocks per Jamo L,
    949             // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
    950             for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) {
    951                 int ce32 = base.getCE32(c);
    952                 assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG));
    953                 int limit = c + Hangul.JAMO_VT_COUNT;
    954                 trie.setRange(c, limit - 1, ce32, true);
    955                 c = limit;
    956             }
    957         }
    958 
    959         setDigitTags();
    960         setLeadSurrogates();
    961 
    962         // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
    963         ce32s.setElementAt(trie.get(0), 0);
    964         trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0));
    965 
    966         data.trie = trie.toTrie2_32();
    967 
    968         // Mark each lead surrogate as "unsafe"
    969         // if any of its 1024 associated supplementary code points is "unsafe".
    970         int c = 0x10000;
    971         for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
    972             if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
    973                 unsafeBackwardSet.add(lead);
    974             }
    975         }
    976         unsafeBackwardSet.freeze();
    977 
    978         data.ce32s = ce32s.getBuffer();
    979         data.ces = ce64s.getBuffer();
    980         data.contexts = contexts.toString();
    981 
    982         data.base = base;
    983         if(jamoIndex >= 0) {
    984             data.jamoCE32s = jamoCE32s;  // C++: data.ce32s + jamoIndex
    985         } else {
    986             data.jamoCE32s = base.jamoCE32s;
    987         }
    988         data.unsafeBackwardSet = unsafeBackwardSet;
    989     }
    990 
    991     protected void clearContexts() {
    992         contexts.setLength(0);
    993         UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
    994         while(iter.next()) {
    995             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
    996             int ce32 = trie.get(iter.codepoint);
    997             assert(isBuilderContextCE32(ce32));
    998             getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32;
    999         }
   1000     }
   1001 
   1002     protected void buildContexts() {
   1003         // Ignore abandoned lists and the cached builtCE32,
   1004         // and build all contexts from scratch.
   1005         contexts.setLength(0);
   1006         UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
   1007         while(iter.next()) {
   1008             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
   1009             int c = iter.codepoint;
   1010             int ce32 = trie.get(c);
   1011             if(!isBuilderContextCE32(ce32)) {
   1012                 throw new AssertionError("Impossible: No context data for c in contextChars.");
   1013             }
   1014             ConditionalCE32 cond = getConditionalCE32ForCE32(ce32);
   1015             ce32 = buildContext(cond);
   1016             trie.set(c, ce32);
   1017         }
   1018     }
   1019 
   1020     protected int buildContext(ConditionalCE32 head) {
   1021         // The list head must have no context.
   1022         assert(!head.hasContext());
   1023         // The list head must be followed by one or more nodes that all do have context.
   1024         assert(head.next >= 0);
   1025         CharsTrieBuilder prefixBuilder = new CharsTrieBuilder();
   1026         CharsTrieBuilder contractionBuilder = new CharsTrieBuilder();
   1027         for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) {
   1028             // After the list head, the prefix or suffix can be empty, but not both.
   1029             assert(cond == head || cond.hasContext());
   1030             int prefixLength = cond.prefixLength();
   1031             StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1);
   1032             String prefixString = prefix.toString();
   1033             // Collect all contraction suffixes for one prefix.
   1034             ConditionalCE32 firstCond = cond;
   1035             ConditionalCE32 lastCond = cond;
   1036             while(cond.next >= 0 &&
   1037                     (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) {
   1038                 lastCond = cond;
   1039             }
   1040             int ce32;
   1041             int suffixStart = prefixLength + 1;  // == prefix.length()
   1042             if(lastCond.context.length() == suffixStart) {
   1043                 // One prefix without contraction suffix.
   1044                 assert(firstCond == lastCond);
   1045                 ce32 = lastCond.ce32;
   1046                 cond = lastCond;
   1047             } else {
   1048                 // Build the contractions trie.
   1049                 contractionBuilder.clear();
   1050                 // Entry for an empty suffix, to be stored before the trie.
   1051                 int emptySuffixCE32 = Collation.NO_CE32;  // Will always be set to a real value.
   1052                 int flags = 0;
   1053                 if(firstCond.context.length() == suffixStart) {
   1054                     // There is a mapping for the prefix and the single character c. (p|c)
   1055                     // If no other suffix matches, then we return this value.
   1056                     emptySuffixCE32 = firstCond.ce32;
   1057                     cond = getConditionalCE32(firstCond.next);
   1058                 } else {
   1059                     // There is no mapping for the prefix and just the single character.
   1060                     // (There is no p|c, only p|cd, p|ce etc.)
   1061                     flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH;
   1062                     // When the prefix matches but none of the prefix-specific suffixes,
   1063                     // then we fall back to the mappings with the next-longest prefix,
   1064                     // and ultimately to mappings with no prefix.
   1065                     // Each fallback might be another set of contractions.
   1066                     // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
   1067                     // then in text "pch" we find the ch contraction.
   1068                     for(cond = head;; cond = getConditionalCE32(cond.next)) {
   1069                         int length = cond.prefixLength();
   1070                         if(length == prefixLength) { break; }
   1071                         if(cond.defaultCE32 != Collation.NO_CE32 &&
   1072                                 (length==0 || prefixString.regionMatches(
   1073                                         prefix.length() - length, cond.context, 1, length)
   1074                                         /* C++: prefix.endsWith(cond.context, 1, length) */)) {
   1075                             emptySuffixCE32 = cond.defaultCE32;
   1076                         }
   1077                     }
   1078                     cond = firstCond;
   1079                 }
   1080                 // Optimization: Set a flag when
   1081                 // the first character of every contraction suffix has lccc!=0.
   1082                 // Short-circuits contraction matching when a normal letter follows.
   1083                 flags |= Collation.CONTRACT_NEXT_CCC;
   1084                 // Add all of the non-empty suffixes into the contraction trie.
   1085                 for(;;) {
   1086                     String suffix = cond.context.substring(suffixStart);
   1087                     int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0));
   1088                     if(fcd16 <= 0xff) {
   1089                         flags &= ~Collation.CONTRACT_NEXT_CCC;
   1090                     }
   1091                     fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length()));
   1092                     if(fcd16 > 0xff) {
   1093                         // The last suffix character has lccc!=0, allowing for discontiguous contractions.
   1094                         flags |= Collation.CONTRACT_TRAILING_CCC;
   1095                     }
   1096                     contractionBuilder.add(suffix, cond.ce32);
   1097                     if(cond == lastCond) { break; }
   1098                     cond = getConditionalCE32(cond.next);
   1099                 }
   1100                 int index = addContextTrie(emptySuffixCE32, contractionBuilder);
   1101                 if(index > Collation.MAX_INDEX) {
   1102                     throw new IndexOutOfBoundsException("too many context-sensitive mappings");
   1103                     // BufferOverflowException is a better fit
   1104                     // but cannot be constructed with a message string.
   1105                 }
   1106                 ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags;
   1107             }
   1108             assert(cond == lastCond);
   1109             firstCond.defaultCE32 = ce32;
   1110             if(prefixLength == 0) {
   1111                 if(cond.next < 0) {
   1112                     // No non-empty prefixes, only contractions.
   1113                     return ce32;
   1114                 }
   1115             } else {
   1116                 prefix.delete(0, 1);  // Remove the length unit.
   1117                 prefix.reverse();
   1118                 prefixBuilder.add(prefix, ce32);
   1119                 if(cond.next < 0) { break; }
   1120             }
   1121         }
   1122         assert(head.defaultCE32 != Collation.NO_CE32);
   1123         int index = addContextTrie(head.defaultCE32, prefixBuilder);
   1124         if(index > Collation.MAX_INDEX) {
   1125             throw new IndexOutOfBoundsException("too many context-sensitive mappings");
   1126             // BufferOverflowException is a better fit
   1127             // but cannot be constructed with a message string.
   1128         }
   1129         return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index);
   1130     }
   1131 
   1132     protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) {
   1133         StringBuilder context = new StringBuilder();
   1134         context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32);
   1135         context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL));
   1136         int index = contexts.indexOf(context.toString());
   1137         if(index < 0) {
   1138             index = contexts.length();
   1139             contexts.append(context);
   1140         }
   1141         return index;
   1142     }
   1143 
   1144     protected void buildFastLatinTable(CollationData data) {
   1145         if(!fastLatinEnabled) { return; }
   1146 
   1147         fastLatinBuilder = new CollationFastLatinBuilder();
   1148         if(fastLatinBuilder.forData(data)) {
   1149             char[] header = fastLatinBuilder.getHeader();
   1150             char[] table = fastLatinBuilder.getTable();
   1151             if(base != null &&
   1152                     Arrays.equals(header, base.fastLatinTableHeader) &&
   1153                     Arrays.equals(table, base.fastLatinTable)) {
   1154                 // Same fast Latin table as in the base, use that one instead.
   1155                 fastLatinBuilder = null;
   1156                 header = base.fastLatinTableHeader;
   1157                 table = base.fastLatinTable;
   1158             }
   1159             data.fastLatinTableHeader = header;
   1160             data.fastLatinTable = table;
   1161         } else {
   1162             fastLatinBuilder = null;
   1163         }
   1164     }
   1165 
   1166     protected int getCEs(CharSequence s, int start, long ces[], int cesLength) {
   1167         if(collIter == null) {
   1168             collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl));
   1169             if(collIter == null) { return 0; }
   1170         }
   1171         return collIter.fetchCEs(s, start, ces, cesLength);
   1172     }
   1173 
   1174     protected static int jamoCpFromIndex(int i) {
   1175         // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27
   1176         if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; }
   1177         i -= Hangul.JAMO_L_COUNT;
   1178         if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; }
   1179         i -= Hangul.JAMO_V_COUNT;
   1180         // i < 27
   1181         return Hangul.JAMO_T_BASE + 1 + i;
   1182     }
   1183 
   1184     /**
   1185      * Build-time collation element and character iterator.
   1186      * Uses the runtime CollationIterator for fetching CEs for a string
   1187      * but reads from the builder's unfinished data structures.
   1188      * In particular, this class reads from the unfinished trie
   1189      * and has to avoid CollationIterator.nextCE() and redirect other
   1190      * calls to data.getCE32() and data.getCE32FromSupplementary().
   1191      *
   1192      * We do this so that we need not implement the collation algorithm
   1193      * again for the builder and make it behave exactly like the runtime code.
   1194      * That would be more difficult to test and maintain than this indirection.
   1195      *
   1196      * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
   1197      * so the data accesses from those code paths need not be modified.
   1198      *
   1199      * This class iterates directly over whole code points
   1200      * so that the CollationIterator does not need the finished trie
   1201      * for handling the LEAD_SURROGATE_TAG.
   1202      */
   1203     private static final class DataBuilderCollationIterator extends CollationIterator {
   1204         DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) {
   1205             super(newData, /*numeric=*/ false);
   1206             builder = b;
   1207             builderData = newData;
   1208             builderData.base = builder.base;
   1209             // Set all of the jamoCE32s[] to indirection CE32s.
   1210             for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
   1211                 int jamo = CollationDataBuilder.jamoCpFromIndex(j);
   1212                 jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) |
   1213                         CollationDataBuilder.IS_BUILDER_JAMO_CE32;
   1214             }
   1215             builderData.jamoCE32s = jamoCE32s;
   1216         }
   1217 
   1218         int fetchCEs(CharSequence str, int start, long ces[], int cesLength) {
   1219             // Set the pointers each time, in case they changed due to reallocation.
   1220             builderData.ce32s = builder.ce32s.getBuffer();
   1221             builderData.ces = builder.ce64s.getBuffer();
   1222             builderData.contexts = builder.contexts.toString();
   1223             // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32().
   1224             reset();
   1225             s = str;
   1226             pos = start;
   1227             while(pos < s.length()) {
   1228                 // No need to keep all CEs in the iterator buffer.
   1229                 clearCEs();
   1230                 int c = Character.codePointAt(s, pos);
   1231                 pos += Character.charCount(c);
   1232                 int ce32 = builder.trie.get(c);
   1233                 CollationData d;
   1234                 if(ce32 == Collation.FALLBACK_CE32) {
   1235                     d = builder.base;
   1236                     ce32 = builder.base.getCE32(c);
   1237                 } else {
   1238                     d = builderData;
   1239                 }
   1240                 appendCEsFromCE32(d, c, ce32, /*forward=*/ true);
   1241                 for(int i = 0; i < getCEsLength(); ++i) {
   1242                     long ce = getCE(i);
   1243                     if(ce != 0) {
   1244                         if(cesLength < Collation.MAX_EXPANSION_LENGTH) {
   1245                             ces[cesLength] = ce;
   1246                         }
   1247                         ++cesLength;
   1248                     }
   1249                 }
   1250             }
   1251             return cesLength;
   1252         }
   1253 
   1254         @Override
   1255         public void resetToOffset(int newOffset) {
   1256             reset();
   1257             pos = newOffset;
   1258         }
   1259 
   1260         @Override
   1261         public int getOffset() {
   1262             return pos;
   1263         }
   1264 
   1265         @Override
   1266         public int nextCodePoint() {
   1267             if(pos == s.length()) {
   1268                 return Collation.SENTINEL_CP;
   1269             }
   1270             int c = Character.codePointAt(s, pos);
   1271             pos += Character.charCount(c);
   1272             return c;
   1273         }
   1274 
   1275         @Override
   1276         public int previousCodePoint() {
   1277             if(pos == 0) {
   1278                 return Collation.SENTINEL_CP;
   1279             }
   1280             int c = Character.codePointBefore(s, pos);
   1281             pos -= Character.charCount(c);
   1282             return c;
   1283         }
   1284 
   1285         @Override
   1286         protected void forwardNumCodePoints(int num) {
   1287             pos = Character.offsetByCodePoints(s, pos, num);
   1288         }
   1289 
   1290         @Override
   1291         protected void backwardNumCodePoints(int num) {
   1292             pos = Character.offsetByCodePoints(s, pos, -num);
   1293         }
   1294 
   1295         @Override
   1296         protected int getDataCE32(int c) {
   1297             return builder.trie.get(c);
   1298         }
   1299 
   1300         @Override
   1301         protected int getCE32FromBuilderData(int ce32) {
   1302             assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG));
   1303             if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) {
   1304                 int jamo = Collation.indexFromCE32(ce32);
   1305                 return builder.trie.get(jamo);
   1306             } else {
   1307                 ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32);
   1308                 if(cond.builtCE32 == Collation.NO_CE32) {
   1309                     // Build the context-sensitive mappings into their runtime form and cache the result.
   1310                     try {
   1311                         cond.builtCE32 = builder.buildContext(cond);
   1312                     } catch(IndexOutOfBoundsException e) {
   1313                         builder.clearContexts();
   1314                         cond.builtCE32 = builder.buildContext(cond);
   1315                     }
   1316                     builderData.contexts = builder.contexts.toString();
   1317                 }
   1318                 return cond.builtCE32;
   1319             }
   1320         }
   1321 
   1322         protected final CollationDataBuilder builder;
   1323         protected final CollationData builderData;
   1324         protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
   1325         protected CharSequence s;
   1326         protected int pos;
   1327     }
   1328 
   1329     protected final boolean isMutable() {
   1330         // C++ tests !(trie == NULL || utrie2_isFrozen(trie))
   1331         // but Java Trie2Writable does not have an observable isFrozen() state.
   1332         return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen();
   1333     }
   1334 
   1335     /** @see Collation.BUILDER_DATA_TAG */
   1336     private static final int IS_BUILDER_JAMO_CE32 = 0x100;
   1337 
   1338     protected Normalizer2Impl nfcImpl;
   1339     protected CollationData base;
   1340     protected CollationSettings baseSettings;
   1341     protected Trie2Writable trie;
   1342     protected UVector32 ce32s;
   1343     protected UVector64 ce64s;
   1344     protected ArrayList<ConditionalCE32> conditionalCE32s;  // vector of ConditionalCE32
   1345     // Characters that have context (prefixes or contraction suffixes).
   1346     protected UnicodeSet contextChars = new UnicodeSet();
   1347     // Serialized UCharsTrie structures for finalized contexts.
   1348     protected StringBuilder contexts = new StringBuilder();
   1349     protected UnicodeSet unsafeBackwardSet = new UnicodeSet();
   1350     protected boolean modified;
   1351 
   1352     protected boolean fastLatinEnabled;
   1353     protected CollationFastLatinBuilder fastLatinBuilder;
   1354 
   1355     protected DataBuilderCollationIterator collIter;
   1356 }
   1357