Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2013-2015, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp
     10 *
     11 * C++ version created on: 2013aug09
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import android.icu.lang.UScript;
     18 import android.icu.text.Collator;
     19 import android.icu.util.CharsTrie;
     20 
     21 final class CollationFastLatinBuilder {
     22     // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0  // 0 or 1 or 2
     23 
     24     /**
     25      * Compare two signed long values as if they were unsigned.
     26      */
     27     private static final int compareInt64AsUnsigned(long a, long b) {
     28         a += 0x8000000000000000L;
     29         b += 0x8000000000000000L;
     30         if(a < b) {
     31             return -1;
     32         } else if(a > b) {
     33             return 1;
     34         } else {
     35             return 0;
     36         }
     37     }
     38 
     39     /**
     40      * Like Java Collections.binarySearch(List, String, Comparator).
     41      *
     42      * @return the index>=0 where the item was found,
     43      *         or the index<0 for inserting the string at ~index in sorted order
     44      */
     45     private static final int binarySearch(long[] list, int limit, long ce) {
     46         if (limit == 0) { return ~0; }
     47         int start = 0;
     48         for (;;) {
     49             int i = (int)(((long)start + (long)limit) / 2);
     50             int cmp = compareInt64AsUnsigned(ce, list[i]);
     51             if (cmp == 0) {
     52                 return i;
     53             } else if (cmp < 0) {
     54                 if (i == start) {
     55                     return ~start;  // insert ce before i
     56                 }
     57                 limit = i;
     58             } else {
     59                 if (i == start) {
     60                     return ~(start + 1);  // insert ce after i
     61                 }
     62                 start = i;
     63             }
     64         }
     65     }
     66 
     67     CollationFastLatinBuilder() {
     68         ce0 = 0;
     69         ce1 = 0;
     70         contractionCEs = new UVector64();
     71         uniqueCEs = new UVector64();
     72         miniCEs = null;
     73         firstDigitPrimary = 0;
     74         firstLatinPrimary = 0;
     75         lastLatinPrimary = 0;
     76         firstShortPrimary = 0;
     77         shortPrimaryOverflow = false;
     78         headerLength = 0;
     79     }
     80 
     81     boolean forData(CollationData data) {
     82         if(result.length() != 0) {  // This builder is not reusable.
     83             throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder");
     84         }
     85         if(!loadGroups(data)) { return false; }
     86 
     87         // Fast handling of digits.
     88         firstShortPrimary = firstDigitPrimary;
     89         getCEs(data);
     90         encodeUniqueCEs();
     91         if(shortPrimaryOverflow) {
     92             // Give digits long mini primaries,
     93             // so that there are more short primaries for letters.
     94             firstShortPrimary = firstLatinPrimary;
     95             resetCEs();
     96             getCEs(data);
     97             encodeUniqueCEs();
     98         }
     99         // Note: If we still have a short-primary overflow but not a long-primary overflow,
    100         // then we could calculate how many more long primaries would fit,
    101         // and set the firstShortPrimary to that many after the current firstShortPrimary,
    102         // and try again.
    103         // However, this might only benefit the en_US_POSIX tailoring,
    104         // and it is simpler to suppress building fast Latin data for it in genrb,
    105         // or by returning false here if shortPrimaryOverflow.
    106 
    107         boolean ok = !shortPrimaryOverflow;
    108         if(ok) {
    109             encodeCharCEs();
    110             encodeContractions();
    111         }
    112         contractionCEs.removeAllElements();  // might reduce heap memory usage
    113         uniqueCEs.removeAllElements();
    114         return ok;
    115     }
    116 
    117     // C++ returns one combined array with the contents of the result buffer.
    118     // Java returns two arrays (header & table) because we cannot use pointer arithmetic,
    119     // and we do not want to index into the table with an offset.
    120     char[] getHeader() {
    121         char[] resultArray = new char[headerLength];
    122         result.getChars(0, headerLength, resultArray, 0);
    123         return resultArray;
    124     }
    125 
    126     char[] getTable() {
    127         char[] resultArray = new char[result.length() - headerLength];
    128         result.getChars(headerLength, result.length(), resultArray, 0);
    129         return resultArray;
    130     }
    131 
    132     private boolean loadGroups(CollationData data) {
    133         headerLength = 1 + NUM_SPECIAL_GROUPS;
    134         int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
    135         result.append((char)r0);
    136         // The first few reordering groups should be special groups
    137         // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
    138         for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
    139             lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i);
    140             if(lastSpecialPrimaries[i] == 0) {
    141                 // missing data
    142                 return false;
    143             }
    144             result.append(0);  // reserve a slot for this group
    145         }
    146 
    147         firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT);
    148         firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN);
    149         lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN);
    150         if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
    151             // missing data
    152             return false;
    153         }
    154         return true;
    155     }
    156 
    157     private boolean inSameGroup(long p, long q) {
    158         // Both or neither need to be encoded as short primaries,
    159         // so that we can test only one and use the same bit mask.
    160         if(p >= firstShortPrimary) {
    161             return q >= firstShortPrimary;
    162         } else if(q >= firstShortPrimary) {
    163             return false;
    164         }
    165         // Both or neither must be potentially-variable,
    166         // so that we can test only one and determine if both are variable.
    167         long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
    168         if(p > lastVariablePrimary) {
    169             return q > lastVariablePrimary;
    170         } else if(q > lastVariablePrimary) {
    171             return false;
    172         }
    173         // Both will be encoded with long mini primaries.
    174         // They must be in the same special reordering group,
    175         // so that we can test only one and determine if both are variable.
    176         assert(p != 0 && q != 0);
    177         for(int i = 0;; ++i) {  // will terminate
    178             long lastPrimary = lastSpecialPrimaries[i];
    179             if(p <= lastPrimary) {
    180                 return q <= lastPrimary;
    181             } else if(q <= lastPrimary) {
    182                 return false;
    183             }
    184         }
    185     }
    186 
    187     private void resetCEs() {
    188         contractionCEs.removeAllElements();
    189         uniqueCEs.removeAllElements();
    190         shortPrimaryOverflow = false;
    191         result.setLength(headerLength);
    192     }
    193 
    194     private void getCEs(CollationData data) {
    195         int i = 0;
    196         for(char c = 0;; ++i, ++c) {
    197             if(c == CollationFastLatin.LATIN_LIMIT) {
    198                 c = CollationFastLatin.PUNCT_START;
    199             } else if(c == CollationFastLatin.PUNCT_LIMIT) {
    200                 break;
    201             }
    202             CollationData d;
    203             int ce32 = data.getCE32(c);
    204             if(ce32 == Collation.FALLBACK_CE32) {
    205                 d = data.base;
    206                 ce32 = d.getCE32(c);
    207             } else {
    208                 d = data;
    209             }
    210             if(getCEsFromCE32(d, c, ce32)) {
    211                 charCEs[i][0] = ce0;
    212                 charCEs[i][1] = ce1;
    213                 addUniqueCE(ce0);
    214                 addUniqueCE(ce1);
    215             } else {
    216                 // bail out for c
    217                 charCEs[i][0] = ce0 = Collation.NO_CE;
    218                 charCEs[i][1] = ce1 = 0;
    219             }
    220             if(c == 0 && !isContractionCharCE(ce0)) {
    221                 // Always map U+0000 to a contraction.
    222                 // Write a contraction list with only a default value if there is no real contraction.
    223                 assert(contractionCEs.isEmpty());
    224                 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
    225                 charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
    226                 charCEs[0][1] = 0;
    227             }
    228         }
    229         // Terminate the last contraction list.
    230         contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK);
    231     }
    232 
    233     private boolean getCEsFromCE32(CollationData data, int c, int ce32) {
    234         ce32 = data.getFinalCE32(ce32);
    235         ce1 = 0;
    236         if(Collation.isSimpleOrLongCE32(ce32)) {
    237             ce0 = Collation.ceFromCE32(ce32);
    238         } else {
    239             switch(Collation.tagFromCE32(ce32)) {
    240             case Collation.LATIN_EXPANSION_TAG:
    241                 ce0 = Collation.latinCE0FromCE32(ce32);
    242                 ce1 = Collation.latinCE1FromCE32(ce32);
    243                 break;
    244             case Collation.EXPANSION32_TAG: {
    245                 int index = Collation.indexFromCE32(ce32);
    246                 int length = Collation.lengthFromCE32(ce32);
    247                 if(length <= 2) {
    248                     ce0 = Collation.ceFromCE32(data.ce32s[index]);
    249                     if(length == 2) {
    250                         ce1 = Collation.ceFromCE32(data.ce32s[index + 1]);
    251                     }
    252                     break;
    253                 } else {
    254                     return false;
    255                 }
    256             }
    257             case Collation.EXPANSION_TAG: {
    258                 int index = Collation.indexFromCE32(ce32);
    259                 int length = Collation.lengthFromCE32(ce32);
    260                 if(length <= 2) {
    261                     ce0 = data.ces[index];
    262                     if(length == 2) {
    263                         ce1 = data.ces[index + 1];
    264                     }
    265                     break;
    266                 } else {
    267                     return false;
    268                 }
    269             }
    270             // Note: We could support PREFIX_TAG (assert c>=0)
    271             // by recursing on its default CE32 and checking that none of the prefixes starts
    272             // with a fast Latin character.
    273             // However, currently (2013) there are only the L-before-middle-dot
    274             // prefix mappings in the Latin range, and those would be rejected anyway.
    275             case Collation.CONTRACTION_TAG:
    276                 assert(c >= 0);
    277                 return getCEsFromContractionCE32(data, ce32);
    278             case Collation.OFFSET_TAG:
    279                 assert(c >= 0);
    280                 ce0 = data.getCEFromOffsetCE32(c, ce32);
    281                 break;
    282             default:
    283                 return false;
    284             }
    285         }
    286         // A mapping can be completely ignorable.
    287         if(ce0 == 0) { return ce1 == 0; }
    288         // We do not support an ignorable ce0 unless it is completely ignorable.
    289         long p0 = ce0 >>> 32;
    290         if(p0 == 0) { return false; }
    291         // We only support primaries up to the Latin script.
    292         if(p0 > lastLatinPrimary) { return false; }
    293         // We support non-common secondary and case weights only together with short primaries.
    294         int lower32_0 = (int)ce0;
    295         if(p0 < firstShortPrimary) {
    296             int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
    297             if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; }
    298         }
    299         // No below-common tertiary weights.
    300         if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
    301         if(ce1 != 0) {
    302             // Both primaries must be in the same group,
    303             // or both must get short mini primaries,
    304             // or a short-primary CE is followed by a secondary CE.
    305             // This is so that we can test the first primary and use the same mask for both,
    306             // and determine for both whether they are variable.
    307             long p1 = ce1 >>> 32;
    308             if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; }
    309             int lower32_1 = (int)ce1;
    310             // No tertiary CEs.
    311             if((lower32_1 >>> 16) == 0) { return false; }
    312             // We support non-common secondary and case weights
    313             // only for secondary CEs or together with short primaries.
    314             if(p1 != 0 && p1 < firstShortPrimary) {
    315                 int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
    316                 if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; }
    317             }
    318             // No below-common tertiary weights.
    319             if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
    320         }
    321         // No quaternary weights.
    322         if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; }
    323         return true;
    324     }
    325 
    326     private boolean getCEsFromContractionCE32(CollationData data, int ce32) {
    327         int trieIndex = Collation.indexFromCE32(ce32);
    328         ce32 = data.getCE32FromContexts(trieIndex);  // Default if no suffix match.
    329         // Since the original ce32 is not a prefix mapping,
    330         // the default ce32 must not be another contraction.
    331         assert(!Collation.isContractionCE32(ce32));
    332         int contractionIndex = contractionCEs.size();
    333         if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
    334             addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
    335         } else {
    336             // Bail out for c-without-contraction.
    337             addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0);
    338         }
    339         // Handle an encodable contraction unless the next contraction is too long
    340         // and starts with the same character.
    341         int prevX = -1;
    342         boolean addContraction = false;
    343         CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0);
    344         while(suffixes.hasNext()) {
    345             CharsTrie.Entry entry = suffixes.next();
    346             CharSequence suffix = entry.chars;
    347             int x = CollationFastLatin.getCharIndex(suffix.charAt(0));
    348             if(x < 0) { continue; }  // ignore anything but fast Latin text
    349             if(x == prevX) {
    350                 if(addContraction) {
    351                     // Bail out for all contractions starting with this character.
    352                     addContractionEntry(x, Collation.NO_CE, 0);
    353                     addContraction = false;
    354                 }
    355                 continue;
    356             }
    357             if(addContraction) {
    358                 addContractionEntry(prevX, ce0, ce1);
    359             }
    360             ce32 = entry.value;
    361             if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
    362                 addContraction = true;
    363             } else {
    364                 addContractionEntry(x, Collation.NO_CE, 0);
    365                 addContraction = false;
    366             }
    367             prevX = x;
    368         }
    369         if(addContraction) {
    370             addContractionEntry(prevX, ce0, ce1);
    371         }
    372         // Note: There might not be any fast Latin contractions, but
    373         // we need to enter contraction handling anyway so that we can bail out
    374         // when there is a non-fast-Latin character following.
    375         // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
    376         // following umlaut and bail out, rather than return the difference of Y vs. u.
    377         ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
    378         ce1 = 0;
    379         return true;
    380     }
    381 
    382     private void addContractionEntry(int x, long cce0, long cce1) {
    383         contractionCEs.addElement(x);
    384         contractionCEs.addElement(cce0);
    385         contractionCEs.addElement(cce1);
    386         addUniqueCE(cce0);
    387         addUniqueCE(cce1);
    388     }
    389 
    390     private void addUniqueCE(long ce) {
    391         if(ce == 0 || (ce >>> 32) == Collation.NO_CE_PRIMARY) { return; }
    392         ce &= ~(long)Collation.CASE_MASK;  // blank out case bits
    393         int i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
    394         if(i < 0) {
    395             uniqueCEs.insertElementAt(ce, ~i);
    396         }
    397     }
    398 
    399     private int getMiniCE(long ce) {
    400         ce &= ~(long)Collation.CASE_MASK;  // blank out case bits
    401         int index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
    402         assert(index >= 0);
    403         return miniCEs[index];
    404     }
    405 
    406     private void encodeUniqueCEs() {
    407         miniCEs = new char[uniqueCEs.size()];
    408         int group = 0;
    409         long lastGroupPrimary = lastSpecialPrimaries[group];
    410         // The lowest unique CE must be at least a secondary CE.
    411         assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0);
    412         long prevPrimary = 0;
    413         int prevSecondary = 0;
    414         int pri = 0;
    415         int sec = 0;
    416         int ter = CollationFastLatin.COMMON_TER;
    417         for(int i = 0; i < uniqueCEs.size(); ++i) {
    418             long ce = uniqueCEs.elementAti(i);
    419             // Note: At least one of the p/s/t weights changes from one unique CE to the next.
    420             // (uniqueCEs does not store case bits.)
    421             long p = ce >>> 32;
    422             if(p != prevPrimary) {
    423                 while(p > lastGroupPrimary) {
    424                     assert(pri <= CollationFastLatin.MAX_LONG);
    425                     // Set the group's header entry to the
    426                     // last "long primary" in or before the group.
    427                     result.setCharAt(1 + group, (char)pri);
    428                     if(++group < NUM_SPECIAL_GROUPS) {
    429                         lastGroupPrimary = lastSpecialPrimaries[group];
    430                     } else {
    431                         lastGroupPrimary = 0xffffffffL;
    432                         break;
    433                     }
    434                 }
    435                 if(p < firstShortPrimary) {
    436                     if(pri == 0) {
    437                         pri = CollationFastLatin.MIN_LONG;
    438                     } else if(pri < CollationFastLatin.MAX_LONG) {
    439                         pri += CollationFastLatin.LONG_INC;
    440                     } else {
    441     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
    442                         printf("long-primary overflow for %08x\n", p);
    443     #endif */
    444                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
    445                         continue;
    446                     }
    447                 } else {
    448                     if(pri < CollationFastLatin.MIN_SHORT) {
    449                         pri = CollationFastLatin.MIN_SHORT;
    450                     } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) {
    451                         // Reserve the highest primary weight for U+FFFF.
    452                         pri += CollationFastLatin.SHORT_INC;
    453                     } else {
    454     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
    455                         printf("short-primary overflow for %08x\n", p);
    456     #endif */
    457                         shortPrimaryOverflow = true;
    458                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
    459                         continue;
    460                     }
    461                 }
    462                 prevPrimary = p;
    463                 prevSecondary = Collation.COMMON_WEIGHT16;
    464                 sec = CollationFastLatin.COMMON_SEC;
    465                 ter = CollationFastLatin.COMMON_TER;
    466             }
    467             int lower32 = (int)ce;
    468             int s = lower32 >>> 16;
    469             if(s != prevSecondary) {
    470                 if(pri == 0) {
    471                     if(sec == 0) {
    472                         sec = CollationFastLatin.MIN_SEC_HIGH;
    473                     } else if(sec < CollationFastLatin.MAX_SEC_HIGH) {
    474                         sec += CollationFastLatin.SEC_INC;
    475                     } else {
    476                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
    477                         continue;
    478                     }
    479                     prevSecondary = s;
    480                     ter = CollationFastLatin.COMMON_TER;
    481                 } else if(s < Collation.COMMON_WEIGHT16) {
    482                     if(sec == CollationFastLatin.COMMON_SEC) {
    483                         sec = CollationFastLatin.MIN_SEC_BEFORE;
    484                     } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) {
    485                         sec += CollationFastLatin.SEC_INC;
    486                     } else {
    487                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
    488                         continue;
    489                     }
    490                 } else if(s == Collation.COMMON_WEIGHT16) {
    491                     sec = CollationFastLatin.COMMON_SEC;
    492                 } else {
    493                     if(sec < CollationFastLatin.MIN_SEC_AFTER) {
    494                         sec = CollationFastLatin.MIN_SEC_AFTER;
    495                     } else if(sec < CollationFastLatin.MAX_SEC_AFTER) {
    496                         sec += CollationFastLatin.SEC_INC;
    497                     } else {
    498                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
    499                         continue;
    500                     }
    501                 }
    502                 prevSecondary = s;
    503                 ter = CollationFastLatin.COMMON_TER;
    504             }
    505             assert((lower32 & Collation.CASE_MASK) == 0);  // blanked out in uniqueCEs
    506             int t = lower32 & Collation.ONLY_TERTIARY_MASK;
    507             if(t > Collation.COMMON_WEIGHT16) {
    508                 if(ter < CollationFastLatin.MAX_TER_AFTER) {
    509                     ++ter;
    510                 } else {
    511                     miniCEs[i] = CollationFastLatin.BAIL_OUT;
    512                     continue;
    513                 }
    514             }
    515             if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) {
    516                 assert(sec == CollationFastLatin.COMMON_SEC);
    517                 miniCEs[i] = (char)(pri | ter);
    518             } else {
    519                 miniCEs[i] = (char)(pri | sec | ter);
    520             }
    521         }
    522     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
    523         printf("last mini primary: %04x\n", pri);
    524     #endif */
    525     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
    526         for(int i = 0; i < uniqueCEs.size(); ++i) {
    527             long ce = uniqueCEs.elementAti(i);
    528             printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
    529         }
    530     #endif */
    531     }
    532 
    533     private void encodeCharCEs() {
    534         int miniCEsStart = result.length();
    535         for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
    536             result.append(0);  // initialize to completely ignorable
    537         }
    538         int indexBase = result.length();
    539         for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
    540             long ce = charCEs[i][0];
    541             if(isContractionCharCE(ce)) { continue; }  // defer contraction
    542             int miniCE = encodeTwoCEs(ce, charCEs[i][1]);
    543             if((miniCE >>> 16) > 0) {   // if ((unsigned)miniCE > 0xffff)
    544                 // Note: There is a chance that this new expansion is the same as a previous one,
    545                 // and if so, then we could reuse the other expansion.
    546                 // However, that seems unlikely.
    547                 int expansionIndex = result.length() - indexBase;
    548                 if(expansionIndex > CollationFastLatin.INDEX_MASK) {
    549                     miniCE = CollationFastLatin.BAIL_OUT;
    550                 } else {
    551                     result.append((char)(miniCE >> 16)).append((char)miniCE);
    552                     miniCE = CollationFastLatin.EXPANSION | expansionIndex;
    553                 }
    554             }
    555             result.setCharAt(miniCEsStart + i, (char)miniCE);
    556         }
    557     }
    558 
    559     private void encodeContractions() {
    560         // We encode all contraction lists so that the first word of a list
    561         // terminates the previous list, and we only need one additional terminator at the end.
    562         int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS;
    563         int firstContractionIndex = result.length();
    564         for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
    565             long ce = charCEs[i][0];
    566             if(!isContractionCharCE(ce)) { continue; }
    567             int contractionIndex = result.length() - indexBase;
    568             if(contractionIndex > CollationFastLatin.INDEX_MASK) {
    569                 result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT);
    570                 continue;
    571             }
    572             boolean firstTriple = true;
    573             for(int index = (int)ce & 0x7fffffff;; index += 3) {
    574                 long x = contractionCEs.elementAti(index);
    575                 if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; }
    576                 long cce0 = contractionCEs.elementAti(index + 1);
    577                 long cce1 = contractionCEs.elementAti(index + 2);
    578                 int miniCE = encodeTwoCEs(cce0, cce1);
    579                 if(miniCE == CollationFastLatin.BAIL_OUT) {
    580                     result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
    581                 } else if((miniCE >>> 16) == 0) {  // if ((unsigned)miniCE <= 0xffff)
    582                     result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
    583                     result.append((char)miniCE);
    584                 } else {
    585                     result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
    586                     result.append((char)(miniCE >> 16)).append((char)miniCE);
    587                 }
    588                 firstTriple = false;
    589             }
    590             // Note: There is a chance that this new contraction list is the same as a previous one,
    591             // and if so, then we could truncate the result and reuse the other list.
    592             // However, that seems unlikely.
    593             result.setCharAt(headerLength + i,
    594                             (char)(CollationFastLatin.CONTRACTION | contractionIndex));
    595         }
    596         if(result.length() > firstContractionIndex) {
    597             // Terminate the last contraction list.
    598             result.append((char)CollationFastLatin.CONTR_CHAR_MASK);
    599         }
    600     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
    601         printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
    602         puts("   header & below-digit groups map");
    603         int i = 0;
    604         for(; i < headerLength; ++i) {
    605             printf(" %04x", result[i]);
    606         }
    607         printf("\n   char mini CEs");
    608         assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0);
    609         for(; i < indexBase; i += 16) {
    610             int c = i - headerLength;
    611             if(c >= CollationFastLatin.LATIN_LIMIT) {
    612                 c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT;
    613             }
    614             printf("\n %04x:", c);
    615             for(int j = 0; j < 16; ++j) {
    616                 printf(" %04x", result[i + j]);
    617             }
    618         }
    619         printf("\n   expansions & contractions");
    620         for(; i < result.length(); ++i) {
    621             if((i - indexBase) % 16 == 0) { puts(""); }
    622             printf(" %04x", result[i]);
    623         }
    624         puts("");
    625     #endif */
    626     }
    627 
    628     private int encodeTwoCEs(long first, long second) {
    629         if(first == 0) {
    630             return 0;  // completely ignorable
    631         }
    632         if(first == Collation.NO_CE) {
    633             return CollationFastLatin.BAIL_OUT;
    634         }
    635         assert((first >>> 32) != Collation.NO_CE_PRIMARY);
    636 
    637         int miniCE = getMiniCE(first);
    638         if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; }
    639         if(miniCE >= CollationFastLatin.MIN_SHORT) {
    640             // Extract & copy the case bits.
    641             // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
    642             int c = (((int)first & Collation.CASE_MASK) >> (14 - 3));
    643             // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
    644             c += CollationFastLatin.LOWER_CASE;
    645             miniCE |= c;
    646         }
    647         if(second == 0) { return miniCE; }
    648 
    649         int miniCE1 = getMiniCE(second);
    650         if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; }
    651 
    652         int case1 = (int)second & Collation.CASE_MASK;
    653         if(miniCE >= CollationFastLatin.MIN_SHORT &&
    654                 (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) {
    655             // Try to combine the two mini CEs into one.
    656             int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK;
    657             int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK;
    658             if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 &&
    659                     ter1 == CollationFastLatin.COMMON_TER) {
    660                 // sec1>=sec_high implies pri1==0.
    661                 return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1;
    662             }
    663         }
    664 
    665         if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) {
    666             // Secondary CE, or a CE with a short primary, copy the case bits.
    667             case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE;
    668             miniCE1 |= case1;
    669         }
    670         return (miniCE << 16) | miniCE1;
    671     }
    672 
    673     private static boolean isContractionCharCE(long ce) {
    674         return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE;
    675     }
    676 
    677     // space, punct, symbol, currency (not digit)
    678     private static final int NUM_SPECIAL_GROUPS =
    679             Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1;
    680 
    681     private static final long CONTRACTION_FLAG = 0x80000000L;
    682 
    683     // temporary "buffer"
    684     private long ce0, ce1;
    685 
    686     private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2];
    687 
    688     private UVector64 contractionCEs;
    689     private UVector64 uniqueCEs;
    690 
    691     /** One 16-bit mini CE per unique CE. */
    692     private char[] miniCEs;
    693 
    694     // These are constant for a given root collator.
    695     long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS];
    696     private long firstDigitPrimary;
    697     private long firstLatinPrimary;
    698     private long lastLatinPrimary;
    699     // This determines the first normal primary weight which is mapped to
    700     // a short mini primary. It must be >=firstDigitPrimary.
    701     private long firstShortPrimary;
    702 
    703     private boolean shortPrimaryOverflow;
    704 
    705     private StringBuilder result = new StringBuilder();
    706     private int headerLength;
    707 }
    708