Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationbuilder.cpp
      9 *
     10 * (replaced the former ucol_bld.cpp)
     11 *
     12 * created on: 2013may06
     13 * created by: Markus W. Scherer
     14 */
     15 
     16 #ifdef DEBUG_COLLATION_BUILDER
     17 #include <stdio.h>
     18 #endif
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_COLLATION
     23 
     24 #include "unicode/caniter.h"
     25 #include "unicode/normalizer2.h"
     26 #include "unicode/tblcoll.h"
     27 #include "unicode/parseerr.h"
     28 #include "unicode/uchar.h"
     29 #include "unicode/ucol.h"
     30 #include "unicode/unistr.h"
     31 #include "unicode/usetiter.h"
     32 #include "unicode/utf16.h"
     33 #include "unicode/uversion.h"
     34 #include "cmemory.h"
     35 #include "collation.h"
     36 #include "collationbuilder.h"
     37 #include "collationdata.h"
     38 #include "collationdatabuilder.h"
     39 #include "collationfastlatin.h"
     40 #include "collationroot.h"
     41 #include "collationrootelements.h"
     42 #include "collationruleparser.h"
     43 #include "collationsettings.h"
     44 #include "collationtailoring.h"
     45 #include "collationweights.h"
     46 #include "normalizer2impl.h"
     47 #include "uassert.h"
     48 #include "ucol_imp.h"
     49 #include "utf16collationiterator.h"
     50 
     51 U_NAMESPACE_BEGIN
     52 
     53 namespace {
     54 
     55 class BundleImporter : public CollationRuleParser::Importer {
     56 public:
     57     BundleImporter() {}
     58     virtual ~BundleImporter();
     59     virtual void getRules(
     60             const char *localeID, const char *collationType,
     61             UnicodeString &rules,
     62             const char *&errorReason, UErrorCode &errorCode);
     63 };
     64 
     65 BundleImporter::~BundleImporter() {}
     66 
     67 void
     68 BundleImporter::getRules(
     69         const char *localeID, const char *collationType,
     70         UnicodeString &rules,
     71         const char *& /*errorReason*/, UErrorCode &errorCode) {
     72     CollationLoader::loadRules(localeID, collationType, rules, errorCode);
     73 }
     74 
     75 }  // namespace
     76 
     77 // RuleBasedCollator implementation ---------------------------------------- ***
     78 
     79 // These methods are here, rather than in rulebasedcollator.cpp,
     80 // for modularization:
     81 // Most code using Collator does not need to build a Collator from rules.
     82 // By moving these constructors and helper methods to a separate file,
     83 // most code will not have a static dependency on the builder code.
     84 
     85 RuleBasedCollator::RuleBasedCollator()
     86         : data(NULL),
     87           settings(NULL),
     88           tailoring(NULL),
     89           cacheEntry(NULL),
     90           validLocale(""),
     91           explicitlySetAttributes(0),
     92           actualLocaleIsSameAsValid(FALSE) {
     93 }
     94 
     95 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
     96         : data(NULL),
     97           settings(NULL),
     98           tailoring(NULL),
     99           cacheEntry(NULL),
    100           validLocale(""),
    101           explicitlySetAttributes(0),
    102           actualLocaleIsSameAsValid(FALSE) {
    103     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode);
    104 }
    105 
    106 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
    107                                      UErrorCode &errorCode)
    108         : data(NULL),
    109           settings(NULL),
    110           tailoring(NULL),
    111           cacheEntry(NULL),
    112           validLocale(""),
    113           explicitlySetAttributes(0),
    114           actualLocaleIsSameAsValid(FALSE) {
    115     internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode);
    116 }
    117 
    118 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
    119                                      UColAttributeValue decompositionMode,
    120                                      UErrorCode &errorCode)
    121         : data(NULL),
    122           settings(NULL),
    123           tailoring(NULL),
    124           cacheEntry(NULL),
    125           validLocale(""),
    126           explicitlySetAttributes(0),
    127           actualLocaleIsSameAsValid(FALSE) {
    128     internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode);
    129 }
    130 
    131 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
    132                                      ECollationStrength strength,
    133                                      UColAttributeValue decompositionMode,
    134                                      UErrorCode &errorCode)
    135         : data(NULL),
    136           settings(NULL),
    137           tailoring(NULL),
    138           cacheEntry(NULL),
    139           validLocale(""),
    140           explicitlySetAttributes(0),
    141           actualLocaleIsSameAsValid(FALSE) {
    142     internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode);
    143 }
    144 
    145 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
    146                                      UParseError &parseError, UnicodeString &reason,
    147                                      UErrorCode &errorCode)
    148         : data(NULL),
    149           settings(NULL),
    150           tailoring(NULL),
    151           cacheEntry(NULL),
    152           validLocale(""),
    153           explicitlySetAttributes(0),
    154           actualLocaleIsSameAsValid(FALSE) {
    155     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);
    156 }
    157 
    158 void
    159 RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
    160                                           int32_t strength,
    161                                           UColAttributeValue decompositionMode,
    162                                           UParseError *outParseError, UnicodeString *outReason,
    163                                           UErrorCode &errorCode) {
    164     const CollationTailoring *base = CollationRoot::getRoot(errorCode);
    165     if(U_FAILURE(errorCode)) { return; }
    166     if(outReason != NULL) { outReason->remove(); }
    167     CollationBuilder builder(base, errorCode);
    168     UVersionInfo noVersion = { 0, 0, 0, 0 };
    169     BundleImporter importer;
    170     LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,
    171                                                              &importer,
    172                                                              outParseError, errorCode));
    173     if(U_FAILURE(errorCode)) {
    174         const char *reason = builder.getErrorReason();
    175         if(reason != NULL && outReason != NULL) {
    176             *outReason = UnicodeString(reason, -1, US_INV);
    177         }
    178         return;
    179     }
    180     t->actualLocale.setToBogus();
    181     adoptTailoring(t.orphan(), errorCode);
    182     // Set attributes after building the collator,
    183     // to keep the default settings consistent with the rule string.
    184     if(strength != UCOL_DEFAULT) {
    185         setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);
    186     }
    187     if(decompositionMode != UCOL_DEFAULT) {
    188         setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
    189     }
    190 }
    191 
    192 // CollationBuilder implementation ----------------------------------------- ***
    193 
    194 // Some compilers don't care if constants are defined in the .cpp file.
    195 // MS Visual C++ does not like it, but gcc requires it. clang does not care.
    196 #ifndef _MSC_VER
    197 const int32_t CollationBuilder::HAS_BEFORE2;
    198 const int32_t CollationBuilder::HAS_BEFORE3;
    199 #endif
    200 
    201 CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
    202         : nfd(*Normalizer2::getNFDInstance(errorCode)),
    203           fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
    204           nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
    205           base(b),
    206           baseData(b->data),
    207           rootElements(b->data->rootElements, b->data->rootElementsLength),
    208           variableTop(0),
    209           dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
    210           errorReason(NULL),
    211           cesLength(0),
    212           rootPrimaryIndexes(errorCode), nodes(errorCode) {
    213     nfcImpl.ensureCanonIterData(errorCode);
    214     if(U_FAILURE(errorCode)) {
    215         errorReason = "CollationBuilder fields initialization failed";
    216         return;
    217     }
    218     if(dataBuilder == NULL) {
    219         errorCode = U_MEMORY_ALLOCATION_ERROR;
    220         return;
    221     }
    222     dataBuilder->initForTailoring(baseData, errorCode);
    223     if(U_FAILURE(errorCode)) {
    224         errorReason = "CollationBuilder initialization failed";
    225     }
    226 }
    227 
    228 CollationBuilder::~CollationBuilder() {
    229     delete dataBuilder;
    230 }
    231 
    232 CollationTailoring *
    233 CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
    234                                 const UVersionInfo rulesVersion,
    235                                 CollationRuleParser::Importer *importer,
    236                                 UParseError *outParseError,
    237                                 UErrorCode &errorCode) {
    238     if(U_FAILURE(errorCode)) { return NULL; }
    239     if(baseData->rootElements == NULL) {
    240         errorCode = U_MISSING_RESOURCE_ERROR;
    241         errorReason = "missing root elements data, tailoring not supported";
    242         return NULL;
    243     }
    244     LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));
    245     if(tailoring.isNull() || tailoring->isBogus()) {
    246         errorCode = U_MEMORY_ALLOCATION_ERROR;
    247         return NULL;
    248     }
    249     CollationRuleParser parser(baseData, errorCode);
    250     if(U_FAILURE(errorCode)) { return NULL; }
    251     // Note: This always bases &[last variable] and &[first regular]
    252     // on the root collator's maxVariable/variableTop.
    253     // If we wanted this to change after [maxVariable x], then we would keep
    254     // the tailoring.settings pointer here and read its variableTop when we need it.
    255     // See http://unicode.org/cldr/trac/ticket/6070
    256     variableTop = base->settings->variableTop;
    257     parser.setSink(this);
    258     parser.setImporter(importer);
    259     CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
    260     parser.parse(ruleString, ownedSettings, outParseError, errorCode);
    261     errorReason = parser.getErrorReason();
    262     if(U_FAILURE(errorCode)) { return NULL; }
    263     if(dataBuilder->hasMappings()) {
    264         makeTailoredCEs(errorCode);
    265         closeOverComposites(errorCode);
    266         finalizeCEs(errorCode);
    267         // Copy all of ASCII, and Latin-1 letters, into each tailoring.
    268         optimizeSet.add(0, 0x7f);
    269         optimizeSet.add(0xc0, 0xff);
    270         // Hangul is decomposed on the fly during collation,
    271         // and the tailoring data is always built with HANGUL_TAG specials.
    272         optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
    273         dataBuilder->optimize(optimizeSet, errorCode);
    274         tailoring->ensureOwnedData(errorCode);
    275         if(U_FAILURE(errorCode)) { return NULL; }
    276         if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
    277         dataBuilder->build(*tailoring->ownedData, errorCode);
    278         tailoring->builder = dataBuilder;
    279         dataBuilder = NULL;
    280     } else {
    281         tailoring->data = baseData;
    282     }
    283     if(U_FAILURE(errorCode)) { return NULL; }
    284     ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
    285         tailoring->data, ownedSettings,
    286         ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
    287     tailoring->rules = ruleString;
    288     tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
    289     tailoring->setVersion(base->version, rulesVersion);
    290     return tailoring.orphan();
    291 }
    292 
    293 void
    294 CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
    295                            const char *&parserErrorReason, UErrorCode &errorCode) {
    296     if(U_FAILURE(errorCode)) { return; }
    297     U_ASSERT(!str.isEmpty());
    298     if(str.charAt(0) == CollationRuleParser::POS_LEAD) {
    299         ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
    300         cesLength = 1;
    301         if(U_FAILURE(errorCode)) { return; }
    302         U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
    303     } else {
    304         // normal reset to a character or string
    305         UnicodeString nfdString = nfd.normalize(str, errorCode);
    306         if(U_FAILURE(errorCode)) {
    307             parserErrorReason = "normalizing the reset position";
    308             return;
    309         }
    310         cesLength = dataBuilder->getCEs(nfdString, ces, 0);
    311         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
    312             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    313             parserErrorReason = "reset position maps to too many collation elements (more than 31)";
    314             return;
    315         }
    316     }
    317     if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position
    318 
    319     // &[before strength]position
    320     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
    321     int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
    322     if(U_FAILURE(errorCode)) { return; }
    323 
    324     int64_t node = nodes.elementAti(index);
    325     // If the index is for a "weaker" node,
    326     // then skip backwards over this and further "weaker" nodes.
    327     while(strengthFromNode(node) > strength) {
    328         index = previousIndexFromNode(node);
    329         node = nodes.elementAti(index);
    330     }
    331 
    332     // Find or insert a node whose index we will put into a temporary CE.
    333     if(strengthFromNode(node) == strength && isTailoredNode(node)) {
    334         // Reset to just before this same-strength tailored node.
    335         index = previousIndexFromNode(node);
    336     } else if(strength == UCOL_PRIMARY) {
    337         // root primary node (has no previous index)
    338         uint32_t p = weight32FromNode(node);
    339         if(p == 0) {
    340             errorCode = U_UNSUPPORTED_ERROR;
    341             parserErrorReason = "reset primary-before ignorable not possible";
    342             return;
    343         }
    344         if(p <= rootElements.getFirstPrimary()) {
    345             // There is no primary gap between ignorables and the space-first-primary.
    346             errorCode = U_UNSUPPORTED_ERROR;
    347             parserErrorReason = "reset primary-before first non-ignorable not supported";
    348             return;
    349         }
    350         if(p == Collation::FIRST_TRAILING_PRIMARY) {
    351             // We do not support tailoring to an unassigned-implicit CE.
    352             errorCode = U_UNSUPPORTED_ERROR;
    353             parserErrorReason = "reset primary-before [first trailing] not supported";
    354             return;
    355         }
    356         p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
    357         index = findOrInsertNodeForPrimary(p, errorCode);
    358         // Go to the last node in this list:
    359         // Tailor after the last node between adjacent root nodes.
    360         for(;;) {
    361             node = nodes.elementAti(index);
    362             int32_t nextIndex = nextIndexFromNode(node);
    363             if(nextIndex == 0) { break; }
    364             index = nextIndex;
    365         }
    366     } else {
    367         // &[before 2] or &[before 3]
    368         index = findCommonNode(index, UCOL_SECONDARY);
    369         if(strength >= UCOL_TERTIARY) {
    370             index = findCommonNode(index, UCOL_TERTIARY);
    371         }
    372         // findCommonNode() stayed on the stronger node or moved to
    373         // an explicit common-weight node of the reset-before strength.
    374         node = nodes.elementAti(index);
    375         if(strengthFromNode(node) == strength) {
    376             // Found a same-strength node with an explicit weight.
    377             uint32_t weight16 = weight16FromNode(node);
    378             if(weight16 == 0) {
    379                 errorCode = U_UNSUPPORTED_ERROR;
    380                 if(strength == UCOL_SECONDARY) {
    381                     parserErrorReason = "reset secondary-before secondary ignorable not possible";
    382                 } else {
    383                     parserErrorReason = "reset tertiary-before completely ignorable not possible";
    384                 }
    385                 return;
    386             }
    387             U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
    388             // Reset to just before this node.
    389             // Insert the preceding same-level explicit weight if it is not there already.
    390             // Which explicit weight immediately precedes this one?
    391             weight16 = getWeight16Before(index, node, strength);
    392             // Does this preceding weight have a node?
    393             uint32_t previousWeight16;
    394             int32_t previousIndex = previousIndexFromNode(node);
    395             for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
    396                 node = nodes.elementAti(i);
    397                 int32_t previousStrength = strengthFromNode(node);
    398                 if(previousStrength < strength) {
    399                     U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
    400                     // Either the reset element has an above-common weight and
    401                     // the parent node provides the implied common weight,
    402                     // or the reset element has a weight<=common in the node
    403                     // right after the parent, and we need to insert the preceding weight.
    404                     previousWeight16 = Collation::COMMON_WEIGHT16;
    405                     break;
    406                 } else if(previousStrength == strength && !isTailoredNode(node)) {
    407                     previousWeight16 = weight16FromNode(node);
    408                     break;
    409                 }
    410                 // Skip weaker nodes and same-level tailored nodes.
    411             }
    412             if(previousWeight16 == weight16) {
    413                 // The preceding weight has a node,
    414                 // maybe with following weaker or tailored nodes.
    415                 // Reset to the last of them.
    416                 index = previousIndex;
    417             } else {
    418                 // Insert a node with the preceding weight, reset to that.
    419                 node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
    420                 index = insertNodeBetween(previousIndex, index, node, errorCode);
    421             }
    422         } else {
    423             // Found a stronger node with implied strength-common weight.
    424             uint32_t weight16 = getWeight16Before(index, node, strength);
    425             index = findOrInsertWeakNode(index, weight16, strength, errorCode);
    426         }
    427         // Strength of the temporary CE = strength of its reset position.
    428         // Code above raises an error if the before-strength is stronger.
    429         strength = ceStrength(ces[cesLength - 1]);
    430     }
    431     if(U_FAILURE(errorCode)) {
    432         parserErrorReason = "inserting reset position for &[before n]";
    433         return;
    434     }
    435     ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
    436 }
    437 
    438 uint32_t
    439 CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
    440     U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));
    441     // Collect the root CE weights if this node is for a root CE.
    442     // If it is not, then return the low non-primary boundary for a tailored CE.
    443     uint32_t t;
    444     if(strengthFromNode(node) == UCOL_TERTIARY) {
    445         t = weight16FromNode(node);
    446     } else {
    447         t = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
    448     }
    449     while(strengthFromNode(node) > UCOL_SECONDARY) {
    450         index = previousIndexFromNode(node);
    451         node = nodes.elementAti(index);
    452     }
    453     if(isTailoredNode(node)) {
    454         return Collation::BEFORE_WEIGHT16;
    455     }
    456     uint32_t s;
    457     if(strengthFromNode(node) == UCOL_SECONDARY) {
    458         s = weight16FromNode(node);
    459     } else {
    460         s = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
    461     }
    462     while(strengthFromNode(node) > UCOL_PRIMARY) {
    463         index = previousIndexFromNode(node);
    464         node = nodes.elementAti(index);
    465     }
    466     if(isTailoredNode(node)) {
    467         return Collation::BEFORE_WEIGHT16;
    468     }
    469     // [p, s, t] is a root CE. Return the preceding weight for the requested level.
    470     uint32_t p = weight32FromNode(node);
    471     uint32_t weight16;
    472     if(level == UCOL_SECONDARY) {
    473         weight16 = rootElements.getSecondaryBefore(p, s);
    474     } else {
    475         weight16 = rootElements.getTertiaryBefore(p, s, t);
    476         U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
    477     }
    478     return weight16;
    479 }
    480 
    481 int64_t
    482 CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
    483                                           const char *&parserErrorReason, UErrorCode &errorCode) {
    484     U_ASSERT(str.length() == 2);
    485     int64_t ce;
    486     int32_t strength = UCOL_PRIMARY;
    487     UBool isBoundary = FALSE;
    488     UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
    489     U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
    490     switch(pos) {
    491     case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
    492         // Quaternary CEs are not supported.
    493         // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
    494         return 0;
    495     case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
    496         return 0;
    497     case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
    498         // Look for a tailored tertiary node after [0, 0, 0].
    499         int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
    500         if(U_FAILURE(errorCode)) { return 0; }
    501         int64_t node = nodes.elementAti(index);
    502         if((index = nextIndexFromNode(node)) != 0) {
    503             node = nodes.elementAti(index);
    504             U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
    505             if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
    506                 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
    507             }
    508         }
    509         return rootElements.getFirstTertiaryCE();
    510         // No need to look for nodeHasAnyBefore() on a tertiary node.
    511     }
    512     case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
    513         ce = rootElements.getLastTertiaryCE();
    514         strength = UCOL_TERTIARY;
    515         break;
    516     case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
    517         // Look for a tailored secondary node after [0, 0, *].
    518         int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
    519         if(U_FAILURE(errorCode)) { return 0; }
    520         int64_t node = nodes.elementAti(index);
    521         while((index = nextIndexFromNode(node)) != 0) {
    522             node = nodes.elementAti(index);
    523             strength = strengthFromNode(node);
    524             if(strength < UCOL_SECONDARY) { break; }
    525             if(strength == UCOL_SECONDARY) {
    526                 if(isTailoredNode(node)) {
    527                     if(nodeHasBefore3(node)) {
    528                         index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
    529                         U_ASSERT(isTailoredNode(nodes.elementAti(index)));
    530                     }
    531                     return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
    532                 } else {
    533                     break;
    534                 }
    535             }
    536         }
    537         ce = rootElements.getFirstSecondaryCE();
    538         strength = UCOL_SECONDARY;
    539         break;
    540     }
    541     case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
    542         ce = rootElements.getLastSecondaryCE();
    543         strength = UCOL_SECONDARY;
    544         break;
    545     case CollationRuleParser::FIRST_VARIABLE:
    546         ce = rootElements.getFirstPrimaryCE();
    547         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
    548         break;
    549     case CollationRuleParser::LAST_VARIABLE:
    550         ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
    551         break;
    552     case CollationRuleParser::FIRST_REGULAR:
    553         ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
    554         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
    555         break;
    556     case CollationRuleParser::LAST_REGULAR:
    557         // Use the Hani-first-primary rather than the actual last "regular" CE before it,
    558         // for backward compatibility with behavior before the introduction of
    559         // script-first-primary CEs in the root collator.
    560         ce = rootElements.firstCEWithPrimaryAtLeast(
    561             baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
    562         break;
    563     case CollationRuleParser::FIRST_IMPLICIT:
    564         ce = baseData->getSingleCE(0x4e00, errorCode);
    565         break;
    566     case CollationRuleParser::LAST_IMPLICIT:
    567         // We do not support tailoring to an unassigned-implicit CE.
    568         errorCode = U_UNSUPPORTED_ERROR;
    569         parserErrorReason = "reset to [last implicit] not supported";
    570         return 0;
    571     case CollationRuleParser::FIRST_TRAILING:
    572         ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
    573         isBoundary = TRUE;  // trailing first primary (there is no mapping for it)
    574         break;
    575     case CollationRuleParser::LAST_TRAILING:
    576         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    577         parserErrorReason = "LDML forbids tailoring to U+FFFF";
    578         return 0;
    579     default:
    580         U_ASSERT(FALSE);
    581         return 0;
    582     }
    583 
    584     int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
    585     if(U_FAILURE(errorCode)) { return 0; }
    586     int64_t node = nodes.elementAti(index);
    587     if((pos & 1) == 0) {
    588         // even pos = [first xyz]
    589         if(!nodeHasAnyBefore(node) && isBoundary) {
    590             // A <group> first primary boundary is artificially added to FractionalUCA.txt.
    591             // It is reachable via its special contraction, but is not normally used.
    592             // Find the first character tailored after the boundary CE,
    593             // or the first real root CE after it.
    594             if((index = nextIndexFromNode(node)) != 0) {
    595                 // If there is a following node, then it must be tailored
    596                 // because there are no root CEs with a boundary primary
    597                 // and non-common secondary/tertiary weights.
    598                 node = nodes.elementAti(index);
    599                 U_ASSERT(isTailoredNode(node));
    600                 ce = tempCEFromIndexAndStrength(index, strength);
    601             } else {
    602                 U_ASSERT(strength == UCOL_PRIMARY);
    603                 uint32_t p = (uint32_t)(ce >> 32);
    604                 int32_t pIndex = rootElements.findPrimary(p);
    605                 UBool isCompressible = baseData->isCompressiblePrimary(p);
    606                 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
    607                 ce = Collation::makeCE(p);
    608                 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
    609                 if(U_FAILURE(errorCode)) { return 0; }
    610                 node = nodes.elementAti(index);
    611             }
    612         }
    613         if(nodeHasAnyBefore(node)) {
    614             // Get the first node that was tailored before this one at a weaker strength.
    615             if(nodeHasBefore2(node)) {
    616                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
    617                 node = nodes.elementAti(index);
    618             }
    619             if(nodeHasBefore3(node)) {
    620                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
    621             }
    622             U_ASSERT(isTailoredNode(nodes.elementAti(index)));
    623             ce = tempCEFromIndexAndStrength(index, strength);
    624         }
    625     } else {
    626         // odd pos = [last xyz]
    627         // Find the last node that was tailored after the [last xyz]
    628         // at a strength no greater than the position's strength.
    629         for(;;) {
    630             int32_t nextIndex = nextIndexFromNode(node);
    631             if(nextIndex == 0) { break; }
    632             int64_t nextNode = nodes.elementAti(nextIndex);
    633             if(strengthFromNode(nextNode) < strength) { break; }
    634             index = nextIndex;
    635             node = nextNode;
    636         }
    637         // Do not make a temporary CE for a root node.
    638         // This last node might be the node for the root CE itself,
    639         // or a node with a common secondary or tertiary weight.
    640         if(isTailoredNode(node)) {
    641             ce = tempCEFromIndexAndStrength(index, strength);
    642         }
    643     }
    644     return ce;
    645 }
    646 
    647 void
    648 CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
    649                               const UnicodeString &str, const UnicodeString &extension,
    650                               const char *&parserErrorReason, UErrorCode &errorCode) {
    651     if(U_FAILURE(errorCode)) { return; }
    652     UnicodeString nfdPrefix;
    653     if(!prefix.isEmpty()) {
    654         nfd.normalize(prefix, nfdPrefix, errorCode);
    655         if(U_FAILURE(errorCode)) {
    656             parserErrorReason = "normalizing the relation prefix";
    657             return;
    658         }
    659     }
    660     UnicodeString nfdString = nfd.normalize(str, errorCode);
    661     if(U_FAILURE(errorCode)) {
    662         parserErrorReason = "normalizing the relation string";
    663         return;
    664     }
    665 
    666     // The runtime code decomposes Hangul syllables on the fly,
    667     // with recursive processing but without making the Jamo pieces visible for matching.
    668     // It does not work with certain types of contextual mappings.
    669     int32_t nfdLength = nfdString.length();
    670     if(nfdLength >= 2) {
    671         UChar c = nfdString.charAt(0);
    672         if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {
    673             // While handling a Hangul syllable, contractions starting with Jamo L or V
    674             // would not see the following Jamo of that syllable.
    675             errorCode = U_UNSUPPORTED_ERROR;
    676             parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";
    677             return;
    678         }
    679         c = nfdString.charAt(nfdLength - 1);
    680         if(Hangul::isJamoL(c) ||
    681                 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {
    682             // A contraction ending with Jamo L or L+V would require
    683             // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
    684             // or decomposing a following Hangul syllable on the fly, during contraction matching.
    685             errorCode = U_UNSUPPORTED_ERROR;
    686             parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";
    687             return;
    688         }
    689         // A Hangul syllable completely inside a contraction is ok.
    690     }
    691     // Note: If there is a prefix, then the parser checked that
    692     // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
    693     // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
    694     // (While handling a Hangul syllable, prefixes on Jamo V or T
    695     // would not see the previous Jamo of that syllable.)
    696 
    697     if(strength != UCOL_IDENTICAL) {
    698         // Find the node index after which we insert the new tailored node.
    699         int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
    700         U_ASSERT(cesLength > 0);
    701         int64_t ce = ces[cesLength - 1];
    702         if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) {
    703             // There is no primary gap between ignorables and the space-first-primary.
    704             errorCode = U_UNSUPPORTED_ERROR;
    705             parserErrorReason = "tailoring primary after ignorables not supported";
    706             return;
    707         }
    708         if(strength == UCOL_QUATERNARY && ce == 0) {
    709             // The CE data structure does not support non-zero quaternary weights
    710             // on tertiary ignorables.
    711             errorCode = U_UNSUPPORTED_ERROR;
    712             parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";
    713             return;
    714         }
    715         // Insert the new tailored node.
    716         index = insertTailoredNodeAfter(index, strength, errorCode);
    717         if(U_FAILURE(errorCode)) {
    718             parserErrorReason = "modifying collation elements";
    719             return;
    720         }
    721         // Strength of the temporary CE:
    722         // The new relation may yield a stronger CE but not a weaker one.
    723         int32_t tempStrength = ceStrength(ce);
    724         if(strength < tempStrength) { tempStrength = strength; }
    725         ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
    726     }
    727 
    728     setCaseBits(nfdString, parserErrorReason, errorCode);
    729     if(U_FAILURE(errorCode)) { return; }
    730 
    731     int32_t cesLengthBeforeExtension = cesLength;
    732     if(!extension.isEmpty()) {
    733         UnicodeString nfdExtension = nfd.normalize(extension, errorCode);
    734         if(U_FAILURE(errorCode)) {
    735             parserErrorReason = "normalizing the relation extension";
    736             return;
    737         }
    738         cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);
    739         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
    740             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    741             parserErrorReason =
    742                 "extension string adds too many collation elements (more than 31 total)";
    743             return;
    744         }
    745     }
    746     uint32_t ce32 = Collation::UNASSIGNED_CE32;
    747     if((prefix != nfdPrefix || str != nfdString) &&
    748             !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
    749         // Map from the original input to the CEs.
    750         // We do this in case the canonical closure is incomplete,
    751         // so that it is possible to explicitly provide the missing mappings.
    752         ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
    753     }
    754     addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
    755     if(U_FAILURE(errorCode)) {
    756         parserErrorReason = "writing collation elements";
    757         return;
    758     }
    759     cesLength = cesLengthBeforeExtension;
    760 }
    761 
    762 int32_t
    763 CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
    764                                          UErrorCode &errorCode) {
    765     if(U_FAILURE(errorCode)) { return 0; }
    766     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);
    767 
    768     // Find the last CE that is at least as "strong" as the requested difference.
    769     // Note: Stronger is smaller (UCOL_PRIMARY=0).
    770     int64_t ce;
    771     for(;; --cesLength) {
    772         if(cesLength == 0) {
    773             ce = ces[0] = 0;
    774             cesLength = 1;
    775             break;
    776         } else {
    777             ce = ces[cesLength - 1];
    778         }
    779         if(ceStrength(ce) <= strength) { break; }
    780     }
    781 
    782     if(isTempCE(ce)) {
    783         // No need to findCommonNode() here for lower levels
    784         // because insertTailoredNodeAfter() will do that anyway.
    785         return indexFromTempCE(ce);
    786     }
    787 
    788     // root CE
    789     if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
    790         errorCode = U_UNSUPPORTED_ERROR;
    791         parserErrorReason = "tailoring relative to an unassigned code point not supported";
    792         return 0;
    793     }
    794     return findOrInsertNodeForRootCE(ce, strength, errorCode);
    795 }
    796 
    797 int32_t
    798 CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {
    799     if(U_FAILURE(errorCode)) { return 0; }
    800     U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);
    801 
    802     // Find or insert the node for each of the root CE's weights,
    803     // down to the requested level/strength.
    804     // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
    805     U_ASSERT((ce & 0xc0) == 0);
    806     int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);
    807     if(strength >= UCOL_SECONDARY) {
    808         uint32_t lower32 = (uint32_t)ce;
    809         index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
    810         if(strength >= UCOL_TERTIARY) {
    811             index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
    812                                          UCOL_TERTIARY, errorCode);
    813         }
    814     }
    815     return index;
    816 }
    817 
    818 namespace {
    819 
    820 /**
    821  * Like Java Collections.binarySearch(List, key, Comparator).
    822  *
    823  * @return the index>=0 where the item was found,
    824  *         or the index<0 for inserting the string at ~index in sorted order
    825  *         (index into rootPrimaryIndexes)
    826  */
    827 int32_t
    828 binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
    829                                const int64_t *nodes, uint32_t p) {
    830     if(length == 0) { return ~0; }
    831     int32_t start = 0;
    832     int32_t limit = length;
    833     for (;;) {
    834         int32_t i = (start + limit) / 2;
    835         int64_t node = nodes[rootPrimaryIndexes[i]];
    836         uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
    837         if (p == nodePrimary) {
    838             return i;
    839         } else if (p < nodePrimary) {
    840             if (i == start) {
    841                 return ~start;  // insert s before i
    842             }
    843             limit = i;
    844         } else {
    845             if (i == start) {
    846                 return ~(start + 1);  // insert s after i
    847             }
    848             start = i;
    849         }
    850     }
    851 }
    852 
    853 }  // namespace
    854 
    855 int32_t
    856 CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {
    857     if(U_FAILURE(errorCode)) { return 0; }
    858 
    859     int32_t rootIndex = binarySearchForRootPrimaryNode(
    860         rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
    861     if(rootIndex >= 0) {
    862         return rootPrimaryIndexes.elementAti(rootIndex);
    863     } else {
    864         // Start a new list of nodes with this primary.
    865         int32_t index = nodes.size();
    866         nodes.addElement(nodeFromWeight32(p), errorCode);
    867         rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);
    868         return index;
    869     }
    870 }
    871 
    872 int32_t
    873 CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
    874     if(U_FAILURE(errorCode)) { return 0; }
    875     U_ASSERT(0 <= index && index < nodes.size());
    876     U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);
    877 
    878     if(weight16 == Collation::COMMON_WEIGHT16) {
    879         return findCommonNode(index, level);
    880     }
    881 
    882     // If this will be the first below-common weight for the parent node,
    883     // then we will also need to insert a common weight after it.
    884     int64_t node = nodes.elementAti(index);
    885     U_ASSERT(strengthFromNode(node) < level);  // parent node is stronger
    886     if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
    887         int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
    888         if((node & hasThisLevelBefore) == 0) {
    889             // The parent node has an implied level-common weight.
    890             int64_t commonNode =
    891                 nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
    892             if(level == UCOL_SECONDARY) {
    893                 // Move the HAS_BEFORE3 flag from the parent node
    894                 // to the new secondary common node.
    895                 commonNode |= node & HAS_BEFORE3;
    896                 node &= ~(int64_t)HAS_BEFORE3;
    897             }
    898             nodes.setElementAt(node | hasThisLevelBefore, index);
    899             // Insert below-common-weight node.
    900             int32_t nextIndex = nextIndexFromNode(node);
    901             node = nodeFromWeight16(weight16) | nodeFromStrength(level);
    902             index = insertNodeBetween(index, nextIndex, node, errorCode);
    903             // Insert common-weight node.
    904             insertNodeBetween(index, nextIndex, commonNode, errorCode);
    905             // Return index of below-common-weight node.
    906             return index;
    907         }
    908     }
    909 
    910     // Find the root CE's weight for this level.
    911     // Postpone insertion if not found:
    912     // Insert the new root node before the next stronger node,
    913     // or before the next root node with the same strength and a larger weight.
    914     int32_t nextIndex;
    915     while((nextIndex = nextIndexFromNode(node)) != 0) {
    916         node = nodes.elementAti(nextIndex);
    917         int32_t nextStrength = strengthFromNode(node);
    918         if(nextStrength <= level) {
    919             // Insert before a stronger node.
    920             if(nextStrength < level) { break; }
    921             // nextStrength == level
    922             if(!isTailoredNode(node)) {
    923                 uint32_t nextWeight16 = weight16FromNode(node);
    924                 if(nextWeight16 == weight16) {
    925                     // Found the node for the root CE up to this level.
    926                     return nextIndex;
    927                 }
    928                 // Insert before a node with a larger same-strength weight.
    929                 if(nextWeight16 > weight16) { break; }
    930             }
    931         }
    932         // Skip the next node.
    933         index = nextIndex;
    934     }
    935     node = nodeFromWeight16(weight16) | nodeFromStrength(level);
    936     return insertNodeBetween(index, nextIndex, node, errorCode);
    937 }
    938 
    939 int32_t
    940 CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
    941     if(U_FAILURE(errorCode)) { return 0; }
    942     U_ASSERT(0 <= index && index < nodes.size());
    943     if(strength >= UCOL_SECONDARY) {
    944         index = findCommonNode(index, UCOL_SECONDARY);
    945         if(strength >= UCOL_TERTIARY) {
    946             index = findCommonNode(index, UCOL_TERTIARY);
    947         }
    948     }
    949     // Postpone insertion:
    950     // Insert the new node before the next one with a strength at least as strong.
    951     int64_t node = nodes.elementAti(index);
    952     int32_t nextIndex;
    953     while((nextIndex = nextIndexFromNode(node)) != 0) {
    954         node = nodes.elementAti(nextIndex);
    955         if(strengthFromNode(node) <= strength) { break; }
    956         // Skip the next node which has a weaker (larger) strength than the new one.
    957         index = nextIndex;
    958     }
    959     node = IS_TAILORED | nodeFromStrength(strength);
    960     return insertNodeBetween(index, nextIndex, node, errorCode);
    961 }
    962 
    963 int32_t
    964 CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
    965                                     UErrorCode &errorCode) {
    966     if(U_FAILURE(errorCode)) { return 0; }
    967     U_ASSERT(previousIndexFromNode(node) == 0);
    968     U_ASSERT(nextIndexFromNode(node) == 0);
    969     U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
    970     // Append the new node and link it to the existing nodes.
    971     int32_t newIndex = nodes.size();
    972     node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
    973     nodes.addElement(node, errorCode);
    974     if(U_FAILURE(errorCode)) { return 0; }
    975     // nodes[index].nextIndex = newIndex
    976     node = nodes.elementAti(index);
    977     nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
    978     // nodes[nextIndex].previousIndex = newIndex
    979     if(nextIndex != 0) {
    980         node = nodes.elementAti(nextIndex);
    981         nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
    982     }
    983     return newIndex;
    984 }
    985 
    986 int32_t
    987 CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
    988     U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
    989     int64_t node = nodes.elementAti(index);
    990     if(strengthFromNode(node) >= strength) {
    991         // The current node is no stronger.
    992         return index;
    993     }
    994     if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
    995         // The current node implies the strength-common weight.
    996         return index;
    997     }
    998     index = nextIndexFromNode(node);
    999     node = nodes.elementAti(index);
   1000     U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
   1001             weight16FromNode(node) < Collation::COMMON_WEIGHT16);
   1002     // Skip to the explicit common node.
   1003     do {
   1004         index = nextIndexFromNode(node);
   1005         node = nodes.elementAti(index);
   1006         U_ASSERT(strengthFromNode(node) >= strength);
   1007     } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
   1008             weight16FromNode(node) < Collation::COMMON_WEIGHT16);
   1009     U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
   1010     return index;
   1011 }
   1012 
   1013 void
   1014 CollationBuilder::setCaseBits(const UnicodeString &nfdString,
   1015                               const char *&parserErrorReason, UErrorCode &errorCode) {
   1016     if(U_FAILURE(errorCode)) { return; }
   1017     int32_t numTailoredPrimaries = 0;
   1018     for(int32_t i = 0; i < cesLength; ++i) {
   1019         if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
   1020     }
   1021     // We should not be able to get too many case bits because
   1022     // cesLength<=31==MAX_EXPANSION_LENGTH.
   1023     // 31 pairs of case bits fit into an int64_t without setting its sign bit.
   1024     U_ASSERT(numTailoredPrimaries <= 31);
   1025 
   1026     int64_t cases = 0;
   1027     if(numTailoredPrimaries > 0) {
   1028         const UChar *s = nfdString.getBuffer();
   1029         UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length());
   1030         int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;
   1031         if(U_FAILURE(errorCode)) {
   1032             parserErrorReason = "fetching root CEs for tailored string";
   1033             return;
   1034         }
   1035         U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);
   1036 
   1037         uint32_t lastCase = 0;
   1038         int32_t numBasePrimaries = 0;
   1039         for(int32_t i = 0; i < baseCEsLength; ++i) {
   1040             int64_t ce = baseCEs.getCE(i);
   1041             if((ce >> 32) != 0) {
   1042                 ++numBasePrimaries;
   1043                 uint32_t c = ((uint32_t)ce >> 14) & 3;
   1044                 U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
   1045                 if(numBasePrimaries < numTailoredPrimaries) {
   1046                     cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);
   1047                 } else if(numBasePrimaries == numTailoredPrimaries) {
   1048                     lastCase = c;
   1049                 } else if(c != lastCase) {
   1050                     // There are more base primary CEs than tailored primaries.
   1051                     // Set mixed case if the case bits of the remainder differ.
   1052                     lastCase = 1;
   1053                     // Nothing more can change.
   1054                     break;
   1055                 }
   1056             }
   1057         }
   1058         if(numBasePrimaries >= numTailoredPrimaries) {
   1059             cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);
   1060         }
   1061     }
   1062 
   1063     for(int32_t i = 0; i < cesLength; ++i) {
   1064         int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits
   1065         int32_t strength = ceStrength(ce);
   1066         if(strength == UCOL_PRIMARY) {
   1067             ce |= (cases & 3) << 14;
   1068             cases >>= 2;
   1069         } else if(strength == UCOL_TERTIARY) {
   1070             // Tertiary CEs must have uppercase bits.
   1071             // See the LDML spec, and comments in class CollationCompare.
   1072             ce |= 0x8000;
   1073         }
   1074         // Tertiary ignorable CEs must have 0 case bits.
   1075         // We set 0 case bits for secondary CEs too
   1076         // since currently only U+0345 is cased and maps to a secondary CE,
   1077         // and it is lowercase. Other secondaries are uncased.
   1078         // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
   1079         ces[i] = ce;
   1080     }
   1081 }
   1082 
   1083 void
   1084 CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
   1085                                        UErrorCode &errorCode) {
   1086     if(U_FAILURE(errorCode)) { return; }
   1087     dataBuilder->suppressContractions(set, errorCode);
   1088     if(U_FAILURE(errorCode)) {
   1089         parserErrorReason = "application of [suppressContractions [set]] failed";
   1090     }
   1091 }
   1092 
   1093 void
   1094 CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,
   1095                            UErrorCode &errorCode) {
   1096     if(U_FAILURE(errorCode)) { return; }
   1097     optimizeSet.addAll(set);
   1098 }
   1099 
   1100 uint32_t
   1101 CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
   1102                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
   1103                                  UErrorCode &errorCode) {
   1104     // Map from the NFD input to the CEs.
   1105     ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
   1106     ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
   1107     addTailComposites(nfdPrefix, nfdString, errorCode);
   1108     return ce32;
   1109 }
   1110 
   1111 uint32_t
   1112 CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
   1113                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
   1114                                  UErrorCode &errorCode) {
   1115     if(U_FAILURE(errorCode)) { return ce32; }
   1116 
   1117     // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
   1118     if(nfdPrefix.isEmpty()) {
   1119         CanonicalIterator stringIter(nfdString, errorCode);
   1120         if(U_FAILURE(errorCode)) { return ce32; }
   1121         UnicodeString prefix;
   1122         for(;;) {
   1123             UnicodeString str = stringIter.next();
   1124             if(str.isBogus()) { break; }
   1125             if(ignoreString(str, errorCode) || str == nfdString) { continue; }
   1126             ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
   1127             if(U_FAILURE(errorCode)) { return ce32; }
   1128         }
   1129     } else {
   1130         CanonicalIterator prefixIter(nfdPrefix, errorCode);
   1131         CanonicalIterator stringIter(nfdString, errorCode);
   1132         if(U_FAILURE(errorCode)) { return ce32; }
   1133         for(;;) {
   1134             UnicodeString prefix = prefixIter.next();
   1135             if(prefix.isBogus()) { break; }
   1136             if(ignorePrefix(prefix, errorCode)) { continue; }
   1137             UBool samePrefix = prefix == nfdPrefix;
   1138             for(;;) {
   1139                 UnicodeString str = stringIter.next();
   1140                 if(str.isBogus()) { break; }
   1141                 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
   1142                 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
   1143                 if(U_FAILURE(errorCode)) { return ce32; }
   1144             }
   1145             stringIter.reset();
   1146         }
   1147     }
   1148     return ce32;
   1149 }
   1150 
   1151 void
   1152 CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
   1153                                     UErrorCode &errorCode) {
   1154     if(U_FAILURE(errorCode)) { return; }
   1155 
   1156     // Look for the last starter in the NFD string.
   1157     UChar32 lastStarter;
   1158     int32_t indexAfterLastStarter = nfdString.length();
   1159     for(;;) {
   1160         if(indexAfterLastStarter == 0) { return; }  // no starter at all
   1161         lastStarter = nfdString.char32At(indexAfterLastStarter - 1);
   1162         if(nfd.getCombiningClass(lastStarter) == 0) { break; }
   1163         indexAfterLastStarter -= U16_LENGTH(lastStarter);
   1164     }
   1165     // No closure to Hangul syllables since we decompose them on the fly.
   1166     if(Hangul::isJamoL(lastStarter)) { return; }
   1167 
   1168     // Are there any composites whose decomposition starts with the lastStarter?
   1169     // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
   1170     // We might find some more equivalent mappings here if it did.
   1171     UnicodeSet composites;
   1172     if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
   1173 
   1174     UnicodeString decomp;
   1175     UnicodeString newNFDString, newString;
   1176     int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
   1177     UnicodeSetIterator iter(composites);
   1178     while(iter.next()) {
   1179         U_ASSERT(!iter.isString());
   1180         UChar32 composite = iter.getCodepoint();
   1181         nfd.getDecomposition(composite, decomp);
   1182         if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
   1183                                      newNFDString, newString, errorCode)) {
   1184             continue;
   1185         }
   1186         int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);
   1187         if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {
   1188             // Ignore mappings that we cannot store.
   1189             continue;
   1190         }
   1191         // Note: It is possible that the newCEs do not make use of the mapping
   1192         // for which we are adding the tail composites, in which case we might be adding
   1193         // unnecessary mappings.
   1194         // For example, when we add tail composites for ae^ (^=combining circumflex),
   1195         // UCA discontiguous-contraction matching does not find any matches
   1196         // for ae_^ (_=any combining diacritic below) *unless* there is also
   1197         // a contraction mapping for ae.
   1198         // Thus, if there is no ae contraction, then the ae^ mapping is ignored
   1199         // while fetching the newCEs for ae_^.
   1200         // TODO: Try to detect this effectively.
   1201         // (Alternatively, print a warning when prefix contractions are missing.)
   1202 
   1203         // We do not need an explicit mapping for the NFD strings.
   1204         // It is fine if the NFD input collates like this via a sequence of mappings.
   1205         // It also saves a little bit of space, and may reduce the set of characters with contractions.
   1206         uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
   1207                                        newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);
   1208         if(ce32 != Collation::UNASSIGNED_CE32) {
   1209             // was different, was added
   1210             addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
   1211         }
   1212     }
   1213 }
   1214 
   1215 UBool
   1216 CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
   1217                                            int32_t indexAfterLastStarter,
   1218                                            UChar32 composite, const UnicodeString &decomp,
   1219                                            UnicodeString &newNFDString, UnicodeString &newString,
   1220                                            UErrorCode &errorCode) const {
   1221     if(U_FAILURE(errorCode)) { return FALSE; }
   1222     U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
   1223     int32_t lastStarterLength = decomp.moveIndex32(0, 1);
   1224     if(lastStarterLength == decomp.length()) {
   1225         // Singleton decompositions should be found by addWithClosure()
   1226         // and the CanonicalIterator, so we can ignore them here.
   1227         return FALSE;
   1228     }
   1229     if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
   1230                          decomp, lastStarterLength, 0x7fffffff) == 0) {
   1231         // same strings, nothing new to be found here
   1232         return FALSE;
   1233     }
   1234 
   1235     // Make new FCD strings that combine a composite, or its decomposition,
   1236     // into the nfdString's last starter and the combining marks following it.
   1237     // Make an NFD version, and a version with the composite.
   1238     newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
   1239     newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);
   1240 
   1241     // The following is related to discontiguous contraction matching,
   1242     // but builds only FCD strings (or else returns FALSE).
   1243     int32_t sourceIndex = indexAfterLastStarter;
   1244     int32_t decompIndex = lastStarterLength;
   1245     // Small optimization: We keep the source character across loop iterations
   1246     // because we do not always consume it,
   1247     // and then need not fetch it again nor look up its combining class again.
   1248     UChar32 sourceChar = U_SENTINEL;
   1249     // The cc variables need to be declared before the loop so that at the end
   1250     // they are set to the last combining classes seen.
   1251     uint8_t sourceCC = 0;
   1252     uint8_t decompCC = 0;
   1253     for(;;) {
   1254         if(sourceChar < 0) {
   1255             if(sourceIndex >= nfdString.length()) { break; }
   1256             sourceChar = nfdString.char32At(sourceIndex);
   1257             sourceCC = nfd.getCombiningClass(sourceChar);
   1258             U_ASSERT(sourceCC != 0);
   1259         }
   1260         // We consume a decomposition character in each iteration.
   1261         if(decompIndex >= decomp.length()) { break; }
   1262         UChar32 decompChar = decomp.char32At(decompIndex);
   1263         decompCC = nfd.getCombiningClass(decompChar);
   1264         // Compare the two characters and their combining classes.
   1265         if(decompCC == 0) {
   1266             // Unable to merge because the source contains a non-zero combining mark
   1267             // but the composite's decomposition contains another starter.
   1268             // The strings would not be equivalent.
   1269             return FALSE;
   1270         } else if(sourceCC < decompCC) {
   1271             // Composite + sourceChar would not be FCD.
   1272             return FALSE;
   1273         } else if(decompCC < sourceCC) {
   1274             newNFDString.append(decompChar);
   1275             decompIndex += U16_LENGTH(decompChar);
   1276         } else if(decompChar != sourceChar) {
   1277             // Blocked because same combining class.
   1278             return FALSE;
   1279         } else {  // match: decompChar == sourceChar
   1280             newNFDString.append(decompChar);
   1281             decompIndex += U16_LENGTH(decompChar);
   1282             sourceIndex += U16_LENGTH(decompChar);
   1283             sourceChar = U_SENTINEL;
   1284         }
   1285     }
   1286     // We are at the end of at least one of the two inputs.
   1287     if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
   1288         if(sourceCC < decompCC) {
   1289             // Appending the next source character to the composite would not be FCD.
   1290             return FALSE;
   1291         }
   1292         newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
   1293         newString.append(nfdString, sourceIndex, 0x7fffffff);
   1294     } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
   1295         newNFDString.append(decomp, decompIndex, 0x7fffffff);
   1296     }
   1297     U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
   1298     U_ASSERT(fcd.isNormalized(newString, errorCode));
   1299     U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
   1300     return TRUE;
   1301 }
   1302 
   1303 UBool
   1304 CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
   1305     // Do not map non-FCD prefixes.
   1306     return !isFCD(s, errorCode);
   1307 }
   1308 
   1309 UBool
   1310 CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
   1311     // Do not map non-FCD strings.
   1312     // Do not map strings that start with Hangul syllables: We decompose those on the fly.
   1313     return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
   1314 }
   1315 
   1316 UBool
   1317 CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
   1318     return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
   1319 }
   1320 
   1321 void
   1322 CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
   1323     UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
   1324     if(U_FAILURE(errorCode)) { return; }
   1325     // Hangul is decomposed on the fly during collation.
   1326     composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
   1327     UnicodeString prefix;  // empty
   1328     UnicodeString nfdString;
   1329     UnicodeSetIterator iter(composites);
   1330     while(iter.next()) {
   1331         U_ASSERT(!iter.isString());
   1332         nfd.getDecomposition(iter.getCodepoint(), nfdString);
   1333         cesLength = dataBuilder->getCEs(nfdString, ces, 0);
   1334         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
   1335             // Too many CEs from the decomposition (unusual), ignore this composite.
   1336             // We could add a capacity parameter to getCEs() and reallocate if necessary.
   1337             // However, this can only really happen in contrived cases.
   1338             continue;
   1339         }
   1340         const UnicodeString &composite(iter.getString());
   1341         addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
   1342     }
   1343 }
   1344 
   1345 uint32_t
   1346 CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
   1347                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
   1348                                  UErrorCode &errorCode) {
   1349     if(U_FAILURE(errorCode)) { return ce32; }
   1350     int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];
   1351     int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);
   1352     if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
   1353         if(ce32 == Collation::UNASSIGNED_CE32) {
   1354             ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);
   1355         }
   1356         dataBuilder->addCE32(prefix, str, ce32, errorCode);
   1357     }
   1358     return ce32;
   1359 }
   1360 
   1361 UBool
   1362 CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
   1363                           const int64_t ces2[], int32_t ces2Length) {
   1364     if(ces1Length != ces2Length) {
   1365         return FALSE;
   1366     }
   1367     U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
   1368     for(int32_t i = 0; i < ces1Length; ++i) {
   1369         if(ces1[i] != ces2[i]) { return FALSE; }
   1370     }
   1371     return TRUE;
   1372 }
   1373 
   1374 #ifdef DEBUG_COLLATION_BUILDER
   1375 
   1376 uint32_t
   1377 alignWeightRight(uint32_t w) {
   1378     if(w != 0) {
   1379         while((w & 0xff) == 0) { w >>= 8; }
   1380     }
   1381     return w;
   1382 }
   1383 
   1384 #endif
   1385 
   1386 void
   1387 CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
   1388     if(U_FAILURE(errorCode)) { return; }
   1389 
   1390     CollationWeights primaries, secondaries, tertiaries;
   1391     int64_t *nodesArray = nodes.getBuffer();
   1392 #ifdef DEBUG_COLLATION_BUILDER
   1393         puts("\nCollationBuilder::makeTailoredCEs()");
   1394 #endif
   1395 
   1396     for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
   1397         int32_t i = rootPrimaryIndexes.elementAti(rpi);
   1398         int64_t node = nodesArray[i];
   1399         uint32_t p = weight32FromNode(node);
   1400         uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;
   1401         uint32_t t = s;
   1402         uint32_t q = 0;
   1403         UBool pIsTailored = FALSE;
   1404         UBool sIsTailored = FALSE;
   1405         UBool tIsTailored = FALSE;
   1406 #ifdef DEBUG_COLLATION_BUILDER
   1407         printf("\nprimary     %lx\n", (long)alignWeightRight(p));
   1408 #endif
   1409         int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
   1410         int32_t nextIndex = nextIndexFromNode(node);
   1411         while(nextIndex != 0) {
   1412             i = nextIndex;
   1413             node = nodesArray[i];
   1414             nextIndex = nextIndexFromNode(node);
   1415             int32_t strength = strengthFromNode(node);
   1416             if(strength == UCOL_QUATERNARY) {
   1417                 U_ASSERT(isTailoredNode(node));
   1418 #ifdef DEBUG_COLLATION_BUILDER
   1419                 printf("      quat+     ");
   1420 #endif
   1421                 if(q == 3) {
   1422                     errorCode = U_BUFFER_OVERFLOW_ERROR;
   1423                     errorReason = "quaternary tailoring gap too small";
   1424                     return;
   1425                 }
   1426                 ++q;
   1427             } else {
   1428                 if(strength == UCOL_TERTIARY) {
   1429                     if(isTailoredNode(node)) {
   1430 #ifdef DEBUG_COLLATION_BUILDER
   1431                         printf("    ter+        ");
   1432 #endif
   1433                         if(!tIsTailored) {
   1434                             // First tailored tertiary node for [p, s].
   1435                             int32_t tCount = countTailoredNodes(nodesArray, nextIndex,
   1436                                                                 UCOL_TERTIARY) + 1;
   1437                             uint32_t tLimit;
   1438                             if(t == 0) {
   1439                                 // Gap at the beginning of the tertiary CE range.
   1440                                 t = rootElements.getTertiaryBoundary() - 0x100;
   1441                                 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
   1442                             } else if(!pIsTailored && !sIsTailored) {
   1443                                 // p and s are root weights.
   1444                                 tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
   1445                             } else if(t == Collation::BEFORE_WEIGHT16) {
   1446                                 tLimit = Collation::COMMON_WEIGHT16;
   1447                             } else {
   1448                                 // [p, s] is tailored.
   1449                                 U_ASSERT(t == Collation::COMMON_WEIGHT16);
   1450                                 tLimit = rootElements.getTertiaryBoundary();
   1451                             }
   1452                             U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);
   1453                             tertiaries.initForTertiary();
   1454                             if(!tertiaries.allocWeights(t, tLimit, tCount)) {
   1455                                 errorCode = U_BUFFER_OVERFLOW_ERROR;
   1456                                 errorReason = "tertiary tailoring gap too small";
   1457                                 return;
   1458                             }
   1459                             tIsTailored = TRUE;
   1460                         }
   1461                         t = tertiaries.nextWeight();
   1462                         U_ASSERT(t != 0xffffffff);
   1463                     } else {
   1464                         t = weight16FromNode(node);
   1465                         tIsTailored = FALSE;
   1466 #ifdef DEBUG_COLLATION_BUILDER
   1467                         printf("    ter     %lx\n", (long)alignWeightRight(t));
   1468 #endif
   1469                     }
   1470                 } else {
   1471                     if(strength == UCOL_SECONDARY) {
   1472                         if(isTailoredNode(node)) {
   1473 #ifdef DEBUG_COLLATION_BUILDER
   1474                             printf("  sec+          ");
   1475 #endif
   1476                             if(!sIsTailored) {
   1477                                 // First tailored secondary node for p.
   1478                                 int32_t sCount = countTailoredNodes(nodesArray, nextIndex,
   1479                                                                     UCOL_SECONDARY) + 1;
   1480                                 uint32_t sLimit;
   1481                                 if(s == 0) {
   1482                                     // Gap at the beginning of the secondary CE range.
   1483                                     s = rootElements.getSecondaryBoundary() - 0x100;
   1484                                     sLimit = rootElements.getFirstSecondaryCE() >> 16;
   1485                                 } else if(!pIsTailored) {
   1486                                     // p is a root primary.
   1487                                     sLimit = rootElements.getSecondaryAfter(pIndex, s);
   1488                                 } else if(s == Collation::BEFORE_WEIGHT16) {
   1489                                     sLimit = Collation::COMMON_WEIGHT16;
   1490                                 } else {
   1491                                     // p is a tailored primary.
   1492                                     U_ASSERT(s == Collation::COMMON_WEIGHT16);
   1493                                     sLimit = rootElements.getSecondaryBoundary();
   1494                                 }
   1495                                 if(s == Collation::COMMON_WEIGHT16) {
   1496                                     // Do not tailor into the getSortKey() range of
   1497                                     // compressed common secondaries.
   1498                                     s = rootElements.getLastCommonSecondary();
   1499                                 }
   1500                                 secondaries.initForSecondary();
   1501                                 if(!secondaries.allocWeights(s, sLimit, sCount)) {
   1502                                     errorCode = U_BUFFER_OVERFLOW_ERROR;
   1503                                     errorReason = "secondary tailoring gap too small";
   1504 #ifdef DEBUG_COLLATION_BUILDER
   1505                                     printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
   1506                                            (long)alignWeightRight(s), (long)alignWeightRight(sLimit),
   1507                                            (long)alignWeightRight(sCount));
   1508 #endif
   1509                                     return;
   1510                                 }
   1511                                 sIsTailored = TRUE;
   1512                             }
   1513                             s = secondaries.nextWeight();
   1514                             U_ASSERT(s != 0xffffffff);
   1515                         } else {
   1516                             s = weight16FromNode(node);
   1517                             sIsTailored = FALSE;
   1518 #ifdef DEBUG_COLLATION_BUILDER
   1519                             printf("  sec       %lx\n", (long)alignWeightRight(s));
   1520 #endif
   1521                         }
   1522                     } else /* UCOL_PRIMARY */ {
   1523                         U_ASSERT(isTailoredNode(node));
   1524 #ifdef DEBUG_COLLATION_BUILDER
   1525                         printf("pri+            ");
   1526 #endif
   1527                         if(!pIsTailored) {
   1528                             // First tailored primary node in this list.
   1529                             int32_t pCount = countTailoredNodes(nodesArray, nextIndex,
   1530                                                                 UCOL_PRIMARY) + 1;
   1531                             UBool isCompressible = baseData->isCompressiblePrimary(p);
   1532                             uint32_t pLimit =
   1533                                 rootElements.getPrimaryAfter(p, pIndex, isCompressible);
   1534                             primaries.initForPrimary(isCompressible);
   1535                             if(!primaries.allocWeights(p, pLimit, pCount)) {
   1536                                 errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?
   1537                                 errorReason = "primary tailoring gap too small";
   1538                                 return;
   1539                             }
   1540                             pIsTailored = TRUE;
   1541                         }
   1542                         p = primaries.nextWeight();
   1543                         U_ASSERT(p != 0xffffffff);
   1544                         s = Collation::COMMON_WEIGHT16;
   1545                         sIsTailored = FALSE;
   1546                     }
   1547                     t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;
   1548                     tIsTailored = FALSE;
   1549                 }
   1550                 q = 0;
   1551             }
   1552             if(isTailoredNode(node)) {
   1553                 nodesArray[i] = Collation::makeCE(p, s, t, q);
   1554 #ifdef DEBUG_COLLATION_BUILDER
   1555                 printf("%016llx\n", (long long)nodesArray[i]);
   1556 #endif
   1557             }
   1558         }
   1559     }
   1560 }
   1561 
   1562 int32_t
   1563 CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {
   1564     int32_t count = 0;
   1565     for(;;) {
   1566         if(i == 0) { break; }
   1567         int64_t node = nodesArray[i];
   1568         if(strengthFromNode(node) < strength) { break; }
   1569         if(strengthFromNode(node) == strength) {
   1570             if(isTailoredNode(node)) {
   1571                 ++count;
   1572             } else {
   1573                 break;
   1574             }
   1575         }
   1576         i = nextIndexFromNode(node);
   1577     }
   1578     return count;
   1579 }
   1580 
   1581 class CEFinalizer : public CollationDataBuilder::CEModifier {
   1582 public:
   1583     CEFinalizer(const int64_t *ces) : finalCEs(ces) {}
   1584     virtual ~CEFinalizer();
   1585     virtual int64_t modifyCE32(uint32_t ce32) const {
   1586         U_ASSERT(!Collation::isSpecialCE32(ce32));
   1587         if(CollationBuilder::isTempCE32(ce32)) {
   1588             // retain case bits
   1589             return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
   1590         } else {
   1591             return Collation::NO_CE;
   1592         }
   1593     }
   1594     virtual int64_t modifyCE(int64_t ce) const {
   1595         if(CollationBuilder::isTempCE(ce)) {
   1596             // retain case bits
   1597             return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);
   1598         } else {
   1599             return Collation::NO_CE;
   1600         }
   1601     }
   1602 
   1603 private:
   1604     const int64_t *finalCEs;
   1605 };
   1606 
   1607 CEFinalizer::~CEFinalizer() {}
   1608 
   1609 void
   1610 CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
   1611     if(U_FAILURE(errorCode)) { return; }
   1612     LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);
   1613     if(U_FAILURE(errorCode)) {
   1614         return;
   1615     }
   1616     newBuilder->initForTailoring(baseData, errorCode);
   1617     CEFinalizer finalizer(nodes.getBuffer());
   1618     newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);
   1619     if(U_FAILURE(errorCode)) { return; }
   1620     delete dataBuilder;
   1621     dataBuilder = newBuilder.orphan();
   1622 }
   1623 
   1624 int32_t
   1625 CollationBuilder::ceStrength(int64_t ce) {
   1626     return
   1627         isTempCE(ce) ? strengthFromTempCE(ce) :
   1628         (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :
   1629         ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :
   1630         ce != 0 ? UCOL_TERTIARY :
   1631         UCOL_IDENTICAL;
   1632 }
   1633 
   1634 U_NAMESPACE_END
   1635 
   1636 U_NAMESPACE_USE
   1637 
   1638 U_CAPI UCollator * U_EXPORT2
   1639 ucol_openRules(const UChar *rules, int32_t rulesLength,
   1640                UColAttributeValue normalizationMode, UCollationStrength strength,
   1641                UParseError *parseError, UErrorCode *pErrorCode) {
   1642     if(U_FAILURE(*pErrorCode)) { return NULL; }
   1643     if(rules == NULL && rulesLength != 0) {
   1644         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1645         return NULL;
   1646     }
   1647     RuleBasedCollator *coll = new RuleBasedCollator();
   1648     if(coll == NULL) {
   1649         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
   1650         return NULL;
   1651     }
   1652     UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength);
   1653     coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode);
   1654     if(U_FAILURE(*pErrorCode)) {
   1655         delete coll;
   1656         return NULL;
   1657     }
   1658     return coll->toUCollator();
   1659 }
   1660 
   1661 static const int32_t internalBufferSize = 512;
   1662 
   1663 // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp
   1664 // because it calls UnicodeSet "builder" code that depends on all Unicode properties,
   1665 // and the rest of the collation "runtime" code only depends on normalization.
   1666 // This function is not related to the collation builder,
   1667 // but it did not seem worth moving it into its own .cpp file,
   1668 // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.
   1669 U_CAPI int32_t U_EXPORT2
   1670 ucol_getUnsafeSet( const UCollator *coll,
   1671                   USet *unsafe,
   1672                   UErrorCode *status)
   1673 {
   1674     UChar buffer[internalBufferSize];
   1675     int32_t len = 0;
   1676 
   1677     uset_clear(unsafe);
   1678 
   1679     // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
   1680     static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
   1681                                     0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
   1682 
   1683     // add chars that fail the fcd check
   1684     uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
   1685 
   1686     // add lead/trail surrogates
   1687     // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,
   1688     // not when testing code *points*)
   1689     uset_addRange(unsafe, 0xd800, 0xdfff);
   1690 
   1691     USet *contractions = uset_open(0,0);
   1692 
   1693     int32_t i = 0, j = 0;
   1694     ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
   1695     int32_t contsSize = uset_size(contractions);
   1696     UChar32 c = 0;
   1697     // Contraction set consists only of strings
   1698     // to get unsafe code points, we need to
   1699     // break the strings apart and add them to the unsafe set
   1700     for(i = 0; i < contsSize; i++) {
   1701         len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
   1702         if(len > 0) {
   1703             j = 0;
   1704             while(j < len) {
   1705                 U16_NEXT(buffer, j, len, c);
   1706                 if(j < len) {
   1707                     uset_add(unsafe, c);
   1708                 }
   1709             }
   1710         }
   1711     }
   1712 
   1713     uset_close(contractions);
   1714 
   1715     return uset_size(unsafe);
   1716 }
   1717 
   1718 #endif  // !UCONFIG_NO_COLLATION
   1719