Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationbuilder.cpp
      7 *
      8 * (replaced the former ucol_bld.cpp)
      9 *
     10 * created on: 2013may06
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifdef DEBUG_COLLATION_BUILDER
     15 #include <stdio.h>
     16 #endif
     17 
     18 #include "unicode/utypes.h"
     19 
     20 #if !UCONFIG_NO_COLLATION
     21 
     22 #include "unicode/caniter.h"
     23 #include "unicode/normalizer2.h"
     24 #include "unicode/tblcoll.h"
     25 #include "unicode/parseerr.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/ucol.h"
     28 #include "unicode/unistr.h"
     29 #include "unicode/usetiter.h"
     30 #include "unicode/utf16.h"
     31 #include "unicode/uversion.h"
     32 #include "cmemory.h"
     33 #include "collation.h"
     34 #include "collationbuilder.h"
     35 #include "collationdata.h"
     36 #include "collationdatabuilder.h"
     37 #include "collationfastlatin.h"
     38 #include "collationroot.h"
     39 #include "collationrootelements.h"
     40 #include "collationruleparser.h"
     41 #include "collationsettings.h"
     42 #include "collationtailoring.h"
     43 #include "collationweights.h"
     44 #include "normalizer2impl.h"
     45 #include "uassert.h"
     46 #include "ucol_imp.h"
     47 #include "utf16collationiterator.h"
     48 
     49 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     50 
     51 U_NAMESPACE_BEGIN
     52 
     53 namespace {
     54 
     55 class BundleImporter : public CollationRuleParser::Importer {
     56 public:
     57     BundleImporter() : rules(NULL) {}
     58     virtual ~BundleImporter();
     59     virtual const UnicodeString *getRules(
     60             const char *localeID, const char *collationType,
     61             const char *&errorReason, UErrorCode &errorCode);
     62 
     63 private:
     64     UnicodeString *rules;
     65 };
     66 
     67 BundleImporter::~BundleImporter() {
     68     delete rules;
     69 }
     70 
     71 const UnicodeString *
     72 BundleImporter::getRules(
     73         const char *localeID, const char *collationType,
     74         const char *& /*errorReason*/, UErrorCode &errorCode) {
     75     delete rules;
     76     return rules = CollationLoader::loadRules(localeID, collationType, errorCode);
     77 }
     78 
     79 }  // namespace
     80 
     81 // RuleBasedCollator implementation ---------------------------------------- ***
     82 
     83 // These methods are here, rather than in rulebasedcollator.cpp,
     84 // for modularization:
     85 // Most code using Collator does not need to build a Collator from rules.
     86 // By moving these constructors and helper methods to a separate file,
     87 // most code will not have a static dependency on the builder code.
     88 
     89 RuleBasedCollator::RuleBasedCollator()
     90         : data(NULL),
     91           settings(NULL),
     92           tailoring(NULL),
     93           validLocale(""),
     94           explicitlySetAttributes(0),
     95           actualLocaleIsSameAsValid(FALSE) {
     96 }
     97 
     98 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
     99         : data(NULL),
    100           settings(NULL),
    101           tailoring(NULL),
    102           validLocale(""),
    103           explicitlySetAttributes(0),
    104           actualLocaleIsSameAsValid(FALSE) {
    105     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode);
    106 }
    107 
    108 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
    109                                      UErrorCode &errorCode)
    110         : data(NULL),
    111           settings(NULL),
    112           tailoring(NULL),
    113           validLocale(""),
    114           explicitlySetAttributes(0),
    115           actualLocaleIsSameAsValid(FALSE) {
    116     internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode);
    117 }
    118 
    119 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
    120                                      UColAttributeValue decompositionMode,
    121                                      UErrorCode &errorCode)
    122         : data(NULL),
    123           settings(NULL),
    124           tailoring(NULL),
    125           validLocale(""),
    126           explicitlySetAttributes(0),
    127           actualLocaleIsSameAsValid(FALSE) {
    128     internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode);
    129 }
    130 
    131 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
    132                                      ECollationStrength strength,
    133                                      UColAttributeValue decompositionMode,
    134                                      UErrorCode &errorCode)
    135         : data(NULL),
    136           settings(NULL),
    137           tailoring(NULL),
    138           validLocale(""),
    139           explicitlySetAttributes(0),
    140           actualLocaleIsSameAsValid(FALSE) {
    141     internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode);
    142 }
    143 
    144 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
    145                                      UParseError &parseError, UnicodeString &reason,
    146                                      UErrorCode &errorCode)
    147         : data(NULL),
    148           settings(NULL),
    149           tailoring(NULL),
    150           validLocale(""),
    151           explicitlySetAttributes(0),
    152           actualLocaleIsSameAsValid(FALSE) {
    153     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);
    154 }
    155 
    156 void
    157 RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
    158                                           int32_t strength,
    159                                           UColAttributeValue decompositionMode,
    160                                           UParseError *outParseError, UnicodeString *outReason,
    161                                           UErrorCode &errorCode) {
    162     const CollationTailoring *base = CollationRoot::getRoot(errorCode);
    163     if(U_FAILURE(errorCode)) { return; }
    164     if(outReason != NULL) { outReason->remove(); }
    165     CollationBuilder builder(base, errorCode);
    166     UVersionInfo noVersion = { 0, 0, 0, 0 };
    167     BundleImporter importer;
    168     LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,
    169                                                              &importer,
    170                                                              outParseError, errorCode));
    171     if(U_FAILURE(errorCode)) {
    172         const char *reason = builder.getErrorReason();
    173         if(reason != NULL && outReason != NULL) {
    174             *outReason = UnicodeString(reason, -1, US_INV);
    175         }
    176         return;
    177     }
    178     t->actualLocale.setToBogus();
    179     adoptTailoring(t.orphan());
    180     // Set attributes after building the collator,
    181     // to keep the default settings consistent with the rule string.
    182     if(strength != UCOL_DEFAULT) {
    183         setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);
    184     }
    185     if(decompositionMode != UCOL_DEFAULT) {
    186         setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
    187     }
    188 }
    189 
    190 // CollationBuilder implementation ----------------------------------------- ***
    191 
    192 CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
    193         : nfd(*Normalizer2::getNFDInstance(errorCode)),
    194           fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
    195           nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
    196           base(b),
    197           baseData(b->data),
    198           rootElements(b->data->rootElements, b->data->rootElementsLength),
    199           variableTop(0),
    200           dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
    201           errorReason(NULL),
    202           cesLength(0),
    203           rootPrimaryIndexes(errorCode), nodes(errorCode) {
    204     nfcImpl.ensureCanonIterData(errorCode);
    205     if(U_FAILURE(errorCode)) {
    206         errorReason = "CollationBuilder fields initialization failed";
    207         return;
    208     }
    209     if(dataBuilder == NULL) {
    210         errorCode = U_MEMORY_ALLOCATION_ERROR;
    211         return;
    212     }
    213     dataBuilder->initForTailoring(baseData, errorCode);
    214     if(U_FAILURE(errorCode)) {
    215         errorReason = "CollationBuilder initialization failed";
    216     }
    217 }
    218 
    219 CollationBuilder::~CollationBuilder() {
    220     delete dataBuilder;
    221 }
    222 
    223 CollationTailoring *
    224 CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
    225                                 const UVersionInfo rulesVersion,
    226                                 CollationRuleParser::Importer *importer,
    227                                 UParseError *outParseError,
    228                                 UErrorCode &errorCode) {
    229     if(U_FAILURE(errorCode)) { return NULL; }
    230     if(baseData->rootElements == NULL) {
    231         errorCode = U_MISSING_RESOURCE_ERROR;
    232         errorReason = "missing root elements data, tailoring not supported";
    233         return NULL;
    234     }
    235     LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));
    236     if(tailoring.isNull() || tailoring->isBogus()) {
    237         errorCode = U_MEMORY_ALLOCATION_ERROR;
    238         return NULL;
    239     }
    240     CollationRuleParser parser(baseData, errorCode);
    241     if(U_FAILURE(errorCode)) { return NULL; }
    242     // Note: This always bases &[last variable] and &[first regular]
    243     // on the root collator's maxVariable/variableTop.
    244     // If we wanted this to change after [maxVariable x], then we would keep
    245     // the tailoring.settings pointer here and read its variableTop when we need it.
    246     // See http://unicode.org/cldr/trac/ticket/6070
    247     variableTop = base->settings->variableTop;
    248     parser.setSink(this);
    249     parser.setImporter(importer);
    250     CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
    251     parser.parse(ruleString, ownedSettings, outParseError, errorCode);
    252     errorReason = parser.getErrorReason();
    253     if(U_FAILURE(errorCode)) { return NULL; }
    254     if(dataBuilder->hasMappings()) {
    255         makeTailoredCEs(errorCode);
    256         closeOverComposites(errorCode);
    257         finalizeCEs(errorCode);
    258         // Copy all of ASCII, and Latin-1 letters, into each tailoring.
    259         optimizeSet.add(0, 0x7f);
    260         optimizeSet.add(0xc0, 0xff);
    261         // Hangul is decomposed on the fly during collation,
    262         // and the tailoring data is always built with HANGUL_TAG specials.
    263         optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
    264         dataBuilder->optimize(optimizeSet, errorCode);
    265         tailoring->ensureOwnedData(errorCode);
    266         if(U_FAILURE(errorCode)) { return NULL; }
    267         if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
    268         dataBuilder->build(*tailoring->ownedData, errorCode);
    269         tailoring->builder = dataBuilder;
    270         dataBuilder = NULL;
    271     } else {
    272         tailoring->data = baseData;
    273     }
    274     if(U_FAILURE(errorCode)) { return NULL; }
    275     ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
    276         tailoring->data, ownedSettings,
    277         ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries));
    278     tailoring->rules = ruleString;
    279     tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
    280     tailoring->setVersion(base->version, rulesVersion);
    281     return tailoring.orphan();
    282 }
    283 
    284 void
    285 CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
    286                            const char *&parserErrorReason, UErrorCode &errorCode) {
    287     if(U_FAILURE(errorCode)) { return; }
    288     U_ASSERT(!str.isEmpty());
    289     if(str.charAt(0) == CollationRuleParser::POS_LEAD) {
    290         ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
    291         cesLength = 1;
    292         if(U_FAILURE(errorCode)) { return; }
    293         U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
    294     } else {
    295         // normal reset to a character or string
    296         UnicodeString nfdString = nfd.normalize(str, errorCode);
    297         if(U_FAILURE(errorCode)) {
    298             parserErrorReason = "normalizing the reset position";
    299             return;
    300         }
    301         cesLength = dataBuilder->getCEs(nfdString, ces, 0);
    302         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
    303             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    304             parserErrorReason = "reset position maps to too many collation elements (more than 31)";
    305             return;
    306         }
    307     }
    308     if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position
    309 
    310     // &[before strength]position
    311     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
    312     int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
    313     if(U_FAILURE(errorCode)) { return; }
    314 
    315     int64_t node = nodes.elementAti(index);
    316     // If the index is for a "weaker" tailored node,
    317     // then skip backwards over this and further "weaker" nodes.
    318     while(strengthFromNode(node) > strength) {
    319         index = previousIndexFromNode(node);
    320         node = nodes.elementAti(index);
    321     }
    322 
    323     // Find or insert a node whose index we will put into a temporary CE.
    324     if(strengthFromNode(node) == strength && isTailoredNode(node)) {
    325         // Reset to just before this same-strength tailored node.
    326         index = previousIndexFromNode(node);
    327     } else if(strength == UCOL_PRIMARY) {
    328         // root primary node (has no previous index)
    329         uint32_t p = weight32FromNode(node);
    330         if(p == 0) {
    331             errorCode = U_UNSUPPORTED_ERROR;
    332             parserErrorReason = "reset primary-before ignorable not possible";
    333             return;
    334         }
    335         if(p <= rootElements.getFirstPrimary()) {
    336             // There is no primary gap between ignorables and the space-first-primary.
    337             errorCode = U_UNSUPPORTED_ERROR;
    338             parserErrorReason = "reset primary-before first non-ignorable not supported";
    339             return;
    340         }
    341         if(p == Collation::FIRST_TRAILING_PRIMARY) {
    342             // We do not support tailoring to an unassigned-implicit CE.
    343             errorCode = U_UNSUPPORTED_ERROR;
    344             parserErrorReason = "reset primary-before [first trailing] not supported";
    345             return;
    346         }
    347         p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
    348         index = findOrInsertNodeForPrimary(p, errorCode);
    349         // Go to the last node in this list:
    350         // Tailor after the last node between adjacent root nodes.
    351         for(;;) {
    352             node = nodes.elementAti(index);
    353             int32_t nextIndex = nextIndexFromNode(node);
    354             if(nextIndex == 0) { break; }
    355             index = nextIndex;
    356         }
    357     } else {
    358         // &[before 2] or &[before 3]
    359         index = findCommonNode(index, UCOL_SECONDARY);
    360         if(strength >= UCOL_TERTIARY) {
    361             index = findCommonNode(index, UCOL_TERTIARY);
    362         }
    363         node = nodes.elementAti(index);
    364         if(strengthFromNode(node) == strength) {
    365             // Found a same-strength node with an explicit weight.
    366             uint32_t weight16 = weight16FromNode(node);
    367             if(weight16 == 0) {
    368                 errorCode = U_UNSUPPORTED_ERROR;
    369                 if(strength == UCOL_SECONDARY) {
    370                     parserErrorReason = "reset secondary-before secondary ignorable not possible";
    371                 } else {
    372                     parserErrorReason = "reset tertiary-before completely ignorable not possible";
    373                 }
    374                 return;
    375             }
    376             U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16);
    377             int32_t previousIndex = previousIndexFromNode(node);
    378             if(weight16 == Collation::COMMON_WEIGHT16) {
    379                 // Reset to just before this same-strength common-weight node.
    380                 index = previousIndex;
    381             } else {
    382                 // A non-common weight is only possible from a root CE.
    383                 // Find the higher-level weights, which must all be explicit,
    384                 // and then find the preceding weight for this level.
    385                 uint32_t previousWeight16 = 0;
    386                 int32_t previousWeightIndex = -1;
    387                 int32_t i = index;
    388                 if(strength == UCOL_SECONDARY) {
    389                     uint32_t p;
    390                     do {
    391                         i = previousIndexFromNode(node);
    392                         node = nodes.elementAti(i);
    393                         if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) &&
    394                                 previousWeightIndex < 0) {
    395                             previousWeightIndex = i;
    396                             previousWeight16 = weight16FromNode(node);
    397                         }
    398                     } while(strengthFromNode(node) > UCOL_PRIMARY);
    399                     U_ASSERT(!isTailoredNode(node));
    400                     p = weight32FromNode(node);
    401                     weight16 = rootElements.getSecondaryBefore(p, weight16);
    402                 } else {
    403                     uint32_t p, s;
    404                     do {
    405                         i = previousIndexFromNode(node);
    406                         node = nodes.elementAti(i);
    407                         if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) &&
    408                                 previousWeightIndex < 0) {
    409                             previousWeightIndex = i;
    410                             previousWeight16 = weight16FromNode(node);
    411                         }
    412                     } while(strengthFromNode(node) > UCOL_SECONDARY);
    413                     U_ASSERT(!isTailoredNode(node));
    414                     if(strengthFromNode(node) == UCOL_SECONDARY) {
    415                         s = weight16FromNode(node);
    416                         do {
    417                             i = previousIndexFromNode(node);
    418                             node = nodes.elementAti(i);
    419                         } while(strengthFromNode(node) > UCOL_PRIMARY);
    420                         U_ASSERT(!isTailoredNode(node));
    421                     } else {
    422                         U_ASSERT(!nodeHasBefore2(node));
    423                         s = Collation::COMMON_WEIGHT16;
    424                     }
    425                     p = weight32FromNode(node);
    426                     weight16 = rootElements.getTertiaryBefore(p, s, weight16);
    427                     U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
    428                 }
    429                 // Find or insert the new explicit weight before the current one.
    430                 if(previousWeightIndex >= 0 && weight16 == previousWeight16) {
    431                     // Tailor after the last node between adjacent root nodes.
    432                     index = previousIndex;
    433                 } else {
    434                     node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
    435                     index = insertNodeBetween(previousIndex, index, node, errorCode);
    436                 }
    437             }
    438         } else {
    439             // Found a stronger node with implied strength-common weight.
    440             int64_t hasBefore3 = 0;
    441             if(strength == UCOL_SECONDARY) {
    442                 U_ASSERT(!nodeHasBefore2(node));
    443                 // Move the HAS_BEFORE3 flag from the parent node
    444                 // to the new secondary common node.
    445                 hasBefore3 = node & HAS_BEFORE3;
    446                 node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2;
    447             } else {
    448                 U_ASSERT(!nodeHasBefore3(node));
    449                 node |= HAS_BEFORE3;
    450             }
    451             nodes.setElementAt(node, index);
    452             int32_t nextIndex = nextIndexFromNode(node);
    453             // Insert default nodes with weights 02 and 05, reset to the 02 node.
    454             node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength);
    455             index = insertNodeBetween(index, nextIndex, node, errorCode);
    456             node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |
    457                     nodeFromStrength(strength);
    458             insertNodeBetween(index, nextIndex, node, errorCode);
    459         }
    460         // Strength of the temporary CE = strength of its reset position.
    461         // Code above raises an error if the before-strength is stronger.
    462         strength = ceStrength(ces[cesLength - 1]);
    463     }
    464     if(U_FAILURE(errorCode)) {
    465         parserErrorReason = "inserting reset position for &[before n]";
    466         return;
    467     }
    468     ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
    469 }
    470 
    471 int64_t
    472 CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
    473                                           const char *&parserErrorReason, UErrorCode &errorCode) {
    474     U_ASSERT(str.length() == 2);
    475     int64_t ce;
    476     int32_t strength = UCOL_PRIMARY;
    477     UBool isBoundary = FALSE;
    478     UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
    479     U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
    480     switch(pos) {
    481     case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
    482         // Quaternary CEs are not supported.
    483         // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
    484         return 0;
    485     case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
    486         return 0;
    487     case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
    488         // Look for a tailored tertiary node after [0, 0, 0].
    489         int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
    490         if(U_FAILURE(errorCode)) { return 0; }
    491         int64_t node = nodes.elementAti(index);
    492         if((index = nextIndexFromNode(node)) != 0) {
    493             node = nodes.elementAti(index);
    494             U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
    495             if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
    496                 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
    497             }
    498         }
    499         return rootElements.getFirstTertiaryCE();
    500         // No need to look for nodeHasAnyBefore() on a tertiary node.
    501     }
    502     case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
    503         ce = rootElements.getLastTertiaryCE();
    504         strength = UCOL_TERTIARY;
    505         break;
    506     case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
    507         // Look for a tailored secondary node after [0, 0, *].
    508         int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
    509         if(U_FAILURE(errorCode)) { return 0; }
    510         int64_t node = nodes.elementAti(index);
    511         while((index = nextIndexFromNode(node)) != 0) {
    512             node = nodes.elementAti(index);
    513             strength = strengthFromNode(node);
    514             if(strength < UCOL_SECONDARY) { break; }
    515             if(strength == UCOL_SECONDARY) {
    516                 if(isTailoredNode(node)) {
    517                     if(nodeHasBefore3(node)) {
    518                         index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
    519                         U_ASSERT(isTailoredNode(nodes.elementAti(index)));
    520                     }
    521                     return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
    522                 } else {
    523                     break;
    524                 }
    525             }
    526         }
    527         ce = rootElements.getFirstSecondaryCE();
    528         strength = UCOL_SECONDARY;
    529         break;
    530     }
    531     case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
    532         ce = rootElements.getLastSecondaryCE();
    533         strength = UCOL_SECONDARY;
    534         break;
    535     case CollationRuleParser::FIRST_VARIABLE:
    536         ce = rootElements.getFirstPrimaryCE();
    537         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
    538         break;
    539     case CollationRuleParser::LAST_VARIABLE:
    540         ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
    541         break;
    542     case CollationRuleParser::FIRST_REGULAR:
    543         ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
    544         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
    545         break;
    546     case CollationRuleParser::LAST_REGULAR:
    547         // Use the Hani-first-primary rather than the actual last "regular" CE before it,
    548         // for backward compatibility with behavior before the introduction of
    549         // script-first-primary CEs in the root collator.
    550         ce = rootElements.firstCEWithPrimaryAtLeast(
    551             baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
    552         break;
    553     case CollationRuleParser::FIRST_IMPLICIT: {
    554         uint32_t ce32 = baseData->getCE32(0x4e00);
    555         U_ASSERT(Collation::hasCE32Tag(ce32, Collation::OFFSET_TAG));
    556         ce = baseData->getCEFromOffsetCE32(0x4e00, ce32);
    557         break;
    558     }
    559     case CollationRuleParser::LAST_IMPLICIT:
    560         // We do not support tailoring to an unassigned-implicit CE.
    561         errorCode = U_UNSUPPORTED_ERROR;
    562         parserErrorReason = "reset to [last implicit] not supported";
    563         return 0;
    564     case CollationRuleParser::FIRST_TRAILING:
    565         ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
    566         isBoundary = TRUE;  // trailing first primary (there is no mapping for it)
    567         break;
    568     case CollationRuleParser::LAST_TRAILING:
    569         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    570         parserErrorReason = "LDML forbids tailoring to U+FFFF";
    571         return 0;
    572     default:
    573         U_ASSERT(FALSE);
    574         return 0;
    575     }
    576 
    577     int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
    578     if(U_FAILURE(errorCode)) { return 0; }
    579     int64_t node = nodes.elementAti(index);
    580     if((pos & 1) == 0) {
    581         // even pos = [first xyz]
    582         if(!nodeHasAnyBefore(node) && isBoundary) {
    583             // A <group> first primary boundary is artificially added to FractionalUCA.txt.
    584             // It is reachable via its special contraction, but is not normally used.
    585             // Find the first character tailored after the boundary CE,
    586             // or the first real root CE after it.
    587             if((index = nextIndexFromNode(node)) != 0) {
    588                 // If there is a following node, then it must be tailored
    589                 // because there are no root CEs with a boundary primary
    590                 // and non-common secondary/tertiary weights.
    591                 node = nodes.elementAti(index);
    592                 U_ASSERT(isTailoredNode(node));
    593                 ce = tempCEFromIndexAndStrength(index, strength);
    594             } else {
    595                 U_ASSERT(strength == UCOL_PRIMARY);
    596                 uint32_t p = (uint32_t)(ce >> 32);
    597                 int32_t pIndex = rootElements.findPrimary(p);
    598                 UBool isCompressible = baseData->isCompressiblePrimary(p);
    599                 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
    600                 ce = Collation::makeCE(p);
    601                 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
    602                 if(U_FAILURE(errorCode)) { return 0; }
    603                 node = nodes.elementAti(index);
    604             }
    605         }
    606         if(nodeHasAnyBefore(node)) {
    607             // Get the first node that was tailored before this one at a weaker strength.
    608             if(nodeHasBefore2(node)) {
    609                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
    610                 node = nodes.elementAti(index);
    611             }
    612             if(nodeHasBefore3(node)) {
    613                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
    614             }
    615             U_ASSERT(isTailoredNode(nodes.elementAti(index)));
    616             ce = tempCEFromIndexAndStrength(index, strength);
    617         }
    618     } else {
    619         // odd pos = [last xyz]
    620         // Find the last node that was tailored after the [last xyz]
    621         // at a strength no greater than the position's strength.
    622         for(;;) {
    623             int32_t nextIndex = nextIndexFromNode(node);
    624             if(nextIndex == 0) { break; }
    625             int64_t nextNode = nodes.elementAti(nextIndex);
    626             if(strengthFromNode(nextNode) < strength) { break; }
    627             index = nextIndex;
    628             node = nextNode;
    629         }
    630         // Do not make a temporary CE for a root node.
    631         // This last node might be the node for the root CE itself,
    632         // or a node with a common secondary or tertiary weight.
    633         if(isTailoredNode(node)) {
    634             ce = tempCEFromIndexAndStrength(index, strength);
    635         }
    636     }
    637     return ce;
    638 }
    639 
    640 void
    641 CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
    642                               const UnicodeString &str, const UnicodeString &extension,
    643                               const char *&parserErrorReason, UErrorCode &errorCode) {
    644     if(U_FAILURE(errorCode)) { return; }
    645     UnicodeString nfdPrefix;
    646     if(!prefix.isEmpty()) {
    647         nfd.normalize(prefix, nfdPrefix, errorCode);
    648         if(U_FAILURE(errorCode)) {
    649             parserErrorReason = "normalizing the relation prefix";
    650             return;
    651         }
    652     }
    653     UnicodeString nfdString = nfd.normalize(str, errorCode);
    654     if(U_FAILURE(errorCode)) {
    655         parserErrorReason = "normalizing the relation string";
    656         return;
    657     }
    658 
    659     // The runtime code decomposes Hangul syllables on the fly,
    660     // with recursive processing but without making the Jamo pieces visible for matching.
    661     // It does not work with certain types of contextual mappings.
    662     int32_t nfdLength = nfdString.length();
    663     if(nfdLength >= 2) {
    664         UChar c = nfdString.charAt(0);
    665         if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {
    666             // While handling a Hangul syllable, contractions starting with Jamo L or V
    667             // would not see the following Jamo of that syllable.
    668             errorCode = U_UNSUPPORTED_ERROR;
    669             parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";
    670             return;
    671         }
    672         c = nfdString.charAt(nfdLength - 1);
    673         if(Hangul::isJamoL(c) ||
    674                 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {
    675             // A contraction ending with Jamo L or L+V would require
    676             // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
    677             // or decomposing a following Hangul syllable on the fly, during contraction matching.
    678             errorCode = U_UNSUPPORTED_ERROR;
    679             parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";
    680             return;
    681         }
    682         // A Hangul syllable completely inside a contraction is ok.
    683     }
    684     // Note: If there is a prefix, then the parser checked that
    685     // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
    686     // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
    687     // (While handling a Hangul syllable, prefixes on Jamo V or T
    688     // would not see the previous Jamo of that syllable.)
    689 
    690     if(strength != UCOL_IDENTICAL) {
    691         // Find the node index after which we insert the new tailored node.
    692         int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
    693         U_ASSERT(cesLength > 0);
    694         int64_t ce = ces[cesLength - 1];
    695         if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) {
    696             // There is no primary gap between ignorables and the space-first-primary.
    697             errorCode = U_UNSUPPORTED_ERROR;
    698             parserErrorReason = "tailoring primary after ignorables not supported";
    699             return;
    700         }
    701         if(strength == UCOL_QUATERNARY && ce == 0) {
    702             // The CE data structure does not support non-zero quaternary weights
    703             // on tertiary ignorables.
    704             errorCode = U_UNSUPPORTED_ERROR;
    705             parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";
    706             return;
    707         }
    708         // Insert the new tailored node.
    709         index = insertTailoredNodeAfter(index, strength, errorCode);
    710         if(U_FAILURE(errorCode)) {
    711             parserErrorReason = "modifying collation elements";
    712             return;
    713         }
    714         // Strength of the temporary CE:
    715         // The new relation may yield a stronger CE but not a weaker one.
    716         int32_t tempStrength = ceStrength(ce);
    717         if(strength < tempStrength) { tempStrength = strength; }
    718         ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
    719     }
    720 
    721     setCaseBits(nfdString, parserErrorReason, errorCode);
    722     if(U_FAILURE(errorCode)) { return; }
    723 
    724     int32_t cesLengthBeforeExtension = cesLength;
    725     if(!extension.isEmpty()) {
    726         UnicodeString nfdExtension = nfd.normalize(extension, errorCode);
    727         if(U_FAILURE(errorCode)) {
    728             parserErrorReason = "normalizing the relation extension";
    729             return;
    730         }
    731         cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);
    732         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
    733             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    734             parserErrorReason =
    735                 "extension string adds too many collation elements (more than 31 total)";
    736             return;
    737         }
    738     }
    739     uint32_t ce32 = Collation::UNASSIGNED_CE32;
    740     if((prefix != nfdPrefix || str != nfdString) &&
    741             !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
    742         // Map from the original input to the CEs.
    743         // We do this in case the canonical closure is incomplete,
    744         // so that it is possible to explicitly provide the missing mappings.
    745         ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
    746     }
    747     addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
    748     if(U_FAILURE(errorCode)) {
    749         parserErrorReason = "writing collation elements";
    750         return;
    751     }
    752     cesLength = cesLengthBeforeExtension;
    753 }
    754 
    755 int32_t
    756 CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
    757                                          UErrorCode &errorCode) {
    758     if(U_FAILURE(errorCode)) { return 0; }
    759     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);
    760 
    761     // Find the last CE that is at least as "strong" as the requested difference.
    762     // Note: Stronger is smaller (UCOL_PRIMARY=0).
    763     int64_t ce;
    764     for(;; --cesLength) {
    765         if(cesLength == 0) {
    766             ce = ces[0] = 0;
    767             cesLength = 1;
    768             break;
    769         } else {
    770             ce = ces[cesLength - 1];
    771         }
    772         if(ceStrength(ce) <= strength) { break; }
    773     }
    774 
    775     if(isTempCE(ce)) {
    776         // No need to findCommonNode() here for lower levels
    777         // because insertTailoredNodeAfter() will do that anyway.
    778         return indexFromTempCE(ce);
    779     }
    780 
    781     // root CE
    782     if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
    783         errorCode = U_UNSUPPORTED_ERROR;
    784         parserErrorReason = "tailoring relative to an unassigned code point not supported";
    785         return 0;
    786     }
    787     return findOrInsertNodeForRootCE(ce, strength, errorCode);
    788 }
    789 
    790 int32_t
    791 CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {
    792     if(U_FAILURE(errorCode)) { return 0; }
    793     U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);
    794 
    795     // Find or insert the node for each of the root CE's weights,
    796     // down to the requested level/strength.
    797     // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
    798     U_ASSERT((ce & 0xc0) == 0);
    799     int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode);
    800     if(strength >= UCOL_SECONDARY) {
    801         uint32_t lower32 = (uint32_t)ce;
    802         index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
    803         if(strength >= UCOL_TERTIARY) {
    804             index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
    805                                          UCOL_TERTIARY, errorCode);
    806         }
    807     }
    808     return index;
    809 }
    810 
    811 namespace {
    812 
    813 /**
    814  * Like Java Collections.binarySearch(List, key, Comparator).
    815  *
    816  * @return the index>=0 where the item was found,
    817  *         or the index<0 for inserting the string at ~index in sorted order
    818  *         (index into rootPrimaryIndexes)
    819  */
    820 int32_t
    821 binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
    822                                const int64_t *nodes, uint32_t p) {
    823     if(length == 0) { return ~0; }
    824     int32_t start = 0;
    825     int32_t limit = length;
    826     for (;;) {
    827         int32_t i = (start + limit) / 2;
    828         int64_t node = nodes[rootPrimaryIndexes[i]];
    829         uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
    830         if (p == nodePrimary) {
    831             return i;
    832         } else if (p < nodePrimary) {
    833             if (i == start) {
    834                 return ~start;  // insert s before i
    835             }
    836             limit = i;
    837         } else {
    838             if (i == start) {
    839                 return ~(start + 1);  // insert s after i
    840             }
    841             start = i;
    842         }
    843     }
    844 }
    845 
    846 }  // namespace
    847 
    848 int32_t
    849 CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {
    850     if(U_FAILURE(errorCode)) { return 0; }
    851 
    852     int32_t rootIndex = binarySearchForRootPrimaryNode(
    853         rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
    854     if(rootIndex >= 0) {
    855         return rootPrimaryIndexes.elementAti(rootIndex);
    856     } else {
    857         // Start a new list of nodes with this primary.
    858         int32_t index = nodes.size();
    859         nodes.addElement(nodeFromWeight32(p), errorCode);
    860         rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);
    861         return index;
    862     }
    863 }
    864 
    865 int32_t
    866 CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
    867     if(U_FAILURE(errorCode)) { return 0; }
    868     U_ASSERT(0 <= index && index < nodes.size());
    869 
    870     U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16);
    871     // Only reset-before inserts common weights.
    872     if(weight16 == Collation::COMMON_WEIGHT16) {
    873         return findCommonNode(index, level);
    874     }
    875     // Find the root CE's weight for this level.
    876     // Postpone insertion if not found:
    877     // Insert the new root node before the next stronger node,
    878     // or before the next root node with the same strength and a larger weight.
    879     int64_t node = nodes.elementAti(index);
    880     int32_t nextIndex;
    881     while((nextIndex = nextIndexFromNode(node)) != 0) {
    882         node = nodes.elementAti(nextIndex);
    883         int32_t nextStrength = strengthFromNode(node);
    884         if(nextStrength <= level) {
    885             // Insert before a stronger node.
    886             if(nextStrength < level) { break; }
    887             // nextStrength == level
    888             if(!isTailoredNode(node)) {
    889                 uint32_t nextWeight16 = weight16FromNode(node);
    890                 if(nextWeight16 == weight16) {
    891                     // Found the node for the root CE up to this level.
    892                     return nextIndex;
    893                 }
    894                 // Insert before a node with a larger same-strength weight.
    895                 if(nextWeight16 > weight16) { break; }
    896             }
    897         }
    898         // Skip the next node.
    899         index = nextIndex;
    900     }
    901     node = nodeFromWeight16(weight16) | nodeFromStrength(level);
    902     return insertNodeBetween(index, nextIndex, node, errorCode);
    903 }
    904 
    905 int32_t
    906 CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
    907     if(U_FAILURE(errorCode)) { return 0; }
    908     U_ASSERT(0 <= index && index < nodes.size());
    909     if(strength >= UCOL_SECONDARY) {
    910         index = findCommonNode(index, UCOL_SECONDARY);
    911         if(strength >= UCOL_TERTIARY) {
    912             index = findCommonNode(index, UCOL_TERTIARY);
    913         }
    914     }
    915     // Postpone insertion:
    916     // Insert the new node before the next one with a strength at least as strong.
    917     int64_t node = nodes.elementAti(index);
    918     int32_t nextIndex;
    919     while((nextIndex = nextIndexFromNode(node)) != 0) {
    920         node = nodes.elementAti(nextIndex);
    921         if(strengthFromNode(node) <= strength) { break; }
    922         // Skip the next node which has a weaker (larger) strength than the new one.
    923         index = nextIndex;
    924     }
    925     node = IS_TAILORED | nodeFromStrength(strength);
    926     return insertNodeBetween(index, nextIndex, node, errorCode);
    927 }
    928 
    929 int32_t
    930 CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
    931                                     UErrorCode &errorCode) {
    932     if(U_FAILURE(errorCode)) { return 0; }
    933     U_ASSERT(previousIndexFromNode(node) == 0);
    934     U_ASSERT(nextIndexFromNode(node) == 0);
    935     U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
    936     // Append the new node and link it to the existing nodes.
    937     int32_t newIndex = nodes.size();
    938     node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
    939     nodes.addElement(node, errorCode);
    940     if(U_FAILURE(errorCode)) { return 0; }
    941     // nodes[index].nextIndex = newIndex
    942     node = nodes.elementAti(index);
    943     nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
    944     // nodes[nextIndex].previousIndex = newIndex
    945     if(nextIndex != 0) {
    946         node = nodes.elementAti(nextIndex);
    947         nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
    948     }
    949     return newIndex;
    950 }
    951 
    952 int32_t
    953 CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
    954     U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
    955     int64_t node = nodes.elementAti(index);
    956     if(strengthFromNode(node) >= strength) {
    957         // The current node is no stronger.
    958         return index;
    959     }
    960     if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
    961         // The current node implies the strength-common weight.
    962         return index;
    963     }
    964     index = nextIndexFromNode(node);
    965     node = nodes.elementAti(index);
    966     U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
    967             weight16FromNode(node) == BEFORE_WEIGHT16);
    968     // Skip to the explicit common node.
    969     do {
    970         index = nextIndexFromNode(node);
    971         node = nodes.elementAti(index);
    972         U_ASSERT(strengthFromNode(node) >= strength);
    973     } while(isTailoredNode(node) || strengthFromNode(node) > strength);
    974     U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
    975     return index;
    976 }
    977 
    978 void
    979 CollationBuilder::setCaseBits(const UnicodeString &nfdString,
    980                               const char *&parserErrorReason, UErrorCode &errorCode) {
    981     if(U_FAILURE(errorCode)) { return; }
    982     int32_t numTailoredPrimaries = 0;
    983     for(int32_t i = 0; i < cesLength; ++i) {
    984         if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
    985     }
    986     // We should not be able to get too many case bits because
    987     // cesLength<=31==MAX_EXPANSION_LENGTH.
    988     // 31 pairs of case bits fit into an int64_t without setting its sign bit.
    989     U_ASSERT(numTailoredPrimaries <= 31);
    990 
    991     int64_t cases = 0;
    992     if(numTailoredPrimaries > 0) {
    993         const UChar *s = nfdString.getBuffer();
    994         UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length());
    995         int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;
    996         if(U_FAILURE(errorCode)) {
    997             parserErrorReason = "fetching root CEs for tailored string";
    998             return;
    999         }
   1000         U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);
   1001 
   1002         uint32_t lastCase = 0;
   1003         int32_t numBasePrimaries = 0;
   1004         for(int32_t i = 0; i < baseCEsLength; ++i) {
   1005             int64_t ce = baseCEs.getCE(i);
   1006             if((ce >> 32) != 0) {
   1007                 ++numBasePrimaries;
   1008                 uint32_t c = ((uint32_t)ce >> 14) & 3;
   1009                 U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
   1010                 if(numBasePrimaries < numTailoredPrimaries) {
   1011                     cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);
   1012                 } else if(numBasePrimaries == numTailoredPrimaries) {
   1013                     lastCase = c;
   1014                 } else if(c != lastCase) {
   1015                     // There are more base primary CEs than tailored primaries.
   1016                     // Set mixed case if the case bits of the remainder differ.
   1017                     lastCase = 1;
   1018                     // Nothing more can change.
   1019                     break;
   1020                 }
   1021             }
   1022         }
   1023         if(numBasePrimaries >= numTailoredPrimaries) {
   1024             cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);
   1025         }
   1026     }
   1027 
   1028     for(int32_t i = 0; i < cesLength; ++i) {
   1029         int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits
   1030         int32_t strength = ceStrength(ce);
   1031         if(strength == UCOL_PRIMARY) {
   1032             ce |= (cases & 3) << 14;
   1033             cases >>= 2;
   1034         } else if(strength == UCOL_TERTIARY) {
   1035             // Tertiary CEs must have uppercase bits.
   1036             // See the LDML spec, and comments in class CollationCompare.
   1037             ce |= 0x8000;
   1038         }
   1039         // Tertiary ignorable CEs must have 0 case bits.
   1040         // We set 0 case bits for secondary CEs too
   1041         // since currently only U+0345 is cased and maps to a secondary CE,
   1042         // and it is lowercase. Other secondaries are uncased.
   1043         // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
   1044         ces[i] = ce;
   1045     }
   1046 }
   1047 
   1048 void
   1049 CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
   1050                                        UErrorCode &errorCode) {
   1051     if(U_FAILURE(errorCode)) { return; }
   1052     dataBuilder->suppressContractions(set, errorCode);
   1053     if(U_FAILURE(errorCode)) {
   1054         parserErrorReason = "application of [suppressContractions [set]] failed";
   1055     }
   1056 }
   1057 
   1058 void
   1059 CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,
   1060                            UErrorCode &errorCode) {
   1061     if(U_FAILURE(errorCode)) { return; }
   1062     optimizeSet.addAll(set);
   1063 }
   1064 
   1065 uint32_t
   1066 CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
   1067                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
   1068                                  UErrorCode &errorCode) {
   1069     // Map from the NFD input to the CEs.
   1070     ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
   1071     ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
   1072     addTailComposites(nfdPrefix, nfdString, errorCode);
   1073     return ce32;
   1074 }
   1075 
   1076 uint32_t
   1077 CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
   1078                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
   1079                                  UErrorCode &errorCode) {
   1080     if(U_FAILURE(errorCode)) { return ce32; }
   1081 
   1082     // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
   1083     if(nfdPrefix.isEmpty()) {
   1084         CanonicalIterator stringIter(nfdString, errorCode);
   1085         if(U_FAILURE(errorCode)) { return ce32; }
   1086         UnicodeString prefix;
   1087         for(;;) {
   1088             UnicodeString str = stringIter.next();
   1089             if(str.isBogus()) { break; }
   1090             if(ignoreString(str, errorCode) || str == nfdString) { continue; }
   1091             ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
   1092             if(U_FAILURE(errorCode)) { return ce32; }
   1093         }
   1094     } else {
   1095         CanonicalIterator prefixIter(nfdPrefix, errorCode);
   1096         CanonicalIterator stringIter(nfdString, errorCode);
   1097         if(U_FAILURE(errorCode)) { return ce32; }
   1098         for(;;) {
   1099             UnicodeString prefix = prefixIter.next();
   1100             if(prefix.isBogus()) { break; }
   1101             if(ignorePrefix(prefix, errorCode)) { continue; }
   1102             UBool samePrefix = prefix == nfdPrefix;
   1103             for(;;) {
   1104                 UnicodeString str = stringIter.next();
   1105                 if(str.isBogus()) { break; }
   1106                 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
   1107                 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
   1108                 if(U_FAILURE(errorCode)) { return ce32; }
   1109             }
   1110             stringIter.reset();
   1111         }
   1112     }
   1113     return ce32;
   1114 }
   1115 
   1116 void
   1117 CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
   1118                                     UErrorCode &errorCode) {
   1119     if(U_FAILURE(errorCode)) { return; }
   1120 
   1121     // Look for the last starter in the NFD string.
   1122     UChar32 lastStarter;
   1123     int32_t indexAfterLastStarter = nfdString.length();
   1124     for(;;) {
   1125         if(indexAfterLastStarter == 0) { return; }  // no starter at all
   1126         lastStarter = nfdString.char32At(indexAfterLastStarter - 1);
   1127         if(nfd.getCombiningClass(lastStarter) == 0) { break; }
   1128         indexAfterLastStarter -= U16_LENGTH(lastStarter);
   1129     }
   1130     // No closure to Hangul syllables since we decompose them on the fly.
   1131     if(Hangul::isJamoL(lastStarter)) { return; }
   1132 
   1133     // Are there any composites whose decomposition starts with the lastStarter?
   1134     // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
   1135     // We might find some more equivalent mappings here if it did.
   1136     UnicodeSet composites;
   1137     if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
   1138 
   1139     UnicodeString decomp;
   1140     UnicodeString newNFDString, newString;
   1141     int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
   1142     UnicodeSetIterator iter(composites);
   1143     while(iter.next()) {
   1144         U_ASSERT(!iter.isString());
   1145         UChar32 composite = iter.getCodepoint();
   1146         nfd.getDecomposition(composite, decomp);
   1147         if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
   1148                                      newNFDString, newString, errorCode)) {
   1149             continue;
   1150         }
   1151         int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);
   1152         if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {
   1153             // Ignore mappings that we cannot store.
   1154             continue;
   1155         }
   1156         // Note: It is possible that the newCEs do not make use of the mapping
   1157         // for which we are adding the tail composites, in which case we might be adding
   1158         // unnecessary mappings.
   1159         // For example, when we add tail composites for ae^ (^=combining circumflex),
   1160         // UCA discontiguous-contraction matching does not find any matches
   1161         // for ae_^ (_=any combining diacritic below) *unless* there is also
   1162         // a contraction mapping for ae.
   1163         // Thus, if there is no ae contraction, then the ae^ mapping is ignored
   1164         // while fetching the newCEs for ae_^.
   1165         // TODO: Try to detect this effectively.
   1166         // (Alternatively, print a warning when prefix contractions are missing.)
   1167 
   1168         // We do not need an explicit mapping for the NFD strings.
   1169         // It is fine if the NFD input collates like this via a sequence of mappings.
   1170         // It also saves a little bit of space, and may reduce the set of characters with contractions.
   1171         uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
   1172                                        newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);
   1173         if(ce32 != Collation::UNASSIGNED_CE32) {
   1174             // was different, was added
   1175             addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
   1176         }
   1177     }
   1178 }
   1179 
   1180 UBool
   1181 CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
   1182                                            int32_t indexAfterLastStarter,
   1183                                            UChar32 composite, const UnicodeString &decomp,
   1184                                            UnicodeString &newNFDString, UnicodeString &newString,
   1185                                            UErrorCode &errorCode) const {
   1186     if(U_FAILURE(errorCode)) { return FALSE; }
   1187     U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
   1188     int32_t lastStarterLength = decomp.moveIndex32(0, 1);
   1189     if(lastStarterLength == decomp.length()) {
   1190         // Singleton decompositions should be found by addWithClosure()
   1191         // and the CanonicalIterator, so we can ignore them here.
   1192         return FALSE;
   1193     }
   1194     if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
   1195                          decomp, lastStarterLength, 0x7fffffff) == 0) {
   1196         // same strings, nothing new to be found here
   1197         return FALSE;
   1198     }
   1199 
   1200     // Make new FCD strings that combine a composite, or its decomposition,
   1201     // into the nfdString's last starter and the combining marks following it.
   1202     // Make an NFD version, and a version with the composite.
   1203     newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
   1204     newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);
   1205 
   1206     // The following is related to discontiguous contraction matching,
   1207     // but builds only FCD strings (or else returns FALSE).
   1208     int32_t sourceIndex = indexAfterLastStarter;
   1209     int32_t decompIndex = lastStarterLength;
   1210     // Small optimization: We keep the source character across loop iterations
   1211     // because we do not always consume it,
   1212     // and then need not fetch it again nor look up its combining class again.
   1213     UChar32 sourceChar = U_SENTINEL;
   1214     // The cc variables need to be declared before the loop so that at the end
   1215     // they are set to the last combining classes seen.
   1216     uint8_t sourceCC = 0;
   1217     uint8_t decompCC = 0;
   1218     for(;;) {
   1219         if(sourceChar < 0) {
   1220             if(sourceIndex >= nfdString.length()) { break; }
   1221             sourceChar = nfdString.char32At(sourceIndex);
   1222             sourceCC = nfd.getCombiningClass(sourceChar);
   1223             U_ASSERT(sourceCC != 0);
   1224         }
   1225         // We consume a decomposition character in each iteration.
   1226         if(decompIndex >= decomp.length()) { break; }
   1227         UChar32 decompChar = decomp.char32At(decompIndex);
   1228         decompCC = nfd.getCombiningClass(decompChar);
   1229         // Compare the two characters and their combining classes.
   1230         if(decompCC == 0) {
   1231             // Unable to merge because the source contains a non-zero combining mark
   1232             // but the composite's decomposition contains another starter.
   1233             // The strings would not be equivalent.
   1234             return FALSE;
   1235         } else if(sourceCC < decompCC) {
   1236             // Composite + sourceChar would not be FCD.
   1237             return FALSE;
   1238         } else if(decompCC < sourceCC) {
   1239             newNFDString.append(decompChar);
   1240             decompIndex += U16_LENGTH(decompChar);
   1241         } else if(decompChar != sourceChar) {
   1242             // Blocked because same combining class.
   1243             return FALSE;
   1244         } else {  // match: decompChar == sourceChar
   1245             newNFDString.append(decompChar);
   1246             decompIndex += U16_LENGTH(decompChar);
   1247             sourceIndex += U16_LENGTH(decompChar);
   1248             sourceChar = U_SENTINEL;
   1249         }
   1250     }
   1251     // We are at the end of at least one of the two inputs.
   1252     if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
   1253         if(sourceCC < decompCC) {
   1254             // Appending the next source character to the composite would not be FCD.
   1255             return FALSE;
   1256         }
   1257         newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
   1258         newString.append(nfdString, sourceIndex, 0x7fffffff);
   1259     } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
   1260         newNFDString.append(decomp, decompIndex, 0x7fffffff);
   1261     }
   1262     U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
   1263     U_ASSERT(fcd.isNormalized(newString, errorCode));
   1264     U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
   1265     return TRUE;
   1266 }
   1267 
   1268 UBool
   1269 CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
   1270     // Do not map non-FCD prefixes.
   1271     return !isFCD(s, errorCode);
   1272 }
   1273 
   1274 UBool
   1275 CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
   1276     // Do not map non-FCD strings.
   1277     // Do not map strings that start with Hangul syllables: We decompose those on the fly.
   1278     return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
   1279 }
   1280 
   1281 UBool
   1282 CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
   1283     return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
   1284 }
   1285 
   1286 void
   1287 CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
   1288     UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
   1289     if(U_FAILURE(errorCode)) { return; }
   1290     // Hangul is decomposed on the fly during collation.
   1291     composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
   1292     UnicodeString prefix;  // empty
   1293     UnicodeString nfdString;
   1294     UnicodeSetIterator iter(composites);
   1295     while(iter.next()) {
   1296         U_ASSERT(!iter.isString());
   1297         nfd.getDecomposition(iter.getCodepoint(), nfdString);
   1298         cesLength = dataBuilder->getCEs(nfdString, ces, 0);
   1299         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
   1300             // Too many CEs from the decomposition (unusual), ignore this composite.
   1301             // We could add a capacity parameter to getCEs() and reallocate if necessary.
   1302             // However, this can only really happen in contrived cases.
   1303             continue;
   1304         }
   1305         const UnicodeString &composite(iter.getString());
   1306         addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
   1307     }
   1308 }
   1309 
   1310 uint32_t
   1311 CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
   1312                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
   1313                                  UErrorCode &errorCode) {
   1314     if(U_FAILURE(errorCode)) { return ce32; }
   1315     int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];
   1316     int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);
   1317     if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
   1318         if(ce32 == Collation::UNASSIGNED_CE32) {
   1319             ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);
   1320         }
   1321         dataBuilder->addCE32(prefix, str, ce32, errorCode);
   1322     }
   1323     return ce32;
   1324 }
   1325 
   1326 UBool
   1327 CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
   1328                           const int64_t ces2[], int32_t ces2Length) {
   1329     if(ces1Length != ces2Length) {
   1330         return FALSE;
   1331     }
   1332     U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
   1333     for(int32_t i = 0; i < ces1Length; ++i) {
   1334         if(ces1[i] != ces2[i]) { return FALSE; }
   1335     }
   1336     return TRUE;
   1337 }
   1338 
   1339 #ifdef DEBUG_COLLATION_BUILDER
   1340 
   1341 uint32_t
   1342 alignWeightRight(uint32_t w) {
   1343     if(w != 0) {
   1344         while((w & 0xff) == 0) { w >>= 8; }
   1345     }
   1346     return w;
   1347 }
   1348 
   1349 #endif
   1350 
   1351 void
   1352 CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
   1353     if(U_FAILURE(errorCode)) { return; }
   1354 
   1355     CollationWeights primaries, secondaries, tertiaries;
   1356     int64_t *nodesArray = nodes.getBuffer();
   1357 
   1358     for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
   1359         int32_t i = rootPrimaryIndexes.elementAti(rpi);
   1360         int64_t node = nodesArray[i];
   1361         uint32_t p = weight32FromNode(node);
   1362         uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;
   1363         uint32_t t = s;
   1364         uint32_t q = 0;
   1365         UBool pIsTailored = FALSE;
   1366         UBool sIsTailored = FALSE;
   1367         UBool tIsTailored = FALSE;
   1368 #ifdef DEBUG_COLLATION_BUILDER
   1369         printf("\nprimary     %lx\n", (long)alignWeightRight(p));
   1370 #endif
   1371         int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
   1372         int32_t nextIndex = nextIndexFromNode(node);
   1373         while(nextIndex != 0) {
   1374             i = nextIndex;
   1375             node = nodesArray[i];
   1376             nextIndex = nextIndexFromNode(node);
   1377             int32_t strength = strengthFromNode(node);
   1378             if(strength == UCOL_QUATERNARY) {
   1379                 U_ASSERT(isTailoredNode(node));
   1380 #ifdef DEBUG_COLLATION_BUILDER
   1381                 printf("      quat+     ");
   1382 #endif
   1383                 if(q == 3) {
   1384                     errorCode = U_BUFFER_OVERFLOW_ERROR;
   1385                     errorReason = "quaternary tailoring gap too small";
   1386                     return;
   1387                 }
   1388                 ++q;
   1389             } else {
   1390                 if(strength == UCOL_TERTIARY) {
   1391                     if(isTailoredNode(node)) {
   1392 #ifdef DEBUG_COLLATION_BUILDER
   1393                         printf("    ter+        ");
   1394 #endif
   1395                         if(!tIsTailored) {
   1396                             // First tailored tertiary node for [p, s].
   1397                             int32_t tCount = countTailoredNodes(nodesArray, nextIndex,
   1398                                                                 UCOL_TERTIARY) + 1;
   1399                             uint32_t tLimit;
   1400                             if(t == 0) {
   1401                                 // Gap at the beginning of the tertiary CE range.
   1402                                 t = rootElements.getTertiaryBoundary() - 0x100;
   1403                                 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
   1404                             } else if(t == BEFORE_WEIGHT16) {
   1405                                 tLimit = Collation::COMMON_WEIGHT16;
   1406                             } else if(!pIsTailored && !sIsTailored) {
   1407                                 // p and s are root weights.
   1408                                 tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
   1409                             } else {
   1410                                 // [p, s] is tailored.
   1411                                 U_ASSERT(t == Collation::COMMON_WEIGHT16);
   1412                                 tLimit = rootElements.getTertiaryBoundary();
   1413                             }
   1414                             U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);
   1415                             tertiaries.initForTertiary();
   1416                             if(!tertiaries.allocWeights(t, tLimit, tCount)) {
   1417                                 errorCode = U_BUFFER_OVERFLOW_ERROR;
   1418                                 errorReason = "tertiary tailoring gap too small";
   1419                                 return;
   1420                             }
   1421                             tIsTailored = TRUE;
   1422                         }
   1423                         t = tertiaries.nextWeight();
   1424                         U_ASSERT(t != 0xffffffff);
   1425                     } else {
   1426                         t = weight16FromNode(node);
   1427                         tIsTailored = FALSE;
   1428 #ifdef DEBUG_COLLATION_BUILDER
   1429                         printf("    ter     %lx\n", (long)alignWeightRight(t));
   1430 #endif
   1431                     }
   1432                 } else {
   1433                     if(strength == UCOL_SECONDARY) {
   1434                         if(isTailoredNode(node)) {
   1435 #ifdef DEBUG_COLLATION_BUILDER
   1436                             printf("  sec+          ");
   1437 #endif
   1438                             if(!sIsTailored) {
   1439                                 // First tailored secondary node for p.
   1440                                 int32_t sCount = countTailoredNodes(nodesArray, nextIndex,
   1441                                                                     UCOL_SECONDARY) + 1;
   1442                                 uint32_t sLimit;
   1443                                 if(s == 0) {
   1444                                     // Gap at the beginning of the secondary CE range.
   1445                                     s = rootElements.getSecondaryBoundary() - 0x100;
   1446                                     sLimit = rootElements.getFirstSecondaryCE() >> 16;
   1447                                 } else if(s == BEFORE_WEIGHT16) {
   1448                                     sLimit = Collation::COMMON_WEIGHT16;
   1449                                 } else if(!pIsTailored) {
   1450                                     // p is a root primary.
   1451                                     sLimit = rootElements.getSecondaryAfter(pIndex, s);
   1452                                 } else {
   1453                                     // p is a tailored primary.
   1454                                     U_ASSERT(s == Collation::COMMON_WEIGHT16);
   1455                                     sLimit = rootElements.getSecondaryBoundary();
   1456                                 }
   1457                                 if(s == Collation::COMMON_WEIGHT16) {
   1458                                     // Do not tailor into the getSortKey() range of
   1459                                     // compressed common secondaries.
   1460                                     s = rootElements.getLastCommonSecondary();
   1461                                 }
   1462                                 secondaries.initForSecondary();
   1463                                 if(!secondaries.allocWeights(s, sLimit, sCount)) {
   1464                                     errorCode = U_BUFFER_OVERFLOW_ERROR;
   1465                                     errorReason = "secondary tailoring gap too small";
   1466                                     return;
   1467                                 }
   1468                                 sIsTailored = TRUE;
   1469                             }
   1470                             s = secondaries.nextWeight();
   1471                             U_ASSERT(s != 0xffffffff);
   1472                         } else {
   1473                             s = weight16FromNode(node);
   1474                             sIsTailored = FALSE;
   1475 #ifdef DEBUG_COLLATION_BUILDER
   1476                             printf("  sec       %lx\n", (long)alignWeightRight(s));
   1477 #endif
   1478                         }
   1479                     } else /* UCOL_PRIMARY */ {
   1480                         U_ASSERT(isTailoredNode(node));
   1481 #ifdef DEBUG_COLLATION_BUILDER
   1482                         printf("pri+            ");
   1483 #endif
   1484                         if(!pIsTailored) {
   1485                             // First tailored primary node in this list.
   1486                             int32_t pCount = countTailoredNodes(nodesArray, nextIndex,
   1487                                                                 UCOL_PRIMARY) + 1;
   1488                             UBool isCompressible = baseData->isCompressiblePrimary(p);
   1489                             uint32_t pLimit =
   1490                                 rootElements.getPrimaryAfter(p, pIndex, isCompressible);
   1491                             primaries.initForPrimary(isCompressible);
   1492                             if(!primaries.allocWeights(p, pLimit, pCount)) {
   1493                                 errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?
   1494                                 errorReason = "primary tailoring gap too small";
   1495                                 return;
   1496                             }
   1497                             pIsTailored = TRUE;
   1498                         }
   1499                         p = primaries.nextWeight();
   1500                         U_ASSERT(p != 0xffffffff);
   1501                         s = Collation::COMMON_WEIGHT16;
   1502                         sIsTailored = FALSE;
   1503                     }
   1504                     t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;
   1505                     tIsTailored = FALSE;
   1506                 }
   1507                 q = 0;
   1508             }
   1509             if(isTailoredNode(node)) {
   1510                 nodesArray[i] = Collation::makeCE(p, s, t, q);
   1511 #ifdef DEBUG_COLLATION_BUILDER
   1512                 printf("%016llx\n", (long long)nodesArray[i]);
   1513 #endif
   1514             }
   1515         }
   1516     }
   1517 }
   1518 
   1519 int32_t
   1520 CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {
   1521     int32_t count = 0;
   1522     for(;;) {
   1523         if(i == 0) { break; }
   1524         int64_t node = nodesArray[i];
   1525         if(strengthFromNode(node) < strength) { break; }
   1526         if(strengthFromNode(node) == strength) {
   1527             if(isTailoredNode(node)) {
   1528                 ++count;
   1529             } else {
   1530                 break;
   1531             }
   1532         }
   1533         i = nextIndexFromNode(node);
   1534     }
   1535     return count;
   1536 }
   1537 
   1538 class CEFinalizer : public CollationDataBuilder::CEModifier {
   1539 public:
   1540     CEFinalizer(const int64_t *ces) : finalCEs(ces) {}
   1541     virtual ~CEFinalizer();
   1542     virtual int64_t modifyCE32(uint32_t ce32) const {
   1543         U_ASSERT(!Collation::isSpecialCE32(ce32));
   1544         if(CollationBuilder::isTempCE32(ce32)) {
   1545             // retain case bits
   1546             return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
   1547         } else {
   1548             return Collation::NO_CE;
   1549         }
   1550     }
   1551     virtual int64_t modifyCE(int64_t ce) const {
   1552         if(CollationBuilder::isTempCE(ce)) {
   1553             // retain case bits
   1554             return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);
   1555         } else {
   1556             return Collation::NO_CE;
   1557         }
   1558     }
   1559 
   1560 private:
   1561     const int64_t *finalCEs;
   1562 };
   1563 
   1564 CEFinalizer::~CEFinalizer() {}
   1565 
   1566 void
   1567 CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
   1568     if(U_FAILURE(errorCode)) { return; }
   1569     LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode));
   1570     if(newBuilder.isNull()) {
   1571         errorCode = U_MEMORY_ALLOCATION_ERROR;
   1572         return;
   1573     }
   1574     newBuilder->initForTailoring(baseData, errorCode);
   1575     CEFinalizer finalizer(nodes.getBuffer());
   1576     newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);
   1577     if(U_FAILURE(errorCode)) { return; }
   1578     delete dataBuilder;
   1579     dataBuilder = newBuilder.orphan();
   1580 }
   1581 
   1582 int32_t
   1583 CollationBuilder::ceStrength(int64_t ce) {
   1584     return
   1585         isTempCE(ce) ? strengthFromTempCE(ce) :
   1586         (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :
   1587         ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :
   1588         ce != 0 ? UCOL_TERTIARY :
   1589         UCOL_IDENTICAL;
   1590 }
   1591 
   1592 U_NAMESPACE_END
   1593 
   1594 U_NAMESPACE_USE
   1595 
   1596 U_CAPI UCollator * U_EXPORT2
   1597 ucol_openRules(const UChar *rules, int32_t rulesLength,
   1598                UColAttributeValue normalizationMode, UCollationStrength strength,
   1599                UParseError *parseError, UErrorCode *pErrorCode) {
   1600     if(U_FAILURE(*pErrorCode)) { return NULL; }
   1601     if(rules == NULL && rulesLength != 0) {
   1602         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1603         return NULL;
   1604     }
   1605     RuleBasedCollator *coll = new RuleBasedCollator();
   1606     if(coll == NULL) {
   1607         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
   1608         return NULL;
   1609     }
   1610     UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength);
   1611     coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode);
   1612     if(U_FAILURE(*pErrorCode)) {
   1613         delete coll;
   1614         return NULL;
   1615     }
   1616     return coll->toUCollator();
   1617 }
   1618 
   1619 static const int32_t internalBufferSize = 512;
   1620 
   1621 // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp
   1622 // because it calls UnicodeSet "builder" code that depends on all Unicode properties,
   1623 // and the rest of the collation "runtime" code only depends on normalization.
   1624 // This function is not related to the collation builder,
   1625 // but it did not seem worth moving it into its own .cpp file,
   1626 // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.
   1627 U_CAPI int32_t U_EXPORT2
   1628 ucol_getUnsafeSet( const UCollator *coll,
   1629                   USet *unsafe,
   1630                   UErrorCode *status)
   1631 {
   1632     UChar buffer[internalBufferSize];
   1633     int32_t len = 0;
   1634 
   1635     uset_clear(unsafe);
   1636 
   1637     // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
   1638     static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
   1639                                     0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
   1640 
   1641     // add chars that fail the fcd check
   1642     uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
   1643 
   1644     // add lead/trail surrogates
   1645     // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,
   1646     // not when testing code *points*)
   1647     uset_addRange(unsafe, 0xd800, 0xdfff);
   1648 
   1649     USet *contractions = uset_open(0,0);
   1650 
   1651     int32_t i = 0, j = 0;
   1652     ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
   1653     int32_t contsSize = uset_size(contractions);
   1654     UChar32 c = 0;
   1655     // Contraction set consists only of strings
   1656     // to get unsafe code points, we need to
   1657     // break the strings apart and add them to the unsafe set
   1658     for(i = 0; i < contsSize; i++) {
   1659         len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
   1660         if(len > 0) {
   1661             j = 0;
   1662             while(j < len) {
   1663                 U16_NEXT(buffer, j, len, c);
   1664                 if(j < len) {
   1665                     uset_add(unsafe, c);
   1666                 }
   1667             }
   1668         }
   1669     }
   1670 
   1671     uset_close(contractions);
   1672 
   1673     return uset_size(unsafe);
   1674 }
   1675 
   1676 #endif  // !UCONFIG_NO_COLLATION
   1677