1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationBuilder.java, ported from collationbuilder.h/.cpp 9 * 10 * C++ version created on: 2013may06 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import java.text.ParseException; 17 18 import com.ibm.icu.impl.Norm2AllModes; 19 import com.ibm.icu.impl.Normalizer2Impl; 20 import com.ibm.icu.impl.Normalizer2Impl.Hangul; 21 import com.ibm.icu.lang.UScript; 22 import com.ibm.icu.text.CanonicalIterator; 23 import com.ibm.icu.text.Collator; 24 import com.ibm.icu.text.Normalizer2; 25 import com.ibm.icu.text.UnicodeSet; 26 import com.ibm.icu.text.UnicodeSetIterator; 27 import com.ibm.icu.util.ULocale; 28 29 public final class CollationBuilder extends CollationRuleParser.Sink { 30 private static final boolean DEBUG = false; 31 private static final class BundleImporter implements CollationRuleParser.Importer { 32 BundleImporter() {} 33 @Override 34 public String getRules(String localeID, String collationType) { 35 return CollationLoader.loadRules(new ULocale(localeID), collationType); 36 } 37 } 38 39 public CollationBuilder(CollationTailoring b) { 40 nfd = Normalizer2.getNFDInstance(); 41 fcd = Norm2AllModes.getFCDNormalizer2(); 42 nfcImpl = Norm2AllModes.getNFCInstance().impl; 43 base = b; 44 baseData = b.data; 45 rootElements = new CollationRootElements(b.data.rootElements); 46 variableTop = 0; 47 dataBuilder = new CollationDataBuilder(); 48 fastLatinEnabled = true; 49 cesLength = 0; 50 rootPrimaryIndexes = new UVector32(); 51 nodes = new UVector64(); 52 nfcImpl.ensureCanonIterData(); 53 dataBuilder.initForTailoring(baseData); 54 } 55 56 public CollationTailoring parseAndBuild(String ruleString) throws ParseException { 57 if(baseData.rootElements == null) { 58 // C++ U_MISSING_RESOURCE_ERROR 59 throw new UnsupportedOperationException( 60 "missing root elements data, tailoring not supported"); 61 } 62 CollationTailoring tailoring = new CollationTailoring(base.settings); 63 CollationRuleParser parser = new CollationRuleParser(baseData); 64 // Note: This always bases &[last variable] and &[first regular] 65 // on the root collator's maxVariable/variableTop. 66 // If we wanted this to change after [maxVariable x], then we would keep 67 // the tailoring.settings pointer here and read its variableTop when we need it. 68 // See http://unicode.org/cldr/trac/ticket/6070 69 variableTop = base.settings.readOnly().variableTop; 70 parser.setSink(this); 71 // In Java, there is only one Importer implementation. 72 // In C++, the importer is a parameter for this method. 73 parser.setImporter(new BundleImporter()); 74 CollationSettings ownedSettings = tailoring.settings.copyOnWrite(); 75 parser.parse(ruleString, ownedSettings); 76 if(dataBuilder.hasMappings()) { 77 makeTailoredCEs(); 78 closeOverComposites(); 79 finalizeCEs(); 80 // Copy all of ASCII, and Latin-1 letters, into each tailoring. 81 optimizeSet.add(0, 0x7f); 82 optimizeSet.add(0xc0, 0xff); 83 // Hangul is decomposed on the fly during collation, 84 // and the tailoring data is always built with HANGUL_TAG specials. 85 optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 86 dataBuilder.optimize(optimizeSet); 87 tailoring.ensureOwnedData(); 88 if(fastLatinEnabled) { dataBuilder.enableFastLatin(); } 89 dataBuilder.build(tailoring.ownedData); 90 // C++ tailoring.builder = dataBuilder; 91 dataBuilder = null; 92 } else { 93 tailoring.data = baseData; 94 } 95 ownedSettings.fastLatinOptions = CollationFastLatin.getOptions( 96 tailoring.data, ownedSettings, 97 ownedSettings.fastLatinPrimaries); 98 tailoring.setRules(ruleString); 99 // In Java, we do not have a rules version. 100 // In C++, the genrb build tool reads and supplies one, 101 // and the rulesVersion is a parameter for this method. 102 tailoring.setVersion(base.version, 0 /* rulesVersion */); 103 return tailoring; 104 } 105 106 /** Implements CollationRuleParser.Sink. */ 107 @Override 108 void addReset(int strength, CharSequence str) { 109 assert(str.length() != 0); 110 if(str.charAt(0) == CollationRuleParser.POS_LEAD) { 111 ces[0] = getSpecialResetPosition(str); 112 cesLength = 1; 113 assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0); 114 } else { 115 // normal reset to a character or string 116 String nfdString = nfd.normalize(str); 117 cesLength = dataBuilder.getCEs(nfdString, ces, 0); 118 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 119 throw new IllegalArgumentException( 120 "reset position maps to too many collation elements (more than 31)"); 121 } 122 } 123 if(strength == Collator.IDENTICAL) { return; } // simple reset-at-position 124 125 // &[before strength]position 126 assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY); 127 int index = findOrInsertNodeForCEs(strength); 128 129 long node = nodes.elementAti(index); 130 // If the index is for a "weaker" node, 131 // then skip backwards over this and further "weaker" nodes. 132 while(strengthFromNode(node) > strength) { 133 index = previousIndexFromNode(node); 134 node = nodes.elementAti(index); 135 } 136 137 // Find or insert a node whose index we will put into a temporary CE. 138 if(strengthFromNode(node) == strength && isTailoredNode(node)) { 139 // Reset to just before this same-strength tailored node. 140 index = previousIndexFromNode(node); 141 } else if(strength == Collator.PRIMARY) { 142 // root primary node (has no previous index) 143 long p = weight32FromNode(node); 144 if(p == 0) { 145 throw new UnsupportedOperationException( 146 "reset primary-before ignorable not possible"); 147 } 148 if(p <= rootElements.getFirstPrimary()) { 149 // There is no primary gap between ignorables and the space-first-primary. 150 throw new UnsupportedOperationException( 151 "reset primary-before first non-ignorable not supported"); 152 } 153 if(p == Collation.FIRST_TRAILING_PRIMARY) { 154 // We do not support tailoring to an unassigned-implicit CE. 155 throw new UnsupportedOperationException( 156 "reset primary-before [first trailing] not supported"); 157 } 158 p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p)); 159 index = findOrInsertNodeForPrimary(p); 160 // Go to the last node in this list: 161 // Tailor after the last node between adjacent root nodes. 162 for(;;) { 163 node = nodes.elementAti(index); 164 int nextIndex = nextIndexFromNode(node); 165 if(nextIndex == 0) { break; } 166 index = nextIndex; 167 } 168 } else { 169 // &[before 2] or &[before 3] 170 index = findCommonNode(index, Collator.SECONDARY); 171 if(strength >= Collator.TERTIARY) { 172 index = findCommonNode(index, Collator.TERTIARY); 173 } 174 // findCommonNode() stayed on the stronger node or moved to 175 // an explicit common-weight node of the reset-before strength. 176 node = nodes.elementAti(index); 177 if(strengthFromNode(node) == strength) { 178 // Found a same-strength node with an explicit weight. 179 int weight16 = weight16FromNode(node); 180 if(weight16 == 0) { 181 throw new UnsupportedOperationException( 182 (strength == Collator.SECONDARY) ? 183 "reset secondary-before secondary ignorable not possible" : 184 "reset tertiary-before completely ignorable not possible"); 185 } 186 assert(weight16 > Collation.BEFORE_WEIGHT16); 187 // Reset to just before this node. 188 // Insert the preceding same-level explicit weight if it is not there already. 189 // Which explicit weight immediately precedes this one? 190 weight16 = getWeight16Before(index, node, strength); 191 // Does this preceding weight have a node? 192 int previousWeight16; 193 int previousIndex = previousIndexFromNode(node); 194 for(int i = previousIndex;; i = previousIndexFromNode(node)) { 195 node = nodes.elementAti(i); 196 int previousStrength = strengthFromNode(node); 197 if(previousStrength < strength) { 198 assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex); 199 // Either the reset element has an above-common weight and 200 // the parent node provides the implied common weight, 201 // or the reset element has a weight<=common in the node 202 // right after the parent, and we need to insert the preceding weight. 203 previousWeight16 = Collation.COMMON_WEIGHT16; 204 break; 205 } else if(previousStrength == strength && !isTailoredNode(node)) { 206 previousWeight16 = weight16FromNode(node); 207 break; 208 } 209 // Skip weaker nodes and same-level tailored nodes. 210 } 211 if(previousWeight16 == weight16) { 212 // The preceding weight has a node, 213 // maybe with following weaker or tailored nodes. 214 // Reset to the last of them. 215 index = previousIndex; 216 } else { 217 // Insert a node with the preceding weight, reset to that. 218 node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 219 index = insertNodeBetween(previousIndex, index, node); 220 } 221 } else { 222 // Found a stronger node with implied strength-common weight. 223 int weight16 = getWeight16Before(index, node, strength); 224 index = findOrInsertWeakNode(index, weight16, strength); 225 } 226 // Strength of the temporary CE = strength of its reset position. 227 // Code above raises an error if the before-strength is stronger. 228 strength = ceStrength(ces[cesLength - 1]); 229 } 230 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 231 } 232 233 /** 234 * Returns the secondary or tertiary weight preceding the current node's weight. 235 * node=nodes[index]. 236 */ 237 private int getWeight16Before(int index, long node, int level) { 238 assert(strengthFromNode(node) < level || !isTailoredNode(node)); 239 // Collect the root CE weights if this node is for a root CE. 240 // If it is not, then return the low non-primary boundary for a tailored CE. 241 int t; 242 if(strengthFromNode(node) == Collator.TERTIARY) { 243 t = weight16FromNode(node); 244 } else { 245 t = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 246 } 247 while(strengthFromNode(node) > Collator.SECONDARY) { 248 index = previousIndexFromNode(node); 249 node = nodes.elementAti(index); 250 } 251 if(isTailoredNode(node)) { 252 return Collation.BEFORE_WEIGHT16; 253 } 254 int s; 255 if(strengthFromNode(node) == Collator.SECONDARY) { 256 s = weight16FromNode(node); 257 } else { 258 s = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 259 } 260 while(strengthFromNode(node) > Collator.PRIMARY) { 261 index = previousIndexFromNode(node); 262 node = nodes.elementAti(index); 263 } 264 if(isTailoredNode(node)) { 265 return Collation.BEFORE_WEIGHT16; 266 } 267 // [p, s, t] is a root CE. Return the preceding weight for the requested level. 268 long p = weight32FromNode(node); 269 int weight16; 270 if(level == Collator.SECONDARY) { 271 weight16 = rootElements.getSecondaryBefore(p, s); 272 } else { 273 weight16 = rootElements.getTertiaryBefore(p, s, t); 274 assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0); 275 } 276 return weight16; 277 } 278 279 private long getSpecialResetPosition(CharSequence str) { 280 assert(str.length() == 2); 281 long ce; 282 int strength = Collator.PRIMARY; 283 boolean isBoundary = false; 284 CollationRuleParser.Position pos = 285 CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE]; 286 switch(pos) { 287 case FIRST_TERTIARY_IGNORABLE: 288 // Quaternary CEs are not supported. 289 // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 290 return 0; 291 case LAST_TERTIARY_IGNORABLE: 292 return 0; 293 case FIRST_SECONDARY_IGNORABLE: { 294 // Look for a tailored tertiary node after [0, 0, 0]. 295 int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY); 296 long node = nodes.elementAti(index); 297 if((index = nextIndexFromNode(node)) != 0) { 298 node = nodes.elementAti(index); 299 assert(strengthFromNode(node) <= Collator.TERTIARY); 300 if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) { 301 return tempCEFromIndexAndStrength(index, Collator.TERTIARY); 302 } 303 } 304 return rootElements.getFirstTertiaryCE(); 305 // No need to look for nodeHasAnyBefore() on a tertiary node. 306 } 307 case LAST_SECONDARY_IGNORABLE: 308 ce = rootElements.getLastTertiaryCE(); 309 strength = Collator.TERTIARY; 310 break; 311 case FIRST_PRIMARY_IGNORABLE: { 312 // Look for a tailored secondary node after [0, 0, *]. 313 int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY); 314 long node = nodes.elementAti(index); 315 while((index = nextIndexFromNode(node)) != 0) { 316 node = nodes.elementAti(index); 317 strength = strengthFromNode(node); 318 if(strength < Collator.SECONDARY) { break; } 319 if(strength == Collator.SECONDARY) { 320 if(isTailoredNode(node)) { 321 if(nodeHasBefore3(node)) { 322 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 323 assert(isTailoredNode(nodes.elementAti(index))); 324 } 325 return tempCEFromIndexAndStrength(index, Collator.SECONDARY); 326 } else { 327 break; 328 } 329 } 330 } 331 ce = rootElements.getFirstSecondaryCE(); 332 strength = Collator.SECONDARY; 333 break; 334 } 335 case LAST_PRIMARY_IGNORABLE: 336 ce = rootElements.getLastSecondaryCE(); 337 strength = Collator.SECONDARY; 338 break; 339 case FIRST_VARIABLE: 340 ce = rootElements.getFirstPrimaryCE(); 341 isBoundary = true; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 342 break; 343 case LAST_VARIABLE: 344 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 345 break; 346 case FIRST_REGULAR: 347 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 348 isBoundary = true; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 349 break; 350 case LAST_REGULAR: 351 // Use the Hani-first-primary rather than the actual last "regular" CE before it, 352 // for backward compatibility with behavior before the introduction of 353 // script-first-primary CEs in the root collator. 354 ce = rootElements.firstCEWithPrimaryAtLeast( 355 baseData.getFirstPrimaryForGroup(UScript.HAN)); 356 break; 357 case FIRST_IMPLICIT: 358 ce = baseData.getSingleCE(0x4e00); 359 break; 360 case LAST_IMPLICIT: 361 // We do not support tailoring to an unassigned-implicit CE. 362 throw new UnsupportedOperationException( 363 "reset to [last implicit] not supported"); 364 case FIRST_TRAILING: 365 ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY); 366 isBoundary = true; // trailing first primary (there is no mapping for it) 367 break; 368 case LAST_TRAILING: 369 throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF"); 370 default: 371 assert(false); 372 return 0; 373 } 374 375 int index = findOrInsertNodeForRootCE(ce, strength); 376 long node = nodes.elementAti(index); 377 if((pos.ordinal() & 1) == 0) { 378 // even pos = [first xyz] 379 if(!nodeHasAnyBefore(node) && isBoundary) { 380 // A <group> first primary boundary is artificially added to FractionalUCA.txt. 381 // It is reachable via its special contraction, but is not normally used. 382 // Find the first character tailored after the boundary CE, 383 // or the first real root CE after it. 384 if((index = nextIndexFromNode(node)) != 0) { 385 // If there is a following node, then it must be tailored 386 // because there are no root CEs with a boundary primary 387 // and non-common secondary/tertiary weights. 388 node = nodes.elementAti(index); 389 assert(isTailoredNode(node)); 390 ce = tempCEFromIndexAndStrength(index, strength); 391 } else { 392 assert(strength == Collator.PRIMARY); 393 long p = ce >>> 32; 394 int pIndex = rootElements.findPrimary(p); 395 boolean isCompressible = baseData.isCompressiblePrimary(p); 396 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 397 ce = Collation.makeCE(p); 398 index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY); 399 node = nodes.elementAti(index); 400 } 401 } 402 if(nodeHasAnyBefore(node)) { 403 // Get the first node that was tailored before this one at a weaker strength. 404 if(nodeHasBefore2(node)) { 405 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 406 node = nodes.elementAti(index); 407 } 408 if(nodeHasBefore3(node)) { 409 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 410 } 411 assert(isTailoredNode(nodes.elementAti(index))); 412 ce = tempCEFromIndexAndStrength(index, strength); 413 } 414 } else { 415 // odd pos = [last xyz] 416 // Find the last node that was tailored after the [last xyz] 417 // at a strength no greater than the position's strength. 418 for(;;) { 419 int nextIndex = nextIndexFromNode(node); 420 if(nextIndex == 0) { break; } 421 long nextNode = nodes.elementAti(nextIndex); 422 if(strengthFromNode(nextNode) < strength) { break; } 423 index = nextIndex; 424 node = nextNode; 425 } 426 // Do not make a temporary CE for a root node. 427 // This last node might be the node for the root CE itself, 428 // or a node with a common secondary or tertiary weight. 429 if(isTailoredNode(node)) { 430 ce = tempCEFromIndexAndStrength(index, strength); 431 } 432 } 433 return ce; 434 } 435 436 /** Implements CollationRuleParser.Sink. */ 437 @Override 438 void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) { 439 String nfdPrefix; 440 if(prefix.length() == 0) { 441 nfdPrefix = ""; 442 } else { 443 nfdPrefix = nfd.normalize(prefix); 444 } 445 String nfdString = nfd.normalize(str); 446 447 // The runtime code decomposes Hangul syllables on the fly, 448 // with recursive processing but without making the Jamo pieces visible for matching. 449 // It does not work with certain types of contextual mappings. 450 int nfdLength = nfdString.length(); 451 if(nfdLength >= 2) { 452 char c = nfdString.charAt(0); 453 if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) { 454 // While handling a Hangul syllable, contractions starting with Jamo L or V 455 // would not see the following Jamo of that syllable. 456 throw new UnsupportedOperationException( 457 "contractions starting with conjoining Jamo L or V not supported"); 458 } 459 c = nfdString.charAt(nfdLength - 1); 460 if(Hangul.isJamoL(c) || 461 (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) { 462 // A contraction ending with Jamo L or L+V would require 463 // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 464 // or decomposing a following Hangul syllable on the fly, during contraction matching. 465 throw new UnsupportedOperationException( 466 "contractions ending with conjoining Jamo L or L+V not supported"); 467 } 468 // A Hangul syllable completely inside a contraction is ok. 469 } 470 // Note: If there is a prefix, then the parser checked that 471 // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 472 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 473 // (While handling a Hangul syllable, prefixes on Jamo V or T 474 // would not see the previous Jamo of that syllable.) 475 476 if(strength != Collator.IDENTICAL) { 477 // Find the node index after which we insert the new tailored node. 478 int index = findOrInsertNodeForCEs(strength); 479 assert(cesLength > 0); 480 long ce = ces[cesLength - 1]; 481 if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) { 482 // There is no primary gap between ignorables and the space-first-primary. 483 throw new UnsupportedOperationException( 484 "tailoring primary after ignorables not supported"); 485 } 486 if(strength == Collator.QUATERNARY && ce == 0) { 487 // The CE data structure does not support non-zero quaternary weights 488 // on tertiary ignorables. 489 throw new UnsupportedOperationException( 490 "tailoring quaternary after tertiary ignorables not supported"); 491 } 492 // Insert the new tailored node. 493 index = insertTailoredNodeAfter(index, strength); 494 // Strength of the temporary CE: 495 // The new relation may yield a stronger CE but not a weaker one. 496 int tempStrength = ceStrength(ce); 497 if(strength < tempStrength) { tempStrength = strength; } 498 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 499 } 500 501 setCaseBits(nfdString); 502 503 int cesLengthBeforeExtension = cesLength; 504 if(extension.length() != 0) { 505 String nfdExtension = nfd.normalize(extension); 506 cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength); 507 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 508 throw new IllegalArgumentException( 509 "extension string adds too many collation elements (more than 31 total)"); 510 } 511 } 512 int ce32 = Collation.UNASSIGNED_CE32; 513 if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) && 514 !ignorePrefix(prefix) && !ignoreString(str)) { 515 // Map from the original input to the CEs. 516 // We do this in case the canonical closure is incomplete, 517 // so that it is possible to explicitly provide the missing mappings. 518 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32); 519 } 520 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32); 521 cesLength = cesLengthBeforeExtension; 522 } 523 524 /** 525 * Picks one of the current CEs and finds or inserts a node in the graph 526 * for the CE + strength. 527 */ 528 private int findOrInsertNodeForCEs(int strength) { 529 assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY); 530 531 // Find the last CE that is at least as "strong" as the requested difference. 532 // Note: Stronger is smaller (Collator.PRIMARY=0). 533 long ce; 534 for(;; --cesLength) { 535 if(cesLength == 0) { 536 ce = ces[0] = 0; 537 cesLength = 1; 538 break; 539 } else { 540 ce = ces[cesLength - 1]; 541 } 542 if(ceStrength(ce) <= strength) { break; } 543 } 544 545 if(isTempCE(ce)) { 546 // No need to findCommonNode() here for lower levels 547 // because insertTailoredNodeAfter() will do that anyway. 548 return indexFromTempCE(ce); 549 } 550 551 // root CE 552 if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) { 553 throw new UnsupportedOperationException( 554 "tailoring relative to an unassigned code point not supported"); 555 } 556 return findOrInsertNodeForRootCE(ce, strength); 557 } 558 559 private int findOrInsertNodeForRootCE(long ce, int strength) { 560 assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE); 561 562 // Find or insert the node for each of the root CE's weights, 563 // down to the requested level/strength. 564 // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 565 assert((ce & 0xc0) == 0); 566 int index = findOrInsertNodeForPrimary(ce >>> 32); 567 if(strength >= Collator.SECONDARY) { 568 int lower32 = (int)ce; 569 index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY); 570 if(strength >= Collator.TERTIARY) { 571 index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK, 572 Collator.TERTIARY); 573 } 574 } 575 return index; 576 } 577 578 /** 579 * Like Java Collections.binarySearch(List, key, Comparator). 580 * 581 * @return the index>=0 where the item was found, 582 * or the index<0 for inserting the string at ~index in sorted order 583 * (index into rootPrimaryIndexes) 584 */ 585 private static final int binarySearchForRootPrimaryNode( 586 int[] rootPrimaryIndexes, int length, long[] nodes, long p) { 587 if(length == 0) { return ~0; } 588 int start = 0; 589 int limit = length; 590 for (;;) { 591 int i = (int)(((long)start + (long)limit) / 2); 592 long node = nodes[rootPrimaryIndexes[i]]; 593 long nodePrimary = node >>> 32; // weight32FromNode(node) 594 if (p == nodePrimary) { 595 return i; 596 } else if (p < nodePrimary) { 597 if (i == start) { 598 return ~start; // insert s before i 599 } 600 limit = i; 601 } else { 602 if (i == start) { 603 return ~(start + 1); // insert s after i 604 } 605 start = i; 606 } 607 } 608 } 609 610 /** Finds or inserts the node for a root CE's primary weight. */ 611 private int findOrInsertNodeForPrimary(long p) { 612 int rootIndex = binarySearchForRootPrimaryNode( 613 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 614 if(rootIndex >= 0) { 615 return rootPrimaryIndexes.elementAti(rootIndex); 616 } else { 617 // Start a new list of nodes with this primary. 618 int index = nodes.size(); 619 nodes.addElement(nodeFromWeight32(p)); 620 rootPrimaryIndexes.insertElementAt(index, ~rootIndex); 621 return index; 622 } 623 } 624 625 /** Finds or inserts the node for a secondary or tertiary weight. */ 626 private int findOrInsertWeakNode(int index, int weight16, int level) { 627 assert(0 <= index && index < nodes.size()); 628 assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY); 629 630 if(weight16 == Collation.COMMON_WEIGHT16) { 631 return findCommonNode(index, level); 632 } 633 634 // If this will be the first below-common weight for the parent node, 635 // then we will also need to insert a common weight after it. 636 long node = nodes.elementAti(index); 637 assert(strengthFromNode(node) < level); // parent node is stronger 638 if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) { 639 int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; 640 if((node & hasThisLevelBefore) == 0) { 641 // The parent node has an implied level-common weight. 642 long commonNode = 643 nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level); 644 if(level == Collator.SECONDARY) { 645 // Move the HAS_BEFORE3 flag from the parent node 646 // to the new secondary common node. 647 commonNode |= node & HAS_BEFORE3; 648 node &= ~(long)HAS_BEFORE3; 649 } 650 nodes.setElementAt(node | hasThisLevelBefore, index); 651 // Insert below-common-weight node. 652 int nextIndex = nextIndexFromNode(node); 653 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 654 index = insertNodeBetween(index, nextIndex, node); 655 // Insert common-weight node. 656 insertNodeBetween(index, nextIndex, commonNode); 657 // Return index of below-common-weight node. 658 return index; 659 } 660 } 661 662 // Find the root CE's weight for this level. 663 // Postpone insertion if not found: 664 // Insert the new root node before the next stronger node, 665 // or before the next root node with the same strength and a larger weight. 666 int nextIndex; 667 while((nextIndex = nextIndexFromNode(node)) != 0) { 668 node = nodes.elementAti(nextIndex); 669 int nextStrength = strengthFromNode(node); 670 if(nextStrength <= level) { 671 // Insert before a stronger node. 672 if(nextStrength < level) { break; } 673 // nextStrength == level 674 if(!isTailoredNode(node)) { 675 int nextWeight16 = weight16FromNode(node); 676 if(nextWeight16 == weight16) { 677 // Found the node for the root CE up to this level. 678 return nextIndex; 679 } 680 // Insert before a node with a larger same-strength weight. 681 if(nextWeight16 > weight16) { break; } 682 } 683 } 684 // Skip the next node. 685 index = nextIndex; 686 } 687 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 688 return insertNodeBetween(index, nextIndex, node); 689 } 690 691 /** 692 * Makes and inserts a new tailored node into the list, after the one at index. 693 * Skips over nodes of weaker strength to maintain collation order 694 * ("postpone insertion"). 695 * @return the new node's index 696 */ 697 private int insertTailoredNodeAfter(int index, int strength) { 698 assert(0 <= index && index < nodes.size()); 699 if(strength >= Collator.SECONDARY) { 700 index = findCommonNode(index, Collator.SECONDARY); 701 if(strength >= Collator.TERTIARY) { 702 index = findCommonNode(index, Collator.TERTIARY); 703 } 704 } 705 // Postpone insertion: 706 // Insert the new node before the next one with a strength at least as strong. 707 long node = nodes.elementAti(index); 708 int nextIndex; 709 while((nextIndex = nextIndexFromNode(node)) != 0) { 710 node = nodes.elementAti(nextIndex); 711 if(strengthFromNode(node) <= strength) { break; } 712 // Skip the next node which has a weaker (larger) strength than the new one. 713 index = nextIndex; 714 } 715 node = IS_TAILORED | nodeFromStrength(strength); 716 return insertNodeBetween(index, nextIndex, node); 717 } 718 719 /** 720 * Inserts a new node into the list, between list-adjacent items. 721 * The node's previous and next indexes must not be set yet. 722 * @return the new node's index 723 */ 724 private int insertNodeBetween(int index, int nextIndex, long node) { 725 assert(previousIndexFromNode(node) == 0); 726 assert(nextIndexFromNode(node) == 0); 727 assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 728 // Append the new node and link it to the existing nodes. 729 int newIndex = nodes.size(); 730 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 731 nodes.addElement(node); 732 // nodes[index].nextIndex = newIndex 733 node = nodes.elementAti(index); 734 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 735 // nodes[nextIndex].previousIndex = newIndex 736 if(nextIndex != 0) { 737 node = nodes.elementAti(nextIndex); 738 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 739 } 740 return newIndex; 741 } 742 743 /** 744 * Finds the node which implies or contains a common=05 weight of the given strength 745 * (secondary or tertiary), if the current node is stronger. 746 * Skips weaker nodes and tailored nodes if the current node is stronger 747 * and is followed by an explicit-common-weight node. 748 * Always returns the input index if that node is no stronger than the given strength. 749 */ 750 private int findCommonNode(int index, int strength) { 751 assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY); 752 long node = nodes.elementAti(index); 753 if(strengthFromNode(node) >= strength) { 754 // The current node is no stronger. 755 return index; 756 } 757 if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 758 // The current node implies the strength-common weight. 759 return index; 760 } 761 index = nextIndexFromNode(node); 762 node = nodes.elementAti(index); 763 assert(!isTailoredNode(node) && strengthFromNode(node) == strength && 764 weight16FromNode(node) < Collation.COMMON_WEIGHT16); 765 // Skip to the explicit common node. 766 do { 767 index = nextIndexFromNode(node); 768 node = nodes.elementAti(index); 769 assert(strengthFromNode(node) >= strength); 770 } while(isTailoredNode(node) || strengthFromNode(node) > strength || 771 weight16FromNode(node) < Collation.COMMON_WEIGHT16); 772 assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16); 773 return index; 774 } 775 776 private void setCaseBits(CharSequence nfdString) { 777 int numTailoredPrimaries = 0; 778 for(int i = 0; i < cesLength; ++i) { 779 if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; } 780 } 781 // We should not be able to get too many case bits because 782 // cesLength<=31==MAX_EXPANSION_LENGTH. 783 // 31 pairs of case bits fit into an long without setting its sign bit. 784 assert(numTailoredPrimaries <= 31); 785 786 long cases = 0; 787 if(numTailoredPrimaries > 0) { 788 CharSequence s = nfdString; 789 UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0); 790 int baseCEsLength = baseCEs.fetchCEs() - 1; 791 assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE); 792 793 int lastCase = 0; 794 int numBasePrimaries = 0; 795 for(int i = 0; i < baseCEsLength; ++i) { 796 long ce = baseCEs.getCE(i); 797 if((ce >>> 32) != 0) { 798 ++numBasePrimaries; 799 int c = ((int)ce >> 14) & 3; 800 assert(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 801 if(numBasePrimaries < numTailoredPrimaries) { 802 cases |= (long)c << ((numBasePrimaries - 1) * 2); 803 } else if(numBasePrimaries == numTailoredPrimaries) { 804 lastCase = c; 805 } else if(c != lastCase) { 806 // There are more base primary CEs than tailored primaries. 807 // Set mixed case if the case bits of the remainder differ. 808 lastCase = 1; 809 // Nothing more can change. 810 break; 811 } 812 } 813 } 814 if(numBasePrimaries >= numTailoredPrimaries) { 815 cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2); 816 } 817 } 818 819 for(int i = 0; i < cesLength; ++i) { 820 long ce = ces[i] & 0xffffffffffff3fffL; // clear old case bits 821 int strength = ceStrength(ce); 822 if(strength == Collator.PRIMARY) { 823 ce |= (cases & 3) << 14; 824 cases >>>= 2; 825 } else if(strength == Collator.TERTIARY) { 826 // Tertiary CEs must have uppercase bits. 827 // See the LDML spec, and comments in class CollationCompare. 828 ce |= 0x8000; 829 } 830 // Tertiary ignorable CEs must have 0 case bits. 831 // We set 0 case bits for secondary CEs too 832 // since currently only U+0345 is cased and maps to a secondary CE, 833 // and it is lowercase. Other secondaries are uncased. 834 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 835 ces[i] = ce; 836 } 837 } 838 839 /** Implements CollationRuleParser.Sink. */ 840 @Override 841 void suppressContractions(UnicodeSet set) { 842 dataBuilder.suppressContractions(set); 843 } 844 845 /** Implements CollationRuleParser.Sink. */ 846 @Override 847 void optimize(UnicodeSet set) { 848 optimizeSet.addAll(set); 849 } 850 851 /** 852 * Adds the mapping and its canonical closure. 853 * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder 854 * need not re-encode the CEs multiple times. 855 */ 856 private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString, 857 long[] newCEs, int newCEsLength, int ce32) { 858 // Map from the NFD input to the CEs. 859 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 860 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 861 addTailComposites(nfdPrefix, nfdString); 862 return ce32; 863 } 864 865 private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, 866 long[] newCEs, int newCEsLength, int ce32) { 867 // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 868 // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String 869 if(nfdPrefix.length() == 0) { 870 CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 871 String prefix = ""; 872 for(;;) { 873 String str = stringIter.next(); 874 if(str == null) { break; } 875 if(ignoreString(str) || str.contentEquals(nfdString)) { continue; } 876 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 877 } 878 } else { 879 CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString()); 880 CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 881 for(;;) { 882 String prefix = prefixIter.next(); 883 if(prefix == null) { break; } 884 if(ignorePrefix(prefix)) { continue; } 885 boolean samePrefix = prefix.contentEquals(nfdPrefix); 886 for(;;) { 887 String str = stringIter.next(); 888 if(str == null) { break; } 889 if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; } 890 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 891 } 892 stringIter.reset(); 893 } 894 } 895 return ce32; 896 } 897 898 private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) { 899 // Look for the last starter in the NFD string. 900 int lastStarter; 901 int indexAfterLastStarter = nfdString.length(); 902 for(;;) { 903 if(indexAfterLastStarter == 0) { return; } // no starter at all 904 lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter); 905 if(nfd.getCombiningClass(lastStarter) == 0) { break; } 906 indexAfterLastStarter -= Character.charCount(lastStarter); 907 } 908 // No closure to Hangul syllables since we decompose them on the fly. 909 if(Hangul.isJamoL(lastStarter)) { return; } 910 911 // Are there any composites whose decomposition starts with the lastStarter? 912 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 913 // We might find some more equivalent mappings here if it did. 914 UnicodeSet composites = new UnicodeSet(); 915 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 916 917 StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder(); 918 long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 919 UnicodeSetIterator iter = new UnicodeSetIterator(composites); 920 while(iter.next()) { 921 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 922 int composite = iter.codepoint; 923 String decomp = nfd.getDecomposition(composite); 924 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 925 newNFDString, newString)) { 926 continue; 927 } 928 int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0); 929 if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) { 930 // Ignore mappings that we cannot store. 931 continue; 932 } 933 // Note: It is possible that the newCEs do not make use of the mapping 934 // for which we are adding the tail composites, in which case we might be adding 935 // unnecessary mappings. 936 // For example, when we add tail composites for ae^ (^=combining circumflex), 937 // UCA discontiguous-contraction matching does not find any matches 938 // for ae_^ (_=any combining diacritic below) *unless* there is also 939 // a contraction mapping for ae. 940 // Thus, if there is no ae contraction, then the ae^ mapping is ignored 941 // while fetching the newCEs for ae_^. 942 // TODO: Try to detect this effectively. 943 // (Alternatively, print a warning when prefix contractions are missing.) 944 945 // We do not need an explicit mapping for the NFD strings. 946 // It is fine if the NFD input collates like this via a sequence of mappings. 947 // It also saves a little bit of space, and may reduce the set of characters with contractions. 948 int ce32 = addIfDifferent(nfdPrefix, newString, 949 newCEs, newCEsLength, Collation.UNASSIGNED_CE32); 950 if(ce32 != Collation.UNASSIGNED_CE32) { 951 // was different, was added 952 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32); 953 } 954 } 955 } 956 957 private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter, 958 int composite, CharSequence decomp, 959 StringBuilder newNFDString, StringBuilder newString) { 960 assert(Character.codePointBefore(nfdString, indexAfterLastStarter) == 961 Character.codePointAt(decomp, 0)); 962 int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1); 963 if(lastStarterLength == decomp.length()) { 964 // Singleton decompositions should be found by addWithClosure() 965 // and the CanonicalIterator, so we can ignore them here. 966 return false; 967 } 968 if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) { 969 // same strings, nothing new to be found here 970 return false; 971 } 972 973 // Make new FCD strings that combine a composite, or its decomposition, 974 // into the nfdString's last starter and the combining marks following it. 975 // Make an NFD version, and a version with the composite. 976 newNFDString.setLength(0); 977 newNFDString.append(nfdString, 0, indexAfterLastStarter); 978 newString.setLength(0); 979 newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength) 980 .appendCodePoint(composite); 981 982 // The following is related to discontiguous contraction matching, 983 // but builds only FCD strings (or else returns false). 984 int sourceIndex = indexAfterLastStarter; 985 int decompIndex = lastStarterLength; 986 // Small optimization: We keep the source character across loop iterations 987 // because we do not always consume it, 988 // and then need not fetch it again nor look up its combining class again. 989 int sourceChar = Collation.SENTINEL_CP; 990 // The cc variables need to be declared before the loop so that at the end 991 // they are set to the last combining classes seen. 992 int sourceCC = 0; 993 int decompCC = 0; 994 for(;;) { 995 if(sourceChar < 0) { 996 if(sourceIndex >= nfdString.length()) { break; } 997 sourceChar = Character.codePointAt(nfdString, sourceIndex); 998 sourceCC = nfd.getCombiningClass(sourceChar); 999 assert(sourceCC != 0); 1000 } 1001 // We consume a decomposition character in each iteration. 1002 if(decompIndex >= decomp.length()) { break; } 1003 int decompChar = Character.codePointAt(decomp, decompIndex); 1004 decompCC = nfd.getCombiningClass(decompChar); 1005 // Compare the two characters and their combining classes. 1006 if(decompCC == 0) { 1007 // Unable to merge because the source contains a non-zero combining mark 1008 // but the composite's decomposition contains another starter. 1009 // The strings would not be equivalent. 1010 return false; 1011 } else if(sourceCC < decompCC) { 1012 // Composite + sourceChar would not be FCD. 1013 return false; 1014 } else if(decompCC < sourceCC) { 1015 newNFDString.appendCodePoint(decompChar); 1016 decompIndex += Character.charCount(decompChar); 1017 } else if(decompChar != sourceChar) { 1018 // Blocked because same combining class. 1019 return false; 1020 } else { // match: decompChar == sourceChar 1021 newNFDString.appendCodePoint(decompChar); 1022 decompIndex += Character.charCount(decompChar); 1023 sourceIndex += Character.charCount(decompChar); 1024 sourceChar = Collation.SENTINEL_CP; 1025 } 1026 } 1027 // We are at the end of at least one of the two inputs. 1028 if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1029 if(sourceCC < decompCC) { 1030 // Appending the next source character to the composite would not be FCD. 1031 return false; 1032 } 1033 newNFDString.append(nfdString, sourceIndex, nfdString.length()); 1034 newString.append(nfdString, sourceIndex, nfdString.length()); 1035 } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1036 newNFDString.append(decomp, decompIndex, decomp.length()); 1037 } 1038 assert(nfd.isNormalized(newNFDString)); 1039 assert(fcd.isNormalized(newString)); 1040 assert(nfd.normalize(newString).equals(newNFDString.toString())); // canonically equivalent 1041 return true; 1042 } 1043 1044 private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) { 1045 // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0 1046 int leftLength = left.length(); 1047 if((leftLength - leftStart) != (right.length() - rightStart)) { return false; } 1048 while(leftStart < leftLength) { 1049 if(left.charAt(leftStart++) != right.charAt(rightStart++)) { 1050 return false; 1051 } 1052 } 1053 return true; 1054 } 1055 private boolean ignorePrefix(CharSequence s) { 1056 // Do not map non-FCD prefixes. 1057 return !isFCD(s); 1058 } 1059 private boolean ignoreString(CharSequence s) { 1060 // Do not map non-FCD strings. 1061 // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1062 return !isFCD(s) || Hangul.isHangul(s.charAt(0)); 1063 } 1064 private boolean isFCD(CharSequence s) { 1065 return fcd.isNormalized(s); 1066 } 1067 1068 private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]"); 1069 static { 1070 // Hangul is decomposed on the fly during collation. 1071 COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 1072 } 1073 1074 private void closeOverComposites() { 1075 String prefix = ""; // empty 1076 UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES); 1077 while(iter.next()) { 1078 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 1079 String nfdString = nfd.getDecomposition(iter.codepoint); 1080 cesLength = dataBuilder.getCEs(nfdString, ces, 0); 1081 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 1082 // Too many CEs from the decomposition (unusual), ignore this composite. 1083 // We could add a capacity parameter to getCEs() and reallocate if necessary. 1084 // However, this can only really happen in contrived cases. 1085 continue; 1086 } 1087 String composite = iter.getString(); 1088 addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32); 1089 } 1090 } 1091 1092 private int addIfDifferent(CharSequence prefix, CharSequence str, 1093 long[] newCEs, int newCEsLength, int ce32) { 1094 long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 1095 int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0); 1096 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1097 if(ce32 == Collation.UNASSIGNED_CE32) { 1098 ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength); 1099 } 1100 dataBuilder.addCE32(prefix, str, ce32); 1101 } 1102 return ce32; 1103 } 1104 1105 private static boolean sameCEs(long[] ces1, int ces1Length, 1106 long[] ces2, int ces2Length) { 1107 if(ces1Length != ces2Length) { 1108 return false; 1109 } 1110 assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH); 1111 for(int i = 0; i < ces1Length; ++i) { 1112 if(ces1[i] != ces2[i]) { return false; } 1113 } 1114 return true; 1115 } 1116 1117 private static final int alignWeightRight(int w) { 1118 if(w != 0) { 1119 while((w & 0xff) == 0) { w >>>= 8; } 1120 } 1121 return w; 1122 } 1123 1124 /** 1125 * Walks the tailoring graph and overwrites tailored nodes with new CEs. 1126 * After this, the graph is destroyed. 1127 * The nodes array can then be used only as a source of tailored CEs. 1128 */ 1129 private void makeTailoredCEs() { 1130 CollationWeights primaries = new CollationWeights(); 1131 CollationWeights secondaries = new CollationWeights(); 1132 CollationWeights tertiaries = new CollationWeights(); 1133 long[] nodesArray = nodes.getBuffer(); 1134 if(DEBUG) { 1135 System.out.println("\nCollationBuilder.makeTailoredCEs()"); 1136 } 1137 1138 for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1139 int i = rootPrimaryIndexes.elementAti(rpi); 1140 long node = nodesArray[i]; 1141 long p = weight32FromNode(node); 1142 int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16; 1143 int t = s; 1144 int q = 0; 1145 boolean pIsTailored = false; 1146 boolean sIsTailored = false; 1147 boolean tIsTailored = false; 1148 if(DEBUG) { 1149 System.out.printf("\nprimary %x\n", alignWeightRight((int)p)); 1150 } 1151 int pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1152 int nextIndex = nextIndexFromNode(node); 1153 while(nextIndex != 0) { 1154 i = nextIndex; 1155 node = nodesArray[i]; 1156 nextIndex = nextIndexFromNode(node); 1157 int strength = strengthFromNode(node); 1158 if(strength == Collator.QUATERNARY) { 1159 assert(isTailoredNode(node)); 1160 if(DEBUG) { 1161 System.out.print(" quat+ "); 1162 } 1163 if(q == 3) { 1164 // C++ U_BUFFER_OVERFLOW_ERROR 1165 throw new UnsupportedOperationException("quaternary tailoring gap too small"); 1166 } 1167 ++q; 1168 } else { 1169 if(strength == Collator.TERTIARY) { 1170 if(isTailoredNode(node)) { 1171 if(DEBUG) { 1172 System.out.print(" ter+ "); 1173 } 1174 if(!tIsTailored) { 1175 // First tailored tertiary node for [p, s]. 1176 int tCount = countTailoredNodes(nodesArray, nextIndex, 1177 Collator.TERTIARY) + 1; 1178 int tLimit; 1179 if(t == 0) { 1180 // Gap at the beginning of the tertiary CE range. 1181 t = rootElements.getTertiaryBoundary() - 0x100; 1182 tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK; 1183 } else if(!pIsTailored && !sIsTailored) { 1184 // p and s are root weights. 1185 tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1186 } else if(t == Collation.BEFORE_WEIGHT16) { 1187 tLimit = Collation.COMMON_WEIGHT16; 1188 } else { 1189 // [p, s] is tailored. 1190 assert(t == Collation.COMMON_WEIGHT16); 1191 tLimit = rootElements.getTertiaryBoundary(); 1192 } 1193 assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0); 1194 tertiaries.initForTertiary(); 1195 if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1196 // C++ U_BUFFER_OVERFLOW_ERROR 1197 throw new UnsupportedOperationException("tertiary tailoring gap too small"); 1198 } 1199 tIsTailored = true; 1200 } 1201 t = (int)tertiaries.nextWeight(); 1202 assert(t != 0xffffffff); 1203 } else { 1204 t = weight16FromNode(node); 1205 tIsTailored = false; 1206 if(DEBUG) { 1207 System.out.printf(" ter %x\n", alignWeightRight(t)); 1208 } 1209 } 1210 } else { 1211 if(strength == Collator.SECONDARY) { 1212 if(isTailoredNode(node)) { 1213 if(DEBUG) { 1214 System.out.print(" sec+ "); 1215 } 1216 if(!sIsTailored) { 1217 // First tailored secondary node for p. 1218 int sCount = countTailoredNodes(nodesArray, nextIndex, 1219 Collator.SECONDARY) + 1; 1220 int sLimit; 1221 if(s == 0) { 1222 // Gap at the beginning of the secondary CE range. 1223 s = rootElements.getSecondaryBoundary() - 0x100; 1224 sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16); 1225 } else if(!pIsTailored) { 1226 // p is a root primary. 1227 sLimit = rootElements.getSecondaryAfter(pIndex, s); 1228 } else if(s == Collation.BEFORE_WEIGHT16) { 1229 sLimit = Collation.COMMON_WEIGHT16; 1230 } else { 1231 // p is a tailored primary. 1232 assert(s == Collation.COMMON_WEIGHT16); 1233 sLimit = rootElements.getSecondaryBoundary(); 1234 } 1235 if(s == Collation.COMMON_WEIGHT16) { 1236 // Do not tailor into the getSortKey() range of 1237 // compressed common secondaries. 1238 s = rootElements.getLastCommonSecondary(); 1239 } 1240 secondaries.initForSecondary(); 1241 if(!secondaries.allocWeights(s, sLimit, sCount)) { 1242 // C++ U_BUFFER_OVERFLOW_ERROR 1243 if(DEBUG) { 1244 System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n", 1245 alignWeightRight(s), alignWeightRight(sLimit), 1246 alignWeightRight(sCount)); 1247 } 1248 throw new UnsupportedOperationException("secondary tailoring gap too small"); 1249 } 1250 sIsTailored = true; 1251 } 1252 s = (int)secondaries.nextWeight(); 1253 assert(s != 0xffffffff); 1254 } else { 1255 s = weight16FromNode(node); 1256 sIsTailored = false; 1257 if(DEBUG) { 1258 System.out.printf(" sec %x\n", alignWeightRight(s)); 1259 } 1260 } 1261 } else /* Collator.PRIMARY */ { 1262 assert(isTailoredNode(node)); 1263 if(DEBUG) { 1264 System.out.print("pri+ "); 1265 } 1266 if(!pIsTailored) { 1267 // First tailored primary node in this list. 1268 int pCount = countTailoredNodes(nodesArray, nextIndex, 1269 Collator.PRIMARY) + 1; 1270 boolean isCompressible = baseData.isCompressiblePrimary(p); 1271 long pLimit = 1272 rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1273 primaries.initForPrimary(isCompressible); 1274 if(!primaries.allocWeights(p, pLimit, pCount)) { 1275 // C++ U_BUFFER_OVERFLOW_ERROR // TODO: introduce a more specific UErrorCode? 1276 throw new UnsupportedOperationException("primary tailoring gap too small"); 1277 } 1278 pIsTailored = true; 1279 } 1280 p = primaries.nextWeight(); 1281 assert(p != 0xffffffffL); 1282 s = Collation.COMMON_WEIGHT16; 1283 sIsTailored = false; 1284 } 1285 t = s == 0 ? 0 : Collation.COMMON_WEIGHT16; 1286 tIsTailored = false; 1287 } 1288 q = 0; 1289 } 1290 if(isTailoredNode(node)) { 1291 nodesArray[i] = Collation.makeCE(p, s, t, q); 1292 if(DEBUG) { 1293 System.out.printf("%016x\n", nodesArray[i]); 1294 } 1295 } 1296 } 1297 } 1298 } 1299 1300 /** 1301 * Counts the tailored nodes of the given strength up to the next node 1302 * which is either stronger or has an explicit weight of this strength. 1303 */ 1304 private static int countTailoredNodes(long[] nodesArray, int i, int strength) { 1305 int count = 0; 1306 for(;;) { 1307 if(i == 0) { break; } 1308 long node = nodesArray[i]; 1309 if(strengthFromNode(node) < strength) { break; } 1310 if(strengthFromNode(node) == strength) { 1311 if(isTailoredNode(node)) { 1312 ++count; 1313 } else { 1314 break; 1315 } 1316 } 1317 i = nextIndexFromNode(node); 1318 } 1319 return count; 1320 } 1321 1322 private static final class CEFinalizer implements CollationDataBuilder.CEModifier { 1323 CEFinalizer(long[] ces) { 1324 finalCEs = ces; 1325 } 1326 @Override 1327 public long modifyCE32(int ce32) { 1328 assert(!Collation.isSpecialCE32(ce32)); 1329 if(CollationBuilder.isTempCE32(ce32)) { 1330 // retain case bits 1331 return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1332 } else { 1333 return Collation.NO_CE; 1334 } 1335 } 1336 @Override 1337 public long modifyCE(long ce) { 1338 if(CollationBuilder.isTempCE(ce)) { 1339 // retain case bits 1340 return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000); 1341 } else { 1342 return Collation.NO_CE; 1343 } 1344 } 1345 1346 private long[] finalCEs; 1347 }; 1348 1349 /** Replaces temporary CEs with the final CEs they point to. */ 1350 private void finalizeCEs() { 1351 CollationDataBuilder newBuilder = new CollationDataBuilder(); 1352 newBuilder.initForTailoring(baseData); 1353 CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer()); 1354 newBuilder.copyFrom(dataBuilder, finalizer); 1355 dataBuilder = newBuilder; 1356 } 1357 1358 /** 1359 * Encodes "temporary CE" data into a CE that fits into the CE32 data structure, 1360 * with 2-byte primary, 1-byte secondary and 6-bit tertiary, 1361 * with valid CE byte values. 1362 * 1363 * The index must not exceed 20 bits (0xfffff). 1364 * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). 1365 * 1366 * Temporary CEs are distinguished from real CEs by their use of 1367 * secondary weights 06..45 which are otherwise reserved for compressed sort keys. 1368 * 1369 * The case bits are unused and available. 1370 */ 1371 private static long tempCEFromIndexAndStrength(int index, int strength) { 1372 return 1373 // CE byte offsets, to ensure valid CE bytes, and case bits 11 1374 0x4040000006002000L + 1375 // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF) 1376 ((long)(index & 0xfe000) << 43) + 1377 // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF) 1378 ((long)(index & 0x1fc0) << 42) + 1379 // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45) 1380 ((index & 0x3f) << 24) + 1381 // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23) 1382 (strength << 8); 1383 } 1384 private static int indexFromTempCE(long tempCE) { 1385 tempCE -= 0x4040000006002000L; 1386 return 1387 ((int)(tempCE >> 43) & 0xfe000) | 1388 ((int)(tempCE >> 42) & 0x1fc0) | 1389 ((int)(tempCE >> 24) & 0x3f); 1390 } 1391 private static int strengthFromTempCE(long tempCE) { 1392 return ((int)tempCE >> 8) & 3; 1393 } 1394 private static boolean isTempCE(long ce) { 1395 int sec = (int)ce >>> 24; 1396 return 6 <= sec && sec <= 0x45; 1397 } 1398 1399 private static int indexFromTempCE32(int tempCE32) { 1400 tempCE32 -= 0x40400620; 1401 return 1402 ((tempCE32 >> 11) & 0xfe000) | 1403 ((tempCE32 >> 10) & 0x1fc0) | 1404 ((tempCE32 >> 8) & 0x3f); 1405 } 1406 private static boolean isTempCE32(int ce32) { 1407 return 1408 (ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32 1409 6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45; 1410 } 1411 1412 private static int ceStrength(long ce) { 1413 return 1414 isTempCE(ce) ? strengthFromTempCE(ce) : 1415 (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY : 1416 ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY : 1417 ce != 0 ? Collator.TERTIARY : 1418 Collator.IDENTICAL; 1419 } 1420 1421 /** At most 1M nodes, limited by the 20 bits in node bit fields. */ 1422 private static final int MAX_INDEX = 0xfffff; 1423 /** 1424 * Node bit 6 is set on a primary node if there are nodes 1425 * with secondary values below the common secondary weight (05). 1426 */ 1427 private static final int HAS_BEFORE2 = 0x40; 1428 /** 1429 * Node bit 5 is set on a primary or secondary node if there are nodes 1430 * with tertiary values below the common tertiary weight (05). 1431 */ 1432 private static final int HAS_BEFORE3 = 0x20; 1433 /** 1434 * Node bit 3 distinguishes a tailored node, which has no weight value, 1435 * from a node with an explicit (root or default) weight. 1436 */ 1437 private static final int IS_TAILORED = 8; 1438 1439 private static long nodeFromWeight32(long weight32) { 1440 return weight32 << 32; 1441 } 1442 private static long nodeFromWeight16(int weight16) { 1443 return (long)weight16 << 48; 1444 } 1445 private static long nodeFromPreviousIndex(int previous) { 1446 return (long)previous << 28; 1447 } 1448 private static long nodeFromNextIndex(int next) { 1449 return next << 8; 1450 } 1451 private static long nodeFromStrength(int strength) { 1452 return strength; 1453 } 1454 1455 private static long weight32FromNode(long node) { 1456 return node >>> 32; 1457 } 1458 private static int weight16FromNode(long node) { 1459 return (int)(node >> 48) & 0xffff; 1460 } 1461 private static int previousIndexFromNode(long node) { 1462 return (int)(node >> 28) & MAX_INDEX; 1463 } 1464 private static int nextIndexFromNode(long node) { 1465 return ((int)node >> 8) & MAX_INDEX; 1466 } 1467 private static int strengthFromNode(long node) { 1468 return (int)node & 3; 1469 } 1470 1471 private static boolean nodeHasBefore2(long node) { 1472 return (node & HAS_BEFORE2) != 0; 1473 } 1474 private static boolean nodeHasBefore3(long node) { 1475 return (node & HAS_BEFORE3) != 0; 1476 } 1477 private static boolean nodeHasAnyBefore(long node) { 1478 return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0; 1479 } 1480 private static boolean isTailoredNode(long node) { 1481 return (node & IS_TAILORED) != 0; 1482 } 1483 1484 private static long changeNodePreviousIndex(long node, int previous) { 1485 return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous); 1486 } 1487 private static long changeNodeNextIndex(long node, int next) { 1488 return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next); 1489 } 1490 1491 private Normalizer2 nfd, fcd; 1492 private Normalizer2Impl nfcImpl; 1493 1494 private CollationTailoring base; 1495 private CollationData baseData; 1496 private CollationRootElements rootElements; 1497 private long variableTop; 1498 1499 private CollationDataBuilder dataBuilder; 1500 private boolean fastLatinEnabled; 1501 private UnicodeSet optimizeSet = new UnicodeSet(); 1502 1503 private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH]; 1504 private int cesLength; 1505 1506 /** 1507 * Indexes of nodes with root primary weights, sorted by primary. 1508 * Compact form of a TreeMap from root primary to node index. 1509 * 1510 * This is a performance optimization for finding reset positions. 1511 * Without this, we would have to search through the entire nodes list. 1512 * It also allows storing root primary weights in list head nodes, 1513 * without previous index, leaving room in root primary nodes for 32-bit primary weights. 1514 */ 1515 private UVector32 rootPrimaryIndexes; 1516 /** 1517 * Data structure for assigning tailored weights and CEs. 1518 * Doubly-linked lists of nodes in mostly collation order. 1519 * Each list starts with a root primary node and ends with a nextIndex of 0. 1520 * 1521 * When there are any nodes in the list, then there is always a root primary node at index 0. 1522 * This allows some code not to have to check explicitly for nextIndex==0. 1523 * 1524 * Root primary nodes have 32-bit weights but do not have previous indexes. 1525 * All other nodes have at most 16-bit weights and do have previous indexes. 1526 * 1527 * Nodes with explicit weights store root collator weights, 1528 * or default weak weights (e.g., secondary 05) for stronger nodes. 1529 * "Tailored" nodes, with the IS_TAILORED bit set, 1530 * do not store explicit weights but rather 1531 * create a difference of a certain strength from the preceding node. 1532 * 1533 * A root node is followed by either 1534 * - a root/default node of the same strength, or 1535 * - a root/default node of the next-weaker strength, or 1536 * - a tailored node of the same strength. 1537 * 1538 * A node of a given strength normally implies "common" weights on weaker levels. 1539 * 1540 * A node with HAS_BEFORE2 must be immediately followed by 1541 * a secondary node with an explicit below-common weight, then a secondary tailored node, 1542 * and later an explicit common-secondary node. 1543 * The below-common weight can be a root weight, 1544 * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight 1545 * or before the lowest root weight. 1546 * (&[before 2] resets to an explicit secondary node so that 1547 * the following addRelation(secondary) tailors right after that. 1548 * If we did not have this node and instead were to reset on the primary node, 1549 * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) 1550 * 1551 * If the flag is not set, then there are no explicit secondary nodes 1552 * with the common or lower weights. 1553 * 1554 * Same for HAS_BEFORE3 for tertiary nodes and weights. 1555 * A node must not have both flags set. 1556 * 1557 * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs 1558 * which point to stable indexes in this list, 1559 * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. 1560 * 1561 * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, 1562 * until the next relation is added. 1563 * 1564 * At the end, the tailored weights are allocated as necessary, 1565 * then the tailored nodes are replaced with final CEs, 1566 * and the CollationData is rewritten by replacing temporary CEs with final ones. 1567 * 1568 * We cannot simply insert new nodes in the middle of the array 1569 * because that would invalidate the indexes stored in existing temporary CEs. 1570 * We need to use a linked graph with stable indexes to existing nodes. 1571 * A doubly-linked list seems easiest to maintain. 1572 * 1573 * Each node is stored as an long, with its fields stored as bit fields. 1574 * 1575 * Root primary node: 1576 * - primary weight: 32 bits 63..32 1577 * - reserved/unused/zero: 4 bits 31..28 1578 * 1579 * Weaker root nodes & tailored nodes: 1580 * - a weight: 16 bits 63..48 1581 * + a root or default weight for a non-tailored node 1582 * + unused/zero for a tailored node 1583 * - index to the previous node: 20 bits 47..28 1584 * 1585 * All types of nodes: 1586 * - index to the next node: 20 bits 27..8 1587 * + nextIndex=0 in last node per root-primary list 1588 * - reserved/unused/zero bits: bits 7, 4, 2 1589 * - HAS_BEFORE2: bit 6 1590 * - HAS_BEFORE3: bit 5 1591 * - IS_TAILORED: bit 3 1592 * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 1593 * 1594 * We could allocate structs with pointers, but we would have to store them 1595 * in a pointer list so that they can be indexed from temporary CEs, 1596 * and they would require more memory allocations. 1597 */ 1598 private UVector64 nodes; 1599 } 1600