1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2012-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp 10 * 11 * C++ version created on: 2012apr01 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import java.util.ArrayList; 18 import java.util.Arrays; 19 import java.util.Iterator; 20 21 import android.icu.impl.Norm2AllModes; 22 import android.icu.impl.Normalizer2Impl; 23 import android.icu.impl.Normalizer2Impl.Hangul; 24 import android.icu.impl.Trie2; 25 import android.icu.impl.Trie2Writable; 26 import android.icu.lang.UCharacter; 27 import android.icu.text.UnicodeSet; 28 import android.icu.text.UnicodeSetIterator; 29 import android.icu.util.CharsTrie; 30 import android.icu.util.CharsTrieBuilder; 31 import android.icu.util.StringTrieBuilder; 32 33 /** 34 * Low-level CollationData builder. 35 * Takes (character, CE) pairs and builds them into runtime data structures. 36 * Supports characters with context prefixes and contraction suffixes. 37 */ 38 final class CollationDataBuilder { // not final in C++ 39 /** 40 * Collation element modifier. Interface class for a modifier 41 * that changes a tailoring builder's temporary CEs to final CEs. 42 * Called for every non-special CE32 and every expansion CE. 43 */ 44 interface CEModifier { 45 /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */ 46 long modifyCE32(int ce32); 47 /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */ 48 long modifyCE(long ce); 49 } 50 51 CollationDataBuilder() { 52 nfcImpl = Norm2AllModes.getNFCInstance().impl; 53 base = null; 54 baseSettings = null; 55 trie = null; 56 ce32s = new UVector32(); 57 ce64s = new UVector64(); 58 conditionalCE32s = new ArrayList<ConditionalCE32>(); 59 modified = false; 60 fastLatinEnabled = false; 61 fastLatinBuilder = null; 62 collIter = null; 63 // Reserve the first CE32 for U+0000. 64 ce32s.addElement(0); 65 } 66 67 void initForTailoring(CollationData b) { 68 if(trie != null) { 69 throw new IllegalStateException("attempt to reuse a CollationDataBuilder"); 70 } 71 if(b == null) { 72 throw new IllegalArgumentException("null CollationData"); 73 } 74 base = b; 75 76 // For a tailoring, the default is to fall back to the base. 77 trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32); 78 79 // Set the Latin-1 letters block so that it is allocated first in the data array, 80 // to try to improve locality of reference when sorting Latin-1 text. 81 // Do not use utrie2_setRange32() since that will not actually allocate blocks 82 // that are filled with the default value. 83 // ASCII (0..7F) is already preallocated anyway. 84 for(int c = 0xc0; c <= 0xff; ++c) { 85 trie.set(c, Collation.FALLBACK_CE32); 86 } 87 88 // Hangul syllables are not tailorable (except via tailoring Jamos). 89 // Always set the Hangul tag to help performance. 90 // Do this here, rather than in buildMappings(), 91 // so that we see the HANGUL_TAG in various assertions. 92 int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0); 93 trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true); 94 95 // Copy the set contents but don't copy/clone the set as a whole because 96 // that would copy the isFrozen state too. 97 unsafeBackwardSet.addAll(b.unsafeBackwardSet); 98 } 99 100 boolean isCompressibleLeadByte(int b) { 101 return base.isCompressibleLeadByte(b); 102 } 103 104 boolean isCompressiblePrimary(long p) { 105 return isCompressibleLeadByte((int)p >>> 24); 106 } 107 108 /** 109 * @return true if this builder has mappings (e.g., add() has been called) 110 */ 111 boolean hasMappings() { return modified; } 112 113 /** 114 * @return true if c has CEs in this builder 115 */ 116 boolean isAssigned(int c) { 117 return Collation.isAssignedCE32(trie.get(c)); 118 } 119 120 void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) { 121 int ce32 = encodeCEs(ces, cesLength); 122 addCE32(prefix, s, ce32); 123 } 124 125 /** 126 * Encodes the ces as either the returned ce32 by itself, 127 * or by storing an expansion, with the returned ce32 referring to that. 128 * 129 * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) 130 */ 131 int encodeCEs(long ces[], int cesLength) { 132 if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) { 133 throw new IllegalArgumentException("mapping to too many CEs"); 134 } 135 if(!isMutable()) { 136 throw new IllegalStateException("attempt to add mappings after build()"); 137 } 138 if(cesLength == 0) { 139 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. 140 // Do this here so that callers need not do it. 141 return encodeOneCEAsCE32(0); 142 } else if(cesLength == 1) { 143 return encodeOneCE(ces[0]); 144 } else if(cesLength == 2) { 145 // Try to encode two CEs as one CE32. 146 long ce0 = ces[0]; 147 long ce1 = ces[1]; 148 long p0 = ce0 >>> 32; 149 if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE && 150 (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE && 151 p0 != 0) { 152 // Latin mini expansion 153 return 154 (int)p0 | 155 (((int)ce0 & 0xff00) << 8) | 156 (((int)ce1 >> 16) & 0xff00) | 157 Collation.SPECIAL_CE32_LOW_BYTE | 158 Collation.LATIN_EXPANSION_TAG; 159 } 160 } 161 // Try to encode two or more CEs as CE32s. 162 int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH]; // TODO: instance field? 163 for(int i = 0;; ++i) { 164 if(i == cesLength) { 165 return encodeExpansion32(newCE32s, 0, cesLength); 166 } 167 int ce32 = encodeOneCEAsCE32(ces[i]); 168 if(ce32 == Collation.NO_CE32) { break; } 169 newCE32s[i] = ce32; 170 } 171 return encodeExpansion(ces, 0, cesLength); 172 } 173 174 void addCE32(CharSequence prefix, CharSequence s, int ce32) { 175 if(s.length() == 0) { 176 throw new IllegalArgumentException("mapping from empty string"); 177 } 178 if(!isMutable()) { 179 throw new IllegalStateException("attempt to add mappings after build()"); 180 } 181 int c = Character.codePointAt(s, 0); 182 int cLength = Character.charCount(c); 183 int oldCE32 = trie.get(c); 184 boolean hasContext = prefix.length() != 0|| s.length() > cLength; 185 if(oldCE32 == Collation.FALLBACK_CE32) { 186 // First tailoring for c. 187 // If c has contextual base mappings or if we add a contextual mapping, 188 // then copy the base mappings. 189 // Otherwise we just override the base mapping. 190 int baseCE32 = base.getFinalCE32(base.getCE32(c)); 191 if(hasContext || Collation.ce32HasContext(baseCE32)) { 192 oldCE32 = copyFromBaseCE32(c, baseCE32, true); 193 trie.set(c, oldCE32); 194 } 195 } 196 if(!hasContext) { 197 // No prefix, no contraction. 198 if(!isBuilderContextCE32(oldCE32)) { 199 trie.set(c, ce32); 200 } else { 201 ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32); 202 cond.builtCE32 = Collation.NO_CE32; 203 cond.ce32 = ce32; 204 } 205 } else { 206 ConditionalCE32 cond; 207 if(!isBuilderContextCE32(oldCE32)) { 208 // Replace the simple oldCE32 with a builder context CE32 209 // pointing to a new ConditionalCE32 list head. 210 int index = addConditionalCE32("\0", oldCE32); 211 int contextCE32 = makeBuilderContextCE32(index); 212 trie.set(c, contextCE32); 213 contextChars.add(c); 214 cond = getConditionalCE32(index); 215 } else { 216 cond = getConditionalCE32ForCE32(oldCE32); 217 cond.builtCE32 = Collation.NO_CE32; 218 } 219 CharSequence suffix = s.subSequence(cLength, s.length()); 220 String context = new StringBuilder().append((char)prefix.length()). 221 append(prefix).append(suffix).toString(); 222 unsafeBackwardSet.addAll(suffix); 223 for(;;) { 224 // invariant: context > cond.context 225 int next = cond.next; 226 if(next < 0) { 227 // Append a new ConditionalCE32 after cond. 228 int index = addConditionalCE32(context, ce32); 229 cond.next = index; 230 break; 231 } 232 ConditionalCE32 nextCond = getConditionalCE32(next); 233 int cmp = context.compareTo(nextCond.context); 234 if(cmp < 0) { 235 // Insert a new ConditionalCE32 between cond and nextCond. 236 int index = addConditionalCE32(context, ce32); 237 cond.next = index; 238 getConditionalCE32(index).next = next; 239 break; 240 } else if(cmp == 0) { 241 // Same context as before, overwrite its ce32. 242 nextCond.ce32 = ce32; 243 break; 244 } 245 cond = nextCond; 246 } 247 } 248 modified = true; 249 } 250 251 /** 252 * Copies all mappings from the src builder, with modifications. 253 * This builder here must not be built yet, and should be empty. 254 */ 255 void copyFrom(CollationDataBuilder src, CEModifier modifier) { 256 if(!isMutable()) { 257 throw new IllegalStateException("attempt to copyFrom() after build()"); 258 } 259 CopyHelper helper = new CopyHelper(src, this, modifier); 260 Iterator<Trie2.Range> trieIterator = src.trie.iterator(); 261 Trie2.Range range; 262 while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 263 enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper); 264 } 265 // Update the contextChars and the unsafeBackwardSet while copying, 266 // in case a character had conditional mappings in the source builder 267 // and they were removed later. 268 modified |= src.modified; 269 } 270 271 void optimize(UnicodeSet set) { 272 if(set.isEmpty()) { return; } 273 UnicodeSetIterator iter = new UnicodeSetIterator(set); 274 while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { 275 int c = iter.codepoint; 276 int ce32 = trie.get(c); 277 if(ce32 == Collation.FALLBACK_CE32) { 278 ce32 = base.getFinalCE32(base.getCE32(c)); 279 ce32 = copyFromBaseCE32(c, ce32, true); 280 trie.set(c, ce32); 281 } 282 } 283 modified = true; 284 } 285 286 void suppressContractions(UnicodeSet set) { 287 if(set.isEmpty()) { return; } 288 UnicodeSetIterator iter = new UnicodeSetIterator(set); 289 while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { 290 int c = iter.codepoint; 291 int ce32 = trie.get(c); 292 if(ce32 == Collation.FALLBACK_CE32) { 293 ce32 = base.getFinalCE32(base.getCE32(c)); 294 if(Collation.ce32HasContext(ce32)) { 295 ce32 = copyFromBaseCE32(c, ce32, false /* without context */); 296 trie.set(c, ce32); 297 } 298 } else if(isBuilderContextCE32(ce32)) { 299 ce32 = getConditionalCE32ForCE32(ce32).ce32; 300 // Simply abandon the list of ConditionalCE32. 301 // The caller will copy this builder in the end, 302 // eliminating unreachable data. 303 trie.set(c, ce32); 304 contextChars.remove(c); 305 } 306 } 307 modified = true; 308 } 309 310 void enableFastLatin() { fastLatinEnabled = true; } 311 void build(CollationData data) { 312 buildMappings(data); 313 if(base != null) { 314 data.numericPrimary = base.numericPrimary; 315 data.compressibleBytes = base.compressibleBytes; 316 data.numScripts = base.numScripts; 317 data.scriptsIndex = base.scriptsIndex; 318 data.scriptStarts = base.scriptStarts; 319 } 320 buildFastLatinTable(data); 321 } 322 323 /** 324 * Looks up CEs for s and appends them to the ces array. 325 * Does not handle normalization: s should be in FCD form. 326 * 327 * Does not write completely ignorable CEs. 328 * Does not write beyond Collation.MAX_EXPANSION_LENGTH. 329 * 330 * @return incremented cesLength 331 */ 332 int getCEs(CharSequence s, long ces[], int cesLength) { 333 return getCEs(s, 0, ces, cesLength); 334 } 335 336 int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) { 337 int prefixLength = prefix.length(); 338 if(prefixLength == 0) { 339 return getCEs(s, 0, ces, cesLength); 340 } else { 341 return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength); 342 } 343 } 344 345 /** 346 * Build-time context and CE32 for a code point. 347 * If a code point has contextual mappings, then the default (no-context) mapping 348 * and all conditional mappings are stored in a singly-linked list 349 * of ConditionalCE32, sorted by context strings. 350 * 351 * Context strings sort by prefix length, then by prefix, then by contraction suffix. 352 * Context strings must be unique and in ascending order. 353 */ 354 private static final class ConditionalCE32 { 355 ConditionalCE32(String ct, int ce) { 356 context = ct; 357 ce32 = ce; 358 defaultCE32 = Collation.NO_CE32; 359 builtCE32 = Collation.NO_CE32; 360 next = -1; 361 } 362 363 boolean hasContext() { return context.length() > 1; } 364 int prefixLength() { return context.charAt(0); } 365 366 /** 367 * "\0" for the first entry for any code point, with its default CE32. 368 * 369 * Otherwise one unit with the length of the prefix string, 370 * then the prefix string, then the contraction suffix. 371 */ 372 String context; 373 /** 374 * CE32 for the code point and its context. 375 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). 376 */ 377 int ce32; 378 /** 379 * Default CE32 for all contexts with this same prefix. 380 * Initially NO_CE32. Set only while building runtime data structures, 381 * and only on one of the nodes of a sub-list with the same prefix. 382 */ 383 int defaultCE32; 384 /** 385 * CE32 for the built contexts. 386 * When fetching CEs from the builder, the contexts are built into their runtime form 387 * so that the normal collation implementation can process them. 388 * The result is cached in the list head. It is reset when the contexts are modified. 389 */ 390 int builtCE32; 391 /** 392 * Index of the next ConditionalCE32. 393 * Negative for the end of the list. 394 */ 395 int next; 396 } 397 398 protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) { 399 int i = Collation.indexFromCE32(ce32); 400 long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i); 401 long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); 402 return Collation.makeLongPrimaryCE32(p); 403 } 404 405 protected int addCE(long ce) { 406 int length = ce64s.size(); 407 for(int i = 0; i < length; ++i) { 408 if(ce == ce64s.elementAti(i)) { return i; } 409 } 410 ce64s.addElement(ce); 411 return length; 412 } 413 414 protected int addCE32(int ce32) { 415 int length = ce32s.size(); 416 for(int i = 0; i < length; ++i) { 417 if(ce32 == ce32s.elementAti(i)) { return i; } 418 } 419 ce32s.addElement(ce32); 420 return length; 421 } 422 423 protected int addConditionalCE32(String context, int ce32) { 424 assert(context.length() != 0); 425 int index = conditionalCE32s.size(); 426 if(index > Collation.MAX_INDEX) { 427 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 428 // BufferOverflowException is a better fit 429 // but cannot be constructed with a message string. 430 } 431 ConditionalCE32 cond = new ConditionalCE32(context, ce32); 432 conditionalCE32s.add(cond); 433 return index; 434 } 435 436 protected ConditionalCE32 getConditionalCE32(int index) { 437 return conditionalCE32s.get(index); 438 } 439 protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) { 440 return getConditionalCE32(Collation.indexFromCE32(ce32)); 441 } 442 443 protected static int makeBuilderContextCE32(int index) { 444 return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index); 445 } 446 protected static boolean isBuilderContextCE32(int ce32) { 447 return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG); 448 } 449 450 protected static int encodeOneCEAsCE32(long ce) { 451 long p = ce >>> 32; 452 int lower32 = (int)ce; 453 int t = lower32 & 0xffff; 454 assert((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. 455 if((ce & 0xffff00ff00ffL) == 0) { 456 // normal form ppppsstt 457 return (int)p | (lower32 >>> 16) | (t >> 8); 458 } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) { 459 // long-primary form ppppppC1 460 return Collation.makeLongPrimaryCE32(p); 461 } else if(p == 0 && (t & 0xff) == 0) { 462 // long-secondary form ssssttC2 463 return Collation.makeLongSecondaryCE32(lower32); 464 } 465 return Collation.NO_CE32; 466 } 467 468 protected int encodeOneCE(long ce) { 469 // Try to encode one CE as one CE32. 470 int ce32 = encodeOneCEAsCE32(ce); 471 if(ce32 != Collation.NO_CE32) { return ce32; } 472 int index = addCE(ce); 473 if(index > Collation.MAX_INDEX) { 474 throw new IndexOutOfBoundsException("too many mappings"); 475 // BufferOverflowException is a better fit 476 // but cannot be constructed with a message string. 477 } 478 return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1); 479 } 480 481 protected int encodeExpansion(long ces[], int start, int length) { 482 // See if this sequence of CEs has already been stored. 483 long first = ces[start]; 484 int ce64sMax = ce64s.size() - length; 485 for(int i = 0; i <= ce64sMax; ++i) { 486 if(first == ce64s.elementAti(i)) { 487 if(i > Collation.MAX_INDEX) { 488 throw new IndexOutOfBoundsException("too many mappings"); 489 // BufferOverflowException is a better fit 490 // but cannot be constructed with a message string. 491 } 492 for(int j = 1;; ++j) { 493 if(j == length) { 494 return Collation.makeCE32FromTagIndexAndLength( 495 Collation.EXPANSION_TAG, i, length); 496 } 497 if(ce64s.elementAti(i + j) != ces[start + j]) { break; } 498 } 499 } 500 } 501 // Store the new sequence. 502 int i = ce64s.size(); 503 if(i > Collation.MAX_INDEX) { 504 throw new IndexOutOfBoundsException("too many mappings"); 505 // BufferOverflowException is a better fit 506 // but cannot be constructed with a message string. 507 } 508 for(int j = 0; j < length; ++j) { 509 ce64s.addElement(ces[start + j]); 510 } 511 return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length); 512 } 513 514 protected int encodeExpansion32(int newCE32s[], int start, int length) { 515 // See if this sequence of CE32s has already been stored. 516 int first = newCE32s[start]; 517 int ce32sMax = ce32s.size() - length; 518 for(int i = 0; i <= ce32sMax; ++i) { 519 if(first == ce32s.elementAti(i)) { 520 if(i > Collation.MAX_INDEX) { 521 throw new IndexOutOfBoundsException("too many mappings"); 522 // BufferOverflowException is a better fit 523 // but cannot be constructed with a message string. 524 } 525 for(int j = 1;; ++j) { 526 if(j == length) { 527 return Collation.makeCE32FromTagIndexAndLength( 528 Collation.EXPANSION32_TAG, i, length); 529 } 530 if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; } 531 } 532 } 533 } 534 // Store the new sequence. 535 int i = ce32s.size(); 536 if(i > Collation.MAX_INDEX) { 537 throw new IndexOutOfBoundsException("too many mappings"); 538 // BufferOverflowException is a better fit 539 // but cannot be constructed with a message string. 540 } 541 for(int j = 0; j < length; ++j) { 542 ce32s.addElement(newCE32s[start + j]); 543 } 544 return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length); 545 } 546 547 protected int copyFromBaseCE32(int c, int ce32, boolean withContext) { 548 if(!Collation.isSpecialCE32(ce32)) { return ce32; } 549 switch(Collation.tagFromCE32(ce32)) { 550 case Collation.LONG_PRIMARY_TAG: 551 case Collation.LONG_SECONDARY_TAG: 552 case Collation.LATIN_EXPANSION_TAG: 553 // copy as is 554 break; 555 case Collation.EXPANSION32_TAG: { 556 int index = Collation.indexFromCE32(ce32); 557 int length = Collation.lengthFromCE32(ce32); 558 ce32 = encodeExpansion32(base.ce32s, index, length); 559 break; 560 } 561 case Collation.EXPANSION_TAG: { 562 int index = Collation.indexFromCE32(ce32); 563 int length = Collation.lengthFromCE32(ce32); 564 ce32 = encodeExpansion(base.ces, index, length); 565 break; 566 } 567 case Collation.PREFIX_TAG: { 568 // Flatten prefixes and nested suffixes (contractions) 569 // into a linear list of ConditionalCE32. 570 int trieIndex = Collation.indexFromCE32(ce32); 571 ce32 = base.getCE32FromContexts(trieIndex); // Default if no prefix match. 572 if(!withContext) { 573 return copyFromBaseCE32(c, ce32, false); 574 } 575 ConditionalCE32 head = new ConditionalCE32("", 0); 576 StringBuilder context = new StringBuilder("\0"); 577 int index; 578 if(Collation.isContractionCE32(ce32)) { 579 index = copyContractionsFromBaseCE32(context, c, ce32, head); 580 } else { 581 ce32 = copyFromBaseCE32(c, ce32, true); 582 head.next = index = addConditionalCE32(context.toString(), ce32); 583 } 584 ConditionalCE32 cond = getConditionalCE32(index); // the last ConditionalCE32 so far 585 CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0); 586 while(prefixes.hasNext()) { 587 CharsTrie.Entry entry = prefixes.next(); 588 context.setLength(0); 589 context.append(entry.chars).reverse().insert(0, (char)entry.chars.length()); 590 ce32 = entry.value; 591 if(Collation.isContractionCE32(ce32)) { 592 index = copyContractionsFromBaseCE32(context, c, ce32, cond); 593 } else { 594 ce32 = copyFromBaseCE32(c, ce32, true); 595 cond.next = index = addConditionalCE32(context.toString(), ce32); 596 } 597 cond = getConditionalCE32(index); 598 } 599 ce32 = makeBuilderContextCE32(head.next); 600 contextChars.add(c); 601 break; 602 } 603 case Collation.CONTRACTION_TAG: { 604 if(!withContext) { 605 int index = Collation.indexFromCE32(ce32); 606 ce32 = base.getCE32FromContexts(index); // Default if no suffix match. 607 return copyFromBaseCE32(c, ce32, false); 608 } 609 ConditionalCE32 head = new ConditionalCE32("", 0); 610 StringBuilder context = new StringBuilder("\0"); 611 copyContractionsFromBaseCE32(context, c, ce32, head); 612 ce32 = makeBuilderContextCE32(head.next); 613 contextChars.add(c); 614 break; 615 } 616 case Collation.HANGUL_TAG: 617 throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables."); 618 case Collation.OFFSET_TAG: 619 ce32 = getCE32FromOffsetCE32(true, c, ce32); 620 break; 621 case Collation.IMPLICIT_TAG: 622 ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c)); 623 break; 624 default: 625 throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " + 626 "requires ce32 == base.getFinalCE32(ce32)"); 627 } 628 return ce32; 629 } 630 631 /** 632 * Copies base contractions to a list of ConditionalCE32. 633 * Sets cond.next to the index of the first new item 634 * and returns the index of the last new item. 635 */ 636 protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32, 637 ConditionalCE32 cond) { 638 int trieIndex = Collation.indexFromCE32(ce32); 639 int index; 640 if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 641 // No match on the single code point. 642 // We are underneath a prefix, and the default mapping is just 643 // a fallback to the mappings for a shorter prefix. 644 assert(context.length() > 1); 645 index = -1; 646 } else { 647 ce32 = base.getCE32FromContexts(trieIndex); // Default if no suffix match. 648 assert(!Collation.isContractionCE32(ce32)); 649 ce32 = copyFromBaseCE32(c, ce32, true); 650 cond.next = index = addConditionalCE32(context.toString(), ce32); 651 cond = getConditionalCE32(index); 652 } 653 654 int suffixStart = context.length(); 655 CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0); 656 while(suffixes.hasNext()) { 657 CharsTrie.Entry entry = suffixes.next(); 658 context.append(entry.chars); 659 ce32 = copyFromBaseCE32(c, entry.value, true); 660 cond.next = index = addConditionalCE32(context.toString(), ce32); 661 // No need to update the unsafeBackwardSet because the tailoring set 662 // is already a copy of the base set. 663 cond = getConditionalCE32(index); 664 context.setLength(suffixStart); 665 } 666 assert(index >= 0); 667 return index; 668 } 669 670 private static final class CopyHelper { 671 CopyHelper(CollationDataBuilder s, CollationDataBuilder d, 672 CollationDataBuilder.CEModifier m) { 673 src = s; 674 dest = d; 675 modifier = m; 676 } 677 678 void copyRangeCE32(int start, int end, int ce32) { 679 ce32 = copyCE32(ce32); 680 dest.trie.setRange(start, end, ce32, true); 681 if(CollationDataBuilder.isBuilderContextCE32(ce32)) { 682 dest.contextChars.add(start, end); 683 } 684 } 685 686 int copyCE32(int ce32) { 687 if(!Collation.isSpecialCE32(ce32)) { 688 long ce = modifier.modifyCE32(ce32); 689 if(ce != Collation.NO_CE) { 690 ce32 = dest.encodeOneCE(ce); 691 } 692 } else { 693 int tag = Collation.tagFromCE32(ce32); 694 if(tag == Collation.EXPANSION32_TAG) { 695 int[] srcCE32s = src.ce32s.getBuffer(); 696 int srcIndex = Collation.indexFromCE32(ce32); 697 int length = Collation.lengthFromCE32(ce32); 698 // Inspect the source CE32s. Just copy them if none are modified. 699 // Otherwise copy to modifiedCEs, with modifications. 700 boolean isModified = false; 701 for(int i = 0; i < length; ++i) { 702 ce32 = srcCE32s[srcIndex + i]; 703 long ce; 704 if(Collation.isSpecialCE32(ce32) || 705 (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) { 706 if(isModified) { 707 modifiedCEs[i] = Collation.ceFromCE32(ce32); 708 } 709 } else { 710 if(!isModified) { 711 for(int j = 0; j < i; ++j) { 712 modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]); 713 } 714 isModified = true; 715 } 716 modifiedCEs[i] = ce; 717 } 718 } 719 if(isModified) { 720 ce32 = dest.encodeCEs(modifiedCEs, length); 721 } else { 722 ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length); 723 } 724 } else if(tag == Collation.EXPANSION_TAG) { 725 long[] srcCEs = src.ce64s.getBuffer(); 726 int srcIndex = Collation.indexFromCE32(ce32); 727 int length = Collation.lengthFromCE32(ce32); 728 // Inspect the source CEs. Just copy them if none are modified. 729 // Otherwise copy to modifiedCEs, with modifications. 730 boolean isModified = false; 731 for(int i = 0; i < length; ++i) { 732 long srcCE = srcCEs[srcIndex + i]; 733 long ce = modifier.modifyCE(srcCE); 734 if(ce == Collation.NO_CE) { 735 if(isModified) { 736 modifiedCEs[i] = srcCE; 737 } 738 } else { 739 if(!isModified) { 740 for(int j = 0; j < i; ++j) { 741 modifiedCEs[j] = srcCEs[srcIndex + j]; 742 } 743 isModified = true; 744 } 745 modifiedCEs[i] = ce; 746 } 747 } 748 if(isModified) { 749 ce32 = dest.encodeCEs(modifiedCEs, length); 750 } else { 751 ce32 = dest.encodeExpansion(srcCEs, srcIndex, length); 752 } 753 } else if(tag == Collation.BUILDER_DATA_TAG) { 754 // Copy the list of ConditionalCE32. 755 ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32); 756 assert(!cond.hasContext()); 757 int destIndex = dest.addConditionalCE32( 758 cond.context, copyCE32(cond.ce32)); 759 ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex); 760 while(cond.next >= 0) { 761 cond = src.getConditionalCE32(cond.next); 762 ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex); 763 destIndex = dest.addConditionalCE32( 764 cond.context, copyCE32(cond.ce32)); 765 int suffixStart = cond.prefixLength() + 1; 766 dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart)); 767 prevDestCond.next = destIndex; 768 } 769 } else { 770 // Just copy long CEs and Latin mini expansions (and other expected values) as is, 771 // assuming that the modifier would not modify them. 772 assert(tag == Collation.LONG_PRIMARY_TAG || 773 tag == Collation.LONG_SECONDARY_TAG || 774 tag == Collation.LATIN_EXPANSION_TAG || 775 tag == Collation.HANGUL_TAG); 776 } 777 } 778 return ce32; 779 } 780 781 CollationDataBuilder src; 782 CollationDataBuilder dest; 783 CollationDataBuilder.CEModifier modifier; 784 long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 785 } 786 787 private static void 788 enumRangeForCopy(int start, int end, int value, CopyHelper helper) { 789 if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) { 790 helper.copyRangeCE32(start, end, value); 791 } 792 } 793 794 protected boolean getJamoCE32s(int jamoCE32s[]) { 795 boolean anyJamoAssigned = base == null; // always set jamoCE32s in the base data 796 boolean needToCopyFromBase = false; 797 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 798 int jamo = jamoCpFromIndex(j); 799 boolean fromBase = false; 800 int ce32 = trie.get(jamo); 801 anyJamoAssigned |= Collation.isAssignedCE32(ce32); 802 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. 803 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) 804 if(ce32 == Collation.FALLBACK_CE32) { 805 fromBase = true; 806 ce32 = base.getCE32(jamo); 807 } 808 if(Collation.isSpecialCE32(ce32)) { 809 switch(Collation.tagFromCE32(ce32)) { 810 case Collation.LONG_PRIMARY_TAG: 811 case Collation.LONG_SECONDARY_TAG: 812 case Collation.LATIN_EXPANSION_TAG: 813 // Copy the ce32 as-is. 814 break; 815 case Collation.EXPANSION32_TAG: 816 case Collation.EXPANSION_TAG: 817 case Collation.PREFIX_TAG: 818 case Collation.CONTRACTION_TAG: 819 if(fromBase) { 820 // Defer copying until we know if anyJamoAssigned. 821 ce32 = Collation.FALLBACK_CE32; 822 needToCopyFromBase = true; 823 } 824 break; 825 case Collation.IMPLICIT_TAG: 826 // An unassigned Jamo should only occur in tests with incomplete bases. 827 assert(fromBase); 828 ce32 = Collation.FALLBACK_CE32; 829 needToCopyFromBase = true; 830 break; 831 case Collation.OFFSET_TAG: 832 ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); 833 break; 834 case Collation.FALLBACK_TAG: 835 case Collation.RESERVED_TAG_3: 836 case Collation.BUILDER_DATA_TAG: 837 case Collation.DIGIT_TAG: 838 case Collation.U0000_TAG: 839 case Collation.HANGUL_TAG: 840 case Collation.LEAD_SURROGATE_TAG: 841 throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32)); 842 } 843 } 844 jamoCE32s[j] = ce32; 845 } 846 if(anyJamoAssigned && needToCopyFromBase) { 847 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { 848 if(jamoCE32s[j] == Collation.FALLBACK_CE32) { 849 int jamo = jamoCpFromIndex(j); 850 jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo), 851 /*withContext=*/ true); 852 } 853 } 854 } 855 return anyJamoAssigned; 856 } 857 858 protected void setDigitTags() { 859 UnicodeSet digits = new UnicodeSet("[:Nd:]"); 860 UnicodeSetIterator iter = new UnicodeSetIterator(digits); 861 while(iter.next()) { 862 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 863 int c = iter.codepoint; 864 int ce32 = trie.get(c); 865 if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) { 866 int index = addCE32(ce32); 867 if(index > Collation.MAX_INDEX) { 868 throw new IndexOutOfBoundsException("too many mappings"); 869 // BufferOverflowException is a better fit 870 // but cannot be constructed with a message string. 871 } 872 ce32 = Collation.makeCE32FromTagIndexAndLength( 873 Collation.DIGIT_TAG, index, UCharacter.digit(c)); // u_charDigitValue(c) 874 trie.set(c, ce32); 875 } 876 } 877 } 878 879 protected void setLeadSurrogates() { 880 for(char lead = 0xd800; lead < 0xdc00; ++lead) { 881 int leadValue = -1; 882 // utrie2_enumForLeadSurrogate(trie, lead, null, , &value); 883 Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead); 884 while(trieIterator.hasNext()) { 885 Trie2.Range range = trieIterator.next(); 886 // The rest of this loop is equivalent to C++ enumRangeLeadValue(). 887 int value = range.value; 888 if(value == Collation.UNASSIGNED_CE32) { 889 value = Collation.LEAD_ALL_UNASSIGNED; 890 } else if(value == Collation.FALLBACK_CE32) { 891 value = Collation.LEAD_ALL_FALLBACK; 892 } else { 893 leadValue = Collation.LEAD_MIXED; 894 break; 895 } 896 if(leadValue < 0) { 897 leadValue = value; 898 } else if(leadValue != value) { 899 leadValue = Collation.LEAD_MIXED; 900 break; 901 } 902 } 903 trie.setForLeadSurrogateCodeUnit(lead, 904 Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue); 905 } 906 } 907 908 protected void buildMappings(CollationData data) { 909 if(!isMutable()) { 910 throw new IllegalStateException("attempt to build() after build()"); 911 } 912 913 buildContexts(); 914 915 int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 916 int jamoIndex = -1; 917 if(getJamoCE32s(jamoCE32s)) { 918 jamoIndex = ce32s.size(); 919 for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) { 920 ce32s.addElement(jamoCE32s[i]); 921 } 922 // Small optimization: Use a bit in the Hangul ce32 923 // to indicate that none of the Jamo CE32s are isSpecialCE32() 924 // (as it should be in the root collator). 925 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. 926 // In order to still have good trie compression and keep this code simple, 927 // we only set this flag if a whole block of 588 Hangul syllables starting with 928 // a common leading consonant (Jamo L) has this property. 929 boolean isAnyJamoVTSpecial = false; 930 for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) { 931 if(Collation.isSpecialCE32(jamoCE32s[i])) { 932 isAnyJamoVTSpecial = true; 933 break; 934 } 935 } 936 int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0); 937 int c = Hangul.HANGUL_BASE; 938 for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) { // iterate over the Jamo L 939 int ce32 = hangulCE32; 940 if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) { 941 ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO; 942 } 943 int limit = c + Hangul.JAMO_VT_COUNT; 944 trie.setRange(c, limit - 1, ce32, true); 945 c = limit; 946 } 947 } else { 948 // Copy the Hangul CE32s from the base in blocks per Jamo L, 949 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. 950 for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) { 951 int ce32 = base.getCE32(c); 952 assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG)); 953 int limit = c + Hangul.JAMO_VT_COUNT; 954 trie.setRange(c, limit - 1, ce32, true); 955 c = limit; 956 } 957 } 958 959 setDigitTags(); 960 setLeadSurrogates(); 961 962 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. 963 ce32s.setElementAt(trie.get(0), 0); 964 trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0)); 965 966 data.trie = trie.toTrie2_32(); 967 968 // Mark each lead surrogate as "unsafe" 969 // if any of its 1024 associated supplementary code points is "unsafe". 970 int c = 0x10000; 971 for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 972 if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { 973 unsafeBackwardSet.add(lead); 974 } 975 } 976 unsafeBackwardSet.freeze(); 977 978 data.ce32s = ce32s.getBuffer(); 979 data.ces = ce64s.getBuffer(); 980 data.contexts = contexts.toString(); 981 982 data.base = base; 983 if(jamoIndex >= 0) { 984 data.jamoCE32s = jamoCE32s; // C++: data.ce32s + jamoIndex 985 } else { 986 data.jamoCE32s = base.jamoCE32s; 987 } 988 data.unsafeBackwardSet = unsafeBackwardSet; 989 } 990 991 protected void clearContexts() { 992 contexts.setLength(0); 993 UnicodeSetIterator iter = new UnicodeSetIterator(contextChars); 994 while(iter.next()) { 995 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 996 int ce32 = trie.get(iter.codepoint); 997 assert(isBuilderContextCE32(ce32)); 998 getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32; 999 } 1000 } 1001 1002 protected void buildContexts() { 1003 // Ignore abandoned lists and the cached builtCE32, 1004 // and build all contexts from scratch. 1005 contexts.setLength(0); 1006 UnicodeSetIterator iter = new UnicodeSetIterator(contextChars); 1007 while(iter.next()) { 1008 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 1009 int c = iter.codepoint; 1010 int ce32 = trie.get(c); 1011 if(!isBuilderContextCE32(ce32)) { 1012 throw new AssertionError("Impossible: No context data for c in contextChars."); 1013 } 1014 ConditionalCE32 cond = getConditionalCE32ForCE32(ce32); 1015 ce32 = buildContext(cond); 1016 trie.set(c, ce32); 1017 } 1018 } 1019 1020 protected int buildContext(ConditionalCE32 head) { 1021 // The list head must have no context. 1022 assert(!head.hasContext()); 1023 // The list head must be followed by one or more nodes that all do have context. 1024 assert(head.next >= 0); 1025 CharsTrieBuilder prefixBuilder = new CharsTrieBuilder(); 1026 CharsTrieBuilder contractionBuilder = new CharsTrieBuilder(); 1027 for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) { 1028 // After the list head, the prefix or suffix can be empty, but not both. 1029 assert(cond == head || cond.hasContext()); 1030 int prefixLength = cond.prefixLength(); 1031 StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1); 1032 String prefixString = prefix.toString(); 1033 // Collect all contraction suffixes for one prefix. 1034 ConditionalCE32 firstCond = cond; 1035 ConditionalCE32 lastCond = cond; 1036 while(cond.next >= 0 && 1037 (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) { 1038 lastCond = cond; 1039 } 1040 int ce32; 1041 int suffixStart = prefixLength + 1; // == prefix.length() 1042 if(lastCond.context.length() == suffixStart) { 1043 // One prefix without contraction suffix. 1044 assert(firstCond == lastCond); 1045 ce32 = lastCond.ce32; 1046 cond = lastCond; 1047 } else { 1048 // Build the contractions trie. 1049 contractionBuilder.clear(); 1050 // Entry for an empty suffix, to be stored before the trie. 1051 int emptySuffixCE32 = Collation.NO_CE32; // Will always be set to a real value. 1052 int flags = 0; 1053 if(firstCond.context.length() == suffixStart) { 1054 // There is a mapping for the prefix and the single character c. (p|c) 1055 // If no other suffix matches, then we return this value. 1056 emptySuffixCE32 = firstCond.ce32; 1057 cond = getConditionalCE32(firstCond.next); 1058 } else { 1059 // There is no mapping for the prefix and just the single character. 1060 // (There is no p|c, only p|cd, p|ce etc.) 1061 flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH; 1062 // When the prefix matches but none of the prefix-specific suffixes, 1063 // then we fall back to the mappings with the next-longest prefix, 1064 // and ultimately to mappings with no prefix. 1065 // Each fallback might be another set of contractions. 1066 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, 1067 // then in text "pch" we find the ch contraction. 1068 for(cond = head;; cond = getConditionalCE32(cond.next)) { 1069 int length = cond.prefixLength(); 1070 if(length == prefixLength) { break; } 1071 if(cond.defaultCE32 != Collation.NO_CE32 && 1072 (length==0 || prefixString.regionMatches( 1073 prefix.length() - length, cond.context, 1, length) 1074 /* C++: prefix.endsWith(cond.context, 1, length) */)) { 1075 emptySuffixCE32 = cond.defaultCE32; 1076 } 1077 } 1078 cond = firstCond; 1079 } 1080 // Optimization: Set a flag when 1081 // the first character of every contraction suffix has lccc!=0. 1082 // Short-circuits contraction matching when a normal letter follows. 1083 flags |= Collation.CONTRACT_NEXT_CCC; 1084 // Add all of the non-empty suffixes into the contraction trie. 1085 for(;;) { 1086 String suffix = cond.context.substring(suffixStart); 1087 int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0)); 1088 if(fcd16 <= 0xff) { 1089 flags &= ~Collation.CONTRACT_NEXT_CCC; 1090 } 1091 fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length())); 1092 if(fcd16 > 0xff) { 1093 // The last suffix character has lccc!=0, allowing for discontiguous contractions. 1094 flags |= Collation.CONTRACT_TRAILING_CCC; 1095 } 1096 contractionBuilder.add(suffix, cond.ce32); 1097 if(cond == lastCond) { break; } 1098 cond = getConditionalCE32(cond.next); 1099 } 1100 int index = addContextTrie(emptySuffixCE32, contractionBuilder); 1101 if(index > Collation.MAX_INDEX) { 1102 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 1103 // BufferOverflowException is a better fit 1104 // but cannot be constructed with a message string. 1105 } 1106 ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags; 1107 } 1108 assert(cond == lastCond); 1109 firstCond.defaultCE32 = ce32; 1110 if(prefixLength == 0) { 1111 if(cond.next < 0) { 1112 // No non-empty prefixes, only contractions. 1113 return ce32; 1114 } 1115 } else { 1116 prefix.delete(0, 1); // Remove the length unit. 1117 prefix.reverse(); 1118 prefixBuilder.add(prefix, ce32); 1119 if(cond.next < 0) { break; } 1120 } 1121 } 1122 assert(head.defaultCE32 != Collation.NO_CE32); 1123 int index = addContextTrie(head.defaultCE32, prefixBuilder); 1124 if(index > Collation.MAX_INDEX) { 1125 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 1126 // BufferOverflowException is a better fit 1127 // but cannot be constructed with a message string. 1128 } 1129 return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index); 1130 } 1131 1132 protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) { 1133 StringBuilder context = new StringBuilder(); 1134 context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32); 1135 context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL)); 1136 int index = contexts.indexOf(context.toString()); 1137 if(index < 0) { 1138 index = contexts.length(); 1139 contexts.append(context); 1140 } 1141 return index; 1142 } 1143 1144 protected void buildFastLatinTable(CollationData data) { 1145 if(!fastLatinEnabled) { return; } 1146 1147 fastLatinBuilder = new CollationFastLatinBuilder(); 1148 if(fastLatinBuilder.forData(data)) { 1149 char[] header = fastLatinBuilder.getHeader(); 1150 char[] table = fastLatinBuilder.getTable(); 1151 if(base != null && 1152 Arrays.equals(header, base.fastLatinTableHeader) && 1153 Arrays.equals(table, base.fastLatinTable)) { 1154 // Same fast Latin table as in the base, use that one instead. 1155 fastLatinBuilder = null; 1156 header = base.fastLatinTableHeader; 1157 table = base.fastLatinTable; 1158 } 1159 data.fastLatinTableHeader = header; 1160 data.fastLatinTable = table; 1161 } else { 1162 fastLatinBuilder = null; 1163 } 1164 } 1165 1166 protected int getCEs(CharSequence s, int start, long ces[], int cesLength) { 1167 if(collIter == null) { 1168 collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl)); 1169 if(collIter == null) { return 0; } 1170 } 1171 return collIter.fetchCEs(s, start, ces, cesLength); 1172 } 1173 1174 protected static int jamoCpFromIndex(int i) { 1175 // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27 1176 if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; } 1177 i -= Hangul.JAMO_L_COUNT; 1178 if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; } 1179 i -= Hangul.JAMO_V_COUNT; 1180 // i < 27 1181 return Hangul.JAMO_T_BASE + 1 + i; 1182 } 1183 1184 /** 1185 * Build-time collation element and character iterator. 1186 * Uses the runtime CollationIterator for fetching CEs for a string 1187 * but reads from the builder's unfinished data structures. 1188 * In particular, this class reads from the unfinished trie 1189 * and has to avoid CollationIterator.nextCE() and redirect other 1190 * calls to data.getCE32() and data.getCE32FromSupplementary(). 1191 * 1192 * We do this so that we need not implement the collation algorithm 1193 * again for the builder and make it behave exactly like the runtime code. 1194 * That would be more difficult to test and maintain than this indirection. 1195 * 1196 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, 1197 * so the data accesses from those code paths need not be modified. 1198 * 1199 * This class iterates directly over whole code points 1200 * so that the CollationIterator does not need the finished trie 1201 * for handling the LEAD_SURROGATE_TAG. 1202 */ 1203 private static final class DataBuilderCollationIterator extends CollationIterator { 1204 DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) { 1205 super(newData, /*numeric=*/ false); 1206 builder = b; 1207 builderData = newData; 1208 builderData.base = builder.base; 1209 // Set all of the jamoCE32s[] to indirection CE32s. 1210 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 1211 int jamo = CollationDataBuilder.jamoCpFromIndex(j); 1212 jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) | 1213 CollationDataBuilder.IS_BUILDER_JAMO_CE32; 1214 } 1215 builderData.jamoCE32s = jamoCE32s; 1216 } 1217 1218 int fetchCEs(CharSequence str, int start, long ces[], int cesLength) { 1219 // Set the pointers each time, in case they changed due to reallocation. 1220 builderData.ce32s = builder.ce32s.getBuffer(); 1221 builderData.ces = builder.ce64s.getBuffer(); 1222 builderData.contexts = builder.contexts.toString(); 1223 // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32(). 1224 reset(); 1225 s = str; 1226 pos = start; 1227 while(pos < s.length()) { 1228 // No need to keep all CEs in the iterator buffer. 1229 clearCEs(); 1230 int c = Character.codePointAt(s, pos); 1231 pos += Character.charCount(c); 1232 int ce32 = builder.trie.get(c); 1233 CollationData d; 1234 if(ce32 == Collation.FALLBACK_CE32) { 1235 d = builder.base; 1236 ce32 = builder.base.getCE32(c); 1237 } else { 1238 d = builderData; 1239 } 1240 appendCEsFromCE32(d, c, ce32, /*forward=*/ true); 1241 for(int i = 0; i < getCEsLength(); ++i) { 1242 long ce = getCE(i); 1243 if(ce != 0) { 1244 if(cesLength < Collation.MAX_EXPANSION_LENGTH) { 1245 ces[cesLength] = ce; 1246 } 1247 ++cesLength; 1248 } 1249 } 1250 } 1251 return cesLength; 1252 } 1253 1254 @Override 1255 public void resetToOffset(int newOffset) { 1256 reset(); 1257 pos = newOffset; 1258 } 1259 1260 @Override 1261 public int getOffset() { 1262 return pos; 1263 } 1264 1265 @Override 1266 public int nextCodePoint() { 1267 if(pos == s.length()) { 1268 return Collation.SENTINEL_CP; 1269 } 1270 int c = Character.codePointAt(s, pos); 1271 pos += Character.charCount(c); 1272 return c; 1273 } 1274 1275 @Override 1276 public int previousCodePoint() { 1277 if(pos == 0) { 1278 return Collation.SENTINEL_CP; 1279 } 1280 int c = Character.codePointBefore(s, pos); 1281 pos -= Character.charCount(c); 1282 return c; 1283 } 1284 1285 @Override 1286 protected void forwardNumCodePoints(int num) { 1287 pos = Character.offsetByCodePoints(s, pos, num); 1288 } 1289 1290 @Override 1291 protected void backwardNumCodePoints(int num) { 1292 pos = Character.offsetByCodePoints(s, pos, -num); 1293 } 1294 1295 @Override 1296 protected int getDataCE32(int c) { 1297 return builder.trie.get(c); 1298 } 1299 1300 @Override 1301 protected int getCE32FromBuilderData(int ce32) { 1302 assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG)); 1303 if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) { 1304 int jamo = Collation.indexFromCE32(ce32); 1305 return builder.trie.get(jamo); 1306 } else { 1307 ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32); 1308 if(cond.builtCE32 == Collation.NO_CE32) { 1309 // Build the context-sensitive mappings into their runtime form and cache the result. 1310 try { 1311 cond.builtCE32 = builder.buildContext(cond); 1312 } catch(IndexOutOfBoundsException e) { 1313 builder.clearContexts(); 1314 cond.builtCE32 = builder.buildContext(cond); 1315 } 1316 builderData.contexts = builder.contexts.toString(); 1317 } 1318 return cond.builtCE32; 1319 } 1320 } 1321 1322 protected final CollationDataBuilder builder; 1323 protected final CollationData builderData; 1324 protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 1325 protected CharSequence s; 1326 protected int pos; 1327 } 1328 1329 protected final boolean isMutable() { 1330 // C++ tests !(trie == NULL || utrie2_isFrozen(trie)) 1331 // but Java Trie2Writable does not have an observable isFrozen() state. 1332 return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen(); 1333 } 1334 1335 /** @see Collation.BUILDER_DATA_TAG */ 1336 private static final int IS_BUILDER_JAMO_CE32 = 0x100; 1337 1338 protected Normalizer2Impl nfcImpl; 1339 protected CollationData base; 1340 protected CollationSettings baseSettings; 1341 protected Trie2Writable trie; 1342 protected UVector32 ce32s; 1343 protected UVector64 ce64s; 1344 protected ArrayList<ConditionalCE32> conditionalCE32s; // vector of ConditionalCE32 1345 // Characters that have context (prefixes or contraction suffixes). 1346 protected UnicodeSet contextChars = new UnicodeSet(); 1347 // Serialized UCharsTrie structures for finalized contexts. 1348 protected StringBuilder contexts = new StringBuilder(); 1349 protected UnicodeSet unsafeBackwardSet = new UnicodeSet(); 1350 protected boolean modified; 1351 1352 protected boolean fastLatinEnabled; 1353 protected CollationFastLatinBuilder fastLatinBuilder; 1354 1355 protected DataBuilderCollationIterator collIter; 1356 } 1357