1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2010-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationIterator.java, ported from collationiterator.h/.cpp 10 * 11 * C++ version created on: 2010oct27 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import android.icu.impl.Normalizer2Impl.Hangul; 18 import android.icu.impl.Trie2_32; 19 import android.icu.util.BytesTrie; 20 import android.icu.util.CharsTrie; 21 import android.icu.util.ICUException; 22 23 /** 24 * Collation element iterator and abstract character iterator. 25 * 26 * When a method returns a code point value, it must be in 0..10FFFF, 27 * except it can be negative as a sentinel value. 28 * @hide Only a subset of ICU is exposed in Android 29 */ 30 public abstract class CollationIterator { 31 private static final class CEBuffer { 32 /** Large enough for CEs of most short strings. */ 33 private static final int INITIAL_CAPACITY = 40; 34 35 CEBuffer() {} 36 37 void append(long ce) { 38 if(length >= INITIAL_CAPACITY) { 39 ensureAppendCapacity(1); 40 } 41 buffer[length++] = ce; 42 } 43 44 void appendUnsafe(long ce) { 45 buffer[length++] = ce; 46 } 47 48 void ensureAppendCapacity(int appCap) { 49 int capacity = buffer.length; 50 if((length + appCap) <= capacity) { return; } 51 do { 52 if(capacity < 1000) { 53 capacity *= 4; 54 } else { 55 capacity *= 2; 56 } 57 } while(capacity < (length + appCap)); 58 long[] newBuffer = new long[capacity]; 59 System.arraycopy(buffer, 0, newBuffer, 0, length); 60 buffer = newBuffer; 61 } 62 63 void incLength() { 64 // Use INITIAL_CAPACITY for a very simple fastpath. 65 // (Rather than buffer.getCapacity().) 66 if(length >= INITIAL_CAPACITY) { 67 ensureAppendCapacity(1); 68 } 69 ++length; 70 } 71 72 long set(int i, long ce) { 73 return buffer[i] = ce; 74 } 75 long get(int i) { return buffer[i]; } 76 77 long[] getCEs() { return buffer; } 78 79 int length = 0; 80 81 private long[] buffer = new long[INITIAL_CAPACITY]; 82 } 83 84 // State of combining marks skipped in discontiguous contraction. 85 // We create a state object on first use and keep it around deactivated between uses. 86 private static final class SkippedState { 87 // Born active but empty. 88 SkippedState() {} 89 void clear() { 90 oldBuffer.setLength(0); 91 pos = 0; 92 // The newBuffer is reset by setFirstSkipped(). 93 } 94 95 boolean isEmpty() { return oldBuffer.length() == 0; } 96 97 boolean hasNext() { return pos < oldBuffer.length(); } 98 99 // Requires hasNext(). 100 int next() { 101 int c = oldBuffer.codePointAt(pos); 102 pos += Character.charCount(c); 103 return c; 104 } 105 106 // Accounts for one more input code point read beyond the end of the marks buffer. 107 void incBeyond() { 108 assert(!hasNext()); 109 ++pos; 110 } 111 112 // Goes backward through the skipped-marks buffer. 113 // Returns the number of code points read beyond the skipped marks 114 // that need to be backtracked through normal input. 115 int backwardNumCodePoints(int n) { 116 int length = oldBuffer.length(); 117 int beyond = pos - length; 118 if(beyond > 0) { 119 if(beyond >= n) { 120 // Not back far enough to re-enter the oldBuffer. 121 pos -= n; 122 return n; 123 } else { 124 // Back out all beyond-oldBuffer code points and re-enter the buffer. 125 pos = oldBuffer.offsetByCodePoints(length, beyond - n); 126 return beyond; 127 } 128 } else { 129 // Go backwards from inside the oldBuffer. 130 pos = oldBuffer.offsetByCodePoints(pos, -n); 131 return 0; 132 } 133 } 134 135 void setFirstSkipped(int c) { 136 skipLengthAtMatch = 0; 137 newBuffer.setLength(0); 138 newBuffer.appendCodePoint(c); 139 } 140 141 void skip(int c) { 142 newBuffer.appendCodePoint(c); 143 } 144 145 void recordMatch() { skipLengthAtMatch = newBuffer.length(); } 146 147 // Replaces the characters we consumed with the newly skipped ones. 148 void replaceMatch() { 149 // Note: UnicodeString.replace() pins pos to at most length(). 150 int oldLength = oldBuffer.length(); 151 if(pos > oldLength) { pos = oldLength; } 152 oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch); 153 pos = 0; 154 } 155 156 void saveTrieState(CharsTrie trie) { trie.saveState(state); } 157 void resetToTrieState(CharsTrie trie) { trie.resetToState(state); } 158 159 // Combining marks skipped in previous discontiguous-contraction matching. 160 // After that discontiguous contraction was completed, we start reading them from here. 161 private final StringBuilder oldBuffer = new StringBuilder(); 162 // Combining marks newly skipped in current discontiguous-contraction matching. 163 // These might have been read from the normal text or from the oldBuffer. 164 private final StringBuilder newBuffer = new StringBuilder(); 165 // Reading index in oldBuffer, 166 // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()). 167 private int pos; 168 // newBuffer.length() at the time of the last matching character. 169 // When a partial match fails, we back out skipped and partial-matching input characters. 170 private int skipLengthAtMatch; 171 // We save the trie state before we attempt to match a character, 172 // so that we can skip it and try the next one. 173 private CharsTrie.State state = new CharsTrie.State(); 174 }; 175 176 /** 177 * Partially constructs the iterator. 178 * In Java, we cache partially constructed iterators 179 * and finish their setup when starting to work on text 180 * (via reset(boolean) and the setText(numeric, ...) methods of subclasses). 181 * This avoids memory allocations for iterators that remain unused. 182 * 183 * <p>In C++, there is only one constructor, and iterators are 184 * stack-allocated as needed. 185 */ 186 public CollationIterator(CollationData d) { 187 trie = d.trie; 188 data = d; 189 numCpFwd = -1; 190 isNumeric = false; 191 ceBuffer = null; 192 } 193 194 public CollationIterator(CollationData d, boolean numeric) { 195 trie = d.trie; 196 data = d; 197 numCpFwd = -1; 198 isNumeric = numeric; 199 ceBuffer = new CEBuffer(); 200 } 201 202 @Override 203 public boolean equals(Object other) { 204 // Subclasses: Call this method and then add more specific checks. 205 // Compare the iterator state but not the collation data (trie & data fields): 206 // Assume that the caller compares the data. 207 // Ignore skipped since that should be unused between calls to nextCE(). 208 // (It only stays around to avoid another memory allocation.) 209 if(other == null) { return false; } 210 if(!this.getClass().equals(other.getClass())) { return false; } 211 CollationIterator o = (CollationIterator)other; 212 if(!(ceBuffer.length == o.ceBuffer.length && 213 cesIndex == o.cesIndex && 214 numCpFwd == o.numCpFwd && 215 isNumeric == o.isNumeric)) { 216 return false; 217 } 218 for(int i = 0; i < ceBuffer.length; ++i) { 219 if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; } 220 } 221 return true; 222 } 223 224 @Override 225 public int hashCode() { 226 // Dummy return to prevent compile warnings. 227 return 0; 228 } 229 230 /** 231 * Resets the iterator state and sets the position to the specified offset. 232 * Subclasses must implement, and must call the parent class method, 233 * or CollationIterator.reset(). 234 */ 235 public abstract void resetToOffset(int newOffset); 236 237 public abstract int getOffset(); 238 239 /** 240 * Returns the next collation element. 241 */ 242 public final long nextCE() { 243 if(cesIndex < ceBuffer.length) { 244 // Return the next buffered CE. 245 return ceBuffer.get(cesIndex++); 246 } 247 assert cesIndex == ceBuffer.length; 248 ceBuffer.incLength(); 249 long cAndCE32 = handleNextCE32(); 250 int c = (int)(cAndCE32 >> 32); 251 int ce32 = (int)cAndCE32; 252 int t = ce32 & 0xff; 253 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32). 254 // Normal CE from the main data. 255 // Forced-inline of ceFromSimpleCE32(ce32). 256 return ceBuffer.set(cesIndex++, 257 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 258 } 259 CollationData d; 260 // The compiler should be able to optimize the previous and the following 261 // comparisons of t with the same constant. 262 if(t == Collation.SPECIAL_CE32_LOW_BYTE) { 263 if(c < 0) { 264 return ceBuffer.set(cesIndex++, Collation.NO_CE); 265 } 266 d = data.base; 267 ce32 = d.getCE32(c); 268 t = ce32 & 0xff; 269 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { 270 // Normal CE from the base data. 271 return ceBuffer.set(cesIndex++, 272 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 273 } 274 } else { 275 d = data; 276 } 277 if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) { 278 // Forced-inline of ceFromLongPrimaryCE32(ce32). 279 return ceBuffer.set(cesIndex++, 280 ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE); 281 } 282 return nextCEFromCE32(d, c, ce32); 283 } 284 285 /** 286 * Fetches all CEs. 287 * @return getCEsLength() 288 */ 289 public final int fetchCEs() { 290 while(nextCE() != Collation.NO_CE) { 291 // No need to loop for each expansion CE. 292 cesIndex = ceBuffer.length; 293 } 294 return ceBuffer.length; 295 } 296 297 /** 298 * Overwrites the current CE (the last one returned by nextCE()). 299 */ 300 final void setCurrentCE(long ce) { 301 assert cesIndex > 0; 302 ceBuffer.set(cesIndex - 1, ce); 303 } 304 305 /** 306 * Returns the previous collation element. 307 */ 308 public final long previousCE(UVector32 offsets) { 309 if(ceBuffer.length > 0) { 310 // Return the previous buffered CE. 311 return ceBuffer.get(--ceBuffer.length); 312 } 313 offsets.removeAllElements(); 314 int limitOffset = getOffset(); 315 int c = previousCodePoint(); 316 if(c < 0) { return Collation.NO_CE; } 317 if(data.isUnsafeBackward(c, isNumeric)) { 318 return previousCEUnsafe(c, offsets); 319 } 320 // Simple, safe-backwards iteration: 321 // Get a CE going backwards, handle prefixes but no contractions. 322 int ce32 = data.getCE32(c); 323 CollationData d; 324 if(ce32 == Collation.FALLBACK_CE32) { 325 d = data.base; 326 ce32 = d.getCE32(c); 327 } else { 328 d = data; 329 } 330 if(Collation.isSimpleOrLongCE32(ce32)) { 331 return Collation.ceFromCE32(ce32); 332 } 333 appendCEsFromCE32(d, c, ce32, false); 334 if(ceBuffer.length > 1) { 335 offsets.addElement(getOffset()); 336 // For an expansion, the offset of each non-initial CE is the limit offset, 337 // consistent with forward iteration. 338 while(offsets.size() <= ceBuffer.length) { 339 offsets.addElement(limitOffset); 340 }; 341 } 342 return ceBuffer.get(--ceBuffer.length); 343 } 344 345 public final int getCEsLength() { 346 return ceBuffer.length; 347 } 348 349 public final long getCE(int i) { 350 return ceBuffer.get(i); 351 } 352 353 public final long[] getCEs() { 354 return ceBuffer.getCEs(); 355 } 356 357 final void clearCEs() { 358 cesIndex = ceBuffer.length = 0; 359 } 360 361 public final void clearCEsIfNoneRemaining() { 362 if(cesIndex == ceBuffer.length) { clearCEs(); } 363 } 364 365 /** 366 * Returns the next code point (with post-increment). 367 * Public for identical-level comparison and for testing. 368 */ 369 public abstract int nextCodePoint(); 370 371 /** 372 * Returns the previous code point (with pre-decrement). 373 * Public for identical-level comparison and for testing. 374 */ 375 public abstract int previousCodePoint(); 376 377 protected final void reset() { 378 cesIndex = ceBuffer.length = 0; 379 if(skipped != null) { skipped.clear(); } 380 } 381 /** 382 * Resets the state as well as the numeric setting, 383 * and completes the initialization. 384 * Only exists in Java where we reset cached CollationIterator instances 385 * rather than stack-allocating temporary ones. 386 * (See also the constructor comments.) 387 */ 388 protected final void reset(boolean numeric) { 389 if(ceBuffer == null) { 390 ceBuffer = new CEBuffer(); 391 } 392 reset(); 393 isNumeric = numeric; 394 } 395 396 /** 397 * Returns the next code point and its local CE32 value. 398 * Returns Collation.FALLBACK_CE32 at the end of the text (c<0) 399 * or when c's CE32 value is to be looked up in the base data (fallback). 400 * 401 * The code point is used for fallbacks, context and implicit weights. 402 * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32). 403 * 404 * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0. 405 */ 406 protected long handleNextCE32() { 407 int c = nextCodePoint(); 408 if(c < 0) { return NO_CP_AND_CE32; } 409 return makeCodePointAndCE32Pair(c, data.getCE32(c)); 410 } 411 protected long makeCodePointAndCE32Pair(int c, int ce32) { 412 return ((long)c << 32) | (ce32 & 0xffffffffL); 413 } 414 protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL); 415 416 /** 417 * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit. 418 * Returns the trail surrogate in that case and advances past it, 419 * if a trail surrogate follows the lead surrogate. 420 * Otherwise returns any other code unit and does not advance. 421 */ 422 protected char handleGetTrailSurrogate() { 423 return 0; 424 } 425 426 /** 427 * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator. 428 * (Not needed in Java.) 429 */ 430 /*protected boolean foundNULTerminator() { 431 return false; 432 }*/ 433 434 /** 435 * @return false if surrogate code points U+D800..U+DFFF 436 * map to their own implicit primary weights (for UTF-16), 437 * or true if they map to CE(U+FFFD) (for UTF-8) 438 */ 439 protected boolean forbidSurrogateCodePoints() { 440 return false; 441 } 442 443 protected abstract void forwardNumCodePoints(int num); 444 445 protected abstract void backwardNumCodePoints(int num); 446 447 /** 448 * Returns the CE32 from the data trie. 449 * Normally the same as data.getCE32(), but overridden in the builder. 450 * Call this only when the faster data.getCE32() cannot be used. 451 */ 452 protected int getDataCE32(int c) { 453 return data.getCE32(c); 454 } 455 456 protected int getCE32FromBuilderData(int ce32) { 457 throw new ICUException("internal program error: should be unreachable"); 458 } 459 460 protected final void appendCEsFromCE32(CollationData d, int c, int ce32, 461 boolean forward) { 462 while(Collation.isSpecialCE32(ce32)) { 463 switch(Collation.tagFromCE32(ce32)) { 464 case Collation.FALLBACK_TAG: 465 case Collation.RESERVED_TAG_3: 466 throw new ICUException("internal program error: should be unreachable"); 467 case Collation.LONG_PRIMARY_TAG: 468 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32)); 469 return; 470 case Collation.LONG_SECONDARY_TAG: 471 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32)); 472 return; 473 case Collation.LATIN_EXPANSION_TAG: 474 ceBuffer.ensureAppendCapacity(2); 475 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32)); 476 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32)); 477 ceBuffer.length += 2; 478 return; 479 case Collation.EXPANSION32_TAG: { 480 int index = Collation.indexFromCE32(ce32); 481 int length = Collation.lengthFromCE32(ce32); 482 ceBuffer.ensureAppendCapacity(length); 483 do { 484 ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++])); 485 } while(--length > 0); 486 return; 487 } 488 case Collation.EXPANSION_TAG: { 489 int index = Collation.indexFromCE32(ce32); 490 int length = Collation.lengthFromCE32(ce32); 491 ceBuffer.ensureAppendCapacity(length); 492 do { 493 ceBuffer.appendUnsafe(d.ces[index++]); 494 } while(--length > 0); 495 return; 496 } 497 case Collation.BUILDER_DATA_TAG: 498 ce32 = getCE32FromBuilderData(ce32); 499 if(ce32 == Collation.FALLBACK_CE32) { 500 d = data.base; 501 ce32 = d.getCE32(c); 502 } 503 break; 504 case Collation.PREFIX_TAG: 505 if(forward) { backwardNumCodePoints(1); } 506 ce32 = getCE32FromPrefix(d, ce32); 507 if(forward) { forwardNumCodePoints(1); } 508 break; 509 case Collation.CONTRACTION_TAG: { 510 int index = Collation.indexFromCE32(ce32); 511 int defaultCE32 = d.getCE32FromContexts(index); // Default if no suffix match. 512 if(!forward) { 513 // Backward contractions are handled by previousCEUnsafe(). 514 // c has contractions but they were not found. 515 ce32 = defaultCE32; 516 break; 517 } 518 int nextCp; 519 if(skipped == null && numCpFwd < 0) { 520 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path, 521 // avoiding the function call and the nextSkippedCodePoint() overhead. 522 nextCp = nextCodePoint(); 523 if(nextCp < 0) { 524 // No more text. 525 ce32 = defaultCE32; 526 break; 527 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 528 !CollationFCD.mayHaveLccc(nextCp)) { 529 // All contraction suffixes start with characters with lccc!=0 530 // but the next code point has lccc==0. 531 backwardNumCodePoints(1); 532 ce32 = defaultCE32; 533 break; 534 } 535 } else { 536 nextCp = nextSkippedCodePoint(); 537 if(nextCp < 0) { 538 // No more text. 539 ce32 = defaultCE32; 540 break; 541 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 542 !CollationFCD.mayHaveLccc(nextCp)) { 543 // All contraction suffixes start with characters with lccc!=0 544 // but the next code point has lccc==0. 545 backwardNumSkipped(1); 546 ce32 = defaultCE32; 547 break; 548 } 549 } 550 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp); 551 if(ce32 == Collation.NO_CE32) { 552 // CEs from a discontiguous contraction plus the skipped combining marks 553 // have been appended already. 554 return; 555 } 556 break; 557 } 558 case Collation.DIGIT_TAG: 559 if(isNumeric) { 560 appendNumericCEs(ce32, forward); 561 return; 562 } else { 563 // Fetch the non-numeric-collation CE32 and continue. 564 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 565 break; 566 } 567 case Collation.U0000_TAG: 568 assert(c == 0); 569 // NUL-terminated input not supported in Java. 570 // Fetch the normal ce32 for U+0000 and continue. 571 ce32 = d.ce32s[0]; 572 break; 573 case Collation.HANGUL_TAG: { 574 int[] jamoCE32s = d.jamoCE32s; 575 c -= Hangul.HANGUL_BASE; 576 int t = c % Hangul.JAMO_T_COUNT; 577 c /= Hangul.JAMO_T_COUNT; 578 int v = c % Hangul.JAMO_V_COUNT; 579 c /= Hangul.JAMO_V_COUNT; 580 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) { 581 // None of the Jamo CE32s are isSpecialCE32(). 582 // Avoid recursive function calls and per-Jamo tests. 583 ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3); 584 ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c])); 585 ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v])); 586 ceBuffer.length += 2; 587 if(t != 0) { 588 ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t])); 589 } 590 return; 591 } else { 592 // We should not need to compute each Jamo code point. 593 // In particular, there should be no offset or implicit ce32. 594 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward); 595 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward); 596 if(t == 0) { return; } 597 // offset 39 = 19 + 21 - 1: 598 // 19 = JAMO_L_COUNT 599 // 21 = JAMO_T_COUNT 600 // -1 = omit t==0 601 ce32 = jamoCE32s[39 + t]; 602 c = Collation.SENTINEL_CP; 603 break; 604 } 605 } 606 case Collation.LEAD_SURROGATE_TAG: { 607 assert(forward); // Backward iteration should never see lead surrogate code _unit_ data. 608 assert(isLeadSurrogate(c)); 609 char trail; 610 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) { 611 c = Character.toCodePoint((char)c, trail); 612 ce32 &= Collation.LEAD_TYPE_MASK; 613 if(ce32 == Collation.LEAD_ALL_UNASSIGNED) { 614 ce32 = Collation.UNASSIGNED_CE32; // unassigned-implicit 615 } else if(ce32 == Collation.LEAD_ALL_FALLBACK || 616 (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) { 617 // fall back to the base data 618 d = d.base; 619 ce32 = d.getCE32FromSupplementary(c); 620 } 621 } else { 622 // c is an unpaired surrogate. 623 ce32 = Collation.UNASSIGNED_CE32; 624 } 625 break; 626 } 627 case Collation.OFFSET_TAG: 628 assert(c >= 0); 629 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32)); 630 return; 631 case Collation.IMPLICIT_TAG: 632 assert(c >= 0); 633 if(isSurrogate(c) && forbidSurrogateCodePoints()) { 634 ce32 = Collation.FFFD_CE32; 635 break; 636 } else { 637 ceBuffer.append(Collation.unassignedCEFromCodePoint(c)); 638 return; 639 } 640 } 641 } 642 ceBuffer.append(Collation.ceFromSimpleCE32(ce32)); 643 } 644 645 // TODO: Propose widening the UTF16 method. 646 private static final boolean isSurrogate(int c) { 647 return (c & 0xfffff800) == 0xd800; 648 } 649 650 // TODO: Propose widening the UTF16 method. 651 protected static final boolean isLeadSurrogate(int c) { 652 return (c & 0xfffffc00) == 0xd800; 653 } 654 655 // TODO: Propose widening the UTF16 method. 656 protected static final boolean isTrailSurrogate(int c) { 657 return (c & 0xfffffc00) == 0xdc00; 658 } 659 660 // Main lookup trie of the data object. 661 protected final Trie2_32 trie; 662 protected final CollationData data; 663 664 private final long nextCEFromCE32(CollationData d, int c, int ce32) { 665 --ceBuffer.length; // Undo ceBuffer.incLength(). 666 appendCEsFromCE32(d, c, ce32, true); 667 return ceBuffer.get(cesIndex++); 668 } 669 670 private final int getCE32FromPrefix(CollationData d, int ce32) { 671 int index = Collation.indexFromCE32(ce32); 672 ce32 = d.getCE32FromContexts(index); // Default if no prefix match. 673 index += 2; 674 // Number of code points read before the original code point. 675 int lookBehind = 0; 676 CharsTrie prefixes = new CharsTrie(d.contexts, index); 677 for(;;) { 678 int c = previousCodePoint(); 679 if(c < 0) { break; } 680 ++lookBehind; 681 BytesTrie.Result match = prefixes.nextForCodePoint(c); 682 if(match.hasValue()) { 683 ce32 = prefixes.getValue(); 684 } 685 if(!match.hasNext()) { break; } 686 } 687 forwardNumCodePoints(lookBehind); 688 return ce32; 689 } 690 691 private final int nextSkippedCodePoint() { 692 if(skipped != null && skipped.hasNext()) { return skipped.next(); } 693 if(numCpFwd == 0) { return Collation.SENTINEL_CP; } 694 int c = nextCodePoint(); 695 if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); } 696 if(numCpFwd > 0 && c >= 0) { --numCpFwd; } 697 return c; 698 } 699 700 private final void backwardNumSkipped(int n) { 701 if(skipped != null && !skipped.isEmpty()) { 702 n = skipped.backwardNumCodePoints(n); 703 } 704 backwardNumCodePoints(n); 705 if(numCpFwd >= 0) { numCpFwd += n; } 706 } 707 708 private final int nextCE32FromContraction( 709 CollationData d, int contractionCE32, 710 CharSequence trieChars, int trieOffset, int ce32, int c) { 711 // c: next code point after the original one 712 713 // Number of code points read beyond the original code point. 714 // Needed for discontiguous contraction matching. 715 int lookAhead = 1; 716 // Number of code points read since the last match (initially only c). 717 int sinceMatch = 1; 718 // Normally we only need a contiguous match, 719 // and therefore need not remember the suffixes state from before a mismatch for retrying. 720 // If we are already processing skipped combining marks, then we do track the state. 721 CharsTrie suffixes = new CharsTrie(trieChars, trieOffset); 722 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 723 BytesTrie.Result match = suffixes.firstForCodePoint(c); 724 for(;;) { 725 int nextCp; 726 if(match.hasValue()) { 727 ce32 = suffixes.getValue(); 728 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) { 729 return ce32; 730 } 731 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 732 sinceMatch = 1; 733 } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) { 734 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text. 735 // Back up if necessary, and try a discontiguous contraction. 736 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 && 737 // Discontiguous contraction matching extends an existing match. 738 // If there is no match yet, then there is nothing to do. 739 ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 || 740 sinceMatch < lookAhead)) { 741 // The last character of at least one suffix has lccc!=0, 742 // allowing for discontiguous contractions. 743 // UCA S2.1.1 only processes non-starters immediately following 744 // "a match in the table" (sinceMatch=1). 745 if(sinceMatch > 1) { 746 // Return to the state after the last match. 747 // (Return to sinceMatch=0 and re-fetch the first partially-matched character.) 748 backwardNumSkipped(sinceMatch); 749 c = nextSkippedCodePoint(); 750 lookAhead -= sinceMatch - 1; 751 sinceMatch = 1; 752 } 753 if(d.getFCD16(c) > 0xff) { 754 return nextCE32FromDiscontiguousContraction( 755 d, suffixes, ce32, lookAhead, c); 756 } 757 } 758 break; 759 } else { 760 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c. 761 // It does not have a result value, therefore it is not itself "a match in the table". 762 // If a partially-matched c has ccc!=0 then 763 // it might be skipped in discontiguous contraction. 764 c = nextCp; 765 ++sinceMatch; 766 } 767 ++lookAhead; 768 match = suffixes.nextForCodePoint(c); 769 } 770 backwardNumSkipped(sinceMatch); 771 return ce32; 772 } 773 774 private final int nextCE32FromDiscontiguousContraction( 775 CollationData d, CharsTrie suffixes, int ce32, 776 int lookAhead, int c) { 777 // UCA section 3.3.2 Contractions: 778 // Contractions that end with non-starter characters 779 // are known as discontiguous contractions. 780 // ... discontiguous contractions must be detected in input text 781 // whenever the final sequence of non-starter characters could be rearranged 782 // so as to make a contiguous matching sequence that is canonically equivalent. 783 784 // UCA: http://www.unicode.org/reports/tr10/#S2.1 785 // S2.1 Find the longest initial substring S at each point that has a match in the table. 786 // S2.1.1 If there are any non-starters following S, process each non-starter C. 787 // S2.1.2 If C is not blocked from S, find if S + C has a match in the table. 788 // Note: A non-starter in a string is called blocked 789 // if there is another non-starter of the same canonical combining class or zero 790 // between it and the last character of canonical combining class 0. 791 // S2.1.3 If there is a match, replace S by S + C, and remove C. 792 793 // First: Is a discontiguous contraction even possible? 794 int fcd16 = d.getFCD16(c); 795 assert(fcd16 > 0xff); // The caller checked this already, as a shortcut. 796 int nextCp = nextSkippedCodePoint(); 797 if(nextCp < 0) { 798 // No further text. 799 backwardNumSkipped(1); 800 return ce32; 801 } 802 ++lookAhead; 803 int prevCC = fcd16 & 0xff; 804 fcd16 = d.getFCD16(nextCp); 805 if(fcd16 <= 0xff) { 806 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 807 backwardNumSkipped(2); 808 return ce32; 809 } 810 811 // We have read and matched (lookAhead-2) code points, 812 // read non-matching c and peeked ahead at nextCp. 813 // Return to the state before the mismatch and continue matching with nextCp. 814 if(skipped == null || skipped.isEmpty()) { 815 if(skipped == null) { 816 skipped = new SkippedState(); 817 } 818 suffixes.reset(); 819 if(lookAhead > 2) { 820 // Replay the partial match so far. 821 backwardNumCodePoints(lookAhead); 822 suffixes.firstForCodePoint(nextCodePoint()); 823 for(int i = 3; i < lookAhead; ++i) { 824 suffixes.nextForCodePoint(nextCodePoint()); 825 } 826 // Skip c (which did not match) and nextCp (which we will try now). 827 forwardNumCodePoints(2); 828 } 829 skipped.saveTrieState(suffixes); 830 } else { 831 // Reset to the trie state before the failed match of c. 832 skipped.resetToTrieState(suffixes); 833 } 834 835 skipped.setFirstSkipped(c); 836 // Number of code points read since the last match (at this point: c and nextCp). 837 int sinceMatch = 2; 838 c = nextCp; 839 for(;;) { 840 BytesTrie.Result match; 841 // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2) 842 if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) { 843 // "If there is a match, replace S by S + C, and remove C." (S2.1.3) 844 // Keep prevCC unchanged. 845 ce32 = suffixes.getValue(); 846 sinceMatch = 0; 847 skipped.recordMatch(); 848 if(!match.hasNext()) { break; } 849 skipped.saveTrieState(suffixes); 850 } else { 851 // No match for "S + C", skip C. 852 skipped.skip(c); 853 skipped.resetToTrieState(suffixes); 854 prevCC = fcd16 & 0xff; 855 } 856 if((c = nextSkippedCodePoint()) < 0) { break; } 857 ++sinceMatch; 858 fcd16 = d.getFCD16(c); 859 if(fcd16 <= 0xff) { 860 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 861 break; 862 } 863 } 864 backwardNumSkipped(sinceMatch); 865 boolean isTopDiscontiguous = skipped.isEmpty(); 866 skipped.replaceMatch(); 867 if(isTopDiscontiguous && !skipped.isEmpty()) { 868 // We did get a match after skipping one or more combining marks, 869 // and we are not in a recursive discontiguous contraction. 870 // Append CEs from the contraction ce32 871 // and then from the combining marks that we skipped before the match. 872 c = Collation.SENTINEL_CP; 873 for(;;) { 874 appendCEsFromCE32(d, c, ce32, true); 875 // Fetch CE32s for skipped combining marks from the normal data, with fallback, 876 // rather than from the CollationData where we found the contraction. 877 if(!skipped.hasNext()) { break; } 878 c = skipped.next(); 879 ce32 = getDataCE32(c); 880 if(ce32 == Collation.FALLBACK_CE32) { 881 d = data.base; 882 ce32 = d.getCE32(c); 883 } else { 884 d = data; 885 } 886 // Note: A nested discontiguous-contraction match 887 // replaces consumed combining marks with newly skipped ones 888 // and resets the reading position to the beginning. 889 } 890 skipped.clear(); 891 ce32 = Collation.NO_CE32; // Signal to the caller that the result is in the ceBuffer. 892 } 893 return ce32; 894 } 895 896 /** 897 * Returns the previous CE when data.isUnsafeBackward(c, isNumeric). 898 */ 899 private final long previousCEUnsafe(int c, UVector32 offsets) { 900 // We just move through the input counting safe and unsafe code points 901 // without collecting the unsafe-backward substring into a buffer and 902 // switching to it. 903 // This is to keep the logic simple. Otherwise we would have to handle 904 // prefix matching going before the backward buffer, switching 905 // to iteration and back, etc. 906 // In the most important case of iterating over a normal string, 907 // reading from the string itself is already maximally fast. 908 // The only drawback there is that after getting the CEs we always 909 // skip backward to the safe character rather than switching out 910 // of a backwardBuffer. 911 // But this should not be the common case for previousCE(), 912 // and correctness and maintainability are more important than 913 // complex optimizations. 914 // Find the first safe character before c. 915 int numBackward = 1; 916 while((c = previousCodePoint()) >= 0) { 917 ++numBackward; 918 if(!data.isUnsafeBackward(c, isNumeric)) { 919 break; 920 } 921 } 922 // Set the forward iteration limit. 923 // Note: This counts code points. 924 // We cannot enforce a limit in the middle of a surrogate pair or similar. 925 numCpFwd = numBackward; 926 // Reset the forward iterator. 927 cesIndex = 0; 928 assert(ceBuffer.length == 0); 929 // Go forward and collect the CEs. 930 int offset = getOffset(); 931 while(numCpFwd > 0) { 932 // nextCE() normally reads one code point. 933 // Contraction matching and digit specials read more and check numCpFwd. 934 --numCpFwd; 935 // Append one or more CEs to the ceBuffer. 936 nextCE(); 937 assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE); 938 // No need to loop for getting each expansion CE from nextCE(). 939 cesIndex = ceBuffer.length; 940 // However, we need to write an offset for each CE. 941 // This is for CollationElementIterator.getOffset() to return 942 // intermediate offsets from the unsafe-backwards segment. 943 assert(offsets.size() < ceBuffer.length); 944 offsets.addElement(offset); 945 // For an expansion, the offset of each non-initial CE is the limit offset, 946 // consistent with forward iteration. 947 offset = getOffset(); 948 while(offsets.size() < ceBuffer.length) { 949 offsets.addElement(offset); 950 }; 951 } 952 assert(offsets.size() == ceBuffer.length); 953 // End offset corresponding to just after the unsafe-backwards segment. 954 offsets.addElement(offset); 955 // Reset the forward iteration limit 956 // and move backward to before the segment for which we fetched CEs. 957 numCpFwd = -1; 958 backwardNumCodePoints(numBackward); 959 // Use the collected CEs and return the last one. 960 cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented. 961 return ceBuffer.get(--ceBuffer.length); 962 } 963 964 /** 965 * Turns a string of digits (bytes 0..9) 966 * into a sequence of CEs that will sort in numeric order. 967 * 968 * Starts from this ce32's digit value and consumes the following/preceding digits. 969 * The digits string must not be empty and must not have leading zeros. 970 */ 971 private final void appendNumericCEs(int ce32, boolean forward) { 972 // Collect digits. 973 // TODO: Use some kind of a byte buffer? We only store values 0..9. 974 StringBuilder digits = new StringBuilder(); 975 if(forward) { 976 for(;;) { 977 char digit = Collation.digitFromCE32(ce32); 978 digits.append(digit); 979 if(numCpFwd == 0) { break; } 980 int c = nextCodePoint(); 981 if(c < 0) { break; } 982 ce32 = data.getCE32(c); 983 if(ce32 == Collation.FALLBACK_CE32) { 984 ce32 = data.base.getCE32(c); 985 } 986 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 987 backwardNumCodePoints(1); 988 break; 989 } 990 if(numCpFwd > 0) { --numCpFwd; } 991 } 992 } else { 993 for(;;) { 994 char digit = Collation.digitFromCE32(ce32); 995 digits.append(digit); 996 int c = previousCodePoint(); 997 if(c < 0) { break; } 998 ce32 = data.getCE32(c); 999 if(ce32 == Collation.FALLBACK_CE32) { 1000 ce32 = data.base.getCE32(c); 1001 } 1002 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 1003 forwardNumCodePoints(1); 1004 break; 1005 } 1006 } 1007 // Reverse the digit string. 1008 digits.reverse(); 1009 } 1010 int pos = 0; 1011 do { 1012 // Skip leading zeros. 1013 while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; } 1014 // Write a sequence of CEs for at most 254 digits at a time. 1015 int segmentLength = digits.length() - pos; 1016 if(segmentLength > 254) { segmentLength = 254; } 1017 appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength)); 1018 pos += segmentLength; 1019 } while(pos < digits.length()); 1020 } 1021 1022 /** 1023 * Turns 1..254 digits into a sequence of CEs. 1024 * Called by appendNumericCEs() for each segment of at most 254 digits. 1025 */ 1026 private final void appendNumericSegmentCEs(CharSequence digits) { 1027 int length = digits.length(); 1028 assert(1 <= length && length <= 254); 1029 assert(length == 1 || digits.charAt(0) != 0); 1030 long numericPrimary = data.numericPrimary; 1031 // Note: We use primary byte values 2..255: digits are not compressible. 1032 if(length <= 7) { 1033 // Very dense encoding for small numbers. 1034 int value = digits.charAt(0); 1035 for(int i = 1; i < length; ++i) { 1036 value = value * 10 + digits.charAt(i); 1037 } 1038 // Primary weight second byte values: 1039 // 74 byte values 2.. 75 for small numbers in two-byte primary weights. 1040 // 40 byte values 76..115 for medium numbers in three-byte primary weights. 1041 // 16 byte values 116..131 for large numbers in four-byte primary weights. 1042 // 124 byte values 132..255 for very large numbers with 4..127 digit pairs. 1043 int firstByte = 2; 1044 int numBytes = 74; 1045 if(value < numBytes) { 1046 // Two-byte primary for 0..73, good for day & month numbers etc. 1047 long primary = numericPrimary | ((firstByte + value) << 16); 1048 ceBuffer.append(Collation.makeCE(primary)); 1049 return; 1050 } 1051 value -= numBytes; 1052 firstByte += numBytes; 1053 numBytes = 40; 1054 if(value < numBytes * 254) { 1055 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more. 1056 long primary = numericPrimary | 1057 ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8); 1058 ceBuffer.append(Collation.makeCE(primary)); 1059 return; 1060 } 1061 value -= numBytes * 254; 1062 firstByte += numBytes; 1063 numBytes = 16; 1064 if(value < numBytes * 254 * 254) { 1065 // Four-byte primary for 10234..1042489=10234+16*254*254-1. 1066 long primary = numericPrimary | (2 + value % 254); 1067 value /= 254; 1068 primary |= (2 + value % 254) << 8; 1069 value /= 254; 1070 primary |= (firstByte + value % 254) << 16; 1071 ceBuffer.append(Collation.makeCE(primary)); 1072 return; 1073 } 1074 // original value > 1042489 1075 } 1076 assert(length >= 7); 1077 1078 // The second primary byte value 132..255 indicates the number of digit pairs (4..127), 1079 // then we generate primary bytes with those pairs. 1080 // Omit trailing 00 pairs. 1081 // Decrement the value for the last pair. 1082 1083 // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255. 1084 int numPairs = (length + 1) / 2; 1085 long primary = numericPrimary | ((132 - 4 + numPairs) << 16); 1086 // Find the length without trailing 00 pairs. 1087 while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) { 1088 length -= 2; 1089 } 1090 // Read the first pair. 1091 int pair; 1092 int pos; 1093 if((length & 1) != 0) { 1094 // Only "half a pair" if we have an odd number of digits. 1095 pair = digits.charAt(0); 1096 pos = 1; 1097 } else { 1098 pair = digits.charAt(0) * 10 + digits.charAt(1); 1099 pos = 2; 1100 } 1101 pair = 11 + 2 * pair; 1102 // Add the pairs of digits between pos and length. 1103 int shift = 8; 1104 while(pos < length) { 1105 if(shift == 0) { 1106 // Every three pairs/bytes we need to store a 4-byte-primary CE 1107 // and start with a new CE with the '0' primary lead byte. 1108 primary |= pair; 1109 ceBuffer.append(Collation.makeCE(primary)); 1110 primary = numericPrimary; 1111 shift = 16; 1112 } else { 1113 primary |= pair << shift; 1114 shift -= 8; 1115 } 1116 pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1)); 1117 pos += 2; 1118 } 1119 primary |= (pair - 1) << shift; 1120 ceBuffer.append(Collation.makeCE(primary)); 1121 } 1122 1123 private CEBuffer ceBuffer; 1124 private int cesIndex; 1125 1126 private SkippedState skipped; 1127 1128 // Number of code points to read forward, or -1. 1129 // Used as a forward iteration limit in previousCEUnsafe(). 1130 private int numCpFwd; 1131 // Numeric collation (CollationSettings.NUMERIC). 1132 private boolean isNumeric; 1133 } 1134