1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationIterator.java, ported from collationiterator.h/.cpp 9 * 10 * C++ version created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.impl.Normalizer2Impl.Hangul; 17 import com.ibm.icu.impl.Trie2_32; 18 import com.ibm.icu.util.BytesTrie; 19 import com.ibm.icu.util.CharsTrie; 20 import com.ibm.icu.util.ICUException; 21 22 /** 23 * Collation element iterator and abstract character iterator. 24 * 25 * When a method returns a code point value, it must be in 0..10FFFF, 26 * except it can be negative as a sentinel value. 27 */ 28 public abstract class CollationIterator { 29 private static final class CEBuffer { 30 /** Large enough for CEs of most short strings. */ 31 private static final int INITIAL_CAPACITY = 40; 32 33 CEBuffer() {} 34 35 void append(long ce) { 36 if(length >= INITIAL_CAPACITY) { 37 ensureAppendCapacity(1); 38 } 39 buffer[length++] = ce; 40 } 41 42 void appendUnsafe(long ce) { 43 buffer[length++] = ce; 44 } 45 46 void ensureAppendCapacity(int appCap) { 47 int capacity = buffer.length; 48 if((length + appCap) <= capacity) { return; } 49 do { 50 if(capacity < 1000) { 51 capacity *= 4; 52 } else { 53 capacity *= 2; 54 } 55 } while(capacity < (length + appCap)); 56 long[] newBuffer = new long[capacity]; 57 System.arraycopy(buffer, 0, newBuffer, 0, length); 58 buffer = newBuffer; 59 } 60 61 void incLength() { 62 // Use INITIAL_CAPACITY for a very simple fastpath. 63 // (Rather than buffer.getCapacity().) 64 if(length >= INITIAL_CAPACITY) { 65 ensureAppendCapacity(1); 66 } 67 ++length; 68 } 69 70 long set(int i, long ce) { 71 return buffer[i] = ce; 72 } 73 long get(int i) { return buffer[i]; } 74 75 long[] getCEs() { return buffer; } 76 77 int length = 0; 78 79 private long[] buffer = new long[INITIAL_CAPACITY]; 80 } 81 82 // State of combining marks skipped in discontiguous contraction. 83 // We create a state object on first use and keep it around deactivated between uses. 84 private static final class SkippedState { 85 // Born active but empty. 86 SkippedState() {} 87 void clear() { 88 oldBuffer.setLength(0); 89 pos = 0; 90 // The newBuffer is reset by setFirstSkipped(). 91 } 92 93 boolean isEmpty() { return oldBuffer.length() == 0; } 94 95 boolean hasNext() { return pos < oldBuffer.length(); } 96 97 // Requires hasNext(). 98 int next() { 99 int c = oldBuffer.codePointAt(pos); 100 pos += Character.charCount(c); 101 return c; 102 } 103 104 // Accounts for one more input code point read beyond the end of the marks buffer. 105 void incBeyond() { 106 assert(!hasNext()); 107 ++pos; 108 } 109 110 // Goes backward through the skipped-marks buffer. 111 // Returns the number of code points read beyond the skipped marks 112 // that need to be backtracked through normal input. 113 int backwardNumCodePoints(int n) { 114 int length = oldBuffer.length(); 115 int beyond = pos - length; 116 if(beyond > 0) { 117 if(beyond >= n) { 118 // Not back far enough to re-enter the oldBuffer. 119 pos -= n; 120 return n; 121 } else { 122 // Back out all beyond-oldBuffer code points and re-enter the buffer. 123 pos = oldBuffer.offsetByCodePoints(length, beyond - n); 124 return beyond; 125 } 126 } else { 127 // Go backwards from inside the oldBuffer. 128 pos = oldBuffer.offsetByCodePoints(pos, -n); 129 return 0; 130 } 131 } 132 133 void setFirstSkipped(int c) { 134 skipLengthAtMatch = 0; 135 newBuffer.setLength(0); 136 newBuffer.appendCodePoint(c); 137 } 138 139 void skip(int c) { 140 newBuffer.appendCodePoint(c); 141 } 142 143 void recordMatch() { skipLengthAtMatch = newBuffer.length(); } 144 145 // Replaces the characters we consumed with the newly skipped ones. 146 void replaceMatch() { 147 // Note: UnicodeString.replace() pins pos to at most length(). 148 int oldLength = oldBuffer.length(); 149 if(pos > oldLength) { pos = oldLength; } 150 oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch); 151 pos = 0; 152 } 153 154 void saveTrieState(CharsTrie trie) { trie.saveState(state); } 155 void resetToTrieState(CharsTrie trie) { trie.resetToState(state); } 156 157 // Combining marks skipped in previous discontiguous-contraction matching. 158 // After that discontiguous contraction was completed, we start reading them from here. 159 private final StringBuilder oldBuffer = new StringBuilder(); 160 // Combining marks newly skipped in current discontiguous-contraction matching. 161 // These might have been read from the normal text or from the oldBuffer. 162 private final StringBuilder newBuffer = new StringBuilder(); 163 // Reading index in oldBuffer, 164 // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()). 165 private int pos; 166 // newBuffer.length() at the time of the last matching character. 167 // When a partial match fails, we back out skipped and partial-matching input characters. 168 private int skipLengthAtMatch; 169 // We save the trie state before we attempt to match a character, 170 // so that we can skip it and try the next one. 171 private CharsTrie.State state = new CharsTrie.State(); 172 }; 173 174 /** 175 * Partially constructs the iterator. 176 * In Java, we cache partially constructed iterators 177 * and finish their setup when starting to work on text 178 * (via reset(boolean) and the setText(numeric, ...) methods of subclasses). 179 * This avoids memory allocations for iterators that remain unused. 180 * 181 * <p>In C++, there is only one constructor, and iterators are 182 * stack-allocated as needed. 183 */ 184 public CollationIterator(CollationData d) { 185 trie = d.trie; 186 data = d; 187 numCpFwd = -1; 188 isNumeric = false; 189 ceBuffer = null; 190 } 191 192 public CollationIterator(CollationData d, boolean numeric) { 193 trie = d.trie; 194 data = d; 195 numCpFwd = -1; 196 isNumeric = numeric; 197 ceBuffer = new CEBuffer(); 198 } 199 200 @Override 201 public boolean equals(Object other) { 202 // Subclasses: Call this method and then add more specific checks. 203 // Compare the iterator state but not the collation data (trie & data fields): 204 // Assume that the caller compares the data. 205 // Ignore skipped since that should be unused between calls to nextCE(). 206 // (It only stays around to avoid another memory allocation.) 207 if(other == null) { return false; } 208 if(!this.getClass().equals(other.getClass())) { return false; } 209 CollationIterator o = (CollationIterator)other; 210 if(!(ceBuffer.length == o.ceBuffer.length && 211 cesIndex == o.cesIndex && 212 numCpFwd == o.numCpFwd && 213 isNumeric == o.isNumeric)) { 214 return false; 215 } 216 for(int i = 0; i < ceBuffer.length; ++i) { 217 if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; } 218 } 219 return true; 220 } 221 222 @Override 223 public int hashCode() { 224 // Dummy return to prevent compile warnings. 225 return 0; 226 } 227 228 /** 229 * Resets the iterator state and sets the position to the specified offset. 230 * Subclasses must implement, and must call the parent class method, 231 * or CollationIterator.reset(). 232 */ 233 public abstract void resetToOffset(int newOffset); 234 235 public abstract int getOffset(); 236 237 /** 238 * Returns the next collation element. 239 */ 240 public final long nextCE() { 241 if(cesIndex < ceBuffer.length) { 242 // Return the next buffered CE. 243 return ceBuffer.get(cesIndex++); 244 } 245 assert cesIndex == ceBuffer.length; 246 ceBuffer.incLength(); 247 long cAndCE32 = handleNextCE32(); 248 int c = (int)(cAndCE32 >> 32); 249 int ce32 = (int)cAndCE32; 250 int t = ce32 & 0xff; 251 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32). 252 // Normal CE from the main data. 253 // Forced-inline of ceFromSimpleCE32(ce32). 254 return ceBuffer.set(cesIndex++, 255 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 256 } 257 CollationData d; 258 // The compiler should be able to optimize the previous and the following 259 // comparisons of t with the same constant. 260 if(t == Collation.SPECIAL_CE32_LOW_BYTE) { 261 if(c < 0) { 262 return ceBuffer.set(cesIndex++, Collation.NO_CE); 263 } 264 d = data.base; 265 ce32 = d.getCE32(c); 266 t = ce32 & 0xff; 267 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { 268 // Normal CE from the base data. 269 return ceBuffer.set(cesIndex++, 270 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 271 } 272 } else { 273 d = data; 274 } 275 if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) { 276 // Forced-inline of ceFromLongPrimaryCE32(ce32). 277 return ceBuffer.set(cesIndex++, 278 ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE); 279 } 280 return nextCEFromCE32(d, c, ce32); 281 } 282 283 /** 284 * Fetches all CEs. 285 * @return getCEsLength() 286 */ 287 public final int fetchCEs() { 288 while(nextCE() != Collation.NO_CE) { 289 // No need to loop for each expansion CE. 290 cesIndex = ceBuffer.length; 291 } 292 return ceBuffer.length; 293 } 294 295 /** 296 * Overwrites the current CE (the last one returned by nextCE()). 297 */ 298 final void setCurrentCE(long ce) { 299 assert cesIndex > 0; 300 ceBuffer.set(cesIndex - 1, ce); 301 } 302 303 /** 304 * Returns the previous collation element. 305 */ 306 public final long previousCE(UVector32 offsets) { 307 if(ceBuffer.length > 0) { 308 // Return the previous buffered CE. 309 return ceBuffer.get(--ceBuffer.length); 310 } 311 offsets.removeAllElements(); 312 int limitOffset = getOffset(); 313 int c = previousCodePoint(); 314 if(c < 0) { return Collation.NO_CE; } 315 if(data.isUnsafeBackward(c, isNumeric)) { 316 return previousCEUnsafe(c, offsets); 317 } 318 // Simple, safe-backwards iteration: 319 // Get a CE going backwards, handle prefixes but no contractions. 320 int ce32 = data.getCE32(c); 321 CollationData d; 322 if(ce32 == Collation.FALLBACK_CE32) { 323 d = data.base; 324 ce32 = d.getCE32(c); 325 } else { 326 d = data; 327 } 328 if(Collation.isSimpleOrLongCE32(ce32)) { 329 return Collation.ceFromCE32(ce32); 330 } 331 appendCEsFromCE32(d, c, ce32, false); 332 if(ceBuffer.length > 1) { 333 offsets.addElement(getOffset()); 334 // For an expansion, the offset of each non-initial CE is the limit offset, 335 // consistent with forward iteration. 336 while(offsets.size() <= ceBuffer.length) { 337 offsets.addElement(limitOffset); 338 }; 339 } 340 return ceBuffer.get(--ceBuffer.length); 341 } 342 343 public final int getCEsLength() { 344 return ceBuffer.length; 345 } 346 347 public final long getCE(int i) { 348 return ceBuffer.get(i); 349 } 350 351 public final long[] getCEs() { 352 return ceBuffer.getCEs(); 353 } 354 355 final void clearCEs() { 356 cesIndex = ceBuffer.length = 0; 357 } 358 359 public final void clearCEsIfNoneRemaining() { 360 if(cesIndex == ceBuffer.length) { clearCEs(); } 361 } 362 363 /** 364 * Returns the next code point (with post-increment). 365 * Public for identical-level comparison and for testing. 366 */ 367 public abstract int nextCodePoint(); 368 369 /** 370 * Returns the previous code point (with pre-decrement). 371 * Public for identical-level comparison and for testing. 372 */ 373 public abstract int previousCodePoint(); 374 375 protected final void reset() { 376 cesIndex = ceBuffer.length = 0; 377 if(skipped != null) { skipped.clear(); } 378 } 379 /** 380 * Resets the state as well as the numeric setting, 381 * and completes the initialization. 382 * Only exists in Java where we reset cached CollationIterator instances 383 * rather than stack-allocating temporary ones. 384 * (See also the constructor comments.) 385 */ 386 protected final void reset(boolean numeric) { 387 if(ceBuffer == null) { 388 ceBuffer = new CEBuffer(); 389 } 390 reset(); 391 isNumeric = numeric; 392 } 393 394 /** 395 * Returns the next code point and its local CE32 value. 396 * Returns Collation.FALLBACK_CE32 at the end of the text (c<0) 397 * or when c's CE32 value is to be looked up in the base data (fallback). 398 * 399 * The code point is used for fallbacks, context and implicit weights. 400 * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32). 401 * 402 * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0. 403 */ 404 protected long handleNextCE32() { 405 int c = nextCodePoint(); 406 if(c < 0) { return NO_CP_AND_CE32; } 407 return makeCodePointAndCE32Pair(c, data.getCE32(c)); 408 } 409 protected long makeCodePointAndCE32Pair(int c, int ce32) { 410 return ((long)c << 32) | (ce32 & 0xffffffffL); 411 } 412 protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL); 413 414 /** 415 * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit. 416 * Returns the trail surrogate in that case and advances past it, 417 * if a trail surrogate follows the lead surrogate. 418 * Otherwise returns any other code unit and does not advance. 419 */ 420 protected char handleGetTrailSurrogate() { 421 return 0; 422 } 423 424 /** 425 * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator. 426 * (Not needed in Java.) 427 */ 428 /*protected boolean foundNULTerminator() { 429 return false; 430 }*/ 431 432 /** 433 * @return false if surrogate code points U+D800..U+DFFF 434 * map to their own implicit primary weights (for UTF-16), 435 * or true if they map to CE(U+FFFD) (for UTF-8) 436 */ 437 protected boolean forbidSurrogateCodePoints() { 438 return false; 439 } 440 441 protected abstract void forwardNumCodePoints(int num); 442 443 protected abstract void backwardNumCodePoints(int num); 444 445 /** 446 * Returns the CE32 from the data trie. 447 * Normally the same as data.getCE32(), but overridden in the builder. 448 * Call this only when the faster data.getCE32() cannot be used. 449 */ 450 protected int getDataCE32(int c) { 451 return data.getCE32(c); 452 } 453 454 protected int getCE32FromBuilderData(int ce32) { 455 throw new ICUException("internal program error: should be unreachable"); 456 } 457 458 protected final void appendCEsFromCE32(CollationData d, int c, int ce32, 459 boolean forward) { 460 while(Collation.isSpecialCE32(ce32)) { 461 switch(Collation.tagFromCE32(ce32)) { 462 case Collation.FALLBACK_TAG: 463 case Collation.RESERVED_TAG_3: 464 throw new ICUException("internal program error: should be unreachable"); 465 case Collation.LONG_PRIMARY_TAG: 466 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32)); 467 return; 468 case Collation.LONG_SECONDARY_TAG: 469 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32)); 470 return; 471 case Collation.LATIN_EXPANSION_TAG: 472 ceBuffer.ensureAppendCapacity(2); 473 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32)); 474 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32)); 475 ceBuffer.length += 2; 476 return; 477 case Collation.EXPANSION32_TAG: { 478 int index = Collation.indexFromCE32(ce32); 479 int length = Collation.lengthFromCE32(ce32); 480 ceBuffer.ensureAppendCapacity(length); 481 do { 482 ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++])); 483 } while(--length > 0); 484 return; 485 } 486 case Collation.EXPANSION_TAG: { 487 int index = Collation.indexFromCE32(ce32); 488 int length = Collation.lengthFromCE32(ce32); 489 ceBuffer.ensureAppendCapacity(length); 490 do { 491 ceBuffer.appendUnsafe(d.ces[index++]); 492 } while(--length > 0); 493 return; 494 } 495 case Collation.BUILDER_DATA_TAG: 496 ce32 = getCE32FromBuilderData(ce32); 497 if(ce32 == Collation.FALLBACK_CE32) { 498 d = data.base; 499 ce32 = d.getCE32(c); 500 } 501 break; 502 case Collation.PREFIX_TAG: 503 if(forward) { backwardNumCodePoints(1); } 504 ce32 = getCE32FromPrefix(d, ce32); 505 if(forward) { forwardNumCodePoints(1); } 506 break; 507 case Collation.CONTRACTION_TAG: { 508 int index = Collation.indexFromCE32(ce32); 509 int defaultCE32 = d.getCE32FromContexts(index); // Default if no suffix match. 510 if(!forward) { 511 // Backward contractions are handled by previousCEUnsafe(). 512 // c has contractions but they were not found. 513 ce32 = defaultCE32; 514 break; 515 } 516 int nextCp; 517 if(skipped == null && numCpFwd < 0) { 518 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path, 519 // avoiding the function call and the nextSkippedCodePoint() overhead. 520 nextCp = nextCodePoint(); 521 if(nextCp < 0) { 522 // No more text. 523 ce32 = defaultCE32; 524 break; 525 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 526 !CollationFCD.mayHaveLccc(nextCp)) { 527 // All contraction suffixes start with characters with lccc!=0 528 // but the next code point has lccc==0. 529 backwardNumCodePoints(1); 530 ce32 = defaultCE32; 531 break; 532 } 533 } else { 534 nextCp = nextSkippedCodePoint(); 535 if(nextCp < 0) { 536 // No more text. 537 ce32 = defaultCE32; 538 break; 539 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 540 !CollationFCD.mayHaveLccc(nextCp)) { 541 // All contraction suffixes start with characters with lccc!=0 542 // but the next code point has lccc==0. 543 backwardNumSkipped(1); 544 ce32 = defaultCE32; 545 break; 546 } 547 } 548 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp); 549 if(ce32 == Collation.NO_CE32) { 550 // CEs from a discontiguous contraction plus the skipped combining marks 551 // have been appended already. 552 return; 553 } 554 break; 555 } 556 case Collation.DIGIT_TAG: 557 if(isNumeric) { 558 appendNumericCEs(ce32, forward); 559 return; 560 } else { 561 // Fetch the non-numeric-collation CE32 and continue. 562 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 563 break; 564 } 565 case Collation.U0000_TAG: 566 assert(c == 0); 567 // NUL-terminated input not supported in Java. 568 // Fetch the normal ce32 for U+0000 and continue. 569 ce32 = d.ce32s[0]; 570 break; 571 case Collation.HANGUL_TAG: { 572 int[] jamoCE32s = d.jamoCE32s; 573 c -= Hangul.HANGUL_BASE; 574 int t = c % Hangul.JAMO_T_COUNT; 575 c /= Hangul.JAMO_T_COUNT; 576 int v = c % Hangul.JAMO_V_COUNT; 577 c /= Hangul.JAMO_V_COUNT; 578 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) { 579 // None of the Jamo CE32s are isSpecialCE32(). 580 // Avoid recursive function calls and per-Jamo tests. 581 ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3); 582 ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c])); 583 ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v])); 584 ceBuffer.length += 2; 585 if(t != 0) { 586 ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t])); 587 } 588 return; 589 } else { 590 // We should not need to compute each Jamo code point. 591 // In particular, there should be no offset or implicit ce32. 592 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward); 593 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward); 594 if(t == 0) { return; } 595 // offset 39 = 19 + 21 - 1: 596 // 19 = JAMO_L_COUNT 597 // 21 = JAMO_T_COUNT 598 // -1 = omit t==0 599 ce32 = jamoCE32s[39 + t]; 600 c = Collation.SENTINEL_CP; 601 break; 602 } 603 } 604 case Collation.LEAD_SURROGATE_TAG: { 605 assert(forward); // Backward iteration should never see lead surrogate code _unit_ data. 606 assert(isLeadSurrogate(c)); 607 char trail; 608 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) { 609 c = Character.toCodePoint((char)c, trail); 610 ce32 &= Collation.LEAD_TYPE_MASK; 611 if(ce32 == Collation.LEAD_ALL_UNASSIGNED) { 612 ce32 = Collation.UNASSIGNED_CE32; // unassigned-implicit 613 } else if(ce32 == Collation.LEAD_ALL_FALLBACK || 614 (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) { 615 // fall back to the base data 616 d = d.base; 617 ce32 = d.getCE32FromSupplementary(c); 618 } 619 } else { 620 // c is an unpaired surrogate. 621 ce32 = Collation.UNASSIGNED_CE32; 622 } 623 break; 624 } 625 case Collation.OFFSET_TAG: 626 assert(c >= 0); 627 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32)); 628 return; 629 case Collation.IMPLICIT_TAG: 630 assert(c >= 0); 631 if(isSurrogate(c) && forbidSurrogateCodePoints()) { 632 ce32 = Collation.FFFD_CE32; 633 break; 634 } else { 635 ceBuffer.append(Collation.unassignedCEFromCodePoint(c)); 636 return; 637 } 638 } 639 } 640 ceBuffer.append(Collation.ceFromSimpleCE32(ce32)); 641 } 642 643 // TODO: Propose widening the UTF16 method. 644 private static final boolean isSurrogate(int c) { 645 return (c & 0xfffff800) == 0xd800; 646 } 647 648 // TODO: Propose widening the UTF16 method. 649 protected static final boolean isLeadSurrogate(int c) { 650 return (c & 0xfffffc00) == 0xd800; 651 } 652 653 // TODO: Propose widening the UTF16 method. 654 protected static final boolean isTrailSurrogate(int c) { 655 return (c & 0xfffffc00) == 0xdc00; 656 } 657 658 // Main lookup trie of the data object. 659 protected final Trie2_32 trie; 660 protected final CollationData data; 661 662 private final long nextCEFromCE32(CollationData d, int c, int ce32) { 663 --ceBuffer.length; // Undo ceBuffer.incLength(). 664 appendCEsFromCE32(d, c, ce32, true); 665 return ceBuffer.get(cesIndex++); 666 } 667 668 private final int getCE32FromPrefix(CollationData d, int ce32) { 669 int index = Collation.indexFromCE32(ce32); 670 ce32 = d.getCE32FromContexts(index); // Default if no prefix match. 671 index += 2; 672 // Number of code points read before the original code point. 673 int lookBehind = 0; 674 CharsTrie prefixes = new CharsTrie(d.contexts, index); 675 for(;;) { 676 int c = previousCodePoint(); 677 if(c < 0) { break; } 678 ++lookBehind; 679 BytesTrie.Result match = prefixes.nextForCodePoint(c); 680 if(match.hasValue()) { 681 ce32 = prefixes.getValue(); 682 } 683 if(!match.hasNext()) { break; } 684 } 685 forwardNumCodePoints(lookBehind); 686 return ce32; 687 } 688 689 private final int nextSkippedCodePoint() { 690 if(skipped != null && skipped.hasNext()) { return skipped.next(); } 691 if(numCpFwd == 0) { return Collation.SENTINEL_CP; } 692 int c = nextCodePoint(); 693 if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); } 694 if(numCpFwd > 0 && c >= 0) { --numCpFwd; } 695 return c; 696 } 697 698 private final void backwardNumSkipped(int n) { 699 if(skipped != null && !skipped.isEmpty()) { 700 n = skipped.backwardNumCodePoints(n); 701 } 702 backwardNumCodePoints(n); 703 if(numCpFwd >= 0) { numCpFwd += n; } 704 } 705 706 private final int nextCE32FromContraction( 707 CollationData d, int contractionCE32, 708 CharSequence trieChars, int trieOffset, int ce32, int c) { 709 // c: next code point after the original one 710 711 // Number of code points read beyond the original code point. 712 // Needed for discontiguous contraction matching. 713 int lookAhead = 1; 714 // Number of code points read since the last match (initially only c). 715 int sinceMatch = 1; 716 // Normally we only need a contiguous match, 717 // and therefore need not remember the suffixes state from before a mismatch for retrying. 718 // If we are already processing skipped combining marks, then we do track the state. 719 CharsTrie suffixes = new CharsTrie(trieChars, trieOffset); 720 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 721 BytesTrie.Result match = suffixes.firstForCodePoint(c); 722 for(;;) { 723 int nextCp; 724 if(match.hasValue()) { 725 ce32 = suffixes.getValue(); 726 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) { 727 return ce32; 728 } 729 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 730 sinceMatch = 1; 731 } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) { 732 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text. 733 // Back up if necessary, and try a discontiguous contraction. 734 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 && 735 // Discontiguous contraction matching extends an existing match. 736 // If there is no match yet, then there is nothing to do. 737 ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 || 738 sinceMatch < lookAhead)) { 739 // The last character of at least one suffix has lccc!=0, 740 // allowing for discontiguous contractions. 741 // UCA S2.1.1 only processes non-starters immediately following 742 // "a match in the table" (sinceMatch=1). 743 if(sinceMatch > 1) { 744 // Return to the state after the last match. 745 // (Return to sinceMatch=0 and re-fetch the first partially-matched character.) 746 backwardNumSkipped(sinceMatch); 747 c = nextSkippedCodePoint(); 748 lookAhead -= sinceMatch - 1; 749 sinceMatch = 1; 750 } 751 if(d.getFCD16(c) > 0xff) { 752 return nextCE32FromDiscontiguousContraction( 753 d, suffixes, ce32, lookAhead, c); 754 } 755 } 756 break; 757 } else { 758 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c. 759 // It does not have a result value, therefore it is not itself "a match in the table". 760 // If a partially-matched c has ccc!=0 then 761 // it might be skipped in discontiguous contraction. 762 c = nextCp; 763 ++sinceMatch; 764 } 765 ++lookAhead; 766 match = suffixes.nextForCodePoint(c); 767 } 768 backwardNumSkipped(sinceMatch); 769 return ce32; 770 } 771 772 private final int nextCE32FromDiscontiguousContraction( 773 CollationData d, CharsTrie suffixes, int ce32, 774 int lookAhead, int c) { 775 // UCA section 3.3.2 Contractions: 776 // Contractions that end with non-starter characters 777 // are known as discontiguous contractions. 778 // ... discontiguous contractions must be detected in input text 779 // whenever the final sequence of non-starter characters could be rearranged 780 // so as to make a contiguous matching sequence that is canonically equivalent. 781 782 // UCA: http://www.unicode.org/reports/tr10/#S2.1 783 // S2.1 Find the longest initial substring S at each point that has a match in the table. 784 // S2.1.1 If there are any non-starters following S, process each non-starter C. 785 // S2.1.2 If C is not blocked from S, find if S + C has a match in the table. 786 // Note: A non-starter in a string is called blocked 787 // if there is another non-starter of the same canonical combining class or zero 788 // between it and the last character of canonical combining class 0. 789 // S2.1.3 If there is a match, replace S by S + C, and remove C. 790 791 // First: Is a discontiguous contraction even possible? 792 int fcd16 = d.getFCD16(c); 793 assert(fcd16 > 0xff); // The caller checked this already, as a shortcut. 794 int nextCp = nextSkippedCodePoint(); 795 if(nextCp < 0) { 796 // No further text. 797 backwardNumSkipped(1); 798 return ce32; 799 } 800 ++lookAhead; 801 int prevCC = fcd16 & 0xff; 802 fcd16 = d.getFCD16(nextCp); 803 if(fcd16 <= 0xff) { 804 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 805 backwardNumSkipped(2); 806 return ce32; 807 } 808 809 // We have read and matched (lookAhead-2) code points, 810 // read non-matching c and peeked ahead at nextCp. 811 // Return to the state before the mismatch and continue matching with nextCp. 812 if(skipped == null || skipped.isEmpty()) { 813 if(skipped == null) { 814 skipped = new SkippedState(); 815 } 816 suffixes.reset(); 817 if(lookAhead > 2) { 818 // Replay the partial match so far. 819 backwardNumCodePoints(lookAhead); 820 suffixes.firstForCodePoint(nextCodePoint()); 821 for(int i = 3; i < lookAhead; ++i) { 822 suffixes.nextForCodePoint(nextCodePoint()); 823 } 824 // Skip c (which did not match) and nextCp (which we will try now). 825 forwardNumCodePoints(2); 826 } 827 skipped.saveTrieState(suffixes); 828 } else { 829 // Reset to the trie state before the failed match of c. 830 skipped.resetToTrieState(suffixes); 831 } 832 833 skipped.setFirstSkipped(c); 834 // Number of code points read since the last match (at this point: c and nextCp). 835 int sinceMatch = 2; 836 c = nextCp; 837 for(;;) { 838 BytesTrie.Result match; 839 // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2) 840 if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) { 841 // "If there is a match, replace S by S + C, and remove C." (S2.1.3) 842 // Keep prevCC unchanged. 843 ce32 = suffixes.getValue(); 844 sinceMatch = 0; 845 skipped.recordMatch(); 846 if(!match.hasNext()) { break; } 847 skipped.saveTrieState(suffixes); 848 } else { 849 // No match for "S + C", skip C. 850 skipped.skip(c); 851 skipped.resetToTrieState(suffixes); 852 prevCC = fcd16 & 0xff; 853 } 854 if((c = nextSkippedCodePoint()) < 0) { break; } 855 ++sinceMatch; 856 fcd16 = d.getFCD16(c); 857 if(fcd16 <= 0xff) { 858 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 859 break; 860 } 861 } 862 backwardNumSkipped(sinceMatch); 863 boolean isTopDiscontiguous = skipped.isEmpty(); 864 skipped.replaceMatch(); 865 if(isTopDiscontiguous && !skipped.isEmpty()) { 866 // We did get a match after skipping one or more combining marks, 867 // and we are not in a recursive discontiguous contraction. 868 // Append CEs from the contraction ce32 869 // and then from the combining marks that we skipped before the match. 870 c = Collation.SENTINEL_CP; 871 for(;;) { 872 appendCEsFromCE32(d, c, ce32, true); 873 // Fetch CE32s for skipped combining marks from the normal data, with fallback, 874 // rather than from the CollationData where we found the contraction. 875 if(!skipped.hasNext()) { break; } 876 c = skipped.next(); 877 ce32 = getDataCE32(c); 878 if(ce32 == Collation.FALLBACK_CE32) { 879 d = data.base; 880 ce32 = d.getCE32(c); 881 } else { 882 d = data; 883 } 884 // Note: A nested discontiguous-contraction match 885 // replaces consumed combining marks with newly skipped ones 886 // and resets the reading position to the beginning. 887 } 888 skipped.clear(); 889 ce32 = Collation.NO_CE32; // Signal to the caller that the result is in the ceBuffer. 890 } 891 return ce32; 892 } 893 894 /** 895 * Returns the previous CE when data.isUnsafeBackward(c, isNumeric). 896 */ 897 private final long previousCEUnsafe(int c, UVector32 offsets) { 898 // We just move through the input counting safe and unsafe code points 899 // without collecting the unsafe-backward substring into a buffer and 900 // switching to it. 901 // This is to keep the logic simple. Otherwise we would have to handle 902 // prefix matching going before the backward buffer, switching 903 // to iteration and back, etc. 904 // In the most important case of iterating over a normal string, 905 // reading from the string itself is already maximally fast. 906 // The only drawback there is that after getting the CEs we always 907 // skip backward to the safe character rather than switching out 908 // of a backwardBuffer. 909 // But this should not be the common case for previousCE(), 910 // and correctness and maintainability are more important than 911 // complex optimizations. 912 // Find the first safe character before c. 913 int numBackward = 1; 914 while((c = previousCodePoint()) >= 0) { 915 ++numBackward; 916 if(!data.isUnsafeBackward(c, isNumeric)) { 917 break; 918 } 919 } 920 // Set the forward iteration limit. 921 // Note: This counts code points. 922 // We cannot enforce a limit in the middle of a surrogate pair or similar. 923 numCpFwd = numBackward; 924 // Reset the forward iterator. 925 cesIndex = 0; 926 assert(ceBuffer.length == 0); 927 // Go forward and collect the CEs. 928 int offset = getOffset(); 929 while(numCpFwd > 0) { 930 // nextCE() normally reads one code point. 931 // Contraction matching and digit specials read more and check numCpFwd. 932 --numCpFwd; 933 // Append one or more CEs to the ceBuffer. 934 nextCE(); 935 assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE); 936 // No need to loop for getting each expansion CE from nextCE(). 937 cesIndex = ceBuffer.length; 938 // However, we need to write an offset for each CE. 939 // This is for CollationElementIterator.getOffset() to return 940 // intermediate offsets from the unsafe-backwards segment. 941 assert(offsets.size() < ceBuffer.length); 942 offsets.addElement(offset); 943 // For an expansion, the offset of each non-initial CE is the limit offset, 944 // consistent with forward iteration. 945 offset = getOffset(); 946 while(offsets.size() < ceBuffer.length) { 947 offsets.addElement(offset); 948 }; 949 } 950 assert(offsets.size() == ceBuffer.length); 951 // End offset corresponding to just after the unsafe-backwards segment. 952 offsets.addElement(offset); 953 // Reset the forward iteration limit 954 // and move backward to before the segment for which we fetched CEs. 955 numCpFwd = -1; 956 backwardNumCodePoints(numBackward); 957 // Use the collected CEs and return the last one. 958 cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented. 959 return ceBuffer.get(--ceBuffer.length); 960 } 961 962 /** 963 * Turns a string of digits (bytes 0..9) 964 * into a sequence of CEs that will sort in numeric order. 965 * 966 * Starts from this ce32's digit value and consumes the following/preceding digits. 967 * The digits string must not be empty and must not have leading zeros. 968 */ 969 private final void appendNumericCEs(int ce32, boolean forward) { 970 // Collect digits. 971 // TODO: Use some kind of a byte buffer? We only store values 0..9. 972 StringBuilder digits = new StringBuilder(); 973 if(forward) { 974 for(;;) { 975 char digit = Collation.digitFromCE32(ce32); 976 digits.append(digit); 977 if(numCpFwd == 0) { break; } 978 int c = nextCodePoint(); 979 if(c < 0) { break; } 980 ce32 = data.getCE32(c); 981 if(ce32 == Collation.FALLBACK_CE32) { 982 ce32 = data.base.getCE32(c); 983 } 984 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 985 backwardNumCodePoints(1); 986 break; 987 } 988 if(numCpFwd > 0) { --numCpFwd; } 989 } 990 } else { 991 for(;;) { 992 char digit = Collation.digitFromCE32(ce32); 993 digits.append(digit); 994 int c = previousCodePoint(); 995 if(c < 0) { break; } 996 ce32 = data.getCE32(c); 997 if(ce32 == Collation.FALLBACK_CE32) { 998 ce32 = data.base.getCE32(c); 999 } 1000 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 1001 forwardNumCodePoints(1); 1002 break; 1003 } 1004 } 1005 // Reverse the digit string. 1006 digits.reverse(); 1007 } 1008 int pos = 0; 1009 do { 1010 // Skip leading zeros. 1011 while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; } 1012 // Write a sequence of CEs for at most 254 digits at a time. 1013 int segmentLength = digits.length() - pos; 1014 if(segmentLength > 254) { segmentLength = 254; } 1015 appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength)); 1016 pos += segmentLength; 1017 } while(pos < digits.length()); 1018 } 1019 1020 /** 1021 * Turns 1..254 digits into a sequence of CEs. 1022 * Called by appendNumericCEs() for each segment of at most 254 digits. 1023 */ 1024 private final void appendNumericSegmentCEs(CharSequence digits) { 1025 int length = digits.length(); 1026 assert(1 <= length && length <= 254); 1027 assert(length == 1 || digits.charAt(0) != 0); 1028 long numericPrimary = data.numericPrimary; 1029 // Note: We use primary byte values 2..255: digits are not compressible. 1030 if(length <= 7) { 1031 // Very dense encoding for small numbers. 1032 int value = digits.charAt(0); 1033 for(int i = 1; i < length; ++i) { 1034 value = value * 10 + digits.charAt(i); 1035 } 1036 // Primary weight second byte values: 1037 // 74 byte values 2.. 75 for small numbers in two-byte primary weights. 1038 // 40 byte values 76..115 for medium numbers in three-byte primary weights. 1039 // 16 byte values 116..131 for large numbers in four-byte primary weights. 1040 // 124 byte values 132..255 for very large numbers with 4..127 digit pairs. 1041 int firstByte = 2; 1042 int numBytes = 74; 1043 if(value < numBytes) { 1044 // Two-byte primary for 0..73, good for day & month numbers etc. 1045 long primary = numericPrimary | ((firstByte + value) << 16); 1046 ceBuffer.append(Collation.makeCE(primary)); 1047 return; 1048 } 1049 value -= numBytes; 1050 firstByte += numBytes; 1051 numBytes = 40; 1052 if(value < numBytes * 254) { 1053 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more. 1054 long primary = numericPrimary | 1055 ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8); 1056 ceBuffer.append(Collation.makeCE(primary)); 1057 return; 1058 } 1059 value -= numBytes * 254; 1060 firstByte += numBytes; 1061 numBytes = 16; 1062 if(value < numBytes * 254 * 254) { 1063 // Four-byte primary for 10234..1042489=10234+16*254*254-1. 1064 long primary = numericPrimary | (2 + value % 254); 1065 value /= 254; 1066 primary |= (2 + value % 254) << 8; 1067 value /= 254; 1068 primary |= (firstByte + value % 254) << 16; 1069 ceBuffer.append(Collation.makeCE(primary)); 1070 return; 1071 } 1072 // original value > 1042489 1073 } 1074 assert(length >= 7); 1075 1076 // The second primary byte value 132..255 indicates the number of digit pairs (4..127), 1077 // then we generate primary bytes with those pairs. 1078 // Omit trailing 00 pairs. 1079 // Decrement the value for the last pair. 1080 1081 // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255. 1082 int numPairs = (length + 1) / 2; 1083 long primary = numericPrimary | ((132 - 4 + numPairs) << 16); 1084 // Find the length without trailing 00 pairs. 1085 while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) { 1086 length -= 2; 1087 } 1088 // Read the first pair. 1089 int pair; 1090 int pos; 1091 if((length & 1) != 0) { 1092 // Only "half a pair" if we have an odd number of digits. 1093 pair = digits.charAt(0); 1094 pos = 1; 1095 } else { 1096 pair = digits.charAt(0) * 10 + digits.charAt(1); 1097 pos = 2; 1098 } 1099 pair = 11 + 2 * pair; 1100 // Add the pairs of digits between pos and length. 1101 int shift = 8; 1102 while(pos < length) { 1103 if(shift == 0) { 1104 // Every three pairs/bytes we need to store a 4-byte-primary CE 1105 // and start with a new CE with the '0' primary lead byte. 1106 primary |= pair; 1107 ceBuffer.append(Collation.makeCE(primary)); 1108 primary = numericPrimary; 1109 shift = 16; 1110 } else { 1111 primary |= pair << shift; 1112 shift -= 8; 1113 } 1114 pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1)); 1115 pos += 2; 1116 } 1117 primary |= (pair - 1) << shift; 1118 ceBuffer.append(Collation.makeCE(primary)); 1119 } 1120 1121 private CEBuffer ceBuffer; 1122 private int cesIndex; 1123 1124 private SkippedState skipped; 1125 1126 // Number of code points to read forward, or -1. 1127 // Used as a forward iteration limit in previousCEUnsafe(). 1128 private int numCpFwd; 1129 // Numeric collation (CollationSettings.NUMERIC). 1130 private boolean isNumeric; 1131 } 1132