1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 import java.text.CharacterIterator; 13 import java.text.StringCharacterIterator; 14 import java.util.Locale; 15 16 import android.icu.util.ICUException; 17 import android.icu.util.ULocale; 18 19 // Java porting note: 20 // 21 // The ICU4C implementation contains dead code in many places. 22 // While porting the ICU4C linear search implementation, this dead code 23 // was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***" 24 // are those dead code blocks, still available in ICU4C. 25 26 // The ICU4C implementation does not seem to handle UCharacterIterator pointing 27 // to a fragment of text properly. ICU4J uses CharacterIterator to navigate through 28 // the input text. We need to carefully review the code ported from ICU4C 29 // assuming the start index is 0. 30 31 // ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like 32 // CE is no longer used, except in a few places checking CELength. It looks like this 33 // is a leftover from already-disabled Boyer-Moore search code. This Java implementation 34 // preserves the code, but we should clean this up later. 35 36 /** 37 * 38 * <tt>StringSearch</tt> is a {@link SearchIterator} that provides 39 * language-sensitive text searching based on the comparison rules defined 40 * in a {@link RuleBasedCollator} object. 41 * StringSearch ensures that language eccentricity can be 42 * handled, e.g. for the German collator, characters ß and SS will be matched 43 * if case is chosen to be ignored. 44 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 45 * "ICU Collation Design Document"</a> for more information. 46 * <p> 47 * There are 2 match options for selection:<br> 48 * Let S' be the sub-string of a text string S between the offsets start and 49 * end [start, end]. 50 * <br> 51 * A pattern string P matches a text string S at the offsets [start, end] 52 * if 53 * <pre> 54 * option 1. Some canonical equivalent of P matches some canonical equivalent 55 * of S' 56 * option 2. P matches S' and if P starts or ends with a combining mark, 57 * there exists no non-ignorable combining mark before or after S? 58 * in S respectively. 59 * </pre> 60 * Option 2. is the default. 61 * <p> 62 * This search has APIs similar to that of other text iteration mechanisms 63 * such as the break iterators in {@link BreakIterator}. Using these 64 * APIs, it is easy to scan through text looking for all occurrences of 65 * a given pattern. This search iterator allows changing of direction by 66 * calling a {@link #reset} followed by a {@link #next} or {@link #previous}. 67 * Though a direction change can occur without calling {@link #reset} first, 68 * this operation comes with some speed penalty. 69 * Match results in the forward direction will match the result matches in 70 * the backwards direction in the reverse order 71 * <p> 72 * {@link SearchIterator} provides APIs to specify the starting position 73 * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex}, 74 * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}. 75 * Since the starting position will be set as it is specified, please take note that 76 * there are some danger points at which the search may render incorrect 77 * results: 78 * <ul> 79 * <li> In the midst of a substring that requires normalization. 80 * <li> If the following match is to be found, the position should not be the 81 * second character which requires swapping with the preceding 82 * character. Vice versa, if the preceding match is to be found, the 83 * position to search from should not be the first character which 84 * requires swapping with the next character. E.g certain Thai and 85 * Lao characters require swapping. 86 * <li> If a following pattern match is to be found, any position within a 87 * contracting sequence except the first will fail. Vice versa if a 88 * preceding pattern match is to be found, an invalid starting point 89 * would be any character within a contracting sequence except the last. 90 * </ul> 91 * <p> 92 * A {@link BreakIterator} can be used if only matches at logical breaks are desired. 93 * Using a {@link BreakIterator} will only give you results that exactly matches the 94 * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will 95 * not be found in the string "\u00e9" if a character break iterator is used. 96 * <p> 97 * Options are provided to handle overlapping matches. 98 * E.g. In English, overlapping matches produces the result 0 and 2 99 * for the pattern "abab" in the text "ababab", where mutually 100 * exclusive matches only produces the result of 0. 101 * <p> 102 * Options are also provided to implement "asymmetric search" as described in 103 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 104 * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType 105 * values. 106 * <p> 107 * Though collator attributes will be taken into consideration while 108 * performing matches, there are no APIs here for setting and getting the 109 * attributes. These attributes can be set by getting the collator 110 * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}. 111 * Lastly to update <tt>StringSearch</tt> to the new collator attributes, 112 * {@link #reset} has to be called. 113 * <p> 114 * Restriction: <br> 115 * Currently there are no composite characters that consists of a 116 * character with combining class > 0 before a character with combining 117 * class == 0. However, if such a character exists in the future, 118 * <tt>StringSearch</tt> does not guarantee the results for option 1. 119 * <p> 120 * Consult the {@link SearchIterator} documentation for information on 121 * and examples of how to use instances of this class to implement text 122 * searching. 123 * <p> 124 * Note, <tt>StringSearch</tt> is not to be subclassed. 125 * </p> 126 * @see SearchIterator 127 * @see RuleBasedCollator 128 * @author Laura Werner, synwee 129 */ 130 // internal notes: all methods do not guarantee the correct status of the 131 // characteriterator. the caller has to maintain the original index position 132 // if necessary. methods could change the index position as it deems fit 133 public final class StringSearch extends SearchIterator { 134 135 private Pattern pattern_; 136 private RuleBasedCollator collator_; 137 138 // positions within the collation element iterator is used to determine 139 // if we are at the start of the text. 140 private CollationElementIterator textIter_; 141 private CollationPCE textProcessedIter_; 142 143 // utility collation element, used throughout program for temporary 144 // iteration. 145 private CollationElementIterator utilIter_; 146 147 private Normalizer2 nfd_; 148 149 private int strength_; 150 int ceMask_; 151 int variableTop_; 152 153 private boolean toShift_; 154 155 // *** Boyer-Moore *** 156 // private char[] canonicalPrefixAccents_; 157 // private char[] canonicalSuffixAccents_; 158 159 /** 160 * Initializes the iterator to use the language-specific rules defined in 161 * the argument collator to search for argument pattern in the argument 162 * target text. The argument <code>breakiter</code> is used to define logical matches. 163 * See super class documentation for more details on the use of the target 164 * text and {@link BreakIterator}. 165 * @param pattern text to look for. 166 * @param target target text to search for pattern. 167 * @param collator {@link RuleBasedCollator} that defines the language rules 168 * @param breakiter A {@link BreakIterator} that is used to determine the 169 * boundaries of a logical match. This argument can be null. 170 * @throws IllegalArgumentException thrown when argument target is null, 171 * or of length 0 172 * @see BreakIterator 173 * @see RuleBasedCollator 174 */ 175 public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, 176 BreakIterator breakiter) { 177 178 // This implementation is ported from ICU4C usearch_open() 179 180 super(target, breakiter); 181 182 // string search does not really work when numeric collation is turned on 183 if (collator.getNumericCollation()) { 184 throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch"); 185 } 186 187 collator_ = collator; 188 strength_ = collator.getStrength(); 189 ceMask_ = getMask(strength_); 190 toShift_ = collator.isAlternateHandlingShifted(); 191 variableTop_ = collator.getVariableTop(); 192 193 nfd_ = Normalizer2.getNFDInstance(); 194 195 pattern_ = new Pattern(pattern); 196 197 search_.setMatchedLength(0); 198 search_.matchedIndex_ = DONE; 199 200 utilIter_ = null; 201 textIter_ = new CollationElementIterator(target, collator); 202 203 textProcessedIter_ = null; 204 205 // This is done by super class constructor 206 /* 207 search_.isOverlap_ = false; 208 search_.isCanonicalMatch_ = false; 209 search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 210 search_.isForwardSearching_ = true; 211 search_.reset_ = true; 212 */ 213 ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 214 search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 215 search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone 216 217 initialize(); 218 } 219 220 /** 221 * Initializes the iterator to use the language-specific rules defined in 222 * the argument collator to search for argument pattern in the argument 223 * target text. No {@link BreakIterator}s are set to test for logical matches. 224 * @param pattern text to look for. 225 * @param target target text to search for pattern. 226 * @param collator {@link RuleBasedCollator} that defines the language rules 227 * @throws IllegalArgumentException thrown when argument target is null, 228 * or of length 0 229 * @see RuleBasedCollator 230 */ 231 public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) { 232 this(pattern, target, collator, null); 233 } 234 235 /** 236 * Initializes the iterator to use the language-specific rules and 237 * break iterator rules defined in the argument locale to search for 238 * argument pattern in the argument target text. 239 * @param pattern text to look for. 240 * @param target target text to search for pattern. 241 * @param locale locale to use for language and break iterator rules 242 * @throws IllegalArgumentException thrown when argument target is null, 243 * or of length 0. ClassCastException thrown if the collator for 244 * the specified locale is not a RuleBasedCollator. 245 */ 246 public StringSearch(String pattern, CharacterIterator target, Locale locale) { 247 this(pattern, target, ULocale.forLocale(locale)); 248 } 249 250 /** 251 * Initializes the iterator to use the language-specific rules and 252 * break iterator rules defined in the argument locale to search for 253 * argument pattern in the argument target text. 254 * See super class documentation for more details on the use of the target 255 * text and {@link BreakIterator}. 256 * @param pattern text to look for. 257 * @param target target text to search for pattern. 258 * @param locale locale to use for language and break iterator rules 259 * @throws IllegalArgumentException thrown when argument target is null, 260 * or of length 0. ClassCastException thrown if the collator for 261 * the specified locale is not a RuleBasedCollator. 262 * @see BreakIterator 263 * @see RuleBasedCollator 264 * @see SearchIterator 265 */ 266 public StringSearch(String pattern, CharacterIterator target, ULocale locale) { 267 this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null); 268 } 269 270 /** 271 * Initializes the iterator to use the language-specific rules and 272 * break iterator rules defined in the default locale to search for 273 * argument pattern in the argument target text. 274 * @param pattern text to look for. 275 * @param target target text to search for pattern. 276 * @throws IllegalArgumentException thrown when argument target is null, 277 * or of length 0. ClassCastException thrown if the collator for 278 * the default locale is not a RuleBasedCollator. 279 */ 280 public StringSearch(String pattern, String target) { 281 this(pattern, new StringCharacterIterator(target), 282 (RuleBasedCollator) Collator.getInstance(), null); 283 } 284 285 /** 286 * Gets the {@link RuleBasedCollator} used for the language rules. 287 * <p> 288 * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any 289 * changes to the {@link RuleBasedCollator} result should follow with a call to 290 * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct 291 * search behavior. 292 * </p> 293 * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt> 294 * @see RuleBasedCollator 295 * @see #setCollator 296 */ 297 public RuleBasedCollator getCollator() { 298 return collator_; 299 } 300 301 /** 302 * Sets the {@link RuleBasedCollator} to be used for language-specific searching. 303 * <p> 304 * The iterator's position will not be changed by this method. 305 * @param collator to use for this <tt>StringSearch</tt> 306 * @throws IllegalArgumentException thrown when collator is null 307 * @see #getCollator 308 */ 309 public void setCollator(RuleBasedCollator collator) { 310 if (collator == null) { 311 throw new IllegalArgumentException("Collator can not be null"); 312 } 313 collator_ = collator; 314 ceMask_ = getMask(collator_.getStrength()); 315 316 ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 317 search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 318 search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone 319 320 toShift_ = collator.isAlternateHandlingShifted(); 321 variableTop_ = collator.getVariableTop(); 322 textIter_ = new CollationElementIterator(pattern_.text_, collator); 323 utilIter_ = new CollationElementIterator(pattern_.text_, collator); 324 325 // initialize() _after_ setting the iterators for the new collator. 326 initialize(); 327 } 328 329 /** 330 * Returns the pattern for which <tt>StringSearch</tt> is searching for. 331 * @return the pattern searched for 332 */ 333 public String getPattern() { 334 return pattern_.text_; 335 } 336 337 /** 338 * Set the pattern to search for. 339 * The iterator's position will not be changed by this method. 340 * @param pattern for searching 341 * @see #getPattern 342 * @exception IllegalArgumentException thrown if pattern is null or of 343 * length 0 344 */ 345 public void setPattern(String pattern) { 346 if (pattern == null || pattern.length() <= 0) { 347 throw new IllegalArgumentException( 348 "Pattern to search for can not be null or of length 0"); 349 } 350 pattern_.text_ = pattern; 351 initialize(); 352 } 353 354 /** 355 * Determines whether canonical matches (option 1, as described in the 356 * class documentation) is set. 357 * See setCanonical(boolean) for more information. 358 * @see #setCanonical 359 * @return true if canonical matches is set, false otherwise 360 */ 361 //TODO: hoist this to SearchIterator 362 public boolean isCanonical() { 363 return search_.isCanonicalMatch_; 364 } 365 366 /** 367 * Set the canonical match mode. See class documentation for details. 368 * The default setting for this property is false. 369 * @param allowCanonical flag indicator if canonical matches are allowed 370 * @see #isCanonical 371 */ 372 //TODO: hoist this to SearchIterator 373 public void setCanonical(boolean allowCanonical) { 374 search_.isCanonicalMatch_ = allowCanonical; 375 } 376 377 /** 378 * {@inheritDoc} 379 */ 380 @Override 381 public void setTarget(CharacterIterator text) { 382 super.setTarget(text); 383 textIter_.setText(text); 384 } 385 386 /** 387 * {@inheritDoc} 388 */ 389 @Override 390 public int getIndex() { 391 int result = textIter_.getOffset(); 392 if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) { 393 return DONE; 394 } 395 return result; 396 } 397 398 /** 399 * {@inheritDoc} 400 */ 401 @Override 402 public void setIndex(int position) { 403 // Java porting note: This method is equivalent to setOffset() in ICU4C. 404 // ICU4C SearchIterator::setOffset() is a pure virtual method, while 405 // ICU4J SearchIterator.setIndex() is not abstract method. 406 407 super.setIndex(position); 408 textIter_.setOffset(position); 409 } 410 411 /** 412 * {@inheritDoc} 413 */ 414 @Override 415 public void reset() { 416 // reset is setting the attributes that are already in 417 // string search, hence all attributes in the collator should 418 // be retrieved without any problems 419 420 boolean sameCollAttribute = true; 421 int ceMask; 422 boolean shift; 423 int varTop; 424 425 // **** hack to deal w/ how processed CEs encode quaternary **** 426 int newStrength = collator_.getStrength(); 427 if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY) 428 || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) { 429 sameCollAttribute = false; 430 } 431 432 strength_ = collator_.getStrength(); 433 ceMask = getMask(strength_); 434 if (ceMask_ != ceMask) { 435 ceMask_ = ceMask; 436 sameCollAttribute = false; 437 } 438 439 shift = collator_.isAlternateHandlingShifted(); 440 if (toShift_ != shift) { 441 toShift_ = shift; 442 sameCollAttribute = false; 443 } 444 445 varTop = collator_.getVariableTop(); 446 if (variableTop_ != varTop) { 447 variableTop_ = varTop; 448 sameCollAttribute = false; 449 } 450 451 if (!sameCollAttribute) { 452 initialize(); 453 } 454 455 textIter_.setText(search_.text()); 456 457 search_.setMatchedLength(0); 458 search_.matchedIndex_ = DONE; 459 search_.isOverlap_ = false; 460 search_.isCanonicalMatch_ = false; 461 search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 462 search_.isForwardSearching_ = true; 463 search_.reset_ = true; 464 } 465 466 /** 467 * {@inheritDoc} 468 */ 469 @Override 470 protected int handleNext(int position) { 471 if (pattern_.CELength_ == 0) { 472 search_.matchedIndex_ = search_.matchedIndex_ == DONE ? 473 getIndex() : search_.matchedIndex_ + 1; 474 search_.setMatchedLength(0); 475 textIter_.setOffset(search_.matchedIndex_); 476 if (search_.matchedIndex_ == search_.endIndex()) { 477 search_.matchedIndex_ = DONE; 478 } 479 } else { 480 if (search_.matchedLength() <= 0) { 481 // the flipping direction issue has already been handled 482 // in next() 483 // for boundary check purposes. this will ensure that the 484 // next match will not preceed the current offset 485 // note search_.matchedIndex_ will always be set to something 486 // in the code 487 search_.matchedIndex_ = position - 1; 488 } 489 490 textIter_.setOffset(position); 491 492 // ICU4C comment: 493 // if strsrch_->breakIter is always the same as m_breakiterator_ 494 // then we don't need to check the match boundaries here because 495 // usearch_handleNextXXX will already have done it. 496 if (search_.isCanonicalMatch_) { 497 // *could* actually use exact here 'cause no extra accents allowed... 498 handleNextCanonical(); 499 } else { 500 handleNextExact(); 501 } 502 503 if (search_.matchedIndex_ == DONE) { 504 textIter_.setOffset(search_.endIndex()); 505 } else { 506 textIter_.setOffset(search_.matchedIndex_); 507 } 508 509 return search_.matchedIndex_; 510 } 511 512 return DONE; 513 } 514 515 /** 516 * {@inheritDoc} 517 */ 518 @Override 519 protected int handlePrevious(int position) { 520 if (pattern_.CELength_ == 0) { 521 search_.matchedIndex_ = 522 search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_; 523 if (search_.matchedIndex_ == search_.beginIndex()) { 524 setMatchNotFound(); 525 } else { 526 search_.matchedIndex_--; 527 textIter_.setOffset(search_.matchedIndex_); 528 search_.setMatchedLength(0); 529 } 530 } else { 531 textIter_.setOffset(position); 532 533 if (search_.isCanonicalMatch_) { 534 // *could* use exact match here since extra accents *not* allowed! 535 handlePreviousCanonical(); 536 } else { 537 handlePreviousExact(); 538 } 539 } 540 541 return search_.matchedIndex_; 542 } 543 544 // ------------------ Internal implementation code --------------------------- 545 546 private static final int INITIAL_ARRAY_SIZE_ = 256; 547 548 // *** Boyer-Moore *** 549 // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl; 550 // private static final int LAST_BYTE_MASK_ = 0xff; 551 // private static final int SECOND_LAST_BYTE_SHIFT_ = 8; 552 553 private static final int PRIMARYORDERMASK = 0xffff0000; 554 private static final int SECONDARYORDERMASK = 0x0000ff00; 555 private static final int TERTIARYORDERMASK = 0x000000ff; 556 557 /** 558 * Getting the mask for collation strength 559 * @param strength collation strength 560 * @return collation element mask 561 */ 562 private static int getMask(int strength) { 563 switch (strength) { 564 case Collator.PRIMARY: 565 return PRIMARYORDERMASK; 566 case Collator.SECONDARY: 567 return SECONDARYORDERMASK | PRIMARYORDERMASK; 568 default: 569 return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK; 570 } 571 } 572 573 574 // *** Boyer-Moore *** 575 /* 576 private final char getFCD(String str, int offset) { 577 char ch = str.charAt(offset); 578 if (ch < 0x180) { 579 return (char) nfcImpl_.getFCD16FromBelow180(ch); 580 } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { 581 if (!Character.isHighSurrogate(ch)) { 582 return (char) nfcImpl_.getFCD16FromNormData(ch); 583 } else { 584 char c2; 585 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { 586 return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); 587 } 588 } 589 } 590 return 0; 591 } 592 593 private final char getFCD(int c) { 594 return (char)nfcImpl_.getFCD16(c); 595 } 596 */ 597 598 /** 599 * Getting the modified collation elements taking into account the collation 600 * attributes. 601 * 602 * @param sourcece 603 * @return the modified collation element 604 */ 605 private int getCE(int sourcece) { 606 // note for tertiary we can't use the collator->tertiaryMask, that 607 // is a preprocessed mask that takes into account case options. since 608 // we are only concerned with exact matches, we don't need that. 609 sourcece &= ceMask_; 610 611 if (toShift_) { 612 // alternate handling here, since only the 16 most significant digits 613 // is only used, we can safely do a compare without masking 614 // if the ce is a variable, we mask and get only the primary values 615 // no shifting to quartenary is required since all primary values 616 // less than variabletop will need to be masked off anyway. 617 if (variableTop_ > sourcece) { 618 if (strength_ >= Collator.QUATERNARY) { 619 sourcece &= PRIMARYORDERMASK; 620 } else { 621 sourcece = CollationElementIterator.IGNORABLE; 622 } 623 } 624 } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) { 625 sourcece = 0xFFFF; 626 } 627 628 return sourcece; 629 } 630 631 /** 632 * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp 633 * (except not taking destination buffer size and status param). 634 * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 635 * implement this in Pattern class. 636 * 637 * @param destination target array 638 * @param offset destination offset to add value 639 * @param value to be added 640 * @param increments incremental size expected 641 * @return new destination array, destination if there was no new allocation 642 */ 643 private static int[] addToIntArray(int[] destination, int offset, int value, int increments) { 644 int newlength = destination.length; 645 if (offset + 1 == newlength) { 646 newlength += increments; 647 int temp[] = new int[newlength]; 648 System.arraycopy(destination, 0, temp, 0, offset); 649 destination = temp; 650 } 651 destination[offset] = value; 652 return destination; 653 } 654 655 /** 656 * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp. 657 * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 658 * implement this in Pattern class. 659 * 660 * @param destination target array 661 * @param offset destination offset to add value 662 * @param destinationlength target array size 663 * @param value to be added 664 * @param increments incremental size expected 665 * @return new destination array, destination if there was no new allocation 666 */ 667 private static long[] addToLongArray(long[] destination, int offset, int destinationlength, 668 long value, int increments) { 669 int newlength = destinationlength; 670 if (offset + 1 == newlength) { 671 newlength += increments; 672 long temp[] = new long[newlength]; 673 System.arraycopy(destination, 0, temp, 0, offset); 674 destination = temp; 675 } 676 destination[offset] = value; 677 return destination; 678 } 679 680 /** 681 * Initializing the ce table for a pattern. 682 * Stores non-ignorable collation keys. 683 * Table size will be estimated by the size of the pattern text. Table 684 * expansion will be perform as we go along. Adding 1 to ensure that the table 685 * size definitely increases. 686 * @return total number of expansions 687 */ 688 // TODO: We probably do not need Pattern CE table. 689 private int initializePatternCETable() { 690 int[] cetable = new int[INITIAL_ARRAY_SIZE_]; 691 int patternlength = pattern_.text_.length(); 692 CollationElementIterator coleiter = utilIter_; 693 694 if (coleiter == null) { 695 coleiter = new CollationElementIterator(pattern_.text_, collator_); 696 utilIter_ = coleiter; 697 } else { 698 coleiter.setText(pattern_.text_); 699 } 700 701 int offset = 0; 702 int result = 0; 703 int ce; 704 705 while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) { 706 int newce = getCE(ce); 707 if (newce != CollationElementIterator.IGNORABLE /* 0 */) { 708 int[] temp = addToIntArray(cetable, offset, newce, 709 patternlength - coleiter.getOffset() + 1); 710 offset++; 711 cetable = temp; 712 } 713 result += (coleiter.getMaxExpansion(ce) - 1); 714 } 715 716 cetable[offset] = 0; 717 pattern_.CE_ = cetable; 718 pattern_.CELength_ = offset; 719 720 return result; 721 } 722 723 /** 724 * Initializing the pce table for a pattern. 725 * Stores non-ignorable collation keys. 726 * Table size will be estimated by the size of the pattern text. Table 727 * expansion will be perform as we go along. Adding 1 to ensure that the table 728 * size definitely increases. 729 * @return total number of expansions 730 */ 731 private int initializePatternPCETable() { 732 long[] pcetable = new long[INITIAL_ARRAY_SIZE_]; 733 int pcetablesize = pcetable.length; 734 int patternlength = pattern_.text_.length(); 735 CollationElementIterator coleiter = utilIter_; 736 737 if (coleiter == null) { 738 coleiter = new CollationElementIterator(pattern_.text_, collator_); 739 utilIter_ = coleiter; 740 } else { 741 coleiter.setText(pattern_.text_); 742 } 743 744 int offset = 0; 745 int result = 0; 746 long pce; 747 748 CollationPCE iter = new CollationPCE(coleiter); 749 750 // ** Should processed CEs be signed or unsigned? 751 // ** (the rest of the code in this file seems to play fast-and-loose with 752 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 753 while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) { 754 long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1); 755 offset++; 756 pcetable = temp; 757 } 758 759 pcetable[offset] = 0; 760 pattern_.PCE_ = pcetable; 761 pattern_.PCELength_ = offset; 762 763 return result; 764 } 765 766 // TODO: This method only triggers initializePatternCETable(), which is probably no 767 // longer needed. 768 private int initializePattern() { 769 // Since the strength is primary, accents are ignored in the pattern. 770 771 // *** Boyer-Moore *** 772 /* 773 if (strength_ == Collator.PRIMARY) { 774 pattern_.hasPrefixAccents_ = false; 775 pattern_.hasSuffixAccents_ = false; 776 } else { 777 pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0; 778 pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0; 779 } 780 */ 781 782 pattern_.PCE_ = null; 783 784 // since intializePattern is an internal method status is a success. 785 return initializePatternCETable(); 786 } 787 788 // *** Boyer-Moore *** 789 /* 790 private final void setShiftTable(char shift[], 791 char backshift[], 792 int cetable[], int cesize, 793 int expansionsize, 794 int defaultforward, 795 int defaultbackward) { 796 // No implementation 797 } 798 */ 799 800 // TODO: This method only triggers initializePattern(), which is probably no 801 // longer needed. 802 private void initialize() { 803 /* int expandlength = */ initializePattern(); 804 805 // *** Boyer-Moore *** 806 /* 807 if (pattern_.CELength_ > 0) { 808 int cesize = pattern_.CELength_; 809 int minlength = cesize > expandlength ? cesize - expandlength : 1; 810 pattern_.defaultShiftSize_ = minlength; 811 setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize, 812 expandlength, minlength, minlength); 813 return; 814 } 815 return pattern_.defaultShiftSize_; 816 */ 817 } 818 819 /** 820 * @deprecated This API is ICU internal only. 821 * @hide original deprecated declaration 822 * @hide draft / provisional / internal are hidden on Android 823 */ 824 @Override 825 @Deprecated 826 protected void setMatchNotFound() { 827 super.setMatchNotFound(); 828 // SearchIterator#setMatchNotFound() does following: 829 // search_.matchedIndex_ = DONE; 830 // search_.setMatchedLength(0); 831 if (search_.isForwardSearching_) { 832 textIter_.setOffset(search_.text().getEndIndex()); 833 } else { 834 textIter_.setOffset(0); 835 } 836 } 837 838 /** 839 * Checks if the offset runs out of the text string range 840 * @param textstart offset of the first character in the range 841 * @param textlimit limit offset of the text string range 842 * @param offset to test 843 * @return true if offset is out of bounds, false otherwise 844 */ 845 private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) { 846 return offset < textstart || offset > textlimit; 847 } 848 849 /** 850 * Checks for identical match 851 * @param start offset of possible match 852 * @param end offset of possible match 853 * @return TRUE if identical match is found 854 */ 855 private boolean checkIdentical(int start, int end) { 856 if (strength_ != Collator.IDENTICAL) { 857 return true; 858 } 859 // Note: We could use Normalizer::compare() or similar, but for short strings 860 // which may not be in FCD it might be faster to just NFD them. 861 String textstr = getString(targetText, start, end - start); 862 if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) { 863 textstr = Normalizer.decompose(textstr, false); 864 } 865 String patternstr = pattern_.text_; 866 if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) { 867 patternstr = Normalizer.decompose(patternstr, false); 868 } 869 return textstr.equals(patternstr); 870 } 871 872 private boolean initTextProcessedIter() { 873 if (textProcessedIter_ == null) { 874 textProcessedIter_ = new CollationPCE(textIter_); 875 } else { 876 textProcessedIter_.init(textIter_); 877 } 878 return true; 879 } 880 881 /* 882 * Find the next break boundary after startIndex. If the UStringSearch object 883 * has an external break iterator, use that. Otherwise use the internal character 884 * break iterator. 885 */ 886 private int nextBoundaryAfter(int startIndex) { 887 BreakIterator breakiterator = search_.breakIter(); 888 889 if (breakiterator == null) { 890 breakiterator = search_.internalBreakIter_; 891 } 892 893 if (breakiterator != null) { 894 return breakiterator.following(startIndex); 895 } 896 897 return startIndex; 898 } 899 900 /* 901 * Returns TRUE if index is on a break boundary. If the UStringSearch 902 * has an external break iterator, test using that, otherwise test 903 * using the internal character break iterator. 904 */ 905 private boolean isBreakBoundary(int index) { 906 BreakIterator breakiterator = search_.breakIter(); 907 908 if (breakiterator == null) { 909 breakiterator = search_.internalBreakIter_; 910 } 911 912 return (breakiterator != null && breakiterator.isBoundary(index)); 913 } 914 915 916 // Java porting note: Followings are corresponding to UCompareCEsResult enum 917 private static final int CE_MATCH = -1; 918 private static final int CE_NO_MATCH = 0; 919 private static final int CE_SKIP_TARG = 1; 920 private static final int CE_SKIP_PATN = 2; 921 922 private static int CE_LEVEL2_BASE = 0x00000005; 923 private static int CE_LEVEL3_BASE = 0x00050000; 924 925 private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) { 926 if (targCE == patCE) { 927 return CE_MATCH; 928 } 929 if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 930 return CE_NO_MATCH; 931 } 932 933 long targCEshifted = targCE >>> 32; 934 long patCEshifted = patCE >>> 32; 935 long mask; 936 937 mask = 0xFFFF0000L; 938 int targLev1 = (int)(targCEshifted & mask); 939 int patLev1 = (int)(patCEshifted & mask); 940 if (targLev1 != patLev1) { 941 if (targLev1 == 0) { 942 return CE_SKIP_TARG; 943 } 944 if (patLev1 == 0 945 && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 946 return CE_SKIP_PATN; 947 } 948 return CE_NO_MATCH; 949 } 950 951 mask = 0x0000FFFFL; 952 int targLev2 = (int)(targCEshifted & mask); 953 int patLev2 = (int)(patCEshifted & mask); 954 if (targLev2 != patLev2) { 955 if (targLev2 == 0) { 956 return CE_SKIP_TARG; 957 } 958 if (patLev2 == 0 959 && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 960 return CE_SKIP_PATN; 961 } 962 return (patLev2 == CE_LEVEL2_BASE || 963 (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 964 targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH; 965 } 966 967 mask = 0xFFFF0000L; 968 int targLev3 = (int)(targCE & mask); 969 int patLev3 = (int)(patCE & mask); 970 if (targLev3 != patLev3) { 971 return (patLev3 == CE_LEVEL3_BASE || 972 (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 973 targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH; 974 } 975 976 return CE_MATCH; 977 } 978 979 /** 980 * An object used for receiving matched index in search() and 981 * searchBackwards(). 982 */ 983 private static class Match { 984 int start_ = -1; 985 int limit_ = -1; 986 } 987 988 private boolean search(int startIdx, Match m) { 989 // Input parameter sanity check. 990 if (pattern_.CELength_ == 0 991 || startIdx < search_.beginIndex() 992 || startIdx > search_.endIndex()) { 993 throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " + 994 search_.beginIndex() + " and " + search_.endIndex()); 995 } 996 997 if (pattern_.PCE_ == null) { 998 initializePatternPCETable(); 999 } 1000 1001 textIter_.setOffset(startIdx); 1002 CEBuffer ceb = new CEBuffer(this); 1003 1004 int targetIx = 0; 1005 CEI targetCEI = null; 1006 int patIx; 1007 boolean found; 1008 1009 int mStart = -1; 1010 int mLimit = -1; 1011 int minLimit; 1012 int maxLimit; 1013 1014 // Outer loop moves over match starting positions in the 1015 // target CE space. 1016 // Here we see the target as a sequence of collation elements, resulting from the following: 1017 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 1018 // (for example, digraphs such as IJ may be broken into two characters). 1019 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 1020 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 1021 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 1022 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 1023 // then the CE is deleted, so the following code sees only CEs that are relevant. 1024 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 1025 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 1026 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 1027 for (targetIx = 0; ; targetIx++) { 1028 found = true; 1029 // Inner loop checks for a match beginning at each 1030 // position from the outer loop. 1031 int targetIxOffset = 0; 1032 long patCE = 0; 1033 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 1034 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1035 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 1036 CEI firstCEI = ceb.get(targetIx); 1037 if (firstCEI == null) { 1038 throw new ICUException("CEBuffer.get(" + targetIx + ") returned null."); 1039 } 1040 1041 for (patIx = 0; patIx < pattern_.PCELength_; patIx++) { 1042 patCE = pattern_.PCE_[patIx]; 1043 targetCEI = ceb.get(targetIx + patIx + targetIxOffset); 1044 // Compare CE from target string with CE from the pattern. 1045 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 1046 // which will fail the compare, below. 1047 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 1048 if (ceMatch == CE_NO_MATCH) { 1049 found = false; 1050 break; 1051 } else if (ceMatch > CE_NO_MATCH) { 1052 if (ceMatch == CE_SKIP_TARG) { 1053 // redo with same patCE, next targCE 1054 patIx--; 1055 targetIxOffset++; 1056 } else { // ceMatch == CE_SKIP_PATN 1057 // redo with same targCE, next patCE 1058 targetIxOffset--; 1059 } 1060 } 1061 } 1062 targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far 1063 1064 if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 1065 // No match at this targetIx. Try again at the next. 1066 continue; 1067 } 1068 1069 if (!found) { 1070 // No match at all, we have run off the end of the target text. 1071 break; 1072 } 1073 1074 // We have found a match in CE space. 1075 // Now determine the bounds in string index space. 1076 // There still is a chance of match failure if the CE range not correspond to 1077 // an acceptable character range. 1078 // 1079 CEI lastCEI = ceb.get(targetIx + targetIxOffset -1); 1080 1081 mStart = firstCEI.lowIndex_; 1082 minLimit = lastCEI.lowIndex_; 1083 1084 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1085 // extended to the end of input, and the match is good. 1086 1087 // Look at the high and low indices of the CE following the match. If 1088 // they are the same it means one of two things: 1089 // 1. The match extended to the last CE from the target text, which is OK, or 1090 // 2. The last CE that was part of the match is in an expansion that extends 1091 // to the first CE after the match. In this case, we reject the match. 1092 CEI nextCEI = null; 1093 if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 1094 nextCEI = ceb.get(targetIx + targetIxOffset); 1095 maxLimit = nextCEI.lowIndex_; 1096 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 1097 found = false; 1098 } 1099 } else { 1100 for (;; ++targetIxOffset) { 1101 nextCEI = ceb.get(targetIx + targetIxOffset); 1102 maxLimit = nextCEI.lowIndex_; 1103 // If we are at the end of the target too, match succeeds 1104 if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) { 1105 break; 1106 } 1107 // As long as the next CE has primary weight of 0, 1108 // it is part of the last target element matched by the pattern; 1109 // make sure it can be part of a match with the last patCE 1110 if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) { 1111 int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_); 1112 if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) { 1113 found = false; 1114 break; 1115 } 1116 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 1117 // target element, but it has non-zero primary weight => match fails 1118 } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) { 1119 found = false; 1120 break; 1121 // Else the target CE is not part of an expansion of the last matched element, match succeeds 1122 } else { 1123 break; 1124 } 1125 } 1126 } 1127 1128 // Check for the start of the match being within a combining sequence. 1129 // This can happen if the pattern itself begins with a combining char, and 1130 // the match found combining marks in the target text that were attached 1131 // to something else. 1132 // This type of match should be rejected for not completely consuming a 1133 // combining sequence. 1134 if (!isBreakBoundary(mStart)) { 1135 found = false; 1136 } 1137 1138 // Check for the start of the match being within an Collation Element Expansion, 1139 // meaning that the first char of the match is only partially matched. 1140 // With expansions, the first CE will report the index of the source 1141 // character, and all subsequent (expansions) CEs will report the source index of the 1142 // _following_ character. 1143 int secondIx = firstCEI.highIndex_; 1144 if (mStart == secondIx) { 1145 found = false; 1146 } 1147 1148 // Allow matches to end in the middle of a grapheme cluster if the following 1149 // conditions are met; this is needed to make prefix search work properly in 1150 // Indic, see #11750 1151 // * the default breakIter is being used 1152 // * the next collation element after this combining sequence 1153 // - has non-zero primary weight 1154 // - corresponds to a separate character following the one at end of the current match 1155 // (the second of these conditions, and perhaps both, may be redundant given the 1156 // subsequent check for normalization boundary; however they are likely much faster 1157 // tests in any case) 1158 // * the match limit is a normalization boundary 1159 boolean allowMidclusterMatch = 1160 breakIterator == null && 1161 (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && 1162 maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && 1163 (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || 1164 nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); 1165 1166 // If those conditions are met, then: 1167 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1168 // the match limit may be backed off to a previous break boundary. This handles 1169 // cases in which mLimit includes target characters that are ignorable with current 1170 // settings (such as space) and which extend beyond the pattern match. 1171 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1172 // * do NOT require that match limit be on a breakIter boundary 1173 1174 // Advance the match end position to the first acceptable match boundary. 1175 // This advances the index over any combining characters. 1176 mLimit = maxLimit; 1177 if (minLimit < maxLimit) { 1178 // When the last CE's low index is same with its high index, the CE is likely 1179 // a part of expansion. In this case, the index is located just after the 1180 // character corresponding to the CEs compared above. If the index is right 1181 // at the break boundary, move the position to the next boundary will result 1182 // incorrect match length when there are ignorable characters exist between 1183 // the position and the next character produces CE(s). See ticket#8482. 1184 if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) { 1185 mLimit = minLimit; 1186 } else { 1187 int nba = nextBoundaryAfter(minLimit); 1188 // Note that we can have nba < maxLimit && nba >= minLImit, in which 1189 // case we want to set mLimit to nba regardless of allowMidclusterMatch 1190 // (i.e. we back off mLimit to the previous breakIterator boundary). 1191 if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { 1192 mLimit = nba; 1193 } 1194 } 1195 } 1196 1197 if (!allowMidclusterMatch) { 1198 // If advancing to the end of a combining sequence in character indexing space 1199 // advanced us beyond the end of the match in CE space, reject this match. 1200 if (mLimit > maxLimit) { 1201 found = false; 1202 } 1203 1204 if (!isBreakBoundary(mLimit)) { 1205 found = false; 1206 } 1207 } 1208 1209 if (!checkIdentical(mStart, mLimit)) { 1210 found = false; 1211 } 1212 1213 if (found) { 1214 break; 1215 } 1216 } 1217 1218 // All Done. Store back the match bounds to the caller. 1219 // 1220 if (found == false) { 1221 mLimit = -1; 1222 mStart = -1; 1223 } 1224 1225 if (m != null) { 1226 m.start_ = mStart; 1227 m.limit_ = mLimit; 1228 } 1229 1230 return found; 1231 } 1232 1233 private static int codePointAt(CharacterIterator iter, int index) { 1234 int currentIterIndex = iter.getIndex(); 1235 char codeUnit = iter.setIndex(index); 1236 int cp = codeUnit; 1237 if (Character.isHighSurrogate(codeUnit)) { 1238 char nextUnit = iter.next(); 1239 if (Character.isLowSurrogate(nextUnit)) { 1240 cp = Character.toCodePoint(codeUnit, nextUnit); 1241 } 1242 } 1243 iter.setIndex(currentIterIndex); // restore iter position 1244 return cp; 1245 } 1246 1247 private static int codePointBefore(CharacterIterator iter, int index) { 1248 int currentIterIndex = iter.getIndex(); 1249 iter.setIndex(index); 1250 char codeUnit = iter.previous(); 1251 int cp = codeUnit; 1252 if (Character.isLowSurrogate(codeUnit)) { 1253 char prevUnit = iter.previous(); 1254 if (Character.isHighSurrogate(prevUnit)) { 1255 cp = Character.toCodePoint(prevUnit, codeUnit); 1256 } 1257 } 1258 iter.setIndex(currentIterIndex); // restore iter position 1259 return cp; 1260 } 1261 1262 private boolean searchBackwards(int startIdx, Match m) { 1263 //ICU4C_TODO comment: reject search patterns beginning with a combining char. 1264 1265 // Input parameter sanity check. 1266 if (pattern_.CELength_ == 0 1267 || startIdx < search_.beginIndex() 1268 || startIdx > search_.endIndex()) { 1269 throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " + 1270 search_.beginIndex() + " and " + search_.endIndex()); 1271 } 1272 1273 if (pattern_.PCE_ == null) { 1274 initializePatternPCETable(); 1275 } 1276 1277 CEBuffer ceb = new CEBuffer(this); 1278 int targetIx = 0; 1279 1280 /* 1281 * Pre-load the buffer with the CE's for the grapheme 1282 * after our starting position so that we're sure that 1283 * we can look at the CE following the match when we 1284 * check the match boundaries. 1285 * 1286 * This will also pre-fetch the first CE that we'll 1287 * consider for the match. 1288 */ 1289 if (startIdx < search_.endIndex()) { 1290 BreakIterator bi = search_.internalBreakIter_; 1291 int next = bi.following(startIdx); 1292 1293 textIter_.setOffset(next); 1294 1295 for (targetIx = 0; ; targetIx++) { 1296 if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) { 1297 break; 1298 } 1299 } 1300 } else { 1301 textIter_.setOffset(startIdx); 1302 } 1303 1304 CEI targetCEI = null; 1305 int patIx; 1306 boolean found; 1307 1308 int limitIx = targetIx; 1309 int mStart = -1; 1310 int mLimit = -1; 1311 int minLimit; 1312 int maxLimit; 1313 1314 // Outer loop moves over match starting positions in the 1315 // target CE space. 1316 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 1317 // But patIx is 0 at the beginning of the pattern and increases toward the end. 1318 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 1319 // and the beginning of the base text. 1320 for (targetIx = limitIx; ; targetIx++) { 1321 found = true; 1322 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 1323 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1324 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 1325 CEI lastCEI = ceb.getPrevious(targetIx); 1326 if (lastCEI == null) { 1327 throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null."); 1328 } 1329 // Inner loop checks for a match beginning at each 1330 // position from the outer loop. 1331 int targetIxOffset = 0; 1332 for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) { 1333 long patCE = pattern_.PCE_[patIx]; 1334 1335 targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset); 1336 // Compare CE from target string with CE from the pattern. 1337 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 1338 // which will fail the compare, below. 1339 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 1340 if (ceMatch == CE_NO_MATCH) { 1341 found = false; 1342 break; 1343 } else if (ceMatch > CE_NO_MATCH) { 1344 if (ceMatch == CE_SKIP_TARG) { 1345 // redo with same patCE, next targCE 1346 patIx++; 1347 targetIxOffset++; 1348 } else { // ceMatch == CE_SKIP_PATN 1349 // redo with same targCE, next patCE 1350 targetIxOffset--; 1351 } 1352 } 1353 } 1354 1355 if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 1356 // No match at this targetIx. Try again at the next. 1357 continue; 1358 } 1359 1360 if (!found) { 1361 // No match at all, we have run off the end of the target text. 1362 break; 1363 } 1364 1365 // We have found a match in CE space. 1366 // Now determine the bounds in string index space. 1367 // There still is a chance of match failure if the CE range not correspond to 1368 // an acceptable character range. 1369 // 1370 CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset); 1371 mStart = firstCEI.lowIndex_; 1372 1373 // Check for the start of the match being within a combining sequence. 1374 // This can happen if the pattern itself begins with a combining char, and 1375 // the match found combining marks in the target text that were attached 1376 // to something else. 1377 // This type of match should be rejected for not completely consuming a 1378 // combining sequence. 1379 if (!isBreakBoundary(mStart)) { 1380 found = false; 1381 } 1382 1383 // Look at the high index of the first CE in the match. If it's the same as the 1384 // low index, the first CE in the match is in the middle of an expansion. 1385 if (mStart == firstCEI.highIndex_) { 1386 found = false; 1387 } 1388 1389 minLimit = lastCEI.lowIndex_; 1390 1391 if (targetIx > 0) { 1392 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1393 // extended to the end of input, and the match is good. 1394 1395 // Look at the high and low indices of the CE following the match. If 1396 // they are the same it means one of two things: 1397 // 1. The match extended to the last CE from the target text, which is OK, or 1398 // 2. The last CE that was part of the match is in an expansion that extends 1399 // to the first CE after the match. In this case, we reject the match. 1400 CEI nextCEI = ceb.getPrevious(targetIx - 1); 1401 1402 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 1403 found = false; 1404 } 1405 1406 mLimit = maxLimit = nextCEI.lowIndex_; 1407 1408 // Allow matches to end in the middle of a grapheme cluster if the following 1409 // conditions are met; this is needed to make prefix search work properly in 1410 // Indic, see #11750 1411 // * the default breakIter is being used 1412 // * the next collation element after this combining sequence 1413 // - has non-zero primary weight 1414 // - corresponds to a separate character following the one at end of the current match 1415 // (the second of these conditions, and perhaps both, may be redundant given the 1416 // subsequent check for normalization boundary; however they are likely much faster 1417 // tests in any case) 1418 // * the match limit is a normalization boundary 1419 boolean allowMidclusterMatch = 1420 breakIterator == null && 1421 (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && 1422 maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && 1423 (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || 1424 nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); 1425 1426 // If those conditions are met, then: 1427 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1428 // the match limit may be backed off to a previous break boundary. This handles 1429 // cases in which mLimit includes target characters that are ignorable with current 1430 // settings (such as space) and which extend beyond the pattern match. 1431 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1432 // * do NOT require that match limit be on a breakIter boundary 1433 1434 // Advance the match end position to the first acceptable match boundary. 1435 // This advances the index over any combining charcters. 1436 if (minLimit < maxLimit) { 1437 int nba = nextBoundaryAfter(minLimit); 1438 // Note that we can have nba < maxLimit && nba >= minLImit, in which 1439 // case we want to set mLimit to nba regardless of allowMidclusterMatch 1440 // (i.e. we back off mLimit to the previous breakIterator boundary). 1441 if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { 1442 mLimit = nba; 1443 } 1444 } 1445 1446 if (!allowMidclusterMatch) { 1447 // If advancing to the end of a combining sequence in character indexing space 1448 // advanced us beyond the end of the match in CE space, reject this match. 1449 if (mLimit > maxLimit) { 1450 found = false; 1451 } 1452 1453 // Make sure the end of the match is on a break boundary 1454 if (!isBreakBoundary(mLimit)) { 1455 found = false; 1456 } 1457 } 1458 1459 } else { 1460 // No non-ignorable CEs after this point. 1461 // The maximum position is detected by boundary after 1462 // the last non-ignorable CE. Combining sequence 1463 // across the start index will be truncated. 1464 int nba = nextBoundaryAfter(minLimit); 1465 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 1466 } 1467 1468 if (!checkIdentical(mStart, mLimit)) { 1469 found = false; 1470 } 1471 1472 if (found) { 1473 break; 1474 } 1475 } 1476 1477 // All Done. Store back the match bounds to the caller. 1478 // 1479 if (found == false) { 1480 mLimit = -1; 1481 mStart = -1; 1482 } 1483 1484 if (m != null) { 1485 m.start_ = mStart; 1486 m.limit_ = mLimit; 1487 } 1488 1489 return found; 1490 } 1491 1492 // Java porting note: 1493 // 1494 // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical() 1495 // for the linear search implementation. The differences are addressed in search(). 1496 // 1497 private boolean handleNextExact() { 1498 return handleNextCommonImpl(); 1499 } 1500 1501 private boolean handleNextCanonical() { 1502 return handleNextCommonImpl(); 1503 } 1504 1505 private boolean handleNextCommonImpl() { 1506 int textOffset = textIter_.getOffset(); 1507 Match match = new Match(); 1508 1509 if (search(textOffset, match)) { 1510 search_.matchedIndex_ = match.start_; 1511 search_.setMatchedLength(match.limit_ - match.start_); 1512 return true; 1513 } else { 1514 setMatchNotFound(); 1515 return false; 1516 } 1517 } 1518 1519 // Java porting note: 1520 // 1521 // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical() 1522 // for the linear search implementation. The differences are addressed in searchBackwards(). 1523 // 1524 private boolean handlePreviousExact() { 1525 return handlePreviousCommonImpl(); 1526 } 1527 1528 private boolean handlePreviousCanonical() { 1529 return handlePreviousCommonImpl(); 1530 } 1531 1532 private boolean handlePreviousCommonImpl() { 1533 int textOffset; 1534 1535 if (search_.isOverlap_) { 1536 if (search_.matchedIndex_ != DONE) { 1537 textOffset = search_.matchedIndex_ + search_.matchedLength() - 1; 1538 } else { 1539 // move the start position at the end of possible match 1540 initializePatternPCETable(); 1541 if (!initTextProcessedIter()) { 1542 setMatchNotFound(); 1543 return false; 1544 } 1545 for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) { 1546 long pce = textProcessedIter_.nextProcessed(null); 1547 if (pce == CollationPCE.PROCESSED_NULLORDER) { 1548 // at the end of the text 1549 break; 1550 } 1551 } 1552 textOffset = textIter_.getOffset(); 1553 } 1554 } else { 1555 textOffset = textIter_.getOffset(); 1556 } 1557 1558 Match match = new Match(); 1559 if (searchBackwards(textOffset, match)) { 1560 search_.matchedIndex_ = match.start_; 1561 search_.setMatchedLength(match.limit_ - match.start_); 1562 return true; 1563 } else { 1564 setMatchNotFound(); 1565 return false; 1566 } 1567 } 1568 1569 /** 1570 * Gets a substring out of a CharacterIterator 1571 * 1572 * Java porting note: Not available in ICU4C 1573 * 1574 * @param text CharacterIterator 1575 * @param start start offset 1576 * @param length of substring 1577 * @return substring from text starting at start and length length 1578 */ 1579 private static final String getString(CharacterIterator text, int start, int length) { 1580 StringBuilder result = new StringBuilder(length); 1581 int offset = text.getIndex(); 1582 text.setIndex(start); 1583 for (int i = 0; i < length; i++) { 1584 result.append(text.current()); 1585 text.next(); 1586 } 1587 text.setIndex(offset); 1588 return result.toString(); 1589 } 1590 1591 /** 1592 * Java port of ICU4C struct UPattern (usrchimp.h) 1593 */ 1594 private static final class Pattern { 1595 /** Pattern string */ 1596 String text_; 1597 1598 long[] PCE_; 1599 int PCELength_ = 0; 1600 1601 // TODO: We probably do not need CE_ / CELength_ 1602 @SuppressWarnings("unused") 1603 int[] CE_; 1604 int CELength_ = 0; 1605 1606 // *** Boyer-Moore *** 1607 // boolean hasPrefixAccents_ = false; 1608 // boolean hasSuffixAccents_ = false; 1609 // int defaultShiftSize_; 1610 // char[] shift_; 1611 // char[] backShift_; 1612 1613 protected Pattern(String pattern) { 1614 text_ = pattern; 1615 } 1616 } 1617 1618 /** 1619 * Java port of ICU4C UCollationPCE (usrchimp.h) 1620 */ 1621 private static class CollationPCE { 1622 public static final long PROCESSED_NULLORDER = -1; 1623 1624 private static final int DEFAULT_BUFFER_SIZE = 16; 1625 private static final int BUFFER_GROW = 8; 1626 1627 // Note: PRIMARYORDERMASK is also duplicated in StringSearch class 1628 private static final int PRIMARYORDERMASK = 0xffff0000; 1629 private static final int CONTINUATION_MARKER = 0xc0; 1630 1631 private PCEBuffer pceBuffer_ = new PCEBuffer(); 1632 private CollationElementIterator cei_; 1633 private int strength_; 1634 private boolean toShift_; 1635 private boolean isShifted_; 1636 private int variableTop_; 1637 1638 public CollationPCE(CollationElementIterator iter) { 1639 init(iter); 1640 } 1641 1642 public void init(CollationElementIterator iter) { 1643 cei_ = iter; 1644 init(iter.getRuleBasedCollator()); 1645 } 1646 1647 private void init(RuleBasedCollator coll) { 1648 strength_ = coll.getStrength(); 1649 toShift_ = coll.isAlternateHandlingShifted(); 1650 isShifted_ = false; 1651 variableTop_ = coll.getVariableTop(); 1652 } 1653 1654 @SuppressWarnings("fallthrough") 1655 private long processCE(int ce) { 1656 long primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 1657 1658 // This is clean, but somewhat slow... 1659 // We could apply the mask to ce and then 1660 // just get all three orders... 1661 switch (strength_) { 1662 default: 1663 tertiary = CollationElementIterator.tertiaryOrder(ce); 1664 /* note fall-through */ 1665 1666 case Collator.SECONDARY: 1667 secondary = CollationElementIterator.secondaryOrder(ce); 1668 /* note fall-through */ 1669 1670 case Collator.PRIMARY: 1671 primary = CollationElementIterator.primaryOrder(ce); 1672 } 1673 1674 // **** This should probably handle continuations too. **** 1675 // **** That means that we need 24 bits for the primary **** 1676 // **** instead of the 16 that we're currently using. **** 1677 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 1678 // **** Another complication with continuations is that **** 1679 // **** the *second* CE is marked as a continuation, so **** 1680 // **** we always have to peek ahead to know how long **** 1681 // **** the primary is... **** 1682 if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) { 1683 1684 if (primary == 0) { 1685 return CollationElementIterator.IGNORABLE; 1686 } 1687 1688 if (strength_ >= Collator.QUATERNARY) { 1689 quaternary = primary; 1690 } 1691 1692 primary = secondary = tertiary = 0; 1693 isShifted_ = true; 1694 } else { 1695 if (strength_ >= Collator.QUATERNARY) { 1696 quaternary = 0xFFFF; 1697 } 1698 1699 isShifted_ = false; 1700 } 1701 1702 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 1703 } 1704 1705 /** 1706 * Get the processed ordering priority of the next collation element in the text. 1707 * A single character may contain more than one collation element. 1708 * 1709 * Note: This is equivalent to 1710 * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 1711 * 1712 * @param range receiving the iterator index before/after fetching the CE. 1713 * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER 1714 * if an error has occurred or if the end of string has been reached 1715 */ 1716 public long nextProcessed(Range range) { 1717 long result = CollationElementIterator.IGNORABLE; 1718 int low = 0, high = 0; 1719 1720 pceBuffer_.reset(); 1721 1722 do { 1723 low = cei_.getOffset(); 1724 int ce = cei_.next(); 1725 high = cei_.getOffset(); 1726 1727 if (ce == CollationElementIterator.NULLORDER) { 1728 result = PROCESSED_NULLORDER; 1729 break; 1730 } 1731 1732 result = processCE(ce); 1733 } while (result == CollationElementIterator.IGNORABLE); 1734 1735 if (range != null) { 1736 range.ixLow_ = low; 1737 range.ixHigh_ = high; 1738 } 1739 1740 return result; 1741 } 1742 1743 /** 1744 * Get the processed ordering priority of the previous collation element in the text. 1745 * A single character may contain more than one collation element. 1746 * 1747 * Note: This is equivalent to 1748 * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 1749 * 1750 * @param range receiving the iterator index before/after fetching the CE. 1751 * @return The previous collation elements ordering, otherwise returns 1752 * PROCESSED_NULLORDER if an error has occurred or if the start of 1753 * string has been reached. 1754 */ 1755 public long previousProcessed(Range range) { 1756 long result = CollationElementIterator.IGNORABLE; 1757 int low = 0, high = 0; 1758 1759 // pceBuffer_.reset(); 1760 1761 while (pceBuffer_.empty()) { 1762 // buffer raw CEs up to non-ignorable primary 1763 RCEBuffer rceb = new RCEBuffer(); 1764 int ce; 1765 1766 boolean finish = false; 1767 1768 // **** do we need to reset rceb, or will it always be empty at this point **** 1769 do { 1770 high = cei_.getOffset(); 1771 ce = cei_.previous(); 1772 low = cei_.getOffset(); 1773 1774 if (ce == CollationElementIterator.NULLORDER) { 1775 if (!rceb.empty()) { 1776 break; 1777 } 1778 1779 finish = true; 1780 break; 1781 } 1782 1783 rceb.put(ce, low, high); 1784 } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce)); 1785 1786 if (finish) { 1787 break; 1788 } 1789 1790 // process the raw CEs 1791 while (!rceb.empty()) { 1792 RCEI rcei = rceb.get(); 1793 1794 result = processCE(rcei.ce_); 1795 1796 if (result != CollationElementIterator.IGNORABLE) { 1797 pceBuffer_.put(result, rcei.low_, rcei.high_); 1798 } 1799 } 1800 } 1801 1802 if (pceBuffer_.empty()) { 1803 // **** Is -1 the right value for ixLow, ixHigh? **** 1804 if (range != null) { 1805 range.ixLow_ = -1; 1806 range.ixHigh_ = -1; 1807 } 1808 return CollationElementIterator.NULLORDER; 1809 } 1810 1811 PCEI pcei = pceBuffer_.get(); 1812 1813 if (range != null) { 1814 range.ixLow_ = pcei.low_; 1815 range.ixHigh_ = pcei.high_; 1816 } 1817 1818 return pcei.ce_; 1819 } 1820 1821 private static boolean isContinuation(int ce) { 1822 return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER); 1823 } 1824 1825 public static final class Range { 1826 int ixLow_; 1827 int ixHigh_; 1828 } 1829 1830 /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */ 1831 private static final class PCEI { 1832 long ce_; 1833 int low_; 1834 int high_; 1835 } 1836 1837 private static final class PCEBuffer { 1838 private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE]; 1839 private int bufferIndex_ = 0; 1840 1841 void reset() { 1842 bufferIndex_ = 0; 1843 } 1844 1845 boolean empty() { 1846 return bufferIndex_ <= 0; 1847 } 1848 1849 void put(long ce, int ixLow, int ixHigh) 1850 { 1851 if (bufferIndex_ >= buffer_.length) { 1852 PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW]; 1853 System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); 1854 buffer_ = newBuffer; 1855 } 1856 buffer_[bufferIndex_] = new PCEI(); 1857 buffer_[bufferIndex_].ce_ = ce; 1858 buffer_[bufferIndex_].low_ = ixLow; 1859 buffer_[bufferIndex_].high_ = ixHigh; 1860 1861 bufferIndex_ += 1; 1862 } 1863 1864 PCEI get() { 1865 if (bufferIndex_ > 0) { 1866 return buffer_[--bufferIndex_]; 1867 } 1868 return null; 1869 } 1870 } 1871 1872 /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */ 1873 private static final class RCEI { 1874 int ce_; 1875 int low_; 1876 int high_; 1877 } 1878 1879 private static final class RCEBuffer { 1880 private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE]; 1881 private int bufferIndex_ = 0; 1882 1883 boolean empty() { 1884 return bufferIndex_ <= 0; 1885 } 1886 1887 void put(int ce, int ixLow, int ixHigh) { 1888 if (bufferIndex_ >= buffer_.length) { 1889 RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW]; 1890 System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); 1891 buffer_ = newBuffer; 1892 } 1893 buffer_[bufferIndex_] = new RCEI(); 1894 buffer_[bufferIndex_].ce_ = ce; 1895 buffer_[bufferIndex_].low_ = ixLow; 1896 buffer_[bufferIndex_].high_ = ixHigh; 1897 1898 bufferIndex_ += 1; 1899 } 1900 1901 RCEI get() { 1902 if (bufferIndex_ > 0) { 1903 return buffer_[--bufferIndex_]; 1904 } 1905 return null; 1906 } 1907 } 1908 } 1909 1910 /** 1911 * Java port of ICU4C CEI (usearch.cpp) 1912 * 1913 * CEI Collation Element + source text index. 1914 * These structs are kept in the circular buffer. 1915 */ 1916 private static class CEI { 1917 long ce_; 1918 int lowIndex_; 1919 int highIndex_; 1920 } 1921 1922 /** 1923 * CEBuffer A circular buffer of CEs from the text being searched 1924 */ 1925 private static class CEBuffer { 1926 // Java porting note: ICU4C uses the size for stack buffer 1927 // static final int DEFAULT_CEBUFFER_SIZE = 96; 1928 1929 static final int CEBUFFER_EXTRA = 32; 1930 static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8; 1931 static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3; 1932 1933 CEI[] buf_; 1934 int bufSize_; 1935 int firstIx_; 1936 int limitIx_; 1937 1938 // Java porting note: No references in ICU4C implementation 1939 // CollationElementIterator ceIter_; 1940 1941 StringSearch strSearch_; 1942 1943 CEBuffer(StringSearch ss) { 1944 strSearch_ = ss; 1945 bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA; 1946 if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 1947 String patText = ss.pattern_.text_; 1948 if (patText != null) { 1949 for (int i = 0; i < patText.length(); i++) { 1950 char c = patText.charAt(i); 1951 if (MIGHT_BE_JAMO_L(c)) { 1952 bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 1953 } else { 1954 // No check for surrogates, we might allocate slightly more buffer than necessary. 1955 bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 1956 } 1957 } 1958 } 1959 } 1960 1961 // Not used - see above 1962 // ceIter_ = ss.textIter_; 1963 1964 firstIx_ = 0; 1965 limitIx_ = 0; 1966 1967 if (!ss.initTextProcessedIter()) { 1968 return; 1969 } 1970 1971 buf_ = new CEI[bufSize_]; 1972 } 1973 1974 // Get the CE with the specified index. 1975 // Index must be in the range 1976 // n-history_size < index < n+1 1977 // where n is the largest index to have been fetched by some previous call to this function. 1978 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1979 // 1980 CEI get(int index) { 1981 int i = index % bufSize_; 1982 1983 if (index >= firstIx_ && index < limitIx_) { 1984 // The request was for an entry already in our buffer. 1985 // Just return it. 1986 return buf_[i]; 1987 } 1988 1989 // Caller is requesting a new, never accessed before, CE. 1990 // Verify that it is the next one in sequence, which is all 1991 // that is allowed. 1992 if (index != limitIx_) { 1993 assert(false); 1994 return null; 1995 } 1996 1997 // Manage the circular CE buffer indexing 1998 limitIx_++; 1999 2000 if (limitIx_ - firstIx_ >= bufSize_) { 2001 // The buffer is full, knock out the lowest-indexed entry. 2002 firstIx_++; 2003 } 2004 2005 CollationPCE.Range range = new CollationPCE.Range(); 2006 if (buf_[i] == null) { 2007 buf_[i] = new CEI(); 2008 } 2009 buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range); 2010 buf_[i].lowIndex_ = range.ixLow_; 2011 buf_[i].highIndex_ = range.ixHigh_; 2012 2013 return buf_[i]; 2014 } 2015 2016 // Get the CE with the specified index. 2017 // Index must be in the range 2018 // n-history_size < index < n+1 2019 // where n is the largest index to have been fetched by some previous call to this function. 2020 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 2021 // 2022 CEI getPrevious(int index) { 2023 int i = index % bufSize_; 2024 2025 if (index >= firstIx_ && index < limitIx_) { 2026 // The request was for an entry already in our buffer. 2027 // Just return it. 2028 return buf_[i]; 2029 } 2030 2031 // Caller is requesting a new, never accessed before, CE. 2032 // Verify that it is the next one in sequence, which is all 2033 // that is allowed. 2034 if (index != limitIx_) { 2035 assert(false); 2036 return null; 2037 } 2038 2039 // Manage the circular CE buffer indexing 2040 limitIx_++; 2041 2042 if (limitIx_ - firstIx_ >= bufSize_) { 2043 // The buffer is full, knock out the lowest-indexed entry. 2044 firstIx_++; 2045 } 2046 2047 CollationPCE.Range range = new CollationPCE.Range(); 2048 if (buf_[i] == null) { 2049 buf_[i] = new CEI(); 2050 } 2051 buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range); 2052 buf_[i].lowIndex_ = range.ixLow_; 2053 buf_[i].highIndex_ = range.ixHigh_; 2054 2055 return buf_[i]; 2056 } 2057 2058 static boolean MIGHT_BE_JAMO_L(char c) { 2059 return (c >= 0x1100 && c <= 0x115E) 2060 || (c >= 0x3131 && c <= 0x314E) 2061 || (c >= 0x3165 && c <= 0x3186); 2062 } 2063 } 2064 } 2065