1 /* 2 * Copyright (C) 2008 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import com.google.common.annotations.GwtCompatible; 20 import static com.google.common.base.Preconditions.checkArgument; 21 import static com.google.common.base.Preconditions.checkNotNull; 22 23 import java.util.ArrayList; 24 import java.util.Arrays; 25 import java.util.List; 26 27 /** 28 * Determines a true or false value for any Java {@code char} value, just as 29 * {@link Predicate} does for any {@link Object}. Also offers basic text 30 * processing methods based on this function. Implementations are strongly 31 * encouraged to be side-effect-free and immutable. 32 * 33 * <p>Throughout the documentation of this class, the phrase "matching 34 * character" is used to mean "any character {@code c} for which {@code 35 * this.matches(c)} returns {@code true}". 36 * 37 * <p><b>Note:</b> This class deals only with {@code char} values; it does not 38 * understand supplementary Unicode code points in the range {@code 0x10000} to 39 * {@code 0x10FFFF}. Such logical characters are encoded into a {@code String} 40 * using surrogate pairs, and a {@code CharMatcher} treats these just as two 41 * separate characters. 42 * 43 * @author Kevin Bourrillion 44 * @since 2009.09.15 <b>tentative</b> 45 */ 46 @GwtCompatible 47 public abstract class CharMatcher implements Predicate<Character> { 48 49 // Constants 50 51 // Excludes 2000-2000a, which is handled as a range 52 private static final String BREAKING_WHITESPACE_CHARS = 53 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000"; 54 55 // Excludes 2007, which is handled as a gap in a pair of ranges 56 private static final String NON_BREAKING_WHITESPACE_CHARS = 57 "\u00a0\u180e\u202f"; 58 59 /** 60 * Determines whether a character is whitespace according to the latest 61 * Unicode standard, as illustrated 62 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 63 * This is not the same definition used by other Java APIs. See a comparison 64 * of several definitions of "whitespace" at 65 * <a href="TODO">(TODO)</a>. 66 * 67 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this 68 * constant to keep it up to date. 69 */ 70 public static final CharMatcher WHITESPACE = 71 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS) 72 .or(inRange('\u2000', '\u200a')); 73 74 /** 75 * Determines whether a character is a breaking whitespace (that is, 76 * a whitespace which can be interpreted as a break between words 77 * for formatting purposes). See {@link #WHITESPACE} for a discussion 78 * of that term. 79 * 80 * @since 2010.01.04 <b>tentative</b> 81 */ 82 public static final CharMatcher BREAKING_WHITESPACE = 83 anyOf(BREAKING_WHITESPACE_CHARS) 84 .or(inRange('\u2000', '\u2006')) 85 .or(inRange('\u2008', '\u200a')); 86 87 /** 88 * Determines whether a character is ASCII, meaning that its code point is 89 * less than 128. 90 */ 91 public static final CharMatcher ASCII = inRange('\0', '\u007f'); 92 93 /** 94 * Determines whether a character is a digit according to 95 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 96 */ 97 public static final CharMatcher DIGIT; 98 99 static { 100 CharMatcher digit = inRange('0', '9'); 101 String zeroes = 102 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 103 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 104 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 105 for (char base : zeroes.toCharArray()) { 106 digit = digit.or(inRange(base, (char) (base + 9))); 107 } 108 DIGIT = digit; 109 } 110 111 /** 112 * Determines whether a character is whitespace according to {@link 113 * Character#isWhitespace(char) Java's definition}; it is usually preferable 114 * to use {@link #WHITESPACE}. See a comparison of several definitions of 115 * "whitespace" at <a href="http://go/white+space">go/white+space</a>. 116 */ 117 public static final CharMatcher JAVA_WHITESPACE 118 = inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal 119 .or(inRange('\u001c', '\u0020')) 120 .or(is('\u1680')) 121 .or(is('\u180e')) 122 .or(inRange('\u2000', '\u2006')) 123 .or(inRange('\u2008', '\u200b')) 124 .or(inRange('\u2028', '\u2029')) 125 .or(is('\u205f')) 126 .or(is('\u3000')); 127 128 /** 129 * Determines whether a character is a digit according to {@link 130 * Character#isDigit(char) Java's definition}. If you only care to match 131 * ASCII digits, you can use {@code inRange('0', '9')}. 132 */ 133 public static final CharMatcher JAVA_DIGIT = new CharMatcher() { 134 @Override public boolean matches(char c) { 135 return Character.isDigit(c); 136 } 137 }; 138 139 /** 140 * Determines whether a character is a letter according to {@link 141 * Character#isLetter(char) Java's definition}. If you only care to match 142 * letters of the Latin alphabet, you can use {@code 143 * inRange('a', 'z').or(inRange('A', 'Z'))}. 144 */ 145 public static final CharMatcher JAVA_LETTER = new CharMatcher() { 146 @Override public boolean matches(char c) { 147 return Character.isLetter(c); 148 } 149 }; 150 151 /** 152 * Determines whether a character is a letter or digit according to {@link 153 * Character#isLetterOrDigit(char) Java's definition}. 154 */ 155 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() { 156 @Override public boolean matches(char c) { 157 return Character.isLetterOrDigit(c); 158 } 159 }; 160 161 /** 162 * Determines whether a character is upper case according to {@link 163 * Character#isUpperCase(char) Java's definition}. 164 */ 165 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() { 166 @Override public boolean matches(char c) { 167 return Character.isUpperCase(c); 168 } 169 }; 170 171 /** 172 * Determines whether a character is lower case according to {@link 173 * Character#isLowerCase(char) Java's definition}. 174 */ 175 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() { 176 @Override public boolean matches(char c) { 177 return Character.isLowerCase(c); 178 } 179 }; 180 181 /** 182 * Determines whether a character is an ISO control character according to 183 * {@link Character#isISOControl(char)}. 184 */ 185 public static final CharMatcher JAVA_ISO_CONTROL = inRange('\u0000', '\u001f') 186 .or(inRange('\u007f', '\u009f')); 187 188 /** 189 * Determines whether a character is invisible; that is, if its Unicode 190 * category is any of SPACE_SEPARATOR, LINE_SEPARATOR, 191 * PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and PRIVATE_USE according 192 * to ICU4J. 193 */ 194 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020') 195 .or(inRange('\u007f', '\u00a0')) 196 .or(is('\u00ad')) 197 .or(inRange('\u0600', '\u0603')) 198 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e")) 199 .or(inRange('\u2000', '\u200f')) 200 .or(inRange('\u2028', '\u202f')) 201 .or(inRange('\u205f', '\u2064')) 202 .or(inRange('\u206a', '\u206f')) 203 .or(is('\u3000')) 204 .or(inRange('\ud800', '\uf8ff')) 205 .or(anyOf("\ufeff\ufff9\ufffa\ufffb")); 206 207 /** 208 * Determines whether a character is single-width (not double-width). When 209 * in doubt, this matcher errs on the side of returning {@code false} (that 210 * is, it tends to assume a character is double-width). 211 * 212 * <b>Note:</b> as the reference file evolves, we will modify this constant 213 * to keep it up to date. 214 */ 215 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 216 .or(is('\u05be')) 217 .or(inRange('\u05d0', '\u05ea')) 218 .or(is('\u05f3')) 219 .or(is('\u05f4')) 220 .or(inRange('\u0600', '\u06ff')) 221 .or(inRange('\u0750', '\u077f')) 222 .or(inRange('\u0e00', '\u0e7f')) 223 .or(inRange('\u1e00', '\u20af')) 224 .or(inRange('\u2100', '\u213a')) 225 .or(inRange('\ufb50', '\ufdff')) 226 .or(inRange('\ufe70', '\ufeff')) 227 .or(inRange('\uff61', '\uffdc')); 228 229 /** Matches any character. */ 230 public static final CharMatcher ANY = new CharMatcher() { 231 @Override public boolean matches(char c) { 232 return true; 233 } 234 235 @Override public int indexIn(CharSequence sequence) { 236 return (sequence.length() == 0) ? -1 : 0; 237 } 238 @Override public int indexIn(CharSequence sequence, int start) { 239 int length = sequence.length(); 240 Preconditions.checkPositionIndex(start, length); 241 return (start == length) ? -1 : start; 242 } 243 @Override public int lastIndexIn(CharSequence sequence) { 244 return sequence.length() - 1; 245 } 246 @Override public boolean matchesAllOf(CharSequence sequence) { 247 checkNotNull(sequence); 248 return true; 249 } 250 @Override public boolean matchesNoneOf(CharSequence sequence) { 251 return sequence.length() == 0; 252 } 253 @Override public String removeFrom(CharSequence sequence) { 254 checkNotNull(sequence); 255 return ""; 256 } 257 @Override public String replaceFrom( 258 CharSequence sequence, char replacement) { 259 char[] array = new char[sequence.length()]; 260 Arrays.fill(array, replacement); 261 return new String(array); 262 } 263 @Override public String replaceFrom( 264 CharSequence sequence, CharSequence replacement) { 265 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 266 for (int i = 0; i < sequence.length(); i++) { 267 retval.append(replacement); 268 } 269 return retval.toString(); 270 } 271 @Override public String collapseFrom(CharSequence sequence, char replacement) { 272 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 273 } 274 @Override public String trimFrom(CharSequence sequence) { 275 checkNotNull(sequence); 276 return ""; 277 } 278 @Override public int countIn(CharSequence sequence) { 279 return sequence.length(); 280 } 281 @Override public CharMatcher and(CharMatcher other) { 282 return checkNotNull(other); 283 } 284 @Override public CharMatcher or(CharMatcher other) { 285 checkNotNull(other); 286 return this; 287 } 288 @Override public CharMatcher negate() { 289 return NONE; 290 } 291 @Override public CharMatcher precomputed() { 292 return this; 293 } 294 }; 295 296 /** Matches no characters. */ 297 public static final CharMatcher NONE = new CharMatcher() { 298 @Override public boolean matches(char c) { 299 return false; 300 } 301 302 @Override public int indexIn(CharSequence sequence) { 303 checkNotNull(sequence); 304 return -1; 305 } 306 @Override public int indexIn(CharSequence sequence, int start) { 307 int length = sequence.length(); 308 Preconditions.checkPositionIndex(start, length); 309 return -1; 310 } 311 @Override public int lastIndexIn(CharSequence sequence) { 312 checkNotNull(sequence); 313 return -1; 314 } 315 @Override public boolean matchesAllOf(CharSequence sequence) { 316 return sequence.length() == 0; 317 } 318 @Override public boolean matchesNoneOf(CharSequence sequence) { 319 checkNotNull(sequence); 320 return true; 321 } 322 @Override public String removeFrom(CharSequence sequence) { 323 return sequence.toString(); 324 } 325 @Override public String replaceFrom( 326 CharSequence sequence, char replacement) { 327 return sequence.toString(); 328 } 329 @Override public String replaceFrom( 330 CharSequence sequence, CharSequence replacement) { 331 checkNotNull(replacement); 332 return sequence.toString(); 333 } 334 @Override public String collapseFrom( 335 CharSequence sequence, char replacement) { 336 return sequence.toString(); 337 } 338 @Override public String trimFrom(CharSequence sequence) { 339 return sequence.toString(); 340 } 341 @Override public int countIn(CharSequence sequence) { 342 checkNotNull(sequence); 343 return 0; 344 } 345 @Override public CharMatcher and(CharMatcher other) { 346 checkNotNull(other); 347 return this; 348 } 349 @Override public CharMatcher or(CharMatcher other) { 350 return checkNotNull(other); 351 } 352 @Override public CharMatcher negate() { 353 return ANY; 354 } 355 @Override protected void setBits(LookupTable table) { 356 } 357 @Override public CharMatcher precomputed() { 358 return this; 359 } 360 }; 361 362 // Static factories 363 364 /** 365 * Returns a {@code char} matcher that matches only one specified character. 366 */ 367 public static CharMatcher is(final char match) { 368 return new CharMatcher() { 369 @Override public boolean matches(char c) { 370 return c == match; 371 } 372 373 @Override public String replaceFrom( 374 CharSequence sequence, char replacement) { 375 return sequence.toString().replace(match, replacement); 376 } 377 @Override public CharMatcher and(CharMatcher other) { 378 return other.matches(match) ? this : NONE; 379 } 380 @Override public CharMatcher or(CharMatcher other) { 381 return other.matches(match) ? other : super.or(other); 382 } 383 @Override public CharMatcher negate() { 384 return isNot(match); 385 } 386 @Override protected void setBits(LookupTable table) { 387 table.set(match); 388 } 389 @Override public CharMatcher precomputed() { 390 return this; 391 } 392 }; 393 } 394 395 /** 396 * Returns a {@code char} matcher that matches any character except the one 397 * specified. 398 * 399 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 400 */ 401 public static CharMatcher isNot(final char match) { 402 return new CharMatcher() { 403 @Override public boolean matches(char c) { 404 return c != match; 405 } 406 407 @Override public CharMatcher and(CharMatcher other) { 408 return other.matches(match) ? super.and(other) : other; 409 } 410 @Override public CharMatcher or(CharMatcher other) { 411 return other.matches(match) ? ANY : this; 412 } 413 @Override public CharMatcher negate() { 414 return is(match); 415 } 416 }; 417 } 418 419 /** 420 * Returns a {@code char} matcher that matches any character present in the 421 * given character sequence. 422 */ 423 public static CharMatcher anyOf(final CharSequence sequence) { 424 switch (sequence.length()) { 425 case 0: 426 return NONE; 427 case 1: 428 return is(sequence.charAt(0)); 429 case 2: 430 final char match1 = sequence.charAt(0); 431 final char match2 = sequence.charAt(1); 432 return new CharMatcher() { 433 @Override public boolean matches(char c) { 434 return c == match1 || c == match2; 435 } 436 @Override protected void setBits(LookupTable table) { 437 table.set(match1); 438 table.set(match2); 439 } 440 @Override public CharMatcher precomputed() { 441 return this; 442 } 443 }; 444 } 445 446 final char[] chars = sequence.toString().toCharArray(); 447 Arrays.sort(chars); // not worth collapsing duplicates 448 449 return new CharMatcher() { 450 @Override public boolean matches(char c) { 451 return Arrays.binarySearch(chars, c) >= 0; 452 } 453 @Override protected void setBits(LookupTable table) { 454 for (char c : chars) { 455 table.set(c); 456 } 457 } 458 }; 459 } 460 461 /** 462 * Returns a {@code char} matcher that matches any character not present in 463 * the given character sequence. 464 */ 465 public static CharMatcher noneOf(CharSequence sequence) { 466 return anyOf(sequence).negate(); 467 } 468 469 /** 470 * Returns a {@code char} matcher that matches any character in a given range 471 * (both endpoints are inclusive). For example, to match any lowercase letter 472 * of the English alphabet, use {@code CharMatcher.inRange('a', 'z')}. 473 * 474 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 475 */ 476 public static CharMatcher inRange( 477 final char startInclusive, final char endInclusive) { 478 checkArgument(endInclusive >= startInclusive); 479 return new CharMatcher() { 480 @Override public boolean matches(char c) { 481 return startInclusive <= c && c <= endInclusive; 482 } 483 @Override protected void setBits(LookupTable table) { 484 char c = startInclusive; 485 while (true) { 486 table.set(c); 487 if (c++ == endInclusive) { 488 break; 489 } 490 } 491 } 492 @Override public CharMatcher precomputed() { 493 return this; 494 } 495 }; 496 } 497 498 /** 499 * Returns a matcher with identical behavior to the given {@link 500 * Character}-based predicate, but which operates on primitive {@code char} 501 * instances instead. 502 */ 503 public static CharMatcher forPredicate( 504 final Predicate<? super Character> predicate) { 505 checkNotNull(predicate); 506 if (predicate instanceof CharMatcher) { 507 return (CharMatcher) predicate; 508 } 509 return new CharMatcher() { 510 @Override public boolean matches(char c) { 511 return predicate.apply(c); 512 } 513 @Override public boolean apply(Character character) { 514 return predicate.apply(checkNotNull(character)); 515 } 516 }; 517 } 518 519 // Abstract methods 520 521 /** Determines a true or false value for the given character. */ 522 public abstract boolean matches(char c); 523 524 // Non-static factories 525 526 /** 527 * Returns a matcher that matches any character not matched by this matcher. 528 */ 529 public CharMatcher negate() { 530 final CharMatcher original = this; 531 return new CharMatcher() { 532 @Override public boolean matches(char c) { 533 return !original.matches(c); 534 } 535 536 @Override public boolean matchesAllOf(CharSequence sequence) { 537 return original.matchesNoneOf(sequence); 538 } 539 @Override public boolean matchesNoneOf(CharSequence sequence) { 540 return original.matchesAllOf(sequence); 541 } 542 @Override public int countIn(CharSequence sequence) { 543 return sequence.length() - original.countIn(sequence); 544 } 545 @Override public CharMatcher negate() { 546 return original; 547 } 548 }; 549 } 550 551 /** 552 * Returns a matcher that matches any character matched by both this matcher 553 * and {@code other}. 554 */ 555 public CharMatcher and(CharMatcher other) { 556 return new And(Arrays.asList(this, checkNotNull(other))); 557 } 558 559 private static class And extends CharMatcher { 560 List<CharMatcher> components; 561 562 And(List<CharMatcher> components) { 563 this.components = components; // Skip defensive copy (private) 564 } 565 566 @Override public boolean matches(char c) { 567 for (CharMatcher matcher : components) { 568 if (!matcher.matches(c)) { 569 return false; 570 } 571 } 572 return true; 573 } 574 575 @Override public CharMatcher and(CharMatcher other) { 576 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 577 newComponents.add(checkNotNull(other)); 578 return new And(newComponents); 579 } 580 } 581 582 /** 583 * Returns a matcher that matches any character matched by either this matcher 584 * or {@code other}. 585 */ 586 public CharMatcher or(CharMatcher other) { 587 return new Or(Arrays.asList(this, checkNotNull(other))); 588 } 589 590 private static class Or extends CharMatcher { 591 List<CharMatcher> components; 592 593 Or(List<CharMatcher> components) { 594 this.components = components; // Skip defensive copy (private) 595 } 596 597 @Override public boolean matches(char c) { 598 for (CharMatcher matcher : components) { 599 if (matcher.matches(c)) { 600 return true; 601 } 602 } 603 return false; 604 } 605 606 @Override public CharMatcher or(CharMatcher other) { 607 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 608 newComponents.add(checkNotNull(other)); 609 return new Or(newComponents); 610 } 611 612 @Override protected void setBits(LookupTable table) { 613 for (CharMatcher matcher : components) { 614 matcher.setBits(table); 615 } 616 } 617 } 618 619 /** 620 * Returns a {@code char} matcher functionally equivalent to this one, but 621 * which may be faster to query than the original; your mileage may vary. 622 * Precomputation takes time and is likely to be worthwhile only if the 623 * precomputed matcher is queried many thousands of times. 624 * 625 * <p>This method has no effect (returns {@code this}) when called in GWT: 626 * it's unclear whether a precomputed matcher is faster, but it certainly 627 * consumes more memory, which doesn't seem like a worthwhile tradeoff in a 628 * browser. 629 */ 630 public CharMatcher precomputed() { 631 return Platform.precomputeCharMatcher(this); 632 } 633 634 /** 635 * This is the actual implementation of {@link #precomputed}, but we bounce 636 * calls through a method on {@link Platform} so that we can have different 637 * behavior in GWT. 638 * 639 * <p>The default precomputation is to cache the configuration of the original 640 * matcher in an eight-kilobyte bit array. In some situations this produces a 641 * matcher which is faster to query than the original. 642 * 643 * <p>The default implementation creates a new bit array and passes it to 644 * {@link #setBits(LookupTable)}. 645 */ 646 CharMatcher precomputedInternal() { 647 final LookupTable table = new LookupTable(); 648 setBits(table); 649 650 return new CharMatcher() { 651 @Override public boolean matches(char c) { 652 return table.get(c); 653 } 654 655 // TODO: make methods like negate() smart 656 657 @Override public CharMatcher precomputed() { 658 return this; 659 } 660 }; 661 } 662 663 /** 664 * For use by implementors; sets the bit corresponding to each character ('\0' 665 * to '{@literal \}uFFFF') that matches this matcher in the given bit array, 666 * leaving all other bits untouched. 667 * 668 * <p>The default implementation loops over every possible character value, 669 * invoking {@link #matches} for each one. 670 */ 671 protected void setBits(LookupTable table) { 672 char c = Character.MIN_VALUE; 673 while (true) { 674 if (matches(c)) { 675 table.set(c); 676 } 677 if (c++ == Character.MAX_VALUE) { 678 break; 679 } 680 } 681 } 682 683 /** 684 * A bit array with one bit per {@code char} value, used by {@link 685 * CharMatcher#precomputed}. 686 * 687 * <p>TODO: possibly share a common BitArray class with BloomFilter 688 * and others... a simpler java.util.BitSet. 689 */ 690 protected static class LookupTable { 691 int[] data = new int[2048]; 692 693 void set(char index) { 694 data[index >> 5] |= (1 << index); 695 } 696 boolean get(char index) { 697 return (data[index >> 5] & (1 << index)) != 0; 698 } 699 } 700 701 // Text processing routines 702 703 /** 704 * Returns {@code true} if a character sequence contains only matching 705 * characters. 706 * 707 * <p>The default implementation iterates over the sequence, invoking {@link 708 * #matches} for each character, until this returns {@code false} or the end 709 * is reached. 710 * 711 * @param sequence the character sequence to examine, possibly empty 712 * @return {@code true} if this matcher matches every character in the 713 * sequence, including when the sequence is empty 714 */ 715 public boolean matchesAllOf(CharSequence sequence) { 716 for (int i = sequence.length() - 1; i >= 0; i--) { 717 if (!matches(sequence.charAt(i))) { 718 return false; 719 } 720 } 721 return true; 722 } 723 724 /** 725 * Returns {@code true} if a character sequence contains no matching 726 * characters. 727 * 728 * <p>The default implementation iterates over the sequence, invoking {@link 729 * #matches} for each character, until this returns {@code false} or the end is 730 * reached. 731 * 732 * @param sequence the character sequence to examine, possibly empty 733 * @return {@code true} if this matcher matches every character in the 734 * sequence, including when the sequence is empty 735 */ 736 public boolean matchesNoneOf(CharSequence sequence) { 737 return indexIn(sequence) == -1; 738 } 739 740 // TODO: perhaps add matchesAnyOf() 741 742 /** 743 * Returns the index of the first matching character in a character sequence, 744 * or {@code -1} if no matching character is present. 745 * 746 * <p>The default implementation iterates over the sequence in forward order 747 * calling {@link #matches} for each character. 748 * 749 * @param sequence the character sequence to examine from the beginning 750 * @return an index, or {@code -1} if no character matches 751 */ 752 public int indexIn(CharSequence sequence) { 753 int length = sequence.length(); 754 for (int i = 0; i < length; i++) { 755 if (matches(sequence.charAt(i))) { 756 return i; 757 } 758 } 759 return -1; 760 } 761 762 /** 763 * Returns the index of the first matching character in a character sequence, 764 * starting from a given position, or {@code -1} if no character matches after 765 * that position. 766 * 767 * <p>The default implementation iterates over the sequence in forward order, 768 * beginning at {@code start}, calling {@link #matches} for each character. 769 * 770 * @param sequence the character sequence to examine 771 * @param start the first index to examine; must be nonnegative and no 772 * greater than {@code sequence.length()} 773 * @return the index of the first matching character, guaranteed to be no less 774 * than {@code start}, or {@code -1} if no character matches 775 * @throws IndexOutOfBoundsException if start is negative or greater than 776 * {@code sequence.length()} 777 */ 778 public int indexIn(CharSequence sequence, int start) { 779 int length = sequence.length(); 780 Preconditions.checkPositionIndex(start, length); 781 for (int i = start; i < length; i++) { 782 if (matches(sequence.charAt(i))) { 783 return i; 784 } 785 } 786 return -1; 787 } 788 789 /** 790 * Returns the index of the last matching character in a character sequence, 791 * or {@code -1} if no matching character is present. 792 * 793 * <p>The default implementation iterates over the sequence in reverse order 794 * calling {@link #matches} for each character. 795 * 796 * @param sequence the character sequence to examine from the end 797 * @return an index, or {@code -1} if no character matches 798 */ 799 public int lastIndexIn(CharSequence sequence) { 800 for (int i = sequence.length() - 1; i >= 0; i--) { 801 if (matches(sequence.charAt(i))) { 802 return i; 803 } 804 } 805 return -1; 806 } 807 808 /** 809 * Returns the number of matching characters found in a character sequence. 810 */ 811 public int countIn(CharSequence sequence) { 812 int count = 0; 813 for (int i = 0; i < sequence.length(); i++) { 814 if (matches(sequence.charAt(i))) { 815 count++; 816 } 817 } 818 return count; 819 } 820 821 /** 822 * Returns a string containing all non-matching characters of a character 823 * sequence, in order. For example: <pre> {@code 824 * 825 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 826 * 827 * ... returns {@code "bzr"}. 828 */ 829 public String removeFrom(CharSequence sequence) { 830 String string = sequence.toString(); 831 int pos = indexIn(string); 832 if (pos == -1) { 833 return string; 834 } 835 836 char[] chars = string.toCharArray(); 837 int spread = 1; 838 839 // This unusual loop comes from extensive benchmarking 840 OUT: 841 while (true) { 842 pos++; 843 while (true) { 844 if (pos == chars.length) { 845 break OUT; 846 } 847 if (matches(chars[pos])) { 848 break; 849 } 850 chars[pos - spread] = chars[pos]; 851 pos++; 852 } 853 spread++; 854 } 855 return new String(chars, 0, pos - spread); 856 } 857 858 /** 859 * Returns a string containing all matching characters of a character 860 * sequence, in order. For example: <pre> {@code 861 * 862 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 863 * 864 * ... returns {@code "aaa"}. 865 */ 866 public String retainFrom(CharSequence sequence) { 867 return negate().removeFrom(sequence); 868 } 869 870 /** 871 * Returns a string copy of the input character sequence, with each character 872 * that matches this matcher replaced by a given replacement character. For 873 * example: <pre> {@code 874 * 875 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 876 * 877 * ... returns {@code "rodor"}. 878 * 879 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find 880 * the first matching character, then iterates the remainder of the sequence 881 * calling {@link #matches(char)} for each character. 882 * 883 * @param sequence the character sequence to replace matching characters in 884 * @param replacement the character to append to the result string in place of 885 * each matching character in {@code sequence} 886 * @return the new string 887 */ 888 public String replaceFrom(CharSequence sequence, char replacement) { 889 String string = sequence.toString(); 890 int pos = indexIn(string); 891 if (pos == -1) { 892 return string; 893 } 894 char[] chars = string.toCharArray(); 895 chars[pos] = replacement; 896 for (int i = pos + 1; i < chars.length; i++) { 897 if (matches(chars[i])) { 898 chars[i] = replacement; 899 } 900 } 901 return new String(chars); 902 } 903 904 /** 905 * Returns a string copy of the input character sequence, with each character 906 * that matches this matcher replaced by a given replacement sequence. For 907 * example: <pre> {@code 908 * 909 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 910 * 911 * ... returns {@code "yoohoo"}. 912 * 913 * <p><b>Note:</b> If the replacement is a fixed string with only one character, 914 * you are better off calling {@link #replaceFrom(CharSequence, char)} directly. 915 * 916 * @param sequence the character sequence to replace matching characters in 917 * @param replacement the characters to append to the result string in place 918 * of each matching character in {@code sequence} 919 * @return the new string 920 */ 921 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 922 int replacementLen = replacement.length(); 923 if (replacementLen == 0) { 924 return removeFrom(sequence); 925 } 926 if (replacementLen == 1) { 927 return replaceFrom(sequence, replacement.charAt(0)); 928 } 929 930 String string = sequence.toString(); 931 int pos = indexIn(string); 932 if (pos == -1) { 933 return string; 934 } 935 936 int len = string.length(); 937 StringBuilder buf = new StringBuilder((int) (len * 1.5) + 16); 938 939 int oldpos = 0; 940 do { 941 buf.append(string, oldpos, pos); 942 buf.append(replacement); 943 oldpos = pos + 1; 944 pos = indexIn(string, oldpos); 945 } while (pos != -1); 946 947 buf.append(string, oldpos, len); 948 return buf.toString(); 949 } 950 951 /** 952 * Returns a substring of the input character sequence that omits all 953 * characters this matcher matches from the beginning and from the end of the 954 * string. For example: <pre> {@code 955 * 956 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 957 * 958 * ... returns {@code "cat"}. 959 * 960 * <p>Note that<pre> {@code 961 * 962 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 963 * 964 * ... is equivalent to {@link String#trim()}. 965 */ 966 public String trimFrom(CharSequence sequence) { 967 int len = sequence.length(); 968 int first; 969 int last; 970 971 for (first = 0; first < len; first++) { 972 if (!matches(sequence.charAt(first))) { 973 break; 974 } 975 } 976 for (last = len - 1; last > first; last--) { 977 if (!matches(sequence.charAt(last))) { 978 break; 979 } 980 } 981 982 return sequence.subSequence(first, last + 1).toString(); 983 } 984 985 /** 986 * Returns a substring of the input character sequence that omits all 987 * characters this matcher matches from the beginning of the 988 * string. For example: <pre> {@code 989 * 990 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 991 * 992 * ... returns {@code "catbab"}. 993 */ 994 public String trimLeadingFrom(CharSequence sequence) { 995 int len = sequence.length(); 996 int first; 997 998 for (first = 0; first < len; first++) { 999 if (!matches(sequence.charAt(first))) { 1000 break; 1001 } 1002 } 1003 1004 return sequence.subSequence(first, len).toString(); 1005 } 1006 1007 /** 1008 * Returns a substring of the input character sequence that omits all 1009 * characters this matcher matches from the end of the 1010 * string. For example: <pre> {@code 1011 * 1012 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1013 * 1014 * ... returns {@code "abacat"}. 1015 */ 1016 public String trimTrailingFrom(CharSequence sequence) { 1017 int len = sequence.length(); 1018 int last; 1019 1020 for (last = len - 1; last >= 0; last--) { 1021 if (!matches(sequence.charAt(last))) { 1022 break; 1023 } 1024 } 1025 1026 return sequence.subSequence(0, last + 1).toString(); 1027 } 1028 1029 /** 1030 * Returns a string copy of the input character sequence, with each group of 1031 * consecutive characters that match this matcher replaced by a single 1032 * replacement character. For example: <pre> {@code 1033 * 1034 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1035 * 1036 * ... returns {@code "b-p-r"}. 1037 * 1038 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find 1039 * the first matching character, then iterates the remainder of the sequence 1040 * calling {@link #matches(char)} for each character. 1041 * 1042 * @param sequence the character sequence to replace matching groups of 1043 * characters in 1044 * @param replacement the character to append to the result string in place of 1045 * each group of matching characters in {@code sequence} 1046 * @return the new string 1047 */ 1048 public String collapseFrom(CharSequence sequence, char replacement) { 1049 int first = indexIn(sequence); 1050 if (first == -1) { 1051 return sequence.toString(); 1052 } 1053 1054 // TODO: this implementation can probably be made faster. 1055 1056 StringBuilder builder = new StringBuilder(sequence.length()) 1057 .append(sequence.subSequence(0, first)) 1058 .append(replacement); 1059 boolean in = true; 1060 for (int i = first + 1; i < sequence.length(); i++) { 1061 char c = sequence.charAt(i); 1062 if (apply(c)) { 1063 if (!in) { 1064 builder.append(replacement); 1065 in = true; 1066 } 1067 } else { 1068 builder.append(c); 1069 in = false; 1070 } 1071 } 1072 return builder.toString(); 1073 } 1074 1075 /** 1076 * Collapses groups of matching characters exactly as {@link #collapseFrom} 1077 * does, except that groups of matching characters at the start or end of the 1078 * sequence are removed without replacement. 1079 */ 1080 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1081 int first = negate().indexIn(sequence); 1082 if (first == -1) { 1083 return ""; // everything matches. nothing's left. 1084 } 1085 StringBuilder builder = new StringBuilder(sequence.length()); 1086 boolean inMatchingGroup = false; 1087 for (int i = first; i < sequence.length(); i++) { 1088 char c = sequence.charAt(i); 1089 if (apply(c)) { 1090 inMatchingGroup = true; 1091 } else { 1092 if (inMatchingGroup) { 1093 builder.append(replacement); 1094 inMatchingGroup = false; 1095 } 1096 builder.append(c); 1097 } 1098 } 1099 return builder.toString(); 1100 } 1101 1102 // Predicate interface 1103 1104 /** 1105 * Returns {@code true} if this matcher matches the given character. 1106 * 1107 * @throws NullPointerException if {@code character} is null 1108 */ 1109 /*@Override*/ public boolean apply(Character character) { 1110 return matches(character); 1111 } 1112 } 1113