Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2008 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.base;
     18 
     19 import com.google.common.annotations.GwtCompatible;
     20 import static com.google.common.base.Preconditions.checkArgument;
     21 import static com.google.common.base.Preconditions.checkNotNull;
     22 
     23 import java.util.ArrayList;
     24 import java.util.Arrays;
     25 import java.util.List;
     26 
     27 /**
     28  * Determines a true or false value for any Java {@code char} value, just as
     29  * {@link Predicate} does for any {@link Object}. Also offers basic text
     30  * processing methods based on this function. Implementations are strongly
     31  * encouraged to be side-effect-free and immutable.
     32  *
     33  * <p>Throughout the documentation of this class, the phrase "matching
     34  * character" is used to mean "any character {@code c} for which {@code
     35  * this.matches(c)} returns {@code true}".
     36  *
     37  * <p><b>Note:</b> This class deals only with {@code char} values; it does not
     38  * understand supplementary Unicode code points in the range {@code 0x10000} to
     39  * {@code 0x10FFFF}. Such logical characters are encoded into a {@code String}
     40  * using surrogate pairs, and a {@code CharMatcher} treats these just as two
     41  * separate characters.
     42  *
     43  * @author Kevin Bourrillion
     44  * @since 2009.09.15 <b>tentative</b>
     45  */
     46 @GwtCompatible
     47 public abstract class CharMatcher implements Predicate<Character> {
     48 
     49   // Constants
     50 
     51   // Excludes 2000-2000a, which is handled as a range
     52   private static final String BREAKING_WHITESPACE_CHARS =
     53       "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
     54 
     55   // Excludes 2007, which is handled as a gap in a pair of ranges
     56   private static final String NON_BREAKING_WHITESPACE_CHARS =
     57       "\u00a0\u180e\u202f";
     58 
     59   /**
     60    * Determines whether a character is whitespace according to the latest
     61    * Unicode standard, as illustrated
     62    * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
     63    * This is not the same definition used by other Java APIs. See a comparison
     64    * of several definitions of "whitespace" at
     65    * <a href="TODO">(TODO)</a>.
     66    *
     67    * <p><b>Note:</b> as the Unicode definition evolves, we will modify this
     68    * constant to keep it up to date.
     69    */
     70   public static final CharMatcher WHITESPACE =
     71       anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
     72           .or(inRange('\u2000', '\u200a'));
     73 
     74   /**
     75    * Determines whether a character is a breaking whitespace (that is,
     76    * a whitespace which can be interpreted as a break between words
     77    * for formatting purposes).  See {@link #WHITESPACE} for a discussion
     78    * of that term.
     79    *
     80    * @since 2010.01.04 <b>tentative</b>
     81    */
     82   public static final CharMatcher BREAKING_WHITESPACE =
     83       anyOf(BREAKING_WHITESPACE_CHARS)
     84           .or(inRange('\u2000', '\u2006'))
     85           .or(inRange('\u2008', '\u200a'));
     86 
     87   /**
     88    * Determines whether a character is ASCII, meaning that its code point is
     89    * less than 128.
     90    */
     91   public static final CharMatcher ASCII = inRange('\0', '\u007f');
     92 
     93   /**
     94    * Determines whether a character is a digit according to
     95    * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
     96    */
     97   public static final CharMatcher DIGIT;
     98 
     99   static {
    100     CharMatcher digit = inRange('0', '9');
    101     String zeroes =
    102         "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
    103             + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
    104             + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
    105     for (char base : zeroes.toCharArray()) {
    106       digit = digit.or(inRange(base, (char) (base + 9)));
    107     }
    108     DIGIT = digit;
    109   }
    110 
    111   /**
    112    * Determines whether a character is whitespace according to {@link
    113    * Character#isWhitespace(char) Java's definition}; it is usually preferable
    114    * to use {@link #WHITESPACE}. See a comparison of several definitions of
    115    * "whitespace" at <a href="http://go/white+space">go/white+space</a>.
    116    */
    117   public static final CharMatcher JAVA_WHITESPACE
    118       = inRange('\u0009', (char) 13)  // \\u000d doesn't work as a char literal
    119       .or(inRange('\u001c', '\u0020'))
    120       .or(is('\u1680'))
    121       .or(is('\u180e'))
    122       .or(inRange('\u2000', '\u2006'))
    123       .or(inRange('\u2008', '\u200b'))
    124       .or(inRange('\u2028', '\u2029'))
    125       .or(is('\u205f'))
    126       .or(is('\u3000'));
    127 
    128   /**
    129    * Determines whether a character is a digit according to {@link
    130    * Character#isDigit(char) Java's definition}. If you only care to match
    131    * ASCII digits, you can use {@code inRange('0', '9')}.
    132    */
    133   public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
    134     @Override public boolean matches(char c) {
    135       return Character.isDigit(c);
    136     }
    137   };
    138 
    139   /**
    140    * Determines whether a character is a letter according to {@link
    141    * Character#isLetter(char) Java's definition}. If you only care to match
    142    * letters of the Latin alphabet, you can use {@code
    143    * inRange('a', 'z').or(inRange('A', 'Z'))}.
    144    */
    145   public static final CharMatcher JAVA_LETTER = new CharMatcher() {
    146     @Override public boolean matches(char c) {
    147       return Character.isLetter(c);
    148     }
    149   };
    150 
    151   /**
    152    * Determines whether a character is a letter or digit according to {@link
    153    * Character#isLetterOrDigit(char) Java's definition}.
    154    */
    155   public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
    156     @Override public boolean matches(char c) {
    157       return Character.isLetterOrDigit(c);
    158     }
    159   };
    160 
    161   /**
    162    * Determines whether a character is upper case according to {@link
    163    * Character#isUpperCase(char) Java's definition}.
    164    */
    165   public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
    166     @Override public boolean matches(char c) {
    167       return Character.isUpperCase(c);
    168     }
    169   };
    170 
    171   /**
    172    * Determines whether a character is lower case according to {@link
    173    * Character#isLowerCase(char) Java's definition}.
    174    */
    175   public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
    176     @Override public boolean matches(char c) {
    177       return Character.isLowerCase(c);
    178     }
    179   };
    180 
    181   /**
    182    * Determines whether a character is an ISO control character according to
    183    * {@link Character#isISOControl(char)}.
    184    */
    185   public static final CharMatcher JAVA_ISO_CONTROL = inRange('\u0000', '\u001f')
    186       .or(inRange('\u007f', '\u009f'));
    187 
    188   /**
    189    * Determines whether a character is invisible; that is, if its Unicode
    190    * category is any of SPACE_SEPARATOR, LINE_SEPARATOR,
    191    * PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and PRIVATE_USE according
    192    * to ICU4J.
    193    */
    194   public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
    195       .or(inRange('\u007f', '\u00a0'))
    196       .or(is('\u00ad'))
    197       .or(inRange('\u0600', '\u0603'))
    198       .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
    199       .or(inRange('\u2000', '\u200f'))
    200       .or(inRange('\u2028', '\u202f'))
    201       .or(inRange('\u205f', '\u2064'))
    202       .or(inRange('\u206a', '\u206f'))
    203       .or(is('\u3000'))
    204       .or(inRange('\ud800', '\uf8ff'))
    205       .or(anyOf("\ufeff\ufff9\ufffa\ufffb"));
    206 
    207   /**
    208    * Determines whether a character is single-width (not double-width).  When
    209    * in doubt, this matcher errs on the side of returning {@code false} (that
    210    * is, it tends to assume a character is double-width).
    211    *
    212    * <b>Note:</b> as the reference file evolves, we will modify this constant
    213    * to keep it up to date.
    214    */
    215   public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
    216       .or(is('\u05be'))
    217       .or(inRange('\u05d0', '\u05ea'))
    218       .or(is('\u05f3'))
    219       .or(is('\u05f4'))
    220       .or(inRange('\u0600', '\u06ff'))
    221       .or(inRange('\u0750', '\u077f'))
    222       .or(inRange('\u0e00', '\u0e7f'))
    223       .or(inRange('\u1e00', '\u20af'))
    224       .or(inRange('\u2100', '\u213a'))
    225       .or(inRange('\ufb50', '\ufdff'))
    226       .or(inRange('\ufe70', '\ufeff'))
    227       .or(inRange('\uff61', '\uffdc'));
    228 
    229   /** Matches any character. */
    230   public static final CharMatcher ANY = new CharMatcher() {
    231     @Override public boolean matches(char c) {
    232       return true;
    233     }
    234 
    235     @Override public int indexIn(CharSequence sequence) {
    236       return (sequence.length() == 0) ? -1 : 0;
    237     }
    238     @Override public int indexIn(CharSequence sequence, int start) {
    239       int length = sequence.length();
    240       Preconditions.checkPositionIndex(start, length);
    241       return (start == length) ? -1 : start;
    242     }
    243     @Override public int lastIndexIn(CharSequence sequence) {
    244       return sequence.length() - 1;
    245     }
    246     @Override public boolean matchesAllOf(CharSequence sequence) {
    247       checkNotNull(sequence);
    248       return true;
    249     }
    250     @Override public boolean matchesNoneOf(CharSequence sequence) {
    251       return sequence.length() == 0;
    252     }
    253     @Override public String removeFrom(CharSequence sequence) {
    254       checkNotNull(sequence);
    255       return "";
    256     }
    257     @Override public String replaceFrom(
    258         CharSequence sequence, char replacement) {
    259       char[] array = new char[sequence.length()];
    260       Arrays.fill(array, replacement);
    261       return new String(array);
    262     }
    263     @Override public String replaceFrom(
    264         CharSequence sequence, CharSequence replacement) {
    265       StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
    266       for (int i = 0; i < sequence.length(); i++) {
    267         retval.append(replacement);
    268       }
    269       return retval.toString();
    270     }
    271     @Override public String collapseFrom(CharSequence sequence, char replacement) {
    272       return (sequence.length() == 0) ? "" : String.valueOf(replacement);
    273     }
    274     @Override public String trimFrom(CharSequence sequence) {
    275       checkNotNull(sequence);
    276       return "";
    277     }
    278     @Override public int countIn(CharSequence sequence) {
    279       return sequence.length();
    280     }
    281     @Override public CharMatcher and(CharMatcher other) {
    282       return checkNotNull(other);
    283     }
    284     @Override public CharMatcher or(CharMatcher other) {
    285       checkNotNull(other);
    286       return this;
    287     }
    288     @Override public CharMatcher negate() {
    289       return NONE;
    290     }
    291     @Override public CharMatcher precomputed() {
    292       return this;
    293     }
    294   };
    295 
    296   /** Matches no characters. */
    297   public static final CharMatcher NONE = new CharMatcher() {
    298     @Override public boolean matches(char c) {
    299       return false;
    300     }
    301 
    302     @Override public int indexIn(CharSequence sequence) {
    303       checkNotNull(sequence);
    304       return -1;
    305     }
    306     @Override public int indexIn(CharSequence sequence, int start) {
    307       int length = sequence.length();
    308       Preconditions.checkPositionIndex(start, length);
    309       return -1;
    310     }
    311     @Override public int lastIndexIn(CharSequence sequence) {
    312       checkNotNull(sequence);
    313       return -1;
    314     }
    315     @Override public boolean matchesAllOf(CharSequence sequence) {
    316       return sequence.length() == 0;
    317     }
    318     @Override public boolean matchesNoneOf(CharSequence sequence) {
    319       checkNotNull(sequence);
    320       return true;
    321     }
    322     @Override public String removeFrom(CharSequence sequence) {
    323       return sequence.toString();
    324     }
    325     @Override public String replaceFrom(
    326         CharSequence sequence, char replacement) {
    327       return sequence.toString();
    328     }
    329     @Override public String replaceFrom(
    330         CharSequence sequence, CharSequence replacement) {
    331       checkNotNull(replacement);
    332       return sequence.toString();
    333     }
    334     @Override public String collapseFrom(
    335         CharSequence sequence, char replacement) {
    336       return sequence.toString();
    337     }
    338     @Override public String trimFrom(CharSequence sequence) {
    339       return sequence.toString();
    340     }
    341     @Override public int countIn(CharSequence sequence) {
    342       checkNotNull(sequence);
    343       return 0;
    344     }
    345     @Override public CharMatcher and(CharMatcher other) {
    346       checkNotNull(other);
    347       return this;
    348     }
    349     @Override public CharMatcher or(CharMatcher other) {
    350       return checkNotNull(other);
    351     }
    352     @Override public CharMatcher negate() {
    353       return ANY;
    354     }
    355     @Override protected void setBits(LookupTable table) {
    356     }
    357     @Override public CharMatcher precomputed() {
    358       return this;
    359     }
    360   };
    361 
    362   // Static factories
    363 
    364   /**
    365    * Returns a {@code char} matcher that matches only one specified character.
    366    */
    367   public static CharMatcher is(final char match) {
    368     return new CharMatcher() {
    369       @Override public boolean matches(char c) {
    370         return c == match;
    371       }
    372 
    373       @Override public String replaceFrom(
    374           CharSequence sequence, char replacement) {
    375         return sequence.toString().replace(match, replacement);
    376       }
    377       @Override public CharMatcher and(CharMatcher other) {
    378         return other.matches(match) ? this : NONE;
    379       }
    380       @Override public CharMatcher or(CharMatcher other) {
    381         return other.matches(match) ? other : super.or(other);
    382       }
    383       @Override public CharMatcher negate() {
    384         return isNot(match);
    385       }
    386       @Override protected void setBits(LookupTable table) {
    387         table.set(match);
    388       }
    389       @Override public CharMatcher precomputed() {
    390         return this;
    391       }
    392     };
    393   }
    394 
    395   /**
    396    * Returns a {@code char} matcher that matches any character except the one
    397    * specified.
    398    *
    399    * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
    400    */
    401   public static CharMatcher isNot(final char match) {
    402     return new CharMatcher() {
    403       @Override public boolean matches(char c) {
    404         return c != match;
    405       }
    406 
    407       @Override public CharMatcher and(CharMatcher other) {
    408         return other.matches(match) ? super.and(other) : other;
    409       }
    410       @Override public CharMatcher or(CharMatcher other) {
    411         return other.matches(match) ? ANY : this;
    412       }
    413       @Override public CharMatcher negate() {
    414         return is(match);
    415       }
    416     };
    417   }
    418 
    419   /**
    420    * Returns a {@code char} matcher that matches any character present in the
    421    * given character sequence.
    422    */
    423   public static CharMatcher anyOf(final CharSequence sequence) {
    424     switch (sequence.length()) {
    425       case 0:
    426         return NONE;
    427       case 1:
    428         return is(sequence.charAt(0));
    429       case 2:
    430         final char match1 = sequence.charAt(0);
    431         final char match2 = sequence.charAt(1);
    432         return new CharMatcher() {
    433           @Override public boolean matches(char c) {
    434             return c == match1 || c == match2;
    435           }
    436           @Override protected void setBits(LookupTable table) {
    437             table.set(match1);
    438             table.set(match2);
    439           }
    440           @Override public CharMatcher precomputed() {
    441             return this;
    442           }
    443         };
    444     }
    445 
    446     final char[] chars = sequence.toString().toCharArray();
    447     Arrays.sort(chars); // not worth collapsing duplicates
    448 
    449     return new CharMatcher() {
    450       @Override public boolean matches(char c) {
    451         return Arrays.binarySearch(chars, c) >= 0;
    452       }
    453       @Override protected void setBits(LookupTable table) {
    454         for (char c : chars) {
    455           table.set(c);
    456         }
    457       }
    458     };
    459   }
    460 
    461   /**
    462    * Returns a {@code char} matcher that matches any character not present in
    463    * the given character sequence.
    464    */
    465   public static CharMatcher noneOf(CharSequence sequence) {
    466     return anyOf(sequence).negate();
    467   }
    468 
    469   /**
    470    * Returns a {@code char} matcher that matches any character in a given range
    471    * (both endpoints are inclusive). For example, to match any lowercase letter
    472    * of the English alphabet, use {@code CharMatcher.inRange('a', 'z')}.
    473    *
    474    * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
    475    */
    476   public static CharMatcher inRange(
    477       final char startInclusive, final char endInclusive) {
    478     checkArgument(endInclusive >= startInclusive);
    479     return new CharMatcher() {
    480       @Override public boolean matches(char c) {
    481         return startInclusive <= c && c <= endInclusive;
    482       }
    483       @Override protected void setBits(LookupTable table) {
    484         char c = startInclusive;
    485         while (true) {
    486           table.set(c);
    487           if (c++ == endInclusive) {
    488             break;
    489           }
    490         }
    491       }
    492       @Override public CharMatcher precomputed() {
    493         return this;
    494       }
    495     };
    496   }
    497 
    498   /**
    499    * Returns a matcher with identical behavior to the given {@link
    500    * Character}-based predicate, but which operates on primitive {@code char}
    501    * instances instead.
    502    */
    503   public static CharMatcher forPredicate(
    504       final Predicate<? super Character> predicate) {
    505     checkNotNull(predicate);
    506     if (predicate instanceof CharMatcher) {
    507       return (CharMatcher) predicate;
    508     }
    509     return new CharMatcher() {
    510       @Override public boolean matches(char c) {
    511         return predicate.apply(c);
    512       }
    513       @Override public boolean apply(Character character) {
    514         return predicate.apply(checkNotNull(character));
    515       }
    516     };
    517   }
    518 
    519   // Abstract methods
    520 
    521   /** Determines a true or false value for the given character. */
    522   public abstract boolean matches(char c);
    523 
    524   // Non-static factories
    525 
    526   /**
    527    * Returns a matcher that matches any character not matched by this matcher.
    528    */
    529   public CharMatcher negate() {
    530     final CharMatcher original = this;
    531     return new CharMatcher() {
    532       @Override public boolean matches(char c) {
    533         return !original.matches(c);
    534       }
    535 
    536       @Override public boolean matchesAllOf(CharSequence sequence) {
    537         return original.matchesNoneOf(sequence);
    538       }
    539       @Override public boolean matchesNoneOf(CharSequence sequence) {
    540         return original.matchesAllOf(sequence);
    541       }
    542       @Override public int countIn(CharSequence sequence) {
    543         return sequence.length() - original.countIn(sequence);
    544       }
    545       @Override public CharMatcher negate() {
    546         return original;
    547       }
    548     };
    549   }
    550 
    551   /**
    552    * Returns a matcher that matches any character matched by both this matcher
    553    * and {@code other}.
    554    */
    555   public CharMatcher and(CharMatcher other) {
    556     return new And(Arrays.asList(this, checkNotNull(other)));
    557   }
    558 
    559   private static class And extends CharMatcher {
    560     List<CharMatcher> components;
    561 
    562     And(List<CharMatcher> components) {
    563       this.components = components; // Skip defensive copy (private)
    564     }
    565 
    566     @Override public boolean matches(char c) {
    567       for (CharMatcher matcher : components) {
    568         if (!matcher.matches(c)) {
    569           return false;
    570         }
    571       }
    572       return true;
    573     }
    574 
    575     @Override public CharMatcher and(CharMatcher other) {
    576       List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
    577       newComponents.add(checkNotNull(other));
    578       return new And(newComponents);
    579     }
    580   }
    581 
    582   /**
    583    * Returns a matcher that matches any character matched by either this matcher
    584    * or {@code other}.
    585    */
    586   public CharMatcher or(CharMatcher other) {
    587     return new Or(Arrays.asList(this, checkNotNull(other)));
    588   }
    589 
    590   private static class Or extends CharMatcher {
    591     List<CharMatcher> components;
    592 
    593     Or(List<CharMatcher> components) {
    594       this.components = components; // Skip defensive copy (private)
    595     }
    596 
    597     @Override public boolean matches(char c) {
    598       for (CharMatcher matcher : components) {
    599         if (matcher.matches(c)) {
    600           return true;
    601         }
    602       }
    603       return false;
    604     }
    605 
    606     @Override public CharMatcher or(CharMatcher other) {
    607       List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
    608       newComponents.add(checkNotNull(other));
    609       return new Or(newComponents);
    610     }
    611 
    612     @Override protected void setBits(LookupTable table) {
    613       for (CharMatcher matcher : components) {
    614         matcher.setBits(table);
    615       }
    616     }
    617   }
    618 
    619   /**
    620    * Returns a {@code char} matcher functionally equivalent to this one, but
    621    * which may be faster to query than the original; your mileage may vary.
    622    * Precomputation takes time and is likely to be worthwhile only if the
    623    * precomputed matcher is queried many thousands of times.
    624    *
    625    * <p>This method has no effect (returns {@code this}) when called in GWT:
    626    * it's unclear whether a precomputed matcher is faster, but it certainly
    627    * consumes more memory, which doesn't seem like a worthwhile tradeoff in a
    628    * browser.
    629    */
    630   public CharMatcher precomputed() {
    631     return Platform.precomputeCharMatcher(this);
    632   }
    633 
    634   /**
    635    * This is the actual implementation of {@link #precomputed}, but we bounce
    636    * calls through a method on {@link Platform} so that we can have different
    637    * behavior in GWT.
    638    *
    639    * <p>The default precomputation is to cache the configuration of the original
    640    * matcher in an eight-kilobyte bit array. In some situations this produces a
    641    * matcher which is faster to query than the original.
    642    *
    643    * <p>The default implementation creates a new bit array and passes it to
    644    * {@link #setBits(LookupTable)}.
    645    */
    646   CharMatcher precomputedInternal() {
    647     final LookupTable table = new LookupTable();
    648     setBits(table);
    649 
    650     return new CharMatcher() {
    651       @Override public boolean matches(char c) {
    652         return table.get(c);
    653       }
    654 
    655       // TODO: make methods like negate() smart
    656 
    657       @Override public CharMatcher precomputed() {
    658         return this;
    659       }
    660     };
    661   }
    662 
    663   /**
    664    * For use by implementors; sets the bit corresponding to each character ('\0'
    665    * to '{@literal \}uFFFF') that matches this matcher in the given bit array,
    666    * leaving all other bits untouched.
    667    *
    668    * <p>The default implementation loops over every possible character value,
    669    * invoking {@link #matches} for each one.
    670    */
    671   protected void setBits(LookupTable table) {
    672     char c = Character.MIN_VALUE;
    673     while (true) {
    674       if (matches(c)) {
    675         table.set(c);
    676       }
    677       if (c++ == Character.MAX_VALUE) {
    678         break;
    679       }
    680     }
    681   }
    682 
    683   /**
    684    * A bit array with one bit per {@code char} value, used by {@link
    685    * CharMatcher#precomputed}.
    686    *
    687    * <p>TODO: possibly share a common BitArray class with BloomFilter
    688    * and others... a simpler java.util.BitSet.
    689    */
    690   protected static class LookupTable {
    691     int[] data = new int[2048];
    692 
    693     void set(char index) {
    694       data[index >> 5] |= (1 << index);
    695     }
    696     boolean get(char index) {
    697       return (data[index >> 5] & (1 << index)) != 0;
    698     }
    699   }
    700 
    701   // Text processing routines
    702 
    703   /**
    704    * Returns {@code true} if a character sequence contains only matching
    705    * characters.
    706    *
    707    * <p>The default implementation iterates over the sequence, invoking {@link
    708    * #matches} for each character, until this returns {@code false} or the end
    709    * is reached.
    710    *
    711    * @param sequence the character sequence to examine, possibly empty
    712    * @return {@code true} if this matcher matches every character in the
    713    *     sequence, including when the sequence is empty
    714    */
    715   public boolean matchesAllOf(CharSequence sequence) {
    716     for (int i = sequence.length() - 1; i >= 0; i--) {
    717       if (!matches(sequence.charAt(i))) {
    718         return false;
    719       }
    720     }
    721     return true;
    722   }
    723 
    724   /**
    725    * Returns {@code true} if a character sequence contains no matching
    726    * characters.
    727    *
    728    * <p>The default implementation iterates over the sequence, invoking {@link
    729    * #matches} for each character, until this returns {@code false} or the end is
    730    * reached.
    731    *
    732    * @param sequence the character sequence to examine, possibly empty
    733    * @return {@code true} if this matcher matches every character in the
    734    *     sequence, including when the sequence is empty
    735    */
    736   public boolean matchesNoneOf(CharSequence sequence) {
    737     return indexIn(sequence) == -1;
    738   }
    739 
    740   // TODO: perhaps add matchesAnyOf()
    741 
    742   /**
    743    * Returns the index of the first matching character in a character sequence,
    744    * or {@code -1} if no matching character is present.
    745    *
    746    * <p>The default implementation iterates over the sequence in forward order
    747    * calling {@link #matches} for each character.
    748    *
    749    * @param sequence the character sequence to examine from the beginning
    750    * @return an index, or {@code -1} if no character matches
    751    */
    752   public int indexIn(CharSequence sequence) {
    753     int length = sequence.length();
    754     for (int i = 0; i < length; i++) {
    755       if (matches(sequence.charAt(i))) {
    756         return i;
    757       }
    758     }
    759     return -1;
    760   }
    761 
    762   /**
    763    * Returns the index of the first matching character in a character sequence,
    764    * starting from a given position, or {@code -1} if no character matches after
    765    * that position.
    766    *
    767    * <p>The default implementation iterates over the sequence in forward order,
    768    * beginning at {@code start}, calling {@link #matches} for each character.
    769    *
    770    * @param sequence the character sequence to examine
    771    * @param start the first index to examine; must be nonnegative and no
    772    *     greater than {@code sequence.length()}
    773    * @return the index of the first matching character, guaranteed to be no less
    774    *     than {@code start}, or {@code -1} if no character matches
    775    * @throws IndexOutOfBoundsException if start is negative or greater than
    776    *     {@code sequence.length()}
    777    */
    778   public int indexIn(CharSequence sequence, int start) {
    779     int length = sequence.length();
    780     Preconditions.checkPositionIndex(start, length);
    781     for (int i = start; i < length; i++) {
    782       if (matches(sequence.charAt(i))) {
    783         return i;
    784       }
    785     }
    786     return -1;
    787   }
    788 
    789   /**
    790    * Returns the index of the last matching character in a character sequence,
    791    * or {@code -1} if no matching character is present.
    792    *
    793    * <p>The default implementation iterates over the sequence in reverse order
    794    * calling {@link #matches} for each character.
    795    *
    796    * @param sequence the character sequence to examine from the end
    797    * @return an index, or {@code -1} if no character matches
    798    */
    799   public int lastIndexIn(CharSequence sequence) {
    800     for (int i = sequence.length() - 1; i >= 0; i--) {
    801       if (matches(sequence.charAt(i))) {
    802         return i;
    803       }
    804     }
    805     return -1;
    806   }
    807 
    808   /**
    809    * Returns the number of matching characters found in a character sequence.
    810    */
    811   public int countIn(CharSequence sequence) {
    812     int count = 0;
    813     for (int i = 0; i < sequence.length(); i++) {
    814       if (matches(sequence.charAt(i))) {
    815         count++;
    816       }
    817     }
    818     return count;
    819   }
    820 
    821   /**
    822    * Returns a string containing all non-matching characters of a character
    823    * sequence, in order. For example: <pre>   {@code
    824    *
    825    *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
    826    *
    827    * ... returns {@code "bzr"}.
    828    */
    829   public String removeFrom(CharSequence sequence) {
    830     String string = sequence.toString();
    831     int pos = indexIn(string);
    832     if (pos == -1) {
    833       return string;
    834     }
    835 
    836     char[] chars = string.toCharArray();
    837     int spread = 1;
    838 
    839     // This unusual loop comes from extensive benchmarking
    840     OUT:
    841     while (true) {
    842       pos++;
    843       while (true) {
    844         if (pos == chars.length) {
    845           break OUT;
    846         }
    847         if (matches(chars[pos])) {
    848           break;
    849         }
    850         chars[pos - spread] = chars[pos];
    851         pos++;
    852       }
    853       spread++;
    854     }
    855     return new String(chars, 0, pos - spread);
    856   }
    857 
    858   /**
    859    * Returns a string containing all matching characters of a character
    860    * sequence, in order. For example: <pre>   {@code
    861    *
    862    *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
    863    *
    864    * ... returns {@code "aaa"}.
    865    */
    866   public String retainFrom(CharSequence sequence) {
    867     return negate().removeFrom(sequence);
    868   }
    869 
    870   /**
    871    * Returns a string copy of the input character sequence, with each character
    872    * that matches this matcher replaced by a given replacement character. For
    873    * example: <pre>   {@code
    874    *
    875    *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
    876    *
    877    * ... returns {@code "rodor"}.
    878    *
    879    * <p>The default implementation uses {@link #indexIn(CharSequence)} to find
    880    * the first matching character, then iterates the remainder of the sequence
    881    * calling {@link #matches(char)} for each character.
    882    *
    883    * @param sequence the character sequence to replace matching characters in
    884    * @param replacement the character to append to the result string in place of
    885    *     each matching character in {@code sequence}
    886    * @return the new string
    887    */
    888   public String replaceFrom(CharSequence sequence, char replacement) {
    889     String string = sequence.toString();
    890     int pos = indexIn(string);
    891     if (pos == -1) {
    892       return string;
    893     }
    894     char[] chars = string.toCharArray();
    895     chars[pos] = replacement;
    896     for (int i = pos + 1; i < chars.length; i++) {
    897       if (matches(chars[i])) {
    898         chars[i] = replacement;
    899       }
    900     }
    901     return new String(chars);
    902   }
    903 
    904   /**
    905    * Returns a string copy of the input character sequence, with each character
    906    * that matches this matcher replaced by a given replacement sequence. For
    907    * example: <pre>   {@code
    908    *
    909    *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
    910    *
    911    * ... returns {@code "yoohoo"}.
    912    *
    913    * <p><b>Note:</b> If the replacement is a fixed string with only one character,
    914    * you are better off calling {@link #replaceFrom(CharSequence, char)} directly.
    915    *
    916    * @param sequence the character sequence to replace matching characters in
    917    * @param replacement the characters to append to the result string in place
    918    *     of each matching character in {@code sequence}
    919    * @return the new string
    920    */
    921   public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    922     int replacementLen = replacement.length();
    923     if (replacementLen == 0) {
    924       return removeFrom(sequence);
    925     }
    926     if (replacementLen == 1) {
    927       return replaceFrom(sequence, replacement.charAt(0));
    928     }
    929 
    930     String string = sequence.toString();
    931     int pos = indexIn(string);
    932     if (pos == -1) {
    933       return string;
    934     }
    935 
    936     int len = string.length();
    937     StringBuilder buf = new StringBuilder((int) (len * 1.5) + 16);
    938 
    939     int oldpos = 0;
    940     do {
    941       buf.append(string, oldpos, pos);
    942       buf.append(replacement);
    943       oldpos = pos + 1;
    944       pos = indexIn(string, oldpos);
    945     } while (pos != -1);
    946 
    947     buf.append(string, oldpos, len);
    948     return buf.toString();
    949   }
    950 
    951   /**
    952    * Returns a substring of the input character sequence that omits all
    953    * characters this matcher matches from the beginning and from the end of the
    954    * string. For example: <pre> {@code
    955    *
    956    *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
    957    *
    958    * ... returns {@code "cat"}.
    959    *
    960    * <p>Note that<pre>   {@code
    961    *
    962    *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
    963    *
    964    * ... is equivalent to {@link String#trim()}.
    965    */
    966   public String trimFrom(CharSequence sequence) {
    967     int len = sequence.length();
    968     int first;
    969     int last;
    970 
    971     for (first = 0; first < len; first++) {
    972       if (!matches(sequence.charAt(first))) {
    973         break;
    974       }
    975     }
    976     for (last = len - 1; last > first; last--) {
    977       if (!matches(sequence.charAt(last))) {
    978         break;
    979       }
    980     }
    981 
    982     return sequence.subSequence(first, last + 1).toString();
    983   }
    984 
    985   /**
    986    * Returns a substring of the input character sequence that omits all
    987    * characters this matcher matches from the beginning of the
    988    * string. For example: <pre> {@code
    989    *
    990    *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
    991    *
    992    * ... returns {@code "catbab"}.
    993    */
    994   public String trimLeadingFrom(CharSequence sequence) {
    995     int len = sequence.length();
    996     int first;
    997 
    998     for (first = 0; first < len; first++) {
    999       if (!matches(sequence.charAt(first))) {
   1000         break;
   1001       }
   1002     }
   1003 
   1004     return sequence.subSequence(first, len).toString();
   1005   }
   1006 
   1007   /**
   1008    * Returns a substring of the input character sequence that omits all
   1009    * characters this matcher matches from the end of the
   1010    * string. For example: <pre> {@code
   1011    *
   1012    *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
   1013    *
   1014    * ... returns {@code "abacat"}.
   1015    */
   1016   public String trimTrailingFrom(CharSequence sequence) {
   1017     int len = sequence.length();
   1018     int last;
   1019 
   1020     for (last = len - 1; last >= 0; last--) {
   1021       if (!matches(sequence.charAt(last))) {
   1022         break;
   1023       }
   1024     }
   1025 
   1026     return sequence.subSequence(0, last + 1).toString();
   1027   }
   1028 
   1029   /**
   1030    * Returns a string copy of the input character sequence, with each group of
   1031    * consecutive characters that match this matcher replaced by a single
   1032    * replacement character. For example: <pre>   {@code
   1033    *
   1034    *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
   1035    *
   1036    * ... returns {@code "b-p-r"}.
   1037    *
   1038    * <p>The default implementation uses {@link #indexIn(CharSequence)} to find
   1039    * the first matching character, then iterates the remainder of the sequence
   1040    * calling {@link #matches(char)} for each character.
   1041    *
   1042    * @param sequence the character sequence to replace matching groups of
   1043    *     characters in
   1044    * @param replacement the character to append to the result string in place of
   1045    *     each group of matching characters in {@code sequence}
   1046    * @return the new string
   1047    */
   1048   public String collapseFrom(CharSequence sequence, char replacement) {
   1049     int first = indexIn(sequence);
   1050     if (first == -1) {
   1051       return sequence.toString();
   1052     }
   1053 
   1054     // TODO: this implementation can probably be made faster.
   1055 
   1056     StringBuilder builder = new StringBuilder(sequence.length())
   1057         .append(sequence.subSequence(0, first))
   1058         .append(replacement);
   1059     boolean in = true;
   1060     for (int i = first + 1; i < sequence.length(); i++) {
   1061       char c = sequence.charAt(i);
   1062       if (apply(c)) {
   1063         if (!in) {
   1064           builder.append(replacement);
   1065           in = true;
   1066         }
   1067       } else {
   1068         builder.append(c);
   1069         in = false;
   1070       }
   1071     }
   1072     return builder.toString();
   1073   }
   1074 
   1075   /**
   1076    * Collapses groups of matching characters exactly as {@link #collapseFrom}
   1077    * does, except that groups of matching characters at the start or end of the
   1078    * sequence are removed without replacement.
   1079    */
   1080   public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
   1081     int first = negate().indexIn(sequence);
   1082     if (first == -1) {
   1083       return ""; // everything matches. nothing's left.
   1084     }
   1085     StringBuilder builder = new StringBuilder(sequence.length());
   1086     boolean inMatchingGroup = false;
   1087     for (int i = first; i < sequence.length(); i++) {
   1088       char c = sequence.charAt(i);
   1089       if (apply(c)) {
   1090         inMatchingGroup = true;
   1091       } else {
   1092         if (inMatchingGroup) {
   1093           builder.append(replacement);
   1094           inMatchingGroup = false;
   1095         }
   1096         builder.append(c);
   1097       }
   1098     }
   1099     return builder.toString();
   1100   }
   1101 
   1102   // Predicate interface
   1103 
   1104   /**
   1105    * Returns {@code true} if this matcher matches the given character.
   1106    *
   1107    * @throws NullPointerException if {@code character} is null
   1108    */
   1109   /*@Override*/ public boolean apply(Character character) {
   1110     return matches(character);
   1111   }
   1112 }
   1113