Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2008 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.android.mail.common.base;
     18 
     19 import static com.google.android.mail.common.base.Preconditions.checkArgument;
     20 import static com.google.android.mail.common.base.Preconditions.checkNotNull;
     21 
     22 import java.util.ArrayList;
     23 import java.util.Arrays;
     24 import java.util.List;
     25 
     26 /**
     27  * Determines a true or false value for any Java {@code char} value, just as
     28  * {@link Predicate} does for any {@link Object}. Also offers basic text
     29  * processing methods based on this function. Implementations are strongly
     30  * encouraged to be side-effect-free and immutable.
     31  *
     32  * <p>Throughout the documentation of this class, the phrase "matching
     33  * character" is used to mean "any character {@code c} for which {@code
     34  * this.matches(c)} returns {@code true}".
     35  *
     36  * <p><b>Note:</b> This class deals only with {@code char} values; it does not
     37  * understand supplementary Unicode code points in the range {@code 0x10000} to
     38  * {@code 0x10FFFF}. Such logical characters are encoded into a {@code String}
     39  * using surrogate pairs, and a {@code CharMatcher} treats these just as two
     40  * separate characters.
     41  *
     42  * @author Kevin Bourrillion
     43  * @since 2009.09.15 <b>tentative</b>
     44  */
     45 public abstract class CharMatcher implements Predicate<Character> {
     46 
     47   // Constants
     48 
     49   // Excludes 2000-2000a, which is handled as a range
     50   private static final String BREAKING_WHITESPACE_CHARS =
     51       "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
     52 
     53   // Excludes 2007, which is handled as a gap in a pair of ranges
     54   private static final String NON_BREAKING_WHITESPACE_CHARS =
     55       "\u00a0\u180e\u202f";
     56 
     57   /**
     58    * Determines whether a character is whitespace according to the latest
     59    * Unicode standard, as illustrated
     60    * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
     61    * This is not the same definition used by other Java APIs. See a comparison
     62    * of several definitions of "whitespace" at
     63    * <a href="TODO">(TODO)</a>.
     64    *
     65    * <p><b>Note:</b> as the Unicode definition evolves, we will modify this
     66    * constant to keep it up to date.
     67    */
     68   public static final CharMatcher WHITESPACE =
     69       anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
     70           .or(inRange('\u2000', '\u200a'));
     71 
     72   /**
     73    * Determines whether a character is a breaking whitespace (that is,
     74    * a whitespace which can be interpreted as a break between words
     75    * for formatting purposes).  See {@link #WHITESPACE} for a discussion
     76    * of that term.
     77    *
     78    * @since 2010.01.04 <b>tentative</b>
     79    */
     80   public static final CharMatcher BREAKING_WHITESPACE =
     81       anyOf(BREAKING_WHITESPACE_CHARS)
     82           .or(inRange('\u2000', '\u2006'))
     83           .or(inRange('\u2008', '\u200a'));
     84 
     85   /**
     86    * Determines whether a character is ASCII, meaning that its code point is
     87    * less than 128.
     88    */
     89   public static final CharMatcher ASCII = inRange('\0', '\u007f');
     90 
     91   /**
     92    * Determines whether a character is a digit according to
     93    * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
     94    */
     95   public static final CharMatcher DIGIT;
     96 
     97   static {
     98     CharMatcher digit = inRange('0', '9');
     99     String zeroes =
    100         "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
    101             + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
    102             + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
    103     for (char base : zeroes.toCharArray()) {
    104       digit = digit.or(inRange(base, (char) (base + 9)));
    105     }
    106     DIGIT = digit;
    107   }
    108 
    109   /**
    110    * Determines whether a character is whitespace according to {@link
    111    * Character#isWhitespace(char) Java's definition}; it is usually preferable
    112    * to use {@link #WHITESPACE}. See a comparison of several definitions of
    113    * "whitespace" at <a href="http://go/white+space">go/white+space</a>.
    114    */
    115   public static final CharMatcher JAVA_WHITESPACE
    116       = inRange('\u0009', (char) 13)  // \\u000d doesn't work as a char literal
    117       .or(inRange('\u001c', '\u0020'))
    118       .or(is('\u1680'))
    119       .or(is('\u180e'))
    120       .or(inRange('\u2000', '\u2006'))
    121       .or(inRange('\u2008', '\u200b'))
    122       .or(inRange('\u2028', '\u2029'))
    123       .or(is('\u205f'))
    124       .or(is('\u3000'));
    125 
    126   /**
    127    * Determines whether a character is a digit according to {@link
    128    * Character#isDigit(char) Java's definition}. If you only care to match
    129    * ASCII digits, you can use {@code inRange('0', '9')}.
    130    */
    131   public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
    132     @Override public boolean matches(char c) {
    133       return Character.isDigit(c);
    134     }
    135   };
    136 
    137   /**
    138    * Determines whether a character is a letter according to {@link
    139    * Character#isLetter(char) Java's definition}. If you only care to match
    140    * letters of the Latin alphabet, you can use {@code
    141    * inRange('a', 'z').or(inRange('A', 'Z'))}.
    142    */
    143   public static final CharMatcher JAVA_LETTER = new CharMatcher() {
    144     @Override public boolean matches(char c) {
    145       return Character.isLetter(c);
    146     }
    147   };
    148 
    149   /**
    150    * Determines whether a character is a letter or digit according to {@link
    151    * Character#isLetterOrDigit(char) Java's definition}.
    152    */
    153   public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
    154     @Override public boolean matches(char c) {
    155       return Character.isLetterOrDigit(c);
    156     }
    157   };
    158 
    159   /**
    160    * Determines whether a character is upper case according to {@link
    161    * Character#isUpperCase(char) Java's definition}.
    162    */
    163   public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
    164     @Override public boolean matches(char c) {
    165       return Character.isUpperCase(c);
    166     }
    167   };
    168 
    169   /**
    170    * Determines whether a character is lower case according to {@link
    171    * Character#isLowerCase(char) Java's definition}.
    172    */
    173   public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
    174     @Override public boolean matches(char c) {
    175       return Character.isLowerCase(c);
    176     }
    177   };
    178 
    179   /**
    180    * Determines whether a character is an ISO control character according to
    181    * {@link Character#isISOControl(char)}.
    182    */
    183   public static final CharMatcher JAVA_ISO_CONTROL = inRange('\u0000', '\u001f')
    184       .or(inRange('\u007f', '\u009f'));
    185 
    186   /**
    187    * Determines whether a character is invisible; that is, if its Unicode
    188    * category is any of SPACE_SEPARATOR, LINE_SEPARATOR,
    189    * PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and PRIVATE_USE according
    190    * to ICU4J.
    191    */
    192   public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
    193       .or(inRange('\u007f', '\u00a0'))
    194       .or(is('\u00ad'))
    195       .or(inRange('\u0600', '\u0603'))
    196       .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
    197       .or(inRange('\u2000', '\u200f'))
    198       .or(inRange('\u2028', '\u202f'))
    199       .or(inRange('\u205f', '\u2064'))
    200       .or(inRange('\u206a', '\u206f'))
    201       .or(is('\u3000'))
    202       .or(inRange('\ud800', '\uf8ff'))
    203       .or(anyOf("\ufeff\ufff9\ufffa\ufffb"));
    204 
    205   /**
    206    * Determines whether a character is single-width (not double-width).  When
    207    * in doubt, this matcher errs on the side of returning {@code false} (that
    208    * is, it tends to assume a character is double-width).
    209    *
    210    * <b>Note:</b> as the reference file evolves, we will modify this constant
    211    * to keep it up to date.
    212    */
    213   public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
    214       .or(is('\u05be'))
    215       .or(inRange('\u05d0', '\u05ea'))
    216       .or(is('\u05f3'))
    217       .or(is('\u05f4'))
    218       .or(inRange('\u0600', '\u06ff'))
    219       .or(inRange('\u0750', '\u077f'))
    220       .or(inRange('\u0e00', '\u0e7f'))
    221       .or(inRange('\u1e00', '\u20af'))
    222       .or(inRange('\u2100', '\u213a'))
    223       .or(inRange('\ufb50', '\ufdff'))
    224       .or(inRange('\ufe70', '\ufeff'))
    225       .or(inRange('\uff61', '\uffdc'));
    226 
    227   /**
    228    * Determines whether a character is whitespace according to an arbitrary definition used by
    229    * {@link StringUtil} for years. Most likely you don't want to use this. See a comparison of
    230    * several definitions of "whitespace" at <a href="http://goto/white space">goto/white space</a>.
    231    *
    232    * <p><b>To be deprecated.</b> use {@link #WHITESPACE} to switch to the Unicode definition, or
    233    * create a matcher for the specific characters you want. Not deprecated yet because it is a
    234    * stepping stone for getting off of many deprecated {@link StringUtil} methods.
    235    */
    236   @Deprecated
    237   public static final CharMatcher LEGACY_WHITESPACE =
    238       anyOf(" \r\n\t\u3000\u00A0\u2007\u202F").precomputed();
    239 
    240 
    241   /** Matches any character. */
    242   public static final CharMatcher ANY = new CharMatcher() {
    243     @Override public boolean matches(char c) {
    244       return true;
    245     }
    246 
    247     @Override public int indexIn(CharSequence sequence) {
    248       return (sequence.length() == 0) ? -1 : 0;
    249     }
    250     @Override public int indexIn(CharSequence sequence, int start) {
    251       int length = sequence.length();
    252       Preconditions.checkPositionIndex(start, length);
    253       return (start == length) ? -1 : start;
    254     }
    255     @Override public int lastIndexIn(CharSequence sequence) {
    256       return sequence.length() - 1;
    257     }
    258     @Override public boolean matchesAllOf(CharSequence sequence) {
    259       checkNotNull(sequence);
    260       return true;
    261     }
    262     @Override public boolean matchesNoneOf(CharSequence sequence) {
    263       return sequence.length() == 0;
    264     }
    265     @Override public String removeFrom(CharSequence sequence) {
    266       checkNotNull(sequence);
    267       return "";
    268     }
    269     @Override public String replaceFrom(
    270         CharSequence sequence, char replacement) {
    271       char[] array = new char[sequence.length()];
    272       Arrays.fill(array, replacement);
    273       return new String(array);
    274     }
    275     @Override public String replaceFrom(
    276         CharSequence sequence, CharSequence replacement) {
    277       StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
    278       for (int i = 0; i < sequence.length(); i++) {
    279         retval.append(replacement);
    280       }
    281       return retval.toString();
    282     }
    283     @Override public String collapseFrom(CharSequence sequence, char replacement) {
    284       return (sequence.length() == 0) ? "" : String.valueOf(replacement);
    285     }
    286     @Override public String trimFrom(CharSequence sequence) {
    287       checkNotNull(sequence);
    288       return "";
    289     }
    290     @Override public int countIn(CharSequence sequence) {
    291       return sequence.length();
    292     }
    293     @Override public CharMatcher and(CharMatcher other) {
    294       return checkNotNull(other);
    295     }
    296     @Override public CharMatcher or(CharMatcher other) {
    297       checkNotNull(other);
    298       return this;
    299     }
    300     @Override public CharMatcher negate() {
    301       return NONE;
    302     }
    303     @Override public CharMatcher precomputed() {
    304       return this;
    305     }
    306   };
    307 
    308   /** Matches no characters. */
    309   public static final CharMatcher NONE = new CharMatcher() {
    310     @Override public boolean matches(char c) {
    311       return false;
    312     }
    313 
    314     @Override public int indexIn(CharSequence sequence) {
    315       checkNotNull(sequence);
    316       return -1;
    317     }
    318     @Override public int indexIn(CharSequence sequence, int start) {
    319       int length = sequence.length();
    320       Preconditions.checkPositionIndex(start, length);
    321       return -1;
    322     }
    323     @Override public int lastIndexIn(CharSequence sequence) {
    324       checkNotNull(sequence);
    325       return -1;
    326     }
    327     @Override public boolean matchesAllOf(CharSequence sequence) {
    328       return sequence.length() == 0;
    329     }
    330     @Override public boolean matchesNoneOf(CharSequence sequence) {
    331       checkNotNull(sequence);
    332       return true;
    333     }
    334     @Override public String removeFrom(CharSequence sequence) {
    335       return sequence.toString();
    336     }
    337     @Override public String replaceFrom(
    338         CharSequence sequence, char replacement) {
    339       return sequence.toString();
    340     }
    341     @Override public String replaceFrom(
    342         CharSequence sequence, CharSequence replacement) {
    343       checkNotNull(replacement);
    344       return sequence.toString();
    345     }
    346     @Override public String collapseFrom(
    347         CharSequence sequence, char replacement) {
    348       return sequence.toString();
    349     }
    350     @Override public String trimFrom(CharSequence sequence) {
    351       return sequence.toString();
    352     }
    353     @Override public int countIn(CharSequence sequence) {
    354       checkNotNull(sequence);
    355       return 0;
    356     }
    357     @Override public CharMatcher and(CharMatcher other) {
    358       checkNotNull(other);
    359       return this;
    360     }
    361     @Override public CharMatcher or(CharMatcher other) {
    362       return checkNotNull(other);
    363     }
    364     @Override public CharMatcher negate() {
    365       return ANY;
    366     }
    367     @Override protected void setBits(LookupTable table) {
    368     }
    369     @Override public CharMatcher precomputed() {
    370       return this;
    371     }
    372   };
    373 
    374   // Static factories
    375 
    376   /**
    377    * Returns a {@code char} matcher that matches only one specified character.
    378    */
    379   public static CharMatcher is(final char match) {
    380     return new CharMatcher() {
    381       @Override public boolean matches(char c) {
    382         return c == match;
    383       }
    384 
    385       @Override public String replaceFrom(
    386           CharSequence sequence, char replacement) {
    387         return sequence.toString().replace(match, replacement);
    388       }
    389       @Override public CharMatcher and(CharMatcher other) {
    390         return other.matches(match) ? this : NONE;
    391       }
    392       @Override public CharMatcher or(CharMatcher other) {
    393         return other.matches(match) ? other : super.or(other);
    394       }
    395       @Override public CharMatcher negate() {
    396         return isNot(match);
    397       }
    398       @Override protected void setBits(LookupTable table) {
    399         table.set(match);
    400       }
    401       @Override public CharMatcher precomputed() {
    402         return this;
    403       }
    404     };
    405   }
    406 
    407   /**
    408    * Returns a {@code char} matcher that matches any character except the one
    409    * specified.
    410    *
    411    * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
    412    */
    413   public static CharMatcher isNot(final char match) {
    414     return new CharMatcher() {
    415       @Override public boolean matches(char c) {
    416         return c != match;
    417       }
    418 
    419       @Override public CharMatcher and(CharMatcher other) {
    420         return other.matches(match) ? super.and(other) : other;
    421       }
    422       @Override public CharMatcher or(CharMatcher other) {
    423         return other.matches(match) ? ANY : this;
    424       }
    425       @Override public CharMatcher negate() {
    426         return is(match);
    427       }
    428     };
    429   }
    430 
    431   /**
    432    * Returns a {@code char} matcher that matches any character present in the
    433    * given character sequence.
    434    */
    435   public static CharMatcher anyOf(final CharSequence sequence) {
    436     switch (sequence.length()) {
    437       case 0:
    438         return NONE;
    439       case 1:
    440         return is(sequence.charAt(0));
    441       case 2:
    442         final char match1 = sequence.charAt(0);
    443         final char match2 = sequence.charAt(1);
    444         return new CharMatcher() {
    445           @Override public boolean matches(char c) {
    446             return c == match1 || c == match2;
    447           }
    448           @Override protected void setBits(LookupTable table) {
    449             table.set(match1);
    450             table.set(match2);
    451           }
    452           @Override public CharMatcher precomputed() {
    453             return this;
    454           }
    455         };
    456     }
    457 
    458     final char[] chars = sequence.toString().toCharArray();
    459     Arrays.sort(chars); // not worth collapsing duplicates
    460 
    461     return new CharMatcher() {
    462       @Override public boolean matches(char c) {
    463         return Arrays.binarySearch(chars, c) >= 0;
    464       }
    465       @Override protected void setBits(LookupTable table) {
    466         for (char c : chars) {
    467           table.set(c);
    468         }
    469       }
    470     };
    471   }
    472 
    473   /**
    474    * Returns a {@code char} matcher that matches any character not present in
    475    * the given character sequence.
    476    */
    477   public static CharMatcher noneOf(CharSequence sequence) {
    478     return anyOf(sequence).negate();
    479   }
    480 
    481   /**
    482    * Returns a {@code char} matcher that matches any character in a given range
    483    * (both endpoints are inclusive). For example, to match any lowercase letter
    484    * of the English alphabet, use {@code CharMatcher.inRange('a', 'z')}.
    485    *
    486    * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
    487    */
    488   public static CharMatcher inRange(
    489       final char startInclusive, final char endInclusive) {
    490     checkArgument(endInclusive >= startInclusive);
    491     return new CharMatcher() {
    492       @Override public boolean matches(char c) {
    493         return startInclusive <= c && c <= endInclusive;
    494       }
    495       @Override protected void setBits(LookupTable table) {
    496         char c = startInclusive;
    497         while (true) {
    498           table.set(c);
    499           if (c++ == endInclusive) {
    500             break;
    501           }
    502         }
    503       }
    504       @Override public CharMatcher precomputed() {
    505         return this;
    506       }
    507     };
    508   }
    509 
    510   /**
    511    * Returns a matcher with identical behavior to the given {@link
    512    * Character}-based predicate, but which operates on primitive {@code char}
    513    * instances instead.
    514    */
    515   public static CharMatcher forPredicate(
    516       final Predicate<? super Character> predicate) {
    517     checkNotNull(predicate);
    518     if (predicate instanceof CharMatcher) {
    519       return (CharMatcher) predicate;
    520     }
    521     return new CharMatcher() {
    522       @Override public boolean matches(char c) {
    523         return predicate.apply(c);
    524       }
    525       @Override public boolean apply(Character character) {
    526         return predicate.apply(checkNotNull(character));
    527       }
    528     };
    529   }
    530 
    531   // Abstract methods
    532 
    533   /** Determines a true or false value for the given character. */
    534   public abstract boolean matches(char c);
    535 
    536   // Non-static factories
    537 
    538   /**
    539    * Returns a matcher that matches any character not matched by this matcher.
    540    */
    541   public CharMatcher negate() {
    542     final CharMatcher original = this;
    543     return new CharMatcher() {
    544       @Override public boolean matches(char c) {
    545         return !original.matches(c);
    546       }
    547 
    548       @Override public boolean matchesAllOf(CharSequence sequence) {
    549         return original.matchesNoneOf(sequence);
    550       }
    551       @Override public boolean matchesNoneOf(CharSequence sequence) {
    552         return original.matchesAllOf(sequence);
    553       }
    554       @Override public int countIn(CharSequence sequence) {
    555         return sequence.length() - original.countIn(sequence);
    556       }
    557       @Override public CharMatcher negate() {
    558         return original;
    559       }
    560     };
    561   }
    562 
    563   /**
    564    * Returns a matcher that matches any character matched by both this matcher
    565    * and {@code other}.
    566    */
    567   public CharMatcher and(CharMatcher other) {
    568     return new And(Arrays.asList(this, checkNotNull(other)));
    569   }
    570 
    571   private static class And extends CharMatcher {
    572     List<CharMatcher> components;
    573 
    574     And(List<CharMatcher> components) {
    575       this.components = components; // Skip defensive copy (private)
    576     }
    577 
    578     @Override public boolean matches(char c) {
    579       for (CharMatcher matcher : components) {
    580         if (!matcher.matches(c)) {
    581           return false;
    582         }
    583       }
    584       return true;
    585     }
    586 
    587     @Override public CharMatcher and(CharMatcher other) {
    588       List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
    589       newComponents.add(checkNotNull(other));
    590       return new And(newComponents);
    591     }
    592   }
    593 
    594   /**
    595    * Returns a matcher that matches any character matched by either this matcher
    596    * or {@code other}.
    597    */
    598   public CharMatcher or(CharMatcher other) {
    599     return new Or(Arrays.asList(this, checkNotNull(other)));
    600   }
    601 
    602   private static class Or extends CharMatcher {
    603     List<CharMatcher> components;
    604 
    605     Or(List<CharMatcher> components) {
    606       this.components = components; // Skip defensive copy (private)
    607     }
    608 
    609     @Override public boolean matches(char c) {
    610       for (CharMatcher matcher : components) {
    611         if (matcher.matches(c)) {
    612           return true;
    613         }
    614       }
    615       return false;
    616     }
    617 
    618     @Override public CharMatcher or(CharMatcher other) {
    619       List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
    620       newComponents.add(checkNotNull(other));
    621       return new Or(newComponents);
    622     }
    623 
    624     @Override protected void setBits(LookupTable table) {
    625       for (CharMatcher matcher : components) {
    626         matcher.setBits(table);
    627       }
    628     }
    629   }
    630 
    631   /**
    632    * Returns a {@code char} matcher functionally equivalent to this one, but
    633    * which may be faster to query than the original; your mileage may vary.
    634    * Precomputation takes time and is likely to be worthwhile only if the
    635    * precomputed matcher is queried many thousands of times.
    636    *
    637    * <p>This method has no effect (returns {@code this}) when called in GWT:
    638    * it's unclear whether a precomputed matcher is faster, but it certainly
    639    * consumes more memory, which doesn't seem like a worthwhile tradeoff in a
    640    * browser.
    641    */
    642   public CharMatcher precomputed() {
    643     return Platform.precomputeCharMatcher(this);
    644   }
    645 
    646   /**
    647    * This is the actual implementation of {@link #precomputed}, but we bounce
    648    * calls through a method on {@link Platform} so that we can have different
    649    * behavior in GWT.
    650    *
    651    * <p>The default precomputation is to cache the configuration of the original
    652    * matcher in an eight-kilobyte bit array. In some situations this produces a
    653    * matcher which is faster to query than the original.
    654    *
    655    * <p>The default implementation creates a new bit array and passes it to
    656    * {@link #setBits(LookupTable)}.
    657    */
    658   CharMatcher precomputedInternal() {
    659     final LookupTable table = new LookupTable();
    660     setBits(table);
    661 
    662     return new CharMatcher() {
    663       @Override public boolean matches(char c) {
    664         return table.get(c);
    665       }
    666 
    667       // TODO: make methods like negate() smart
    668 
    669       @Override public CharMatcher precomputed() {
    670         return this;
    671       }
    672     };
    673   }
    674 
    675   /**
    676    * For use by implementors; sets the bit corresponding to each character ('\0'
    677    * to '{@literal \}uFFFF') that matches this matcher in the given bit array,
    678    * leaving all other bits untouched.
    679    *
    680    * <p>The default implementation loops over every possible character value,
    681    * invoking {@link #matches} for each one.
    682    */
    683   protected void setBits(LookupTable table) {
    684     char c = Character.MIN_VALUE;
    685     while (true) {
    686       if (matches(c)) {
    687         table.set(c);
    688       }
    689       if (c++ == Character.MAX_VALUE) {
    690         break;
    691       }
    692     }
    693   }
    694 
    695   /**
    696    * A bit array with one bit per {@code char} value, used by {@link
    697    * CharMatcher#precomputed}.
    698    *
    699    * <p>TODO: possibly share a common BitArray class with BloomFilter
    700    * and others... a simpler java.util.BitSet.
    701    */
    702   protected static class LookupTable {
    703     int[] data = new int[2048];
    704 
    705     void set(char index) {
    706       data[index >> 5] |= (1 << index);
    707     }
    708     boolean get(char index) {
    709       return (data[index >> 5] & (1 << index)) != 0;
    710     }
    711   }
    712 
    713   // Text processing routines
    714 
    715   /**
    716    * Returns {@code true} if a character sequence contains only matching
    717    * characters.
    718    *
    719    * <p>The default implementation iterates over the sequence, invoking {@link
    720    * #matches} for each character, until this returns {@code false} or the end
    721    * is reached.
    722    *
    723    * @param sequence the character sequence to examine, possibly empty
    724    * @return {@code true} if this matcher matches every character in the
    725    *     sequence, including when the sequence is empty
    726    */
    727   public boolean matchesAllOf(CharSequence sequence) {
    728     for (int i = sequence.length() - 1; i >= 0; i--) {
    729       if (!matches(sequence.charAt(i))) {
    730         return false;
    731       }
    732     }
    733     return true;
    734   }
    735 
    736   /**
    737    * Returns {@code true} if a character sequence contains no matching
    738    * characters.
    739    *
    740    * <p>The default implementation iterates over the sequence, invoking {@link
    741    * #matches} for each character, until this returns {@code false} or the end is
    742    * reached.
    743    *
    744    * @param sequence the character sequence to examine, possibly empty
    745    * @return {@code true} if this matcher matches every character in the
    746    *     sequence, including when the sequence is empty
    747    */
    748   public boolean matchesNoneOf(CharSequence sequence) {
    749     return indexIn(sequence) == -1;
    750   }
    751 
    752   // TODO: perhaps add matchesAnyOf()
    753 
    754   /**
    755    * Returns the index of the first matching character in a character sequence,
    756    * or {@code -1} if no matching character is present.
    757    *
    758    * <p>The default implementation iterates over the sequence in forward order
    759    * calling {@link #matches} for each character.
    760    *
    761    * @param sequence the character sequence to examine from the beginning
    762    * @return an index, or {@code -1} if no character matches
    763    */
    764   public int indexIn(CharSequence sequence) {
    765     int length = sequence.length();
    766     for (int i = 0; i < length; i++) {
    767       if (matches(sequence.charAt(i))) {
    768         return i;
    769       }
    770     }
    771     return -1;
    772   }
    773 
    774   /**
    775    * Returns the index of the first matching character in a character sequence,
    776    * starting from a given position, or {@code -1} if no character matches after
    777    * that position.
    778    *
    779    * <p>The default implementation iterates over the sequence in forward order,
    780    * beginning at {@code start}, calling {@link #matches} for each character.
    781    *
    782    * @param sequence the character sequence to examine
    783    * @param start the first index to examine; must be nonnegative and no
    784    *     greater than {@code sequence.length()}
    785    * @return the index of the first matching character, guaranteed to be no less
    786    *     than {@code start}, or {@code -1} if no character matches
    787    * @throws IndexOutOfBoundsException if start is negative or greater than
    788    *     {@code sequence.length()}
    789    */
    790   public int indexIn(CharSequence sequence, int start) {
    791     int length = sequence.length();
    792     Preconditions.checkPositionIndex(start, length);
    793     for (int i = start; i < length; i++) {
    794       if (matches(sequence.charAt(i))) {
    795         return i;
    796       }
    797     }
    798     return -1;
    799   }
    800 
    801   /**
    802    * Returns the index of the last matching character in a character sequence,
    803    * or {@code -1} if no matching character is present.
    804    *
    805    * <p>The default implementation iterates over the sequence in reverse order
    806    * calling {@link #matches} for each character.
    807    *
    808    * @param sequence the character sequence to examine from the end
    809    * @return an index, or {@code -1} if no character matches
    810    */
    811   public int lastIndexIn(CharSequence sequence) {
    812     for (int i = sequence.length() - 1; i >= 0; i--) {
    813       if (matches(sequence.charAt(i))) {
    814         return i;
    815       }
    816     }
    817     return -1;
    818   }
    819 
    820   /**
    821    * Returns the number of matching characters found in a character sequence.
    822    */
    823   public int countIn(CharSequence sequence) {
    824     int count = 0;
    825     for (int i = 0; i < sequence.length(); i++) {
    826       if (matches(sequence.charAt(i))) {
    827         count++;
    828       }
    829     }
    830     return count;
    831   }
    832 
    833   /**
    834    * Returns a string containing all non-matching characters of a character
    835    * sequence, in order. For example: <pre>   {@code
    836    *
    837    *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
    838    *
    839    * ... returns {@code "bzr"}.
    840    */
    841   public String removeFrom(CharSequence sequence) {
    842     String string = sequence.toString();
    843     int pos = indexIn(string);
    844     if (pos == -1) {
    845       return string;
    846     }
    847 
    848     char[] chars = string.toCharArray();
    849     int spread = 1;
    850 
    851     // This unusual loop comes from extensive benchmarking
    852     OUT:
    853     while (true) {
    854       pos++;
    855       while (true) {
    856         if (pos == chars.length) {
    857           break OUT;
    858         }
    859         if (matches(chars[pos])) {
    860           break;
    861         }
    862         chars[pos - spread] = chars[pos];
    863         pos++;
    864       }
    865       spread++;
    866     }
    867     return new String(chars, 0, pos - spread);
    868   }
    869 
    870   /**
    871    * Returns a string containing all matching characters of a character
    872    * sequence, in order. For example: <pre>   {@code
    873    *
    874    *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
    875    *
    876    * ... returns {@code "aaa"}.
    877    */
    878   public String retainFrom(CharSequence sequence) {
    879     return negate().removeFrom(sequence);
    880   }
    881 
    882   /**
    883    * Returns a string copy of the input character sequence, with each character
    884    * that matches this matcher replaced by a given replacement character. For
    885    * example: <pre>   {@code
    886    *
    887    *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
    888    *
    889    * ... returns {@code "rodor"}.
    890    *
    891    * <p>The default implementation uses {@link #indexIn(CharSequence)} to find
    892    * the first matching character, then iterates the remainder of the sequence
    893    * calling {@link #matches(char)} for each character.
    894    *
    895    * @param sequence the character sequence to replace matching characters in
    896    * @param replacement the character to append to the result string in place of
    897    *     each matching character in {@code sequence}
    898    * @return the new string
    899    */
    900   public String replaceFrom(CharSequence sequence, char replacement) {
    901     String string = sequence.toString();
    902     int pos = indexIn(string);
    903     if (pos == -1) {
    904       return string;
    905     }
    906     char[] chars = string.toCharArray();
    907     chars[pos] = replacement;
    908     for (int i = pos + 1; i < chars.length; i++) {
    909       if (matches(chars[i])) {
    910         chars[i] = replacement;
    911       }
    912     }
    913     return new String(chars);
    914   }
    915 
    916   /**
    917    * Returns a string copy of the input character sequence, with each character
    918    * that matches this matcher replaced by a given replacement sequence. For
    919    * example: <pre>   {@code
    920    *
    921    *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
    922    *
    923    * ... returns {@code "yoohoo"}.
    924    *
    925    * <p><b>Note:</b> If the replacement is a fixed string with only one character,
    926    * you are better off calling {@link #replaceFrom(CharSequence, char)} directly.
    927    *
    928    * @param sequence the character sequence to replace matching characters in
    929    * @param replacement the characters to append to the result string in place
    930    *     of each matching character in {@code sequence}
    931    * @return the new string
    932    */
    933   public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    934     int replacementLen = replacement.length();
    935     if (replacementLen == 0) {
    936       return removeFrom(sequence);
    937     }
    938     if (replacementLen == 1) {
    939       return replaceFrom(sequence, replacement.charAt(0));
    940     }
    941 
    942     String string = sequence.toString();
    943     int pos = indexIn(string);
    944     if (pos == -1) {
    945       return string;
    946     }
    947 
    948     int len = string.length();
    949     StringBuilder buf = new StringBuilder((int) (len * 1.5) + 16);
    950 
    951     int oldpos = 0;
    952     do {
    953       buf.append(string, oldpos, pos);
    954       buf.append(replacement);
    955       oldpos = pos + 1;
    956       pos = indexIn(string, oldpos);
    957     } while (pos != -1);
    958 
    959     buf.append(string, oldpos, len);
    960     return buf.toString();
    961   }
    962 
    963   /**
    964    * Returns a substring of the input character sequence that omits all
    965    * characters this matcher matches from the beginning and from the end of the
    966    * string. For example: <pre> {@code
    967    *
    968    *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
    969    *
    970    * ... returns {@code "cat"}.
    971    *
    972    * <p>Note that<pre>   {@code
    973    *
    974    *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
    975    *
    976    * ... is equivalent to {@link String#trim()}.
    977    */
    978   public String trimFrom(CharSequence sequence) {
    979     int len = sequence.length();
    980     int first;
    981     int last;
    982 
    983     for (first = 0; first < len; first++) {
    984       if (!matches(sequence.charAt(first))) {
    985         break;
    986       }
    987     }
    988     for (last = len - 1; last > first; last--) {
    989       if (!matches(sequence.charAt(last))) {
    990         break;
    991       }
    992     }
    993 
    994     return sequence.subSequence(first, last + 1).toString();
    995   }
    996 
    997   /**
    998    * Returns a substring of the input character sequence that omits all
    999    * characters this matcher matches from the beginning of the
   1000    * string. For example: <pre> {@code
   1001    *
   1002    *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
   1003    *
   1004    * ... returns {@code "catbab"}.
   1005    */
   1006   public String trimLeadingFrom(CharSequence sequence) {
   1007     int len = sequence.length();
   1008     int first;
   1009 
   1010     for (first = 0; first < len; first++) {
   1011       if (!matches(sequence.charAt(first))) {
   1012         break;
   1013       }
   1014     }
   1015 
   1016     return sequence.subSequence(first, len).toString();
   1017   }
   1018 
   1019   /**
   1020    * Returns a substring of the input character sequence that omits all
   1021    * characters this matcher matches from the end of the
   1022    * string. For example: <pre> {@code
   1023    *
   1024    *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
   1025    *
   1026    * ... returns {@code "abacat"}.
   1027    */
   1028   public String trimTrailingFrom(CharSequence sequence) {
   1029     int len = sequence.length();
   1030     int last;
   1031 
   1032     for (last = len - 1; last >= 0; last--) {
   1033       if (!matches(sequence.charAt(last))) {
   1034         break;
   1035       }
   1036     }
   1037 
   1038     return sequence.subSequence(0, last + 1).toString();
   1039   }
   1040 
   1041   /**
   1042    * Returns a string copy of the input character sequence, with each group of
   1043    * consecutive characters that match this matcher replaced by a single
   1044    * replacement character. For example: <pre>   {@code
   1045    *
   1046    *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
   1047    *
   1048    * ... returns {@code "b-p-r"}.
   1049    *
   1050    * <p>The default implementation uses {@link #indexIn(CharSequence)} to find
   1051    * the first matching character, then iterates the remainder of the sequence
   1052    * calling {@link #matches(char)} for each character.
   1053    *
   1054    * @param sequence the character sequence to replace matching groups of
   1055    *     characters in
   1056    * @param replacement the character to append to the result string in place of
   1057    *     each group of matching characters in {@code sequence}
   1058    * @return the new string
   1059    */
   1060   public String collapseFrom(CharSequence sequence, char replacement) {
   1061     int first = indexIn(sequence);
   1062     if (first == -1) {
   1063       return sequence.toString();
   1064     }
   1065 
   1066     // TODO: this implementation can probably be made faster.
   1067 
   1068     StringBuilder builder = new StringBuilder(sequence.length())
   1069         .append(sequence.subSequence(0, first))
   1070         .append(replacement);
   1071     boolean in = true;
   1072     for (int i = first + 1; i < sequence.length(); i++) {
   1073       char c = sequence.charAt(i);
   1074       if (apply(c)) {
   1075         if (!in) {
   1076           builder.append(replacement);
   1077           in = true;
   1078         }
   1079       } else {
   1080         builder.append(c);
   1081         in = false;
   1082       }
   1083     }
   1084     return builder.toString();
   1085   }
   1086 
   1087   /**
   1088    * Collapses groups of matching characters exactly as {@link #collapseFrom}
   1089    * does, except that groups of matching characters at the start or end of the
   1090    * sequence are removed without replacement.
   1091    */
   1092   public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
   1093     int first = negate().indexIn(sequence);
   1094     if (first == -1) {
   1095       return ""; // everything matches. nothing's left.
   1096     }
   1097     StringBuilder builder = new StringBuilder(sequence.length());
   1098     boolean inMatchingGroup = false;
   1099     for (int i = first; i < sequence.length(); i++) {
   1100       char c = sequence.charAt(i);
   1101       if (apply(c)) {
   1102         inMatchingGroup = true;
   1103       } else {
   1104         if (inMatchingGroup) {
   1105           builder.append(replacement);
   1106           inMatchingGroup = false;
   1107         }
   1108         builder.append(c);
   1109       }
   1110     }
   1111     return builder.toString();
   1112   }
   1113 
   1114   // Predicate interface
   1115 
   1116   /**
   1117    * Returns {@code true} if this matcher matches the given character.
   1118    *
   1119    * @throws NullPointerException if {@code character} is null
   1120    */
   1121   /*@Override*/ public boolean apply(Character character) {
   1122     return matches(character);
   1123   }
   1124 }