/*
 * Copyright (C) 2009 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
     16 
     17 package com.google.android.mail.common.base;
     18 
     19 import static com.google.android.mail.common.base.Preconditions.checkArgument;
     20 import static com.google.android.mail.common.base.Preconditions.checkNotNull;
     21 import static com.google.android.mail.common.base.Preconditions.checkState;
     22 
     23 import com.google.common.base.Joiner;
     24 
     25 import java.util.Iterator;
     26 import java.util.NoSuchElementException;
     27 import java.util.StringTokenizer;
     28 import java.util.regex.Matcher;
     29 import java.util.regex.Pattern;
     30 import java.util.regex.PatternSyntaxException;
     31 
     32 /**
     33  * An object that divides strings (or other instances of {@code CharSequence})
     34  * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
     35  * which can be expressed as a single character, literal string, regular
     36  * expression, {@code CharMatcher}, or by using a fixed substring length. This
     37  * class provides the complementary functionality to {@link Joiner}.
     38  *
     39  * <p>Here is the most basic example of {@code Splitter} usage: <pre>   {@code
     40  *
     41  *   Splitter.on(',').split("foo,bar")}</pre>
     42  *
     43  * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
     44  * and {@code "bar"}, in that order.
     45  *
     46  * <p>By default {@code Splitter}'s behavior is very simplistic: <pre>   {@code
     47  *
     48  *   Splitter.on(',').split("foo,,bar,  quux")}</pre>
     49  *
     50  * This returns an iterable containing {@code ["foo", "", "bar", "  quux"]}.
     51  * Notice that the splitter does not assume that you want empty strings removed,
     52  * or that you wish to trim whitespace. If you want features like these, simply
     53  * ask for them: <pre> {@code
     54  *
     55  *   private static final Splitter MY_SPLITTER = Splitter.on(',')
     56  *       .trimResults()
     57  *       .omitEmptyStrings();}</pre>
     58  *
     59  * Now {@code MY_SPLITTER.split("foo, ,bar,  quux,")} returns an iterable
     60  * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
     61  * the configuration methods are called is never significant; for instance,
     62  * trimming is always applied first before checking for an empty result,
     63  * regardless of the order in which the {@link #trimResults()} and
     64  * {@link #omitEmptyStrings()} methods were invoked.
     65  *
     66  * <p><b>Warning: splitter instances are always immutable</b>; a configuration
     67  * method such as {@code omitEmptyStrings} has no effect on the instance it
     68  * is invoked on! You must store and use the new splitter instance returned by
     69  * the method. This makes splitters thread-safe, and safe to store as {@code
     70  * static final} constants (as illustrated above). <pre>   {@code
     71  *
     72  *   // Bad! Do not do this!
     73  *   Splitter splitter = Splitter.on('/');
     74  *   splitter.trimResults(); // does nothing!
     75  *   return splitter.split("wrong / wrong / wrong");}</pre>
     76  *
     77  * The separator recognized by the splitter does not have to be a single
     78  * literal character as in the examples above. See the methods {@link
     79  * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
     80  * of other ways to specify separators.
     81  *
     82  * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
     83  * similar JDK methods; for instance, it does not silently discard trailing
     84  * separators, as does {@link String#split(String)}, nor does it have a default
     85  * behavior of using five particular whitespace characters as separators, like
     86  * {@link StringTokenizer}.
     87  *
     88  * @author Julien Silland
     89  * @author Jesse Wilson
     90  * @author Kevin Bourrillion
     91  * @since 2009.09.15 <b>tentative</b>
     92  */
     93 public final class Splitter {
     94   private final CharMatcher trimmer;
     95   private final boolean omitEmptyStrings;
     96   private final Strategy strategy;
     97 
     98   private Splitter(Strategy strategy) {
     99     this(strategy, false, CharMatcher.NONE);
    100   }
    101 
    102   private Splitter(Strategy strategy, boolean omitEmptyStrings,
    103       CharMatcher trimmer) {
    104     this.strategy = strategy;
    105     this.omitEmptyStrings = omitEmptyStrings;
    106     this.trimmer = trimmer;
    107   }
    108 
    109   /**
    110    * Returns a splitter that uses the given single-character separator. For
    111    * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
    112    * containing {@code ["foo", "", "bar"]}.
    113    *
    114    * @param separator the character to recognize as a separator
    115    * @return a splitter, with default settings, that recognizes that separator
    116    */
    117   public static Splitter on(char separator) {
    118     return on(CharMatcher.is(separator));
    119   }
    120 
    121   /**
    122    * Returns a splitter that considers any single character matched by the
    123    * given {@code CharMatcher} to be a separator. For example, {@code
    124    * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
    125    * iterable containing {@code ["foo", "", "bar", "quux"]}.
    126    *
    127    * @param separatorMatcher a {@link CharMatcher} that determines whether a
    128    *     character is a separator
    129    * @return a splitter, with default settings, that uses this matcher
    130    */
    131   public static Splitter on(final CharMatcher separatorMatcher) {
    132     checkNotNull(separatorMatcher);
    133 
    134     return new Splitter(new Strategy() {
    135       /*@Override*/ public SplittingIterator iterator(
    136           Splitter splitter, final CharSequence toSplit) {
    137         return new SplittingIterator(splitter, toSplit) {
    138           @Override int separatorStart(int start) {
    139             return separatorMatcher.indexIn(toSplit, start);
    140           }
    141 
    142           @Override int separatorEnd(int separatorPosition) {
    143             return separatorPosition + 1;
    144           }
    145         };
    146       }
    147     });
    148   }
    149 
    150   /**
    151    * Returns a splitter that uses the given fixed string as a separator. For
    152    * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
    153    * iterable containing {@code ["foo", "bar", "baz,qux"]}.
    154    *
    155    * @param separator the literal, nonempty string to recognize as a separator
    156    * @return a splitter, with default settings, that recognizes that separator
    157    */
    158   public static Splitter on(final String separator) {
    159     checkArgument(separator.length() != 0,
    160         "The separator may not be the empty string.");
    161 
    162     return new Splitter(new Strategy() {
    163       /*@Override*/ public SplittingIterator iterator(
    164           Splitter splitter, CharSequence toSplit) {
    165         return new SplittingIterator(splitter, toSplit) {
    166           @Override public int separatorStart(int start) {
    167             int delimeterLength = separator.length();
    168 
    169             positions:
    170             for (int p = start, last = toSplit.length() - delimeterLength;
    171                 p <= last; p++) {
    172               for (int i = 0; i < delimeterLength; i++) {
    173                 if (toSplit.charAt(i + p) != separator.charAt(i)) {
    174                   continue positions;
    175                 }
    176               }
    177               return p;
    178             }
    179             return -1;
    180           }
    181 
    182           @Override public int separatorEnd(int separatorPosition) {
    183             return separatorPosition + separator.length();
    184           }
    185         };
    186       }
    187     });
    188   }
    189 
    190   /**
    191    * Returns a splitter that considers any subsequence matching {@code
    192    * pattern} to be a separator. For example, {@code
    193    * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
    194    * into lines whether it uses DOS-style or UNIX-style line terminators.
    195    *
    196    * @param separatorPattern the pattern that determines whether a subsequence
    197    *     is a separator. This pattern may not match the empty string.
    198    * @return a splitter, with default settings, that uses this pattern
    199    * @throws IllegalArgumentException if {@code separatorPattern} matches the
    200    *     empty string
    201    */
    202   public static Splitter on(final Pattern separatorPattern) {
    203     checkNotNull(separatorPattern);
    204     checkArgument(!separatorPattern.matcher("").matches(),
    205         "The pattern may not match the empty string: %s", separatorPattern);
    206 
    207     return new Splitter(new Strategy() {
    208       /*@Override*/ public SplittingIterator iterator(
    209           final Splitter splitter, CharSequence toSplit) {
    210         final Matcher matcher = separatorPattern.matcher(toSplit);
    211         return new SplittingIterator(splitter, toSplit) {
    212           @Override public int separatorStart(int start) {
    213             return matcher.find(start) ? matcher.start() : -1;
    214           }
    215 
    216           @Override public int separatorEnd(int separatorPosition) {
    217             return matcher.end();
    218           }
    219         };
    220       }
    221     });
    222   }
    223 
    224   /**
    225    * Returns a splitter that considers any subsequence matching a given
    226    * pattern (regular expression) to be a separator. For example, {@code
    227    * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
    228    * whether it uses DOS-style or UNIX-style line terminators. This is
    229    * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
    230    *
    231    * @param separatorPattern the pattern that determines whether a subsequence
    232    *     is a separator. This pattern may not match the empty string.
    233    * @return a splitter, with default settings, that uses this pattern
    234    * @throws PatternSyntaxException if {@code separatorPattern} is a malformed
    235    *     expression
    236    * @throws IllegalArgumentException if {@code separatorPattern} matches the
    237    *     empty string
    238    */
    239   public static Splitter onPattern(String separatorPattern) {
    240     return on(Pattern.compile(separatorPattern));
    241   }
    242 
    243   /**
    244    * Returns a splitter that divides strings into pieces of the given length.
    245    * For example, {@code Splitter.atEach(2).split("abcde")} returns an
    246    * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
    247    * smaller than {@code length} but will never be empty.
    248    *
    249    * @param length the desired length of pieces after splitting
    250    * @return a splitter, with default settings, that can split into fixed sized
    251    *     pieces
    252    */
    253   public static Splitter fixedLength(final int length) {
    254     checkArgument(length > 0, "The length may not be less than 1");
    255 
    256     return new Splitter(new Strategy() {
    257       /*@Override*/ public SplittingIterator iterator(
    258           final Splitter splitter, CharSequence toSplit) {
    259         return new SplittingIterator(splitter, toSplit) {
    260           @Override public int separatorStart(int start) {
    261             int nextChunkStart = start + length;
    262             return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
    263           }
    264 
    265           @Override public int separatorEnd(int separatorPosition) {
    266             return separatorPosition;
    267           }
    268         };
    269       }
    270     });
    271   }
    272 
    273   /**
    274    * Returns a splitter that behaves equivalently to {@code this} splitter, but
    275    * automatically omits empty strings from the results. For example, {@code
    276    * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
    277    * iterable containing only {@code ["a", "b", "c"]}.
    278    *
    279    * <p>If either {@code trimResults} option is also specified when creating a
    280    * splitter, that splitter always trims results first before checking for
    281    * emptiness. So, for example, {@code
    282    * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
    283    * an empty iterable.
    284    *
    285    * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
    286    * to return an empty iterable, but when using this option, it can (if the
    287    * input sequence consists of nothing but separators).
    288    *
    289    * @return a splitter with the desired configuration
    290    */
    291   public Splitter omitEmptyStrings() {
    292     return new Splitter(strategy, true, trimmer);
    293   }
    294 
    295   /**
    296    * Returns a splitter that behaves equivalently to {@code this} splitter, but
    297    * automatically removes leading and trailing {@linkplain
    298    * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
    299    * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
    300    * Splitter.on(',').trimResults().split(" a, b  ,c  ")} returns an iterable
    301    * containing {@code ["a", "b", "c"]}.
    302    *
    303    * @return a splitter with the desired configuration
    304    */
    305   public Splitter trimResults() {
    306     return trimResults(CharMatcher.WHITESPACE);
    307   }
    308 
    309   /**
    310    * Returns a splitter that behaves equivalently to {@code this} splitter, but
    311    * removes all leading or trailing characters matching the given {@code
    312    * CharMatcher} from each returned substring. For example, {@code
    313    * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
    314    * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
    315    *
    316    * @param trimmer a {@link CharMatcher} that determines whether a character
    317    *     should be removed from the beginning/end of a subsequence
    318    * @return a splitter with the desired configuration
    319    */
    320   public Splitter trimResults(CharMatcher trimmer) {
    321     checkNotNull(trimmer);
    322     return new Splitter(strategy, omitEmptyStrings, trimmer);
    323   }
    324 
    325   /**
    326    * Splits the {@link CharSequence} passed in parameter.
    327    *
    328    * @param sequence the sequence of characters to split
    329    * @return an iteration over the segments split from the parameter.
    330    */
    331   public Iterable<String> split(final CharSequence sequence) {
    332     checkNotNull(sequence);
    333 
    334     return new Iterable<String>() {
    335       /*@Override*/ public Iterator<String> iterator() {
    336         return strategy.iterator(Splitter.this, sequence);
    337       }
    338     };
    339   }
    340 
    341   private interface Strategy {
    342     Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
    343   }
    344 
    345   private abstract static class SplittingIterator
    346       extends AbstractIterator<String> {
    347     final CharSequence toSplit;
    348     final CharMatcher trimmer;
    349     final boolean omitEmptyStrings;
    350 
    351     /**
    352      * Returns the first index in {@code toSplit} at or after {@code start}
    353      * that contains the separator.
    354      */
    355     abstract int separatorStart(int start);
    356 
    357     /**
    358      * Returns the first index in {@code toSplit} after {@code
    359      * separatorPosition} that does not contain a separator. This method is only
    360      * invoked after a call to {@code separatorStart}.
    361      */
    362     abstract int separatorEnd(int separatorPosition);
    363 
    364     int offset = 0;
    365 
    366     protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
    367       this.trimmer = splitter.trimmer;
    368       this.omitEmptyStrings = splitter.omitEmptyStrings;
    369       this.toSplit = toSplit;
    370     }
    371 
    372     @Override protected String computeNext() {
    373       while (offset != -1) {
    374         int start = offset;
    375         int end;
    376 
    377         int separatorPosition = separatorStart(offset);
    378         if (separatorPosition == -1) {
    379           end = toSplit.length();
    380           offset = -1;
    381         } else {
    382           end = separatorPosition;
    383           offset = separatorEnd(separatorPosition);
    384         }
    385 
    386         while (start < end && trimmer.matches(toSplit.charAt(start))) {
    387           start++;
    388         }
    389         while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
    390           end--;
    391         }
    392 
    393         if (omitEmptyStrings && start == end) {
    394           continue;
    395         }
    396 
    397         return toSplit.subSequence(start, end).toString();
    398       }
    399       return endOfData();
    400     }
    401   }
    402 
    403   /*
    404    * Copied from common.collect.AbstractIterator. TODO: un-fork once these
    405    * packages have been combined into a single library.
    406    */
    407   private static abstract class AbstractIterator<T> implements Iterator<T> {
    408     State state = State.NOT_READY;
    409 
    410     enum State {
    411       READY, NOT_READY, DONE, FAILED,
    412     }
    413 
    414     T next;
    415 
    416     protected abstract T computeNext();
    417 
    418     protected final T endOfData() {
    419       state = State.DONE;
    420       return null;
    421     }
    422 
    423     public final boolean hasNext() {
    424       checkState(state != State.FAILED);
    425       switch (state) {
    426         case DONE:
    427           return false;
    428         case READY:
    429           return true;
    430         default:
    431       }
    432       return tryToComputeNext();
    433     }
    434 
    435     boolean tryToComputeNext() {
    436       state = State.FAILED; // temporary pessimism
    437       next = computeNext();
    438       if (state != State.DONE) {
    439         state = State.READY;
    440         return true;
    441       }
    442       return false;
    443     }
    444 
    445     public final T next() {
    446       if (!hasNext()) {
    447         throw new NoSuchElementException();
    448       }
    449       state = State.NOT_READY;
    450       return next;
    451     }
    452 
    453     /*@Override*/ public void remove() {
    454       throw new UnsupportedOperationException();
    455     }
    456   }
    457 }
    458