Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2009 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.base;
     18 
     19 import static com.google.common.base.Preconditions.checkArgument;
     20 import static com.google.common.base.Preconditions.checkNotNull;
     21 
     22 import com.google.common.annotations.Beta;
     23 import com.google.common.annotations.GwtCompatible;
     24 import com.google.common.annotations.GwtIncompatible;
     25 
     26 import java.util.ArrayList;
     27 import java.util.Collections;
     28 import java.util.Iterator;
     29 import java.util.LinkedHashMap;
     30 import java.util.List;
     31 import java.util.Map;
     32 import java.util.regex.Matcher;
     33 import java.util.regex.Pattern;
     34 
     35 import javax.annotation.CheckReturnValue;
     36 
     37 /**
     38  * Extracts non-overlapping substrings from an input string, typically by
     39  * recognizing appearances of a <i>separator</i> sequence. This separator can be
     40  * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
     41  * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
     42  * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
     43  * all, a splitter can extract adjacent substrings of a given {@linkplain
     44  * #fixedLength fixed length}.
     45  *
     46  * <p>For example, this expression: <pre>   {@code
     47  *
     48  *   Splitter.on(',').split("foo,bar,qux")}</pre>
     49  *
     50  * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
     51  * {@code "qux"}, in that order.
     52  *
     53  * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
     54  * following expression: <pre>   {@code
     55  *
     56  *   Splitter.on(',').split(" foo,,,  bar ,")}</pre>
     57  *
     58  * ... yields the substrings {@code [" foo", "", "", "  bar ", ""]}. If this
     59  * is not the desired behavior, use configuration methods to obtain a <i>new</i>
     60  * splitter instance with modified behavior: <pre>   {@code
     61  *
     62  *   private static final Splitter MY_SPLITTER = Splitter.on(',')
     63  *       .trimResults()
     64  *       .omitEmptyStrings();}</pre>
     65  *
     66  * <p>Now {@code MY_SPLITTER.split("foo,,,  bar ,")} returns just {@code ["foo",
     67  * "bar"]}. Note that the order in which these configuration methods are called
     68  * is never significant.
     69  *
     70  * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
     71  * method has no effect on the receiving instance; you must store and use the
     72  * new splitter instance it returns instead. <pre>   {@code
     73  *
     74  *   // Do NOT do this
     75  *   Splitter splitter = Splitter.on('/');
     76  *   splitter.trimResults(); // does nothing!
     77  *   return splitter.split("wrong / wrong / wrong");}</pre>
     78  *
     79  * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
     80  * input string containing {@code n} occurrences of the separator naturally
     81  * yields an iterable of size {@code n + 1}. So if the separator does not occur
     82  * anywhere in the input, a single substring is returned containing the entire
     83  * input. Consequently, all splitters split the empty string to {@code [""]}
     84  * (note: even fixed-length splitters).
     85  *
     86  * <p>Splitter instances are thread-safe immutable, and are therefore safe to
     87  * store as {@code static final} constants.
     88  *
     89  * <p>The {@link Joiner} class provides the inverse operation to splitting, but
     90  * note that a round-trip between the two should be assumed to be lossy.
     91  *
     92  * <p>See the Guava User Guide article on <a href=
     93  * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
     94  * {@code Splitter}</a>.
     95  *
     96  * @author Julien Silland
     97  * @author Jesse Wilson
     98  * @author Kevin Bourrillion
     99  * @author Louis Wasserman
    100  * @since 1.0
    101  */
    102 @GwtCompatible(emulated = true)
    103 public final class Splitter {
    104   private final CharMatcher trimmer;
    105   private final boolean omitEmptyStrings;
    106   private final Strategy strategy;
    107   private final int limit;
    108 
    109   private Splitter(Strategy strategy) {
    110     this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
    111   }
    112 
    113   private Splitter(Strategy strategy, boolean omitEmptyStrings,
    114       CharMatcher trimmer, int limit) {
    115     this.strategy = strategy;
    116     this.omitEmptyStrings = omitEmptyStrings;
    117     this.trimmer = trimmer;
    118     this.limit = limit;
    119   }
    120 
    121   /**
    122    * Returns a splitter that uses the given single-character separator. For
    123    * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
    124    * containing {@code ["foo", "", "bar"]}.
    125    *
    126    * @param separator the character to recognize as a separator
    127    * @return a splitter, with default settings, that recognizes that separator
    128    */
    129   public static Splitter on(char separator) {
    130     return on(CharMatcher.is(separator));
    131   }
    132 
    133   /**
    134    * Returns a splitter that considers any single character matched by the
    135    * given {@code CharMatcher} to be a separator. For example, {@code
    136    * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
    137    * iterable containing {@code ["foo", "", "bar", "quux"]}.
    138    *
    139    * @param separatorMatcher a {@link CharMatcher} that determines whether a
    140    *     character is a separator
    141    * @return a splitter, with default settings, that uses this matcher
    142    */
    143   public static Splitter on(final CharMatcher separatorMatcher) {
    144     checkNotNull(separatorMatcher);
    145 
    146     return new Splitter(new Strategy() {
    147       @Override public SplittingIterator iterator(
    148           Splitter splitter, final CharSequence toSplit) {
    149         return new SplittingIterator(splitter, toSplit) {
    150           @Override int separatorStart(int start) {
    151             return separatorMatcher.indexIn(toSplit, start);
    152           }
    153 
    154           @Override int separatorEnd(int separatorPosition) {
    155             return separatorPosition + 1;
    156           }
    157         };
    158       }
    159     });
    160   }
    161 
    162   /**
    163    * Returns a splitter that uses the given fixed string as a separator. For
    164    * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
    165    * iterable containing {@code ["foo", "bar,baz"]}.
    166    *
    167    * @param separator the literal, nonempty string to recognize as a separator
    168    * @return a splitter, with default settings, that recognizes that separator
    169    */
    170   public static Splitter on(final String separator) {
    171     checkArgument(separator.length() != 0,
    172         "The separator may not be the empty string.");
    173 
    174     return new Splitter(new Strategy() {
    175       @Override public SplittingIterator iterator(
    176           Splitter splitter, CharSequence toSplit) {
    177         return new SplittingIterator(splitter, toSplit) {
    178           @Override public int separatorStart(int start) {
    179             int separatorLength = separator.length();
    180 
    181             positions:
    182             for (int p = start, last = toSplit.length() - separatorLength;
    183                 p <= last; p++) {
    184               for (int i = 0; i < separatorLength; i++) {
    185                 if (toSplit.charAt(i + p) != separator.charAt(i)) {
    186                   continue positions;
    187                 }
    188               }
    189               return p;
    190             }
    191             return -1;
    192           }
    193 
    194           @Override public int separatorEnd(int separatorPosition) {
    195             return separatorPosition + separator.length();
    196           }
    197         };
    198       }
    199     });
    200   }
    201 
    202   /**
    203    * Returns a splitter that considers any subsequence matching {@code
    204    * pattern} to be a separator. For example, {@code
    205    * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
    206    * into lines whether it uses DOS-style or UNIX-style line terminators.
    207    *
    208    * @param separatorPattern the pattern that determines whether a subsequence
    209    *     is a separator. This pattern may not match the empty string.
    210    * @return a splitter, with default settings, that uses this pattern
    211    * @throws IllegalArgumentException if {@code separatorPattern} matches the
    212    *     empty string
    213    */
    214   @GwtIncompatible("java.util.regex")
    215   public static Splitter on(final Pattern separatorPattern) {
    216     checkNotNull(separatorPattern);
    217     checkArgument(!separatorPattern.matcher("").matches(),
    218         "The pattern may not match the empty string: %s", separatorPattern);
    219 
    220     return new Splitter(new Strategy() {
    221       @Override public SplittingIterator iterator(
    222           final Splitter splitter, CharSequence toSplit) {
    223         final Matcher matcher = separatorPattern.matcher(toSplit);
    224         return new SplittingIterator(splitter, toSplit) {
    225           @Override public int separatorStart(int start) {
    226             return matcher.find(start) ? matcher.start() : -1;
    227           }
    228 
    229           @Override public int separatorEnd(int separatorPosition) {
    230             return matcher.end();
    231           }
    232         };
    233       }
    234     });
    235   }
    236 
    237   /**
    238    * Returns a splitter that considers any subsequence matching a given
    239    * pattern (regular expression) to be a separator. For example, {@code
    240    * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
    241    * whether it uses DOS-style or UNIX-style line terminators. This is
    242    * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
    243    *
    244    * @param separatorPattern the pattern that determines whether a subsequence
    245    *     is a separator. This pattern may not match the empty string.
    246    * @return a splitter, with default settings, that uses this pattern
    247    * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
    248    *     is a malformed expression
    249    * @throws IllegalArgumentException if {@code separatorPattern} matches the
    250    *     empty string
    251    */
    252   @GwtIncompatible("java.util.regex")
    253   public static Splitter onPattern(String separatorPattern) {
    254     return on(Pattern.compile(separatorPattern));
    255   }
    256 
    257   /**
    258    * Returns a splitter that divides strings into pieces of the given length.
    259    * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
    260    * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
    261    * smaller than {@code length} but will never be empty.
    262    *
    263    * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
    264    * split("")} does not yield an empty iterable, but an iterable containing
    265    * {@code ""}. This is the only case in which {@code
    266    * Iterables.size(split(input))} does not equal {@code
    267    * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
    268    * use {@code omitEmptyStrings}.
    269    *
    270    * @param length the desired length of pieces after splitting, a positive
    271    *     integer
    272    * @return a splitter, with default settings, that can split into fixed sized
    273    *     pieces
    274    * @throws IllegalArgumentException if {@code length} is zero or negative
    275    */
    276   public static Splitter fixedLength(final int length) {
    277     checkArgument(length > 0, "The length may not be less than 1");
    278 
    279     return new Splitter(new Strategy() {
    280       @Override public SplittingIterator iterator(
    281           final Splitter splitter, CharSequence toSplit) {
    282         return new SplittingIterator(splitter, toSplit) {
    283           @Override public int separatorStart(int start) {
    284             int nextChunkStart = start + length;
    285             return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
    286           }
    287 
    288           @Override public int separatorEnd(int separatorPosition) {
    289             return separatorPosition;
    290           }
    291         };
    292       }
    293     });
    294   }
    295 
    296   /**
    297    * Returns a splitter that behaves equivalently to {@code this} splitter, but
    298    * automatically omits empty strings from the results. For example, {@code
    299    * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
    300    * iterable containing only {@code ["a", "b", "c"]}.
    301    *
    302    * <p>If either {@code trimResults} option is also specified when creating a
    303    * splitter, that splitter always trims results first before checking for
    304    * emptiness. So, for example, {@code
    305    * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
    306    * an empty iterable.
    307    *
    308    * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
    309    * to return an empty iterable, but when using this option, it can (if the
    310    * input sequence consists of nothing but separators).
    311    *
    312    * @return a splitter with the desired configuration
    313    */
    314   @CheckReturnValue
    315   public Splitter omitEmptyStrings() {
    316     return new Splitter(strategy, true, trimmer, limit);
    317   }
    318 
    319   /**
    320    * Returns a splitter that behaves equivalently to {@code this} splitter but
    321    * stops splitting after it reaches the limit.
    322    * The limit defines the maximum number of items returned by the iterator.
    323    *
    324    * <p>For example,
    325    * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
    326    * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
    327    * omitted strings do no count.  Hence,
    328    * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
    329    * returns an iterable containing {@code ["a", "b", "c,d"}.
    330    * When trim is requested, all entries, including the last are trimmed.  Hence
    331    * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
    332    * results in @{code ["a", "b", "c , d"]}.
    333    *
    334    * @param limit the maximum number of items returns
    335    * @return a splitter with the desired configuration
    336    * @since 9.0
    337    */
    338   @CheckReturnValue
    339   public Splitter limit(int limit) {
    340     checkArgument(limit > 0, "must be greater than zero: %s", limit);
    341     return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
    342   }
    343 
    344   /**
    345    * Returns a splitter that behaves equivalently to {@code this} splitter, but
    346    * automatically removes leading and trailing {@linkplain
    347    * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
    348    * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
    349    * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
    350    * containing {@code ["a", "b", "c"]}.
    351    *
    352    * @return a splitter with the desired configuration
    353    */
    354   @CheckReturnValue
    355   public Splitter trimResults() {
    356     return trimResults(CharMatcher.WHITESPACE);
    357   }
    358 
    359   /**
    360    * Returns a splitter that behaves equivalently to {@code this} splitter, but
    361    * removes all leading or trailing characters matching the given {@code
    362    * CharMatcher} from each returned substring. For example, {@code
    363    * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
    364    * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
    365    *
    366    * @param trimmer a {@link CharMatcher} that determines whether a character
    367    *     should be removed from the beginning/end of a subsequence
    368    * @return a splitter with the desired configuration
    369    */
    370   // TODO(kevinb): throw if a trimmer was already specified!
    371   @CheckReturnValue
    372   public Splitter trimResults(CharMatcher trimmer) {
    373     checkNotNull(trimmer);
    374     return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
    375   }
    376 
    377   /**
    378    * Splits {@code sequence} into string components and makes them available
    379    * through an {@link Iterator}, which may be lazily evaluated. If you want
    380    * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}.
    381    *
    382    * @param sequence the sequence of characters to split
    383    * @return an iteration over the segments split from the parameter.
    384    */
    385   public Iterable<String> split(final CharSequence sequence) {
    386     checkNotNull(sequence);
    387 
    388     return new Iterable<String>() {
    389       @Override public Iterator<String> iterator() {
    390         return splittingIterator(sequence);
    391       }
    392       @Override public String toString() {
    393         return Joiner.on(", ")
    394             .appendTo(new StringBuilder().append('['), this)
    395             .append(']')
    396             .toString();
    397       }
    398     };
    399   }
    400 
    401   private Iterator<String> splittingIterator(CharSequence sequence) {
    402     return strategy.iterator(this, sequence);
    403   }
    404 
    405   /**
    406    * Splits {@code sequence} into string components and returns them as
    407    * an immutable list. If you want an {@link Iterable} which may be lazily
    408    * evaluated, use {@link #split(CharSequence)}.
    409    *
    410    * @param sequence the sequence of characters to split
    411    * @return an immutable list of the segments split from the parameter
    412    * @since 15.0
    413    */
    414   @Beta
    415   public List<String> splitToList(CharSequence sequence) {
    416     checkNotNull(sequence);
    417 
    418     Iterator<String> iterator = splittingIterator(sequence);
    419     List<String> result = new ArrayList<String>();
    420 
    421     while (iterator.hasNext()) {
    422       result.add(iterator.next());
    423     }
    424 
    425     return Collections.unmodifiableList(result);
    426   }
    427 
    428   /**
    429    * Returns a {@code MapSplitter} which splits entries based on this splitter,
    430    * and splits entries into keys and values using the specified separator.
    431    *
    432    * @since 10.0
    433    */
    434   @CheckReturnValue
    435   @Beta
    436   public MapSplitter withKeyValueSeparator(String separator) {
    437     return withKeyValueSeparator(on(separator));
    438   }
    439 
    440   /**
    441    * Returns a {@code MapSplitter} which splits entries based on this splitter,
    442    * and splits entries into keys and values using the specified separator.
    443    *
    444    * @since 14.0
    445    */
    446   @CheckReturnValue
    447   @Beta
    448   public MapSplitter withKeyValueSeparator(char separator) {
    449     return withKeyValueSeparator(on(separator));
    450   }
    451 
    452   /**
    453    * Returns a {@code MapSplitter} which splits entries based on this splitter,
    454    * and splits entries into keys and values using the specified key-value
    455    * splitter.
    456    *
    457    * @since 10.0
    458    */
    459   @CheckReturnValue
    460   @Beta
    461   public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
    462     return new MapSplitter(this, keyValueSplitter);
    463   }
    464 
    465   /**
    466    * An object that splits strings into maps as {@code Splitter} splits
    467    * iterables and lists. Like {@code Splitter}, it is thread-safe and
    468    * immutable.
    469    *
    470    * @since 10.0
    471    */
    472   @Beta
    473   public static final class MapSplitter {
    474     private static final String INVALID_ENTRY_MESSAGE =
    475         "Chunk [%s] is not a valid entry";
    476     private final Splitter outerSplitter;
    477     private final Splitter entrySplitter;
    478 
    479     private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
    480       this.outerSplitter = outerSplitter; // only "this" is passed
    481       this.entrySplitter = checkNotNull(entrySplitter);
    482     }
    483 
    484     /**
    485      * Splits {@code sequence} into substrings, splits each substring into
    486      * an entry, and returns an unmodifiable map with each of the entries. For
    487      * example, <code>
    488      * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
    489      * .split("a=>b ; c=>b")
    490      * </code> will return a mapping from {@code "a"} to {@code "b"} and
    491      * {@code "c"} to {@code b}.
    492      *
    493      * <p>The returned map preserves the order of the entries from
    494      * {@code sequence}.
    495      *
    496      * @throws IllegalArgumentException if the specified sequence does not split
    497      *         into valid map entries, or if there are duplicate keys
    498      */
    499     public Map<String, String> split(CharSequence sequence) {
    500       Map<String, String> map = new LinkedHashMap<String, String>();
    501       for (String entry : outerSplitter.split(sequence)) {
    502         Iterator<String> entryFields = entrySplitter.splittingIterator(entry);
    503 
    504         checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
    505         String key = entryFields.next();
    506         checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
    507 
    508         checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
    509         String value = entryFields.next();
    510         map.put(key, value);
    511 
    512         checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
    513       }
    514       return Collections.unmodifiableMap(map);
    515     }
    516   }
    517 
    518   private interface Strategy {
    519     Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
    520   }
    521 
    522   private abstract static class SplittingIterator extends AbstractIterator<String> {
    523     final CharSequence toSplit;
    524     final CharMatcher trimmer;
    525     final boolean omitEmptyStrings;
    526 
    527     /**
    528      * Returns the first index in {@code toSplit} at or after {@code start}
    529      * that contains the separator.
    530      */
    531     abstract int separatorStart(int start);
    532 
    533     /**
    534      * Returns the first index in {@code toSplit} after {@code
    535      * separatorPosition} that does not contain a separator. This method is only
    536      * invoked after a call to {@code separatorStart}.
    537      */
    538     abstract int separatorEnd(int separatorPosition);
    539 
    540     int offset = 0;
    541     int limit;
    542 
    543     protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
    544       this.trimmer = splitter.trimmer;
    545       this.omitEmptyStrings = splitter.omitEmptyStrings;
    546       this.limit = splitter.limit;
    547       this.toSplit = toSplit;
    548     }
    549 
    550     @Override protected String computeNext() {
    551       /*
    552        * The returned string will be from the end of the last match to the
    553        * beginning of the next one. nextStart is the start position of the
    554        * returned substring, while offset is the place to start looking for a
    555        * separator.
    556        */
    557       int nextStart = offset;
    558       while (offset != -1) {
    559         int start = nextStart;
    560         int end;
    561 
    562         int separatorPosition = separatorStart(offset);
    563         if (separatorPosition == -1) {
    564           end = toSplit.length();
    565           offset = -1;
    566         } else {
    567           end = separatorPosition;
    568           offset = separatorEnd(separatorPosition);
    569         }
    570         if (offset == nextStart) {
    571           /*
    572            * This occurs when some pattern has an empty match, even if it
    573            * doesn't match the empty string -- for example, if it requires
    574            * lookahead or the like. The offset must be increased to look for
    575            * separators beyond this point, without changing the start position
    576            * of the next returned substring -- so nextStart stays the same.
    577            */
    578           offset++;
    579           if (offset >= toSplit.length()) {
    580             offset = -1;
    581           }
    582           continue;
    583         }
    584 
    585         while (start < end && trimmer.matches(toSplit.charAt(start))) {
    586           start++;
    587         }
    588         while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
    589           end--;
    590         }
    591 
    592         if (omitEmptyStrings && start == end) {
    593           // Don't include the (unused) separator in next split string.
    594           nextStart = offset;
    595           continue;
    596         }
    597 
    598         if (limit == 1) {
    599           // The limit has been reached, return the rest of the string as the
    600           // final item.  This is tested after empty string removal so that
    601           // empty strings do not count towards the limit.
    602           end = toSplit.length();
    603           offset = -1;
    604           // Since we may have changed the end, we need to trim it again.
    605           while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
    606             end--;
    607           }
    608         } else {
    609           limit--;
    610         }
    611 
    612         return toSplit.subSequence(start, end).toString();
    613       }
    614       return endOfData();
    615     }
    616   }
    617 }
    618