Home | History | Annotate | Download | only in strings
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef BASE_STRINGS_STRING_TOKENIZER_H_
      6 #define BASE_STRINGS_STRING_TOKENIZER_H_
      7 
      8 #include <algorithm>
      9 #include <string>
     10 
     11 #include "base/strings/string_piece.h"
     12 
     13 namespace base {
     14 
// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
     19 //
     20 // Warning: be careful not to pass a C string into the 2-arg constructor:
     21 // StringTokenizer t("this is a test", " ");  // WRONG
     22 // This will create a temporary std::string, save the begin() and end()
     23 // iterators, and then the string will be freed before we actually start
     24 // tokenizing it.
     25 // Instead, use a std::string or use the 3 arg constructor of CStringTokenizer.
     26 //
     27 //
     28 // EXAMPLE 1:
     29 //
     30 //   char input[] = "this is a test";
     31 //   CStringTokenizer t(input, input + strlen(input), " ");
     32 //   while (t.GetNext()) {
     33 //     printf("%s\n", t.token().c_str());
     34 //   }
     35 //
     36 // Output:
     37 //
     38 //   this
     39 //   is
     40 //   a
     41 //   test
     42 //
     43 //
     44 // EXAMPLE 2:
     45 //
     46 //   std::string input = "no-cache=\"foo, bar\", private";
     47 //   StringTokenizer t(input, ", ");
     48 //   t.set_quote_chars("\"");
     49 //   while (t.GetNext()) {
     50 //     printf("%s\n", t.token().c_str());
     51 //   }
     52 //
     53 // Output:
     54 //
     55 //   no-cache="foo, bar"
     56 //   private
     57 //
     58 //
     59 // EXAMPLE 3:
     60 //
     61 //   bool next_is_option = false, next_is_value = false;
     62 //   std::string input = "text/html; charset=UTF-8; foo=bar";
     63 //   StringTokenizer t(input, "; =");
     64 //   t.set_options(StringTokenizer::RETURN_DELIMS);
     65 //   while (t.GetNext()) {
     66 //     if (t.token_is_delim()) {
     67 //       switch (*t.token_begin()) {
     68 //         case ';':
     69 //           next_is_option = true;
     70 //           break;
     71 //         case '=':
     72 //           next_is_value = true;
     73 //           break;
     74 //       }
     75 //     } else {
     76 //       const char* label;
     77 //       if (next_is_option) {
     78 //         label = "option-name";
     79 //         next_is_option = false;
     80 //       } else if (next_is_value) {
     81 //         label = "option-value";
     82 //         next_is_value = false;
     83 //       } else {
     84 //         label = "mime-type";
     85 //       }
     86 //       printf("%s: %s\n", label, t.token().c_str());
     87 //     }
     88 //   }
     89 //
     90 //
     91 template <class str, class const_iterator>
     92 class StringTokenizerT {
     93  public:
     94   typedef typename str::value_type char_type;
     95 
     96   // Options that may be pass to set_options()
     97   enum {
     98     // Specifies the delimiters should be returned as tokens
     99     RETURN_DELIMS = 1 << 0,
    100   };
    101 
    102   // The string object must live longer than the tokenizer.  (In particular this
    103   // should not be constructed with a temporary.)
    104   StringTokenizerT(const str& string,
    105                    const str& delims) {
    106     Init(string.begin(), string.end(), delims);
    107   }
    108 
    109   StringTokenizerT(const_iterator string_begin,
    110                    const_iterator string_end,
    111                    const str& delims) {
    112     Init(string_begin, string_end, delims);
    113   }
    114 
    115   // Set the options for this tokenizer.  By default, this is 0.
    116   void set_options(int options) { options_ = options; }
    117 
    118   // Set the characters to regard as quotes.  By default, this is empty.  When
    119   // a quote char is encountered, the tokenizer will switch into a mode where
    120   // it ignores delimiters that it finds.  It switches out of this mode once it
    121   // finds another instance of the quote char.  If a backslash is encountered
    122   // within a quoted string, then the next character is skipped.
    123   void set_quote_chars(const str& quotes) { quotes_ = quotes; }
    124 
    125   // Call this method to advance the tokenizer to the next delimiter.  This
    126   // returns false if the tokenizer is complete.  This method must be called
    127   // before calling any of the token* methods.
    128   bool GetNext() {
    129     if (quotes_.empty() && options_ == 0)
    130       return QuickGetNext();
    131     else
    132       return FullGetNext();
    133   }
    134 
    135   // Start iterating through tokens from the beginning of the string.
    136   void Reset() {
    137     token_end_ = start_pos_;
    138   }
    139 
    140   // Returns true if token is a delimiter.  When the tokenizer is constructed
    141   // with the RETURN_DELIMS option, this method can be used to check if the
    142   // returned token is actually a delimiter.
    143   bool token_is_delim() const { return token_is_delim_; }
    144 
    145   // If GetNext() returned true, then these methods may be used to read the
    146   // value of the token.
    147   const_iterator token_begin() const { return token_begin_; }
    148   const_iterator token_end() const { return token_end_; }
    149   str token() const { return str(token_begin_, token_end_); }
    150   base::StringPiece token_piece() const {
    151     return base::StringPiece(&*token_begin_,
    152                              std::distance(token_begin_, token_end_));
    153   }
    154 
    155  private:
    156   void Init(const_iterator string_begin,
    157             const_iterator string_end,
    158             const str& delims) {
    159     start_pos_ = string_begin;
    160     token_begin_ = string_begin;
    161     token_end_ = string_begin;
    162     end_ = string_end;
    163     delims_ = delims;
    164     options_ = 0;
    165     token_is_delim_ = false;
    166   }
    167 
    168   // Implementation of GetNext() for when we have no quote characters. We have
    169   // two separate implementations because AdvanceOne() is a hot spot in large
    170   // text files with large tokens.
    171   bool QuickGetNext() {
    172     token_is_delim_ = false;
    173     for (;;) {
    174       token_begin_ = token_end_;
    175       if (token_end_ == end_)
    176         return false;
    177       ++token_end_;
    178       if (delims_.find(*token_begin_) == str::npos)
    179         break;
    180       // else skip over delimiter.
    181     }
    182     while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
    183       ++token_end_;
    184     return true;
    185   }
    186 
    187   // Implementation of GetNext() for when we have to take quotes into account.
    188   bool FullGetNext() {
    189     AdvanceState state;
    190     token_is_delim_ = false;
    191     for (;;) {
    192       token_begin_ = token_end_;
    193       if (token_end_ == end_)
    194         return false;
    195       ++token_end_;
    196       if (AdvanceOne(&state, *token_begin_))
    197         break;
    198       if (options_ & RETURN_DELIMS) {
    199         token_is_delim_ = true;
    200         return true;
    201       }
    202       // else skip over delimiter.
    203     }
    204     while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
    205       ++token_end_;
    206     return true;
    207   }
    208 
    209   bool IsDelim(char_type c) const {
    210     return delims_.find(c) != str::npos;
    211   }
    212 
    213   bool IsQuote(char_type c) const {
    214     return quotes_.find(c) != str::npos;
    215   }
    216 
    217   struct AdvanceState {
    218     bool in_quote;
    219     bool in_escape;
    220     char_type quote_char;
    221     AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
    222   };
    223 
    224   // Returns true if a delimiter was not hit.
    225   bool AdvanceOne(AdvanceState* state, char_type c) {
    226     if (state->in_quote) {
    227       if (state->in_escape) {
    228         state->in_escape = false;
    229       } else if (c == '\\') {
    230         state->in_escape = true;
    231       } else if (c == state->quote_char) {
    232         state->in_quote = false;
    233       }
    234     } else {
    235       if (IsDelim(c))
    236         return false;
    237       state->in_quote = IsQuote(state->quote_char = c);
    238     }
    239     return true;
    240   }
    241 
    242   const_iterator start_pos_;
    243   const_iterator token_begin_;
    244   const_iterator token_end_;
    245   const_iterator end_;
    246   str delims_;
    247   str quotes_;
    248   int options_;
    249   bool token_is_delim_;
    250 };
    251 
    252 typedef StringTokenizerT<std::string, std::string::const_iterator>
    253     StringTokenizer;
    254 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
    255     WStringTokenizer;
    256 typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
    257 
    258 }  // namespace base
    259 
    260 #endif  // BASE_STRINGS_STRING_TOKENIZER_H_
    261