Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef BASE_STRING_TOKENIZER_H_
      6 #define BASE_STRING_TOKENIZER_H_
      7 
      8 #include <string>
      9 
     10 // StringTokenizerT is a simple string tokenizer class.  It works like an
     11 // iterator that with each step (see the GetNext method) updates members that
     12 // refer to the next token in the input string.  The user may optionally
     13 // configure the tokenizer to return delimiters.
     14 //
     15 // Warning: be careful not to pass a C string into the 2-arg constructor:
     16 // StringTokenizer t("this is a test", " ");  // WRONG
     17 // This will create a temporary std::string, save the begin() and end()
     18 // iterators, and then the string will be freed before we actually start
     19 // tokenizing it.
     20 // Instead, use a std::string or use the 3 arg constructor of CStringTokenizer.
     21 //
     22 //
     23 // EXAMPLE 1:
     24 //
     25 //   char input[] = "this is a test";
     26 //   CStringTokenizer t(input, input + strlen(input), " ");
     27 //   while (t.GetNext()) {
     28 //     printf("%s\n", t.token().c_str());
     29 //   }
     30 //
     31 // Output:
     32 //
     33 //   this
     34 //   is
     35 //   a
     36 //   test
     37 //
     38 //
     39 // EXAMPLE 2:
     40 //
     41 //   std::string input = "no-cache=\"foo, bar\", private";
     42 //   StringTokenizer t(input, ", ");
     43 //   t.set_quote_chars("\"");
     44 //   while (t.GetNext()) {
     45 //     printf("%s\n", t.token().c_str());
     46 //   }
     47 //
     48 // Output:
     49 //
     50 //   no-cache="foo, bar"
     51 //   private
     52 //
     53 //
     54 // EXAMPLE 3:
     55 //
     56 //   bool next_is_option = false, next_is_value = false;
     57 //   std::string input = "text/html; charset=UTF-8; foo=bar";
     58 //   StringTokenizer t(input, "; =");
     59 //   t.set_options(StringTokenizer::RETURN_DELIMS);
     60 //   while (t.GetNext()) {
     61 //     if (t.token_is_delim()) {
     62 //       switch (*t.token_begin()) {
     63 //         case ';':
     64 //           next_is_option = true;
     65 //           break;
     66 //         case '=':
     67 //           next_is_value = true;
     68 //           break;
     69 //       }
     70 //     } else {
     71 //       const char* label;
     72 //       if (next_is_option) {
     73 //         label = "option-name";
     74 //         next_is_option = false;
     75 //       } else if (next_is_value) {
     76 //         label = "option-value";
     77 //         next_is_value = false;
     78 //       } else {
     79 //         label = "mime-type";
     80 //       }
     81 //       printf("%s: %s\n", label, t.token().c_str());
     82 //     }
     83 //   }
     84 //
     85 //
     86 template <class str, class const_iterator>
     87 class StringTokenizerT {
     88  public:
     89   typedef typename str::value_type char_type;
     90 
     91   // Options that may be pass to set_options()
     92   enum {
     93     // Specifies the delimiters should be returned as tokens
     94     RETURN_DELIMS = 1 << 0,
     95   };
     96 
     97   // The string object must live longer than the tokenizer.  (In particular this
     98   // should not be constructed with a temporary.)
     99   StringTokenizerT(const str& string,
    100                    const str& delims) {
    101     Init(string.begin(), string.end(), delims);
    102   }
    103 
    104   StringTokenizerT(const_iterator string_begin,
    105                    const_iterator string_end,
    106                    const str& delims) {
    107     Init(string_begin, string_end, delims);
    108   }
    109 
    110   // Set the options for this tokenizer.  By default, this is 0.
    111   void set_options(int options) { options_ = options; }
    112 
    113   // Set the characters to regard as quotes.  By default, this is empty.  When
    114   // a quote char is encountered, the tokenizer will switch into a mode where
    115   // it ignores delimiters that it finds.  It switches out of this mode once it
    116   // finds another instance of the quote char.  If a backslash is encountered
    117   // within a quoted string, then the next character is skipped.
    118   void set_quote_chars(const str& quotes) { quotes_ = quotes; }
    119 
    120   // Call this method to advance the tokenizer to the next delimiter.  This
    121   // returns false if the tokenizer is complete.  This method must be called
    122   // before calling any of the token* methods.
    123   bool GetNext() {
    124     AdvanceState state;
    125     token_is_delim_ = false;
    126     for (;;) {
    127       token_begin_ = token_end_;
    128       if (token_end_ == end_)
    129         return false;
    130       ++token_end_;
    131       if (AdvanceOne(&state, *token_begin_))
    132         break;
    133       if (options_ & RETURN_DELIMS) {
    134         token_is_delim_ = true;
    135         return true;
    136       }
    137       // else skip over delim
    138     }
    139     while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
    140       ++token_end_;
    141     return true;
    142   }
    143 
    144   // Start iterating through tokens from the beginning of the string.
    145   void Reset() {
    146     token_end_ = start_pos_;
    147   }
    148 
    149   // Returns true if token is a delimiter.  When the tokenizer is constructed
    150   // with the RETURN_DELIMS option, this method can be used to check if the
    151   // returned token is actually a delimiter.
    152   bool token_is_delim() const { return token_is_delim_; }
    153 
    154   // If GetNext() returned true, then these methods may be used to read the
    155   // value of the token.
    156   const_iterator token_begin() const { return token_begin_; }
    157   const_iterator token_end() const { return token_end_; }
    158   str token() const { return str(token_begin_, token_end_); }
    159 
    160  private:
    161   void Init(const_iterator string_begin,
    162             const_iterator string_end,
    163             const str& delims) {
    164     start_pos_ = string_begin;
    165     token_end_ = string_begin;
    166     end_ = string_end;
    167     delims_ = delims;
    168     options_ = 0;
    169   }
    170 
    171   bool IsDelim(char_type c) const {
    172     return delims_.find(c) != str::npos;
    173   }
    174 
    175   bool IsQuote(char_type c) const {
    176     return quotes_.find(c) != str::npos;
    177   }
    178 
    179   struct AdvanceState {
    180     bool in_quote;
    181     bool in_escape;
    182     char_type quote_char;
    183     AdvanceState() : in_quote(false), in_escape(false) {}
    184   };
    185 
    186   // Returns true if a delimiter was not hit.
    187   bool AdvanceOne(AdvanceState* state, char_type c) {
    188     if (state->in_quote) {
    189       if (state->in_escape) {
    190         state->in_escape = false;
    191       } else if (c == '\\') {
    192         state->in_escape = true;
    193       } else if (c == state->quote_char) {
    194         state->in_quote = false;
    195       }
    196     } else {
    197       if (IsDelim(c))
    198         return false;
    199       state->in_quote = IsQuote(state->quote_char = c);
    200     }
    201     return true;
    202   }
    203 
    204   const_iterator start_pos_;
    205   const_iterator token_begin_;
    206   const_iterator token_end_;
    207   const_iterator end_;
    208   str delims_;
    209   str quotes_;
    210   int options_;
    211   bool token_is_delim_;
    212 };
    213 
    214 typedef StringTokenizerT<std::string, std::string::const_iterator>
    215     StringTokenizer;  // std::string input, walked with its const_iterator.
    216 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
    217     WStringTokenizer;  // std::wstring input, walked with its const_iterator.
    218 typedef StringTokenizerT<std::string, const char*> CStringTokenizer;  // const char* range; token() yields std::string.
    219 
    220 #endif  // BASE_STRING_TOKENIZER_H_
    221