Home | History | Annotate | Download | only in io
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // http://code.google.com/p/protobuf/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton (at) google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 //
     35 // Class for parsing tokenized text from a ZeroCopyInputStream.
     36 
     37 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
     38 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
     39 
     40 #include <string>
     41 #include <vector>
     42 #include <google/protobuf/stubs/common.h>
     43 
     44 namespace google {
     45 namespace protobuf {
     46 namespace io {
     47 
     48 class ZeroCopyInputStream;     // zero_copy_stream.h
     49 
     50 // Defined in this file.
     51 class ErrorCollector;
     52 class Tokenizer;
     53 
     54 // Abstract interface for an object which collects the errors that occur
     55 // during parsing.  A typical implementation might simply print the errors
     56 // to stdout.
     57 class LIBPROTOBUF_EXPORT ErrorCollector {
     58  public:
     59   inline ErrorCollector() {}
     60   virtual ~ErrorCollector();
     61 
     62   // Indicates that there was an error in the input at the given line and
     63   // column numbers.  The numbers are zero-based, so you may want to add
     64   // 1 to each before printing them.
     65   virtual void AddError(int line, int column, const string& message) = 0;
     66 
     67   // Indicates that there was a warning in the input at the given line and
     68   // column numbers.  The numbers are zero-based, so you may want to add
     69   // 1 to each before printing them.
     70   virtual void AddWarning(int line, int column, const string& message) { }
     71 
     72  private:
     73   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
     74 };
     75 
     76 // This class converts a stream of raw text into a stream of tokens for
     77 // the protocol definition parser to parse.  The tokens recognized are
     78 // similar to those that make up the C language; see the TokenType enum for
     79 // precise descriptions.  Whitespace and comments are skipped.  By default,
     80 // C- and C++-style comments are recognized, but other styles can be used by
     81 // calling set_comment_style().
     82 class LIBPROTOBUF_EXPORT Tokenizer {
     83  public:
     84   // Construct a Tokenizer that reads and tokenizes text from the given
     85   // input stream and writes errors to the given error_collector.
     86   // The caller keeps ownership of input and error_collector.
     87   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
     88   ~Tokenizer();
     89 
     90   enum TokenType {
     91     TYPE_START,       // Next() has not yet been called.
     92     TYPE_END,         // End of input reached.  "text" is empty.
     93 
     94     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
     95                       // starting with a digit.  It is an error for a number
     96                       // to be followed by an identifier with no space in
     97                       // between.
     98     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
     99                       // the digits are decimal, but a prefix of "0x" indicates
    100                       // a hex number and a leading zero indicates octal, just
    101                       // like with C numeric literals.  A leading negative sign
    102                       // is NOT included in the token; it's up to the parser to
    103                       // interpret the unary minus operator on its own.
    104     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
    105                       // an exponent.  Always in decimal.  Again, never
    106                       // negative.
    107     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
    108                       // or double quotes can be used, but they must match.
    109                       // A string literal cannot cross a line break.
    110     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
    111                       // Symbols are always a single character, so "!+$%" is
    112                       // four tokens.
    113   };
    114 
    115   // Structure representing a token read from the token stream.
    116   struct Token {
    117     TokenType type;
    118     string text;       // The exact text of the token as it appeared in
    119                        // the input.  e.g. tokens of TYPE_STRING will still
    120                        // be escaped and in quotes.
    121 
    122     // "line" and "column" specify the position of the first character of
    123     // the token within the input stream.  They are zero-based.
    124     int line;
    125     int column;
    126     int end_column;
    127   };
    128 
    129   // Get the current token.  This is updated when Next() is called.  Before
    130   // the first call to Next(), current() has type TYPE_START and no contents.
    131   const Token& current();
    132 
    133   // Return the previous token -- i.e. what current() returned before the
    134   // previous call to Next().
    135   const Token& previous();
    136 
    137   // Advance to the next token.  Returns false if the end of the input is
    138   // reached.
    139   bool Next();
    140 
    141   // Like Next(), but also collects comments which appear between the previous
    142   // and next tokens.
    143   //
    144   // Comments which appear to be attached to the previous token are stored
    145   // in *prev_tailing_comments.  Comments which appear to be attached to the
    146   // next token are stored in *next_leading_comments.  Comments appearing in
    147   // between which do not appear to be attached to either will be added to
    148   // detached_comments.  Any of these parameters can be NULL to simply discard
    149   // the comments.
    150   //
    151   // A series of line comments appearing on consecutive lines, with no other
    152   // tokens appearing on those lines, will be treated as a single comment.
    153   //
    154   // Only the comment content is returned; comment markers (e.g. //) are
    155   // stripped out.  For block comments, leading whitespace and an asterisk will
    156   // be stripped from the beginning of each line other than the first.  Newlines
    157   // are included in the output.
    158   //
    159   // Examples:
    160   //
    161   //   optional int32 foo = 1;  // Comment attached to foo.
    162   //   // Comment attached to bar.
    163   //   optional int32 bar = 2;
    164   //
    165   //   optional string baz = 3;
    166   //   // Comment attached to baz.
    167   //   // Another line attached to baz.
    168   //
    169   //   // Comment attached to qux.
    170   //   //
    171   //   // Another line attached to qux.
    172   //   optional double qux = 4;
    173   //
    174   //   // Detached comment.  This is not attached to qux or corge
    175   //   // because there are blank lines separating it from both.
    176   //
    177   //   optional string corge = 5;
    178   //   /* Block comment attached
    179   //    * to corge.  Leading asterisks
    180   //    * will be removed. */
    181   //   /* Block comment attached to
    182   //    * grault. */
    183   //   optional int32 grault = 6;
    184   bool NextWithComments(string* prev_trailing_comments,
    185                         vector<string>* detached_comments,
    186                         string* next_leading_comments);
    187 
    188   // Parse helpers ---------------------------------------------------
    189 
    190   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
    191   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
    192   // result is undefined (possibly an assert failure).
    193   static double ParseFloat(const string& text);
    194 
    195   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
    196   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
    197   // result is undefined (possibly an assert failure).
    198   static void ParseString(const string& text, string* output);
    199 
    200   // Identical to ParseString, but appends to output.
    201   static void ParseStringAppend(const string& text, string* output);
    202 
    203   // Parses a TYPE_INTEGER token.  Returns false if the result would be
    204   // greater than max_value.  Otherwise, returns true and sets *output to the
    205   // result.  If the text is not from a Token of type TYPE_INTEGER originally
    206   // parsed by a Tokenizer, the result is undefined (possibly an assert
    207   // failure).
    208   static bool ParseInteger(const string& text, uint64 max_value,
    209                            uint64* output);
    210 
    211   // Options ---------------------------------------------------------
    212 
    213   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
    214   // which would otherwise be integers but which have the 'f' suffix will be
    215   // forced to be interpreted as floats.  For all other purposes, the 'f' is
    216   // ignored.
    217   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
    218 
    219   // Valid values for set_comment_style().
    220   enum CommentStyle {
    221     // Line comments begin with "//", block comments are delimited by "/*" and
    222     // "*/".
    223     CPP_COMMENT_STYLE,
    224     // Line comments begin with "#".  No way to write block comments.
    225     SH_COMMENT_STYLE
    226   };
    227 
    228   // Sets the comment style.
    229   void set_comment_style(CommentStyle style) { comment_style_ = style; }
    230 
    231   // -----------------------------------------------------------------
    232  private:
    233   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
    234 
    235   Token current_;           // Returned by current().
    236   Token previous_;          // Returned by previous().
    237 
    238   ZeroCopyInputStream* input_;
    239   ErrorCollector* error_collector_;
    240 
    241   char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
    242   const char* buffer_;      // Current buffer returned from input_.
    243   int buffer_size_;         // Size of buffer_.
    244   int buffer_pos_;          // Current position within the buffer.
    245   bool read_error_;         // Did we previously encounter a read error?
    246 
    247   // Line and column number of current_char_ within the whole input stream.
    248   int line_;
    249   int column_;
    250 
    251   // String to which text should be appended as we advance through it.
    252   // Call RecordTo(&str) to start recording and StopRecording() to stop.
    253   // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
    254   // position within the current buffer where recording started.
    255   string* record_target_;
    256   int record_start_;
    257 
    258   // Options.
    259   bool allow_f_after_float_;
    260   CommentStyle comment_style_;
    261 
    262   // Since we count columns we need to interpret tabs somehow.  We'll take
    263   // the standard 8-character definition for lack of any way to do better.
    264   static const int kTabWidth = 8;
    265 
    266   // -----------------------------------------------------------------
    267   // Helper methods.
    268 
    269   // Consume this character and advance to the next one.
    270   void NextChar();
    271 
    272   // Read a new buffer from the input.
    273   void Refresh();
    274 
    275   inline void RecordTo(string* target);
    276   inline void StopRecording();
    277 
    278   // Called when the current character is the first character of a new
    279   // token (not including whitespace or comments).
    280   inline void StartToken();
    281   // Called when the current character is the first character after the
    282   // end of the last token.  After this returns, current_.text will
    283   // contain all text consumed since StartToken() was called.
    284   inline void EndToken();
    285 
    286   // Convenience method to add an error at the current line and column.
    287   void AddError(const string& message) {
    288     error_collector_->AddError(line_, column_, message);
    289   }
    290 
    291   // -----------------------------------------------------------------
    292   // The following four methods are used to consume tokens of specific
    293   // types.  They are actually used to consume all characters *after*
    294   // the first, since the calling function consumes the first character
    295   // in order to decide what kind of token is being read.
    296 
    297   // Read and consume a string, ending when the given delimiter is
    298   // consumed.
    299   void ConsumeString(char delimiter);
    300 
    301   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
    302   // depending on what was read.  This needs to know if the first
    303   // character was a zero in order to correctly recognize hex and octal
    304   // numbers.
    305   // It also needs to know if the first characted was a . to parse floating
    306   // point correctly.
    307   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
    308 
    309   // Consume the rest of a line.
    310   void ConsumeLineComment(string* content);
    311   // Consume until "*/".
    312   void ConsumeBlockComment(string* content);
    313 
    314   enum NextCommentStatus {
    315     // Started a line comment.
    316     LINE_COMMENT,
    317 
    318     // Started a block comment.
    319     BLOCK_COMMENT,
    320 
    321     // Consumed a slash, then realized it wasn't a comment.  current_ has
    322     // been filled in with a slash token.  The caller should return it.
    323     SLASH_NOT_COMMENT,
    324 
    325     // We do not appear to be starting a comment here.
    326     NO_COMMENT
    327   };
    328 
    329   // If we're at the start of a new comment, consume it and return what kind
    330   // of comment it is.
    331   NextCommentStatus TryConsumeCommentStart();
    332 
    333   // -----------------------------------------------------------------
    334   // These helper methods make the parsing code more readable.  The
    335   // "character classes" refered to are defined at the top of the .cc file.
    336   // Basically it is a C++ class with one method:
    337   //   static bool InClass(char c);
    338   // The method returns true if c is a member of this "class", like "Letter"
    339   // or "Digit".
    340 
    341   // Returns true if the current character is of the given character
    342   // class, but does not consume anything.
    343   template<typename CharacterClass>
    344   inline bool LookingAt();
    345 
    346   // If the current character is in the given class, consume it and return
    347   // true.  Otherwise return false.
    348   // e.g. TryConsumeOne<Letter>()
    349   template<typename CharacterClass>
    350   inline bool TryConsumeOne();
    351 
    352   // Like above, but try to consume the specific character indicated.
    353   inline bool TryConsume(char c);
    354 
    355   // Consume zero or more of the given character class.
    356   template<typename CharacterClass>
    357   inline void ConsumeZeroOrMore();
    358 
    359   // Consume one or more of the given character class or log the given
    360   // error message.
    361   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
    362   template<typename CharacterClass>
    363   inline void ConsumeOneOrMore(const char* error);
    364 };
    365 
    366 // inline methods ====================================================
    367 inline const Tokenizer::Token& Tokenizer::current() {
    368   return current_;
    369 }
    370 
    371 inline const Tokenizer::Token& Tokenizer::previous() {
    372   return previous_;
    373 }
    374 
    375 inline void Tokenizer::ParseString(const string& text, string* output) {
    376   output->clear();
    377   ParseStringAppend(text, output);
    378 }
    379 
    380 }  // namespace io
    381 }  // namespace protobuf
    382 
    383 }  // namespace google
    384 #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
    385