Home | History | Annotate | Download | only in io
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // https://developers.google.com/protocol-buffers/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton (at) google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 //
      35 // Class for tokenizing text read from a ZeroCopyInputStream.
     36 
     37 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
     38 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
     39 
     40 #include <string>
     41 #include <vector>
     42 #include <google/protobuf/stubs/common.h>
     43 
     44 namespace google {
     45 namespace protobuf {
     46 namespace io {
     47 
     48 class ZeroCopyInputStream;     // zero_copy_stream.h
     49 
     50 // Defined in this file.
     51 class ErrorCollector;
     52 class Tokenizer;
     53 
      54 // Abstract interface for an object which collects the errors that occur
      55 // during parsing.  A typical implementation might simply print the errors
      56 // to stdout.  Only AddError() must be overridden; AddWarning() is optional.
      57 class LIBPROTOBUF_EXPORT ErrorCollector {
      58  public:
      59   inline ErrorCollector() {}
      60   virtual ~ErrorCollector();
      61 
      62   // Reports an error in the input at the given line and column.  Both
      63   // numbers are zero-based, so you may want to add 1 to each before
      64   // printing them.  Pure virtual: every implementation must handle errors.
      65   virtual void AddError(int line, int column, const string& message) = 0;
      66 
      67   // Reports a warning in the input at the given line and column (both
      68   // zero-based, as for AddError()).  The default implementation does
      69   // nothing, so implementations that ignore warnings need not override it.
      70   virtual void AddWarning(int /* line */, int /* column */,
      71                           const string& /* message */) { }
      72 
      73  private:
      74   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
      75 };
     76 
      77 // This class converts a stream of raw text into a stream of tokens for
      78 // the protocol definition parser to parse.  The tokens recognized are
      79 // similar to those that make up the C language; see the TokenType enum for
      80 // precise descriptions.  Whitespace and comments are skipped.  By default,
      81 // C- and C++-style comments are recognized, but other styles can be used by
      82 // calling set_comment_style().
      83 class LIBPROTOBUF_EXPORT Tokenizer {
      84  public:
      85   // Construct a Tokenizer that reads and tokenizes text from the given
      86   // input stream and writes errors to the given error_collector.
      87   // The caller keeps ownership of input and error_collector.
      88   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
      89   ~Tokenizer();
      90 
      91   enum TokenType {
      92     TYPE_START,       // Next() has not yet been called.
      93     TYPE_END,         // End of input reached.  "text" is empty.
      94 
      95     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
      96                       // starting with a digit.  It is an error for a number
      97                       // to be followed by an identifier with no space in
      98                       // between.
      99     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
     100                       // the digits are decimal, but a prefix of "0x" indicates
     101                       // a hex number and a leading zero indicates octal, just
     102                       // like with C numeric literals.  A leading negative sign
     103                       // is NOT included in the token; it's up to the parser to
     104                       // interpret the unary minus operator on its own.
     105     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
     106                       // an exponent.  Always in decimal.  Again, never
     107                       // negative.
     108     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
     109                       // or double quotes can be used, but they must match.
     110                       // A string literal cannot cross a line break.
     111     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
     112                       // Symbols are always a single character, so "!+$%" is
     113                       // four tokens.
     114   };
     115 
     116   // Structure representing a token read from the token stream.
     117   struct Token {
     118     TokenType type;
     119     string text;       // The exact text of the token as it appeared in
     120                        // the input.  e.g. tokens of TYPE_STRING will still
     121                        // be escaped and in quotes.
     122 
     123     // "line" and "column" specify the position of the first character of
     124     // the token within the input stream.  They are zero-based.
     125     int line;
     126     int column;
     127     int end_column;  // Presumably just past the token's end; TODO confirm.
     128   };
     129 
     130   // Get the current token.  This is updated when Next() is called.  Before
     131   // the first call to Next(), current() has type TYPE_START and no contents.
     132   const Token& current();
     133 
     134   // Return the previous token -- i.e. what current() returned before the
     135   // previous call to Next().
     136   const Token& previous();
     137 
     138   // Advance to the next token.  Returns false if the end of the input is
     139   // reached.
     140   bool Next();
     141 
     142   // Like Next(), but also collects comments which appear between the previous
     143   // and next tokens.
     144   //
     145   // Comments which appear to be attached to the previous token are stored
     146   // in *prev_trailing_comments.  Comments which appear to be attached to the
     147   // next token are stored in *next_leading_comments.  Comments appearing in
     148   // between which do not appear to be attached to either will be added to
     149   // *detached_comments.  Any of these parameters can be NULL to simply discard
     150   // the comments.
     151   //
     152   // A series of line comments appearing on consecutive lines, with no other
     153   // tokens appearing on those lines, will be treated as a single comment.
     154   //
     155   // Only the comment content is returned; comment markers (e.g. //) are
     156   // stripped out.  For block comments, leading whitespace and an asterisk will
     157   // be stripped from the beginning of each line other than the first.  Newlines
     158   // are included in the output.
     159   //
     160   // Examples:
     161   //
     162   //   optional int32 foo = 1;  // Comment attached to foo.
     163   //   // Comment attached to bar.
     164   //   optional int32 bar = 2;
     165   //
     166   //   optional string baz = 3;
     167   //   // Comment attached to baz.
     168   //   // Another line attached to baz.
     169   //
     170   //   // Comment attached to qux.
     171   //   //
     172   //   // Another line attached to qux.
     173   //   optional double qux = 4;
     174   //
     175   //   // Detached comment.  This is not attached to qux or corge
     176   //   // because there are blank lines separating it from both.
     177   //
     178   //   optional string corge = 5;
     179   //   /* Block comment attached
     180   //    * to corge.  Leading asterisks
     181   //    * will be removed. */
     182   //   /* Block comment attached to
     183   //    * grault. */
     184   //   optional int32 grault = 6;
     185   bool NextWithComments(string* prev_trailing_comments,
     186                         vector<string>* detached_comments,
     187                         string* next_leading_comments);
     188 
     189   // Parse helpers ---------------------------------------------------
     190 
     191   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
     192   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
     193   // result is undefined (possibly an assert failure).
     194   static double ParseFloat(const string& text);
     195 
     196   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
     197   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
     198   // result is undefined (possibly an assert failure).
     199   static void ParseString(const string& text, string* output);
     200 
     201   // Identical to ParseString, but appends to output.
     202   static void ParseStringAppend(const string& text, string* output);
     203 
     204   // Parses a TYPE_INTEGER token.  Returns false if the result would be
     205   // greater than max_value.  Otherwise, returns true and sets *output to the
     206   // result.  If the text is not from a Token of type TYPE_INTEGER originally
     207   // parsed by a Tokenizer, the result is undefined (possibly an assert
     208   // failure).
     209   static bool ParseInteger(const string& text, uint64 max_value,
     210                            uint64* output);
     211 
     212   // Options ---------------------------------------------------------
     213 
     214   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
     215   // which would otherwise be integers but which have the 'f' suffix will be
     216   // forced to be interpreted as floats.  For all other purposes, the 'f' is
     217   // ignored.
     218   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
     219 
     220   // Valid values for set_comment_style().
     221   enum CommentStyle {
     222     // Line comments begin with "//", block comments are delimited by "/*" and
     223     // "*/".
     224     CPP_COMMENT_STYLE,
     225     // Line comments begin with "#".  No way to write block comments.
     226     SH_COMMENT_STYLE
     227   };
     228 
     229   // Sets the comment style.
     230   void set_comment_style(CommentStyle style) { comment_style_ = style; }
     231 
     232   // Whether to require whitespace between a number and a field name.
     233   // Default is true. Do not use this; for Google-internal cleanup only.
     234   void set_require_space_after_number(bool require) {
     235     require_space_after_number_ = require;
     236   }
     237 
     238   // Whether to allow string literals to span multiple lines. Default is false.
     239   // Do not use this; for Google-internal cleanup only.
     240   void set_allow_multiline_strings(bool allow) {
     241     allow_multiline_strings_ = allow;
     242   }
     243 
     244   // External helper: validate an identifier.
     245   static bool IsIdentifier(const string& text);
     246 
     247   // -----------------------------------------------------------------
     248  private:
     249   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
     250 
     251   Token current_;           // Returned by current().
     252   Token previous_;          // Returned by previous().
     253 
     254   ZeroCopyInputStream* input_;       // Not owned; the caller keeps ownership.
     255   ErrorCollector* error_collector_;  // Not owned; the caller keeps ownership.
     256 
     257   char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
     258   const char* buffer_;      // Current buffer returned from input_.
     259   int buffer_size_;         // Size of buffer_.
     260   int buffer_pos_;          // Current position within the buffer.
     261   bool read_error_;         // Did we previously encounter a read error?
     262 
     263   // Line and column number of current_char_ within the whole input stream.
     264   int line_;
     265   int column_;
     266 
     267   // String to which text should be appended as we advance through it.
     268   // Call RecordTo(&str) to start recording and StopRecording() to stop.
     269   // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
     270   // position within the current buffer where recording started.
     271   string* record_target_;
     272   int record_start_;
     273 
     274   // Options.
     275   bool allow_f_after_float_;
     276   CommentStyle comment_style_;
     277   bool require_space_after_number_;
     278   bool allow_multiline_strings_;
     279 
     280   // Since we count columns we need to interpret tabs somehow.  We'll take
     281   // the standard 8-character definition for lack of any way to do better.
     282   static const int kTabWidth = 8;
     283 
     284   // -----------------------------------------------------------------
     285   // Helper methods.
     286 
     287   // Consume this character and advance to the next one.
     288   void NextChar();
     289 
     290   // Read a new buffer from the input.
     291   void Refresh();
     292 
     293   inline void RecordTo(string* target);  // Start recording text into *target.
     294   inline void StopRecording();           // Stop the recording begun by RecordTo().
     295 
     296   // Called when the current character is the first character of a new
     297   // token (not including whitespace or comments).
     298   inline void StartToken();
     299   // Called when the current character is the first character after the
     300   // end of the last token.  After this returns, current_.text will
     301   // contain all text consumed since StartToken() was called.
     302   inline void EndToken();
     303 
     304   // Convenience method to add an error at the current line and column.
     305   void AddError(const string& message) {
     306     error_collector_->AddError(line_, column_, message);
     307   }
     308 
     309   // -----------------------------------------------------------------
     310   // The following four methods are used to consume tokens of specific
     311   // types.  They are actually used to consume all characters *after*
     312   // the first, since the calling function consumes the first character
     313   // in order to decide what kind of token is being read.
     314 
     315   // Read and consume a string, ending when the given delimiter is
     316   // consumed.
     317   void ConsumeString(char delimiter);
     318 
     319   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
     320   // depending on what was read.  This needs to know if the first
     321   // character was a zero in order to correctly recognize hex and octal
     322   // numbers.
     323   // It also needs to know if the first character was a '.' to parse floating
     324   // point correctly.
     325   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
     326 
     327   // Consume the rest of a line.
     328   void ConsumeLineComment(string* content);
     329   // Consume until "*/".
     330   void ConsumeBlockComment(string* content);
     331 
     332   enum NextCommentStatus {
     333     // Started a line comment.
     334     LINE_COMMENT,
     335 
     336     // Started a block comment.
     337     BLOCK_COMMENT,
     338 
     339     // Consumed a slash, then realized it wasn't a comment.  current_ has
     340     // been filled in with a slash token.  The caller should return it.
     341     SLASH_NOT_COMMENT,
     342 
     343     // We do not appear to be starting a comment here.
     344     NO_COMMENT
     345   };
     346 
     347   // If we're at the start of a new comment, consume it and return what kind
     348   // of comment it is.
     349   NextCommentStatus TryConsumeCommentStart();
     350 
     351   // -----------------------------------------------------------------
     352   // These helper methods make the parsing code more readable.  The
     353   // "character classes" referred to are defined at the top of the .cc file.
     354   // Basically it is a C++ class with one method:
     355   //   static bool InClass(char c);
     356   // The method returns true if c is a member of this "class", like "Letter"
     357   // or "Digit".
     358 
     359   // Returns true if the current character is of the given character
     360   // class, but does not consume anything.
     361   template<typename CharacterClass>
     362   inline bool LookingAt();
     363 
     364   // If the current character is in the given class, consume it and return
     365   // true.  Otherwise return false.
     366   // e.g. TryConsumeOne<Letter>()
     367   template<typename CharacterClass>
     368   inline bool TryConsumeOne();
     369 
     370   // Like above, but try to consume the specific character indicated.
     371   inline bool TryConsume(char c);
     372 
     373   // Consume zero or more of the given character class.
     374   template<typename CharacterClass>
     375   inline void ConsumeZeroOrMore();
     376 
     377   // Consume one or more of the given character class or log the given
     378   // error message.
     379   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
     380   template<typename CharacterClass>
     381   inline void ConsumeOneOrMore(const char* error);
     382 };
    383 
    384 // inline methods ====================================================
     385 inline const Tokenizer::Token& Tokenizer::current() {
     386   return current_;  // The current token; updated when Next() is called.
     387 }
    388 
     389 inline const Tokenizer::Token& Tokenizer::previous() {
     390   return previous_;  // What current() returned before the latest Next().
     391 }
    392 
     393 inline void Tokenizer::ParseString(const string& text, string* output) {
     394   output->clear();  // Drop old contents so the append below acts as assignment.
     395   ParseStringAppend(text, output);  // Appends the parsed string to *output.
     396 }
    397 
    398 }  // namespace io
    399 }  // namespace protobuf
    400 
    401 }  // namespace google
    402 #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
    403