// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton (at) google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Class for parsing tokenized text from a ZeroCopyInputStream.
36 37 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 38 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 39 40 #include <string> 41 #include <vector> 42 #include <google/protobuf/stubs/common.h> 43 44 namespace google { 45 namespace protobuf { 46 namespace io { 47 48 class ZeroCopyInputStream; // zero_copy_stream.h 49 50 // Defined in this file. 51 class ErrorCollector; 52 class Tokenizer; 53 54 // Abstract interface for an object which collects the errors that occur 55 // during parsing. A typical implementation might simply print the errors 56 // to stdout. 57 class LIBPROTOBUF_EXPORT ErrorCollector { 58 public: 59 inline ErrorCollector() {} 60 virtual ~ErrorCollector(); 61 62 // Indicates that there was an error in the input at the given line and 63 // column numbers. The numbers are zero-based, so you may want to add 64 // 1 to each before printing them. 65 virtual void AddError(int line, int column, const string& message) = 0; 66 67 // Indicates that there was a warning in the input at the given line and 68 // column numbers. The numbers are zero-based, so you may want to add 69 // 1 to each before printing them. 70 virtual void AddWarning(int line, int column, const string& message) { } 71 72 private: 73 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector); 74 }; 75 76 // This class converts a stream of raw text into a stream of tokens for 77 // the protocol definition parser to parse. The tokens recognized are 78 // similar to those that make up the C language; see the TokenType enum for 79 // precise descriptions. Whitespace and comments are skipped. By default, 80 // C- and C++-style comments are recognized, but other styles can be used by 81 // calling set_comment_style(). 82 class LIBPROTOBUF_EXPORT Tokenizer { 83 public: 84 // Construct a Tokenizer that reads and tokenizes text from the given 85 // input stream and writes errors to the given error_collector. 86 // The caller keeps ownership of input and error_collector. 
87 Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector); 88 ~Tokenizer(); 89 90 enum TokenType { 91 TYPE_START, // Next() has not yet been called. 92 TYPE_END, // End of input reached. "text" is empty. 93 94 TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not 95 // starting with a digit. It is an error for a number 96 // to be followed by an identifier with no space in 97 // between. 98 TYPE_INTEGER, // A sequence of digits representing an integer. Normally 99 // the digits are decimal, but a prefix of "0x" indicates 100 // a hex number and a leading zero indicates octal, just 101 // like with C numeric literals. A leading negative sign 102 // is NOT included in the token; it's up to the parser to 103 // interpret the unary minus operator on its own. 104 TYPE_FLOAT, // A floating point literal, with a fractional part and/or 105 // an exponent. Always in decimal. Again, never 106 // negative. 107 TYPE_STRING, // A quoted sequence of escaped characters. Either single 108 // or double quotes can be used, but they must match. 109 // A string literal cannot cross a line break. 110 TYPE_SYMBOL, // Any other printable character, like '!' or '+'. 111 // Symbols are always a single character, so "!+$%" is 112 // four tokens. 113 }; 114 115 // Structure representing a token read from the token stream. 116 struct Token { 117 TokenType type; 118 string text; // The exact text of the token as it appeared in 119 // the input. e.g. tokens of TYPE_STRING will still 120 // be escaped and in quotes. 121 122 // "line" and "column" specify the position of the first character of 123 // the token within the input stream. They are zero-based. 124 int line; 125 int column; 126 int end_column; 127 }; 128 129 // Get the current token. This is updated when Next() is called. Before 130 // the first call to Next(), current() has type TYPE_START and no contents. 131 const Token& current(); 132 133 // Return the previous token -- i.e. 
what current() returned before the 134 // previous call to Next(). 135 const Token& previous(); 136 137 // Advance to the next token. Returns false if the end of the input is 138 // reached. 139 bool Next(); 140 141 // Like Next(), but also collects comments which appear between the previous 142 // and next tokens. 143 // 144 // Comments which appear to be attached to the previous token are stored 145 // in *prev_tailing_comments. Comments which appear to be attached to the 146 // next token are stored in *next_leading_comments. Comments appearing in 147 // between which do not appear to be attached to either will be added to 148 // detached_comments. Any of these parameters can be NULL to simply discard 149 // the comments. 150 // 151 // A series of line comments appearing on consecutive lines, with no other 152 // tokens appearing on those lines, will be treated as a single comment. 153 // 154 // Only the comment content is returned; comment markers (e.g. //) are 155 // stripped out. For block comments, leading whitespace and an asterisk will 156 // be stripped from the beginning of each line other than the first. Newlines 157 // are included in the output. 158 // 159 // Examples: 160 // 161 // optional int32 foo = 1; // Comment attached to foo. 162 // // Comment attached to bar. 163 // optional int32 bar = 2; 164 // 165 // optional string baz = 3; 166 // // Comment attached to baz. 167 // // Another line attached to baz. 168 // 169 // // Comment attached to qux. 170 // // 171 // // Another line attached to qux. 172 // optional double qux = 4; 173 // 174 // // Detached comment. This is not attached to qux or corge 175 // // because there are blank lines separating it from both. 176 // 177 // optional string corge = 5; 178 // /* Block comment attached 179 // * to corge. Leading asterisks 180 // * will be removed. */ 181 // /* Block comment attached to 182 // * grault. 
*/ 183 // optional int32 grault = 6; 184 bool NextWithComments(string* prev_trailing_comments, 185 vector<string>* detached_comments, 186 string* next_leading_comments); 187 188 // Parse helpers --------------------------------------------------- 189 190 // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 191 // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 192 // result is undefined (possibly an assert failure). 193 static double ParseFloat(const string& text); 194 195 // Parses a TYPE_STRING token. This never fails, so long as the text actually 196 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 197 // result is undefined (possibly an assert failure). 198 static void ParseString(const string& text, string* output); 199 200 // Identical to ParseString, but appends to output. 201 static void ParseStringAppend(const string& text, string* output); 202 203 // Parses a TYPE_INTEGER token. Returns false if the result would be 204 // greater than max_value. Otherwise, returns true and sets *output to the 205 // result. If the text is not from a Token of type TYPE_INTEGER originally 206 // parsed by a Tokenizer, the result is undefined (possibly an assert 207 // failure). 208 static bool ParseInteger(const string& text, uint64 max_value, 209 uint64* output); 210 211 // Options --------------------------------------------------------- 212 213 // Set true to allow floats to be suffixed with the letter 'f'. Tokens 214 // which would otherwise be integers but which have the 'f' suffix will be 215 // forced to be interpreted as floats. For all other purposes, the 'f' is 216 // ignored. 217 void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; } 218 219 // Valid values for set_comment_style(). 220 enum CommentStyle { 221 // Line comments begin with "//", block comments are delimited by "/*" and 222 // "*/". 223 CPP_COMMENT_STYLE, 224 // Line comments begin with "#". 
No way to write block comments. 225 SH_COMMENT_STYLE 226 }; 227 228 // Sets the comment style. 229 void set_comment_style(CommentStyle style) { comment_style_ = style; } 230 231 // ----------------------------------------------------------------- 232 private: 233 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer); 234 235 Token current_; // Returned by current(). 236 Token previous_; // Returned by previous(). 237 238 ZeroCopyInputStream* input_; 239 ErrorCollector* error_collector_; 240 241 char current_char_; // == buffer_[buffer_pos_], updated by NextChar(). 242 const char* buffer_; // Current buffer returned from input_. 243 int buffer_size_; // Size of buffer_. 244 int buffer_pos_; // Current position within the buffer. 245 bool read_error_; // Did we previously encounter a read error? 246 247 // Line and column number of current_char_ within the whole input stream. 248 int line_; 249 int column_; 250 251 // String to which text should be appended as we advance through it. 252 // Call RecordTo(&str) to start recording and StopRecording() to stop. 253 // E.g. StartToken() calls RecordTo(¤t_.text). record_start_ is the 254 // position within the current buffer where recording started. 255 string* record_target_; 256 int record_start_; 257 258 // Options. 259 bool allow_f_after_float_; 260 CommentStyle comment_style_; 261 262 // Since we count columns we need to interpret tabs somehow. We'll take 263 // the standard 8-character definition for lack of any way to do better. 264 static const int kTabWidth = 8; 265 266 // ----------------------------------------------------------------- 267 // Helper methods. 268 269 // Consume this character and advance to the next one. 270 void NextChar(); 271 272 // Read a new buffer from the input. 273 void Refresh(); 274 275 inline void RecordTo(string* target); 276 inline void StopRecording(); 277 278 // Called when the current character is the first character of a new 279 // token (not including whitespace or comments). 
280 inline void StartToken(); 281 // Called when the current character is the first character after the 282 // end of the last token. After this returns, current_.text will 283 // contain all text consumed since StartToken() was called. 284 inline void EndToken(); 285 286 // Convenience method to add an error at the current line and column. 287 void AddError(const string& message) { 288 error_collector_->AddError(line_, column_, message); 289 } 290 291 // ----------------------------------------------------------------- 292 // The following four methods are used to consume tokens of specific 293 // types. They are actually used to consume all characters *after* 294 // the first, since the calling function consumes the first character 295 // in order to decide what kind of token is being read. 296 297 // Read and consume a string, ending when the given delimiter is 298 // consumed. 299 void ConsumeString(char delimiter); 300 301 // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER 302 // depending on what was read. This needs to know if the first 303 // character was a zero in order to correctly recognize hex and octal 304 // numbers. 305 // It also needs to know if the first characted was a . to parse floating 306 // point correctly. 307 TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); 308 309 // Consume the rest of a line. 310 void ConsumeLineComment(string* content); 311 // Consume until "*/". 312 void ConsumeBlockComment(string* content); 313 314 enum NextCommentStatus { 315 // Started a line comment. 316 LINE_COMMENT, 317 318 // Started a block comment. 319 BLOCK_COMMENT, 320 321 // Consumed a slash, then realized it wasn't a comment. current_ has 322 // been filled in with a slash token. The caller should return it. 323 SLASH_NOT_COMMENT, 324 325 // We do not appear to be starting a comment here. 326 NO_COMMENT 327 }; 328 329 // If we're at the start of a new comment, consume it and return what kind 330 // of comment it is. 
331 NextCommentStatus TryConsumeCommentStart(); 332 333 // ----------------------------------------------------------------- 334 // These helper methods make the parsing code more readable. The 335 // "character classes" refered to are defined at the top of the .cc file. 336 // Basically it is a C++ class with one method: 337 // static bool InClass(char c); 338 // The method returns true if c is a member of this "class", like "Letter" 339 // or "Digit". 340 341 // Returns true if the current character is of the given character 342 // class, but does not consume anything. 343 template<typename CharacterClass> 344 inline bool LookingAt(); 345 346 // If the current character is in the given class, consume it and return 347 // true. Otherwise return false. 348 // e.g. TryConsumeOne<Letter>() 349 template<typename CharacterClass> 350 inline bool TryConsumeOne(); 351 352 // Like above, but try to consume the specific character indicated. 353 inline bool TryConsume(char c); 354 355 // Consume zero or more of the given character class. 356 template<typename CharacterClass> 357 inline void ConsumeZeroOrMore(); 358 359 // Consume one or more of the given character class or log the given 360 // error message. 361 // e.g. ConsumeOneOrMore<Digit>("Expected digits."); 362 template<typename CharacterClass> 363 inline void ConsumeOneOrMore(const char* error); 364 }; 365 366 // inline methods ==================================================== 367 inline const Tokenizer::Token& Tokenizer::current() { 368 return current_; 369 } 370 371 inline const Tokenizer::Token& Tokenizer::previous() { 372 return previous_; 373 } 374 375 inline void Tokenizer::ParseString(const string& text, string* output) { 376 output->clear(); 377 ParseStringAppend(text, output); 378 } 379 380 } // namespace io 381 } // namespace protobuf 382 383 } // namespace google 384 #endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 385