// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Class for parsing tokenized text from a ZeroCopyInputStream.

#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__

#include <string>
#include <vector>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>

namespace google {
namespace protobuf {
namespace io {

class ZeroCopyInputStream;  // zero_copy_stream.h

// Defined in this file.
class ErrorCollector;
class Tokenizer;

// By "column number", the proto compiler refers to a count of the number
// of bytes before a given byte, except that a tab character advances to
// the next multiple of 8 bytes.  Note in particular that column numbers
// are zero-based, while many user interfaces use one-based column numbers.
typedef int ColumnNumber;

// Abstract interface for an object which collects the errors that occur
// during parsing.  A typical implementation might simply print the errors
// to stdout.
class LIBPROTOBUF_EXPORT ErrorCollector {
 public:
  inline ErrorCollector() {}
  virtual ~ErrorCollector();

  // Indicates that there was an error in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddError(int line, ColumnNumber column,
                        const string& message) = 0;

  // Indicates that there was a warning in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddWarning(int line, ColumnNumber column,
                          const string& message) { }

 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
};

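// As an illustration only (not part of this header), a concrete collector
// might print diagnostics to stderr with one-based positions.  The class
// name below is hypothetical:
//
//   class StderrErrorCollector : public ErrorCollector {
//    public:
//     virtual void AddError(int line, ColumnNumber column,
//                           const string& message) {
//       fprintf(stderr, "%d:%d: %s\n", line + 1, column + 1, message.c_str());
//     }
//   };
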
// This class converts a stream of raw text into a stream of tokens for
// the protocol definition parser to parse.  The tokens recognized are
// similar to those that make up the C language; see the TokenType enum for
// precise descriptions.  Whitespace and comments are skipped.  By default,
// C- and C++-style comments are recognized, but other styles can be used by
// calling set_comment_style().
class LIBPROTOBUF_EXPORT Tokenizer {
 public:
  // Construct a Tokenizer that reads and tokenizes text from the given
  // input stream and writes errors to the given error_collector.
  // The caller keeps ownership of input and error_collector.
  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
  ~Tokenizer();

  enum TokenType {
    TYPE_START,       // Next() has not yet been called.
    TYPE_END,         // End of input reached.  "text" is empty.

    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
                      // starting with a digit.  It is an error for a number
                      // to be followed by an identifier with no space in
                      // between.
    TYPE_INTEGER,     // A sequence of digits representing an integer.
                      // Normally the digits are decimal, but a prefix of "0x"
                      // indicates a hex number and a leading zero indicates
                      // octal, just like with C numeric literals.  A leading
                      // negative sign is NOT included in the token; it's up to
                      // the parser to interpret the unary minus operator on
                      // its own.
    TYPE_FLOAT,       // A floating point literal, with a fractional part
                      // and/or an exponent.  Always in decimal.  Again, never
                      // negative.
    TYPE_STRING,      // A quoted sequence of escaped characters.  Either
                      // single or double quotes can be used, but they must
                      // match.  A string literal cannot cross a line break.
    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
                      // Symbols are always a single character, so "!+$%" is
                      // four tokens.
  };

  // Structure representing a token read from the token stream.
  struct Token {
    TokenType type;
    string text;       // The exact text of the token as it appeared in
                       // the input.  e.g. tokens of TYPE_STRING will still
                       // be escaped and in quotes.

    // "line" and "column" specify the position of the first character of
    // the token within the input stream.  They are zero-based.
    int line;
    ColumnNumber column;
    ColumnNumber end_column;
  };

  // Get the current token.  This is updated when Next() is called.  Before
  // the first call to Next(), current() has type TYPE_START and no contents.
  const Token& current();

  // Return the previous token -- i.e. what current() returned before the
  // previous call to Next().
  const Token& previous();

  // Advance to the next token.  Returns false if the end of the input is
  // reached.
  bool Next();

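  // For illustration, a typical read loop might look like this (a sketch;
  // "input", "error_collector", and HandleToken() are hypothetical and not
  // defined in this header):
  //
  //   Tokenizer tokenizer(&input, &error_collector);
  //   while (tokenizer.Next()) {
  //     HandleToken(tokenizer.current());
  //   }
  //   // current().type is TYPE_END once the end of the input is reached.
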
  // Like Next(), but also collects comments which appear between the previous
  // and next tokens.
  //
  // Comments which appear to be attached to the previous token are stored
  // in *prev_trailing_comments.  Comments which appear to be attached to the
  // next token are stored in *next_leading_comments.  Comments appearing in
  // between which do not appear to be attached to either will be added to
  // detached_comments.  Any of these parameters can be NULL to simply discard
  // the comments.
  //
  // A series of line comments appearing on consecutive lines, with no other
  // tokens appearing on those lines, will be treated as a single comment.
  //
  // Only the comment content is returned; comment markers (e.g. //) are
  // stripped out.  For block comments, leading whitespace and an asterisk will
  // be stripped from the beginning of each line other than the first.
  // Newlines are included in the output.
  //
  // Examples:
  //
  //   optional int32 foo = 1;  // Comment attached to foo.
  //   // Comment attached to bar.
  //   optional int32 bar = 2;
  //
  //   optional string baz = 3;
  //   // Comment attached to baz.
  //   // Another line attached to baz.
  //
  //   // Comment attached to qux.
  //   //
  //   // Another line attached to qux.
  //   optional double qux = 4;
  //
  //   // Detached comment.  This is not attached to qux or corge
  //   // because there are blank lines separating it from both.
  //
  //   optional string corge = 5;
  //   /* Block comment attached
  //    * to corge.  Leading asterisks
  //    * will be removed. */
  //   /* Block comment attached to
  //    * grault. */
  //   optional int32 grault = 6;
  bool NextWithComments(string* prev_trailing_comments,
                        vector<string>* detached_comments,
                        string* next_leading_comments);

  // Parse helpers ---------------------------------------------------

  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static double ParseFloat(const string& text);

  // Parses a TYPE_STRING token.  This never fails, so long as the text
  // actually comes from a TYPE_STRING token parsed by Tokenizer.  If it
  // doesn't, the result is undefined (possibly an assert failure).
  static void ParseString(const string& text, string* output);

  // Identical to ParseString, but appends to output.
  static void ParseStringAppend(const string& text, string* output);

  // Parses a TYPE_INTEGER token.  Returns false if the result would be
  // greater than max_value.  Otherwise, returns true and sets *output to the
  // result.  If the text is not from a Token of type TYPE_INTEGER originally
  // parsed by a Tokenizer, the result is undefined (possibly an assert
  // failure).
  static bool ParseInteger(const string& text, uint64 max_value,
                           uint64* output);

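  // For example (a sketch only; "token" is a Token obtained from current(),
  // and kuint64max is assumed to be available from stubs/common.h):
  //
  //   if (token.type == Tokenizer::TYPE_INTEGER) {
  //     uint64 value = 0;
  //     if (!Tokenizer::ParseInteger(token.text, kuint64max, &value)) {
  //       // The literal is too large to fit in a uint64.
  //     }
  //   }
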
  // Options ---------------------------------------------------------

  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
  // which would otherwise be integers but which have the 'f' suffix will be
  // forced to be interpreted as floats.  For all other purposes, the 'f' is
  // ignored.
  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }

  // Valid values for set_comment_style().
  enum CommentStyle {
    // Line comments begin with "//", block comments are delimited by "/*" and
    // "*/".
    CPP_COMMENT_STYLE,
    // Line comments begin with "#".  No way to write block comments.
    SH_COMMENT_STYLE
  };

  // Sets the comment style.
  void set_comment_style(CommentStyle style) { comment_style_ = style; }

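  // For example (a sketch), a caller tokenizing input that uses shell-style
  // '#' comments and allows an 'f' suffix on floats might configure:
  //
  //   tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
  //   tokenizer.set_allow_f_after_float(true);
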
  // Whether to require whitespace between a number and a field name.
  // Default is true.  Do not use this; for Google-internal cleanup only.
  void set_require_space_after_number(bool require) {
    require_space_after_number_ = require;
  }

  // Whether to allow string literals to span multiple lines.  Default is
  // false.  Do not use this; for Google-internal cleanup only.
  void set_allow_multiline_strings(bool allow) {
    allow_multiline_strings_ = allow;
  }

  // External helper: validate an identifier.
  static bool IsIdentifier(const string& text);

  // -----------------------------------------------------------------
 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);

  Token current_;           // Returned by current().
  Token previous_;          // Returned by previous().

  ZeroCopyInputStream* input_;
  ErrorCollector* error_collector_;

  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
  const char* buffer_;      // Current buffer returned from input_.
  int buffer_size_;         // Size of buffer_.
  int buffer_pos_;          // Current position within the buffer.
  bool read_error_;         // Did we previously encounter a read error?

  // Line and column number of current_char_ within the whole input stream.
  int line_;
  ColumnNumber column_;

  // String to which text should be appended as we advance through it.
  // Call RecordTo(&str) to start recording and StopRecording() to stop.
  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
  // position within the current buffer where recording started.
  string* record_target_;
  int record_start_;

  // Options.
  bool allow_f_after_float_;
  CommentStyle comment_style_;
  bool require_space_after_number_;
  bool allow_multiline_strings_;

  // Since we count columns we need to interpret tabs somehow.  We'll take
  // the standard 8-character definition for lack of any way to do better.
  // This must match the documentation of ColumnNumber.
  static const int kTabWidth = 8;

  // -----------------------------------------------------------------
  // Helper methods.

  // Consume this character and advance to the next one.
  void NextChar();

  // Read a new buffer from the input.
  void Refresh();

  inline void RecordTo(string* target);
  inline void StopRecording();

  // Called when the current character is the first character of a new
  // token (not including whitespace or comments).
  inline void StartToken();
  // Called when the current character is the first character after the
  // end of the last token.  After this returns, current_.text will
  // contain all text consumed since StartToken() was called.
  inline void EndToken();

  // Convenience method to add an error at the current line and column.
  void AddError(const string& message) {
    error_collector_->AddError(line_, column_, message);
  }

  // -----------------------------------------------------------------
  // The following four methods are used to consume tokens of specific
  // types.  They are actually used to consume all characters *after*
  // the first, since the calling function consumes the first character
  // in order to decide what kind of token is being read.

  // Read and consume a string, ending when the given delimiter is
  // consumed.
  void ConsumeString(char delimiter);

  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
  // depending on what was read.  This needs to know if the first
  // character was a zero in order to correctly recognize hex and octal
  // numbers.  It also needs to know if the first character was a '.' in
  // order to parse floating point correctly.
  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);

  // Consume the rest of a line.
  void ConsumeLineComment(string* content);
  // Consume until "*/".
  void ConsumeBlockComment(string* content);

  enum NextCommentStatus {
    // Started a line comment.
    LINE_COMMENT,

    // Started a block comment.
    BLOCK_COMMENT,

    // Consumed a slash, then realized it wasn't a comment.  current_ has
    // been filled in with a slash token.  The caller should return it.
    SLASH_NOT_COMMENT,

    // We do not appear to be starting a comment here.
    NO_COMMENT
  };

  // If we're at the start of a new comment, consume it and return what kind
  // of comment it is.
  NextCommentStatus TryConsumeCommentStart();

  // -----------------------------------------------------------------
  // These helper methods make the parsing code more readable.  The
  // "character classes" referred to are defined at the top of the .cc file.
  // Basically it is a C++ class with one method:
  //   static bool InClass(char c);
  // The method returns true if c is a member of this "class", like "Letter"
  // or "Digit".

  // Returns true if the current character is of the given character
  // class, but does not consume anything.
  template<typename CharacterClass>
  inline bool LookingAt();

  // If the current character is in the given class, consume it and return
  // true.  Otherwise return false.
  // e.g. TryConsumeOne<Letter>()
  template<typename CharacterClass>
  inline bool TryConsumeOne();

  // Like above, but try to consume the specific character indicated.
  inline bool TryConsume(char c);

  // Consume zero or more of the given character class.
  template<typename CharacterClass>
  inline void ConsumeZeroOrMore();

  // Consume one or more of the given character class or log the given
  // error message.
  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
  template<typename CharacterClass>
  inline void ConsumeOneOrMore(const char* error);
};

// inline methods ====================================================
inline const Tokenizer::Token& Tokenizer::current() {
  return current_;
}

inline const Tokenizer::Token& Tokenizer::previous() {
  return previous_;
}

inline void Tokenizer::ParseString(const string& text, string* output) {
  output->clear();
  ParseStringAppend(text, output);
}

}  // namespace io
}  // namespace protobuf

}  // namespace google
#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__