1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 // Author: kenton (at) google.com (Kenton Varda) 32 // Based on original Protocol Buffers design by 33 // Sanjay Ghemawat, Jeff Dean, and others. 34 // 35 // Here we have a hand-written lexer. At first you might ask yourself, 36 // "Hand-written text processing? Is Kenton crazy?!" Well, first of all, 37 // yes I am crazy, but that's beside the point. There are actually reasons 38 // why I ended up writing this this way. 39 // 40 // The traditional approach to lexing is to use lex to generate a lexer for 41 // you. Unfortunately, lex's output is ridiculously ugly and difficult to 42 // integrate cleanly with C++ code, especially abstract code or code meant 43 // as a library. Better parser-generators exist but would add dependencies 44 // which most users won't already have, which we'd like to avoid. (GNU flex 45 // has a C++ output option, but it's still ridiculously ugly, non-abstract, 46 // and not library-friendly.) 47 // 48 // The next approach that any good software engineer should look at is to 49 // use regular expressions. And, indeed, I did. I have code which 50 // implements this same class using regular expressions. It's about 200 51 // lines shorter. However: 52 // - Rather than error messages telling you "This string has an invalid 53 // escape sequence at line 5, column 45", you get error messages like 54 // "Parse error on line 5". Giving more precise errors requires adding 55 // a lot of code that ends up basically as complex as the hand-coded 56 // version anyway. 57 // - The regular expression to match a string literal looks like this: 58 // kString = new RE("(\"([^\"\\\\]|" // non-escaped 59 // "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape 60 // "\\\\x[0-9a-fA-F])*\"|" // hex escape 61 // "\'([^\'\\\\]|" // Also support single-quotes. 62 // "\\\\[abfnrtv?\"'\\\\0-7]|" 63 // "\\\\x[0-9a-fA-F])*\')"); 64 // Verifying the correctness of this line noise is actually harder than 65 // verifying the correctness of ConsumeString(), defined below. I'm not 66 // even confident that the above is correct, after staring at it for some 67 // time. 68 // - PCRE is fast, but there's still more overhead involved than the code 69 // below. 70 // - Sadly, regular expressions are not part of the C standard library, so 71 // using them would require depending on some other library. For the 72 // open source release, this could be really annoying. Nobody likes 73 // downloading one piece of software just to find that they need to 74 // download something else to make it work, and in all likelihood 75 // people downloading Protocol Buffers will already be doing so just 76 // to make something else work. We could include a copy of PCRE with 77 // our code, but that obligates us to keep it up-to-date and just seems 78 // like a big waste just to save 200 lines of code. 79 // 80 // On a similar but unrelated note, I'm even scared to use ctype.h. 81 // Apparently functions like isalpha() are locale-dependent. So, if we used 82 // that, then if this code is being called from some program that doesn't 83 // have its locale set to "C", it would behave strangely. We can't just set 84 // the locale to "C" ourselves since we might break the calling program that 85 // way, particularly if it is multi-threaded. WTF? Someone please let me 86 // (Kenton) know if I'm missing something here... 87 // 88 // I'd love to hear about other alternatives, though, as this code isn't 89 // exactly pretty. 90 91 #include <google/protobuf/io/tokenizer.h> 92 #include <google/protobuf/stubs/common.h> 93 #include <google/protobuf/stubs/logging.h> 94 #include <google/protobuf/stubs/stringprintf.h> 95 #include <google/protobuf/io/strtod.h> 96 #include <google/protobuf/io/zero_copy_stream.h> 97 #include <google/protobuf/stubs/strutil.h> 98 #include <google/protobuf/stubs/stl_util.h> 99 100 namespace google { 101 namespace protobuf { 102 namespace io { 103 namespace { 104 105 // As mentioned above, I don't trust ctype.h due to the presence of "locales". 106 // So, I have written replacement functions here. Someone please smack me if 107 // this is a bad idea or if there is some way around this. 108 // 109 // These "character classes" are designed to be used in template methods. 110 // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat 111 // whitespace. 112 113 // Note: No class is allowed to contain '\0', since this is used to mark end- 114 // of-input and is handled specially. 115 116 #define CHARACTER_CLASS(NAME, EXPRESSION) \ 117 class NAME { \ 118 public: \ 119 static inline bool InClass(char c) { \ 120 return EXPRESSION; \ 121 } \ 122 } 123 124 CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' || 125 c == '\r' || c == '\v' || c == '\f'); 126 CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' || 127 c == '\r' || c == '\v' || c == '\f'); 128 129 CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0'); 130 131 CHARACTER_CLASS(Digit, '0' <= c && c <= '9'); 132 CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7'); 133 CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') || 134 ('a' <= c && c <= 'f') || 135 ('A' <= c && c <= 'F')); 136 137 CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') || 138 ('A' <= c && c <= 'Z') || 139 (c == '_')); 140 141 CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') || 142 ('A' <= c && c <= 'Z') || 143 ('0' <= c && c <= '9') || 144 (c == '_')); 145 146 CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' || 147 c == 'r' || c == 't' || c == 'v' || c == '\\' || 148 c == '?' || c == '\'' || c == '\"'); 149 150 #undef CHARACTER_CLASS 151 152 // Given a char, interpret it as a numeric digit and return its value. 153 // This supports any number base up to 36. 154 inline int DigitValue(char digit) { 155 if ('0' <= digit && digit <= '9') return digit - '0'; 156 if ('a' <= digit && digit <= 'z') return digit - 'a' + 10; 157 if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10; 158 return -1; 159 } 160 161 // Inline because it's only used in one place. 162 inline char TranslateEscape(char c) { 163 switch (c) { 164 case 'a': return '\a'; 165 case 'b': return '\b'; 166 case 'f': return '\f'; 167 case 'n': return '\n'; 168 case 'r': return '\r'; 169 case 't': return '\t'; 170 case 'v': return '\v'; 171 case '\\': return '\\'; 172 case '?': return '\?'; // Trigraphs = :( 173 case '\'': return '\''; 174 case '"': return '\"'; 175 176 // We expect escape sequences to have been validated separately. 177 default: return '?'; 178 } 179 } 180 181 } // anonymous namespace 182 183 ErrorCollector::~ErrorCollector() {} 184 185 // =================================================================== 186 187 Tokenizer::Tokenizer(ZeroCopyInputStream* input, 188 ErrorCollector* error_collector) 189 : input_(input), 190 error_collector_(error_collector), 191 buffer_(NULL), 192 buffer_size_(0), 193 buffer_pos_(0), 194 read_error_(false), 195 line_(0), 196 column_(0), 197 record_target_(NULL), 198 record_start_(-1), 199 allow_f_after_float_(false), 200 comment_style_(CPP_COMMENT_STYLE), 201 require_space_after_number_(true), 202 allow_multiline_strings_(false) { 203 204 current_.line = 0; 205 current_.column = 0; 206 current_.end_column = 0; 207 current_.type = TYPE_START; 208 209 Refresh(); 210 } 211 212 Tokenizer::~Tokenizer() { 213 // If we had any buffer left unread, return it to the underlying stream 214 // so that someone else can read it. 215 if (buffer_size_ > buffer_pos_) { 216 input_->BackUp(buffer_size_ - buffer_pos_); 217 } 218 } 219 220 // ------------------------------------------------------------------- 221 // Internal helpers. 222 223 void Tokenizer::NextChar() { 224 // Update our line and column counters based on the character being 225 // consumed. 226 if (current_char_ == '\n') { 227 ++line_; 228 column_ = 0; 229 } else if (current_char_ == '\t') { 230 column_ += kTabWidth - column_ % kTabWidth; 231 } else { 232 ++column_; 233 } 234 235 // Advance to the next character. 236 ++buffer_pos_; 237 if (buffer_pos_ < buffer_size_) { 238 current_char_ = buffer_[buffer_pos_]; 239 } else { 240 Refresh(); 241 } 242 } 243 244 void Tokenizer::Refresh() { 245 if (read_error_) { 246 current_char_ = '\0'; 247 return; 248 } 249 250 // If we're in a token, append the rest of the buffer to it. 251 if (record_target_ != NULL && record_start_ < buffer_size_) { 252 record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_); 253 record_start_ = 0; 254 } 255 256 const void* data = NULL; 257 buffer_ = NULL; 258 buffer_pos_ = 0; 259 do { 260 if (!input_->Next(&data, &buffer_size_)) { 261 // end of stream (or read error) 262 buffer_size_ = 0; 263 read_error_ = true; 264 current_char_ = '\0'; 265 return; 266 } 267 } while (buffer_size_ == 0); 268 269 buffer_ = static_cast<const char*>(data); 270 271 current_char_ = buffer_[0]; 272 } 273 274 inline void Tokenizer::RecordTo(string* target) { 275 record_target_ = target; 276 record_start_ = buffer_pos_; 277 } 278 279 inline void Tokenizer::StopRecording() { 280 // Note: The if() is necessary because some STL implementations crash when 281 // you call string::append(NULL, 0), presumably because they are trying to 282 // be helpful by detecting the NULL pointer, even though there's nothing 283 // wrong with reading zero bytes from NULL. 284 if (buffer_pos_ != record_start_) { 285 record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_); 286 } 287 record_target_ = NULL; 288 record_start_ = -1; 289 } 290 291 inline void Tokenizer::StartToken() { 292 current_.type = TYPE_START; // Just for the sake of initializing it. 293 current_.text.clear(); 294 current_.line = line_; 295 current_.column = column_; 296 RecordTo(¤t_.text); 297 } 298 299 inline void Tokenizer::EndToken() { 300 StopRecording(); 301 current_.end_column = column_; 302 } 303 304 // ------------------------------------------------------------------- 305 // Helper methods that consume characters. 306 307 template<typename CharacterClass> 308 inline bool Tokenizer::LookingAt() { 309 return CharacterClass::InClass(current_char_); 310 } 311 312 template<typename CharacterClass> 313 inline bool Tokenizer::TryConsumeOne() { 314 if (CharacterClass::InClass(current_char_)) { 315 NextChar(); 316 return true; 317 } else { 318 return false; 319 } 320 } 321 322 inline bool Tokenizer::TryConsume(char c) { 323 if (current_char_ == c) { 324 NextChar(); 325 return true; 326 } else { 327 return false; 328 } 329 } 330 331 template<typename CharacterClass> 332 inline void Tokenizer::ConsumeZeroOrMore() { 333 while (CharacterClass::InClass(current_char_)) { 334 NextChar(); 335 } 336 } 337 338 template<typename CharacterClass> 339 inline void Tokenizer::ConsumeOneOrMore(const char* error) { 340 if (!CharacterClass::InClass(current_char_)) { 341 AddError(error); 342 } else { 343 do { 344 NextChar(); 345 } while (CharacterClass::InClass(current_char_)); 346 } 347 } 348 349 // ------------------------------------------------------------------- 350 // Methods that read whole patterns matching certain kinds of tokens 351 // or comments. 352 353 void Tokenizer::ConsumeString(char delimiter) { 354 while (true) { 355 switch (current_char_) { 356 case '\0': 357 AddError("Unexpected end of string."); 358 return; 359 360 case '\n': { 361 if (!allow_multiline_strings_) { 362 AddError("String literals cannot cross line boundaries."); 363 return; 364 } 365 NextChar(); 366 break; 367 } 368 369 case '\\': { 370 // An escape sequence. 371 NextChar(); 372 if (TryConsumeOne<Escape>()) { 373 // Valid escape sequence. 374 } else if (TryConsumeOne<OctalDigit>()) { 375 // Possibly followed by two more octal digits, but these will 376 // just be consumed by the main loop anyway so we don't need 377 // to do so explicitly here. 378 } else if (TryConsume('x')) { 379 if (!TryConsumeOne<HexDigit>()) { 380 AddError("Expected hex digits for escape sequence."); 381 } 382 // Possibly followed by another hex digit, but again we don't care. 383 } else if (TryConsume('u')) { 384 if (!TryConsumeOne<HexDigit>() || 385 !TryConsumeOne<HexDigit>() || 386 !TryConsumeOne<HexDigit>() || 387 !TryConsumeOne<HexDigit>()) { 388 AddError("Expected four hex digits for \\u escape sequence."); 389 } 390 } else if (TryConsume('U')) { 391 // We expect 8 hex digits; but only the range up to 0x10ffff is 392 // legal. 393 if (!TryConsume('0') || 394 !TryConsume('0') || 395 !(TryConsume('0') || TryConsume('1')) || 396 !TryConsumeOne<HexDigit>() || 397 !TryConsumeOne<HexDigit>() || 398 !TryConsumeOne<HexDigit>() || 399 !TryConsumeOne<HexDigit>() || 400 !TryConsumeOne<HexDigit>()) { 401 AddError("Expected eight hex digits up to 10ffff for \\U escape " 402 "sequence"); 403 } 404 } else { 405 AddError("Invalid escape sequence in string literal."); 406 } 407 break; 408 } 409 410 default: { 411 if (current_char_ == delimiter) { 412 NextChar(); 413 return; 414 } 415 NextChar(); 416 break; 417 } 418 } 419 } 420 } 421 422 Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero, 423 bool started_with_dot) { 424 bool is_float = false; 425 426 if (started_with_zero && (TryConsume('x') || TryConsume('X'))) { 427 // A hex number (started with "0x"). 428 ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits."); 429 430 } else if (started_with_zero && LookingAt<Digit>()) { 431 // An octal number (had a leading zero). 432 ConsumeZeroOrMore<OctalDigit>(); 433 if (LookingAt<Digit>()) { 434 AddError("Numbers starting with leading zero must be in octal."); 435 ConsumeZeroOrMore<Digit>(); 436 } 437 438 } else { 439 // A decimal number. 440 if (started_with_dot) { 441 is_float = true; 442 ConsumeZeroOrMore<Digit>(); 443 } else { 444 ConsumeZeroOrMore<Digit>(); 445 446 if (TryConsume('.')) { 447 is_float = true; 448 ConsumeZeroOrMore<Digit>(); 449 } 450 } 451 452 if (TryConsume('e') || TryConsume('E')) { 453 is_float = true; 454 TryConsume('-') || TryConsume('+'); 455 ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent."); 456 } 457 458 if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) { 459 is_float = true; 460 } 461 } 462 463 if (LookingAt<Letter>() && require_space_after_number_) { 464 AddError("Need space between number and identifier."); 465 } else if (current_char_ == '.') { 466 if (is_float) { 467 AddError( 468 "Already saw decimal point or exponent; can't have another one."); 469 } else { 470 AddError("Hex and octal numbers must be integers."); 471 } 472 } 473 474 return is_float ? TYPE_FLOAT : TYPE_INTEGER; 475 } 476 477 void Tokenizer::ConsumeLineComment(string* content) { 478 if (content != NULL) RecordTo(content); 479 480 while (current_char_ != '\0' && current_char_ != '\n') { 481 NextChar(); 482 } 483 TryConsume('\n'); 484 485 if (content != NULL) StopRecording(); 486 } 487 488 void Tokenizer::ConsumeBlockComment(string* content) { 489 int start_line = line_; 490 int start_column = column_ - 2; 491 492 if (content != NULL) RecordTo(content); 493 494 while (true) { 495 while (current_char_ != '\0' && 496 current_char_ != '*' && 497 current_char_ != '/' && 498 current_char_ != '\n') { 499 NextChar(); 500 } 501 502 if (TryConsume('\n')) { 503 if (content != NULL) StopRecording(); 504 505 // Consume leading whitespace and asterisk; 506 ConsumeZeroOrMore<WhitespaceNoNewline>(); 507 if (TryConsume('*')) { 508 if (TryConsume('/')) { 509 // End of comment. 510 break; 511 } 512 } 513 514 if (content != NULL) RecordTo(content); 515 } else if (TryConsume('*') && TryConsume('/')) { 516 // End of comment. 517 if (content != NULL) { 518 StopRecording(); 519 // Strip trailing "*/". 520 content->erase(content->size() - 2); 521 } 522 break; 523 } else if (TryConsume('/') && current_char_ == '*') { 524 // Note: We didn't consume the '*' because if there is a '/' after it 525 // we want to interpret that as the end of the comment. 526 AddError( 527 "\"/*\" inside block comment. Block comments cannot be nested."); 528 } else if (current_char_ == '\0') { 529 AddError("End-of-file inside block comment."); 530 error_collector_->AddError( 531 start_line, start_column, " Comment started here."); 532 if (content != NULL) StopRecording(); 533 break; 534 } 535 } 536 } 537 538 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() { 539 if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) { 540 if (TryConsume('/')) { 541 return LINE_COMMENT; 542 } else if (TryConsume('*')) { 543 return BLOCK_COMMENT; 544 } else { 545 // Oops, it was just a slash. Return it. 546 current_.type = TYPE_SYMBOL; 547 current_.text = "/"; 548 current_.line = line_; 549 current_.column = column_ - 1; 550 current_.end_column = column_; 551 return SLASH_NOT_COMMENT; 552 } 553 } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) { 554 return LINE_COMMENT; 555 } else { 556 return NO_COMMENT; 557 } 558 } 559 560 // ------------------------------------------------------------------- 561 562 bool Tokenizer::Next() { 563 previous_ = current_; 564 565 while (!read_error_) { 566 ConsumeZeroOrMore<Whitespace>(); 567 568 switch (TryConsumeCommentStart()) { 569 case LINE_COMMENT: 570 ConsumeLineComment(NULL); 571 continue; 572 case BLOCK_COMMENT: 573 ConsumeBlockComment(NULL); 574 continue; 575 case SLASH_NOT_COMMENT: 576 return true; 577 case NO_COMMENT: 578 break; 579 } 580 581 // Check for EOF before continuing. 582 if (read_error_) break; 583 584 if (LookingAt<Unprintable>() || current_char_ == '\0') { 585 AddError("Invalid control characters encountered in text."); 586 NextChar(); 587 // Skip more unprintable characters, too. But, remember that '\0' is 588 // also what current_char_ is set to after EOF / read error. We have 589 // to be careful not to go into an infinite loop of trying to consume 590 // it, so make sure to check read_error_ explicitly before consuming 591 // '\0'. 592 while (TryConsumeOne<Unprintable>() || 593 (!read_error_ && TryConsume('\0'))) { 594 // Ignore. 595 } 596 597 } else { 598 // Reading some sort of token. 599 StartToken(); 600 601 if (TryConsumeOne<Letter>()) { 602 ConsumeZeroOrMore<Alphanumeric>(); 603 current_.type = TYPE_IDENTIFIER; 604 } else if (TryConsume('0')) { 605 current_.type = ConsumeNumber(true, false); 606 } else if (TryConsume('.')) { 607 // This could be the beginning of a floating-point number, or it could 608 // just be a '.' symbol. 609 610 if (TryConsumeOne<Digit>()) { 611 // It's a floating-point number. 612 if (previous_.type == TYPE_IDENTIFIER && 613 current_.line == previous_.line && 614 current_.column == previous_.end_column) { 615 // We don't accept syntax like "blah.123". 616 error_collector_->AddError(line_, column_ - 2, 617 "Need space between identifier and decimal point."); 618 } 619 current_.type = ConsumeNumber(false, true); 620 } else { 621 current_.type = TYPE_SYMBOL; 622 } 623 } else if (TryConsumeOne<Digit>()) { 624 current_.type = ConsumeNumber(false, false); 625 } else if (TryConsume('\"')) { 626 ConsumeString('\"'); 627 current_.type = TYPE_STRING; 628 } else if (TryConsume('\'')) { 629 ConsumeString('\''); 630 current_.type = TYPE_STRING; 631 } else { 632 // Check if the high order bit is set. 633 if (current_char_ & 0x80) { 634 error_collector_->AddError(line_, column_, 635 StringPrintf("Interpreting non ascii codepoint %d.", 636 static_cast<unsigned char>(current_char_))); 637 } 638 NextChar(); 639 current_.type = TYPE_SYMBOL; 640 } 641 642 EndToken(); 643 return true; 644 } 645 } 646 647 // EOF 648 current_.type = TYPE_END; 649 current_.text.clear(); 650 current_.line = line_; 651 current_.column = column_; 652 current_.end_column = column_; 653 return false; 654 } 655 656 namespace { 657 658 // Helper class for collecting comments and putting them in the right places. 659 // 660 // This basically just buffers the most recent comment until it can be decided 661 // exactly where that comment should be placed. When Flush() is called, the 662 // current comment goes into either prev_trailing_comments or detached_comments. 663 // When the CommentCollector is destroyed, the last buffered comment goes into 664 // next_leading_comments. 665 class CommentCollector { 666 public: 667 CommentCollector(string* prev_trailing_comments, 668 vector<string>* detached_comments, 669 string* next_leading_comments) 670 : prev_trailing_comments_(prev_trailing_comments), 671 detached_comments_(detached_comments), 672 next_leading_comments_(next_leading_comments), 673 has_comment_(false), 674 is_line_comment_(false), 675 can_attach_to_prev_(true) { 676 if (prev_trailing_comments != NULL) prev_trailing_comments->clear(); 677 if (detached_comments != NULL) detached_comments->clear(); 678 if (next_leading_comments != NULL) next_leading_comments->clear(); 679 } 680 681 ~CommentCollector() { 682 // Whatever is in the buffer is a leading comment. 683 if (next_leading_comments_ != NULL && has_comment_) { 684 comment_buffer_.swap(*next_leading_comments_); 685 } 686 } 687 688 // About to read a line comment. Get the comment buffer pointer in order to 689 // read into it. 690 string* GetBufferForLineComment() { 691 // We want to combine with previous line comments, but not block comments. 692 if (has_comment_ && !is_line_comment_) { 693 Flush(); 694 } 695 has_comment_ = true; 696 is_line_comment_ = true; 697 return &comment_buffer_; 698 } 699 700 // About to read a block comment. Get the comment buffer pointer in order to 701 // read into it. 702 string* GetBufferForBlockComment() { 703 if (has_comment_) { 704 Flush(); 705 } 706 has_comment_ = true; 707 is_line_comment_ = false; 708 return &comment_buffer_; 709 } 710 711 void ClearBuffer() { 712 comment_buffer_.clear(); 713 has_comment_ = false; 714 } 715 716 // Called once we know that the comment buffer is complete and is *not* 717 // connected to the next token. 718 void Flush() { 719 if (has_comment_) { 720 if (can_attach_to_prev_) { 721 if (prev_trailing_comments_ != NULL) { 722 prev_trailing_comments_->append(comment_buffer_); 723 } 724 can_attach_to_prev_ = false; 725 } else { 726 if (detached_comments_ != NULL) { 727 detached_comments_->push_back(comment_buffer_); 728 } 729 } 730 ClearBuffer(); 731 } 732 } 733 734 void DetachFromPrev() { 735 can_attach_to_prev_ = false; 736 } 737 738 private: 739 string* prev_trailing_comments_; 740 vector<string>* detached_comments_; 741 string* next_leading_comments_; 742 743 string comment_buffer_; 744 745 // True if any comments were read into comment_buffer_. This can be true even 746 // if comment_buffer_ is empty, namely if the comment was "/**/". 747 bool has_comment_; 748 749 // Is the comment in the comment buffer a line comment? 750 bool is_line_comment_; 751 752 // Is it still possible that we could be reading a comment attached to the 753 // previous token? 754 bool can_attach_to_prev_; 755 }; 756 757 } // namespace 758 759 bool Tokenizer::NextWithComments(string* prev_trailing_comments, 760 vector<string>* detached_comments, 761 string* next_leading_comments) { 762 CommentCollector collector(prev_trailing_comments, detached_comments, 763 next_leading_comments); 764 765 if (current_.type == TYPE_START) { 766 // Ignore unicode byte order mark(BOM) if it appears at the file 767 // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted. 768 if (TryConsume((char)0xEF)) { 769 if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) { 770 AddError("Proto file starts with 0xEF but not UTF-8 BOM. " 771 "Only UTF-8 is accepted for proto file."); 772 return false; 773 } 774 } 775 collector.DetachFromPrev(); 776 } else { 777 // A comment appearing on the same line must be attached to the previous 778 // declaration. 779 ConsumeZeroOrMore<WhitespaceNoNewline>(); 780 switch (TryConsumeCommentStart()) { 781 case LINE_COMMENT: 782 ConsumeLineComment(collector.GetBufferForLineComment()); 783 784 // Don't allow comments on subsequent lines to be attached to a trailing 785 // comment. 786 collector.Flush(); 787 break; 788 case BLOCK_COMMENT: 789 ConsumeBlockComment(collector.GetBufferForBlockComment()); 790 791 ConsumeZeroOrMore<WhitespaceNoNewline>(); 792 if (!TryConsume('\n')) { 793 // Oops, the next token is on the same line. If we recorded a comment 794 // we really have no idea which token it should be attached to. 795 collector.ClearBuffer(); 796 return Next(); 797 } 798 799 // Don't allow comments on subsequent lines to be attached to a trailing 800 // comment. 801 collector.Flush(); 802 break; 803 case SLASH_NOT_COMMENT: 804 return true; 805 case NO_COMMENT: 806 if (!TryConsume('\n')) { 807 // The next token is on the same line. There are no comments. 808 return Next(); 809 } 810 break; 811 } 812 } 813 814 // OK, we are now on the line *after* the previous token. 815 while (true) { 816 ConsumeZeroOrMore<WhitespaceNoNewline>(); 817 818 switch (TryConsumeCommentStart()) { 819 case LINE_COMMENT: 820 ConsumeLineComment(collector.GetBufferForLineComment()); 821 break; 822 case BLOCK_COMMENT: 823 ConsumeBlockComment(collector.GetBufferForBlockComment()); 824 825 // Consume the rest of the line so that we don't interpret it as a 826 // blank line the next time around the loop. 827 ConsumeZeroOrMore<WhitespaceNoNewline>(); 828 TryConsume('\n'); 829 break; 830 case SLASH_NOT_COMMENT: 831 return true; 832 case NO_COMMENT: 833 if (TryConsume('\n')) { 834 // Completely blank line. 835 collector.Flush(); 836 collector.DetachFromPrev(); 837 } else { 838 bool result = Next(); 839 if (!result || 840 current_.text == "}" || 841 current_.text == "]" || 842 current_.text == ")") { 843 // It looks like we're at the end of a scope. In this case it 844 // makes no sense to attach a comment to the following token. 845 collector.Flush(); 846 } 847 return result; 848 } 849 break; 850 } 851 } 852 } 853 854 // ------------------------------------------------------------------- 855 // Token-parsing helpers. Remember that these don't need to report 856 // errors since any errors should already have been reported while 857 // tokenizing. Also, these can assume that whatever text they 858 // are given is text that the tokenizer actually parsed as a token 859 // of the given type. 860 861 bool Tokenizer::ParseInteger(const string& text, uint64 max_value, 862 uint64* output) { 863 // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull() 864 // is non-standard. I hate the C standard library. :( 865 866 // return strtoull(text.c_str(), NULL, 0); 867 868 const char* ptr = text.c_str(); 869 int base = 10; 870 if (ptr[0] == '0') { 871 if (ptr[1] == 'x' || ptr[1] == 'X') { 872 // This is hex. 873 base = 16; 874 ptr += 2; 875 } else { 876 // This is octal. 877 base = 8; 878 } 879 } 880 881 uint64 result = 0; 882 for (; *ptr != '\0'; ptr++) { 883 int digit = DigitValue(*ptr); 884 if (digit < 0 || digit >= base) { 885 // The token provided by Tokenizer is invalid. i.e., 099 is an invalid 886 // token, but Tokenizer still think it's integer. 887 return false; 888 } 889 if (digit > max_value || result > (max_value - digit) / base) { 890 // Overflow. 891 return false; 892 } 893 result = result * base + digit; 894 } 895 896 *output = result; 897 return true; 898 } 899 900 double Tokenizer::ParseFloat(const string& text) { 901 const char* start = text.c_str(); 902 char* end; 903 double result = NoLocaleStrtod(start, &end); 904 905 // "1e" is not a valid float, but if the tokenizer reads it, it will 906 // report an error but still return it as a valid token. We need to 907 // accept anything the tokenizer could possibly return, error or not. 908 if (*end == 'e' || *end == 'E') { 909 ++end; 910 if (*end == '-' || *end == '+') ++end; 911 } 912 913 // If the Tokenizer had allow_f_after_float_ enabled, the float may be 914 // suffixed with the letter 'f'. 915 if (*end == 'f' || *end == 'F') { 916 ++end; 917 } 918 919 GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-') 920 << " Tokenizer::ParseFloat() passed text that could not have been" 921 " tokenized as a float: " << CEscape(text); 922 return result; 923 } 924 925 // Helper to append a Unicode code point to a string as UTF8, without bringing 926 // in any external dependencies. 927 static void AppendUTF8(uint32 code_point, string* output) { 928 uint32 tmp = 0; 929 int len = 0; 930 if (code_point <= 0x7f) { 931 tmp = code_point; 932 len = 1; 933 } else if (code_point <= 0x07ff) { 934 tmp = 0x0000c080 | 935 ((code_point & 0x07c0) << 2) | 936 (code_point & 0x003f); 937 len = 2; 938 } else if (code_point <= 0xffff) { 939 tmp = 0x00e08080 | 940 ((code_point & 0xf000) << 4) | 941 ((code_point & 0x0fc0) << 2) | 942 (code_point & 0x003f); 943 len = 3; 944 } else if (code_point <= 0x1fffff) { 945 tmp = 0xf0808080 | 946 ((code_point & 0x1c0000) << 6) | 947 ((code_point & 0x03f000) << 4) | 948 ((code_point & 0x000fc0) << 2) | 949 (code_point & 0x003f); 950 len = 4; 951 } else { 952 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is 953 // normally only defined up to there as well. 954 StringAppendF(output, "\\U%08x", code_point); 955 return; 956 } 957 tmp = ghtonl(tmp); 958 output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len); 959 } 960 961 // Try to read <len> hex digits from ptr, and stuff the numeric result into 962 // *result. Returns true if that many digits were successfully consumed. 963 static bool ReadHexDigits(const char* ptr, int len, uint32* result) { 964 *result = 0; 965 if (len == 0) return false; 966 for (const char* end = ptr + len; ptr < end; ++ptr) { 967 if (*ptr == '\0') return false; 968 *result = (*result << 4) + DigitValue(*ptr); 969 } 970 return true; 971 } 972 973 // Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range 974 // 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail 975 // surrogate. These numbers are in a reserved range of Unicode code points, so 976 // if we encounter such a pair we know how to parse it and convert it into a 977 // single code point. 978 static const uint32 kMinHeadSurrogate = 0xd800; 979 static const uint32 kMaxHeadSurrogate = 0xdc00; 980 static const uint32 kMinTrailSurrogate = 0xdc00; 981 static const uint32 kMaxTrailSurrogate = 0xe000; 982 983 static inline bool IsHeadSurrogate(uint32 code_point) { 984 return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate); 985 } 986 987 static inline bool IsTrailSurrogate(uint32 code_point) { 988 return (code_point >= kMinTrailSurrogate) && 989 (code_point < kMaxTrailSurrogate); 990 } 991 992 // Combine a head and trail surrogate into a single Unicode code point. 993 static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) { 994 GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate)); 995 GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate)); 996 return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) | 997 (trail_surrogate - kMinTrailSurrogate)); 998 } 999 1000 // Convert the escape sequence parameter to a number of expected hex digits. 1001 static inline int UnicodeLength(char key) { 1002 if (key == 'u') return 4; 1003 if (key == 'U') return 8; 1004 return 0; 1005 } 1006 1007 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt 1008 // to parse that sequence. On success, returns a pointer to the first char 1009 // beyond that sequence, and fills in *code_point. On failure, returns ptr 1010 // itself. 1011 static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) { 1012 const char* p = ptr; 1013 // Fetch the code point. 1014 const int len = UnicodeLength(*p++); 1015 if (!ReadHexDigits(p, len, code_point)) 1016 return ptr; 1017 p += len; 1018 1019 // Check if the code point we read is a "head surrogate." If so, then we 1020 // expect it to be immediately followed by another code point which is a valid 1021 // "trail surrogate," and together they form a UTF-16 pair which decodes into 1022 // a single Unicode point. Trail surrogates may only use \u, not \U. 1023 if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') { 1024 uint32 trail_surrogate; 1025 if (ReadHexDigits(p + 2, 4, &trail_surrogate) && 1026 IsTrailSurrogate(trail_surrogate)) { 1027 *code_point = AssembleUTF16(*code_point, trail_surrogate); 1028 p += 6; 1029 } 1030 // If this failed, then we just emit the head surrogate as a code point. 1031 // It's bogus, but so is the string. 1032 } 1033 1034 return p; 1035 } 1036 1037 // The text string must begin and end with single or double quote 1038 // characters. 1039 void Tokenizer::ParseStringAppend(const string& text, string* output) { 1040 // Reminder: text[0] is always a quote character. (If text is 1041 // empty, it's invalid, so we'll just return). 1042 const size_t text_size = text.size(); 1043 if (text_size == 0) { 1044 GOOGLE_LOG(DFATAL) 1045 << " Tokenizer::ParseStringAppend() passed text that could not" 1046 " have been tokenized as a string: " << CEscape(text); 1047 return; 1048 } 1049 1050 // Reserve room for new string. The branch is necessary because if 1051 // there is already space available the reserve() call might 1052 // downsize the output. 1053 const size_t new_len = text_size + output->size(); 1054 if (new_len > output->capacity()) { 1055 output->reserve(new_len); 1056 } 1057 1058 // Loop through the string copying characters to "output" and 1059 // interpreting escape sequences. Note that any invalid escape 1060 // sequences or other errors were already reported while tokenizing. 1061 // In this case we do not need to produce valid results. 1062 for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) { 1063 if (*ptr == '\\' && ptr[1] != '\0') { 1064 // An escape sequence. 1065 ++ptr; 1066 1067 if (OctalDigit::InClass(*ptr)) { 1068 // An octal escape. May one, two, or three digits. 1069 int code = DigitValue(*ptr); 1070 if (OctalDigit::InClass(ptr[1])) { 1071 ++ptr; 1072 code = code * 8 + DigitValue(*ptr); 1073 } 1074 if (OctalDigit::InClass(ptr[1])) { 1075 ++ptr; 1076 code = code * 8 + DigitValue(*ptr); 1077 } 1078 output->push_back(static_cast<char>(code)); 1079 1080 } else if (*ptr == 'x') { 1081 // A hex escape. May zero, one, or two digits. (The zero case 1082 // will have been caught as an error earlier.) 1083 int code = 0; 1084 if (HexDigit::InClass(ptr[1])) { 1085 ++ptr; 1086 code = DigitValue(*ptr); 1087 } 1088 if (HexDigit::InClass(ptr[1])) { 1089 ++ptr; 1090 code = code * 16 + DigitValue(*ptr); 1091 } 1092 output->push_back(static_cast<char>(code)); 1093 1094 } else if (*ptr == 'u' || *ptr == 'U') { 1095 uint32 unicode; 1096 const char* end = FetchUnicodePoint(ptr, &unicode); 1097 if (end == ptr) { 1098 // Failure: Just dump out what we saw, don't try to parse it. 1099 output->push_back(*ptr); 1100 } else { 1101 AppendUTF8(unicode, output); 1102 ptr = end - 1; // Because we're about to ++ptr. 1103 } 1104 } else { 1105 // Some other escape code. 1106 output->push_back(TranslateEscape(*ptr)); 1107 } 1108 1109 } else if (*ptr == text[0] && ptr[1] == '\0') { 1110 // Ignore final quote matching the starting quote. 1111 } else { 1112 output->push_back(*ptr); 1113 } 1114 } 1115 } 1116 1117 template<typename CharacterClass> 1118 static bool AllInClass(const string& s) { 1119 for (int i = 0; i < s.size(); ++i) { 1120 if (!CharacterClass::InClass(s[i])) 1121 return false; 1122 } 1123 return true; 1124 } 1125 1126 bool Tokenizer::IsIdentifier(const string& text) { 1127 // Mirrors IDENTIFIER definition in Tokenizer::Next() above. 1128 if (text.size() == 0) 1129 return false; 1130 if (!Letter::InClass(text.at(0))) 1131 return false; 1132 if (!AllInClass<Alphanumeric>(text.substr(1))) 1133 return false; 1134 return true; 1135 } 1136 1137 } // namespace io 1138 } // namespace protobuf 1139 } // namespace google 1140