1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 // Author: kenton (at) google.com (Kenton Varda) 32 // Based on original Protocol Buffers design by 33 // Sanjay Ghemawat, Jeff Dean, and others. 34 // 35 // Here we have a hand-written lexer. At first you might ask yourself, 36 // "Hand-written text processing? 
Is Kenton crazy?!" Well, first of all, 37 // yes I am crazy, but that's beside the point. There are actually reasons 38 // why I ended up writing this this way. 39 // 40 // The traditional approach to lexing is to use lex to generate a lexer for 41 // you. Unfortunately, lex's output is ridiculously ugly and difficult to 42 // integrate cleanly with C++ code, especially abstract code or code meant 43 // as a library. Better parser-generators exist but would add dependencies 44 // which most users won't already have, which we'd like to avoid. (GNU flex 45 // has a C++ output option, but it's still ridiculously ugly, non-abstract, 46 // and not library-friendly.) 47 // 48 // The next approach that any good software engineer should look at is to 49 // use regular expressions. And, indeed, I did. I have code which 50 // implements this same class using regular expressions. It's about 200 51 // lines shorter. However: 52 // - Rather than error messages telling you "This string has an invalid 53 // escape sequence at line 5, column 45", you get error messages like 54 // "Parse error on line 5". Giving more precise errors requires adding 55 // a lot of code that ends up basically as complex as the hand-coded 56 // version anyway. 57 // - The regular expression to match a string literal looks like this: 58 // kString = new RE("(\"([^\"\\\\]|" // non-escaped 59 // "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape 60 // "\\\\x[0-9a-fA-F])*\"|" // hex escape 61 // "\'([^\'\\\\]|" // Also support single-quotes. 62 // "\\\\[abfnrtv?\"'\\\\0-7]|" 63 // "\\\\x[0-9a-fA-F])*\')"); 64 // Verifying the correctness of this line noise is actually harder than 65 // verifying the correctness of ConsumeString(), defined below. I'm not 66 // even confident that the above is correct, after staring at it for some 67 // time. 68 // - PCRE is fast, but there's still more overhead involved than the code 69 // below. 
70 // - Sadly, regular expressions are not part of the C standard library, so 71 // using them would require depending on some other library. For the 72 // open source release, this could be really annoying. Nobody likes 73 // downloading one piece of software just to find that they need to 74 // download something else to make it work, and in all likelihood 75 // people downloading Protocol Buffers will already be doing so just 76 // to make something else work. We could include a copy of PCRE with 77 // our code, but that obligates us to keep it up-to-date and just seems 78 // like a big waste just to save 200 lines of code. 79 // 80 // On a similar but unrelated note, I'm even scared to use ctype.h. 81 // Apparently functions like isalpha() are locale-dependent. So, if we used 82 // that, then if this code is being called from some program that doesn't 83 // have its locale set to "C", it would behave strangely. We can't just set 84 // the locale to "C" ourselves since we might break the calling program that 85 // way, particularly if it is multi-threaded. WTF? Someone please let me 86 // (Kenton) know if I'm missing something here... 87 // 88 // I'd love to hear about other alternatives, though, as this code isn't 89 // exactly pretty. 90 91 #include <google/protobuf/io/tokenizer.h> 92 #include <google/protobuf/stubs/common.h> 93 #include <google/protobuf/stubs/stringprintf.h> 94 #include <google/protobuf/io/strtod.h> 95 #include <google/protobuf/io/zero_copy_stream.h> 96 #include <google/protobuf/stubs/strutil.h> 97 #include <google/protobuf/stubs/stl_util.h> 98 99 namespace google { 100 namespace protobuf { 101 namespace io { 102 namespace { 103 104 // As mentioned above, I don't trust ctype.h due to the presence of "locales". 105 // So, I have written replacement functions here. Someone please smack me if 106 // this is a bad idea or if there is some way around this. 107 // 108 // These "character classes" are designed to be used in template methods. 
// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
// whitespace.

// Note: No class is allowed to contain '\0', since this is used to mark end-
// of-input and is handled specially.

// Each "character class" below is a tiny stateless type whose static
// InClass() predicate reports membership.  They are written out explicitly
// rather than generated by a macro so that each definition is greppable
// and readable on its own.

// All whitespace, including newlines.
class Whitespace {
 public:
  static inline bool InClass(char ch) {
    return ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' ||
           ch == '\v' || ch == '\f';
  }
};

// Whitespace excluding '\n' (used when line boundaries are significant).
class WhitespaceNoNewline {
 public:
  static inline bool InClass(char ch) {
    return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\v' || ch == '\f';
  }
};

// Control characters below space, excluding '\0' (the end-of-input marker).
class Unprintable {
 public:
  static inline bool InClass(char ch) {
    return ch < ' ' && ch > '\0';
  }
};

// Decimal digits.
class Digit {
 public:
  static inline bool InClass(char ch) {
    return '0' <= ch && ch <= '9';
  }
};

// Octal digits.
class OctalDigit {
 public:
  static inline bool InClass(char ch) {
    return '0' <= ch && ch <= '7';
  }
};

// Hexadecimal digits, either case.
class HexDigit {
 public:
  static inline bool InClass(char ch) {
    return ('0' <= ch && ch <= '9') ||
           ('a' <= ch && ch <= 'f') ||
           ('A' <= ch && ch <= 'F');
  }
};

// Characters that may begin an identifier (underscore counts as a letter).
class Letter {
 public:
  static inline bool InClass(char ch) {
    return ('a' <= ch && ch <= 'z') ||
           ('A' <= ch && ch <= 'Z') ||
           (ch == '_');
  }
};

// Characters that may continue an identifier.
class Alphanumeric {
 public:
  static inline bool InClass(char ch) {
    return ('a' <= ch && ch <= 'z') ||
           ('A' <= ch && ch <= 'Z') ||
           ('0' <= ch && ch <= '9') ||
           (ch == '_');
  }
};

// The character that may follow a backslash in a simple escape sequence.
class Escape {
 public:
  static inline bool InClass(char ch) {
    return ch == 'a' || ch == 'b' || ch == 'f' || ch == 'n' || ch == 'r' ||
           ch == 't' || ch == 'v' || ch == '\\' || ch == '?' || ch == '\'' ||
           ch == '\"';
  }
};

// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36.
inline int DigitValue(char digit) {
  if (digit >= '0' && digit <= '9') return digit - '0';
  if (digit >= 'a' && digit <= 'z') return 10 + (digit - 'a');
  if (digit >= 'A' && digit <= 'Z') return 10 + (digit - 'A');
  return -1;
}

// Inline because it's only used in one place.
161 inline char TranslateEscape(char c) { 162 switch (c) { 163 case 'a': return '\a'; 164 case 'b': return '\b'; 165 case 'f': return '\f'; 166 case 'n': return '\n'; 167 case 'r': return '\r'; 168 case 't': return '\t'; 169 case 'v': return '\v'; 170 case '\\': return '\\'; 171 case '?': return '\?'; // Trigraphs = :( 172 case '\'': return '\''; 173 case '"': return '\"'; 174 175 // We expect escape sequences to have been validated separately. 176 default: return '?'; 177 } 178 } 179 180 } // anonymous namespace 181 182 ErrorCollector::~ErrorCollector() {} 183 184 // =================================================================== 185 186 Tokenizer::Tokenizer(ZeroCopyInputStream* input, 187 ErrorCollector* error_collector) 188 : input_(input), 189 error_collector_(error_collector), 190 buffer_(NULL), 191 buffer_size_(0), 192 buffer_pos_(0), 193 read_error_(false), 194 line_(0), 195 column_(0), 196 record_target_(NULL), 197 record_start_(-1), 198 allow_f_after_float_(false), 199 comment_style_(CPP_COMMENT_STYLE), 200 require_space_after_number_(true), 201 allow_multiline_strings_(false) { 202 203 current_.line = 0; 204 current_.column = 0; 205 current_.end_column = 0; 206 current_.type = TYPE_START; 207 208 Refresh(); 209 } 210 211 Tokenizer::~Tokenizer() { 212 // If we had any buffer left unread, return it to the underlying stream 213 // so that someone else can read it. 214 if (buffer_size_ > buffer_pos_) { 215 input_->BackUp(buffer_size_ - buffer_pos_); 216 } 217 } 218 219 // ------------------------------------------------------------------- 220 // Internal helpers. 221 222 void Tokenizer::NextChar() { 223 // Update our line and column counters based on the character being 224 // consumed. 225 if (current_char_ == '\n') { 226 ++line_; 227 column_ = 0; 228 } else if (current_char_ == '\t') { 229 column_ += kTabWidth - column_ % kTabWidth; 230 } else { 231 ++column_; 232 } 233 234 // Advance to the next character. 
235 ++buffer_pos_; 236 if (buffer_pos_ < buffer_size_) { 237 current_char_ = buffer_[buffer_pos_]; 238 } else { 239 Refresh(); 240 } 241 } 242 243 void Tokenizer::Refresh() { 244 if (read_error_) { 245 current_char_ = '\0'; 246 return; 247 } 248 249 // If we're in a token, append the rest of the buffer to it. 250 if (record_target_ != NULL && record_start_ < buffer_size_) { 251 record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_); 252 record_start_ = 0; 253 } 254 255 const void* data = NULL; 256 buffer_ = NULL; 257 buffer_pos_ = 0; 258 do { 259 if (!input_->Next(&data, &buffer_size_)) { 260 // end of stream (or read error) 261 buffer_size_ = 0; 262 read_error_ = true; 263 current_char_ = '\0'; 264 return; 265 } 266 } while (buffer_size_ == 0); 267 268 buffer_ = static_cast<const char*>(data); 269 270 current_char_ = buffer_[0]; 271 } 272 273 inline void Tokenizer::RecordTo(string* target) { 274 record_target_ = target; 275 record_start_ = buffer_pos_; 276 } 277 278 inline void Tokenizer::StopRecording() { 279 // Note: The if() is necessary because some STL implementations crash when 280 // you call string::append(NULL, 0), presumably because they are trying to 281 // be helpful by detecting the NULL pointer, even though there's nothing 282 // wrong with reading zero bytes from NULL. 283 if (buffer_pos_ != record_start_) { 284 record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_); 285 } 286 record_target_ = NULL; 287 record_start_ = -1; 288 } 289 290 inline void Tokenizer::StartToken() { 291 current_.type = TYPE_START; // Just for the sake of initializing it. 292 current_.text.clear(); 293 current_.line = line_; 294 current_.column = column_; 295 RecordTo(¤t_.text); 296 } 297 298 inline void Tokenizer::EndToken() { 299 StopRecording(); 300 current_.end_column = column_; 301 } 302 303 // ------------------------------------------------------------------- 304 // Helper methods that consume characters. 
// Returns true if the current character is a member of CharacterClass.
// Consumes nothing.
template<typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}

// If the current character is in CharacterClass, consumes it and returns
// true; otherwise consumes nothing and returns false.
template<typename CharacterClass>
inline bool Tokenizer::TryConsumeOne() {
  if (CharacterClass::InClass(current_char_)) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

// Like TryConsumeOne(), but matches one specific character.
inline bool Tokenizer::TryConsume(char c) {
  if (current_char_ == c) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

// Consumes characters for as long as they are in CharacterClass.
template<typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}

// Like ConsumeZeroOrMore(), but reports |error| if the current character
// is not in CharacterClass (i.e. at least one match is required).
template<typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) {
  if (!CharacterClass::InClass(current_char_)) {
    AddError(error);
  } else {
    do {
      NextChar();
    } while (CharacterClass::InClass(current_char_));
  }
}

// -------------------------------------------------------------------
// Methods that read whole patterns matching certain kinds of tokens
// or comments.

// Consumes the remainder of a string literal up to and including the
// closing |delimiter|.  The opening delimiter has already been consumed
// by the caller (see Next()).  Escape sequences are validated here but
// not translated; translation happens later in ParseStringAppend().
void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
        AddError("Unexpected end of string.");
        return;

      case '\n': {
        if (!allow_multiline_strings_) {
          AddError("String literals cannot cross line boundaries.");
          return;
        }
        NextChar();
        break;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x') || TryConsume('X')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (current_char_ == delimiter) {
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}

// Consumes the remainder of a numeric token.  |started_with_zero| and
// |started_with_dot| describe the first character, which the caller has
// already consumed.  Returns TYPE_FLOAT or TYPE_INTEGER accordingly.
Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      // Optional sign on the exponent; the result of the expression is
      // deliberately ignored.
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>() && require_space_after_number_) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
          "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}

// Consumes the rest of a line comment (the leading "//" or "#" has
// already been consumed).  If |content| is non-NULL, the comment text is
// recorded into it.
void Tokenizer::ConsumeLineComment(string* content) {
  if (content != NULL) RecordTo(content);

  while (current_char_ != '\0' && current_char_ != '\n') {
    NextChar();
  }
  TryConsume('\n');

  if (content != NULL) StopRecording();
}

// Consumes the rest of a block comment (the leading "/*" has already been
// consumed).  If |content| is non-NULL, the comment text is recorded into
// it with each line's leading whitespace-and-'*' decoration stripped.
void Tokenizer::ConsumeBlockComment(string* content) {
  // The "/*" was already consumed, so the comment started two columns back.
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();

      // Consume leading whitespace and asterisk.
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note: We didn't consume the '*' because if there is a '/' after it
      // we want to interpret that as the end of the comment.
      AddError(
          "\"/*\" inside block comment. Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
          start_line, start_column, " Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}

// Looks for a comment opener at the current position.  If one is found,
// its opening characters are consumed and the comment kind is returned.
// A lone '/' is returned as SLASH_NOT_COMMENT, with current_ pre-filled
// as the corresponding symbol token.
Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    if (TryConsume('/')) {
      return LINE_COMMENT;
    } else if (TryConsume('*')) {
      return BLOCK_COMMENT;
    } else {
      // Oops, it was just a slash. Return it.
      current_.type = TYPE_SYMBOL;
      current_.text = "/";
      current_.line = line_;
      current_.column = column_ - 1;
      current_.end_column = column_;
      return SLASH_NOT_COMMENT;
    }
  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    return LINE_COMMENT;
  } else {
    return NO_COMMENT;
  }
}

// -------------------------------------------------------------------

// Advances to the next token, skipping whitespace and comments.  Returns
// true if a token was read; returns false at end of input, in which case
// current_ becomes a TYPE_END token.
bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too. But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error. We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(line_, column_ - 2,
                "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Check if the high order bit is set.
        if (current_char_ & 0x80) {
          error_collector_->AddError(line_, column_,
              StringPrintf("Interpreting non ascii codepoint %d.",
                           static_cast<unsigned char>(current_char_)));
        }
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}

namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed. When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
class CommentCollector {
 public:
  // Any of the output pointers may be NULL if the caller does not care
  // about that category of comment; the non-NULL ones are cleared here.
  CommentCollector(string* prev_trailing_comments,
                   vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment. Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment. Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discards the buffered comment.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  // After this, any flushed comment becomes a detached comment rather than
  // a trailing comment on the previous token.
  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  vector<string>* detached_comments_;
  string* next_leading_comments_;

  string comment_buffer_;

  // True if any comments were read into comment_buffer_. This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};

}  // namespace

// Like Next(), but also collects the comments seen along the way,
// classifying each as trailing (attached to the previous token),
// detached, or leading (attached to the token about to be returned).
bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // Nothing has been read yet, so there is no previous token to attach
    // a trailing comment to.
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line. If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line. There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope. In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}

// -------------------------------------------------------------------
// Token-parsing helpers. Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing. Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

// Parses an integer token.  Handles decimal, hex ("0x" prefix) and octal
// (leading zero) notation.  Returns false if the value exceeds
// |max_value| (overflow); otherwise stores the value in *output and
// returns true.
bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
                             uint64* output) {
  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
  // is non-standard. I hate the C standard library. :(

  // return strtoull(text.c_str(), NULL, 0);

  const char* ptr = text.c_str();
  int base = 10;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
    }
  }

  uint64 result = 0;
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base)
      << " Tokenizer::ParseInteger() passed text that could not have been"
         " tokenized as an integer: " << CEscape(text);
    // Check for overflow *before* multiplying/adding, so the arithmetic
    // below can never wrap.
    if (digit > max_value || result > (max_value - digit) / base) {
      // Overflow.
      return false;
    }
    result = result * base + digit;
  }

  *output = result;
  return true;
}

// Parses a float token.  Tolerates the trailing garbage that an erroneous
// (but still returned) token may carry, such as a bare exponent marker or
// an 'f' suffix.
double Tokenizer::ParseFloat(const string& text) {
  const char* start = text.c_str();
  char* end;
  double result = NoLocaleStrtod(start, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token. We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
  // suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
    << " Tokenizer::ParseFloat() passed text that could not have been"
       " tokenized as a float: " << CEscape(text);
  return result;
}

// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32 code_point, string* output) {
  uint32 tmp = 0;
  int len = 0;
  if (code_point <= 0x7f) {
    // One byte: 0xxxxxxx.
    tmp = code_point;
    len = 1;
  } else if (code_point <= 0x07ff) {
    // Two bytes: 110xxxxx 10xxxxxx.
    tmp = 0x0000c080 |
        ((code_point & 0x07c0) << 2) |
        (code_point & 0x003f);
    len = 2;
  } else if (code_point <= 0xffff) {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    tmp = 0x00e08080 |
        ((code_point & 0xf000) << 4) |
        ((code_point & 0x0fc0) << 2) |
        (code_point & 0x003f);
    len = 3;
  } else if (code_point <= 0x1fffff) {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    tmp = 0xf0808080 |
        ((code_point & 0x1c0000) << 6) |
        ((code_point & 0x03f000) << 4) |
        ((code_point & 0x000fc0) << 2) |
        (code_point & 0x003f);
    len = 4;
  } else {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    StringAppendF(output, "\\U%08x", code_point);
    return;
  }
  // Convert to big-endian so the encoded bytes occupy the *last* |len|
  // bytes of tmp, then append just those bytes.
  tmp = ghtonl(tmp);
  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
}

// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result. Returns true if that many digits were successfully consumed.
static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
  *result = 0;
  if (len == 0) return false;
  for (const char* end = ptr + len; ptr < end; ++ptr) {
    if (*ptr == '\0') return false;
    *result = (*result << 4) + DigitValue(*ptr);
  }
  return true;
}

// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
// surrogate. These numbers are in a reserved range of Unicode code points, so
// if we encounter such a pair we know how to parse it and convert it into a
// single code point.
// Note: the "Max" constants below are exclusive upper bounds.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;

static inline bool IsHeadSurrogate(uint32 code_point) {
  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
}

static inline bool IsTrailSurrogate(uint32 code_point) {
  return (code_point >= kMinTrailSurrogate) &&
      (code_point < kMaxTrailSurrogate);
}

// Combine a head and trail surrogate into a single Unicode code point.
static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
      (trail_surrogate - kMinTrailSurrogate));
}

// Convert the escape sequence parameter to a number of expected hex digits.
989 static inline int UnicodeLength(char key) { 990 if (key == 'u') return 4; 991 if (key == 'U') return 8; 992 return 0; 993 } 994 995 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt 996 // to parse that sequence. On success, returns a pointer to the first char 997 // beyond that sequence, and fills in *code_point. On failure, returns ptr 998 // itself. 999 static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) { 1000 const char* p = ptr; 1001 // Fetch the code point. 1002 const int len = UnicodeLength(*p++); 1003 if (!ReadHexDigits(p, len, code_point)) 1004 return ptr; 1005 p += len; 1006 1007 // Check if the code point we read is a "head surrogate." If so, then we 1008 // expect it to be immediately followed by another code point which is a valid 1009 // "trail surrogate," and together they form a UTF-16 pair which decodes into 1010 // a single Unicode point. Trail surrogates may only use \u, not \U. 1011 if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') { 1012 uint32 trail_surrogate; 1013 if (ReadHexDigits(p + 2, 4, &trail_surrogate) && 1014 IsTrailSurrogate(trail_surrogate)) { 1015 *code_point = AssembleUTF16(*code_point, trail_surrogate); 1016 p += 6; 1017 } 1018 // If this failed, then we just emit the head surrogate as a code point. 1019 // It's bogus, but so is the string. 1020 } 1021 1022 return p; 1023 } 1024 1025 // The text string must begin and end with single or double quote 1026 // characters. 1027 void Tokenizer::ParseStringAppend(const string& text, string* output) { 1028 // Reminder: text[0] is always a quote character. (If text is 1029 // empty, it's invalid, so we'll just return). 1030 const size_t text_size = text.size(); 1031 if (text_size == 0) { 1032 GOOGLE_LOG(DFATAL) 1033 << " Tokenizer::ParseStringAppend() passed text that could not" 1034 " have been tokenized as a string: " << CEscape(text); 1035 return; 1036 } 1037 1038 // Reserve room for new string. 
The branch is necessary because if 1039 // there is already space available the reserve() call might 1040 // downsize the output. 1041 const size_t new_len = text_size + output->size(); 1042 if (new_len > output->capacity()) { 1043 output->reserve(new_len); 1044 } 1045 1046 // Loop through the string copying characters to "output" and 1047 // interpreting escape sequences. Note that any invalid escape 1048 // sequences or other errors were already reported while tokenizing. 1049 // In this case we do not need to produce valid results. 1050 for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) { 1051 if (*ptr == '\\' && ptr[1] != '\0') { 1052 // An escape sequence. 1053 ++ptr; 1054 1055 if (OctalDigit::InClass(*ptr)) { 1056 // An octal escape. May one, two, or three digits. 1057 int code = DigitValue(*ptr); 1058 if (OctalDigit::InClass(ptr[1])) { 1059 ++ptr; 1060 code = code * 8 + DigitValue(*ptr); 1061 } 1062 if (OctalDigit::InClass(ptr[1])) { 1063 ++ptr; 1064 code = code * 8 + DigitValue(*ptr); 1065 } 1066 output->push_back(static_cast<char>(code)); 1067 1068 } else if (*ptr == 'x') { 1069 // A hex escape. May zero, one, or two digits. (The zero case 1070 // will have been caught as an error earlier.) 1071 int code = 0; 1072 if (HexDigit::InClass(ptr[1])) { 1073 ++ptr; 1074 code = DigitValue(*ptr); 1075 } 1076 if (HexDigit::InClass(ptr[1])) { 1077 ++ptr; 1078 code = code * 16 + DigitValue(*ptr); 1079 } 1080 output->push_back(static_cast<char>(code)); 1081 1082 } else if (*ptr == 'u' || *ptr == 'U') { 1083 uint32 unicode; 1084 const char* end = FetchUnicodePoint(ptr, &unicode); 1085 if (end == ptr) { 1086 // Failure: Just dump out what we saw, don't try to parse it. 1087 output->push_back(*ptr); 1088 } else { 1089 AppendUTF8(unicode, output); 1090 ptr = end - 1; // Because we're about to ++ptr. 1091 } 1092 } else { 1093 // Some other escape code. 
1094 output->push_back(TranslateEscape(*ptr)); 1095 } 1096 1097 } else if (*ptr == text[0] && ptr[1] == '\0') { 1098 // Ignore final quote matching the starting quote. 1099 } else { 1100 output->push_back(*ptr); 1101 } 1102 } 1103 } 1104 1105 template<typename CharacterClass> 1106 static bool AllInClass(const string& s) { 1107 for (int i = 0; i < s.size(); ++i) { 1108 if (!CharacterClass::InClass(s[i])) 1109 return false; 1110 } 1111 return true; 1112 } 1113 1114 bool Tokenizer::IsIdentifier(const string& text) { 1115 // Mirrors IDENTIFIER definition in Tokenizer::Next() above. 1116 if (text.size() == 0) 1117 return false; 1118 if (!Letter::InClass(text.at(0))) 1119 return false; 1120 if (!AllInClass<Alphanumeric>(text.substr(1))) 1121 return false; 1122 return true; 1123 } 1124 1125 } // namespace io 1126 } // namespace protobuf 1127 } // namespace google 1128