// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton): This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
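//
// TEST_2D works the same way but takes two case arrays.  A hypothetical
// usage (illustrative only; kValues and kMultipliers are not defined in
// this file) would look like:
//
// int kValues[] = {1, 2, 3};
// int kMultipliers[] = {10, 100};
// TEST_2D(MyFixture, MyProductTest, kValues, kMultipliers) {
//   EXPECT_GT(kValues_case * kMultipliers_case, 0);
// }
//
// The body runs once for every (kValues[i], kMultipliers[j]) pair, with the
// current items available as kValues_case and kMultipliers_case.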

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
                   << #CASES " case #" << i << ": " << CASES[i]);          \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                    \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                  \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream, but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return empty buffers starting with the first buffer, and every
    // 3 and 5 buffers after that.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
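//
// Note that with block size 1 every multi-character token necessarily
// straddles buffer boundaries, while 1024 is large enough that most of these
// test inputs arrive in a single buffer.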
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
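//
// As exercised by the cases below, NextWithComments() is expected to sort the
// comments it skips into three buckets: a comment on the same line as "prev"
// (or a comment block directly after it that is separated from "next" by a
// blank line) becomes the trailing comment, the comment block immediately
// preceding "next" becomes its leading comment, and comment blocks separated
// from both by blank lines are reported as detached comments.  (This is a
// summary of the behavior the cases encode, not a normative description of
// the API.)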
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline ostream& operator<<(ostream& out,
                           const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
      kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\X' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
904 { "1e foo", true, 905 "0:2: \"e\" must be followed by exponent.\n" }, 906 { "1e- foo", true, 907 "0:3: \"e\" must be followed by exponent.\n" }, 908 { "1.2.3 foo", true, 909 "0:3: Already saw decimal point or exponent; can't have another one.\n" }, 910 { "1e2.3 foo", true, 911 "0:3: Already saw decimal point or exponent; can't have another one.\n" }, 912 { "a.1 foo", true, 913 "0:1: Need space between identifier and decimal point.\n" }, 914 // allow_f_after_float not enabled, so this should be an error. 915 { "1.0f foo", true, 916 "0:3: Need space between number and identifier.\n" }, 917 918 // Block comment errors. 919 { "/*", false, 920 "0:2: End-of-file inside block comment.\n" 921 "0:0: Comment started here.\n"}, 922 { "/*/*/ foo", true, 923 "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"}, 924 925 // Control characters. Multiple consecutive control characters should only 926 // produce one error. 927 { "\b foo", true, 928 "0:0: Invalid control characters encountered in text.\n" }, 929 { "\b\b foo", true, 930 "0:0: Invalid control characters encountered in text.\n" }, 931 932 // Check that control characters at end of input don't result in an 933 // infinite loop. 934 { "\b", false, 935 "0:0: Invalid control characters encountered in text.\n" }, 936 937 // Check recovery from '\0'. We have to explicitly specify the length of 938 // these strings because otherwise the string constructor will just call 939 // strlen() which will see the first '\0' and think that is the end of the 940 // string. 941 { string("\0foo", 4), true, 942 "0:0: Invalid control characters encountered in text.\n" }, 943 { string("\0\0foo", 5), true, 944 "0:0: Invalid control characters encountered in text.\n" }, 945 946 // Check error from high order bits set 947 { "\300foo", true, 948 "0:0: Interpreting non ascii codepoint 192.\n" }, 949 }; 950 951 TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) { 952 // Set up the tokenizer. 953 TestInputStream input(kErrorCases_case.input.data(), 954 kErrorCases_case.input.size(), 955 kBlockSizes_case); 956 TestErrorCollector error_collector; 957 Tokenizer tokenizer(&input, &error_collector); 958 959 // Ignore all input, except remember if the last token was "foo". 960 bool last_was_foo = false; 961 while (tokenizer.Next()) { 962 last_was_foo = tokenizer.current().text == "foo"; 963 } 964 965 // Check that the errors match what was expected. 966 EXPECT_EQ(kErrorCases_case.errors, error_collector.text_); 967 968 // If the error was recoverable, make sure we saw "foo" after it. 969 if (kErrorCases_case.recoverable) { 970 EXPECT_TRUE(last_was_foo); 971 } 972 } 973 974 // ------------------------------------------------------------------- 975 976 TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) { 977 string text = "foo bar"; 978 TestInputStream input(text.data(), text.size(), kBlockSizes_case); 979 980 // Create a tokenizer, read one token, then destroy it. 981 { 982 TestErrorCollector error_collector; 983 Tokenizer tokenizer(&input, &error_collector); 984 985 tokenizer.Next(); 986 } 987 988 // Only "foo" should have been read. 989 EXPECT_EQ(strlen("foo"), input.ByteCount()); 990 } 991 992 993 } // namespace 994 } // namespace io 995 } // namespace protobuf 996 } // namespace google 997