      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // https://developers.google.com/protocol-buffers/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton@google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 
     35 #include <limits.h>
     36 #include <math.h>
     37 
     38 #include <vector>
     39 
     40 #include <google/protobuf/io/tokenizer.h>
     41 #include <google/protobuf/io/zero_copy_stream_impl.h>
     42 
     43 #include <google/protobuf/stubs/common.h>
     44 #include <google/protobuf/stubs/strutil.h>
     45 #include <google/protobuf/stubs/substitute.h>
     46 #include <google/protobuf/testing/googletest.h>
     47 #include <gtest/gtest.h>
     48 
     49 namespace google {
     50 namespace protobuf {
     51 namespace io {
     52 namespace {
     53 
     54 // ===================================================================
     55 // Data-Driven Test Infrastructure
     56 
     57 // TODO(kenton):  This is copied from coded_stream_unittest.  This is
     58 //   temporary until these features are integrated into gTest itself.
     59 
     60 // TEST_1D and TEST_2D are macros I'd eventually like to see added to
     61 // gTest.  These macros can be used to declare tests which should be
     62 // run multiple times, once for each item in some input array.  TEST_1D
     63 // tests all cases in a single input array.  TEST_2D tests all
     64 // combinations of cases from two arrays.  The arrays must be statically
     65 // defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
     66 //
     67 // int kCases[] = {1, 2, 3, 4};
     68 // TEST_1D(MyFixture, MyTest, kCases) {
     69 //   EXPECT_GT(kCases_case, 0);
     70 // }
     71 //
     72 // This test iterates through the numbers 1, 2, 3, and 4 and tests that
     73 // they are all greater than zero.  In case of failure, the exact case
     74 // which failed will be printed.  The case type must be printable using
     75 // ostream::operator<<.
     76 
     77 #define TEST_1D(FIXTURE, NAME, CASES)                                      \
     78   class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
     79    protected:                                                              \
     80     template <typename CaseType>                                           \
     81     void DoSingleCase(const CaseType& CASES##_case);                       \
     82   };                                                                       \
     83                                                                            \
     84   TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
     85     for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                           \
     86       SCOPED_TRACE(testing::Message()                                      \
     87         << #CASES " case #" << i << ": " << CASES[i]);                     \
     88       DoSingleCase(CASES[i]);                                              \
     89     }                                                                      \
     90   }                                                                        \
     91                                                                            \
     92   template <typename CaseType>                                             \
     93   void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
     94 
     95 #define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
     96   class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
     97    protected:                                                              \
     98     template <typename CaseType1, typename CaseType2>                      \
     99     void DoSingleCase(const CaseType1& CASES1##_case,                      \
    100                       const CaseType2& CASES2##_case);                     \
    101   };                                                                       \
    102                                                                            \
    103   TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    104     for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                          \
    105       for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                        \
    106         SCOPED_TRACE(testing::Message()                                    \
    107           << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
    108           << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
    109         DoSingleCase(CASES1[i], CASES2[j]);                                \
    110       }                                                                    \
    111     }                                                                      \
    112   }                                                                        \
    113                                                                            \
    114   template <typename CaseType1, typename CaseType2>                        \
    115   void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
    116                                            const CaseType2& CASES2##_case)
    117 
    118 // -------------------------------------------------------------------
    119 
    120 // An input stream that is basically like an ArrayInputStream but sometimes
    121 // returns empty buffers, just to throw us off.
    122 class TestInputStream : public ZeroCopyInputStream {
    123  public:
    124   TestInputStream(const void* data, int size, int block_size)
    125     : array_stream_(data, size, block_size), counter_(0) {}
    126   ~TestInputStream() {}
    127 
    128   // implements ZeroCopyInputStream ----------------------------------
    129   bool Next(const void** data, int* size) {
    130     // Return an empty buffer on the first call and on every call whose
    131     // index is a multiple of 3 or 5, just to exercise empty-buffer handling.
    132     if (counter_ % 3 == 0 || counter_ % 5 == 0) {
    133       *data = NULL;
    134       *size = 0;
    135       ++counter_;
    136       return true;
    137     } else {
    138       ++counter_;
    139       return array_stream_.Next(data, size);
    140     }
    141   }
    142 
    143   void BackUp(int count)  { return array_stream_.BackUp(count); }
    144   bool Skip(int count)    { return array_stream_.Skip(count);   }
    145   int64 ByteCount() const { return array_stream_.ByteCount();   }
    146 
    147  private:
    148   ArrayInputStream array_stream_;
    149   int counter_;
    150 };
    151 
    152 // -------------------------------------------------------------------
    153 
    154 // An error collector which simply concatenates all its errors into a big
    155 // block of text which can be checked.
    156 class TestErrorCollector : public ErrorCollector {
    157  public:
    158   TestErrorCollector() {}
    159   ~TestErrorCollector() {}
    160 
    161   string text_;
    162 
    163   // implements ErrorCollector ---------------------------------------
    164   void AddError(int line, int column, const string& message) {
    165     strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
    166                                  line, column, message);
    167   }
    168 };
    169 
    170 // -------------------------------------------------------------------
    171 
    172 // We test each operation over a variety of block sizes to ensure that
    173 // we test cases where reads cross buffer boundaries as well as cases
    174 // where they don't.  This is a brute-force approach, but it's easy to
    175 // write and easy to understand.
    176 const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
    177 
    178 class TokenizerTest : public testing::Test {
    179  protected:
    180   // For easy testing.
    181   uint64 ParseInteger(const string& text) {
    182     uint64 result;
    183     EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    184     return result;
    185   }
    186 };
    187 
    188 // ===================================================================
    189 
    190 // These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
    191 //   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
    192 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
    193 
    194 // In each test case, the entire input text should parse as a single token
    195 // of the given type.
    196 struct SimpleTokenCase {
    197   string input;
    198   Tokenizer::TokenType type;
    199 };
    200 
    201 inline ostream& operator<<(ostream& out,
    202                            const SimpleTokenCase& test_case) {
    203   return out << CEscape(test_case.input);
    204 }
    205 
    206 SimpleTokenCase kSimpleTokenCases[] = {
    207   // Test identifiers.
    208   { "hello",       Tokenizer::TYPE_IDENTIFIER },
    209 
    210   // Test integers.
    211   { "123",         Tokenizer::TYPE_INTEGER },
    212   { "0xab6",       Tokenizer::TYPE_INTEGER },
    213   { "0XAB6",       Tokenizer::TYPE_INTEGER },
    214   { "0X1234567",   Tokenizer::TYPE_INTEGER },
    215   { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
    216   { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
    217   { "01234567",    Tokenizer::TYPE_INTEGER },
    218 
    219   // Test floats.
    220   { "123.45",      Tokenizer::TYPE_FLOAT },
    221   { "1.",          Tokenizer::TYPE_FLOAT },
    222   { "1e3",         Tokenizer::TYPE_FLOAT },
    223   { "1E3",         Tokenizer::TYPE_FLOAT },
    224   { "1e-3",        Tokenizer::TYPE_FLOAT },
    225   { "1e+3",        Tokenizer::TYPE_FLOAT },
    226   { "1.e3",        Tokenizer::TYPE_FLOAT },
    227   { "1.2e3",       Tokenizer::TYPE_FLOAT },
    228   { ".1",          Tokenizer::TYPE_FLOAT },
    229   { ".1e3",        Tokenizer::TYPE_FLOAT },
    230   { ".1e-3",       Tokenizer::TYPE_FLOAT },
    231   { ".1e+3",       Tokenizer::TYPE_FLOAT },
    232 
    233   // Test strings.
    234   { "'hello'",     Tokenizer::TYPE_STRING },
    235   { "\"foo\"",     Tokenizer::TYPE_STRING },
    236   { "'a\"b'",      Tokenizer::TYPE_STRING },
    237   { "\"a'b\"",     Tokenizer::TYPE_STRING },
    238   { "'a\\'b'",     Tokenizer::TYPE_STRING },
    239   { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
    240   { "'\\xf'",      Tokenizer::TYPE_STRING },
    241   { "'\\0'",       Tokenizer::TYPE_STRING },
    242 
    243   // Test symbols.
    244   { "+",           Tokenizer::TYPE_SYMBOL },
    245   { ".",           Tokenizer::TYPE_SYMBOL },
    246 };
    247 
    248 TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
    249   // Set up the tokenizer.
    250   TestInputStream input(kSimpleTokenCases_case.input.data(),
    251                         kSimpleTokenCases_case.input.size(),
    252                         kBlockSizes_case);
    253   TestErrorCollector error_collector;
    254   Tokenizer tokenizer(&input, &error_collector);
    255 
    256   // Before Next() is called, the initial token should always be TYPE_START.
    257   EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
    258   EXPECT_EQ("", tokenizer.current().text);
    259   EXPECT_EQ(0, tokenizer.current().line);
    260   EXPECT_EQ(0, tokenizer.current().column);
    261   EXPECT_EQ(0, tokenizer.current().end_column);
    262 
    263   // Parse the token.
    264   ASSERT_TRUE(tokenizer.Next());
    265 
    266   // Check that it has the right type.
    267   EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
    268   // Check that it contains the complete input text.
    269   EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
    270   // Check that it is located at the beginning of the input
    271   EXPECT_EQ(0, tokenizer.current().line);
    272   EXPECT_EQ(0, tokenizer.current().column);
    273   EXPECT_EQ(kSimpleTokenCases_case.input.size(),
    274             tokenizer.current().end_column);
    275 
    276   // There should be no more input.
    277   EXPECT_FALSE(tokenizer.Next());
    278 
    279   // After Next() returns false, the token should have type TYPE_END.
    280   EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
    281   EXPECT_EQ("", tokenizer.current().text);
    282   EXPECT_EQ(0, tokenizer.current().line);
    283   EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
    284   EXPECT_EQ(kSimpleTokenCases_case.input.size(),
    285             tokenizer.current().end_column);
    286 
    287   // There should be no errors.
    288   EXPECT_TRUE(error_collector.text_.empty());
    289 }
    290 
    291 TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
    292   // Test the "allow_f_after_float" option.
    293 
    294   // Set up the tokenizer.
    295   const char* text = "1f 2.5f 6e3f 7F";
    296   TestInputStream input(text, strlen(text), kBlockSizes_case);
    297   TestErrorCollector error_collector;
    298   Tokenizer tokenizer(&input, &error_collector);
    299   tokenizer.set_allow_f_after_float(true);
    300 
    301   // Advance through tokens and check that they are parsed as expected.
    302   ASSERT_TRUE(tokenizer.Next());
    303   EXPECT_EQ(tokenizer.current().text, "1f");
    304   EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
    305   ASSERT_TRUE(tokenizer.Next());
    306   EXPECT_EQ(tokenizer.current().text, "2.5f");
    307   EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
    308   ASSERT_TRUE(tokenizer.Next());
    309   EXPECT_EQ(tokenizer.current().text, "6e3f");
    310   EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
    311   ASSERT_TRUE(tokenizer.Next());
    312   EXPECT_EQ(tokenizer.current().text, "7F");
    313   EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
    314 
    315   // There should be no more input.
    316   EXPECT_FALSE(tokenizer.Next());
    317   // There should be no errors.
    318   EXPECT_TRUE(error_collector.text_.empty());
    319 }
    320 
    321 #endif
    322 
    323 // -------------------------------------------------------------------
    324 
    325 // In each case, the input is parsed to produce a list of tokens.  The
    326 // last token in "output" must have type TYPE_END.
    327 struct MultiTokenCase {
    328   string input;
    329   Tokenizer::Token output[10];  // The compiler wants a constant array
    330                                 // size for initialization to work.  There
    331                                 // is no reason this can't be increased if
    332                                 // needed.
    333 };
    334 
    335 inline ostream& operator<<(ostream& out,
    336                            const MultiTokenCase& test_case) {
    337   return out << CEscape(test_case.input);
    338 }
    339 
    340 MultiTokenCase kMultiTokenCases[] = {
    341   // Test empty input.
    342   { "", {
    343     { Tokenizer::TYPE_END       , ""     , 0,  0 },
    344   }},
    345 
    346   // Test all token types at the same time.
    347   { "foo 1 1.2 + 'bar'", {
    348     { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    349     { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    350     { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    351     { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    352     { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    353     { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
    354   }},
    355 
    356   // Test that consecutive symbols are parsed as separate tokens.
    357   { "!@+%", {
    358     { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    359     { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    360     { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    361     { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    362     { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
    363   }},
    364 
    365   // Test that newlines affect line numbers correctly.
    366   { "foo bar\nrab oof", {
    367     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
    368     { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
    369     { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
    370     { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
    371     { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
    372   }},
    373 
    374   // Test that tabs affect column numbers correctly.
    375   { "foo\tbar  \tbaz", {
    376     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    377     { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    378     { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    379     { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
    380   }},
    381 
    382   // Test that tabs in string literals affect column numbers correctly.
    383   { "\"foo\tbar\" baz", {
    384     { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    385     { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    386     { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
    387   }},
    388 
    389   // Test that line comments are ignored.
    390   { "foo // This is a comment\n"
    391     "bar // This is another comment", {
    392     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    393     { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    394     { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
    395   }},
    396 
    397   // Test that block comments are ignored.
    398   { "foo /* This is a block comment */ bar", {
    399     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    400     { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    401     { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
    402   }},
    403 
    404   // Test that sh-style comments are not ignored by default.
    405   { "foo # bar\n"
    406     "baz", {
    407     { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    408     { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    409     { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    410     { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    411     { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
    412   }},
    413 
    414   // Test all whitespace chars
    415   { "foo\n\t\r\v\fbar", {
    416     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    417     { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    418     { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
    419   }},
    420 };
    421 
    422 TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
    423   // Set up the tokenizer.
    424   TestInputStream input(kMultiTokenCases_case.input.data(),
    425                         kMultiTokenCases_case.input.size(),
    426                         kBlockSizes_case);
    427   TestErrorCollector error_collector;
    428   Tokenizer tokenizer(&input, &error_collector);
    429 
    430   // Before Next() is called, the initial token should always be TYPE_START.
    431   EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
    432   EXPECT_EQ("", tokenizer.current().text);
    433   EXPECT_EQ(0, tokenizer.current().line);
    434   EXPECT_EQ(0, tokenizer.current().column);
    435   EXPECT_EQ(0, tokenizer.current().end_column);
    436 
    437   // Loop through all expected tokens.
    438   int i = 0;
    439   Tokenizer::Token token;
    440   do {
    441     token = kMultiTokenCases_case.output[i++];
    442 
    443     SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
    444 
    445     Tokenizer::Token previous = tokenizer.current();
    446 
    447     // Next() should only return false when it hits the end token.
    448     if (token.type != Tokenizer::TYPE_END) {
    449       ASSERT_TRUE(tokenizer.Next());
    450     } else {
    451       ASSERT_FALSE(tokenizer.Next());
    452     }
    453 
    454     // Check that the previous token is set correctly.
    455     EXPECT_EQ(previous.type, tokenizer.previous().type);
    456     EXPECT_EQ(previous.text, tokenizer.previous().text);
    457     EXPECT_EQ(previous.line, tokenizer.previous().line);
    458     EXPECT_EQ(previous.column, tokenizer.previous().column);
    459     EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
    460 
    461     // Check that the token matches the expected one.
    462     EXPECT_EQ(token.type, tokenizer.current().type);
    463     EXPECT_EQ(token.text, tokenizer.current().text);
    464     EXPECT_EQ(token.line, tokenizer.current().line);
    465     EXPECT_EQ(token.column, tokenizer.current().column);
    466     EXPECT_EQ(token.end_column, tokenizer.current().end_column);
    467 
    468   } while (token.type != Tokenizer::TYPE_END);
    469 
    470   // There should be no errors.
    471   EXPECT_TRUE(error_collector.text_.empty());
    472 }
    473 
    474 // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
    475 //   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
    476 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
    477 
    478 TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
    479   // Test the "comment_style" option.
    480 
    481   const char* text = "foo # bar\n"
    482                      "baz // qux\n"
    483                      "corge /* grault */\n"
    484                      "garply";
    485   const char* const kTokens[] = {"foo",  // "# bar" is ignored
    486                                  "baz", "/", "/", "qux",
    487                                  "corge", "/", "*", "grault", "*", "/",
    488                                  "garply"};
    489 
    490   // Set up the tokenizer.
    491   TestInputStream input(text, strlen(text), kBlockSizes_case);
    492   TestErrorCollector error_collector;
    493   Tokenizer tokenizer(&input, &error_collector);
    494   tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
    495 
    496   // Advance through tokens and check that they are parsed as expected.
    497   for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    498     EXPECT_TRUE(tokenizer.Next());
    499     EXPECT_EQ(tokenizer.current().text, kTokens[i]);
    500   }
    501 
    502   // There should be no more input.
    503   EXPECT_FALSE(tokenizer.Next());
    504   // There should be no errors.
    505   EXPECT_TRUE(error_collector.text_.empty());
    506 }
    507 
    508 #endif
    509 
    510 // -------------------------------------------------------------------
    511 
    512 // In each case, the input is expected to have two tokens named "prev" and
    513 // "next" with comments in between.
    514 struct DocCommentCase {
    515   string input;
    516 
    517   const char* prev_trailing_comments;
    518   const char* detached_comments[10];
    519   const char* next_leading_comments;
    520 };
    521 
    522 inline ostream& operator<<(ostream& out,
    523                            const DocCommentCase& test_case) {
    524   return out << CEscape(test_case.input);
    525 }
    526 
    527 DocCommentCase kDocCommentCases[] = {
    528   {
    529     "prev next",
    530 
    531     "",
    532     {},
    533     ""
    534   },
    535 
    536   {
    537     "prev /* ignored */ next",
    538 
    539     "",
    540     {},
    541     ""
    542   },
    543 
    544   {
    545     "prev // trailing comment\n"
    546     "next",
    547 
    548     " trailing comment\n",
    549     {},
    550     ""
    551   },
    552 
    553   {
    554     "prev\n"
    555     "// leading comment\n"
    556     "// line 2\n"
    557     "next",
    558 
    559     "",
    560     {},
    561     " leading comment\n"
    562     " line 2\n"
    563   },
    564 
    565   {
    566     "prev\n"
    567     "// trailing comment\n"
    568     "// line 2\n"
    569     "\n"
    570     "next",
    571 
    572     " trailing comment\n"
    573     " line 2\n",
    574     {},
    575     ""
    576   },
    577 
    578   {
    579     "prev // trailing comment\n"
    580     "// leading comment\n"
    581     "// line 2\n"
    582     "next",
    583 
    584     " trailing comment\n",
    585     {},
    586     " leading comment\n"
    587     " line 2\n"
    588   },
    589 
    590   {
    591     "prev /* trailing block comment */\n"
    592     "/* leading block comment\n"
    593     " * line 2\n"
    594     " * line 3 */"
    595     "next",
    596 
    597     " trailing block comment ",
    598     {},
    599     " leading block comment\n"
    600     " line 2\n"
    601     " line 3 "
    602   },
    603 
    604   {
    605     "prev\n"
    606     "/* trailing block comment\n"
    607     " * line 2\n"
    608     " * line 3\n"
    609     " */\n"
    610     "/* leading block comment\n"
    611     " * line 2\n"
    612     " * line 3 */"
    613     "next",
    614 
    615     " trailing block comment\n"
    616     " line 2\n"
    617     " line 3\n",
    618     {},
    619     " leading block comment\n"
    620     " line 2\n"
    621     " line 3 "
    622   },
    623 
    624   {
    625     "prev\n"
    626     "// trailing comment\n"
    627     "\n"
    628     "// detached comment\n"
    629     "// line 2\n"
    630     "\n"
    631     "// second detached comment\n"
    632     "/* third detached comment\n"
    633     " * line 2 */\n"
    634     "// leading comment\n"
    635     "next",
    636 
    637     " trailing comment\n",
    638     {
    639       " detached comment\n"
    640       " line 2\n",
    641       " second detached comment\n",
    642       " third detached comment\n"
    643       " line 2 "
    644     },
    645     " leading comment\n"
    646   },
    647 
    648   {
    649     "prev /**/\n"
    650     "\n"
    651     "// detached comment\n"
    652     "\n"
    653     "// leading comment\n"
    654     "next",
    655 
    656     "",
    657     {
    658       " detached comment\n"
    659     },
    660     " leading comment\n"
    661   },
    662 
    663   {
    664     "prev /**/\n"
    665     "// leading comment\n"
    666     "next",
    667 
    668     "",
    669     {},
    670     " leading comment\n"
    671   },
    672 };
    673 
    674 TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
    675   // Set up the tokenizer.
    676   TestInputStream input(kDocCommentCases_case.input.data(),
    677                         kDocCommentCases_case.input.size(),
    678                         kBlockSizes_case);
    679   TestErrorCollector error_collector;
    680   Tokenizer tokenizer(&input, &error_collector);
    681 
    682   // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
    683   TestInputStream input2(kDocCommentCases_case.input.data(),
    684                         kDocCommentCases_case.input.size(),
    685                         kBlockSizes_case);
    686   Tokenizer tokenizer2(&input2, &error_collector);
    687 
    688   tokenizer.Next();
    689   tokenizer2.Next();
    690 
    691   EXPECT_EQ("prev", tokenizer.current().text);
    692   EXPECT_EQ("prev", tokenizer2.current().text);
    693 
    694   string prev_trailing_comments;
    695   vector<string> detached_comments;
    696   string next_leading_comments;
    697   tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
    698                              &next_leading_comments);
    699   tokenizer2.NextWithComments(NULL, NULL, NULL);
    700   EXPECT_EQ("next", tokenizer.current().text);
    701   EXPECT_EQ("next", tokenizer2.current().text);
    702 
    703   EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
    704             prev_trailing_comments);
    705 
    706   for (int i = 0; i < detached_comments.size(); i++) {
    707     ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    708     ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    709     EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
    710               detached_comments[i]);
    711   }
    712 
    713   // Verify that we matched all the detached comments.
    714   EXPECT_EQ(NULL,
    715       kDocCommentCases_case.detached_comments[detached_comments.size()]);
    716 
    717   EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
    718             next_leading_comments);
    719 }
    720 
    721 // -------------------------------------------------------------------
    722 
    723 // Test parse helpers.  It's not really worth setting up a full data-driven
    724 // test here.
    725 TEST_F(TokenizerTest, ParseInteger) {
    726   EXPECT_EQ(0, ParseInteger("0"));
    727   EXPECT_EQ(123, ParseInteger("123"));
    728   EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
    729   EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
    730   EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
    731   EXPECT_EQ(01234567, ParseInteger("01234567"));
    732   EXPECT_EQ(0X123, ParseInteger("0X123"));
    733 
    734   // Test invalid integers that may still be tokenized as integers.
    735   EXPECT_EQ(0, ParseInteger("0x"));
    736 
    737   uint64 i;
    738 #ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
    739   // Test invalid integers that will never be tokenized as integers.
    740   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    741     "passed text that could not have been tokenized as an integer");
    742   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    743     "passed text that could not have been tokenized as an integer");
    744   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    745     "passed text that could not have been tokenized as an integer");
    746   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    747     "passed text that could not have been tokenized as an integer");
    748   EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    749     "passed text that could not have been tokenized as an integer");
    750 #endif  // PROTOBUF_HAS_DEATH_TEST
    751 
    752   // Test overflows.
    753   EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
    754   EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
    755   EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
    756   EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
    757   EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
    758   EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
    759   EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
    760 }
    761 
    762 TEST_F(TokenizerTest, ParseFloat) {
    763   EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
    764   EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
    765   EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
    766   EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
    767   EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
    768   EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
    769   EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
    770   EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
    771   EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
    772   EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
    773   EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
    774   EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
    775   EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
    776   EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));
    777 
    778   // Test invalid floats that may still be tokenized as floats.
    779   EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
    780   EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
    781   EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));
    782 
    783   // Test 'f' suffix.
    784   EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
    785   EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
    786   EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));
    787 
    788   // These should parse successfully even though they are out of range.
    789   // Overflows become infinity and underflows become zero.
    790   EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
    791   EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));
    792 
    793 #ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
    794   // Test invalid floats that will never be tokenized as floats.
    795   EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    796     "passed text that could not have been tokenized as a float");
    797   EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    798     "passed text that could not have been tokenized as a float");
    799   EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    800     "passed text that could not have been tokenized as a float");
    801 #endif  // PROTOBUF_HAS_DEATH_TEST
    802 }
    803 
    804 TEST_F(TokenizerTest, ParseString) {
    805   string output;
    806   Tokenizer::ParseString("'hello'", &output);
    807   EXPECT_EQ("hello", output);
    808   Tokenizer::ParseString("\"blah\\nblah2\"", &output);
    809   EXPECT_EQ("blah\nblah2", output);
    810   Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
    811   EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
    812   Tokenizer::ParseString("'\\x20\\x4'", &output);
    813   EXPECT_EQ("\x20\x4", output);
    814 
    815   // Test invalid strings that may still be tokenized as strings.
    816   Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
    817   EXPECT_EQ("\a?\v\t", output);
    818   Tokenizer::ParseString("'", &output);
    819   EXPECT_EQ("", output);
    820   Tokenizer::ParseString("'\\", &output);
    821   EXPECT_EQ("\\", output);
    822 
    823   // Experiment with Unicode escapes. Here are characters whose UTF-8
    824   // encodings are one, two, three, and four bytes long.
    825   Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
    826   EXPECT_EQ("$XX", output);
    827   // Same thing encoded using UTF16.
    828   Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
    829   EXPECT_EQ("$XX", output);
    830   // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
    831   // We just output this as if it were UTF8; it's not a defined code point, but
    832   // it has a defined encoding.
    833   Tokenizer::ParseString("'\\ud852XX'", &output);
    834   EXPECT_EQ("\xed\xa1\x92XX", output);
    835   // Malformed escape: Demons may fly out of the nose.
    836   Tokenizer::ParseString("\\u0", &output);
    837   EXPECT_EQ("u0", output);
    838 
    839   // Test invalid strings that will never be tokenized as strings.
    840 #ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
    841   EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    842     "passed text that could not have been tokenized as a string");
    843 #endif  // PROTOBUF_HAS_DEATH_TEST
    844 }
    845 
    846 TEST_F(TokenizerTest, ParseStringAppend) {
    847   // Check that ParseString and ParseStringAppend differ.
    848   string output("stuff+");
    849   Tokenizer::ParseStringAppend("'hello'", &output);
    850   EXPECT_EQ("stuff+hello", output);
    851   Tokenizer::ParseString("'hello'", &output);
    852   EXPECT_EQ("hello", output);
    853 }
    854 
    855 // -------------------------------------------------------------------
    856 
    857 // Each case parses some input text, ignoring the tokens produced, and
    858 // checks that the error output matches what is expected.
    859 struct ErrorCase {
    860   string input;
    861   bool recoverable;  // True if the tokenizer should be able to recover and
    862                      // parse more tokens after seeing this error.  Cases
    863                      // for which this is true must end with "foo" as
    864                      // the last token, which the test will check for.
    865   const char* errors;
    866 };
    867 
    868 inline ostream& operator<<(ostream& out,
    869                            const ErrorCase& test_case) {
    870   return out << CEscape(test_case.input);
    871 }
    872 
    873 ErrorCase kErrorCases[] = {
    874   // String errors.
    875   { "'\\l' foo", true,
    876     "0:2: Invalid escape sequence in string literal.\n" },
    877   { "'\\x' foo", true,
    878     "0:3: Expected hex digits for escape sequence.\n" },
    879   { "'foo", false,
    880     "0:4: Unexpected end of string.\n" },
    881   { "'bar\nfoo", true,
    882     "0:4: String literals cannot cross line boundaries.\n" },
    883   { "'\\u01' foo", true,
    884     "0:5: Expected four hex digits for \\u escape sequence.\n" },
    885   { "'\\u01' foo", true,
    886     "0:5: Expected four hex digits for \\u escape sequence.\n" },
    887   { "'\\uXYZ' foo", true,
    888     "0:3: Expected four hex digits for \\u escape sequence.\n" },
    889 
    890   // Integer errors.
    891   { "123foo", true,
    892     "0:3: Need space between number and identifier.\n" },
    893 
    894   // Hex/octal errors.
    895   { "0x foo", true,
    896     "0:2: \"0x\" must be followed by hex digits.\n" },
    897   { "0541823 foo", true,
    898     "0:4: Numbers starting with leading zero must be in octal.\n" },
    899   { "0x123z foo", true,
    900     "0:5: Need space between number and identifier.\n" },
    901   { "0x123.4 foo", true,
    902     "0:5: Hex and octal numbers must be integers.\n" },
    903   { "0123.4 foo", true,
    904     "0:4: Hex and octal numbers must be integers.\n" },
    905 
    906   // Float errors.
    907   { "1e foo", true,
    908     "0:2: \"e\" must be followed by exponent.\n" },
    909   { "1e- foo", true,
    910     "0:3: \"e\" must be followed by exponent.\n" },
    911   { "1.2.3 foo", true,
    912     "0:3: Already saw decimal point or exponent; can't have another one.\n" },
    913   { "1e2.3 foo", true,
    914     "0:3: Already saw decimal point or exponent; can't have another one.\n" },
    915   { "a.1 foo", true,
    916     "0:1: Need space between identifier and decimal point.\n" },
    917   // allow_f_after_float not enabled, so this should be an error.
    918   { "1.0f foo", true,
    919     "0:3: Need space between number and identifier.\n" },
    920 
    921   // Block comment errors.
    922   { "/*", false,
    923     "0:2: End-of-file inside block comment.\n"
    924     "0:0:   Comment started here.\n"},
    925   { "/*/*/ foo", true,
    926     "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},
    927 
    928   // Control characters.  Multiple consecutive control characters should only
    929   // produce one error.
    930   { "\b foo", true,
    931     "0:0: Invalid control characters encountered in text.\n" },
    932   { "\b\b foo", true,
    933     "0:0: Invalid control characters encountered in text.\n" },
    934 
    935   // Check that control characters at end of input don't result in an
    936   // infinite loop.
    937   { "\b", false,
    938     "0:0: Invalid control characters encountered in text.\n" },
    939 
    940   // Check recovery from '\0'.  We have to explicitly specify the length of
    941   // these strings because otherwise the string constructor will just call
    942   // strlen() which will see the first '\0' and think that is the end of the
    943   // string.
    944   { string("\0foo", 4), true,
    945     "0:0: Invalid control characters encountered in text.\n" },
    946   { string("\0\0foo", 5), true,
    947     "0:0: Invalid control characters encountered in text.\n" },
    948 
    949   // Check the error produced when a byte has its high-order bit set.
    950   { "\300foo", true,
    951     "0:0: Interpreting non ascii codepoint 192.\n" },
    952 };
    953 
    954 TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
    955   // Set up the tokenizer.
    956   TestInputStream input(kErrorCases_case.input.data(),
    957                         kErrorCases_case.input.size(),
    958                         kBlockSizes_case);
    959   TestErrorCollector error_collector;
    960   Tokenizer tokenizer(&input, &error_collector);
    961 
    962   // Ignore all input, except remember if the last token was "foo".
    963   bool last_was_foo = false;
    964   while (tokenizer.Next()) {
    965     last_was_foo = tokenizer.current().text == "foo";
    966   }
    967 
    968   // Check that the errors match what was expected.
    969   EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
    970 
    971   // If the error was recoverable, make sure we saw "foo" after it.
    972   if (kErrorCases_case.recoverable) {
    973     EXPECT_TRUE(last_was_foo);
    974   }
    975 }
    976 
    977 // -------------------------------------------------------------------
    978 
    979 TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
    980   string text = "foo bar";
    981   TestInputStream input(text.data(), text.size(), kBlockSizes_case);
    982 
    983   // Create a tokenizer, read one token, then destroy it.
    984   {
    985     TestErrorCollector error_collector;
    986     Tokenizer tokenizer(&input, &error_collector);
    987 
    988     tokenizer.Next();
    989   }
    990 
    991   // Only "foo" should have been read.
    992   EXPECT_EQ(strlen("foo"), input.ByteCount());
    993 }
    994 
    995 
    996 }  // namespace
    997 }  // namespace io
    998 }  // namespace protobuf
    999 }  // namespace google
   1000