// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
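//
// TEST_2D works the same way but takes two case arrays.  A sketch of how it
// might be used (kOther and MyTest2D are made-up names for illustration, not
// taken from any real test):
//
// const char* kOther[] = {"a", "b"};
// TEST_2D(MyFixture, MyTest2D, kCases, kOther) {
//   EXPECT_GT(kCases_case, 0);
//   EXPECT_GT(strlen(kOther_case), 0u);
// }
//
// Each combination of a kCases element and a kOther element runs as its own
// case, and the SCOPED_TRACE message identifies both indices on failure.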

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return an empty buffer on the first call, and again on every
    // call whose index is a multiple of 3 or 5 after that.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count);   }
  int64 ByteCount() const { return array_stream_.ByteCount();   }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

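// Note that returning a zero-length buffer from Next() is legal for a
// ZeroCopyInputStream; only a false return value means end-of-stream.  So a
// well-behaved consumer (like the Tokenizer under test) simply asks for the
// next buffer when it gets an empty one.  A rough sketch of such a read loop,
// for illustration only (Process() is a made-up placeholder, not part of this
// test):
//
//   const void* data;
//   int size;
//   while (stream->Next(&data, &size)) {
//     if (size == 0) continue;  // tolerate empty buffers like the ones above
//     Process(data, size);
//   }
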
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

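// For example, AddError(0, 2, "Invalid escape sequence in string literal.")
// appends the line "0:2: Invalid escape sequence in string literal.\n" to
// text_; the expected-error strings in kErrorCases below are written in that
// same "line:column: message" format.
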
// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12 },
    { Tokenizer::TYPE_END       , ""     , 0, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3 },
    { Tokenizer::TYPE_END       , ""     , 0, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4 },
    { Tokenizer::TYPE_END       , ""   , 1,  7 },
  }},

  // Test that tabs affect column numbers correctly.  (A tab advances the
  // column to the next multiple of 8.)
  { "foo\tbar  \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
    { Tokenizer::TYPE_END       , ""   , 0, 19 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
    { Tokenizer::TYPE_END       , ""   , 0, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  6 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1,  3 },
  }},

  // Bytes with the high-order bit set should not be seen as control characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0 },
    { Tokenizer::TYPE_END   , ""    , 0, 1 },
  }},

  // Test all whitespace chars
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11 },
    { Tokenizer::TYPE_END       , ""   , 1, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0:   Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google