Home | History | Annotate | Download | only in io
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // https://developers.google.com/protocol-buffers/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton (at) google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 //
     35 // Here we have a hand-written lexer.  At first you might ask yourself,
     36 // "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
     37 // yes I am crazy, but that's beside the point.  There are actually reasons
     38 // why I ended up writing this this way.
     39 //
     40 // The traditional approach to lexing is to use lex to generate a lexer for
     41 // you.  Unfortunately, lex's output is ridiculously ugly and difficult to
     42 // integrate cleanly with C++ code, especially abstract code or code meant
     43 // as a library.  Better parser-generators exist but would add dependencies
     44 // which most users won't already have, which we'd like to avoid.  (GNU flex
     45 // has a C++ output option, but it's still ridiculously ugly, non-abstract,
     46 // and not library-friendly.)
     47 //
     48 // The next approach that any good software engineer should look at is to
     49 // use regular expressions.  And, indeed, I did.  I have code which
     50 // implements this same class using regular expressions.  It's about 200
     51 // lines shorter.  However:
     52 // - Rather than error messages telling you "This string has an invalid
     53 //   escape sequence at line 5, column 45", you get error messages like
     54 //   "Parse error on line 5".  Giving more precise errors requires adding
     55 //   a lot of code that ends up basically as complex as the hand-coded
     56 //   version anyway.
     57 // - The regular expression to match a string literal looks like this:
     58 //     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
     59 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
     60 //                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
     61 //                       "\'([^\'\\\\]|"        // Also support single-quotes.
     62 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"
     63 //                       "\\\\x[0-9a-fA-F])*\')");
     64 //   Verifying the correctness of this line noise is actually harder than
     65 //   verifying the correctness of ConsumeString(), defined below.  I'm not
     66 //   even confident that the above is correct, after staring at it for some
     67 //   time.
     68 // - PCRE is fast, but there's still more overhead involved than the code
     69 //   below.
     70 // - Sadly, regular expressions are not part of the C standard library, so
     71 //   using them would require depending on some other library.  For the
     72 //   open source release, this could be really annoying.  Nobody likes
     73 //   downloading one piece of software just to find that they need to
     74 //   download something else to make it work, and in all likelihood
     75 //   people downloading Protocol Buffers will already be doing so just
     76 //   to make something else work.  We could include a copy of PCRE with
     77 //   our code, but that obligates us to keep it up-to-date and just seems
     78 //   like a big waste just to save 200 lines of code.
     79 //
     80 // On a similar but unrelated note, I'm even scared to use ctype.h.
     81 // Apparently functions like isalpha() are locale-dependent.  So, if we used
     82 // that, then if this code is being called from some program that doesn't
     83 // have its locale set to "C", it would behave strangely.  We can't just set
     84 // the locale to "C" ourselves since we might break the calling program that
     85 // way, particularly if it is multi-threaded.  WTF?  Someone please let me
     86 // (Kenton) know if I'm missing something here...
     87 //
     88 // I'd love to hear about other alternatives, though, as this code isn't
     89 // exactly pretty.
     90 
     91 #include <google/protobuf/io/tokenizer.h>
     92 #include <google/protobuf/stubs/common.h>
     93 #include <google/protobuf/stubs/logging.h>
     94 #include <google/protobuf/stubs/stringprintf.h>
     95 #include <google/protobuf/io/strtod.h>
     96 #include <google/protobuf/io/zero_copy_stream.h>
     97 #include <google/protobuf/stubs/strutil.h>
     98 #include <google/protobuf/stubs/stl_util.h>
     99 
    100 namespace google {
    101 namespace protobuf {
    102 namespace io {
    103 namespace {
    104 
    105 // As mentioned above, I don't trust ctype.h due to the presence of "locales".
    106 // So, I have written replacement functions here.  Someone please smack me if
    107 // this is a bad idea or if there is some way around this.
    108 //
    109 // These "character classes" are designed to be used in template methods.
    110 // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
    111 // whitespace.
    112 
    113 // Note:  No class is allowed to contain '\0', since this is used to mark end-
    114 //   of-input and is handled specially.
    115 
    116 #define CHARACTER_CLASS(NAME, EXPRESSION)      \
    117   class NAME {                                 \
    118    public:                                     \
    119     static inline bool InClass(char c) {       \
    120       return EXPRESSION;                       \
    121     }                                          \
    122   }
    123 
    124 CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
    125                             c == '\r' || c == '\v' || c == '\f');
    126 CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
    127                                      c == '\r' || c == '\v' || c == '\f');
    128 
    129 CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
    130 
    131 CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
    132 CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
    133 CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
    134                           ('a' <= c && c <= 'f') ||
    135                           ('A' <= c && c <= 'F'));
    136 
    137 CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
    138                         ('A' <= c && c <= 'Z') ||
    139                         (c == '_'));
    140 
    141 CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
    142                               ('A' <= c && c <= 'Z') ||
    143                               ('0' <= c && c <= '9') ||
    144                               (c == '_'));
    145 
    146 CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
    147                         c == 'r' || c == 't' || c == 'v' || c == '\\' ||
    148                         c == '?' || c == '\'' || c == '\"');
    149 
    150 #undef CHARACTER_CLASS
    151 
    152 // Given a char, interpret it as a numeric digit and return its value.
    153 // This supports any number base up to 36.
    154 inline int DigitValue(char digit) {
    155   if ('0' <= digit && digit <= '9') return digit - '0';
    156   if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
    157   if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
    158   return -1;
    159 }
    160 
    161 // Inline because it's only used in one place.
    162 inline char TranslateEscape(char c) {
    163   switch (c) {
    164     case 'a':  return '\a';
    165     case 'b':  return '\b';
    166     case 'f':  return '\f';
    167     case 'n':  return '\n';
    168     case 'r':  return '\r';
    169     case 't':  return '\t';
    170     case 'v':  return '\v';
    171     case '\\': return '\\';
    172     case '?':  return '\?';    // Trigraphs = :(
    173     case '\'': return '\'';
    174     case '"':  return '\"';
    175 
    176     // We expect escape sequences to have been validated separately.
    177     default:   return '?';
    178   }
    179 }
    180 
    181 }  // anonymous namespace
    182 
// Out-of-line definition so the ErrorCollector vtable is emitted in this
// translation unit.
ErrorCollector::~ErrorCollector() {}
    184 
    185 // ===================================================================
    186 
// Construct a Tokenizer that reads from the given ZeroCopyInputStream and
// reports errors to the given ErrorCollector.  Neither object is owned or
// deleted by the Tokenizer (see the destructor).
Tokenizer::Tokenizer(ZeroCopyInputStream* input,
                     ErrorCollector* error_collector)
  : input_(input),
    error_collector_(error_collector),
    buffer_(NULL),
    buffer_size_(0),
    buffer_pos_(0),
    read_error_(false),
    line_(0),
    column_(0),
    record_target_(NULL),
    record_start_(-1),
    allow_f_after_float_(false),
    comment_style_(CPP_COMMENT_STYLE),
    require_space_after_number_(true),
    allow_multiline_strings_(false) {

  // No token has been read yet.
  current_.line = 0;
  current_.column = 0;
  current_.end_column = 0;
  current_.type = TYPE_START;

  // Load the first buffer so current_char_ is valid.
  Refresh();
}
    211 
// Destructor does not delete input_ or error_collector_; it only returns
// unread bytes to the stream.
Tokenizer::~Tokenizer() {
  // If we had any buffer left unread, return it to the underlying stream
  // so that someone else can read it.
  if (buffer_size_ > buffer_pos_) {
    input_->BackUp(buffer_size_ - buffer_pos_);
  }
}
    219 
    220 // -------------------------------------------------------------------
    221 // Internal helpers.
    222 
    223 void Tokenizer::NextChar() {
    224   // Update our line and column counters based on the character being
    225   // consumed.
    226   if (current_char_ == '\n') {
    227     ++line_;
    228     column_ = 0;
    229   } else if (current_char_ == '\t') {
    230     column_ += kTabWidth - column_ % kTabWidth;
    231   } else {
    232     ++column_;
    233   }
    234 
    235   // Advance to the next character.
    236   ++buffer_pos_;
    237   if (buffer_pos_ < buffer_size_) {
    238     current_char_ = buffer_[buffer_pos_];
    239   } else {
    240     Refresh();
    241   }
    242 }
    243 
// Fetch the next non-empty buffer from the input stream and point
// current_char_ at its first byte.  On end-of-stream or read error, sets
// read_error_ and leaves current_char_ == '\0'.
void Tokenizer::Refresh() {
  if (read_error_) {
    current_char_ = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.
  if (record_target_ != NULL && record_start_ < buffer_size_) {
    record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
    record_start_ = 0;
  }

  const void* data = NULL;
  buffer_ = NULL;
  buffer_pos_ = 0;
  do {
    if (!input_->Next(&data, &buffer_size_)) {
      // end of stream (or read error)
      buffer_size_ = 0;
      read_error_ = true;
      current_char_ = '\0';
      return;
    }
  } while (buffer_size_ == 0);  // Next() may legally return empty buffers.

  buffer_ = static_cast<const char*>(data);

  current_char_ = buffer_[0];
}
    273 
// Start recording all consumed characters into *target, beginning with the
// current (not-yet-consumed) character.
inline void Tokenizer::RecordTo(string* target) {
  record_target_ = target;
  record_start_ = buffer_pos_;
}
    278 
// Stop recording, flushing any characters consumed since RecordTo() (or
// since the last buffer Refresh()) into the target string.
inline void Tokenizer::StopRecording() {
  // Note:  The if() is necessary because some STL implementations crash when
  //   you call string::append(NULL, 0), presumably because they are trying to
  //   be helpful by detecting the NULL pointer, even though there's nothing
  //   wrong with reading zero bytes from NULL.
  if (buffer_pos_ != record_start_) {
    record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
  }
  record_target_ = NULL;
  record_start_ = -1;
}
    290 
// Begin a new token at the current position, recording its text into
// current_.text as characters are consumed.
inline void Tokenizer::StartToken() {
  current_.type = TYPE_START;    // Just for the sake of initializing it.
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  RecordTo(&current_.text);
}
    298 
// Finish the token begun by StartToken(): stop recording its text and note
// the column just past its final character.
inline void Tokenizer::EndToken() {
  StopRecording();
  current_.end_column = column_;
}
    303 
    304 // -------------------------------------------------------------------
    305 // Helper methods that consume characters.
    306 
// Returns true if the current character is in CharacterClass, without
// consuming anything.
template<typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}
    311 
    312 template<typename CharacterClass>
    313 inline bool Tokenizer::TryConsumeOne() {
    314   if (CharacterClass::InClass(current_char_)) {
    315     NextChar();
    316     return true;
    317   } else {
    318     return false;
    319   }
    320 }
    321 
    322 inline bool Tokenizer::TryConsume(char c) {
    323   if (current_char_ == c) {
    324     NextChar();
    325     return true;
    326   } else {
    327     return false;
    328   }
    329 }
    330 
// Consume characters as long as they are in CharacterClass.  Consuming zero
// characters is not an error.
template<typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}
    337 
    338 template<typename CharacterClass>
    339 inline void Tokenizer::ConsumeOneOrMore(const char* error) {
    340   if (!CharacterClass::InClass(current_char_)) {
    341     AddError(error);
    342   } else {
    343     do {
    344       NextChar();
    345     } while (CharacterClass::InClass(current_char_));
    346   }
    347 }
    348 
    349 // -------------------------------------------------------------------
    350 // Methods that read whole patterns matching certain kinds of tokens
    351 // or comments.
    352 
// Consume the remainder of a string literal, the opening quote having
// already been consumed.  Validates escape sequences as it goes; errors are
// reported via AddError().  Stops after consuming the closing delimiter, or
// returns early on end-of-input or (unless allow_multiline_strings_) a raw
// newline.
void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
        AddError("Unexpected end of string.");
        return;

      case '\n': {
        if (!allow_multiline_strings_) {
          AddError("String literals cannot cross line boundaries.");
          return;
        }
        NextChar();
        break;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          // \u requires exactly four hex digits.
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        // The closing delimiter ends the string; any other character is
        // ordinary string content.
        if (current_char_ == delimiter) {
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}
    421 
// Consume the remainder of a number token.  The first character — a '0' if
// started_with_zero, a '.' if started_with_dot — has already been consumed
// by the caller (see Next()).  Returns TYPE_FLOAT or TYPE_INTEGER according
// to what was actually parsed.
Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      // An '8' or '9' followed the octal digits.
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    // Optional exponent, with optional sign.
    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    // Optional 'f' suffix (e.g. "1.5f"), only when explicitly enabled.
    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>() && require_space_after_number_) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    // A '.' remaining here is either a second decimal point or a decimal
    // point after a hex/octal literal; both are errors.
    if (is_float) {
      AddError(
        "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
    476 
    477 void Tokenizer::ConsumeLineComment(string* content) {
    478   if (content != NULL) RecordTo(content);
    479 
    480   while (current_char_ != '\0' && current_char_ != '\n') {
    481     NextChar();
    482   }
    483   TryConsume('\n');
    484 
    485   if (content != NULL) StopRecording();
    486 }
    487 
// Consume a block comment; the leading "/*" has already been consumed.  If
// content is non-NULL, the comment text is recorded into it, with each
// continuation line's leading whitespace-plus-'*' and the trailing "*/"
// stripped.
void Tokenizer::ConsumeBlockComment(string* content) {
  // Remember where the comment started for the error message in case it is
  // never terminated.  The already-consumed "/*" accounts for 2 columns.
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    // Scan to the next character that could matter: terminator pieces,
    // newline, or end of input.
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();

      // Consume the next line's leading whitespace and asterisk, if any,
      // so they don't end up in the recorded content.
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note:  We didn't consume the '*' because if there is a '/' after it
      //   we want to interpret that as the end of the comment.
      AddError(
        "\"/*\" inside block comment.  Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
        start_line, start_column, "  Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}
    537 
    538 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
    539   if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    540     if (TryConsume('/')) {
    541       return LINE_COMMENT;
    542     } else if (TryConsume('*')) {
    543       return BLOCK_COMMENT;
    544     } else {
    545       // Oops, it was just a slash.  Return it.
    546       current_.type = TYPE_SYMBOL;
    547       current_.text = "/";
    548       current_.line = line_;
    549       current_.column = column_ - 1;
    550       current_.end_column = column_;
    551       return SLASH_NOT_COMMENT;
    552     }
    553   } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    554     return LINE_COMMENT;
    555   } else {
    556     return NO_COMMENT;
    557   }
    558 }
    559 
    560 // -------------------------------------------------------------------
    561 
// Advance to the next token.  Returns true if a token was read; returns
// false at end of input, in which case current_ is set to a TYPE_END token.
bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    // Comments are skipped entirely here; a lone '/' was already packaged
    // as a symbol token by TryConsumeCommentStart().
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too.  But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error.  We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.  The first character determines the
      // token type.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(line_, column_ - 2,
              "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Check if the high order bit is set.
        if (current_char_ & 0x80) {
          error_collector_->AddError(line_, column_,
              StringPrintf("Interpreting non ascii codepoint %d.",
                           static_cast<unsigned char>(current_char_)));
        }
        // Any other single character is a symbol token.
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}
    655 
    656 namespace {
    657 
    658 // Helper class for collecting comments and putting them in the right places.
    659 //
    660 // This basically just buffers the most recent comment until it can be decided
    661 // exactly where that comment should be placed.  When Flush() is called, the
    662 // current comment goes into either prev_trailing_comments or detached_comments.
    663 // When the CommentCollector is destroyed, the last buffered comment goes into
    664 // next_leading_comments.
class CommentCollector {
 public:
  // Any of the output pointers may be NULL if the caller is not interested
  // in that category of comment.  All non-NULL outputs are cleared up front.
  CommentCollector(string* prev_trailing_comments,
                   vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    // A block comment is never combined with any earlier comment.
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discard the buffered comment entirely.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        // The first comment flushed trails the previous token; any later
        // ones are detached.
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  // No further comments can be attached to the previous token (e.g. the
  // tokenizer has moved past the previous token's line).
  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  vector<string>* detached_comments_;
  string* next_leading_comments_;

  // Holds the most recently read comment until we decide where it goes.
  string comment_buffer_;

  // True if any comments were read into comment_buffer_.  This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};
    756 
    757 } // namespace
    758 
// Like Next(), but additionally distributes any comments seen between the
// previous token and the next one into the three output arguments (each of
// which may be left untouched if no comment of that category is found).  The
// CommentCollector writes its results out when it goes out of scope.
bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // Ignore unicode byte order mark(BOM) if it appears at the file
    // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
    if (TryConsume((char)0xEF)) {
      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
                 "Only UTF-8 is accepted for proto file.");
        return false;
      }
    }
    // At the start of the file there is no previous token for comments to
    // attach to.
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line.  If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line.  There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.  Everything read
  // from here on is a candidate for a detached comment or the next token's
  // leading comment.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.  A blank line separates the buffered
          // comment from both neighbors, making it a detached comment.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope.  In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}
    853 
    854 // -------------------------------------------------------------------
    855 // Token-parsing helpers.  Remember that these don't need to report
    856 // errors since any errors should already have been reported while
    857 // tokenizing.  Also, these can assume that whatever text they
    858 // are given is text that the tokenizer actually parsed as a token
    859 // of the given type.
    860 
    861 bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
    862                              uint64* output) {
    863   // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
    864   // is non-standard.  I hate the C standard library.  :(
    865 
    866 //  return strtoull(text.c_str(), NULL, 0);
    867 
    868   const char* ptr = text.c_str();
    869   int base = 10;
    870   if (ptr[0] == '0') {
    871     if (ptr[1] == 'x' || ptr[1] == 'X') {
    872       // This is hex.
    873       base = 16;
    874       ptr += 2;
    875     } else {
    876       // This is octal.
    877       base = 8;
    878     }
    879   }
    880 
    881   uint64 result = 0;
    882   for (; *ptr != '\0'; ptr++) {
    883     int digit = DigitValue(*ptr);
    884     if (digit < 0 || digit >= base) {
    885       // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
    886       // token, but Tokenizer still think it's integer.
    887       return false;
    888     }
    889     if (digit > max_value || result > (max_value - digit) / base) {
    890       // Overflow.
    891       return false;
    892     }
    893     result = result * base + digit;
    894   }
    895 
    896   *output = result;
    897   return true;
    898 }
    899 
    900 double Tokenizer::ParseFloat(const string& text) {
    901   const char* start = text.c_str();
    902   char* end;
    903   double result = NoLocaleStrtod(start, &end);
    904 
    905   // "1e" is not a valid float, but if the tokenizer reads it, it will
    906   // report an error but still return it as a valid token.  We need to
    907   // accept anything the tokenizer could possibly return, error or not.
    908   if (*end == 'e' || *end == 'E') {
    909     ++end;
    910     if (*end == '-' || *end == '+') ++end;
    911   }
    912 
    913   // If the Tokenizer had allow_f_after_float_ enabled, the float may be
    914   // suffixed with the letter 'f'.
    915   if (*end == 'f' || *end == 'F') {
    916     ++end;
    917   }
    918 
    919   GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
    920     << " Tokenizer::ParseFloat() passed text that could not have been"
    921        " tokenized as a float: " << CEscape(text);
    922   return result;
    923 }
    924 
// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32 code_point, string* output) {
  // The encoded bytes are assembled in the low-order end of |tmp|, with the
  // UTF-8 header/continuation marker bits OR'd in; the payload bits of the
  // code point are shifted into the free "x" positions of each byte.
  uint32 tmp = 0;
  int len = 0;
  if (code_point <= 0x7f) {
    // One byte: 0xxxxxxx (plain ASCII).
    tmp = code_point;
    len = 1;
  } else if (code_point <= 0x07ff) {
    // Two bytes: 110xxxxx 10xxxxxx.
    tmp = 0x0000c080 |
        ((code_point & 0x07c0) << 2) |
        (code_point & 0x003f);
    len = 2;
  } else if (code_point <= 0xffff) {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    tmp = 0x00e08080 |
        ((code_point & 0xf000) << 4) |
        ((code_point & 0x0fc0) << 2) |
        (code_point & 0x003f);
    len = 3;
  } else if (code_point <= 0x1fffff) {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    tmp = 0xf0808080 |
        ((code_point & 0x1c0000) << 6) |
        ((code_point & 0x03f000) << 4) |
        ((code_point & 0x000fc0) << 2) |
        (code_point & 0x003f);
    len = 4;
  } else {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    StringAppendF(output, "\\U%08x", code_point);
    return;
  }
  // Convert |tmp| to big-endian (network) byte order so that the encoded
  // bytes end up in the *last* |len| bytes of its in-memory representation,
  // then append exactly those bytes.
  tmp = ghtonl(tmp);
  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
}
    960 
    961 // Try to read <len> hex digits from ptr, and stuff the numeric result into
    962 // *result. Returns true if that many digits were successfully consumed.
    963 static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
    964   *result = 0;
    965   if (len == 0) return false;
    966   for (const char* end = ptr + len; ptr < end; ++ptr) {
    967     if (*ptr == '\0') return false;
    968     *result = (*result << 4) + DigitValue(*ptr);
    969   }
    970   return true;
    971 }
    972 
// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
// surrogate. These numbers are in a reserved range of Unicode code points, so
// if we encounter such a pair we know how to parse it and convert it into a
// single code point.
// NOTE: despite the names, the "max" constants are *exclusive* upper bounds;
// the Is*Surrogate() helpers below compare with '<'.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;
    982 
    983 static inline bool IsHeadSurrogate(uint32 code_point) {
    984   return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
    985 }
    986 
    987 static inline bool IsTrailSurrogate(uint32 code_point) {
    988   return (code_point >= kMinTrailSurrogate) &&
    989       (code_point < kMaxTrailSurrogate);
    990 }
    991 
    992 // Combine a head and trail surrogate into a single Unicode code point.
    993 static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
    994   GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
    995   GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
    996   return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
    997       (trail_surrogate - kMinTrailSurrogate));
    998 }
    999 
   1000 // Convert the escape sequence parameter to a number of expected hex digits.
   1001 static inline int UnicodeLength(char key) {
   1002   if (key == 'u') return 4;
   1003   if (key == 'U') return 8;
   1004   return 0;
   1005 }
   1006 
   1007 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
   1008 // to parse that sequence. On success, returns a pointer to the first char
   1009 // beyond that sequence, and fills in *code_point. On failure, returns ptr
   1010 // itself.
static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
  const char* p = ptr;
  // Fetch the code point.  *p is 'u' or 'U', which determines how many hex
  // digits follow (4 or 8).
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point))
    return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a valid
  // "trail surrogate," and together they form a UTF-16 pair which decodes into
  // a single Unicode point. Trail surrogates may only use \u, not \U.
  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32 trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        IsTrailSurrogate(trail_surrogate)) {
      *code_point = AssembleUTF16(*code_point, trail_surrogate);
      // Skip the consumed trail escape: '\\' + 'u' + 4 hex digits.
      p += 6;
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}
   1036 
// The text string must begin and end with single or double quote
// characters.  Decodes the quoted/escaped string token |text| and appends
// the result to *output.  Invalid escapes were already reported during
// tokenization, so this need not produce meaningful results for them.
void Tokenizer::ParseStringAppend(const string& text, string* output) {
  // Reminder: text[0] is always a quote character.  (If text is
  // empty, it's invalid, so we'll just return).
  const size_t text_size = text.size();
  if (text_size == 0) {
    GOOGLE_LOG(DFATAL)
      << " Tokenizer::ParseStringAppend() passed text that could not"
         " have been tokenized as a string: " << CEscape(text);
    return;
  }

  // Reserve room for new string. The branch is necessary because if
  // there is already space available the reserve() call might
  // downsize the output.
  const size_t new_len = text_size + output->size();
  if (new_len > output->capacity()) {
    output->reserve(new_len);
  }

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences.  Note that any invalid escape
  // sequences or other errors were already reported while tokenizing.
  // In this case we do not need to produce valid results.
  // The loop starts at text.c_str() + 1 to skip the opening quote.  Inside
  // the loop, ptr is advanced past any extra characters an escape consumes.
  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence.
      ++ptr;

      if (OctalDigit::InClass(*ptr)) {
        // An octal escape.  May one, two, or three digits.
        int code = DigitValue(*ptr);
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'x') {
        // A hex escape.  May zero, one, or two digits.  (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'u' || *ptr == 'U') {
        // A Unicode escape (\uXXXX or \UXXXXXXXX), possibly a UTF-16
        // surrogate pair; the decoded code point is appended as UTF-8.
        uint32 unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          output->push_back(*ptr);
        } else {
          AppendUTF8(unicode, output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
      // An ordinary character: copy it through unchanged.
      output->push_back(*ptr);
    }
  }
}
   1116 
   1117 template<typename CharacterClass>
   1118 static bool AllInClass(const string& s) {
   1119   for (int i = 0; i < s.size(); ++i) {
   1120     if (!CharacterClass::InClass(s[i]))
   1121       return false;
   1122   }
   1123   return true;
   1124 }
   1125 
   1126 bool Tokenizer::IsIdentifier(const string& text) {
   1127   // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
   1128   if (text.size() == 0)
   1129     return false;
   1130   if (!Letter::InClass(text.at(0)))
   1131     return false;
   1132   if (!AllInClass<Alphanumeric>(text.substr(1)))
   1133     return false;
   1134   return true;
   1135 }
   1136 
   1137 }  // namespace io
   1138 }  // namespace protobuf
   1139 }  // namespace google
   1140