Home | History | Annotate | Download | only in parsing
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Features shared by parsing and pre-parsing scanners.
      6 
#include "src/parsing/scanner.h"

#include <stdint.h>

#include <cmath>
#include <limits>

#include "src/ast/ast-value-factory.h"
#include "src/char-predicates-inl.h"
#include "src/conversions-inl.h"
#include "src/list-inl.h"
#include "src/parsing/duplicate-finder.h"  // For Scanner::FindSymbol
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 // Scoped helper for saving & restoring scanner error state.
     23 // This is used for tagged template literals, in which normally forbidden
     24 // escape sequences are allowed.
     25 class ErrorState {
     26  public:
     27   ErrorState(MessageTemplate::Template* message_stack,
     28              Scanner::Location* location_stack)
     29       : message_stack_(message_stack),
     30         old_message_(*message_stack),
     31         location_stack_(location_stack),
     32         old_location_(*location_stack) {
     33     *message_stack_ = MessageTemplate::kNone;
     34     *location_stack_ = Scanner::Location::invalid();
     35   }
     36 
     37   ~ErrorState() {
     38     *message_stack_ = old_message_;
     39     *location_stack_ = old_location_;
     40   }
     41 
     42   void MoveErrorTo(MessageTemplate::Template* message_dest,
     43                    Scanner::Location* location_dest) {
     44     if (*message_stack_ == MessageTemplate::kNone) {
     45       return;
     46     }
     47     if (*message_dest == MessageTemplate::kNone) {
     48       *message_dest = *message_stack_;
     49       *location_dest = *location_stack_;
     50     }
     51     *message_stack_ = MessageTemplate::kNone;
     52     *location_stack_ = Scanner::Location::invalid();
     53   }
     54 
     55  private:
     56   MessageTemplate::Template* const message_stack_;
     57   MessageTemplate::Template const old_message_;
     58   Scanner::Location* const location_stack_;
     59   Scanner::Location const old_location_;
     60 };
     61 
     62 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
     63   if (is_one_byte()) {
     64     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
     65   }
     66   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
     67 }
     68 
     69 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
     70   int capacity = Max(min_capacity, backing_store_.length());
     71   int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
     72   return new_capacity;
     73 }
     74 
     75 void Scanner::LiteralBuffer::ExpandBuffer() {
     76   Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
     77   MemCopy(new_store.start(), backing_store_.start(), position_);
     78   backing_store_.Dispose();
     79   backing_store_ = new_store;
     80 }
     81 
     82 void Scanner::LiteralBuffer::ConvertToTwoByte() {
     83   DCHECK(is_one_byte_);
     84   Vector<byte> new_store;
     85   int new_content_size = position_ * kUC16Size;
     86   if (new_content_size >= backing_store_.length()) {
     87     // Ensure room for all currently read code units as UC16 as well
     88     // as the code unit about to be stored.
     89     new_store = Vector<byte>::New(NewCapacity(new_content_size));
     90   } else {
     91     new_store = backing_store_;
     92   }
     93   uint8_t* src = backing_store_.start();
     94   uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
     95   for (int i = position_ - 1; i >= 0; i--) {
     96     dst[i] = src[i];
     97   }
     98   if (new_store.start() != backing_store_.start()) {
     99     backing_store_.Dispose();
    100     backing_store_ = new_store;
    101   }
    102   position_ = new_content_size;
    103   is_one_byte_ = false;
    104 }
    105 
    106 void Scanner::LiteralBuffer::AddCharSlow(uc32 code_unit) {
    107   if (position_ >= backing_store_.length()) ExpandBuffer();
    108   if (is_one_byte_) {
    109     if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
    110       backing_store_[position_] = static_cast<byte>(code_unit);
    111       position_ += kOneByteSize;
    112       return;
    113     }
    114     ConvertToTwoByte();
    115   }
    116   if (code_unit <=
    117       static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    118     *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
    119     position_ += kUC16Size;
    120   } else {
    121     *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
    122         unibrow::Utf16::LeadSurrogate(code_unit);
    123     position_ += kUC16Size;
    124     if (position_ >= backing_store_.length()) ExpandBuffer();
    125     *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
    126         unibrow::Utf16::TrailSurrogate(code_unit);
    127     position_ += kUC16Size;
    128   }
    129 }
    130 
    131 // ----------------------------------------------------------------------------
    132 // Scanner::BookmarkScope
    133 
    134 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
    135     std::numeric_limits<size_t>::max() - 2;
    136 const size_t Scanner::BookmarkScope::kNoBookmark =
    137     std::numeric_limits<size_t>::max() - 1;
    138 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
    139     std::numeric_limits<size_t>::max();
    140 
    141 void Scanner::BookmarkScope::Set() {
    142   DCHECK_EQ(bookmark_, kNoBookmark);
    143   DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);
    144 
    145   // The first token is a bit special, since current_ will still be
    146   // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it
    147   // when
    148   // applying the bookmark.
    149   DCHECK_IMPLIES(
    150       scanner_->current_.token == Token::UNINITIALIZED,
    151       scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);
    152   bookmark_ = (scanner_->current_.token == Token::UNINITIALIZED)
    153                   ? kBookmarkAtFirstPos
    154                   : scanner_->location().beg_pos;
    155 }
    156 
    157 void Scanner::BookmarkScope::Apply() {
    158   DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
    159   if (bookmark_ == kBookmarkAtFirstPos) {
    160     scanner_->SeekNext(0);
    161   } else {
    162     scanner_->SeekNext(bookmark_);
    163     scanner_->Next();
    164     DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
    165   }
    166   bookmark_ = kBookmarkWasApplied;
    167 }
    168 
    169 bool Scanner::BookmarkScope::HasBeenSet() {
    170   return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
    171 }
    172 
    173 bool Scanner::BookmarkScope::HasBeenApplied() {
    174   return bookmark_ == kBookmarkWasApplied;
    175 }
    176 
    177 // ----------------------------------------------------------------------------
    178 // Scanner
    179 
// Constructs a scanner that is not yet attached to any input; the character
// stream is supplied later through Initialize(). The octal position/message
// pair starts out as "invalid"/kNone and no HTML comment has been seen.
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      octal_message_(MessageTemplate::kNone),
      found_html_comment_(false) {}
    185 
// Attaches the scanner to |source| and scans the first token, so that a
// lookahead token is available as soon as this returns.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // Setting the flag first makes the start of input count as the start of a
  // line, which is the condition SkipWhiteSpace() checks before treating
  // '-->' as a comment.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
    197 
    198 template <bool capture_raw, bool unicode>
    199 uc32 Scanner::ScanHexNumber(int expected_length) {
    200   DCHECK(expected_length <= 4);  // prevent overflow
    201 
    202   int begin = source_pos() - 2;
    203   uc32 x = 0;
    204   for (int i = 0; i < expected_length; i++) {
    205     int d = HexValue(c0_);
    206     if (d < 0) {
    207       ReportScannerError(Location(begin, begin + expected_length + 2),
    208                          unicode
    209                              ? MessageTemplate::kInvalidUnicodeEscapeSequence
    210                              : MessageTemplate::kInvalidHexEscapeSequence);
    211       return -1;
    212     }
    213     x = x * 16 + d;
    214     Advance<capture_raw>();
    215   }
    216 
    217   return x;
    218 }
    219 
    220 template <bool capture_raw>
    221 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
    222   uc32 x = 0;
    223   int d = HexValue(c0_);
    224   if (d < 0) return -1;
    225 
    226   while (d >= 0) {
    227     x = x * 16 + d;
    228     if (x > max_value) {
    229       ReportScannerError(Location(beg_pos, source_pos() + 1),
    230                          MessageTemplate::kUndefinedUnicodeCodePoint);
    231       return -1;
    232     }
    233     Advance<capture_raw>();
    234     d = HexValue(c0_);
    235   }
    236 
    237   return x;
    238 }
    239 
    240 
    241 // Ensure that tokens can be stored in a byte.
    242 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
    243 
    244 // Table of one-character tokens, by character (0x00..0x7f only).
    245 static const byte one_char_tokens[] = {
    246   Token::ILLEGAL,
    247   Token::ILLEGAL,
    248   Token::ILLEGAL,
    249   Token::ILLEGAL,
    250   Token::ILLEGAL,
    251   Token::ILLEGAL,
    252   Token::ILLEGAL,
    253   Token::ILLEGAL,
    254   Token::ILLEGAL,
    255   Token::ILLEGAL,
    256   Token::ILLEGAL,
    257   Token::ILLEGAL,
    258   Token::ILLEGAL,
    259   Token::ILLEGAL,
    260   Token::ILLEGAL,
    261   Token::ILLEGAL,
    262   Token::ILLEGAL,
    263   Token::ILLEGAL,
    264   Token::ILLEGAL,
    265   Token::ILLEGAL,
    266   Token::ILLEGAL,
    267   Token::ILLEGAL,
    268   Token::ILLEGAL,
    269   Token::ILLEGAL,
    270   Token::ILLEGAL,
    271   Token::ILLEGAL,
    272   Token::ILLEGAL,
    273   Token::ILLEGAL,
    274   Token::ILLEGAL,
    275   Token::ILLEGAL,
    276   Token::ILLEGAL,
    277   Token::ILLEGAL,
    278   Token::ILLEGAL,
    279   Token::ILLEGAL,
    280   Token::ILLEGAL,
    281   Token::ILLEGAL,
    282   Token::ILLEGAL,
    283   Token::ILLEGAL,
    284   Token::ILLEGAL,
    285   Token::ILLEGAL,
    286   Token::LPAREN,       // 0x28
    287   Token::RPAREN,       // 0x29
    288   Token::ILLEGAL,
    289   Token::ILLEGAL,
    290   Token::COMMA,        // 0x2c
    291   Token::ILLEGAL,
    292   Token::ILLEGAL,
    293   Token::ILLEGAL,
    294   Token::ILLEGAL,
    295   Token::ILLEGAL,
    296   Token::ILLEGAL,
    297   Token::ILLEGAL,
    298   Token::ILLEGAL,
    299   Token::ILLEGAL,
    300   Token::ILLEGAL,
    301   Token::ILLEGAL,
    302   Token::ILLEGAL,
    303   Token::ILLEGAL,
    304   Token::COLON,        // 0x3a
    305   Token::SEMICOLON,    // 0x3b
    306   Token::ILLEGAL,
    307   Token::ILLEGAL,
    308   Token::ILLEGAL,
    309   Token::CONDITIONAL,  // 0x3f
    310   Token::ILLEGAL,
    311   Token::ILLEGAL,
    312   Token::ILLEGAL,
    313   Token::ILLEGAL,
    314   Token::ILLEGAL,
    315   Token::ILLEGAL,
    316   Token::ILLEGAL,
    317   Token::ILLEGAL,
    318   Token::ILLEGAL,
    319   Token::ILLEGAL,
    320   Token::ILLEGAL,
    321   Token::ILLEGAL,
    322   Token::ILLEGAL,
    323   Token::ILLEGAL,
    324   Token::ILLEGAL,
    325   Token::ILLEGAL,
    326   Token::ILLEGAL,
    327   Token::ILLEGAL,
    328   Token::ILLEGAL,
    329   Token::ILLEGAL,
    330   Token::ILLEGAL,
    331   Token::ILLEGAL,
    332   Token::ILLEGAL,
    333   Token::ILLEGAL,
    334   Token::ILLEGAL,
    335   Token::ILLEGAL,
    336   Token::ILLEGAL,
    337   Token::LBRACK,     // 0x5b
    338   Token::ILLEGAL,
    339   Token::RBRACK,     // 0x5d
    340   Token::ILLEGAL,
    341   Token::ILLEGAL,
    342   Token::ILLEGAL,
    343   Token::ILLEGAL,
    344   Token::ILLEGAL,
    345   Token::ILLEGAL,
    346   Token::ILLEGAL,
    347   Token::ILLEGAL,
    348   Token::ILLEGAL,
    349   Token::ILLEGAL,
    350   Token::ILLEGAL,
    351   Token::ILLEGAL,
    352   Token::ILLEGAL,
    353   Token::ILLEGAL,
    354   Token::ILLEGAL,
    355   Token::ILLEGAL,
    356   Token::ILLEGAL,
    357   Token::ILLEGAL,
    358   Token::ILLEGAL,
    359   Token::ILLEGAL,
    360   Token::ILLEGAL,
    361   Token::ILLEGAL,
    362   Token::ILLEGAL,
    363   Token::ILLEGAL,
    364   Token::ILLEGAL,
    365   Token::ILLEGAL,
    366   Token::ILLEGAL,
    367   Token::ILLEGAL,
    368   Token::ILLEGAL,
    369   Token::LBRACE,       // 0x7b
    370   Token::ILLEGAL,
    371   Token::RBRACE,       // 0x7d
    372   Token::BIT_NOT,      // 0x7e
    373   Token::ILLEGAL
    374 };
    375 
    376 
    377 Token::Value Scanner::Next() {
    378   if (next_.token == Token::EOS) {
    379     next_.location.beg_pos = current_.location.beg_pos;
    380     next_.location.end_pos = current_.location.end_pos;
    381   }
    382   current_ = next_;
    383   if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
    384     next_ = next_next_;
    385     next_next_.token = Token::UNINITIALIZED;
    386     has_line_terminator_before_next_ = has_line_terminator_after_next_;
    387     return current_.token;
    388   }
    389   has_line_terminator_before_next_ = false;
    390   has_multiline_comment_before_next_ = false;
    391   if (static_cast<unsigned>(c0_) <= 0x7f) {
    392     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    393     if (token != Token::ILLEGAL) {
    394       int pos = source_pos();
    395       next_.token = token;
    396       next_.location.beg_pos = pos;
    397       next_.location.end_pos = pos + 1;
    398       next_.literal_chars = nullptr;
    399       next_.raw_literal_chars = nullptr;
    400       Advance();
    401       return current_.token;
    402     }
    403   }
    404   Scan();
    405   return current_.token;
    406 }
    407 
    408 
    409 Token::Value Scanner::PeekAhead() {
    410   DCHECK(next_.token != Token::DIV);
    411   DCHECK(next_.token != Token::ASSIGN_DIV);
    412 
    413   if (next_next_.token != Token::UNINITIALIZED) {
    414     return next_next_.token;
    415   }
    416   TokenDesc prev = current_;
    417   bool has_line_terminator_before_next =
    418       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
    419   Next();
    420   has_line_terminator_after_next_ =
    421       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
    422   has_line_terminator_before_next_ = has_line_terminator_before_next;
    423   Token::Value ret = next_.token;
    424   next_next_ = next_;
    425   next_ = current_;
    426   current_ = prev;
    427   return ret;
    428 }
    429 
    430 
    431 // TODO(yangguo): check whether this is actually necessary.
    432 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
    433   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    434   // Unicode character; this implies that in a Unicode context the
    435   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    436   // character expressed in little-endian byte order (since it could
    437   // not be a U+FFFE character expressed in big-endian byte
    438   // order). Nevertheless, we check for it to be compatible with
    439   // Spidermonkey.
    440   return c == 0xFFFE;
    441 }
    442 
    443 bool Scanner::SkipWhiteSpace() {
    444   int start_position = source_pos();
    445 
    446   while (true) {
    447     while (true) {
    448       // Don't skip behind the end of input.
    449       if (c0_ == kEndOfInput) break;
    450 
    451       // Advance as long as character is a WhiteSpace or LineTerminator.
    452       // Remember if the latter is the case.
    453       if (unicode_cache_->IsLineTerminator(c0_)) {
    454         has_line_terminator_before_next_ = true;
    455       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
    456                  !IsLittleEndianByteOrderMark(c0_)) {
    457         break;
    458       }
    459       Advance();
    460     }
    461 
    462     // If there is an HTML comment end '-->' at the beginning of a
    463     // line (with only whitespace in front of it), we treat the rest
    464     // of the line as a comment. This is in line with the way
    465     // SpiderMonkey handles it.
    466     if (c0_ != '-' || !has_line_terminator_before_next_) break;
    467 
    468     Advance();
    469     if (c0_ != '-') {
    470       PushBack('-');  // undo Advance()
    471       break;
    472     }
    473 
    474     Advance();
    475     if (c0_ != '>') {
    476       PushBack2('-', '-');  // undo 2x Advance();
    477       break;
    478     }
    479 
    480     // Treat the rest of the line as a comment.
    481     SkipSingleLineComment();
    482   }
    483 
    484   // Return whether or not we skipped any characters.
    485   return source_pos() != start_position;
    486 }
    487 
    488 Token::Value Scanner::SkipSingleLineComment() {
    489   Advance();
    490 
    491   // The line terminator at the end of the line is not considered
    492   // to be part of the single-line comment; it is recognized
    493   // separately by the lexical grammar and becomes part of the
    494   // stream of input elements for the syntactic grammar (see
    495   // ECMA-262, section 7.4).
    496   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
    497     Advance();
    498   }
    499 
    500   return Token::WHITESPACE;
    501 }
    502 
    503 
    504 Token::Value Scanner::SkipSourceURLComment() {
    505   TryToParseSourceURLComment();
    506   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
    507     Advance();
    508   }
    509 
    510   return Token::WHITESPACE;
    511 }
    512 
    513 
    514 void Scanner::TryToParseSourceURLComment() {
    515   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
    516   // function will just return if it cannot parse a magic comment.
    517   if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
    518   Advance();
    519   LiteralBuffer name;
    520   while (c0_ != kEndOfInput &&
    521          !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
    522     name.AddChar(c0_);
    523     Advance();
    524   }
    525   if (!name.is_one_byte()) return;
    526   Vector<const uint8_t> name_literal = name.one_byte_literal();
    527   LiteralBuffer* value;
    528   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
    529     value = &source_url_;
    530   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
    531     value = &source_mapping_url_;
    532   } else {
    533     return;
    534   }
    535   if (c0_ != '=')
    536     return;
    537   Advance();
    538   value->Reset();
    539   while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
    540     Advance();
    541   }
    542   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
    543     // Disallowed characters.
    544     if (c0_ == '"' || c0_ == '\'') {
    545       value->Reset();
    546       return;
    547     }
    548     if (unicode_cache_->IsWhiteSpace(c0_)) {
    549       break;
    550     }
    551     value->AddChar(c0_);
    552     Advance();
    553   }
    554   // Allow whitespace at the end.
    555   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
    556     if (!unicode_cache_->IsWhiteSpace(c0_)) {
    557       value->Reset();
    558       break;
    559     }
    560     Advance();
    561   }
    562 }
    563 
    564 
    565 Token::Value Scanner::SkipMultiLineComment() {
    566   DCHECK(c0_ == '*');
    567   Advance();
    568 
    569   while (c0_ != kEndOfInput) {
    570     uc32 ch = c0_;
    571     Advance();
    572     if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(ch)) {
    573       // Following ECMA-262, section 7.4, a comment containing
    574       // a newline will make the comment count as a line-terminator.
    575       has_multiline_comment_before_next_ = true;
    576     }
    577     // If we have reached the end of the multi-line comment, we
    578     // consume the '/' and insert a whitespace. This way all
    579     // multi-line comments are treated as whitespace.
    580     if (ch == '*' && c0_ == '/') {
    581       c0_ = ' ';
    582       return Token::WHITESPACE;
    583     }
    584   }
    585 
    586   // Unterminated multi-line comment.
    587   return Token::ILLEGAL;
    588 }
    589 
    590 Token::Value Scanner::ScanHtmlComment() {
    591   // Check for <!-- comments.
    592   DCHECK(c0_ == '!');
    593   Advance();
    594   if (c0_ != '-') {
    595     PushBack('!');  // undo Advance()
    596     return Token::LT;
    597   }
    598 
    599   Advance();
    600   if (c0_ != '-') {
    601     PushBack2('-', '!');  // undo 2x Advance()
    602     return Token::LT;
    603   }
    604 
    605   found_html_comment_ = true;
    606   return SkipSingleLineComment();
    607 }
    608 
    609 void Scanner::Scan() {
    610   next_.literal_chars = NULL;
    611   next_.raw_literal_chars = NULL;
    612   Token::Value token;
    613   do {
    614     // Remember the position of the next token
    615     next_.location.beg_pos = source_pos();
    616 
    617     switch (c0_) {
    618       case ' ':
    619       case '\t':
    620         Advance();
    621         token = Token::WHITESPACE;
    622         break;
    623 
    624       case '\n':
    625         Advance();
    626         has_line_terminator_before_next_ = true;
    627         token = Token::WHITESPACE;
    628         break;
    629 
    630       case '"': case '\'':
    631         token = ScanString();
    632         break;
    633 
    634       case '<':
    635         // < <= << <<= <!--
    636         Advance();
    637         if (c0_ == '=') {
    638           token = Select(Token::LTE);
    639         } else if (c0_ == '<') {
    640           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    641         } else if (c0_ == '!') {
    642           token = ScanHtmlComment();
    643         } else {
    644           token = Token::LT;
    645         }
    646         break;
    647 
    648       case '>':
    649         // > >= >> >>= >>> >>>=
    650         Advance();
    651         if (c0_ == '=') {
    652           token = Select(Token::GTE);
    653         } else if (c0_ == '>') {
    654           // >> >>= >>> >>>=
    655           Advance();
    656           if (c0_ == '=') {
    657             token = Select(Token::ASSIGN_SAR);
    658           } else if (c0_ == '>') {
    659             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    660           } else {
    661             token = Token::SAR;
    662           }
    663         } else {
    664           token = Token::GT;
    665         }
    666         break;
    667 
    668       case '=':
    669         // = == === =>
    670         Advance();
    671         if (c0_ == '=') {
    672           token = Select('=', Token::EQ_STRICT, Token::EQ);
    673         } else if (c0_ == '>') {
    674           token = Select(Token::ARROW);
    675         } else {
    676           token = Token::ASSIGN;
    677         }
    678         break;
    679 
    680       case '!':
    681         // ! != !==
    682         Advance();
    683         if (c0_ == '=') {
    684           token = Select('=', Token::NE_STRICT, Token::NE);
    685         } else {
    686           token = Token::NOT;
    687         }
    688         break;
    689 
    690       case '+':
    691         // + ++ +=
    692         Advance();
    693         if (c0_ == '+') {
    694           token = Select(Token::INC);
    695         } else if (c0_ == '=') {
    696           token = Select(Token::ASSIGN_ADD);
    697         } else {
    698           token = Token::ADD;
    699         }
    700         break;
    701 
    702       case '-':
    703         // - -- --> -=
    704         Advance();
    705         if (c0_ == '-') {
    706           Advance();
    707           if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
    708             // For compatibility with SpiderMonkey, we skip lines that
    709             // start with an HTML comment end '-->'.
    710             token = SkipSingleLineComment();
    711           } else {
    712             token = Token::DEC;
    713           }
    714         } else if (c0_ == '=') {
    715           token = Select(Token::ASSIGN_SUB);
    716         } else {
    717           token = Token::SUB;
    718         }
    719         break;
    720 
    721       case '*':
    722         // * *=
    723         Advance();
    724         if (c0_ == '*') {
    725           token = Select('=', Token::ASSIGN_EXP, Token::EXP);
    726         } else if (c0_ == '=') {
    727           token = Select(Token::ASSIGN_MUL);
    728         } else {
    729           token = Token::MUL;
    730         }
    731         break;
    732 
    733       case '%':
    734         // % %=
    735         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    736         break;
    737 
    738       case '/':
    739         // /  // /* /=
    740         Advance();
    741         if (c0_ == '/') {
    742           Advance();
    743           if (c0_ == '#' || c0_ == '@') {
    744             Advance();
    745             token = SkipSourceURLComment();
    746           } else {
    747             PushBack(c0_);
    748             token = SkipSingleLineComment();
    749           }
    750         } else if (c0_ == '*') {
    751           token = SkipMultiLineComment();
    752         } else if (c0_ == '=') {
    753           token = Select(Token::ASSIGN_DIV);
    754         } else {
    755           token = Token::DIV;
    756         }
    757         break;
    758 
    759       case '&':
    760         // & && &=
    761         Advance();
    762         if (c0_ == '&') {
    763           token = Select(Token::AND);
    764         } else if (c0_ == '=') {
    765           token = Select(Token::ASSIGN_BIT_AND);
    766         } else {
    767           token = Token::BIT_AND;
    768         }
    769         break;
    770 
    771       case '|':
    772         // | || |=
    773         Advance();
    774         if (c0_ == '|') {
    775           token = Select(Token::OR);
    776         } else if (c0_ == '=') {
    777           token = Select(Token::ASSIGN_BIT_OR);
    778         } else {
    779           token = Token::BIT_OR;
    780         }
    781         break;
    782 
    783       case '^':
    784         // ^ ^=
    785         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    786         break;
    787 
    788       case '.':
    789         // . Number
    790         Advance();
    791         if (IsDecimalDigit(c0_)) {
    792           token = ScanNumber(true);
    793         } else {
    794           token = Token::PERIOD;
    795           if (c0_ == '.') {
    796             Advance();
    797             if (c0_ == '.') {
    798               Advance();
    799               token = Token::ELLIPSIS;
    800             } else {
    801               PushBack('.');
    802             }
    803           }
    804         }
    805         break;
    806 
    807       case ':':
    808         token = Select(Token::COLON);
    809         break;
    810 
    811       case ';':
    812         token = Select(Token::SEMICOLON);
    813         break;
    814 
    815       case ',':
    816         token = Select(Token::COMMA);
    817         break;
    818 
    819       case '(':
    820         token = Select(Token::LPAREN);
    821         break;
    822 
    823       case ')':
    824         token = Select(Token::RPAREN);
    825         break;
    826 
    827       case '[':
    828         token = Select(Token::LBRACK);
    829         break;
    830 
    831       case ']':
    832         token = Select(Token::RBRACK);
    833         break;
    834 
    835       case '{':
    836         token = Select(Token::LBRACE);
    837         break;
    838 
    839       case '}':
    840         token = Select(Token::RBRACE);
    841         break;
    842 
    843       case '?':
    844         token = Select(Token::CONDITIONAL);
    845         break;
    846 
    847       case '~':
    848         token = Select(Token::BIT_NOT);
    849         break;
    850 
    851       case '`':
    852         token = ScanTemplateStart();
    853         break;
    854 
    855       default:
    856         if (c0_ == kEndOfInput) {
    857           token = Token::EOS;
    858         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
    859           token = ScanIdentifierOrKeyword();
    860         } else if (IsDecimalDigit(c0_)) {
    861           token = ScanNumber(false);
    862         } else if (SkipWhiteSpace()) {
    863           token = Token::WHITESPACE;
    864         } else {
    865           token = Select(Token::ILLEGAL);
    866         }
    867         break;
    868     }
    869 
    870     // Continue scanning for tokens as long as we're just skipping
    871     // whitespace.
    872   } while (token == Token::WHITESPACE);
    873 
    874   next_.location.end_pos = source_pos();
    875   next_.token = token;
    876 
    877 #ifdef DEBUG
    878   SanityCheckTokenDesc(current_);
    879   SanityCheckTokenDesc(next_);
    880   SanityCheckTokenDesc(next_next_);
    881 #endif
    882 }
    883 
    884 #ifdef DEBUG
    885 void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
    886   // Most tokens should not have literal_chars or even raw_literal chars.
    887   // The rules are:
    888   // - UNINITIALIZED: we don't care.
    889   // - TEMPLATE_*: need both literal + raw literal chars.
    890   // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
    891   // - all others: should have neither.
    892 
    893   switch (token.token) {
    894     case Token::UNINITIALIZED:
    895       // token.literal_chars & other members might be garbage. That's ok.
    896       break;
    897     case Token::TEMPLATE_SPAN:
    898     case Token::TEMPLATE_TAIL:
    899       DCHECK_NOT_NULL(token.raw_literal_chars);
    900       DCHECK_NOT_NULL(token.literal_chars);
    901       break;
    902     case Token::ESCAPED_KEYWORD:
    903     case Token::ESCAPED_STRICT_RESERVED_WORD:
    904     case Token::FUTURE_STRICT_RESERVED_WORD:
    905     case Token::IDENTIFIER:
    906     case Token::NUMBER:
    907     case Token::REGEXP_LITERAL:
    908     case Token::SMI:
    909     case Token::STRING:
    910       DCHECK_NOT_NULL(token.literal_chars);
    911       DCHECK_NULL(token.raw_literal_chars);
    912       break;
    913     default:
    914       DCHECK_NULL(token.literal_chars);
    915       DCHECK_NULL(token.raw_literal_chars);
    916       break;
    917   }
    918 }
    919 #endif  // DEBUG
    920 
    921 void Scanner::SeekForward(int pos) {
    922   // After this call, we will have the token at the given position as
    923   // the "next" token. The "current" token will be invalid.
    924   if (pos == next_.location.beg_pos) return;
    925   int current_pos = source_pos();
    926   DCHECK_EQ(next_.location.end_pos, current_pos);
    927   // Positions inside the lookahead token aren't supported.
    928   DCHECK(pos >= current_pos);
    929   if (pos != current_pos) {
    930     source_->Seek(pos);
    931     Advance();
    932     // This function is only called to seek to the location
    933     // of the end of a function (at the "}" token). It doesn't matter
    934     // whether there was a line terminator in the part we skip.
    935     has_line_terminator_before_next_ = false;
    936     has_multiline_comment_before_next_ = false;
    937   }
    938   Scan();
    939 }
    940 
    941 
    942 template <bool capture_raw, bool in_template_literal>
    943 bool Scanner::ScanEscape() {
    944   uc32 c = c0_;
    945   Advance<capture_raw>();
    946 
    947   // Skip escaped newlines.
    948   if (!in_template_literal && c0_ != kEndOfInput &&
    949       unicode_cache_->IsLineTerminator(c)) {
    950     // Allow CR+LF newlines in multiline string literals.
    951     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
    952     // Allow LF+CR newlines in multiline string literals.
    953     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
    954     return true;
    955   }
    956 
    957   switch (c) {
    958     case '\'':  // fall through
    959     case '"' :  // fall through
    960     case '\\': break;
    961     case 'b' : c = '\b'; break;
    962     case 'f' : c = '\f'; break;
    963     case 'n' : c = '\n'; break;
    964     case 'r' : c = '\r'; break;
    965     case 't' : c = '\t'; break;
    966     case 'u' : {
    967       c = ScanUnicodeEscape<capture_raw>();
    968       if (c < 0) return false;
    969       break;
    970     }
    971     case 'v':
    972       c = '\v';
    973       break;
    974     case 'x': {
    975       c = ScanHexNumber<capture_raw>(2);
    976       if (c < 0) return false;
    977       break;
    978     }
    979     case '0':  // Fall through.
    980     case '1':  // fall through
    981     case '2':  // fall through
    982     case '3':  // fall through
    983     case '4':  // fall through
    984     case '5':  // fall through
    985     case '6':  // fall through
    986     case '7':
    987       c = ScanOctalEscape<capture_raw>(c, 2);
    988       break;
    989   }
    990 
    991   // Other escaped characters are interpreted as their non-escaped version.
    992   AddLiteralChar(c);
    993   return true;
    994 }
    995 
    996 
    997 template <bool capture_raw>
    998 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    999   uc32 x = c - '0';
   1000   int i = 0;
   1001   for (; i < length; i++) {
   1002     int d = c0_ - '0';
   1003     if (d < 0 || d > 7) break;
   1004     int nx = x * 8 + d;
   1005     if (nx >= 256) break;
   1006     x = nx;
   1007     Advance<capture_raw>();
   1008   }
   1009   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
   1010   // Remember the position of octal escape sequences so that an error
   1011   // can be reported later (in strict mode).
   1012   // We don't report the error immediately, because the octal escape can
   1013   // occur before the "use strict" directive.
   1014   if (c != '0' || i > 0) {
   1015     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
   1016     octal_message_ = MessageTemplate::kStrictOctalEscape;
   1017   }
   1018   return x;
   1019 }
   1020 
   1021 
   1022 Token::Value Scanner::ScanString() {
   1023   uc32 quote = c0_;
   1024   Advance<false, false>();  // consume quote
   1025 
   1026   LiteralScope literal(this);
   1027   while (true) {
   1028     if (c0_ > kMaxAscii) {
   1029       HandleLeadSurrogate();
   1030       break;
   1031     }
   1032     if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
   1033     if (c0_ == quote) {
   1034       literal.Complete();
   1035       Advance<false, false>();
   1036       return Token::STRING;
   1037     }
   1038     char c = static_cast<char>(c0_);
   1039     if (c == '\\') break;
   1040     Advance<false, false>();
   1041     AddLiteralChar(c);
   1042   }
   1043 
   1044   while (c0_ != quote && c0_ != kEndOfInput &&
   1045          !unicode_cache_->IsLineTerminator(c0_)) {
   1046     uc32 c = c0_;
   1047     Advance();
   1048     if (c == '\\') {
   1049       if (c0_ == kEndOfInput || !ScanEscape<false, false>()) {
   1050         return Token::ILLEGAL;
   1051       }
   1052     } else {
   1053       AddLiteralChar(c);
   1054     }
   1055   }
   1056   if (c0_ != quote) return Token::ILLEGAL;
   1057   literal.Complete();
   1058 
   1059   Advance();  // consume quote
   1060   return Token::STRING;
   1061 }
   1062 
   1063 
   1064 Token::Value Scanner::ScanTemplateSpan() {
          // Scans one chunk of a template literal, starting right after the opening
          // '`' or after the '}' that closed a substitution.
          //
          // Returns TEMPLATE_TAIL when the chunk ends at a closing '`'. Returns
          // TEMPLATE_SPAN when it ends at "${", and also when end of input is hit
          // first (|result| is simply left at its initial value in that case).
          // Characters are collected twice: cooked into the literal buffer and
          // as written into the raw literal buffer. The resulting token and its
          // end position are stored in next_ before returning.
          //
   1065   // When scanning a TemplateSpan, we are looking for the following construct:
   1066   // TEMPLATE_SPAN ::
   1067   //     ` LiteralChars* ${
   1068   //   | } LiteralChars* ${
   1069   //
   1070   // TEMPLATE_TAIL ::
   1071   //     ` LiteralChars* `
   1072   //   | } LiteralChars* `
   1073   //
   1074   // A TEMPLATE_SPAN should always be followed by an Expression, while a
   1075   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
   1076   // followed by an Expression.
   1077 
   1078   // These scoped helpers save and restore the original error state, so that we
   1079   // can specially treat invalid escape sequences in templates (which are
   1080   // handled by the parser).
   1081   ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
   1082   ErrorState octal_error_state(&octal_message_, &octal_pos_);
   1083 
   1084   Token::Value result = Token::TEMPLATE_SPAN;
   1085   LiteralScope literal(this);
   1086   StartRawLiteral();
   1087   const bool capture_raw = true;
   1088   const bool in_template_literal = true;
   1089   while (true) {
   1090     uc32 c = c0_;
   1091     Advance<capture_raw>();
   1092     if (c == '`') {
   1093       result = Token::TEMPLATE_TAIL;
              // The closing '`' was just captured by Advance<capture_raw>(); it is
              // not part of the raw text.
   1094       ReduceRawLiteralLength(1);
   1095       break;
   1096     } else if (c == '$' && c0_ == '{') {
   1097       Advance<capture_raw>();  // Consume '{'
              // Likewise drop the captured "${" from the raw text.
   1098       ReduceRawLiteralLength(2);
   1099       break;
   1100     } else if (c == '\\') {
   1101       if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) {
   1102         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
   1103         // code unit sequence.
                // Nothing is added to the cooked literal here; only the raw text
                // is updated, with <CR> and <CR><LF> normalized to a single \n.
   1104         uc32 lastChar = c0_;
   1105         Advance<capture_raw>();
   1106         if (lastChar == '\r') {
   1107           ReduceRawLiteralLength(1);  // Remove \r
   1108           if (c0_ == '\n') {
   1109             Advance<capture_raw>();  // Adds \n
   1110           } else {
   1111             AddRawLiteralChar('\n');
   1112           }
   1113         }
   1114       } else {
   1115         bool success = ScanEscape<capture_raw, in_template_literal>();
   1116         USE(success);
   1117         DCHECK_EQ(!success, has_error());
   1118         // For templates, invalid escape sequence checking is handled in the
   1119         // parser.
                // Any error the escape produced (general scanner error or octal
                // escape) is transferred to invalid_template_escape_*; MoveErrorTo
                // keeps the first error already stored there.
   1120         scanner_error_state.MoveErrorTo(&invalid_template_escape_message_,
   1121                                         &invalid_template_escape_location_);
   1122         octal_error_state.MoveErrorTo(&invalid_template_escape_message_,
   1123                                       &invalid_template_escape_location_);
   1124       }
   1125     } else if (c < 0) {
   1126       // Unterminated template literal
              // |result| is still TEMPLATE_SPAN at this point.
   1127       PushBack(c);
   1128       break;
   1129     } else {
   1130       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
   1131       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
   1132       // consisting of the CV 0x000A.
   1133       if (c == '\r') {
   1134         ReduceRawLiteralLength(1);  // Remove \r
   1135         if (c0_ == '\n') {
   1136           Advance<capture_raw>();  // Adds \n
   1137         } else {
   1138           AddRawLiteralChar('\n');
   1139         }
                // The cooked literal receives the normalized newline as well.
   1140         c = '\n';
   1141       }
   1142       AddLiteralChar(c);
   1143     }
   1144   }
   1145   literal.Complete();
   1146   next_.location.end_pos = source_pos();
   1147   next_.token = result;
   1148 
   1149   return result;
   1150 }
   1151 
   1152 
   1153 Token::Value Scanner::ScanTemplateStart() {
   1154   DCHECK(next_next_.token == Token::UNINITIALIZED);
   1155   DCHECK(c0_ == '`');
   1156   next_.location.beg_pos = source_pos();
   1157   Advance();  // Consume `
   1158   return ScanTemplateSpan();
   1159 }
   1160 
   1161 
   1162 Token::Value Scanner::ScanTemplateContinuation() {
   1163   DCHECK_EQ(next_.token, Token::RBRACE);
   1164   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
   1165   return ScanTemplateSpan();
   1166 }
   1167 
   1168 
   1169 void Scanner::ScanDecimalDigits() {
   1170   while (IsDecimalDigit(c0_))
   1171     AddLiteralCharAdvance();
   1172 }
   1173 
   1174 
   1175 Token::Value Scanner::ScanNumber(bool seen_period) {
          // Scans a numeric literal whose first digit is at c0_.
          //
          // |seen_period| is true when the caller has already consumed a leading
          // '.'; a '.' is then put in front of the literal and only fraction digits
          // and an optional exponent are scanned.
          //
          // Returns:
          //   Token::SMI     - a plain decimal integer of at most 10 characters that
          //                    does not exceed Smi::kMaxValue; its value is stored
          //                    in next_.smi_value_.
          //   Token::NUMBER  - any other well-formed literal.
          //   Token::ILLEGAL - malformed literal (missing digits after a radix
          //                    prefix or exponent, exponent on a non-decimal
          //                    literal, or a digit / identifier start directly
          //                    after the literal).
          // Legacy octal literals and decimals with a leading zero additionally
          // record their location and message in octal_pos_ / octal_message_.
   1176   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
   1177 
          // Syntactic form of the literal; refined below while scanning.
          //   DECIMAL_WITH_LEADING_ZERO: starts with 0 and contains an 8 or 9.
          //   HEX / OCTAL / BINARY:      0x / 0o / 0b prefixed.
          //   IMPLICIT_OCTAL:            0 followed only by digits 0-7.
   1178   enum {
   1179     DECIMAL,
   1180     DECIMAL_WITH_LEADING_ZERO,
   1181     HEX,
   1182     OCTAL,
   1183     IMPLICIT_OCTAL,
   1184     BINARY
   1185   } kind = DECIMAL;
   1186 
   1187   LiteralScope literal(this);
          // Gates the SMI fast path below. It is false when a period was already
          // seen, and is cleared when an implicit-octal run turns out to contain an
          // 8 or 9, because those digits were consumed without being accumulated
          // into a value.
   1188   bool at_start = !seen_period;
   1189   int start_pos = source_pos();  // For reporting octal positions.
   1190   if (seen_period) {
   1191     // we have already seen a decimal point of the float
   1192     AddLiteralChar('.');
   1193     ScanDecimalDigits();  // we know we have at least one digit
   1194 
   1195   } else {
   1196     // if the first character is '0' we must check for octals and hex
   1197     if (c0_ == '0') {
   1198       AddLiteralCharAdvance();
   1199 
   1200       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
   1201       // an octal number.
   1202       if (c0_ == 'x' || c0_ == 'X') {
   1203         // hex number
   1204         kind = HEX;
   1205         AddLiteralCharAdvance();
   1206         if (!IsHexDigit(c0_)) {
   1207           // we must have at least one hex digit after 'x'/'X'
   1208           return Token::ILLEGAL;
   1209         }
   1210         while (IsHexDigit(c0_)) {
   1211           AddLiteralCharAdvance();
   1212         }
   1213       } else if (c0_ == 'o' || c0_ == 'O') {
   1214         kind = OCTAL;
   1215         AddLiteralCharAdvance();
   1216         if (!IsOctalDigit(c0_)) {
   1217           // we must have at least one octal digit after 'o'/'O'
   1218           return Token::ILLEGAL;
   1219         }
   1220         while (IsOctalDigit(c0_)) {
   1221           AddLiteralCharAdvance();
   1222         }
   1223       } else if (c0_ == 'b' || c0_ == 'B') {
   1224         kind = BINARY;
   1225         AddLiteralCharAdvance();
   1226         if (!IsBinaryDigit(c0_)) {
   1227           // we must have at least one binary digit after 'b'/'B'
   1228           return Token::ILLEGAL;
   1229         }
   1230         while (IsBinaryDigit(c0_)) {
   1231           AddLiteralCharAdvance();
   1232         }
   1233       } else if ('0' <= c0_ && c0_ <= '7') {
   1234         // (possible) octal number
   1235         kind = IMPLICIT_OCTAL;
   1236         while (true) {
                  // An 8 or 9 means this was a decimal with a leading zero after
                  // all; the remaining digits are scanned by the decimal code below.
   1237           if (c0_ == '8' || c0_ == '9') {
   1238             at_start = false;
   1239             kind = DECIMAL_WITH_LEADING_ZERO;
   1240             break;
   1241           }
   1242           if (c0_  < '0' || '7'  < c0_) {
   1243             // Octal literal finished.
   1244             octal_pos_ = Location(start_pos, source_pos());
   1245             octal_message_ = MessageTemplate::kStrictOctalLiteral;
   1246             break;
   1247           }
   1248           AddLiteralCharAdvance();
   1249         }
   1250       } else if (c0_ == '8' || c0_ == '9') {
   1251         kind = DECIMAL_WITH_LEADING_ZERO;
   1252       }
   1253     }
   1254 
   1255     // Parse decimal digits and allow trailing fractional part.
   1256     if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
   1257       if (at_start) {
                // SMI fast path: accumulate the integer value while the digits are
                // copied into the literal.
   1258         uint64_t value = 0;
   1259         while (IsDecimalDigit(c0_)) {
   1260           value = 10 * value + (c0_ - '0');
   1261 
   1262           uc32 first_char = c0_;
   1263           Advance<false, false>();
   1264           AddLiteralChar(first_char);
   1265         }
   1266 
                // With at most 10 characters in the literal |value| cannot have
                // wrapped. The literal must also not continue with a fraction or
                // an identifier start (which covers an exponent's 'e'/'E').
   1267         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
   1268             value <= Smi::kMaxValue && c0_ != '.' &&
   1269             (c0_ == kEndOfInput || !unicode_cache_->IsIdentifierStart(c0_))) {
   1270           next_.smi_value_ = static_cast<uint32_t>(value);
   1271           literal.Complete();
   1272           HandleLeadSurrogate();
   1273 
   1274           if (kind == DECIMAL_WITH_LEADING_ZERO) {
   1275             octal_pos_ = Location(start_pos, source_pos());
   1276             octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
   1277           }
   1278           return Token::SMI;
   1279         }
   1280         HandleLeadSurrogate();
   1281       }
   1282 
   1283       ScanDecimalDigits();  // optional
   1284       if (c0_ == '.') {
   1285         AddLiteralCharAdvance();
   1286         ScanDecimalDigits();  // optional
   1287       }
   1288     }
   1289   }
   1290 
   1291   // scan exponent, if any
   1292   if (c0_ == 'e' || c0_ == 'E') {
   1293     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
   1294     if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
   1295       return Token::ILLEGAL;
   1296     // scan exponent
   1297     AddLiteralCharAdvance();
   1298     if (c0_ == '+' || c0_ == '-')
   1299       AddLiteralCharAdvance();
   1300     if (!IsDecimalDigit(c0_)) {
   1301       // we must have at least one decimal digit after 'e'/'E'
   1302       return Token::ILLEGAL;
   1303     }
   1304     ScanDecimalDigits();
   1305   }
   1306 
   1307   // The source character immediately following a numeric literal must
   1308   // not be an identifier start or a decimal digit; see ECMA-262
   1309   // section 7.8.3, page 17 (note that we read only one decimal digit
   1310   // if the value is 0).
   1311   if (IsDecimalDigit(c0_) ||
   1312       (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_)))
   1313     return Token::ILLEGAL;
   1314 
   1315   literal.Complete();
   1316 
   1317   if (kind == DECIMAL_WITH_LEADING_ZERO) {
   1318     octal_pos_ = Location(start_pos, source_pos());
   1319     octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
   1320   }
   1321   return Token::NUMBER;
   1322 }
   1323 
   1324 
   1325 uc32 Scanner::ScanIdentifierUnicodeEscape() {
   1326   Advance();
   1327   if (c0_ != 'u') return -1;
   1328   Advance();
   1329   return ScanUnicodeEscape<false>();
   1330 }
   1331 
   1332 
   1333 template <bool capture_raw>
   1334 uc32 Scanner::ScanUnicodeEscape() {
   1335   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
   1336   // hex digits between { } is arbitrary. \ and u have already been read.
   1337   if (c0_ == '{') {
   1338     int begin = source_pos() - 2;
   1339     Advance<capture_raw>();
   1340     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
   1341     if (cp < 0 || c0_ != '}') {
   1342       ReportScannerError(source_pos(),
   1343                          MessageTemplate::kInvalidUnicodeEscapeSequence);
   1344       return -1;
   1345     }
   1346     Advance<capture_raw>();
   1347     return cp;
   1348   }
   1349   const bool unicode = true;
   1350   return ScanHexNumber<capture_raw, unicode>(4);
   1351 }
   1352 
   1353 
   1354 // ----------------------------------------------------------------------------
   1355 // Keyword Matcher
        //
        // KEYWORDS is an X-macro table of all keyword spellings and their tokens.
        //   KEYWORD_GROUP(ch)       introduces the keywords whose first character
        //                           is |ch|; it must precede those keywords.
        //   KEYWORD(spelling, tok)  maps one spelling to its token.
        // KeywordOrIdentifierToken() expands the table into a switch on the first
        // input character and statically asserts that every spelling is between
        // 2 and 10 characters long, so new entries must respect both the grouping
        // and that length range.
   1356 
   1357 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
   1358   KEYWORD_GROUP('a')                                        \
   1359   KEYWORD("async", Token::ASYNC)                            \
   1360   KEYWORD("await", Token::AWAIT)                            \
   1361   KEYWORD_GROUP('b')                                        \
   1362   KEYWORD("break", Token::BREAK)                            \
   1363   KEYWORD_GROUP('c')                                        \
   1364   KEYWORD("case", Token::CASE)                              \
   1365   KEYWORD("catch", Token::CATCH)                            \
   1366   KEYWORD("class", Token::CLASS)                            \
   1367   KEYWORD("const", Token::CONST)                            \
   1368   KEYWORD("continue", Token::CONTINUE)                      \
   1369   KEYWORD_GROUP('d')                                        \
   1370   KEYWORD("debugger", Token::DEBUGGER)                      \
   1371   KEYWORD("default", Token::DEFAULT)                        \
   1372   KEYWORD("delete", Token::DELETE)                          \
   1373   KEYWORD("do", Token::DO)                                  \
   1374   KEYWORD_GROUP('e')                                        \
   1375   KEYWORD("else", Token::ELSE)                              \
   1376   KEYWORD("enum", Token::ENUM)                              \
   1377   KEYWORD("export", Token::EXPORT)                          \
   1378   KEYWORD("extends", Token::EXTENDS)                        \
   1379   KEYWORD_GROUP('f')                                        \
   1380   KEYWORD("false", Token::FALSE_LITERAL)                    \
   1381   KEYWORD("finally", Token::FINALLY)                        \
   1382   KEYWORD("for", Token::FOR)                                \
   1383   KEYWORD("function", Token::FUNCTION)                      \
   1384   KEYWORD_GROUP('i')                                        \
   1385   KEYWORD("if", Token::IF)                                  \
   1386   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
   1387   KEYWORD("import", Token::IMPORT)                          \
   1388   KEYWORD("in", Token::IN)                                  \
   1389   KEYWORD("instanceof", Token::INSTANCEOF)                  \
   1390   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
   1391   KEYWORD_GROUP('l')                                        \
   1392   KEYWORD("let", Token::LET)                                \
   1393   KEYWORD_GROUP('n')                                        \
   1394   KEYWORD("new", Token::NEW)                                \
   1395   KEYWORD("null", Token::NULL_LITERAL)                      \
   1396   KEYWORD_GROUP('p')                                        \
   1397   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
   1398   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
   1399   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
   1400   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
   1401   KEYWORD_GROUP('r')                                        \
   1402   KEYWORD("return", Token::RETURN)                          \
   1403   KEYWORD_GROUP('s')                                        \
   1404   KEYWORD("static", Token::STATIC)                          \
   1405   KEYWORD("super", Token::SUPER)                            \
   1406   KEYWORD("switch", Token::SWITCH)                          \
   1407   KEYWORD_GROUP('t')                                        \
   1408   KEYWORD("this", Token::THIS)                              \
   1409   KEYWORD("throw", Token::THROW)                            \
   1410   KEYWORD("true", Token::TRUE_LITERAL)                      \
   1411   KEYWORD("try", Token::TRY)                                \
   1412   KEYWORD("typeof", Token::TYPEOF)                          \
   1413   KEYWORD_GROUP('v')                                        \
   1414   KEYWORD("var", Token::VAR)                                \
   1415   KEYWORD("void", Token::VOID)                              \
   1416   KEYWORD_GROUP('w')                                        \
   1417   KEYWORD("while", Token::WHILE)                            \
   1418   KEYWORD("with", Token::WITH)                              \
   1419   KEYWORD_GROUP('y')                                        \
   1420   KEYWORD("yield", Token::YIELD)
   1421 
   1422 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
   1423                                              int input_length) {
   1424   DCHECK(input_length >= 1);
   1425   const int kMinLength = 2;
   1426   const int kMaxLength = 10;
   1427   if (input_length < kMinLength || input_length > kMaxLength) {
   1428     return Token::IDENTIFIER;
   1429   }
   1430   switch (input[0]) {
   1431     default:
   1432 #define KEYWORD_GROUP_CASE(ch)                                \
   1433       break;                                                  \
   1434     case ch:
   1435 #define KEYWORD(keyword, token)                                     \
   1436   {                                                                 \
   1437     /* 'keyword' is a char array, so sizeof(keyword) is */          \
   1438     /* strlen(keyword) plus 1 for the NUL char. */                  \
   1439     const int keyword_length = sizeof(keyword) - 1;                 \
   1440     STATIC_ASSERT(keyword_length >= kMinLength);                    \
   1441     STATIC_ASSERT(keyword_length <= kMaxLength);                    \
   1442     if (input_length == keyword_length && input[1] == keyword[1] && \
   1443         (keyword_length <= 2 || input[2] == keyword[2]) &&          \
   1444         (keyword_length <= 3 || input[3] == keyword[3]) &&          \
   1445         (keyword_length <= 4 || input[4] == keyword[4]) &&          \
   1446         (keyword_length <= 5 || input[5] == keyword[5]) &&          \
   1447         (keyword_length <= 6 || input[6] == keyword[6]) &&          \
   1448         (keyword_length <= 7 || input[7] == keyword[7]) &&          \
   1449         (keyword_length <= 8 || input[8] == keyword[8]) &&          \
   1450         (keyword_length <= 9 || input[9] == keyword[9])) {          \
   1451       return token;                                                 \
   1452     }                                                               \
   1453   }
   1454     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
   1455   }
   1456   return Token::IDENTIFIER;
   1457 }
   1458 
   1459 
   1460 Token::Value Scanner::ScanIdentifierOrKeyword() {
          // Scans an identifier or keyword starting at c0_, which must be a legal
          // identifier start (this includes a backslash introducing a unicode
          // escape, see the last-but-one branch below).
          //
          // Returns the keyword token if the spelling matches one, otherwise
          // Token::IDENTIFIER. Once an escape is involved the result comes from
          // ScanIdentifierSuffix(); an invalid leading escape yields Token::ILLEGAL.
          //
          // The literal is only completed for IDENTIFIER and
          // FUTURE_STRICT_RESERVED_WORD results; for other keywords the
          // LiteralScope is left uncompleted (presumably discarding the literal,
          // which matches SanityCheckTokenDesc expecting keywords to have none).
   1461   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
   1462   LiteralScope literal(this);
   1463   if (IsInRange(c0_, 'a', 'z')) {
            // Fast path 1: starts with a lowercase ASCII letter, the only shape
            // that can be a keyword.
   1464     do {
   1465       char first_char = static_cast<char>(c0_);
   1466       Advance<false, false>();
   1467       AddLiteralChar(first_char);
   1468     } while (IsInRange(c0_, 'a', 'z'));
   1469 
   1470     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
   1471         c0_ == '$') {
   1472       // Identifier starting with lowercase.
              // It contains a character no keyword has, so no keyword lookup is
              // needed on this path.
   1473       char first_char = static_cast<char>(c0_);
   1474       Advance<false, false>();
   1475       AddLiteralChar(first_char);
   1476       while (IsAsciiIdentifier(c0_)) {
   1477         char first_char = static_cast<char>(c0_);
   1478         Advance<false, false>();
   1479         AddLiteralChar(first_char);
   1480       }
              // The identifier ended at an ASCII character that is not an escape.
   1481       if (c0_ <= kMaxAscii && c0_ != '\\') {
   1482         literal.Complete();
   1483         return Token::IDENTIFIER;
   1484       }
   1485     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
   1486       // Only a-z+: could be a keyword or identifier.
   1487       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1488       Token::Value token =
   1489           KeywordOrIdentifierToken(chars.start(), chars.length());
   1490       if (token == Token::IDENTIFIER ||
   1491           token == Token::FUTURE_STRICT_RESERVED_WORD)
   1492         literal.Complete();
   1493       return token;
   1494     }
   1495 
            // Reached when the next character is non-ASCII or a backslash; the
            // general loop below continues from here.
   1496     HandleLeadSurrogate();
   1497   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
            // Fast path 2: starts with an uppercase letter, '_' or '$'. No keyword
            // lookup is done on the early return below.
   1498     do {
   1499       char first_char = static_cast<char>(c0_);
   1500       Advance<false, false>();
   1501       AddLiteralChar(first_char);
   1502     } while (IsAsciiIdentifier(c0_));
   1503 
   1504     if (c0_ <= kMaxAscii && c0_ != '\\') {
   1505       literal.Complete();
   1506       return Token::IDENTIFIER;
   1507     }
   1508 
   1509     HandleLeadSurrogate();
   1510   } else if (c0_ == '\\') {
   1511     // Scan identifier start character.
   1512     uc32 c = ScanIdentifierUnicodeEscape();
   1513     // Only allow legal identifier start characters.
   1514     if (c < 0 ||
   1515         c == '\\' ||  // No recursive escapes.
   1516         !unicode_cache_->IsIdentifierStart(c)) {
   1517       return Token::ILLEGAL;
   1518     }
   1519     AddLiteralChar(c);
            // The identifier contained an escape, hence escaped == true.
   1520     return ScanIdentifierSuffix(&literal, true);
   1521   } else {
            // Any other identifier start character.
   1522     uc32 first_char = c0_;
   1523     Advance();
   1524     AddLiteralChar(first_char);
   1525   }
   1526 
   1527   // Scan the rest of the identifier characters.
   1528   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
   1529     if (c0_ != '\\') {
   1530       uc32 next_char = c0_;
   1531       Advance();
   1532       AddLiteralChar(next_char);
   1533       continue;
   1534     }
   1535     // Fallthrough if no longer able to complete keyword.
            // A backslash was found: hand over to the escape-aware scanner. No
            // escape has been consumed yet, hence escaped == false.
   1536     return ScanIdentifierSuffix(&literal, false);
   1537   }
   1538 
          // Keyword lookup is only attempted on one-byte literals.
   1539   if (next_.literal_chars->is_one_byte()) {
   1540     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1541     Token::Value token =
   1542         KeywordOrIdentifierToken(chars.start(), chars.length());
   1543     if (token == Token::IDENTIFIER ||
   1544         token == Token::FUTURE_STRICT_RESERVED_WORD)
   1545       literal.Complete();
   1546     return token;
   1547   }
   1548   literal.Complete();
   1549   return Token::IDENTIFIER;
   1550 }
   1551 
   1552 
   1553 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
   1554                                            bool escaped) {
   1555   // Scan the rest of the identifier characters.
   1556   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
   1557     if (c0_ == '\\') {
   1558       uc32 c = ScanIdentifierUnicodeEscape();
   1559       escaped = true;
   1560       // Only allow legal identifier part characters.
   1561       if (c < 0 ||
   1562           c == '\\' ||
   1563           !unicode_cache_->IsIdentifierPart(c)) {
   1564         return Token::ILLEGAL;
   1565       }
   1566       AddLiteralChar(c);
   1567     } else {
   1568       AddLiteralChar(c0_);
   1569       Advance();
   1570     }
   1571   }
   1572   literal->Complete();
   1573 
   1574   if (escaped && next_.literal_chars->is_one_byte()) {
   1575     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1576     Token::Value token =
   1577         KeywordOrIdentifierToken(chars.start(), chars.length());
   1578     /* TODO(adamk): YIELD should be handled specially. */
   1579     if (token == Token::IDENTIFIER) {
   1580       return Token::IDENTIFIER;
   1581     } else if (token == Token::FUTURE_STRICT_RESERVED_WORD ||
   1582                token == Token::LET || token == Token::STATIC) {
   1583       return Token::ESCAPED_STRICT_RESERVED_WORD;
   1584     } else {
   1585       return Token::ESCAPED_KEYWORD;
   1586     }
   1587   }
   1588   return Token::IDENTIFIER;
   1589 }
   1590 
   1591 bool Scanner::ScanRegExpPattern() {
   1592   DCHECK(next_next_.token == Token::UNINITIALIZED);
   1593   DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
   1594 
   1595   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1596   bool in_character_class = false;
   1597   bool seen_equal = (next_.token == Token::ASSIGN_DIV);
   1598 
   1599   // Previous token is either '/' or '/=', in the second case, the
   1600   // pattern starts at =.
   1601   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1602   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1603 
   1604   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1605   // the scanner should pass uninterpreted bodies to the RegExp
   1606   // constructor.
   1607   LiteralScope literal(this);
   1608   if (seen_equal) {
   1609     AddLiteralChar('=');
   1610   }
   1611 
   1612   while (c0_ != '/' || in_character_class) {
   1613     if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
   1614       return false;
   1615     if (c0_ == '\\') {  // Escape sequence.
   1616       AddLiteralCharAdvance();
   1617       if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
   1618         return false;
   1619       AddLiteralCharAdvance();
   1620       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1621       // only "safe" characters are allowed (letters, digits, underscore),
   1622       // otherwise the escape isn't valid and the invalid character has
   1623       // its normal meaning. I.e., we can just continue scanning without
   1624       // worrying whether the following characters are part of the escape
   1625       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1626       // of the escape sequence.
   1627 
   1628       // TODO(896): At some point, parse RegExps more throughly to capture
   1629       // octal esacpes in strict mode.
   1630     } else {  // Unescaped character.
   1631       if (c0_ == '[') in_character_class = true;
   1632       if (c0_ == ']') in_character_class = false;
   1633       AddLiteralCharAdvance();
   1634     }
   1635   }
   1636   Advance();  // consume '/'
   1637 
   1638   literal.Complete();
   1639   next_.token = Token::REGEXP_LITERAL;
   1640   return true;
   1641 }
   1642 
   1643 
   1644 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
   1645   DCHECK(next_.token == Token::REGEXP_LITERAL);
   1646 
   1647   // Scan regular expression flags.
   1648   int flags = 0;
   1649   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
   1650     RegExp::Flags flag = RegExp::kNone;
   1651     switch (c0_) {
   1652       case 'g':
   1653         flag = RegExp::kGlobal;
   1654         break;
   1655       case 'i':
   1656         flag = RegExp::kIgnoreCase;
   1657         break;
   1658       case 'm':
   1659         flag = RegExp::kMultiline;
   1660         break;
   1661       case 'u':
   1662         flag = RegExp::kUnicode;
   1663         break;
   1664       case 'y':
   1665         flag = RegExp::kSticky;
   1666         break;
   1667       default:
   1668         return Nothing<RegExp::Flags>();
   1669     }
   1670     if (flags & flag) {
   1671       return Nothing<RegExp::Flags>();
   1672     }
   1673     Advance();
   1674     flags |= flag;
   1675   }
   1676 
   1677   next_.location.end_pos = source_pos();
   1678   return Just(RegExp::Flags(flags));
   1679 }
   1680 
   1681 
   1682 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
   1683   if (is_literal_one_byte()) {
   1684     return ast_value_factory->GetOneByteString(literal_one_byte_string());
   1685   }
   1686   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
   1687 }
   1688 
   1689 
   1690 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
   1691   if (is_next_literal_one_byte()) {
   1692     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
   1693   }
   1694   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
   1695 }
   1696 
   1697 
   1698 const AstRawString* Scanner::CurrentRawSymbol(
   1699     AstValueFactory* ast_value_factory) {
   1700   if (is_raw_literal_one_byte()) {
   1701     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
   1702   }
   1703   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
   1704 }
   1705 
   1706 
   1707 double Scanner::DoubleValue() {
   1708   DCHECK(is_literal_one_byte());
   1709   return StringToDouble(
   1710       unicode_cache_,
   1711       literal_one_byte_string(),
   1712       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
   1713 }
   1714 
   1715 
   1716 bool Scanner::ContainsDot() {
   1717   DCHECK(is_literal_one_byte());
   1718   Vector<const uint8_t> str = literal_one_byte_string();
   1719   return std::find(str.begin(), str.end(), '.') != str.end();
   1720 }
   1721 
   1722 bool Scanner::FindSymbol(DuplicateFinder* finder) {
   1723   // TODO(vogelheim): Move this logic into the calling class; this can be fully
   1724   //                  implemented using the public interface.
   1725   if (is_literal_one_byte()) {
   1726     return finder->AddOneByteSymbol(literal_one_byte_string());
   1727   }
   1728   return finder->AddTwoByteSymbol(literal_two_byte_string());
   1729 }
   1730 
   1731 void Scanner::SeekNext(size_t position) {
   1732   // Use with care: This cleanly resets most, but not all scanner state.
   1733   // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
   1734 
   1735   // To re-scan from a given character position, we need to:
   1736   // 1, Reset the current_, next_ and next_next_ tokens
   1737   //    (next_ + next_next_ will be overwrittem by Next(),
   1738   //     current_ will remain unchanged, so overwrite it fully.)
   1739   current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED};
   1740   next_.token = Token::UNINITIALIZED;
   1741   next_next_.token = Token::UNINITIALIZED;
   1742   // 2, reset the source to the desired position,
   1743   source_->Seek(position);
   1744   // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
   1745   c0_ = source_->Advance();
   1746   Next();
   1747   DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
   1748 }
   1749 
   1750 }  // namespace internal
   1751 }  // namespace v8
   1752