Home | History | Annotate | Download | only in parsing
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Features shared by parsing and pre-parsing scanners.
      6 
      7 #include "src/parsing/scanner.h"
      8 
      9 #include <stdint.h>
     10 
     11 #include <cmath>
     12 
     13 #include "src/ast/ast-value-factory.h"
     14 #include "src/char-predicates-inl.h"
     15 #include "src/conversions-inl.h"
     16 #include "src/list-inl.h"
     17 #include "src/parsing/parser.h"
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 
     23 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const {
     24   if (is_one_byte()) {
     25     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
     26   }
     27   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
     28 }
     29 
     30 
     31 // Default implementation for streams that do not support bookmarks.
     32 bool Utf16CharacterStream::SetBookmark() { return false; }
     33 void Utf16CharacterStream::ResetToBookmark() { UNREACHABLE(); }
     34 
     35 
     36 // ----------------------------------------------------------------------------
     37 // Scanner
     38 
     39 Scanner::Scanner(UnicodeCache* unicode_cache)
     40     : unicode_cache_(unicode_cache),
     41       bookmark_c0_(kNoBookmark),
     42       octal_pos_(Location::invalid()),
     43       decimal_with_leading_zero_pos_(Location::invalid()),
     44       found_html_comment_(false),
     45       allow_harmony_exponentiation_operator_(false) {
     46   bookmark_current_.literal_chars = &bookmark_current_literal_;
     47   bookmark_current_.raw_literal_chars = &bookmark_current_raw_literal_;
     48   bookmark_next_.literal_chars = &bookmark_next_literal_;
     49   bookmark_next_.raw_literal_chars = &bookmark_next_raw_literal_;
     50 }
     51 
     52 
     53 void Scanner::Initialize(Utf16CharacterStream* source) {
     54   source_ = source;
     55   // Need to capture identifiers in order to recognize "get" and "set"
     56   // in object literals.
     57   Init();
     58   // Skip initial whitespace allowing HTML comment ends just like
     59   // after a newline and scan first token.
     60   has_line_terminator_before_next_ = true;
     61   SkipWhiteSpace();
     62   Scan();
     63 }
     64 
     65 template <bool capture_raw, bool unicode>
     66 uc32 Scanner::ScanHexNumber(int expected_length) {
     67   DCHECK(expected_length <= 4);  // prevent overflow
     68 
     69   int begin = source_pos() - 2;
     70   uc32 x = 0;
     71   for (int i = 0; i < expected_length; i++) {
     72     int d = HexValue(c0_);
     73     if (d < 0) {
     74       ReportScannerError(Location(begin, begin + expected_length + 2),
     75                          unicode
     76                              ? MessageTemplate::kInvalidUnicodeEscapeSequence
     77                              : MessageTemplate::kInvalidHexEscapeSequence);
     78       return -1;
     79     }
     80     x = x * 16 + d;
     81     Advance<capture_raw>();
     82   }
     83 
     84   return x;
     85 }
     86 
     87 template <bool capture_raw>
     88 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
     89   uc32 x = 0;
     90   int d = HexValue(c0_);
     91   if (d < 0) return -1;
     92 
     93   while (d >= 0) {
     94     x = x * 16 + d;
     95     if (x > max_value) {
     96       ReportScannerError(Location(beg_pos, source_pos() + 1),
     97                          MessageTemplate::kUndefinedUnicodeCodePoint);
     98       return -1;
     99     }
    100     Advance<capture_raw>();
    101     d = HexValue(c0_);
    102   }
    103 
    104   return x;
    105 }
    106 
    107 
    108 // Ensure that tokens can be stored in a byte.
    109 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
    110 
    111 // Table of one-character tokens, by character (0x00..0x7f only).
    112 static const byte one_char_tokens[] = {
    113   Token::ILLEGAL,
    114   Token::ILLEGAL,
    115   Token::ILLEGAL,
    116   Token::ILLEGAL,
    117   Token::ILLEGAL,
    118   Token::ILLEGAL,
    119   Token::ILLEGAL,
    120   Token::ILLEGAL,
    121   Token::ILLEGAL,
    122   Token::ILLEGAL,
    123   Token::ILLEGAL,
    124   Token::ILLEGAL,
    125   Token::ILLEGAL,
    126   Token::ILLEGAL,
    127   Token::ILLEGAL,
    128   Token::ILLEGAL,
    129   Token::ILLEGAL,
    130   Token::ILLEGAL,
    131   Token::ILLEGAL,
    132   Token::ILLEGAL,
    133   Token::ILLEGAL,
    134   Token::ILLEGAL,
    135   Token::ILLEGAL,
    136   Token::ILLEGAL,
    137   Token::ILLEGAL,
    138   Token::ILLEGAL,
    139   Token::ILLEGAL,
    140   Token::ILLEGAL,
    141   Token::ILLEGAL,
    142   Token::ILLEGAL,
    143   Token::ILLEGAL,
    144   Token::ILLEGAL,
    145   Token::ILLEGAL,
    146   Token::ILLEGAL,
    147   Token::ILLEGAL,
    148   Token::ILLEGAL,
    149   Token::ILLEGAL,
    150   Token::ILLEGAL,
    151   Token::ILLEGAL,
    152   Token::ILLEGAL,
    153   Token::LPAREN,       // 0x28
    154   Token::RPAREN,       // 0x29
    155   Token::ILLEGAL,
    156   Token::ILLEGAL,
    157   Token::COMMA,        // 0x2c
    158   Token::ILLEGAL,
    159   Token::ILLEGAL,
    160   Token::ILLEGAL,
    161   Token::ILLEGAL,
    162   Token::ILLEGAL,
    163   Token::ILLEGAL,
    164   Token::ILLEGAL,
    165   Token::ILLEGAL,
    166   Token::ILLEGAL,
    167   Token::ILLEGAL,
    168   Token::ILLEGAL,
    169   Token::ILLEGAL,
    170   Token::ILLEGAL,
    171   Token::COLON,        // 0x3a
    172   Token::SEMICOLON,    // 0x3b
    173   Token::ILLEGAL,
    174   Token::ILLEGAL,
    175   Token::ILLEGAL,
    176   Token::CONDITIONAL,  // 0x3f
    177   Token::ILLEGAL,
    178   Token::ILLEGAL,
    179   Token::ILLEGAL,
    180   Token::ILLEGAL,
    181   Token::ILLEGAL,
    182   Token::ILLEGAL,
    183   Token::ILLEGAL,
    184   Token::ILLEGAL,
    185   Token::ILLEGAL,
    186   Token::ILLEGAL,
    187   Token::ILLEGAL,
    188   Token::ILLEGAL,
    189   Token::ILLEGAL,
    190   Token::ILLEGAL,
    191   Token::ILLEGAL,
    192   Token::ILLEGAL,
    193   Token::ILLEGAL,
    194   Token::ILLEGAL,
    195   Token::ILLEGAL,
    196   Token::ILLEGAL,
    197   Token::ILLEGAL,
    198   Token::ILLEGAL,
    199   Token::ILLEGAL,
    200   Token::ILLEGAL,
    201   Token::ILLEGAL,
    202   Token::ILLEGAL,
    203   Token::ILLEGAL,
    204   Token::LBRACK,     // 0x5b
    205   Token::ILLEGAL,
    206   Token::RBRACK,     // 0x5d
    207   Token::ILLEGAL,
    208   Token::ILLEGAL,
    209   Token::ILLEGAL,
    210   Token::ILLEGAL,
    211   Token::ILLEGAL,
    212   Token::ILLEGAL,
    213   Token::ILLEGAL,
    214   Token::ILLEGAL,
    215   Token::ILLEGAL,
    216   Token::ILLEGAL,
    217   Token::ILLEGAL,
    218   Token::ILLEGAL,
    219   Token::ILLEGAL,
    220   Token::ILLEGAL,
    221   Token::ILLEGAL,
    222   Token::ILLEGAL,
    223   Token::ILLEGAL,
    224   Token::ILLEGAL,
    225   Token::ILLEGAL,
    226   Token::ILLEGAL,
    227   Token::ILLEGAL,
    228   Token::ILLEGAL,
    229   Token::ILLEGAL,
    230   Token::ILLEGAL,
    231   Token::ILLEGAL,
    232   Token::ILLEGAL,
    233   Token::ILLEGAL,
    234   Token::ILLEGAL,
    235   Token::ILLEGAL,
    236   Token::LBRACE,       // 0x7b
    237   Token::ILLEGAL,
    238   Token::RBRACE,       // 0x7d
    239   Token::BIT_NOT,      // 0x7e
    240   Token::ILLEGAL
    241 };
    242 
    243 
    244 Token::Value Scanner::Next() {
    245   if (next_.token == Token::EOS) {
    246     next_.location.beg_pos = current_.location.beg_pos;
    247     next_.location.end_pos = current_.location.end_pos;
    248   }
    249   current_ = next_;
    250   if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
    251     next_ = next_next_;
    252     next_next_.token = Token::UNINITIALIZED;
    253     has_line_terminator_before_next_ = has_line_terminator_after_next_;
    254     return current_.token;
    255   }
    256   has_line_terminator_before_next_ = false;
    257   has_multiline_comment_before_next_ = false;
    258   if (static_cast<unsigned>(c0_) <= 0x7f) {
    259     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    260     if (token != Token::ILLEGAL) {
    261       int pos = source_pos();
    262       next_.token = token;
    263       next_.location.beg_pos = pos;
    264       next_.location.end_pos = pos + 1;
    265       Advance();
    266       return current_.token;
    267     }
    268   }
    269   Scan();
    270   return current_.token;
    271 }
    272 
    273 
    274 Token::Value Scanner::PeekAhead() {
    275   if (next_next_.token != Token::UNINITIALIZED) {
    276     return next_next_.token;
    277   }
    278   TokenDesc prev = current_;
    279   bool has_line_terminator_before_next =
    280       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
    281   Next();
    282   has_line_terminator_after_next_ =
    283       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
    284   has_line_terminator_before_next_ = has_line_terminator_before_next;
    285   Token::Value ret = next_.token;
    286   next_next_ = next_;
    287   next_ = current_;
    288   current_ = prev;
    289   return ret;
    290 }
    291 
    292 
    293 // TODO(yangguo): check whether this is actually necessary.
    294 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
    295   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    296   // Unicode character; this implies that in a Unicode context the
    297   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    298   // character expressed in little-endian byte order (since it could
    299   // not be a U+FFFE character expressed in big-endian byte
    300   // order). Nevertheless, we check for it to be compatible with
    301   // Spidermonkey.
    302   return c == 0xFFFE;
    303 }
    304 
    305 
    306 bool Scanner::SkipWhiteSpace() {
    307   int start_position = source_pos();
    308 
    309   while (true) {
    310     while (true) {
    311       // The unicode cache accepts unsigned inputs.
    312       if (c0_ < 0) break;
    313       // Advance as long as character is a WhiteSpace or LineTerminator.
    314       // Remember if the latter is the case.
    315       if (unicode_cache_->IsLineTerminator(c0_)) {
    316         has_line_terminator_before_next_ = true;
    317       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
    318                  !IsLittleEndianByteOrderMark(c0_)) {
    319         break;
    320       }
    321       Advance();
    322     }
    323 
    324     // If there is an HTML comment end '-->' at the beginning of a
    325     // line (with only whitespace in front of it), we treat the rest
    326     // of the line as a comment. This is in line with the way
    327     // SpiderMonkey handles it.
    328     if (c0_ == '-' && has_line_terminator_before_next_) {
    329       Advance();
    330       if (c0_ == '-') {
    331         Advance();
    332         if (c0_ == '>') {
    333           // Treat the rest of the line as a comment.
    334           SkipSingleLineComment();
    335           // Continue skipping white space after the comment.
    336           continue;
    337         }
    338         PushBack('-');  // undo Advance()
    339       }
    340       PushBack('-');  // undo Advance()
    341     }
    342     // Return whether or not we skipped any characters.
    343     return source_pos() != start_position;
    344   }
    345 }
    346 
    347 
    348 Token::Value Scanner::SkipSingleLineComment() {
    349   Advance();
    350 
    351   // The line terminator at the end of the line is not considered
    352   // to be part of the single-line comment; it is recognized
    353   // separately by the lexical grammar and becomes part of the
    354   // stream of input elements for the syntactic grammar (see
    355   // ECMA-262, section 7.4).
    356   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    357     Advance();
    358   }
    359 
    360   return Token::WHITESPACE;
    361 }
    362 
    363 
    364 Token::Value Scanner::SkipSourceURLComment() {
    365   TryToParseSourceURLComment();
    366   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    367     Advance();
    368   }
    369 
    370   return Token::WHITESPACE;
    371 }
    372 
    373 
    374 void Scanner::TryToParseSourceURLComment() {
    375   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
    376   // function will just return if it cannot parse a magic comment.
    377   if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return;
    378   Advance();
    379   LiteralBuffer name;
    380   while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) &&
    381          c0_ != '=') {
    382     name.AddChar(c0_);
    383     Advance();
    384   }
    385   if (!name.is_one_byte()) return;
    386   Vector<const uint8_t> name_literal = name.one_byte_literal();
    387   LiteralBuffer* value;
    388   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
    389     value = &source_url_;
    390   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
    391     value = &source_mapping_url_;
    392   } else {
    393     return;
    394   }
    395   if (c0_ != '=')
    396     return;
    397   Advance();
    398   value->Reset();
    399   while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) {
    400     Advance();
    401   }
    402   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    403     // Disallowed characters.
    404     if (c0_ == '"' || c0_ == '\'') {
    405       value->Reset();
    406       return;
    407     }
    408     if (unicode_cache_->IsWhiteSpace(c0_)) {
    409       break;
    410     }
    411     value->AddChar(c0_);
    412     Advance();
    413   }
    414   // Allow whitespace at the end.
    415   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    416     if (!unicode_cache_->IsWhiteSpace(c0_)) {
    417       value->Reset();
    418       break;
    419     }
    420     Advance();
    421   }
    422 }
    423 
    424 
    425 Token::Value Scanner::SkipMultiLineComment() {
    426   DCHECK(c0_ == '*');
    427   Advance();
    428 
    429   while (c0_ >= 0) {
    430     uc32 ch = c0_;
    431     Advance();
    432     if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) {
    433       // Following ECMA-262, section 7.4, a comment containing
    434       // a newline will make the comment count as a line-terminator.
    435       has_multiline_comment_before_next_ = true;
    436     }
    437     // If we have reached the end of the multi-line comment, we
    438     // consume the '/' and insert a whitespace. This way all
    439     // multi-line comments are treated as whitespace.
    440     if (ch == '*' && c0_ == '/') {
    441       c0_ = ' ';
    442       return Token::WHITESPACE;
    443     }
    444   }
    445 
    446   // Unterminated multi-line comment.
    447   return Token::ILLEGAL;
    448 }
    449 
    450 
    451 Token::Value Scanner::ScanHtmlComment() {
    452   // Check for <!-- comments.
    453   DCHECK(c0_ == '!');
    454   Advance();
    455   if (c0_ == '-') {
    456     Advance();
    457     if (c0_ == '-') {
    458       found_html_comment_ = true;
    459       return SkipSingleLineComment();
    460     }
    461     PushBack('-');  // undo Advance()
    462   }
    463   PushBack('!');  // undo Advance()
    464   DCHECK(c0_ == '!');
    465   return Token::LT;
    466 }
    467 
    468 
    469 void Scanner::Scan() {
    470   next_.literal_chars = NULL;
    471   next_.raw_literal_chars = NULL;
    472   Token::Value token;
    473   do {
    474     // Remember the position of the next token
    475     next_.location.beg_pos = source_pos();
    476 
    477     switch (c0_) {
    478       case ' ':
    479       case '\t':
    480         Advance();
    481         token = Token::WHITESPACE;
    482         break;
    483 
    484       case '\n':
    485         Advance();
    486         has_line_terminator_before_next_ = true;
    487         token = Token::WHITESPACE;
    488         break;
    489 
    490       case '"': case '\'':
    491         token = ScanString();
    492         break;
    493 
    494       case '<':
    495         // < <= << <<= <!--
    496         Advance();
    497         if (c0_ == '=') {
    498           token = Select(Token::LTE);
    499         } else if (c0_ == '<') {
    500           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    501         } else if (c0_ == '!') {
    502           token = ScanHtmlComment();
    503         } else {
    504           token = Token::LT;
    505         }
    506         break;
    507 
    508       case '>':
    509         // > >= >> >>= >>> >>>=
    510         Advance();
    511         if (c0_ == '=') {
    512           token = Select(Token::GTE);
    513         } else if (c0_ == '>') {
    514           // >> >>= >>> >>>=
    515           Advance();
    516           if (c0_ == '=') {
    517             token = Select(Token::ASSIGN_SAR);
    518           } else if (c0_ == '>') {
    519             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    520           } else {
    521             token = Token::SAR;
    522           }
    523         } else {
    524           token = Token::GT;
    525         }
    526         break;
    527 
    528       case '=':
    529         // = == === =>
    530         Advance();
    531         if (c0_ == '=') {
    532           token = Select('=', Token::EQ_STRICT, Token::EQ);
    533         } else if (c0_ == '>') {
    534           token = Select(Token::ARROW);
    535         } else {
    536           token = Token::ASSIGN;
    537         }
    538         break;
    539 
    540       case '!':
    541         // ! != !==
    542         Advance();
    543         if (c0_ == '=') {
    544           token = Select('=', Token::NE_STRICT, Token::NE);
    545         } else {
    546           token = Token::NOT;
    547         }
    548         break;
    549 
    550       case '+':
    551         // + ++ +=
    552         Advance();
    553         if (c0_ == '+') {
    554           token = Select(Token::INC);
    555         } else if (c0_ == '=') {
    556           token = Select(Token::ASSIGN_ADD);
    557         } else {
    558           token = Token::ADD;
    559         }
    560         break;
    561 
    562       case '-':
    563         // - -- --> -=
    564         Advance();
    565         if (c0_ == '-') {
    566           Advance();
    567           if (c0_ == '>' && has_line_terminator_before_next_) {
    568             // For compatibility with SpiderMonkey, we skip lines that
    569             // start with an HTML comment end '-->'.
    570             token = SkipSingleLineComment();
    571           } else {
    572             token = Token::DEC;
    573           }
    574         } else if (c0_ == '=') {
    575           token = Select(Token::ASSIGN_SUB);
    576         } else {
    577           token = Token::SUB;
    578         }
    579         break;
    580 
    581       case '*':
    582         // * *=
    583         Advance();
    584         if (c0_ == '*' && allow_harmony_exponentiation_operator()) {
    585           token = Select('=', Token::ASSIGN_EXP, Token::EXP);
    586         } else if (c0_ == '=') {
    587           token = Select(Token::ASSIGN_MUL);
    588         } else {
    589           token = Token::MUL;
    590         }
    591         break;
    592 
    593       case '%':
    594         // % %=
    595         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    596         break;
    597 
    598       case '/':
    599         // /  // /* /=
    600         Advance();
    601         if (c0_ == '/') {
    602           Advance();
    603           if (c0_ == '#' || c0_ == '@') {
    604             Advance();
    605             token = SkipSourceURLComment();
    606           } else {
    607             PushBack(c0_);
    608             token = SkipSingleLineComment();
    609           }
    610         } else if (c0_ == '*') {
    611           token = SkipMultiLineComment();
    612         } else if (c0_ == '=') {
    613           token = Select(Token::ASSIGN_DIV);
    614         } else {
    615           token = Token::DIV;
    616         }
    617         break;
    618 
    619       case '&':
    620         // & && &=
    621         Advance();
    622         if (c0_ == '&') {
    623           token = Select(Token::AND);
    624         } else if (c0_ == '=') {
    625           token = Select(Token::ASSIGN_BIT_AND);
    626         } else {
    627           token = Token::BIT_AND;
    628         }
    629         break;
    630 
    631       case '|':
    632         // | || |=
    633         Advance();
    634         if (c0_ == '|') {
    635           token = Select(Token::OR);
    636         } else if (c0_ == '=') {
    637           token = Select(Token::ASSIGN_BIT_OR);
    638         } else {
    639           token = Token::BIT_OR;
    640         }
    641         break;
    642 
    643       case '^':
    644         // ^ ^=
    645         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    646         break;
    647 
    648       case '.':
    649         // . Number
    650         Advance();
    651         if (IsDecimalDigit(c0_)) {
    652           token = ScanNumber(true);
    653         } else {
    654           token = Token::PERIOD;
    655           if (c0_ == '.') {
    656             Advance();
    657             if (c0_ == '.') {
    658               Advance();
    659               token = Token::ELLIPSIS;
    660             } else {
    661               PushBack('.');
    662             }
    663           }
    664         }
    665         break;
    666 
    667       case ':':
    668         token = Select(Token::COLON);
    669         break;
    670 
    671       case ';':
    672         token = Select(Token::SEMICOLON);
    673         break;
    674 
    675       case ',':
    676         token = Select(Token::COMMA);
    677         break;
    678 
    679       case '(':
    680         token = Select(Token::LPAREN);
    681         break;
    682 
    683       case ')':
    684         token = Select(Token::RPAREN);
    685         break;
    686 
    687       case '[':
    688         token = Select(Token::LBRACK);
    689         break;
    690 
    691       case ']':
    692         token = Select(Token::RBRACK);
    693         break;
    694 
    695       case '{':
    696         token = Select(Token::LBRACE);
    697         break;
    698 
    699       case '}':
    700         token = Select(Token::RBRACE);
    701         break;
    702 
    703       case '?':
    704         token = Select(Token::CONDITIONAL);
    705         break;
    706 
    707       case '~':
    708         token = Select(Token::BIT_NOT);
    709         break;
    710 
    711       case '`':
    712         token = ScanTemplateStart();
    713         break;
    714 
    715       default:
    716         if (c0_ < 0) {
    717           token = Token::EOS;
    718         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
    719           token = ScanIdentifierOrKeyword();
    720         } else if (IsDecimalDigit(c0_)) {
    721           token = ScanNumber(false);
    722         } else if (SkipWhiteSpace()) {
    723           token = Token::WHITESPACE;
    724         } else {
    725           token = Select(Token::ILLEGAL);
    726         }
    727         break;
    728     }
    729 
    730     // Continue scanning for tokens as long as we're just skipping
    731     // whitespace.
    732   } while (token == Token::WHITESPACE);
    733 
    734   next_.location.end_pos = source_pos();
    735   next_.token = token;
    736 }
    737 
    738 
    739 void Scanner::SeekForward(int pos) {
    740   // After this call, we will have the token at the given position as
    741   // the "next" token. The "current" token will be invalid.
    742   if (pos == next_.location.beg_pos) return;
    743   int current_pos = source_pos();
    744   DCHECK_EQ(next_.location.end_pos, current_pos);
    745   // Positions inside the lookahead token aren't supported.
    746   DCHECK(pos >= current_pos);
    747   if (pos != current_pos) {
    748     source_->SeekForward(pos - source_->pos());
    749     Advance();
    750     // This function is only called to seek to the location
    751     // of the end of a function (at the "}" token). It doesn't matter
    752     // whether there was a line terminator in the part we skip.
    753     has_line_terminator_before_next_ = false;
    754     has_multiline_comment_before_next_ = false;
    755   }
    756   Scan();
    757 }
    758 
    759 
    760 template <bool capture_raw, bool in_template_literal>
    761 bool Scanner::ScanEscape() {
    762   uc32 c = c0_;
    763   Advance<capture_raw>();
    764 
    765   // Skip escaped newlines.
    766   if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) {
    767     // Allow CR+LF newlines in multiline string literals.
    768     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
    769     // Allow LF+CR newlines in multiline string literals.
    770     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
    771     return true;
    772   }
    773 
    774   switch (c) {
    775     case '\'':  // fall through
    776     case '"' :  // fall through
    777     case '\\': break;
    778     case 'b' : c = '\b'; break;
    779     case 'f' : c = '\f'; break;
    780     case 'n' : c = '\n'; break;
    781     case 'r' : c = '\r'; break;
    782     case 't' : c = '\t'; break;
    783     case 'u' : {
    784       c = ScanUnicodeEscape<capture_raw>();
    785       if (c < 0) return false;
    786       break;
    787     }
    788     case 'v':
    789       c = '\v';
    790       break;
    791     case 'x': {
    792       c = ScanHexNumber<capture_raw>(2);
    793       if (c < 0) return false;
    794       break;
    795     }
    796     case '0':  // Fall through.
    797     case '1':  // fall through
    798     case '2':  // fall through
    799     case '3':  // fall through
    800     case '4':  // fall through
    801     case '5':  // fall through
    802     case '6':  // fall through
    803     case '7':
    804       c = ScanOctalEscape<capture_raw>(c, 2);
    805       break;
    806   }
    807 
    808   // According to ECMA-262, section 7.8.4, characters not covered by the
    809   // above cases should be illegal, but they are commonly handled as
    810   // non-escaped characters by JS VMs.
    811   AddLiteralChar(c);
    812   return true;
    813 }
    814 
    815 
    816 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    817 // ECMA-262. Other JS VMs support them.
    818 template <bool capture_raw>
    819 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    820   uc32 x = c - '0';
    821   int i = 0;
    822   for (; i < length; i++) {
    823     int d = c0_ - '0';
    824     if (d < 0 || d > 7) break;
    825     int nx = x * 8 + d;
    826     if (nx >= 256) break;
    827     x = nx;
    828     Advance<capture_raw>();
    829   }
    830   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
    831   // Remember the position of octal escape sequences so that an error
    832   // can be reported later (in strict mode).
    833   // We don't report the error immediately, because the octal escape can
    834   // occur before the "use strict" directive.
    835   if (c != '0' || i > 0) {
    836     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    837   }
    838   return x;
    839 }
    840 
    841 
    842 Token::Value Scanner::ScanString() {
    843   uc32 quote = c0_;
    844   Advance<false, false>();  // consume quote
    845 
    846   LiteralScope literal(this);
    847   while (true) {
    848     if (c0_ > kMaxAscii) {
    849       HandleLeadSurrogate();
    850       break;
    851     }
    852     if (c0_ < 0 || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
    853     if (c0_ == quote) {
    854       literal.Complete();
    855       Advance<false, false>();
    856       return Token::STRING;
    857     }
    858     char c = static_cast<char>(c0_);
    859     if (c == '\\') break;
    860     Advance<false, false>();
    861     AddLiteralChar(c);
    862   }
    863 
    864   while (c0_ != quote && c0_ >= 0
    865          && !unicode_cache_->IsLineTerminator(c0_)) {
    866     uc32 c = c0_;
    867     Advance();
    868     if (c == '\\') {
    869       if (c0_ < 0 || !ScanEscape<false, false>()) {
    870         return Token::ILLEGAL;
    871       }
    872     } else {
    873       AddLiteralChar(c);
    874     }
    875   }
    876   if (c0_ != quote) return Token::ILLEGAL;
    877   literal.Complete();
    878 
    879   Advance();  // consume quote
    880   return Token::STRING;
    881 }
    882 
    883 
    884 Token::Value Scanner::ScanTemplateSpan() {
    885   // When scanning a TemplateSpan, we are looking for the following construct:
    886   // TEMPLATE_SPAN ::
    887   //     ` LiteralChars* ${
    888   //   | } LiteralChars* ${
    889   //
    890   // TEMPLATE_TAIL ::
    891   //     ` LiteralChars* `
    892   //   | } LiteralChar* `
    893   //
    894   // A TEMPLATE_SPAN should always be followed by an Expression, while a
    895   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
    896   // followed by an Expression.
    897 
    898   Token::Value result = Token::TEMPLATE_SPAN;
    899   LiteralScope literal(this);
    900   StartRawLiteral();
    901   const bool capture_raw = true;
    902   const bool in_template_literal = true;
    903   while (true) {
    904     uc32 c = c0_;
    905     Advance<capture_raw>();
    906     if (c == '`') {
    907       result = Token::TEMPLATE_TAIL;
    908       ReduceRawLiteralLength(1);
    909       break;
    910     } else if (c == '$' && c0_ == '{') {
    911       Advance<capture_raw>();  // Consume '{'
    912       ReduceRawLiteralLength(2);
    913       break;
    914     } else if (c == '\\') {
    915       if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) {
    916         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
    917         // code unit sequence.
    918         uc32 lastChar = c0_;
    919         Advance<capture_raw>();
    920         if (lastChar == '\r') {
    921           ReduceRawLiteralLength(1);  // Remove \r
    922           if (c0_ == '\n') {
    923             Advance<capture_raw>();  // Adds \n
    924           } else {
    925             AddRawLiteralChar('\n');
    926           }
    927         }
    928       } else if (!ScanEscape<capture_raw, in_template_literal>()) {
    929         return Token::ILLEGAL;
    930       }
    931     } else if (c < 0) {
    932       // Unterminated template literal
    933       PushBack(c);
    934       break;
    935     } else {
    936       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
    937       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
    938       // consisting of the CV 0x000A.
    939       if (c == '\r') {
    940         ReduceRawLiteralLength(1);  // Remove \r
    941         if (c0_ == '\n') {
    942           Advance<capture_raw>();  // Adds \n
    943         } else {
    944           AddRawLiteralChar('\n');
    945         }
    946         c = '\n';
    947       }
    948       AddLiteralChar(c);
    949     }
    950   }
    951   literal.Complete();
    952   next_.location.end_pos = source_pos();
    953   next_.token = result;
    954   return result;
    955 }
    956 
    957 
    958 Token::Value Scanner::ScanTemplateStart() {
    959   DCHECK(c0_ == '`');
    960   next_.location.beg_pos = source_pos();
    961   Advance();  // Consume `
    962   return ScanTemplateSpan();
    963 }
    964 
    965 
    966 Token::Value Scanner::ScanTemplateContinuation() {
    967   DCHECK_EQ(next_.token, Token::RBRACE);
    968   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
    969   return ScanTemplateSpan();
    970 }
    971 
    972 
    973 void Scanner::ScanDecimalDigits() {
    974   while (IsDecimalDigit(c0_))
    975     AddLiteralCharAdvance();
    976 }
    977 
    978 
    979 Token::Value Scanner::ScanNumber(bool seen_period) {
    980   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
    981 
    982   enum {
    983     DECIMAL,
    984     DECIMAL_WITH_LEADING_ZERO,
    985     HEX,
    986     OCTAL,
    987     IMPLICIT_OCTAL,
    988     BINARY
    989   } kind = DECIMAL;
    990 
    991   LiteralScope literal(this);
    992   bool at_start = !seen_period;
    993   int start_pos = source_pos();  // For reporting octal positions.
    994   if (seen_period) {
    995     // we have already seen a decimal point of the float
    996     AddLiteralChar('.');
    997     ScanDecimalDigits();  // we know we have at least one digit
    998 
    999   } else {
   1000     // if the first character is '0' we must check for octals and hex
   1001     if (c0_ == '0') {
   1002       AddLiteralCharAdvance();
   1003 
   1004       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
   1005       // an octal number.
   1006       if (c0_ == 'x' || c0_ == 'X') {
   1007         // hex number
   1008         kind = HEX;
   1009         AddLiteralCharAdvance();
   1010         if (!IsHexDigit(c0_)) {
   1011           // we must have at least one hex digit after 'x'/'X'
   1012           return Token::ILLEGAL;
   1013         }
   1014         while (IsHexDigit(c0_)) {
   1015           AddLiteralCharAdvance();
   1016         }
   1017       } else if (c0_ == 'o' || c0_ == 'O') {
   1018         kind = OCTAL;
   1019         AddLiteralCharAdvance();
   1020         if (!IsOctalDigit(c0_)) {
   1021           // we must have at least one octal digit after 'o'/'O'
   1022           return Token::ILLEGAL;
   1023         }
   1024         while (IsOctalDigit(c0_)) {
   1025           AddLiteralCharAdvance();
   1026         }
   1027       } else if (c0_ == 'b' || c0_ == 'B') {
   1028         kind = BINARY;
   1029         AddLiteralCharAdvance();
   1030         if (!IsBinaryDigit(c0_)) {
   1031           // we must have at least one binary digit after 'b'/'B'
   1032           return Token::ILLEGAL;
   1033         }
   1034         while (IsBinaryDigit(c0_)) {
   1035           AddLiteralCharAdvance();
   1036         }
   1037       } else if ('0' <= c0_ && c0_ <= '7') {
   1038         // (possible) octal number
   1039         kind = IMPLICIT_OCTAL;
   1040         while (true) {
   1041           if (c0_ == '8' || c0_ == '9') {
   1042             at_start = false;
   1043             kind = DECIMAL_WITH_LEADING_ZERO;
   1044             break;
   1045           }
   1046           if (c0_  < '0' || '7'  < c0_) {
   1047             // Octal literal finished.
   1048             octal_pos_ = Location(start_pos, source_pos());
   1049             break;
   1050           }
   1051           AddLiteralCharAdvance();
   1052         }
   1053       } else if (c0_ == '8' || c0_ == '9') {
   1054         kind = DECIMAL_WITH_LEADING_ZERO;
   1055       }
   1056     }
   1057 
   1058     // Parse decimal digits and allow trailing fractional part.
   1059     if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
   1060       if (at_start) {
   1061         uint64_t value = 0;
   1062         while (IsDecimalDigit(c0_)) {
   1063           value = 10 * value + (c0_ - '0');
   1064 
   1065           uc32 first_char = c0_;
   1066           Advance<false, false>();
   1067           AddLiteralChar(first_char);
   1068         }
   1069 
   1070         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
   1071             value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
   1072           next_.smi_value_ = static_cast<int>(value);
   1073           literal.Complete();
   1074           HandleLeadSurrogate();
   1075 
   1076           if (kind == DECIMAL_WITH_LEADING_ZERO)
   1077             decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
   1078           return Token::SMI;
   1079         }
   1080         HandleLeadSurrogate();
   1081       }
   1082 
   1083       ScanDecimalDigits();  // optional
   1084       if (c0_ == '.') {
   1085         AddLiteralCharAdvance();
   1086         ScanDecimalDigits();  // optional
   1087       }
   1088     }
   1089   }
   1090 
   1091   // scan exponent, if any
   1092   if (c0_ == 'e' || c0_ == 'E') {
   1093     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
   1094     if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
   1095       return Token::ILLEGAL;
   1096     // scan exponent
   1097     AddLiteralCharAdvance();
   1098     if (c0_ == '+' || c0_ == '-')
   1099       AddLiteralCharAdvance();
   1100     if (!IsDecimalDigit(c0_)) {
   1101       // we must have at least one decimal digit after 'e'/'E'
   1102       return Token::ILLEGAL;
   1103     }
   1104     ScanDecimalDigits();
   1105   }
   1106 
   1107   // The source character immediately following a numeric literal must
   1108   // not be an identifier start or a decimal digit; see ECMA-262
   1109   // section 7.8.3, page 17 (note that we read only one decimal digit
   1110   // if the value is 0).
   1111   if (IsDecimalDigit(c0_) ||
   1112       (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_)))
   1113     return Token::ILLEGAL;
   1114 
   1115   literal.Complete();
   1116 
   1117   if (kind == DECIMAL_WITH_LEADING_ZERO)
   1118     decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
   1119   return Token::NUMBER;
   1120 }
   1121 
   1122 
   1123 uc32 Scanner::ScanIdentifierUnicodeEscape() {
   1124   Advance();
   1125   if (c0_ != 'u') return -1;
   1126   Advance();
   1127   return ScanUnicodeEscape<false>();
   1128 }
   1129 
   1130 
   1131 template <bool capture_raw>
   1132 uc32 Scanner::ScanUnicodeEscape() {
   1133   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
   1134   // hex digits between { } is arbitrary. \ and u have already been read.
   1135   if (c0_ == '{') {
   1136     int begin = source_pos() - 2;
   1137     Advance<capture_raw>();
   1138     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
   1139     if (cp < 0 || c0_ != '}') {
   1140       ReportScannerError(source_pos(),
   1141                          MessageTemplate::kInvalidUnicodeEscapeSequence);
   1142       return -1;
   1143     }
   1144     Advance<capture_raw>();
   1145     return cp;
   1146   }
   1147   const bool unicode = true;
   1148   return ScanHexNumber<capture_raw, unicode>(4);
   1149 }
   1150 
   1151 
   1152 // ----------------------------------------------------------------------------
   1153 // Keyword Matcher
   1154 
   1155 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
   1156   KEYWORD_GROUP('a')                                        \
   1157   KEYWORD("async", Token::ASYNC)                            \
   1158   KEYWORD("await", Token::AWAIT)                            \
   1159   KEYWORD_GROUP('b')                                        \
   1160   KEYWORD("break", Token::BREAK)                            \
   1161   KEYWORD_GROUP('c')                                        \
   1162   KEYWORD("case", Token::CASE)                              \
   1163   KEYWORD("catch", Token::CATCH)                            \
   1164   KEYWORD("class", Token::CLASS)                            \
   1165   KEYWORD("const", Token::CONST)                            \
   1166   KEYWORD("continue", Token::CONTINUE)                      \
   1167   KEYWORD_GROUP('d')                                        \
   1168   KEYWORD("debugger", Token::DEBUGGER)                      \
   1169   KEYWORD("default", Token::DEFAULT)                        \
   1170   KEYWORD("delete", Token::DELETE)                          \
   1171   KEYWORD("do", Token::DO)                                  \
   1172   KEYWORD_GROUP('e')                                        \
   1173   KEYWORD("else", Token::ELSE)                              \
   1174   KEYWORD("enum", Token::ENUM)                              \
   1175   KEYWORD("export", Token::EXPORT)                          \
   1176   KEYWORD("extends", Token::EXTENDS)                        \
   1177   KEYWORD_GROUP('f')                                        \
   1178   KEYWORD("false", Token::FALSE_LITERAL)                    \
   1179   KEYWORD("finally", Token::FINALLY)                        \
   1180   KEYWORD("for", Token::FOR)                                \
   1181   KEYWORD("function", Token::FUNCTION)                      \
   1182   KEYWORD_GROUP('i')                                        \
   1183   KEYWORD("if", Token::IF)                                  \
   1184   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
   1185   KEYWORD("import", Token::IMPORT)                          \
   1186   KEYWORD("in", Token::IN)                                  \
   1187   KEYWORD("instanceof", Token::INSTANCEOF)                  \
   1188   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
   1189   KEYWORD_GROUP('l')                                        \
   1190   KEYWORD("let", Token::LET)                                \
   1191   KEYWORD_GROUP('n')                                        \
   1192   KEYWORD("new", Token::NEW)                                \
   1193   KEYWORD("null", Token::NULL_LITERAL)                      \
   1194   KEYWORD_GROUP('p')                                        \
   1195   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
   1196   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
   1197   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
   1198   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
   1199   KEYWORD_GROUP('r')                                        \
   1200   KEYWORD("return", Token::RETURN)                          \
   1201   KEYWORD_GROUP('s')                                        \
   1202   KEYWORD("static", Token::STATIC)                          \
   1203   KEYWORD("super", Token::SUPER)                            \
   1204   KEYWORD("switch", Token::SWITCH)                          \
   1205   KEYWORD_GROUP('t')                                        \
   1206   KEYWORD("this", Token::THIS)                              \
   1207   KEYWORD("throw", Token::THROW)                            \
   1208   KEYWORD("true", Token::TRUE_LITERAL)                      \
   1209   KEYWORD("try", Token::TRY)                                \
   1210   KEYWORD("typeof", Token::TYPEOF)                          \
   1211   KEYWORD_GROUP('v')                                        \
   1212   KEYWORD("var", Token::VAR)                                \
   1213   KEYWORD("void", Token::VOID)                              \
   1214   KEYWORD_GROUP('w')                                        \
   1215   KEYWORD("while", Token::WHILE)                            \
   1216   KEYWORD("with", Token::WITH)                              \
   1217   KEYWORD_GROUP('y')                                        \
   1218   KEYWORD("yield", Token::YIELD)
   1219 
   1220 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
   1221                                              int input_length, bool escaped) {
   1222   DCHECK(input_length >= 1);
   1223   const int kMinLength = 2;
   1224   const int kMaxLength = 10;
   1225   if (input_length < kMinLength || input_length > kMaxLength) {
   1226     return Token::IDENTIFIER;
   1227   }
   1228   switch (input[0]) {
   1229     default:
   1230 #define KEYWORD_GROUP_CASE(ch)                                \
   1231       break;                                                  \
   1232     case ch:
   1233 #define KEYWORD(keyword, token)                                     \
   1234   {                                                                 \
   1235     /* 'keyword' is a char array, so sizeof(keyword) is */          \
   1236     /* strlen(keyword) plus 1 for the NUL char. */                  \
   1237     const int keyword_length = sizeof(keyword) - 1;                 \
   1238     STATIC_ASSERT(keyword_length >= kMinLength);                    \
   1239     STATIC_ASSERT(keyword_length <= kMaxLength);                    \
   1240     if (input_length == keyword_length && input[1] == keyword[1] && \
   1241         (keyword_length <= 2 || input[2] == keyword[2]) &&          \
   1242         (keyword_length <= 3 || input[3] == keyword[3]) &&          \
   1243         (keyword_length <= 4 || input[4] == keyword[4]) &&          \
   1244         (keyword_length <= 5 || input[5] == keyword[5]) &&          \
   1245         (keyword_length <= 6 || input[6] == keyword[6]) &&          \
   1246         (keyword_length <= 7 || input[7] == keyword[7]) &&          \
   1247         (keyword_length <= 8 || input[8] == keyword[8]) &&          \
   1248         (keyword_length <= 9 || input[9] == keyword[9])) {          \
   1249       if (escaped) {                                                \
   1250         /* TODO(adamk): YIELD should be handled specially. */       \
   1251         return (token == Token::FUTURE_STRICT_RESERVED_WORD ||      \
   1252                 token == Token::LET || token == Token::STATIC)      \
   1253                    ? Token::ESCAPED_STRICT_RESERVED_WORD            \
   1254                    : Token::ESCAPED_KEYWORD;                        \
   1255       }                                                             \
   1256       return token;                                                 \
   1257     }                                                               \
   1258   }
   1259     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
   1260   }
   1261   return Token::IDENTIFIER;
   1262 }
   1263 
   1264 
   1265 bool Scanner::IdentifierIsFutureStrictReserved(
   1266     const AstRawString* string) const {
   1267   // Keywords are always 1-byte strings.
   1268   if (!string->is_one_byte()) return false;
   1269   if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
   1270       string->IsOneByteEqualTo("yield")) {
   1271     return true;
   1272   }
   1273   return Token::FUTURE_STRICT_RESERVED_WORD ==
   1274          KeywordOrIdentifierToken(string->raw_data(), string->length(), false);
   1275 }
   1276 
   1277 
   1278 Token::Value Scanner::ScanIdentifierOrKeyword() {
   1279   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
   1280   LiteralScope literal(this);
   1281   if (IsInRange(c0_, 'a', 'z')) {
   1282     do {
   1283       char first_char = static_cast<char>(c0_);
   1284       Advance<false, false>();
   1285       AddLiteralChar(first_char);
   1286     } while (IsInRange(c0_, 'a', 'z'));
   1287 
   1288     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
   1289         c0_ == '$') {
   1290       // Identifier starting with lowercase.
   1291       char first_char = static_cast<char>(c0_);
   1292       Advance<false, false>();
   1293       AddLiteralChar(first_char);
   1294       while (IsAsciiIdentifier(c0_)) {
   1295         char first_char = static_cast<char>(c0_);
   1296         Advance<false, false>();
   1297         AddLiteralChar(first_char);
   1298       }
   1299       if (c0_ <= kMaxAscii && c0_ != '\\') {
   1300         literal.Complete();
   1301         return Token::IDENTIFIER;
   1302       }
   1303     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
   1304       // Only a-z+: could be a keyword or identifier.
   1305       literal.Complete();
   1306       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1307       return KeywordOrIdentifierToken(chars.start(), chars.length(), false);
   1308     }
   1309 
   1310     HandleLeadSurrogate();
   1311   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
   1312     do {
   1313       char first_char = static_cast<char>(c0_);
   1314       Advance<false, false>();
   1315       AddLiteralChar(first_char);
   1316     } while (IsAsciiIdentifier(c0_));
   1317 
   1318     if (c0_ <= kMaxAscii && c0_ != '\\') {
   1319       literal.Complete();
   1320       return Token::IDENTIFIER;
   1321     }
   1322 
   1323     HandleLeadSurrogate();
   1324   } else if (c0_ == '\\') {
   1325     // Scan identifier start character.
   1326     uc32 c = ScanIdentifierUnicodeEscape();
   1327     // Only allow legal identifier start characters.
   1328     if (c < 0 ||
   1329         c == '\\' ||  // No recursive escapes.
   1330         !unicode_cache_->IsIdentifierStart(c)) {
   1331       return Token::ILLEGAL;
   1332     }
   1333     AddLiteralChar(c);
   1334     return ScanIdentifierSuffix(&literal, true);
   1335   } else {
   1336     uc32 first_char = c0_;
   1337     Advance();
   1338     AddLiteralChar(first_char);
   1339   }
   1340 
   1341   // Scan the rest of the identifier characters.
   1342   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
   1343     if (c0_ != '\\') {
   1344       uc32 next_char = c0_;
   1345       Advance();
   1346       AddLiteralChar(next_char);
   1347       continue;
   1348     }
   1349     // Fallthrough if no longer able to complete keyword.
   1350     return ScanIdentifierSuffix(&literal, false);
   1351   }
   1352 
   1353   literal.Complete();
   1354 
   1355   if (next_.literal_chars->is_one_byte()) {
   1356     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1357     return KeywordOrIdentifierToken(chars.start(), chars.length(), false);
   1358   }
   1359   return Token::IDENTIFIER;
   1360 }
   1361 
   1362 
   1363 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
   1364                                            bool escaped) {
   1365   // Scan the rest of the identifier characters.
   1366   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
   1367     if (c0_ == '\\') {
   1368       uc32 c = ScanIdentifierUnicodeEscape();
   1369       escaped = true;
   1370       // Only allow legal identifier part characters.
   1371       if (c < 0 ||
   1372           c == '\\' ||
   1373           !unicode_cache_->IsIdentifierPart(c)) {
   1374         return Token::ILLEGAL;
   1375       }
   1376       AddLiteralChar(c);
   1377     } else {
   1378       AddLiteralChar(c0_);
   1379       Advance();
   1380     }
   1381   }
   1382   literal->Complete();
   1383 
   1384   if (escaped && next_.literal_chars->is_one_byte()) {
   1385     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1386     return KeywordOrIdentifierToken(chars.start(), chars.length(), true);
   1387   }
   1388   return Token::IDENTIFIER;
   1389 }
   1390 
   1391 
   1392 bool Scanner::ScanRegExpPattern(bool seen_equal) {
   1393   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1394   bool in_character_class = false;
   1395 
   1396   // Previous token is either '/' or '/=', in the second case, the
   1397   // pattern starts at =.
   1398   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1399   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1400 
   1401   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1402   // the scanner should pass uninterpreted bodies to the RegExp
   1403   // constructor.
   1404   LiteralScope literal(this);
   1405   if (seen_equal) {
   1406     AddLiteralChar('=');
   1407   }
   1408 
   1409   while (c0_ != '/' || in_character_class) {
   1410     if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
   1411     if (c0_ == '\\') {  // Escape sequence.
   1412       AddLiteralCharAdvance();
   1413       if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
   1414       AddLiteralCharAdvance();
   1415       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1416       // only "safe" characters are allowed (letters, digits, underscore),
   1417       // otherwise the escape isn't valid and the invalid character has
   1418       // its normal meaning. I.e., we can just continue scanning without
   1419       // worrying whether the following characters are part of the escape
   1420       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1421       // of the escape sequence.
   1422 
   1423       // TODO(896): At some point, parse RegExps more throughly to capture
   1424       // octal esacpes in strict mode.
   1425     } else {  // Unescaped character.
   1426       if (c0_ == '[') in_character_class = true;
   1427       if (c0_ == ']') in_character_class = false;
   1428       AddLiteralCharAdvance();
   1429     }
   1430   }
   1431   Advance();  // consume '/'
   1432 
   1433   literal.Complete();
   1434 
   1435   return true;
   1436 }
   1437 
   1438 
   1439 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
   1440   // Scan regular expression flags.
   1441   LiteralScope literal(this);
   1442   int flags = 0;
   1443   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
   1444     RegExp::Flags flag = RegExp::kNone;
   1445     switch (c0_) {
   1446       case 'g':
   1447         flag = RegExp::kGlobal;
   1448         break;
   1449       case 'i':
   1450         flag = RegExp::kIgnoreCase;
   1451         break;
   1452       case 'm':
   1453         flag = RegExp::kMultiline;
   1454         break;
   1455       case 'u':
   1456         flag = RegExp::kUnicode;
   1457         break;
   1458       case 'y':
   1459         flag = RegExp::kSticky;
   1460         break;
   1461       default:
   1462         return Nothing<RegExp::Flags>();
   1463     }
   1464     if (flags & flag) return Nothing<RegExp::Flags>();
   1465     AddLiteralCharAdvance();
   1466     flags |= flag;
   1467   }
   1468   literal.Complete();
   1469 
   1470   next_.location.end_pos = source_pos();
   1471   return Just(RegExp::Flags(flags));
   1472 }
   1473 
   1474 
   1475 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
   1476   if (is_literal_one_byte()) {
   1477     return ast_value_factory->GetOneByteString(literal_one_byte_string());
   1478   }
   1479   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
   1480 }
   1481 
   1482 
   1483 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
   1484   if (is_next_literal_one_byte()) {
   1485     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
   1486   }
   1487   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
   1488 }
   1489 
   1490 
   1491 const AstRawString* Scanner::CurrentRawSymbol(
   1492     AstValueFactory* ast_value_factory) {
   1493   if (is_raw_literal_one_byte()) {
   1494     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
   1495   }
   1496   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
   1497 }
   1498 
   1499 
   1500 double Scanner::DoubleValue() {
   1501   DCHECK(is_literal_one_byte());
   1502   return StringToDouble(
   1503       unicode_cache_,
   1504       literal_one_byte_string(),
   1505       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
   1506 }
   1507 
   1508 
   1509 bool Scanner::ContainsDot() {
   1510   DCHECK(is_literal_one_byte());
   1511   Vector<const uint8_t> str = literal_one_byte_string();
   1512   return std::find(str.begin(), str.end(), '.') != str.end();
   1513 }
   1514 
   1515 
   1516 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
   1517   if (is_literal_one_byte()) {
   1518     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
   1519   }
   1520   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
   1521 }
   1522 
   1523 
   1524 bool Scanner::SetBookmark() {
   1525   if (c0_ != kNoBookmark && bookmark_c0_ == kNoBookmark &&
   1526       next_next_.token == Token::UNINITIALIZED && source_->SetBookmark()) {
   1527     bookmark_c0_ = c0_;
   1528     CopyTokenDesc(&bookmark_current_, &current_);
   1529     CopyTokenDesc(&bookmark_next_, &next_);
   1530     return true;
   1531   }
   1532   return false;
   1533 }
   1534 
   1535 
   1536 void Scanner::ResetToBookmark() {
   1537   DCHECK(BookmarkHasBeenSet());  // Caller hasn't called SetBookmark.
   1538 
   1539   source_->ResetToBookmark();
   1540   c0_ = bookmark_c0_;
   1541   StartLiteral();
   1542   StartRawLiteral();
   1543   CopyTokenDesc(&next_, &bookmark_current_);
   1544   current_ = next_;
   1545   StartLiteral();
   1546   StartRawLiteral();
   1547   CopyTokenDesc(&next_, &bookmark_next_);
   1548 
   1549   bookmark_c0_ = kBookmarkWasApplied;
   1550 }
   1551 
   1552 
   1553 bool Scanner::BookmarkHasBeenSet() { return bookmark_c0_ >= 0; }
   1554 
   1555 
   1556 bool Scanner::BookmarkHasBeenReset() {
   1557   return bookmark_c0_ == kBookmarkWasApplied;
   1558 }
   1559 
   1560 
   1561 void Scanner::DropBookmark() { bookmark_c0_ = kNoBookmark; }
   1562 
   1563 
   1564 void Scanner::CopyTokenDesc(TokenDesc* to, TokenDesc* from) {
   1565   DCHECK_NOT_NULL(to);
   1566   DCHECK_NOT_NULL(from);
   1567   to->token = from->token;
   1568   to->location = from->location;
   1569   to->literal_chars->CopyFrom(from->literal_chars);
   1570   to->raw_literal_chars->CopyFrom(from->raw_literal_chars);
   1571 }
   1572 
   1573 
   1574 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
   1575   return AddSymbol(key, true, value);
   1576 }
   1577 
   1578 
   1579 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
   1580   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
   1581 }
   1582 
   1583 
   1584 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
   1585                                bool is_one_byte,
   1586                                int value) {
   1587   uint32_t hash = Hash(key, is_one_byte);
   1588   byte* encoding = BackupKey(key, is_one_byte);
   1589   base::HashMap::Entry* entry = map_.LookupOrInsert(encoding, hash);
   1590   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
   1591   entry->value =
   1592     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
   1593   return old_value;
   1594 }
   1595 
   1596 
   1597 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
   1598   DCHECK(key.length() > 0);
   1599   // Quick check for already being in canonical form.
   1600   if (IsNumberCanonical(key)) {
   1601     return AddOneByteSymbol(key, value);
   1602   }
   1603 
   1604   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
   1605   double double_value = StringToDouble(
   1606       unicode_constants_, key, flags, 0.0);
   1607   int length;
   1608   const char* string;
   1609   if (!std::isfinite(double_value)) {
   1610     string = "Infinity";
   1611     length = 8;  // strlen("Infinity");
   1612   } else {
   1613     string = DoubleToCString(double_value,
   1614                              Vector<char>(number_buffer_, kBufferSize));
   1615     length = StrLength(string);
   1616   }
   1617   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
   1618                                       length), true, value);
   1619 }
   1620 
   1621 
   1622 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
   1623   // Test for a safe approximation of number literals that are already
   1624   // in canonical form: max 15 digits, no leading zeroes, except an
   1625   // integer part that is a single zero, and no trailing zeros below
   1626   // the decimal point.
   1627   int pos = 0;
   1628   int length = number.length();
   1629   if (number.length() > 15) return false;
   1630   if (number[pos] == '0') {
   1631     pos++;
   1632   } else {
   1633     while (pos < length &&
   1634            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
   1635   }
   1636   if (length == pos) return true;
   1637   if (number[pos] != '.') return false;
   1638   pos++;
   1639   bool invalid_last_digit = true;
   1640   while (pos < length) {
   1641     uint8_t digit = number[pos] - '0';
   1642     if (digit > '9' - '0') return false;
   1643     invalid_last_digit = (digit == 0);
   1644     pos++;
   1645   }
   1646   return !invalid_last_digit;
   1647 }
   1648 
   1649 
   1650 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
   1651   // Primitive hash function, almost identical to the one used
   1652   // for strings (except that it's seeded by the length and representation).
   1653   int length = key.length();
   1654   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0);
   1655   for (int i = 0; i < length; i++) {
   1656     uint32_t c = key[i];
   1657     hash = (hash + c) * 1025;
   1658     hash ^= (hash >> 6);
   1659   }
   1660   return hash;
   1661 }
   1662 
   1663 
   1664 bool DuplicateFinder::Match(void* first, void* second) {
   1665   // Decode lengths.
   1666   // Length + representation is encoded as base 128, most significant heptet
   1667   // first, with a 8th bit being non-zero while there are more heptets.
   1668   // The value encodes the number of bytes following, and whether the original
   1669   // was Latin1.
   1670   byte* s1 = reinterpret_cast<byte*>(first);
   1671   byte* s2 = reinterpret_cast<byte*>(second);
   1672   uint32_t length_one_byte_field = 0;
   1673   byte c1;
   1674   do {
   1675     c1 = *s1;
   1676     if (c1 != *s2) return false;
   1677     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
   1678     s1++;
   1679     s2++;
   1680   } while ((c1 & 0x80) != 0);
   1681   int length = static_cast<int>(length_one_byte_field >> 1);
   1682   return memcmp(s1, s2, length) == 0;
   1683 }
   1684 
   1685 
   1686 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
   1687                                  bool is_one_byte) {
   1688   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
   1689   backing_store_.StartSequence();
   1690   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
   1691   // on the byte of every heptet except the last, least significant, one.
   1692   if (one_byte_length >= (1 << 7)) {
   1693     if (one_byte_length >= (1 << 14)) {
   1694       if (one_byte_length >= (1 << 21)) {
   1695         if (one_byte_length >= (1 << 28)) {
   1696           backing_store_.Add(
   1697               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
   1698         }
   1699         backing_store_.Add(
   1700             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
   1701       }
   1702       backing_store_.Add(
   1703           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
   1704     }
   1705     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
   1706   }
   1707   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
   1708 
   1709   backing_store_.AddBlock(bytes);
   1710   return backing_store_.EndSequence().start();
   1711 }
   1712 
   1713 }  // namespace internal
   1714 }  // namespace v8
   1715