Home | History | Annotate | Download | only in parsing
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Features shared by parsing and pre-parsing scanners.
      6 
      7 #include "src/parsing/scanner.h"
      8 
      9 #include <stdint.h>
     10 
     11 #include <cmath>
     12 
     13 #include "src/ast/ast-value-factory.h"
     14 #include "src/char-predicates-inl.h"
     15 #include "src/conversions-inl.h"
     16 #include "src/list-inl.h"
     17 #include "src/parsing/parser.h"
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 
     23 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const {
     24   if (is_one_byte()) {
     25     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
     26   }
     27   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
     28 }
     29 
     30 
     31 // Default implementation for streams that do not support bookmarks.
     32 bool Utf16CharacterStream::SetBookmark() { return false; }
     33 void Utf16CharacterStream::ResetToBookmark() { UNREACHABLE(); }
     34 
     35 
     36 // ----------------------------------------------------------------------------
     37 // Scanner
     38 
     39 Scanner::Scanner(UnicodeCache* unicode_cache)
     40     : unicode_cache_(unicode_cache),
     41       bookmark_c0_(kNoBookmark),
     42       octal_pos_(Location::invalid()) {
     43   bookmark_current_.literal_chars = &bookmark_current_literal_;
     44   bookmark_current_.raw_literal_chars = &bookmark_current_raw_literal_;
     45   bookmark_next_.literal_chars = &bookmark_next_literal_;
     46   bookmark_next_.raw_literal_chars = &bookmark_next_raw_literal_;
     47 }
     48 
     49 
     50 void Scanner::Initialize(Utf16CharacterStream* source) {
     51   source_ = source;
     52   // Need to capture identifiers in order to recognize "get" and "set"
     53   // in object literals.
     54   Init();
     55   // Skip initial whitespace allowing HTML comment ends just like
     56   // after a newline and scan first token.
     57   has_line_terminator_before_next_ = true;
     58   SkipWhiteSpace();
     59   Scan();
     60 }
     61 
     62 
     63 template <bool capture_raw>
     64 uc32 Scanner::ScanHexNumber(int expected_length) {
     65   DCHECK(expected_length <= 4);  // prevent overflow
     66 
     67   uc32 x = 0;
     68   for (int i = 0; i < expected_length; i++) {
     69     int d = HexValue(c0_);
     70     if (d < 0) {
     71       return -1;
     72     }
     73     x = x * 16 + d;
     74     Advance<capture_raw>();
     75   }
     76 
     77   return x;
     78 }
     79 
     80 
     81 template <bool capture_raw>
     82 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value) {
     83   uc32 x = 0;
     84   int d = HexValue(c0_);
     85   if (d < 0) {
     86     return -1;
     87   }
     88   while (d >= 0) {
     89     x = x * 16 + d;
     90     if (x > max_value) return -1;
     91     Advance<capture_raw>();
     92     d = HexValue(c0_);
     93   }
     94   return x;
     95 }
     96 
     97 
     98 // Ensure that tokens can be stored in a byte.
     99 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
    100 
    101 // Table of one-character tokens, by character (0x00..0x7f only).
    102 static const byte one_char_tokens[] = {
    103   Token::ILLEGAL,
    104   Token::ILLEGAL,
    105   Token::ILLEGAL,
    106   Token::ILLEGAL,
    107   Token::ILLEGAL,
    108   Token::ILLEGAL,
    109   Token::ILLEGAL,
    110   Token::ILLEGAL,
    111   Token::ILLEGAL,
    112   Token::ILLEGAL,
    113   Token::ILLEGAL,
    114   Token::ILLEGAL,
    115   Token::ILLEGAL,
    116   Token::ILLEGAL,
    117   Token::ILLEGAL,
    118   Token::ILLEGAL,
    119   Token::ILLEGAL,
    120   Token::ILLEGAL,
    121   Token::ILLEGAL,
    122   Token::ILLEGAL,
    123   Token::ILLEGAL,
    124   Token::ILLEGAL,
    125   Token::ILLEGAL,
    126   Token::ILLEGAL,
    127   Token::ILLEGAL,
    128   Token::ILLEGAL,
    129   Token::ILLEGAL,
    130   Token::ILLEGAL,
    131   Token::ILLEGAL,
    132   Token::ILLEGAL,
    133   Token::ILLEGAL,
    134   Token::ILLEGAL,
    135   Token::ILLEGAL,
    136   Token::ILLEGAL,
    137   Token::ILLEGAL,
    138   Token::ILLEGAL,
    139   Token::ILLEGAL,
    140   Token::ILLEGAL,
    141   Token::ILLEGAL,
    142   Token::ILLEGAL,
    143   Token::LPAREN,       // 0x28
    144   Token::RPAREN,       // 0x29
    145   Token::ILLEGAL,
    146   Token::ILLEGAL,
    147   Token::COMMA,        // 0x2c
    148   Token::ILLEGAL,
    149   Token::ILLEGAL,
    150   Token::ILLEGAL,
    151   Token::ILLEGAL,
    152   Token::ILLEGAL,
    153   Token::ILLEGAL,
    154   Token::ILLEGAL,
    155   Token::ILLEGAL,
    156   Token::ILLEGAL,
    157   Token::ILLEGAL,
    158   Token::ILLEGAL,
    159   Token::ILLEGAL,
    160   Token::ILLEGAL,
    161   Token::COLON,        // 0x3a
    162   Token::SEMICOLON,    // 0x3b
    163   Token::ILLEGAL,
    164   Token::ILLEGAL,
    165   Token::ILLEGAL,
    166   Token::CONDITIONAL,  // 0x3f
    167   Token::ILLEGAL,
    168   Token::ILLEGAL,
    169   Token::ILLEGAL,
    170   Token::ILLEGAL,
    171   Token::ILLEGAL,
    172   Token::ILLEGAL,
    173   Token::ILLEGAL,
    174   Token::ILLEGAL,
    175   Token::ILLEGAL,
    176   Token::ILLEGAL,
    177   Token::ILLEGAL,
    178   Token::ILLEGAL,
    179   Token::ILLEGAL,
    180   Token::ILLEGAL,
    181   Token::ILLEGAL,
    182   Token::ILLEGAL,
    183   Token::ILLEGAL,
    184   Token::ILLEGAL,
    185   Token::ILLEGAL,
    186   Token::ILLEGAL,
    187   Token::ILLEGAL,
    188   Token::ILLEGAL,
    189   Token::ILLEGAL,
    190   Token::ILLEGAL,
    191   Token::ILLEGAL,
    192   Token::ILLEGAL,
    193   Token::ILLEGAL,
    194   Token::LBRACK,     // 0x5b
    195   Token::ILLEGAL,
    196   Token::RBRACK,     // 0x5d
    197   Token::ILLEGAL,
    198   Token::ILLEGAL,
    199   Token::ILLEGAL,
    200   Token::ILLEGAL,
    201   Token::ILLEGAL,
    202   Token::ILLEGAL,
    203   Token::ILLEGAL,
    204   Token::ILLEGAL,
    205   Token::ILLEGAL,
    206   Token::ILLEGAL,
    207   Token::ILLEGAL,
    208   Token::ILLEGAL,
    209   Token::ILLEGAL,
    210   Token::ILLEGAL,
    211   Token::ILLEGAL,
    212   Token::ILLEGAL,
    213   Token::ILLEGAL,
    214   Token::ILLEGAL,
    215   Token::ILLEGAL,
    216   Token::ILLEGAL,
    217   Token::ILLEGAL,
    218   Token::ILLEGAL,
    219   Token::ILLEGAL,
    220   Token::ILLEGAL,
    221   Token::ILLEGAL,
    222   Token::ILLEGAL,
    223   Token::ILLEGAL,
    224   Token::ILLEGAL,
    225   Token::ILLEGAL,
    226   Token::LBRACE,       // 0x7b
    227   Token::ILLEGAL,
    228   Token::RBRACE,       // 0x7d
    229   Token::BIT_NOT,      // 0x7e
    230   Token::ILLEGAL
    231 };
    232 
    233 
    234 Token::Value Scanner::Next() {
    235   if (next_.token == Token::EOS) {
    236     next_.location.beg_pos = current_.location.beg_pos;
    237     next_.location.end_pos = current_.location.end_pos;
    238   }
    239   current_ = next_;
    240   if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
    241     next_ = next_next_;
    242     next_next_.token = Token::UNINITIALIZED;
    243     return current_.token;
    244   }
    245   has_line_terminator_before_next_ = false;
    246   has_multiline_comment_before_next_ = false;
    247   if (static_cast<unsigned>(c0_) <= 0x7f) {
    248     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    249     if (token != Token::ILLEGAL) {
    250       int pos = source_pos();
    251       next_.token = token;
    252       next_.location.beg_pos = pos;
    253       next_.location.end_pos = pos + 1;
    254       Advance();
    255       return current_.token;
    256     }
    257   }
    258   Scan();
    259   return current_.token;
    260 }
    261 
    262 
    263 Token::Value Scanner::PeekAhead() {
    264   if (next_next_.token != Token::UNINITIALIZED) {
    265     return next_next_.token;
    266   }
    267   TokenDesc prev = current_;
    268   Next();
    269   Token::Value ret = next_.token;
    270   next_next_ = next_;
    271   next_ = current_;
    272   current_ = prev;
    273   return ret;
    274 }
    275 
    276 
    277 // TODO(yangguo): check whether this is actually necessary.
    278 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
    279   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    280   // Unicode character; this implies that in a Unicode context the
    281   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    282   // character expressed in little-endian byte order (since it could
    283   // not be a U+FFFE character expressed in big-endian byte
    284   // order). Nevertheless, we check for it to be compatible with
    285   // Spidermonkey.
    286   return c == 0xFFFE;
    287 }
    288 
    289 
    290 bool Scanner::SkipWhiteSpace() {
    291   int start_position = source_pos();
    292 
    293   while (true) {
    294     while (true) {
    295       // The unicode cache accepts unsigned inputs.
    296       if (c0_ < 0) break;
    297       // Advance as long as character is a WhiteSpace or LineTerminator.
    298       // Remember if the latter is the case.
    299       if (unicode_cache_->IsLineTerminator(c0_)) {
    300         has_line_terminator_before_next_ = true;
    301       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
    302                  !IsLittleEndianByteOrderMark(c0_)) {
    303         break;
    304       }
    305       Advance();
    306     }
    307 
    308     // If there is an HTML comment end '-->' at the beginning of a
    309     // line (with only whitespace in front of it), we treat the rest
    310     // of the line as a comment. This is in line with the way
    311     // SpiderMonkey handles it.
    312     if (c0_ == '-' && has_line_terminator_before_next_) {
    313       Advance();
    314       if (c0_ == '-') {
    315         Advance();
    316         if (c0_ == '>') {
    317           // Treat the rest of the line as a comment.
    318           SkipSingleLineComment();
    319           // Continue skipping white space after the comment.
    320           continue;
    321         }
    322         PushBack('-');  // undo Advance()
    323       }
    324       PushBack('-');  // undo Advance()
    325     }
    326     // Return whether or not we skipped any characters.
    327     return source_pos() != start_position;
    328   }
    329 }
    330 
    331 
    332 Token::Value Scanner::SkipSingleLineComment() {
    333   Advance();
    334 
    335   // The line terminator at the end of the line is not considered
    336   // to be part of the single-line comment; it is recognized
    337   // separately by the lexical grammar and becomes part of the
    338   // stream of input elements for the syntactic grammar (see
    339   // ECMA-262, section 7.4).
    340   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    341     Advance();
    342   }
    343 
    344   return Token::WHITESPACE;
    345 }
    346 
    347 
    348 Token::Value Scanner::SkipSourceURLComment() {
    349   TryToParseSourceURLComment();
    350   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    351     Advance();
    352   }
    353 
    354   return Token::WHITESPACE;
    355 }
    356 
    357 
    358 void Scanner::TryToParseSourceURLComment() {
    359   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
    360   // function will just return if it cannot parse a magic comment.
    361   if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return;
    362   Advance();
    363   LiteralBuffer name;
    364   while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) &&
    365          c0_ != '=') {
    366     name.AddChar(c0_);
    367     Advance();
    368   }
    369   if (!name.is_one_byte()) return;
    370   Vector<const uint8_t> name_literal = name.one_byte_literal();
    371   LiteralBuffer* value;
    372   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
    373     value = &source_url_;
    374   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
    375     value = &source_mapping_url_;
    376   } else {
    377     return;
    378   }
    379   if (c0_ != '=')
    380     return;
    381   Advance();
    382   value->Reset();
    383   while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) {
    384     Advance();
    385   }
    386   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    387     // Disallowed characters.
    388     if (c0_ == '"' || c0_ == '\'') {
    389       value->Reset();
    390       return;
    391     }
    392     if (unicode_cache_->IsWhiteSpace(c0_)) {
    393       break;
    394     }
    395     value->AddChar(c0_);
    396     Advance();
    397   }
    398   // Allow whitespace at the end.
    399   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    400     if (!unicode_cache_->IsWhiteSpace(c0_)) {
    401       value->Reset();
    402       break;
    403     }
    404     Advance();
    405   }
    406 }
    407 
    408 
    409 Token::Value Scanner::SkipMultiLineComment() {
    410   DCHECK(c0_ == '*');
    411   Advance();
    412 
    413   while (c0_ >= 0) {
    414     uc32 ch = c0_;
    415     Advance();
    416     if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) {
    417       // Following ECMA-262, section 7.4, a comment containing
    418       // a newline will make the comment count as a line-terminator.
    419       has_multiline_comment_before_next_ = true;
    420     }
    421     // If we have reached the end of the multi-line comment, we
    422     // consume the '/' and insert a whitespace. This way all
    423     // multi-line comments are treated as whitespace.
    424     if (ch == '*' && c0_ == '/') {
    425       c0_ = ' ';
    426       return Token::WHITESPACE;
    427     }
    428   }
    429 
    430   // Unterminated multi-line comment.
    431   return Token::ILLEGAL;
    432 }
    433 
    434 
    435 Token::Value Scanner::ScanHtmlComment() {
    436   // Check for <!-- comments.
    437   DCHECK(c0_ == '!');
    438   Advance();
    439   if (c0_ == '-') {
    440     Advance();
    441     if (c0_ == '-') return SkipSingleLineComment();
    442     PushBack('-');  // undo Advance()
    443   }
    444   PushBack('!');  // undo Advance()
    445   DCHECK(c0_ == '!');
    446   return Token::LT;
    447 }
    448 
    449 
    450 void Scanner::Scan() {
    451   next_.literal_chars = NULL;
    452   next_.raw_literal_chars = NULL;
    453   Token::Value token;
    454   do {
    455     // Remember the position of the next token
    456     next_.location.beg_pos = source_pos();
    457 
    458     switch (c0_) {
    459       case ' ':
    460       case '\t':
    461         Advance();
    462         token = Token::WHITESPACE;
    463         break;
    464 
    465       case '\n':
    466         Advance();
    467         has_line_terminator_before_next_ = true;
    468         token = Token::WHITESPACE;
    469         break;
    470 
    471       case '"': case '\'':
    472         token = ScanString();
    473         break;
    474 
    475       case '<':
    476         // < <= << <<= <!--
    477         Advance();
    478         if (c0_ == '=') {
    479           token = Select(Token::LTE);
    480         } else if (c0_ == '<') {
    481           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    482         } else if (c0_ == '!') {
    483           token = ScanHtmlComment();
    484         } else {
    485           token = Token::LT;
    486         }
    487         break;
    488 
    489       case '>':
    490         // > >= >> >>= >>> >>>=
    491         Advance();
    492         if (c0_ == '=') {
    493           token = Select(Token::GTE);
    494         } else if (c0_ == '>') {
    495           // >> >>= >>> >>>=
    496           Advance();
    497           if (c0_ == '=') {
    498             token = Select(Token::ASSIGN_SAR);
    499           } else if (c0_ == '>') {
    500             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    501           } else {
    502             token = Token::SAR;
    503           }
    504         } else {
    505           token = Token::GT;
    506         }
    507         break;
    508 
    509       case '=':
    510         // = == === =>
    511         Advance();
    512         if (c0_ == '=') {
    513           token = Select('=', Token::EQ_STRICT, Token::EQ);
    514         } else if (c0_ == '>') {
    515           token = Select(Token::ARROW);
    516         } else {
    517           token = Token::ASSIGN;
    518         }
    519         break;
    520 
    521       case '!':
    522         // ! != !==
    523         Advance();
    524         if (c0_ == '=') {
    525           token = Select('=', Token::NE_STRICT, Token::NE);
    526         } else {
    527           token = Token::NOT;
    528         }
    529         break;
    530 
    531       case '+':
    532         // + ++ +=
    533         Advance();
    534         if (c0_ == '+') {
    535           token = Select(Token::INC);
    536         } else if (c0_ == '=') {
    537           token = Select(Token::ASSIGN_ADD);
    538         } else {
    539           token = Token::ADD;
    540         }
    541         break;
    542 
    543       case '-':
    544         // - -- --> -=
    545         Advance();
    546         if (c0_ == '-') {
    547           Advance();
    548           if (c0_ == '>' && has_line_terminator_before_next_) {
    549             // For compatibility with SpiderMonkey, we skip lines that
    550             // start with an HTML comment end '-->'.
    551             token = SkipSingleLineComment();
    552           } else {
    553             token = Token::DEC;
    554           }
    555         } else if (c0_ == '=') {
    556           token = Select(Token::ASSIGN_SUB);
    557         } else {
    558           token = Token::SUB;
    559         }
    560         break;
    561 
    562       case '*':
    563         // * *=
    564         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
    565         break;
    566 
    567       case '%':
    568         // % %=
    569         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    570         break;
    571 
    572       case '/':
    573         // /  // /* /=
    574         Advance();
    575         if (c0_ == '/') {
    576           Advance();
    577           if (c0_ == '#' || c0_ == '@') {
    578             Advance();
    579             token = SkipSourceURLComment();
    580           } else {
    581             PushBack(c0_);
    582             token = SkipSingleLineComment();
    583           }
    584         } else if (c0_ == '*') {
    585           token = SkipMultiLineComment();
    586         } else if (c0_ == '=') {
    587           token = Select(Token::ASSIGN_DIV);
    588         } else {
    589           token = Token::DIV;
    590         }
    591         break;
    592 
    593       case '&':
    594         // & && &=
    595         Advance();
    596         if (c0_ == '&') {
    597           token = Select(Token::AND);
    598         } else if (c0_ == '=') {
    599           token = Select(Token::ASSIGN_BIT_AND);
    600         } else {
    601           token = Token::BIT_AND;
    602         }
    603         break;
    604 
    605       case '|':
    606         // | || |=
    607         Advance();
    608         if (c0_ == '|') {
    609           token = Select(Token::OR);
    610         } else if (c0_ == '=') {
    611           token = Select(Token::ASSIGN_BIT_OR);
    612         } else {
    613           token = Token::BIT_OR;
    614         }
    615         break;
    616 
    617       case '^':
    618         // ^ ^=
    619         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    620         break;
    621 
    622       case '.':
    623         // . Number
    624         Advance();
    625         if (IsDecimalDigit(c0_)) {
    626           token = ScanNumber(true);
    627         } else {
    628           token = Token::PERIOD;
    629           if (c0_ == '.') {
    630             Advance();
    631             if (c0_ == '.') {
    632               Advance();
    633               token = Token::ELLIPSIS;
    634             } else {
    635               PushBack('.');
    636             }
    637           }
    638         }
    639         break;
    640 
    641       case ':':
    642         token = Select(Token::COLON);
    643         break;
    644 
    645       case ';':
    646         token = Select(Token::SEMICOLON);
    647         break;
    648 
    649       case ',':
    650         token = Select(Token::COMMA);
    651         break;
    652 
    653       case '(':
    654         token = Select(Token::LPAREN);
    655         break;
    656 
    657       case ')':
    658         token = Select(Token::RPAREN);
    659         break;
    660 
    661       case '[':
    662         token = Select(Token::LBRACK);
    663         break;
    664 
    665       case ']':
    666         token = Select(Token::RBRACK);
    667         break;
    668 
    669       case '{':
    670         token = Select(Token::LBRACE);
    671         break;
    672 
    673       case '}':
    674         token = Select(Token::RBRACE);
    675         break;
    676 
    677       case '?':
    678         token = Select(Token::CONDITIONAL);
    679         break;
    680 
    681       case '~':
    682         token = Select(Token::BIT_NOT);
    683         break;
    684 
    685       case '`':
    686         token = ScanTemplateStart();
    687         break;
    688 
    689       default:
    690         if (c0_ < 0) {
    691           token = Token::EOS;
    692         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
    693           token = ScanIdentifierOrKeyword();
    694         } else if (IsDecimalDigit(c0_)) {
    695           token = ScanNumber(false);
    696         } else if (SkipWhiteSpace()) {
    697           token = Token::WHITESPACE;
    698         } else {
    699           token = Select(Token::ILLEGAL);
    700         }
    701         break;
    702     }
    703 
    704     // Continue scanning for tokens as long as we're just skipping
    705     // whitespace.
    706   } while (token == Token::WHITESPACE);
    707 
    708   next_.location.end_pos = source_pos();
    709   next_.token = token;
    710 }
    711 
    712 
    713 void Scanner::SeekForward(int pos) {
    714   // After this call, we will have the token at the given position as
    715   // the "next" token. The "current" token will be invalid.
    716   if (pos == next_.location.beg_pos) return;
    717   int current_pos = source_pos();
    718   DCHECK_EQ(next_.location.end_pos, current_pos);
    719   // Positions inside the lookahead token aren't supported.
    720   DCHECK(pos >= current_pos);
    721   if (pos != current_pos) {
    722     source_->SeekForward(pos - source_->pos());
    723     Advance();
    724     // This function is only called to seek to the location
    725     // of the end of a function (at the "}" token). It doesn't matter
    726     // whether there was a line terminator in the part we skip.
    727     has_line_terminator_before_next_ = false;
    728     has_multiline_comment_before_next_ = false;
    729   }
    730   Scan();
    731 }
    732 
    733 
    734 template <bool capture_raw, bool in_template_literal>
    735 bool Scanner::ScanEscape() {
    736   uc32 c = c0_;
    737   Advance<capture_raw>();
    738 
    739   // Skip escaped newlines.
    740   if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) {
    741     // Allow CR+LF newlines in multiline string literals.
    742     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
    743     // Allow LF+CR newlines in multiline string literals.
    744     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
    745     return true;
    746   }
    747 
    748   switch (c) {
    749     case '\'':  // fall through
    750     case '"' :  // fall through
    751     case '\\': break;
    752     case 'b' : c = '\b'; break;
    753     case 'f' : c = '\f'; break;
    754     case 'n' : c = '\n'; break;
    755     case 'r' : c = '\r'; break;
    756     case 't' : c = '\t'; break;
    757     case 'u' : {
    758       c = ScanUnicodeEscape<capture_raw>();
    759       if (c < 0) return false;
    760       break;
    761     }
    762     case 'v':
    763       c = '\v';
    764       break;
    765     case 'x': {
    766       c = ScanHexNumber<capture_raw>(2);
    767       if (c < 0) return false;
    768       break;
    769     }
    770     case '0':  // Fall through.
    771     case '1':  // fall through
    772     case '2':  // fall through
    773     case '3':  // fall through
    774     case '4':  // fall through
    775     case '5':  // fall through
    776     case '6':  // fall through
    777     case '7':
    778       c = ScanOctalEscape<capture_raw>(c, 2);
    779       break;
    780   }
    781 
    782   // According to ECMA-262, section 7.8.4, characters not covered by the
    783   // above cases should be illegal, but they are commonly handled as
    784   // non-escaped characters by JS VMs.
    785   AddLiteralChar(c);
    786   return true;
    787 }
    788 
    789 
    790 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    791 // ECMA-262. Other JS VMs support them.
    792 template <bool capture_raw>
    793 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    794   uc32 x = c - '0';
    795   int i = 0;
    796   for (; i < length; i++) {
    797     int d = c0_ - '0';
    798     if (d < 0 || d > 7) break;
    799     int nx = x * 8 + d;
    800     if (nx >= 256) break;
    801     x = nx;
    802     Advance<capture_raw>();
    803   }
    804   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
    805   // Remember the position of octal escape sequences so that an error
    806   // can be reported later (in strict mode).
    807   // We don't report the error immediately, because the octal escape can
    808   // occur before the "use strict" directive.
    809   if (c != '0' || i > 0) {
    810     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    811   }
    812   return x;
    813 }
    814 
    815 
    816 const int kMaxAscii = 127;
    817 
    818 
    819 Token::Value Scanner::ScanString() {
    820   uc32 quote = c0_;
    821   Advance<false, false>();  // consume quote
    822 
    823   LiteralScope literal(this);
    824   while (true) {
    825     if (c0_ > kMaxAscii) {
    826       HandleLeadSurrogate();
    827       break;
    828     }
    829     if (c0_ < 0 || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
    830     if (c0_ == quote) {
    831       literal.Complete();
    832       Advance<false, false>();
    833       return Token::STRING;
    834     }
    835     uc32 c = c0_;
    836     if (c == '\\') break;
    837     Advance<false, false>();
    838     AddLiteralChar(c);
    839   }
    840 
    841   while (c0_ != quote && c0_ >= 0
    842          && !unicode_cache_->IsLineTerminator(c0_)) {
    843     uc32 c = c0_;
    844     Advance();
    845     if (c == '\\') {
    846       if (c0_ < 0 || !ScanEscape<false, false>()) return Token::ILLEGAL;
    847     } else {
    848       AddLiteralChar(c);
    849     }
    850   }
    851   if (c0_ != quote) return Token::ILLEGAL;
    852   literal.Complete();
    853 
    854   Advance();  // consume quote
    855   return Token::STRING;
    856 }
    857 
    858 
    859 Token::Value Scanner::ScanTemplateSpan() {
    860   // When scanning a TemplateSpan, we are looking for the following construct:
    861   // TEMPLATE_SPAN ::
    862   //     ` LiteralChars* ${
    863   //   | } LiteralChars* ${
    864   //
    865   // TEMPLATE_TAIL ::
    866   //     ` LiteralChars* `
    867   //   | } LiteralChar* `
    868   //
    869   // A TEMPLATE_SPAN should always be followed by an Expression, while a
    870   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
    871   // followed by an Expression.
    872 
    873   Token::Value result = Token::TEMPLATE_SPAN;
    874   LiteralScope literal(this);
    875   StartRawLiteral();
    876   const bool capture_raw = true;
    877   const bool in_template_literal = true;
    878 
    879   while (true) {
    880     uc32 c = c0_;
    881     Advance<capture_raw>();
    882     if (c == '`') {
    883       result = Token::TEMPLATE_TAIL;
    884       ReduceRawLiteralLength(1);
    885       break;
    886     } else if (c == '$' && c0_ == '{') {
    887       Advance<capture_raw>();  // Consume '{'
    888       ReduceRawLiteralLength(2);
    889       break;
    890     } else if (c == '\\') {
    891       if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) {
    892         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
    893         // code unit sequence.
    894         uc32 lastChar = c0_;
    895         Advance<capture_raw>();
    896         if (lastChar == '\r') {
    897           ReduceRawLiteralLength(1);  // Remove \r
    898           if (c0_ == '\n') {
    899             Advance<capture_raw>();  // Adds \n
    900           } else {
    901             AddRawLiteralChar('\n');
    902           }
    903         }
    904       } else if (!ScanEscape<capture_raw, in_template_literal>()) {
    905         return Token::ILLEGAL;
    906       }
    907     } else if (c < 0) {
    908       // Unterminated template literal
    909       PushBack(c);
    910       break;
    911     } else {
    912       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
    913       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
    914       // consisting of the CV 0x000A.
    915       if (c == '\r') {
    916         ReduceRawLiteralLength(1);  // Remove \r
    917         if (c0_ == '\n') {
    918           Advance<capture_raw>();  // Adds \n
    919         } else {
    920           AddRawLiteralChar('\n');
    921         }
    922         c = '\n';
    923       }
    924       AddLiteralChar(c);
    925     }
    926   }
    927   literal.Complete();
    928   next_.location.end_pos = source_pos();
    929   next_.token = result;
    930   return result;
    931 }
    932 
    933 
    934 Token::Value Scanner::ScanTemplateStart() {
    935   DCHECK(c0_ == '`');
    936   next_.location.beg_pos = source_pos();
    937   Advance();  // Consume `
    938   return ScanTemplateSpan();
    939 }
    940 
    941 
    942 Token::Value Scanner::ScanTemplateContinuation() {
    943   DCHECK_EQ(next_.token, Token::RBRACE);
    944   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
    945   return ScanTemplateSpan();
    946 }
    947 
    948 
    949 void Scanner::ScanDecimalDigits() {
    950   while (IsDecimalDigit(c0_))
    951     AddLiteralCharAdvance();
    952 }
    953 
    954 
    955 Token::Value Scanner::ScanNumber(bool seen_period) {
    956   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
    957 
    958   enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
    959 
    960   LiteralScope literal(this);
    961   bool at_start = !seen_period;
    962   if (seen_period) {
    963     // we have already seen a decimal point of the float
    964     AddLiteralChar('.');
    965     ScanDecimalDigits();  // we know we have at least one digit
    966 
    967   } else {
    968     // if the first character is '0' we must check for octals and hex
    969     if (c0_ == '0') {
    970       int start_pos = source_pos();  // For reporting octal positions.
    971       AddLiteralCharAdvance();
    972 
    973       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
    974       // an octal number.
    975       if (c0_ == 'x' || c0_ == 'X') {
    976         // hex number
    977         kind = HEX;
    978         AddLiteralCharAdvance();
    979         if (!IsHexDigit(c0_)) {
    980           // we must have at least one hex digit after 'x'/'X'
    981           return Token::ILLEGAL;
    982         }
    983         while (IsHexDigit(c0_)) {
    984           AddLiteralCharAdvance();
    985         }
    986       } else if (c0_ == 'o' || c0_ == 'O') {
    987         kind = OCTAL;
    988         AddLiteralCharAdvance();
    989         if (!IsOctalDigit(c0_)) {
    990           // we must have at least one octal digit after 'o'/'O'
    991           return Token::ILLEGAL;
    992         }
    993         while (IsOctalDigit(c0_)) {
    994           AddLiteralCharAdvance();
    995         }
    996       } else if (c0_ == 'b' || c0_ == 'B') {
    997         kind = BINARY;
    998         AddLiteralCharAdvance();
    999         if (!IsBinaryDigit(c0_)) {
   1000           // we must have at least one binary digit after 'b'/'B'
   1001           return Token::ILLEGAL;
   1002         }
   1003         while (IsBinaryDigit(c0_)) {
   1004           AddLiteralCharAdvance();
   1005         }
   1006       } else if ('0' <= c0_ && c0_ <= '7') {
   1007         // (possible) octal number
   1008         kind = IMPLICIT_OCTAL;
   1009         while (true) {
   1010           if (c0_ == '8' || c0_ == '9') {
   1011             at_start = false;
   1012             kind = DECIMAL;
   1013             break;
   1014           }
   1015           if (c0_  < '0' || '7'  < c0_) {
   1016             // Octal literal finished.
   1017             octal_pos_ = Location(start_pos, source_pos());
   1018             break;
   1019           }
   1020           AddLiteralCharAdvance();
   1021         }
   1022       }
   1023     }
   1024 
   1025     // Parse decimal digits and allow trailing fractional part.
   1026     if (kind == DECIMAL) {
   1027       if (at_start) {
   1028         uint64_t value = 0;
   1029         while (IsDecimalDigit(c0_)) {
   1030           value = 10 * value + (c0_ - '0');
   1031 
   1032           uc32 first_char = c0_;
   1033           Advance<false, false>();
   1034           AddLiteralChar(first_char);
   1035         }
   1036 
   1037         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
   1038             value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
   1039           next_.smi_value_ = static_cast<int>(value);
   1040           literal.Complete();
   1041           HandleLeadSurrogate();
   1042 
   1043           return Token::SMI;
   1044         }
   1045         HandleLeadSurrogate();
   1046       }
   1047 
   1048       ScanDecimalDigits();  // optional
   1049       if (c0_ == '.') {
   1050         AddLiteralCharAdvance();
   1051         ScanDecimalDigits();  // optional
   1052       }
   1053     }
   1054   }
   1055 
   1056   // scan exponent, if any
   1057   if (c0_ == 'e' || c0_ == 'E') {
   1058     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
   1059     if (kind != DECIMAL) return Token::ILLEGAL;
   1060     // scan exponent
   1061     AddLiteralCharAdvance();
   1062     if (c0_ == '+' || c0_ == '-')
   1063       AddLiteralCharAdvance();
   1064     if (!IsDecimalDigit(c0_)) {
   1065       // we must have at least one decimal digit after 'e'/'E'
   1066       return Token::ILLEGAL;
   1067     }
   1068     ScanDecimalDigits();
   1069   }
   1070 
   1071   // The source character immediately following a numeric literal must
   1072   // not be an identifier start or a decimal digit; see ECMA-262
   1073   // section 7.8.3, page 17 (note that we read only one decimal digit
   1074   // if the value is 0).
   1075   if (IsDecimalDigit(c0_) ||
   1076       (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_)))
   1077     return Token::ILLEGAL;
   1078 
   1079   literal.Complete();
   1080 
   1081   return Token::NUMBER;
   1082 }
   1083 
   1084 
   1085 uc32 Scanner::ScanIdentifierUnicodeEscape() {
   1086   Advance();
   1087   if (c0_ != 'u') return -1;
   1088   Advance();
   1089   return ScanUnicodeEscape<false>();
   1090 }
   1091 
   1092 
   1093 template <bool capture_raw>
   1094 uc32 Scanner::ScanUnicodeEscape() {
   1095   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
   1096   // hex digits between { } is arbitrary. \ and u have already been read.
   1097   if (c0_ == '{') {
   1098     Advance<capture_raw>();
   1099     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff);
   1100     if (cp < 0) {
   1101       return -1;
   1102     }
   1103     if (c0_ != '}') {
   1104       return -1;
   1105     }
   1106     Advance<capture_raw>();
   1107     return cp;
   1108   }
   1109   return ScanHexNumber<capture_raw>(4);
   1110 }
   1111 
   1112 
   1113 // ----------------------------------------------------------------------------
   1114 // Keyword Matcher
   1115 
    // X-macro table of keyword spellings and their tokens.
    //
    // KEYWORD_GROUP(ch) opens the group of keywords whose first character is
    // `ch`; KEYWORD(spelling, token) declares one keyword inside the current
    // group. KeywordOrIdentifierToken() expands each group into a `case` on the
    // first input character and compares only the remaining characters of each
    // spelling, so every entry must be listed under the group matching its first
    // letter. That function also statically asserts that each spelling is
    // between 2 and 10 characters long.
    1116 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
    1117   KEYWORD_GROUP('b')                                        \
    1118   KEYWORD("break", Token::BREAK)                            \
    1119   KEYWORD_GROUP('c')                                        \
    1120   KEYWORD("case", Token::CASE)                              \
    1121   KEYWORD("catch", Token::CATCH)                            \
    1122   KEYWORD("class", Token::CLASS)                            \
    1123   KEYWORD("const", Token::CONST)                            \
    1124   KEYWORD("continue", Token::CONTINUE)                      \
    1125   KEYWORD_GROUP('d')                                        \
    1126   KEYWORD("debugger", Token::DEBUGGER)                      \
    1127   KEYWORD("default", Token::DEFAULT)                        \
    1128   KEYWORD("delete", Token::DELETE)                          \
    1129   KEYWORD("do", Token::DO)                                  \
    1130   KEYWORD_GROUP('e')                                        \
    1131   KEYWORD("else", Token::ELSE)                              \
    1132   KEYWORD("enum", Token::FUTURE_RESERVED_WORD)              \
    1133   KEYWORD("export", Token::EXPORT)                          \
    1134   KEYWORD("extends", Token::EXTENDS)                        \
    1135   KEYWORD_GROUP('f')                                        \
    1136   KEYWORD("false", Token::FALSE_LITERAL)                    \
    1137   KEYWORD("finally", Token::FINALLY)                        \
    1138   KEYWORD("for", Token::FOR)                                \
    1139   KEYWORD("function", Token::FUNCTION)                      \
    1140   KEYWORD_GROUP('i')                                        \
    1141   KEYWORD("if", Token::IF)                                  \
    1142   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
    1143   KEYWORD("import", Token::IMPORT)                          \
    1144   KEYWORD("in", Token::IN)                                  \
    1145   KEYWORD("instanceof", Token::INSTANCEOF)                  \
    1146   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
    1147   KEYWORD_GROUP('l')                                        \
    1148   KEYWORD("let", Token::LET)                                \
    1149   KEYWORD_GROUP('n')                                        \
    1150   KEYWORD("new", Token::NEW)                                \
    1151   KEYWORD("null", Token::NULL_LITERAL)                      \
    1152   KEYWORD_GROUP('p')                                        \
    1153   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
    1154   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
    1155   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
    1156   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
    1157   KEYWORD_GROUP('r')                                        \
    1158   KEYWORD("return", Token::RETURN)                          \
    1159   KEYWORD_GROUP('s')                                        \
    1160   KEYWORD("static", Token::STATIC)                          \
    1161   KEYWORD("super", Token::SUPER)                            \
    1162   KEYWORD("switch", Token::SWITCH)                          \
    1163   KEYWORD_GROUP('t')                                        \
    1164   KEYWORD("this", Token::THIS)                              \
    1165   KEYWORD("throw", Token::THROW)                            \
    1166   KEYWORD("true", Token::TRUE_LITERAL)                      \
    1167   KEYWORD("try", Token::TRY)                                \
    1168   KEYWORD("typeof", Token::TYPEOF)                          \
    1169   KEYWORD_GROUP('v')                                        \
    1170   KEYWORD("var", Token::VAR)                                \
    1171   KEYWORD("void", Token::VOID)                              \
    1172   KEYWORD_GROUP('w')                                        \
    1173   KEYWORD("while", Token::WHILE)                            \
    1174   KEYWORD("with", Token::WITH)                              \
    1175   KEYWORD_GROUP('y')                                        \
    1176   KEYWORD("yield", Token::YIELD)
   1177 
   1178 
   1179 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
   1180                                              int input_length, bool escaped) {
   1181   DCHECK(input_length >= 1);
   1182   const int kMinLength = 2;
   1183   const int kMaxLength = 10;
   1184   if (input_length < kMinLength || input_length > kMaxLength) {
   1185     return Token::IDENTIFIER;
   1186   }
   1187   switch (input[0]) {
   1188     default:
   1189 #define KEYWORD_GROUP_CASE(ch)                                \
   1190       break;                                                  \
   1191     case ch:
   1192 #define KEYWORD(keyword, token)                                     \
   1193   {                                                                 \
   1194     /* 'keyword' is a char array, so sizeof(keyword) is */          \
   1195     /* strlen(keyword) plus 1 for the NUL char. */                  \
   1196     const int keyword_length = sizeof(keyword) - 1;                 \
   1197     STATIC_ASSERT(keyword_length >= kMinLength);                    \
   1198     STATIC_ASSERT(keyword_length <= kMaxLength);                    \
   1199     if (input_length == keyword_length && input[1] == keyword[1] && \
   1200         (keyword_length <= 2 || input[2] == keyword[2]) &&          \
   1201         (keyword_length <= 3 || input[3] == keyword[3]) &&          \
   1202         (keyword_length <= 4 || input[4] == keyword[4]) &&          \
   1203         (keyword_length <= 5 || input[5] == keyword[5]) &&          \
   1204         (keyword_length <= 6 || input[6] == keyword[6]) &&          \
   1205         (keyword_length <= 7 || input[7] == keyword[7]) &&          \
   1206         (keyword_length <= 8 || input[8] == keyword[8]) &&          \
   1207         (keyword_length <= 9 || input[9] == keyword[9])) {          \
   1208       if (escaped) {                                                \
   1209         return token == Token::FUTURE_STRICT_RESERVED_WORD          \
   1210                    ? Token::ESCAPED_STRICT_RESERVED_WORD            \
   1211                    : Token::ESCAPED_KEYWORD;                        \
   1212       }                                                             \
   1213       return token;                                                 \
   1214     }                                                               \
   1215   }
   1216     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
   1217   }
   1218   return Token::IDENTIFIER;
   1219 }
   1220 
   1221 
   1222 bool Scanner::IdentifierIsFutureStrictReserved(
   1223     const AstRawString* string) const {
   1224   // Keywords are always 1-byte strings.
   1225   if (!string->is_one_byte()) return false;
   1226   if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
   1227       string->IsOneByteEqualTo("yield")) {
   1228     return true;
   1229   }
   1230   return Token::FUTURE_STRICT_RESERVED_WORD ==
   1231          KeywordOrIdentifierToken(string->raw_data(), string->length(), false);
   1232 }
   1233 
   1234 
   1235 Token::Value Scanner::ScanIdentifierOrKeyword() {
   1236   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
   1237   LiteralScope literal(this);
   1238   if (IsInRange(c0_, 'a', 'z')) {
   1239     do {
   1240       uc32 first_char = c0_;
   1241       Advance<false, false>();
   1242       AddLiteralChar(first_char);
   1243     } while (IsInRange(c0_, 'a', 'z'));
   1244 
   1245     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
   1246         c0_ == '$') {
   1247       // Identifier starting with lowercase.
   1248       uc32 first_char = c0_;
   1249       Advance<false, false>();
   1250       AddLiteralChar(first_char);
   1251       while (IsAsciiIdentifier(c0_)) {
   1252         uc32 first_char = c0_;
   1253         Advance<false, false>();
   1254         AddLiteralChar(first_char);
   1255       }
   1256       if (c0_ <= kMaxAscii && c0_ != '\\') {
   1257         literal.Complete();
   1258         return Token::IDENTIFIER;
   1259       }
   1260     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
   1261       // Only a-z+: could be a keyword or identifier.
   1262       literal.Complete();
   1263       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1264       return KeywordOrIdentifierToken(chars.start(), chars.length(), false);
   1265     }
   1266 
   1267     HandleLeadSurrogate();
   1268   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
   1269     do {
   1270       uc32 first_char = c0_;
   1271       Advance<false, false>();
   1272       AddLiteralChar(first_char);
   1273     } while (IsAsciiIdentifier(c0_));
   1274 
   1275     if (c0_ <= kMaxAscii && c0_ != '\\') {
   1276       literal.Complete();
   1277       return Token::IDENTIFIER;
   1278     }
   1279 
   1280     HandleLeadSurrogate();
   1281   } else if (c0_ == '\\') {
   1282     // Scan identifier start character.
   1283     uc32 c = ScanIdentifierUnicodeEscape();
   1284     // Only allow legal identifier start characters.
   1285     if (c < 0 ||
   1286         c == '\\' ||  // No recursive escapes.
   1287         !unicode_cache_->IsIdentifierStart(c)) {
   1288       return Token::ILLEGAL;
   1289     }
   1290     AddLiteralChar(c);
   1291     return ScanIdentifierSuffix(&literal, true);
   1292   } else {
   1293     uc32 first_char = c0_;
   1294     Advance();
   1295     AddLiteralChar(first_char);
   1296   }
   1297 
   1298   // Scan the rest of the identifier characters.
   1299   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
   1300     if (c0_ != '\\') {
   1301       uc32 next_char = c0_;
   1302       Advance();
   1303       AddLiteralChar(next_char);
   1304       continue;
   1305     }
   1306     // Fallthrough if no longer able to complete keyword.
   1307     return ScanIdentifierSuffix(&literal, false);
   1308   }
   1309 
   1310   literal.Complete();
   1311 
   1312   if (next_.literal_chars->is_one_byte()) {
   1313     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1314     return KeywordOrIdentifierToken(chars.start(), chars.length(), false);
   1315   }
   1316   return Token::IDENTIFIER;
   1317 }
   1318 
   1319 
   1320 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
   1321                                            bool escaped) {
   1322   // Scan the rest of the identifier characters.
   1323   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
   1324     if (c0_ == '\\') {
   1325       uc32 c = ScanIdentifierUnicodeEscape();
   1326       escaped = true;
   1327       // Only allow legal identifier part characters.
   1328       if (c < 0 ||
   1329           c == '\\' ||
   1330           !unicode_cache_->IsIdentifierPart(c)) {
   1331         return Token::ILLEGAL;
   1332       }
   1333       AddLiteralChar(c);
   1334     } else {
   1335       AddLiteralChar(c0_);
   1336       Advance();
   1337     }
   1338   }
   1339   literal->Complete();
   1340 
   1341   if (escaped && next_.literal_chars->is_one_byte()) {
   1342     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1343     return KeywordOrIdentifierToken(chars.start(), chars.length(), true);
   1344   }
   1345   return Token::IDENTIFIER;
   1346 }
   1347 
   1348 
   1349 bool Scanner::ScanRegExpPattern(bool seen_equal) {
   1350   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1351   bool in_character_class = false;
   1352 
   1353   // Previous token is either '/' or '/=', in the second case, the
   1354   // pattern starts at =.
   1355   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1356   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1357 
   1358   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1359   // the scanner should pass uninterpreted bodies to the RegExp
   1360   // constructor.
   1361   LiteralScope literal(this);
   1362   if (seen_equal) {
   1363     AddLiteralChar('=');
   1364   }
   1365 
   1366   while (c0_ != '/' || in_character_class) {
   1367     if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
   1368     if (c0_ == '\\') {  // Escape sequence.
   1369       AddLiteralCharAdvance();
   1370       if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
   1371       AddLiteralCharAdvance();
   1372       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1373       // only "safe" characters are allowed (letters, digits, underscore),
   1374       // otherwise the escape isn't valid and the invalid character has
   1375       // its normal meaning. I.e., we can just continue scanning without
   1376       // worrying whether the following characters are part of the escape
   1377       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1378       // of the escape sequence.
   1379 
   1380       // TODO(896): At some point, parse RegExps more throughly to capture
   1381       // octal esacpes in strict mode.
   1382     } else {  // Unescaped character.
   1383       if (c0_ == '[') in_character_class = true;
   1384       if (c0_ == ']') in_character_class = false;
   1385       AddLiteralCharAdvance();
   1386     }
   1387   }
   1388   Advance();  // consume '/'
   1389 
   1390   literal.Complete();
   1391 
   1392   return true;
   1393 }
   1394 
   1395 
   1396 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
   1397   // Scan regular expression flags.
   1398   LiteralScope literal(this);
   1399   int flags = 0;
   1400   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
   1401     RegExp::Flags flag = RegExp::kNone;
   1402     switch (c0_) {
   1403       case 'g':
   1404         flag = RegExp::kGlobal;
   1405         break;
   1406       case 'i':
   1407         flag = RegExp::kIgnoreCase;
   1408         break;
   1409       case 'm':
   1410         flag = RegExp::kMultiline;
   1411         break;
   1412       case 'u':
   1413         if (!FLAG_harmony_unicode_regexps) return Nothing<RegExp::Flags>();
   1414         flag = RegExp::kUnicode;
   1415         break;
   1416       case 'y':
   1417         if (!FLAG_harmony_regexps) return Nothing<RegExp::Flags>();
   1418         flag = RegExp::kSticky;
   1419         break;
   1420       default:
   1421         return Nothing<RegExp::Flags>();
   1422     }
   1423     if (flags & flag) return Nothing<RegExp::Flags>();
   1424     AddLiteralCharAdvance();
   1425     flags |= flag;
   1426   }
   1427   literal.Complete();
   1428 
   1429   next_.location.end_pos = source_pos();
   1430   return Just(RegExp::Flags(flags));
   1431 }
   1432 
   1433 
   1434 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
   1435   if (is_literal_one_byte()) {
   1436     return ast_value_factory->GetOneByteString(literal_one_byte_string());
   1437   }
   1438   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
   1439 }
   1440 
   1441 
   1442 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
   1443   if (is_next_literal_one_byte()) {
   1444     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
   1445   }
   1446   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
   1447 }
   1448 
   1449 
   1450 const AstRawString* Scanner::CurrentRawSymbol(
   1451     AstValueFactory* ast_value_factory) {
   1452   if (is_raw_literal_one_byte()) {
   1453     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
   1454   }
   1455   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
   1456 }
   1457 
   1458 
   1459 double Scanner::DoubleValue() {
   1460   DCHECK(is_literal_one_byte());
   1461   return StringToDouble(
   1462       unicode_cache_,
   1463       literal_one_byte_string(),
   1464       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
   1465 }
   1466 
   1467 
   1468 bool Scanner::ContainsDot() {
   1469   DCHECK(is_literal_one_byte());
   1470   Vector<const uint8_t> str = literal_one_byte_string();
   1471   return std::find(str.begin(), str.end(), '.') != str.end();
   1472 }
   1473 
   1474 
   1475 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
   1476   if (is_literal_one_byte()) {
   1477     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
   1478   }
   1479   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
   1480 }
   1481 
   1482 
   1483 bool Scanner::SetBookmark() {
   1484   if (c0_ != kNoBookmark && bookmark_c0_ == kNoBookmark &&
   1485       next_next_.token == Token::UNINITIALIZED && source_->SetBookmark()) {
   1486     bookmark_c0_ = c0_;
   1487     CopyTokenDesc(&bookmark_current_, &current_);
   1488     CopyTokenDesc(&bookmark_next_, &next_);
   1489     return true;
   1490   }
   1491   return false;
   1492 }
   1493 
   1494 
    // Restores the scanner to the state saved by SetBookmark(): rewinds the
    // character stream, restores the lookahead character, rebuilds current_ and
    // next_ from the saved token descriptors, and finally marks the bookmark as
    // applied. Must only be called while a bookmark is set.
    1495 void Scanner::ResetToBookmark() {
    1496   DCHECK(BookmarkHasBeenSet());  // Caller hasn't called SetBookmark.
    1497 
    1498   source_->ResetToBookmark();
    1499   c0_ = bookmark_c0_;
           // Rebuild current_: the saved descriptor is first copied into next_ and
           // then moved over by plain assignment.
           // NOTE(review): StartLiteral()/StartRawLiteral() presumably give next_
           // literal buffers to copy into - confirm in scanner.h.
    1500   StartLiteral();
    1501   StartRawLiteral();
    1502   CopyTokenDesc(&next_, &bookmark_current_);
    1503   current_ = next_;
           // Rebuild next_ from its saved descriptor.
           // NOTE(review): the second StartLiteral()/StartRawLiteral() pair
           // presumably keeps this copy from overwriting the buffers now
           // referenced by current_ - confirm in scanner.h.
    1504   StartLiteral();
    1505   StartRawLiteral();
    1506   CopyTokenDesc(&next_, &bookmark_next_);
    1507 
           // Record that the bookmark has been consumed.
    1508   bookmark_c0_ = kBookmarkWasApplied;
    1509 }
   1510 
   1511 
   1512 bool Scanner::BookmarkHasBeenSet() { return bookmark_c0_ >= 0; }
   1513 
   1514 
   1515 bool Scanner::BookmarkHasBeenReset() {
   1516   return bookmark_c0_ == kBookmarkWasApplied;
   1517 }
   1518 
   1519 
   1520 void Scanner::DropBookmark() { bookmark_c0_ = kNoBookmark; }
   1521 
   1522 
   1523 void Scanner::CopyTokenDesc(TokenDesc* to, TokenDesc* from) {
   1524   DCHECK_NOT_NULL(to);
   1525   DCHECK_NOT_NULL(from);
   1526   to->token = from->token;
   1527   to->location = from->location;
   1528   to->literal_chars->CopyFrom(from->literal_chars);
   1529   to->raw_literal_chars->CopyFrom(from->raw_literal_chars);
   1530 }
   1531 
   1532 
   1533 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
   1534   return AddSymbol(key, true, value);
   1535 }
   1536 
   1537 
   1538 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
   1539   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
   1540 }
   1541 
   1542 
   1543 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
   1544                                bool is_one_byte,
   1545                                int value) {
   1546   uint32_t hash = Hash(key, is_one_byte);
   1547   byte* encoding = BackupKey(key, is_one_byte);
   1548   HashMap::Entry* entry = map_.LookupOrInsert(encoding, hash);
   1549   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
   1550   entry->value =
   1551     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
   1552   return old_value;
   1553 }
   1554 
   1555 
   1556 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
   1557   DCHECK(key.length() > 0);
   1558   // Quick check for already being in canonical form.
   1559   if (IsNumberCanonical(key)) {
   1560     return AddOneByteSymbol(key, value);
   1561   }
   1562 
   1563   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
   1564   double double_value = StringToDouble(
   1565       unicode_constants_, key, flags, 0.0);
   1566   int length;
   1567   const char* string;
   1568   if (!std::isfinite(double_value)) {
   1569     string = "Infinity";
   1570     length = 8;  // strlen("Infinity");
   1571   } else {
   1572     string = DoubleToCString(double_value,
   1573                              Vector<char>(number_buffer_, kBufferSize));
   1574     length = StrLength(string);
   1575   }
   1576   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
   1577                                       length), true, value);
   1578 }
   1579 
   1580 
   1581 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
   1582   // Test for a safe approximation of number literals that are already
   1583   // in canonical form: max 15 digits, no leading zeroes, except an
   1584   // integer part that is a single zero, and no trailing zeros below
   1585   // the decimal point.
   1586   int pos = 0;
   1587   int length = number.length();
   1588   if (number.length() > 15) return false;
   1589   if (number[pos] == '0') {
   1590     pos++;
   1591   } else {
   1592     while (pos < length &&
   1593            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
   1594   }
   1595   if (length == pos) return true;
   1596   if (number[pos] != '.') return false;
   1597   pos++;
   1598   bool invalid_last_digit = true;
   1599   while (pos < length) {
   1600     uint8_t digit = number[pos] - '0';
   1601     if (digit > '9' - '0') return false;
   1602     invalid_last_digit = (digit == 0);
   1603     pos++;
   1604   }
   1605   return !invalid_last_digit;
   1606 }
   1607 
   1608 
   1609 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
   1610   // Primitive hash function, almost identical to the one used
   1611   // for strings (except that it's seeded by the length and representation).
   1612   int length = key.length();
   1613   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0);
   1614   for (int i = 0; i < length; i++) {
   1615     uint32_t c = key[i];
   1616     hash = (hash + c) * 1025;
   1617     hash ^= (hash >> 6);
   1618   }
   1619   return hash;
   1620 }
   1621 
   1622 
   1623 bool DuplicateFinder::Match(void* first, void* second) {
   1624   // Decode lengths.
   1625   // Length + representation is encoded as base 128, most significant heptet
   1626   // first, with a 8th bit being non-zero while there are more heptets.
   1627   // The value encodes the number of bytes following, and whether the original
   1628   // was Latin1.
   1629   byte* s1 = reinterpret_cast<byte*>(first);
   1630   byte* s2 = reinterpret_cast<byte*>(second);
   1631   uint32_t length_one_byte_field = 0;
   1632   byte c1;
   1633   do {
   1634     c1 = *s1;
   1635     if (c1 != *s2) return false;
   1636     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
   1637     s1++;
   1638     s2++;
   1639   } while ((c1 & 0x80) != 0);
   1640   int length = static_cast<int>(length_one_byte_field >> 1);
   1641   return memcmp(s1, s2, length) == 0;
   1642 }
   1643 
   1644 
   1645 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
   1646                                  bool is_one_byte) {
   1647   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
   1648   backing_store_.StartSequence();
   1649   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
   1650   // on the byte of every heptet except the last, least significant, one.
   1651   if (one_byte_length >= (1 << 7)) {
   1652     if (one_byte_length >= (1 << 14)) {
   1653       if (one_byte_length >= (1 << 21)) {
   1654         if (one_byte_length >= (1 << 28)) {
   1655           backing_store_.Add(
   1656               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
   1657         }
   1658         backing_store_.Add(
   1659             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
   1660       }
   1661       backing_store_.Add(
   1662           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
   1663     }
   1664     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
   1665   }
   1666   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
   1667 
   1668   backing_store_.AddBlock(bytes);
   1669   return backing_store_.EndSequence().start();
   1670 }
   1671 
   1672 }  // namespace internal
   1673 }  // namespace v8
   1674