Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #include "scanner.h"
     31 
     32 #include "../include/v8stdint.h"
     33 #include "char-predicates-inl.h"
     34 
     35 namespace v8 {
     36 namespace internal {
     37 
     38 // ----------------------------------------------------------------------------
     39 // Scanner
     40 
     41 Scanner::Scanner(UnicodeCache* unicode_cache)
     42     : unicode_cache_(unicode_cache),
     43       octal_pos_(Location::invalid()),
     44       harmony_scoping_(false),
     45       harmony_modules_(false) { }
     46 
     47 
     48 void Scanner::Initialize(Utf16CharacterStream* source) {
     49   source_ = source;
     50   // Need to capture identifiers in order to recognize "get" and "set"
     51   // in object literals.
     52   Init();
     53   // Skip initial whitespace allowing HTML comment ends just like
     54   // after a newline and scan first token.
     55   has_line_terminator_before_next_ = true;
     56   SkipWhiteSpace();
     57   Scan();
     58 }
     59 
     60 
     61 uc32 Scanner::ScanHexNumber(int expected_length) {
     62   ASSERT(expected_length <= 4);  // prevent overflow
     63 
     64   uc32 digits[4] = { 0, 0, 0, 0 };
     65   uc32 x = 0;
     66   for (int i = 0; i < expected_length; i++) {
     67     digits[i] = c0_;
     68     int d = HexValue(c0_);
     69     if (d < 0) {
     70       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
     71       // should be illegal, but other JS VMs just return the
     72       // non-escaped version of the original character.
     73 
     74       // Push back digits that we have advanced past.
     75       for (int j = i-1; j >= 0; j--) {
     76         PushBack(digits[j]);
     77       }
     78       return -1;
     79     }
     80     x = x * 16 + d;
     81     Advance();
     82   }
     83 
     84   return x;
     85 }
     86 
     87 
     88 // Ensure that tokens can be stored in a byte.
     89 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
     90 
     91 // Table of one-character tokens, by character (0x00..0x7f only).
     92 static const byte one_char_tokens[] = {
     93   Token::ILLEGAL,
     94   Token::ILLEGAL,
     95   Token::ILLEGAL,
     96   Token::ILLEGAL,
     97   Token::ILLEGAL,
     98   Token::ILLEGAL,
     99   Token::ILLEGAL,
    100   Token::ILLEGAL,
    101   Token::ILLEGAL,
    102   Token::ILLEGAL,
    103   Token::ILLEGAL,
    104   Token::ILLEGAL,
    105   Token::ILLEGAL,
    106   Token::ILLEGAL,
    107   Token::ILLEGAL,
    108   Token::ILLEGAL,
    109   Token::ILLEGAL,
    110   Token::ILLEGAL,
    111   Token::ILLEGAL,
    112   Token::ILLEGAL,
    113   Token::ILLEGAL,
    114   Token::ILLEGAL,
    115   Token::ILLEGAL,
    116   Token::ILLEGAL,
    117   Token::ILLEGAL,
    118   Token::ILLEGAL,
    119   Token::ILLEGAL,
    120   Token::ILLEGAL,
    121   Token::ILLEGAL,
    122   Token::ILLEGAL,
    123   Token::ILLEGAL,
    124   Token::ILLEGAL,
    125   Token::ILLEGAL,
    126   Token::ILLEGAL,
    127   Token::ILLEGAL,
    128   Token::ILLEGAL,
    129   Token::ILLEGAL,
    130   Token::ILLEGAL,
    131   Token::ILLEGAL,
    132   Token::ILLEGAL,
    133   Token::LPAREN,       // 0x28
    134   Token::RPAREN,       // 0x29
    135   Token::ILLEGAL,
    136   Token::ILLEGAL,
    137   Token::COMMA,        // 0x2c
    138   Token::ILLEGAL,
    139   Token::ILLEGAL,
    140   Token::ILLEGAL,
    141   Token::ILLEGAL,
    142   Token::ILLEGAL,
    143   Token::ILLEGAL,
    144   Token::ILLEGAL,
    145   Token::ILLEGAL,
    146   Token::ILLEGAL,
    147   Token::ILLEGAL,
    148   Token::ILLEGAL,
    149   Token::ILLEGAL,
    150   Token::ILLEGAL,
    151   Token::COLON,        // 0x3a
    152   Token::SEMICOLON,    // 0x3b
    153   Token::ILLEGAL,
    154   Token::ILLEGAL,
    155   Token::ILLEGAL,
    156   Token::CONDITIONAL,  // 0x3f
    157   Token::ILLEGAL,
    158   Token::ILLEGAL,
    159   Token::ILLEGAL,
    160   Token::ILLEGAL,
    161   Token::ILLEGAL,
    162   Token::ILLEGAL,
    163   Token::ILLEGAL,
    164   Token::ILLEGAL,
    165   Token::ILLEGAL,
    166   Token::ILLEGAL,
    167   Token::ILLEGAL,
    168   Token::ILLEGAL,
    169   Token::ILLEGAL,
    170   Token::ILLEGAL,
    171   Token::ILLEGAL,
    172   Token::ILLEGAL,
    173   Token::ILLEGAL,
    174   Token::ILLEGAL,
    175   Token::ILLEGAL,
    176   Token::ILLEGAL,
    177   Token::ILLEGAL,
    178   Token::ILLEGAL,
    179   Token::ILLEGAL,
    180   Token::ILLEGAL,
    181   Token::ILLEGAL,
    182   Token::ILLEGAL,
    183   Token::ILLEGAL,
    184   Token::LBRACK,     // 0x5b
    185   Token::ILLEGAL,
    186   Token::RBRACK,     // 0x5d
    187   Token::ILLEGAL,
    188   Token::ILLEGAL,
    189   Token::ILLEGAL,
    190   Token::ILLEGAL,
    191   Token::ILLEGAL,
    192   Token::ILLEGAL,
    193   Token::ILLEGAL,
    194   Token::ILLEGAL,
    195   Token::ILLEGAL,
    196   Token::ILLEGAL,
    197   Token::ILLEGAL,
    198   Token::ILLEGAL,
    199   Token::ILLEGAL,
    200   Token::ILLEGAL,
    201   Token::ILLEGAL,
    202   Token::ILLEGAL,
    203   Token::ILLEGAL,
    204   Token::ILLEGAL,
    205   Token::ILLEGAL,
    206   Token::ILLEGAL,
    207   Token::ILLEGAL,
    208   Token::ILLEGAL,
    209   Token::ILLEGAL,
    210   Token::ILLEGAL,
    211   Token::ILLEGAL,
    212   Token::ILLEGAL,
    213   Token::ILLEGAL,
    214   Token::ILLEGAL,
    215   Token::ILLEGAL,
    216   Token::LBRACE,       // 0x7b
    217   Token::ILLEGAL,
    218   Token::RBRACE,       // 0x7d
    219   Token::BIT_NOT,      // 0x7e
    220   Token::ILLEGAL
    221 };
    222 
    223 
    224 Token::Value Scanner::Next() {
    225   current_ = next_;
    226   has_line_terminator_before_next_ = false;
    227   has_multiline_comment_before_next_ = false;
    228   if (static_cast<unsigned>(c0_) <= 0x7f) {
    229     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    230     if (token != Token::ILLEGAL) {
    231       int pos = source_pos();
    232       next_.token = token;
    233       next_.location.beg_pos = pos;
    234       next_.location.end_pos = pos + 1;
    235       Advance();
    236       return current_.token;
    237     }
    238   }
    239   Scan();
    240   return current_.token;
    241 }
    242 
    243 
    244 static inline bool IsByteOrderMark(uc32 c) {
    245   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    246   // Unicode character; this implies that in a Unicode context the
    247   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    248   // character expressed in little-endian byte order (since it could
    249   // not be a U+FFFE character expressed in big-endian byte
    250   // order). Nevertheless, we check for it to be compatible with
    251   // Spidermonkey.
    252   return c == 0xFEFF || c == 0xFFFE;
    253 }
    254 
    255 
    256 bool Scanner::SkipWhiteSpace() {
    257   int start_position = source_pos();
    258 
    259   while (true) {
    260     // We treat byte-order marks (BOMs) as whitespace for better
    261     // compatibility with Spidermonkey and other JavaScript engines.
    262     while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
    263       // IsWhiteSpace() includes line terminators!
    264       if (unicode_cache_->IsLineTerminator(c0_)) {
    265         // Ignore line terminators, but remember them. This is necessary
    266         // for automatic semicolon insertion.
    267         has_line_terminator_before_next_ = true;
    268       }
    269       Advance();
    270     }
    271 
    272     // If there is an HTML comment end '-->' at the beginning of a
    273     // line (with only whitespace in front of it), we treat the rest
    274     // of the line as a comment. This is in line with the way
    275     // SpiderMonkey handles it.
    276     if (c0_ == '-' && has_line_terminator_before_next_) {
    277       Advance();
    278       if (c0_ == '-') {
    279         Advance();
    280         if (c0_ == '>') {
    281           // Treat the rest of the line as a comment.
    282           SkipSingleLineComment();
    283           // Continue skipping white space after the comment.
    284           continue;
    285         }
    286         PushBack('-');  // undo Advance()
    287       }
    288       PushBack('-');  // undo Advance()
    289     }
    290     // Return whether or not we skipped any characters.
    291     return source_pos() != start_position;
    292   }
    293 }
    294 
    295 
    296 Token::Value Scanner::SkipSingleLineComment() {
    297   Advance();
    298 
    299   // The line terminator at the end of the line is not considered
    300   // to be part of the single-line comment; it is recognized
    301   // separately by the lexical grammar and becomes part of the
    302   // stream of input elements for the syntactic grammar (see
    303   // ECMA-262, section 7.4).
    304   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    305     Advance();
    306   }
    307 
    308   return Token::WHITESPACE;
    309 }
    310 
    311 
    312 Token::Value Scanner::SkipMultiLineComment() {
    313   ASSERT(c0_ == '*');
    314   Advance();
    315 
    316   while (c0_ >= 0) {
    317     uc32 ch = c0_;
    318     Advance();
    319     if (unicode_cache_->IsLineTerminator(ch)) {
    320       // Following ECMA-262, section 7.4, a comment containing
    321       // a newline will make the comment count as a line-terminator.
    322       has_multiline_comment_before_next_ = true;
    323     }
    324     // If we have reached the end of the multi-line comment, we
    325     // consume the '/' and insert a whitespace. This way all
    326     // multi-line comments are treated as whitespace.
    327     if (ch == '*' && c0_ == '/') {
    328       c0_ = ' ';
    329       return Token::WHITESPACE;
    330     }
    331   }
    332 
    333   // Unterminated multi-line comment.
    334   return Token::ILLEGAL;
    335 }
    336 
    337 
    338 Token::Value Scanner::ScanHtmlComment() {
    339   // Check for <!-- comments.
    340   ASSERT(c0_ == '!');
    341   Advance();
    342   if (c0_ == '-') {
    343     Advance();
    344     if (c0_ == '-') return SkipSingleLineComment();
    345     PushBack('-');  // undo Advance()
    346   }
    347   PushBack('!');  // undo Advance()
    348   ASSERT(c0_ == '!');
    349   return Token::LT;
    350 }
    351 
    352 
    353 void Scanner::Scan() {
    354   next_.literal_chars = NULL;
    355   Token::Value token;
    356   do {
    357     // Remember the position of the next token
    358     next_.location.beg_pos = source_pos();
    359 
    360     switch (c0_) {
    361       case ' ':
    362       case '\t':
    363         Advance();
    364         token = Token::WHITESPACE;
    365         break;
    366 
    367       case '\n':
    368         Advance();
    369         has_line_terminator_before_next_ = true;
    370         token = Token::WHITESPACE;
    371         break;
    372 
    373       case '"': case '\'':
    374         token = ScanString();
    375         break;
    376 
    377       case '<':
    378         // < <= << <<= <!--
    379         Advance();
    380         if (c0_ == '=') {
    381           token = Select(Token::LTE);
    382         } else if (c0_ == '<') {
    383           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    384         } else if (c0_ == '!') {
    385           token = ScanHtmlComment();
    386         } else {
    387           token = Token::LT;
    388         }
    389         break;
    390 
    391       case '>':
    392         // > >= >> >>= >>> >>>=
    393         Advance();
    394         if (c0_ == '=') {
    395           token = Select(Token::GTE);
    396         } else if (c0_ == '>') {
    397           // >> >>= >>> >>>=
    398           Advance();
    399           if (c0_ == '=') {
    400             token = Select(Token::ASSIGN_SAR);
    401           } else if (c0_ == '>') {
    402             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    403           } else {
    404             token = Token::SAR;
    405           }
    406         } else {
    407           token = Token::GT;
    408         }
    409         break;
    410 
    411       case '=':
    412         // = == ===
    413         Advance();
    414         if (c0_ == '=') {
    415           token = Select('=', Token::EQ_STRICT, Token::EQ);
    416         } else {
    417           token = Token::ASSIGN;
    418         }
    419         break;
    420 
    421       case '!':
    422         // ! != !==
    423         Advance();
    424         if (c0_ == '=') {
    425           token = Select('=', Token::NE_STRICT, Token::NE);
    426         } else {
    427           token = Token::NOT;
    428         }
    429         break;
    430 
    431       case '+':
    432         // + ++ +=
    433         Advance();
    434         if (c0_ == '+') {
    435           token = Select(Token::INC);
    436         } else if (c0_ == '=') {
    437           token = Select(Token::ASSIGN_ADD);
    438         } else {
    439           token = Token::ADD;
    440         }
    441         break;
    442 
    443       case '-':
    444         // - -- --> -=
    445         Advance();
    446         if (c0_ == '-') {
    447           Advance();
    448           if (c0_ == '>' && has_line_terminator_before_next_) {
    449             // For compatibility with SpiderMonkey, we skip lines that
    450             // start with an HTML comment end '-->'.
    451             token = SkipSingleLineComment();
    452           } else {
    453             token = Token::DEC;
    454           }
    455         } else if (c0_ == '=') {
    456           token = Select(Token::ASSIGN_SUB);
    457         } else {
    458           token = Token::SUB;
    459         }
    460         break;
    461 
    462       case '*':
    463         // * *=
    464         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
    465         break;
    466 
    467       case '%':
    468         // % %=
    469         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    470         break;
    471 
    472       case '/':
    473         // /  // /* /=
    474         Advance();
    475         if (c0_ == '/') {
    476           token = SkipSingleLineComment();
    477         } else if (c0_ == '*') {
    478           token = SkipMultiLineComment();
    479         } else if (c0_ == '=') {
    480           token = Select(Token::ASSIGN_DIV);
    481         } else {
    482           token = Token::DIV;
    483         }
    484         break;
    485 
    486       case '&':
    487         // & && &=
    488         Advance();
    489         if (c0_ == '&') {
    490           token = Select(Token::AND);
    491         } else if (c0_ == '=') {
    492           token = Select(Token::ASSIGN_BIT_AND);
    493         } else {
    494           token = Token::BIT_AND;
    495         }
    496         break;
    497 
    498       case '|':
    499         // | || |=
    500         Advance();
    501         if (c0_ == '|') {
    502           token = Select(Token::OR);
    503         } else if (c0_ == '=') {
    504           token = Select(Token::ASSIGN_BIT_OR);
    505         } else {
    506           token = Token::BIT_OR;
    507         }
    508         break;
    509 
    510       case '^':
    511         // ^ ^=
    512         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    513         break;
    514 
    515       case '.':
    516         // . Number
    517         Advance();
    518         if (IsDecimalDigit(c0_)) {
    519           token = ScanNumber(true);
    520         } else {
    521           token = Token::PERIOD;
    522         }
    523         break;
    524 
    525       case ':':
    526         token = Select(Token::COLON);
    527         break;
    528 
    529       case ';':
    530         token = Select(Token::SEMICOLON);
    531         break;
    532 
    533       case ',':
    534         token = Select(Token::COMMA);
    535         break;
    536 
    537       case '(':
    538         token = Select(Token::LPAREN);
    539         break;
    540 
    541       case ')':
    542         token = Select(Token::RPAREN);
    543         break;
    544 
    545       case '[':
    546         token = Select(Token::LBRACK);
    547         break;
    548 
    549       case ']':
    550         token = Select(Token::RBRACK);
    551         break;
    552 
    553       case '{':
    554         token = Select(Token::LBRACE);
    555         break;
    556 
    557       case '}':
    558         token = Select(Token::RBRACE);
    559         break;
    560 
    561       case '?':
    562         token = Select(Token::CONDITIONAL);
    563         break;
    564 
    565       case '~':
    566         token = Select(Token::BIT_NOT);
    567         break;
    568 
    569       default:
    570         if (unicode_cache_->IsIdentifierStart(c0_)) {
    571           token = ScanIdentifierOrKeyword();
    572         } else if (IsDecimalDigit(c0_)) {
    573           token = ScanNumber(false);
    574         } else if (SkipWhiteSpace()) {
    575           token = Token::WHITESPACE;
    576         } else if (c0_ < 0) {
    577           token = Token::EOS;
    578         } else {
    579           token = Select(Token::ILLEGAL);
    580         }
    581         break;
    582     }
    583 
    584     // Continue scanning for tokens as long as we're just skipping
    585     // whitespace.
    586   } while (token == Token::WHITESPACE);
    587 
    588   next_.location.end_pos = source_pos();
    589   next_.token = token;
    590 }
    591 
    592 
    593 void Scanner::SeekForward(int pos) {
    594   // After this call, we will have the token at the given position as
    595   // the "next" token. The "current" token will be invalid.
    596   if (pos == next_.location.beg_pos) return;
    597   int current_pos = source_pos();
    598   ASSERT_EQ(next_.location.end_pos, current_pos);
    599   // Positions inside the lookahead token aren't supported.
    600   ASSERT(pos >= current_pos);
    601   if (pos != current_pos) {
    602     source_->SeekForward(pos - source_->pos());
    603     Advance();
    604     // This function is only called to seek to the location
    605     // of the end of a function (at the "}" token). It doesn't matter
    606     // whether there was a line terminator in the part we skip.
    607     has_line_terminator_before_next_ = false;
    608     has_multiline_comment_before_next_ = false;
    609   }
    610   Scan();
    611 }
    612 
    613 
    614 void Scanner::ScanEscape() {
    615   uc32 c = c0_;
    616   Advance();
    617 
    618   // Skip escaped newlines.
    619   if (unicode_cache_->IsLineTerminator(c)) {
    620     // Allow CR+LF newlines in multiline string literals.
    621     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
    622     // Allow LF+CR newlines in multiline string literals.
    623     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
    624     return;
    625   }
    626 
    627   switch (c) {
    628     case '\'':  // fall through
    629     case '"' :  // fall through
    630     case '\\': break;
    631     case 'b' : c = '\b'; break;
    632     case 'f' : c = '\f'; break;
    633     case 'n' : c = '\n'; break;
    634     case 'r' : c = '\r'; break;
    635     case 't' : c = '\t'; break;
    636     case 'u' : {
    637       c = ScanHexNumber(4);
    638       if (c < 0) c = 'u';
    639       break;
    640     }
    641     case 'v' : c = '\v'; break;
    642     case 'x' : {
    643       c = ScanHexNumber(2);
    644       if (c < 0) c = 'x';
    645       break;
    646     }
    647     case '0' :  // fall through
    648     case '1' :  // fall through
    649     case '2' :  // fall through
    650     case '3' :  // fall through
    651     case '4' :  // fall through
    652     case '5' :  // fall through
    653     case '6' :  // fall through
    654     case '7' : c = ScanOctalEscape(c, 2); break;
    655   }
    656 
    657   // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
    658   // should be illegal, but they are commonly handled
    659   // as non-escaped characters by JS VMs.
    660   AddLiteralChar(c);
    661 }
    662 
    663 
    664 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    665 // ECMA-262. Other JS VMs support them.
    666 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    667   uc32 x = c - '0';
    668   int i = 0;
    669   for (; i < length; i++) {
    670     int d = c0_ - '0';
    671     if (d < 0 || d > 7) break;
    672     int nx = x * 8 + d;
    673     if (nx >= 256) break;
    674     x = nx;
    675     Advance();
    676   }
    677   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
    678   // Remember the position of octal escape sequences so that an error
    679   // can be reported later (in strict mode).
    680   // We don't report the error immediately, because the octal escape can
    681   // occur before the "use strict" directive.
    682   if (c != '0' || i > 0) {
    683     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    684   }
    685   return x;
    686 }
    687 
    688 
    689 Token::Value Scanner::ScanString() {
    690   uc32 quote = c0_;
    691   Advance();  // consume quote
    692 
    693   LiteralScope literal(this);
    694   while (c0_ != quote && c0_ >= 0
    695          && !unicode_cache_->IsLineTerminator(c0_)) {
    696     uc32 c = c0_;
    697     Advance();
    698     if (c == '\\') {
    699       if (c0_ < 0) return Token::ILLEGAL;
    700       ScanEscape();
    701     } else {
    702       AddLiteralChar(c);
    703     }
    704   }
    705   if (c0_ != quote) return Token::ILLEGAL;
    706   literal.Complete();
    707 
    708   Advance();  // consume quote
    709   return Token::STRING;
    710 }
    711 
    712 
    713 void Scanner::ScanDecimalDigits() {
    714   while (IsDecimalDigit(c0_))
    715     AddLiteralCharAdvance();
    716 }
    717 
    718 
    719 Token::Value Scanner::ScanNumber(bool seen_period) {
    720   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
    721 
    722   enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
    723 
    724   LiteralScope literal(this);
    725   if (seen_period) {
    726     // we have already seen a decimal point of the float
    727     AddLiteralChar('.');
    728     ScanDecimalDigits();  // we know we have at least one digit
    729 
    730   } else {
    731     // if the first character is '0' we must check for octals and hex
    732     if (c0_ == '0') {
    733       int start_pos = source_pos();  // For reporting octal positions.
    734       AddLiteralCharAdvance();
    735 
    736       // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
    737       if (c0_ == 'x' || c0_ == 'X') {
    738         // hex number
    739         kind = HEX;
    740         AddLiteralCharAdvance();
    741         if (!IsHexDigit(c0_)) {
    742           // we must have at least one hex digit after 'x'/'X'
    743           return Token::ILLEGAL;
    744         }
    745         while (IsHexDigit(c0_)) {
    746           AddLiteralCharAdvance();
    747         }
    748       } else if ('0' <= c0_ && c0_ <= '7') {
    749         // (possible) octal number
    750         kind = OCTAL;
    751         while (true) {
    752           if (c0_ == '8' || c0_ == '9') {
    753             kind = DECIMAL;
    754             break;
    755           }
    756           if (c0_  < '0' || '7'  < c0_) {
    757             // Octal literal finished.
    758             octal_pos_ = Location(start_pos, source_pos());
    759             break;
    760           }
    761           AddLiteralCharAdvance();
    762         }
    763       }
    764     }
    765 
    766     // Parse decimal digits and allow trailing fractional part.
    767     if (kind == DECIMAL) {
    768       ScanDecimalDigits();  // optional
    769       if (c0_ == '.') {
    770         AddLiteralCharAdvance();
    771         ScanDecimalDigits();  // optional
    772       }
    773     }
    774   }
    775 
    776   // scan exponent, if any
    777   if (c0_ == 'e' || c0_ == 'E') {
    778     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    779     if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
    780     // scan exponent
    781     AddLiteralCharAdvance();
    782     if (c0_ == '+' || c0_ == '-')
    783       AddLiteralCharAdvance();
    784     if (!IsDecimalDigit(c0_)) {
    785       // we must have at least one decimal digit after 'e'/'E'
    786       return Token::ILLEGAL;
    787     }
    788     ScanDecimalDigits();
    789   }
    790 
    791   // The source character immediately following a numeric literal must
    792   // not be an identifier start or a decimal digit; see ECMA-262
    793   // section 7.8.3, page 17 (note that we read only one decimal digit
    794   // if the value is 0).
    795   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    796     return Token::ILLEGAL;
    797 
    798   literal.Complete();
    799 
    800   return Token::NUMBER;
    801 }
    802 
    803 
    804 uc32 Scanner::ScanIdentifierUnicodeEscape() {
    805   Advance();
    806   if (c0_ != 'u') return -1;
    807   Advance();
    808   uc32 result = ScanHexNumber(4);
    809   if (result < 0) PushBack('u');
    810   return result;
    811 }
    812 
    813 
    814 // ----------------------------------------------------------------------------
    815 // Keyword Matcher
    816 
    817 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
    818   KEYWORD_GROUP('b')                                                \
    819   KEYWORD("break", Token::BREAK)                                    \
    820   KEYWORD_GROUP('c')                                                \
    821   KEYWORD("case", Token::CASE)                                      \
    822   KEYWORD("catch", Token::CATCH)                                    \
    823   KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
    824   KEYWORD("const", Token::CONST)                                    \
    825   KEYWORD("continue", Token::CONTINUE)                              \
    826   KEYWORD_GROUP('d')                                                \
    827   KEYWORD("debugger", Token::DEBUGGER)                              \
    828   KEYWORD("default", Token::DEFAULT)                                \
    829   KEYWORD("delete", Token::DELETE)                                  \
    830   KEYWORD("do", Token::DO)                                          \
    831   KEYWORD_GROUP('e')                                                \
    832   KEYWORD("else", Token::ELSE)                                      \
    833   KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
    834   KEYWORD("export", harmony_modules                                 \
    835                     ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)  \
    836   KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
    837   KEYWORD_GROUP('f')                                                \
    838   KEYWORD("false", Token::FALSE_LITERAL)                            \
    839   KEYWORD("finally", Token::FINALLY)                                \
    840   KEYWORD("for", Token::FOR)                                        \
    841   KEYWORD("function", Token::FUNCTION)                              \
    842   KEYWORD_GROUP('i')                                                \
    843   KEYWORD("if", Token::IF)                                          \
    844   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
    845   KEYWORD("import", harmony_modules                                 \
    846                     ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)  \
    847   KEYWORD("in", Token::IN)                                          \
    848   KEYWORD("instanceof", Token::INSTANCEOF)                          \
    849   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
    850   KEYWORD_GROUP('l')                                                \
    851   KEYWORD("let", harmony_scoping                                    \
    852                  ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
    853   KEYWORD_GROUP('n')                                                \
    854   KEYWORD("new", Token::NEW)                                        \
    855   KEYWORD("null", Token::NULL_LITERAL)                              \
    856   KEYWORD_GROUP('p')                                                \
    857   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
    858   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
    859   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
    860   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
    861   KEYWORD_GROUP('r')                                                \
    862   KEYWORD("return", Token::RETURN)                                  \
    863   KEYWORD_GROUP('s')                                                \
    864   KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
    865   KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
    866   KEYWORD("switch", Token::SWITCH)                                  \
    867   KEYWORD_GROUP('t')                                                \
    868   KEYWORD("this", Token::THIS)                                      \
    869   KEYWORD("throw", Token::THROW)                                    \
    870   KEYWORD("true", Token::TRUE_LITERAL)                              \
    871   KEYWORD("try", Token::TRY)                                        \
    872   KEYWORD("typeof", Token::TYPEOF)                                  \
    873   KEYWORD_GROUP('v')                                                \
    874   KEYWORD("var", Token::VAR)                                        \
    875   KEYWORD("void", Token::VOID)                                      \
    876   KEYWORD_GROUP('w')                                                \
    877   KEYWORD("while", Token::WHILE)                                    \
    878   KEYWORD("with", Token::WITH)                                      \
    879   KEYWORD_GROUP('y')                                                \
    880   KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
    881 
    882 
    883 static Token::Value KeywordOrIdentifierToken(const char* input,
    884                                              int input_length,
    885                                              bool harmony_scoping,
    886                                              bool harmony_modules) {
    887   ASSERT(input_length >= 1);
    888   const int kMinLength = 2;
    889   const int kMaxLength = 10;
    890   if (input_length < kMinLength || input_length > kMaxLength) {
    891     return Token::IDENTIFIER;
    892   }
    893   switch (input[0]) {
    894     default:
    895 #define KEYWORD_GROUP_CASE(ch)                                \
    896       break;                                                  \
    897     case ch:
    898 #define KEYWORD(keyword, token)                               \
    899     {                                                         \
    900       /* 'keyword' is a char array, so sizeof(keyword) is */  \
    901       /* strlen(keyword) plus 1 for the NUL char. */          \
    902       const int keyword_length = sizeof(keyword) - 1;         \
    903       STATIC_ASSERT(keyword_length >= kMinLength);            \
    904       STATIC_ASSERT(keyword_length <= kMaxLength);            \
    905       if (input_length == keyword_length &&                   \
    906           input[1] == keyword[1] &&                           \
    907           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
    908           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
    909           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
    910           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
    911           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
    912           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
    913           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
    914           (keyword_length <= 9 || input[9] == keyword[9])) {  \
    915         return token;                                         \
    916       }                                                       \
    917     }
    918     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
    919   }
    920   return Token::IDENTIFIER;
    921 }
    922 
    923 
    924 Token::Value Scanner::ScanIdentifierOrKeyword() {
    925   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
    926   LiteralScope literal(this);
    927   // Scan identifier start character.
    928   if (c0_ == '\\') {
    929     uc32 c = ScanIdentifierUnicodeEscape();
    930     // Only allow legal identifier start characters.
    931     if (c < 0 ||
    932         c == '\\' ||  // No recursive escapes.
    933         !unicode_cache_->IsIdentifierStart(c)) {
    934       return Token::ILLEGAL;
    935     }
    936     AddLiteralChar(c);
    937     return ScanIdentifierSuffix(&literal);
    938   }
    939 
    940   uc32 first_char = c0_;
    941   Advance();
    942   AddLiteralChar(first_char);
    943 
    944   // Scan the rest of the identifier characters.
    945   while (unicode_cache_->IsIdentifierPart(c0_)) {
    946     if (c0_ != '\\') {
    947       uc32 next_char = c0_;
    948       Advance();
    949       AddLiteralChar(next_char);
    950       continue;
    951     }
    952     // Fallthrough if no longer able to complete keyword.
    953     return ScanIdentifierSuffix(&literal);
    954   }
    955 
    956   literal.Complete();
    957 
    958   if (next_.literal_chars->is_ascii()) {
    959     Vector<const char> chars = next_.literal_chars->ascii_literal();
    960     return KeywordOrIdentifierToken(chars.start(),
    961                                     chars.length(),
    962                                     harmony_scoping_,
    963                                     harmony_modules_);
    964   }
    965 
    966   return Token::IDENTIFIER;
    967 }
    968 
    969 
    970 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
    971   // Scan the rest of the identifier characters.
    972   while (unicode_cache_->IsIdentifierPart(c0_)) {
    973     if (c0_ == '\\') {
    974       uc32 c = ScanIdentifierUnicodeEscape();
    975       // Only allow legal identifier part characters.
    976       if (c < 0 ||
    977           c == '\\' ||
    978           !unicode_cache_->IsIdentifierPart(c)) {
    979         return Token::ILLEGAL;
    980       }
    981       AddLiteralChar(c);
    982     } else {
    983       AddLiteralChar(c0_);
    984       Advance();
    985     }
    986   }
    987   literal->Complete();
    988 
    989   return Token::IDENTIFIER;
    990 }
    991 
    992 
    993 bool Scanner::ScanRegExpPattern(bool seen_equal) {
    994   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
    995   bool in_character_class = false;
    996 
    997   // Previous token is either '/' or '/=', in the second case, the
    998   // pattern starts at =.
    999   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1000   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1001 
   1002   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1003   // the scanner should pass uninterpreted bodies to the RegExp
   1004   // constructor.
   1005   LiteralScope literal(this);
   1006   if (seen_equal) {
   1007     AddLiteralChar('=');
   1008   }
   1009 
   1010   while (c0_ != '/' || in_character_class) {
   1011     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1012     if (c0_ == '\\') {  // Escape sequence.
   1013       AddLiteralCharAdvance();
   1014       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1015       AddLiteralCharAdvance();
   1016       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1017       // only "safe" characters are allowed (letters, digits, underscore),
   1018       // otherwise the escape isn't valid and the invalid character has
   1019       // its normal meaning. I.e., we can just continue scanning without
   1020       // worrying whether the following characters are part of the escape
   1021       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1022       // of the escape sequence.
   1023 
   1024       // TODO(896): At some point, parse RegExps more throughly to capture
   1025       // octal esacpes in strict mode.
   1026     } else {  // Unescaped character.
   1027       if (c0_ == '[') in_character_class = true;
   1028       if (c0_ == ']') in_character_class = false;
   1029       AddLiteralCharAdvance();
   1030     }
   1031   }
   1032   Advance();  // consume '/'
   1033 
   1034   literal.Complete();
   1035 
   1036   return true;
   1037 }
   1038 
   1039 
   1040 bool Scanner::ScanLiteralUnicodeEscape() {
   1041   ASSERT(c0_ == '\\');
   1042   uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
   1043   Advance();
   1044   int i = 1;
   1045   if (c0_ == 'u') {
   1046     i++;
   1047     while (i < 6) {
   1048       Advance();
   1049       if (!IsHexDigit(c0_)) break;
   1050       chars_read[i] = c0_;
   1051       i++;
   1052     }
   1053   }
   1054   if (i < 6) {
   1055     // Incomplete escape. Undo all advances and return false.
   1056     while (i > 0) {
   1057       i--;
   1058       PushBack(chars_read[i]);
   1059     }
   1060     return false;
   1061   }
   1062   // Complete escape. Add all chars to current literal buffer.
   1063   for (int i = 0; i < 6; i++) {
   1064     AddLiteralChar(chars_read[i]);
   1065   }
   1066   return true;
   1067 }
   1068 
   1069 
   1070 bool Scanner::ScanRegExpFlags() {
   1071   // Scan regular expression flags.
   1072   LiteralScope literal(this);
   1073   while (unicode_cache_->IsIdentifierPart(c0_)) {
   1074     if (c0_ != '\\') {
   1075       AddLiteralCharAdvance();
   1076     } else {
   1077       if (!ScanLiteralUnicodeEscape()) {
   1078         break;
   1079       }
   1080     }
   1081   }
   1082   literal.Complete();
   1083 
   1084   next_.location.end_pos = source_pos() - 1;
   1085   return true;
   1086 }
   1087 
   1088 } }  // namespace v8::internal
   1089