Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #include "scanner.h"
     31 
     32 #include "../include/v8stdint.h"
     33 #include "char-predicates-inl.h"
     34 
     35 namespace v8 {
     36 namespace internal {
     37 
     38 // ----------------------------------------------------------------------------
     39 // Scanner
     40 
     41 Scanner::Scanner(UnicodeCache* unicode_cache)
     42     : unicode_cache_(unicode_cache),
     43       octal_pos_(Location::invalid()),
     44       harmony_scoping_(false),
     45       harmony_modules_(false),
     46       harmony_numeric_literals_(false) { }
     47 
     48 
     49 void Scanner::Initialize(Utf16CharacterStream* source) {
     50   source_ = source;
     51   // Need to capture identifiers in order to recognize "get" and "set"
     52   // in object literals.
     53   Init();
     54   // Skip initial whitespace allowing HTML comment ends just like
     55   // after a newline and scan first token.
     56   has_line_terminator_before_next_ = true;
     57   SkipWhiteSpace();
     58   Scan();
     59 }
     60 
     61 
     62 uc32 Scanner::ScanHexNumber(int expected_length) {
     63   ASSERT(expected_length <= 4);  // prevent overflow
     64 
     65   uc32 digits[4] = { 0, 0, 0, 0 };
     66   uc32 x = 0;
     67   for (int i = 0; i < expected_length; i++) {
     68     digits[i] = c0_;
     69     int d = HexValue(c0_);
     70     if (d < 0) {
     71       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
     72       // should be illegal, but other JS VMs just return the
     73       // non-escaped version of the original character.
     74 
     75       // Push back digits that we have advanced past.
     76       for (int j = i-1; j >= 0; j--) {
     77         PushBack(digits[j]);
     78       }
     79       return -1;
     80     }
     81     x = x * 16 + d;
     82     Advance();
     83   }
     84 
     85   return x;
     86 }
     87 
     88 
     89 // Ensure that tokens can be stored in a byte.
     90 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
     91 
     92 // Table of one-character tokens, by character (0x00..0x7f only).
     93 static const byte one_char_tokens[] = {
     94   Token::ILLEGAL,
     95   Token::ILLEGAL,
     96   Token::ILLEGAL,
     97   Token::ILLEGAL,
     98   Token::ILLEGAL,
     99   Token::ILLEGAL,
    100   Token::ILLEGAL,
    101   Token::ILLEGAL,
    102   Token::ILLEGAL,
    103   Token::ILLEGAL,
    104   Token::ILLEGAL,
    105   Token::ILLEGAL,
    106   Token::ILLEGAL,
    107   Token::ILLEGAL,
    108   Token::ILLEGAL,
    109   Token::ILLEGAL,
    110   Token::ILLEGAL,
    111   Token::ILLEGAL,
    112   Token::ILLEGAL,
    113   Token::ILLEGAL,
    114   Token::ILLEGAL,
    115   Token::ILLEGAL,
    116   Token::ILLEGAL,
    117   Token::ILLEGAL,
    118   Token::ILLEGAL,
    119   Token::ILLEGAL,
    120   Token::ILLEGAL,
    121   Token::ILLEGAL,
    122   Token::ILLEGAL,
    123   Token::ILLEGAL,
    124   Token::ILLEGAL,
    125   Token::ILLEGAL,
    126   Token::ILLEGAL,
    127   Token::ILLEGAL,
    128   Token::ILLEGAL,
    129   Token::ILLEGAL,
    130   Token::ILLEGAL,
    131   Token::ILLEGAL,
    132   Token::ILLEGAL,
    133   Token::ILLEGAL,
    134   Token::LPAREN,       // 0x28
    135   Token::RPAREN,       // 0x29
    136   Token::ILLEGAL,
    137   Token::ILLEGAL,
    138   Token::COMMA,        // 0x2c
    139   Token::ILLEGAL,
    140   Token::ILLEGAL,
    141   Token::ILLEGAL,
    142   Token::ILLEGAL,
    143   Token::ILLEGAL,
    144   Token::ILLEGAL,
    145   Token::ILLEGAL,
    146   Token::ILLEGAL,
    147   Token::ILLEGAL,
    148   Token::ILLEGAL,
    149   Token::ILLEGAL,
    150   Token::ILLEGAL,
    151   Token::ILLEGAL,
    152   Token::COLON,        // 0x3a
    153   Token::SEMICOLON,    // 0x3b
    154   Token::ILLEGAL,
    155   Token::ILLEGAL,
    156   Token::ILLEGAL,
    157   Token::CONDITIONAL,  // 0x3f
    158   Token::ILLEGAL,
    159   Token::ILLEGAL,
    160   Token::ILLEGAL,
    161   Token::ILLEGAL,
    162   Token::ILLEGAL,
    163   Token::ILLEGAL,
    164   Token::ILLEGAL,
    165   Token::ILLEGAL,
    166   Token::ILLEGAL,
    167   Token::ILLEGAL,
    168   Token::ILLEGAL,
    169   Token::ILLEGAL,
    170   Token::ILLEGAL,
    171   Token::ILLEGAL,
    172   Token::ILLEGAL,
    173   Token::ILLEGAL,
    174   Token::ILLEGAL,
    175   Token::ILLEGAL,
    176   Token::ILLEGAL,
    177   Token::ILLEGAL,
    178   Token::ILLEGAL,
    179   Token::ILLEGAL,
    180   Token::ILLEGAL,
    181   Token::ILLEGAL,
    182   Token::ILLEGAL,
    183   Token::ILLEGAL,
    184   Token::ILLEGAL,
    185   Token::LBRACK,     // 0x5b
    186   Token::ILLEGAL,
    187   Token::RBRACK,     // 0x5d
    188   Token::ILLEGAL,
    189   Token::ILLEGAL,
    190   Token::ILLEGAL,
    191   Token::ILLEGAL,
    192   Token::ILLEGAL,
    193   Token::ILLEGAL,
    194   Token::ILLEGAL,
    195   Token::ILLEGAL,
    196   Token::ILLEGAL,
    197   Token::ILLEGAL,
    198   Token::ILLEGAL,
    199   Token::ILLEGAL,
    200   Token::ILLEGAL,
    201   Token::ILLEGAL,
    202   Token::ILLEGAL,
    203   Token::ILLEGAL,
    204   Token::ILLEGAL,
    205   Token::ILLEGAL,
    206   Token::ILLEGAL,
    207   Token::ILLEGAL,
    208   Token::ILLEGAL,
    209   Token::ILLEGAL,
    210   Token::ILLEGAL,
    211   Token::ILLEGAL,
    212   Token::ILLEGAL,
    213   Token::ILLEGAL,
    214   Token::ILLEGAL,
    215   Token::ILLEGAL,
    216   Token::ILLEGAL,
    217   Token::LBRACE,       // 0x7b
    218   Token::ILLEGAL,
    219   Token::RBRACE,       // 0x7d
    220   Token::BIT_NOT,      // 0x7e
    221   Token::ILLEGAL
    222 };
    223 
    224 
    225 Token::Value Scanner::Next() {
    226   current_ = next_;
    227   has_line_terminator_before_next_ = false;
    228   has_multiline_comment_before_next_ = false;
    229   if (static_cast<unsigned>(c0_) <= 0x7f) {
    230     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    231     if (token != Token::ILLEGAL) {
    232       int pos = source_pos();
    233       next_.token = token;
    234       next_.location.beg_pos = pos;
    235       next_.location.end_pos = pos + 1;
    236       Advance();
    237       return current_.token;
    238     }
    239   }
    240   Scan();
    241   return current_.token;
    242 }
    243 
    244 
    245 static inline bool IsByteOrderMark(uc32 c) {
    246   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    247   // Unicode character; this implies that in a Unicode context the
    248   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    249   // character expressed in little-endian byte order (since it could
    250   // not be a U+FFFE character expressed in big-endian byte
    251   // order). Nevertheless, we check for it to be compatible with
    252   // Spidermonkey.
    253   return c == 0xFEFF || c == 0xFFFE;
    254 }
    255 
    256 
    257 bool Scanner::SkipWhiteSpace() {
    258   int start_position = source_pos();
    259 
    260   while (true) {
    261     // We treat byte-order marks (BOMs) as whitespace for better
    262     // compatibility with Spidermonkey and other JavaScript engines.
    263     while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
    264       // IsWhiteSpace() includes line terminators!
    265       if (unicode_cache_->IsLineTerminator(c0_)) {
    266         // Ignore line terminators, but remember them. This is necessary
    267         // for automatic semicolon insertion.
    268         has_line_terminator_before_next_ = true;
    269       }
    270       Advance();
    271     }
    272 
    273     // If there is an HTML comment end '-->' at the beginning of a
    274     // line (with only whitespace in front of it), we treat the rest
    275     // of the line as a comment. This is in line with the way
    276     // SpiderMonkey handles it.
    277     if (c0_ == '-' && has_line_terminator_before_next_) {
    278       Advance();
    279       if (c0_ == '-') {
    280         Advance();
    281         if (c0_ == '>') {
    282           // Treat the rest of the line as a comment.
    283           SkipSingleLineComment();
    284           // Continue skipping white space after the comment.
    285           continue;
    286         }
    287         PushBack('-');  // undo Advance()
    288       }
    289       PushBack('-');  // undo Advance()
    290     }
    291     // Return whether or not we skipped any characters.
    292     return source_pos() != start_position;
    293   }
    294 }
    295 
    296 
    297 Token::Value Scanner::SkipSingleLineComment() {
    298   Advance();
    299 
    300   // The line terminator at the end of the line is not considered
    301   // to be part of the single-line comment; it is recognized
    302   // separately by the lexical grammar and becomes part of the
    303   // stream of input elements for the syntactic grammar (see
    304   // ECMA-262, section 7.4).
    305   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    306     Advance();
    307   }
    308 
    309   return Token::WHITESPACE;
    310 }
    311 
    312 
    313 Token::Value Scanner::SkipMultiLineComment() {
    314   ASSERT(c0_ == '*');
    315   Advance();
    316 
    317   while (c0_ >= 0) {
    318     uc32 ch = c0_;
    319     Advance();
    320     if (unicode_cache_->IsLineTerminator(ch)) {
    321       // Following ECMA-262, section 7.4, a comment containing
    322       // a newline will make the comment count as a line-terminator.
    323       has_multiline_comment_before_next_ = true;
    324     }
    325     // If we have reached the end of the multi-line comment, we
    326     // consume the '/' and insert a whitespace. This way all
    327     // multi-line comments are treated as whitespace.
    328     if (ch == '*' && c0_ == '/') {
    329       c0_ = ' ';
    330       return Token::WHITESPACE;
    331     }
    332   }
    333 
    334   // Unterminated multi-line comment.
    335   return Token::ILLEGAL;
    336 }
    337 
    338 
    339 Token::Value Scanner::ScanHtmlComment() {
    340   // Check for <!-- comments.
    341   ASSERT(c0_ == '!');
    342   Advance();
    343   if (c0_ == '-') {
    344     Advance();
    345     if (c0_ == '-') return SkipSingleLineComment();
    346     PushBack('-');  // undo Advance()
    347   }
    348   PushBack('!');  // undo Advance()
    349   ASSERT(c0_ == '!');
    350   return Token::LT;
    351 }
    352 
    353 
    354 void Scanner::Scan() {
    355   next_.literal_chars = NULL;
    356   Token::Value token;
    357   do {
    358     // Remember the position of the next token
    359     next_.location.beg_pos = source_pos();
    360 
    361     switch (c0_) {
    362       case ' ':
    363       case '\t':
    364         Advance();
    365         token = Token::WHITESPACE;
    366         break;
    367 
    368       case '\n':
    369         Advance();
    370         has_line_terminator_before_next_ = true;
    371         token = Token::WHITESPACE;
    372         break;
    373 
    374       case '"': case '\'':
    375         token = ScanString();
    376         break;
    377 
    378       case '<':
    379         // < <= << <<= <!--
    380         Advance();
    381         if (c0_ == '=') {
    382           token = Select(Token::LTE);
    383         } else if (c0_ == '<') {
    384           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    385         } else if (c0_ == '!') {
    386           token = ScanHtmlComment();
    387         } else {
    388           token = Token::LT;
    389         }
    390         break;
    391 
    392       case '>':
    393         // > >= >> >>= >>> >>>=
    394         Advance();
    395         if (c0_ == '=') {
    396           token = Select(Token::GTE);
    397         } else if (c0_ == '>') {
    398           // >> >>= >>> >>>=
    399           Advance();
    400           if (c0_ == '=') {
    401             token = Select(Token::ASSIGN_SAR);
    402           } else if (c0_ == '>') {
    403             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    404           } else {
    405             token = Token::SAR;
    406           }
    407         } else {
    408           token = Token::GT;
    409         }
    410         break;
    411 
    412       case '=':
    413         // = == ===
    414         Advance();
    415         if (c0_ == '=') {
    416           token = Select('=', Token::EQ_STRICT, Token::EQ);
    417         } else {
    418           token = Token::ASSIGN;
    419         }
    420         break;
    421 
    422       case '!':
    423         // ! != !==
    424         Advance();
    425         if (c0_ == '=') {
    426           token = Select('=', Token::NE_STRICT, Token::NE);
    427         } else {
    428           token = Token::NOT;
    429         }
    430         break;
    431 
    432       case '+':
    433         // + ++ +=
    434         Advance();
    435         if (c0_ == '+') {
    436           token = Select(Token::INC);
    437         } else if (c0_ == '=') {
    438           token = Select(Token::ASSIGN_ADD);
    439         } else {
    440           token = Token::ADD;
    441         }
    442         break;
    443 
    444       case '-':
    445         // - -- --> -=
    446         Advance();
    447         if (c0_ == '-') {
    448           Advance();
    449           if (c0_ == '>' && has_line_terminator_before_next_) {
    450             // For compatibility with SpiderMonkey, we skip lines that
    451             // start with an HTML comment end '-->'.
    452             token = SkipSingleLineComment();
    453           } else {
    454             token = Token::DEC;
    455           }
    456         } else if (c0_ == '=') {
    457           token = Select(Token::ASSIGN_SUB);
    458         } else {
    459           token = Token::SUB;
    460         }
    461         break;
    462 
    463       case '*':
    464         // * *=
    465         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
    466         break;
    467 
    468       case '%':
    469         // % %=
    470         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    471         break;
    472 
    473       case '/':
    474         // /  // /* /=
    475         Advance();
    476         if (c0_ == '/') {
    477           token = SkipSingleLineComment();
    478         } else if (c0_ == '*') {
    479           token = SkipMultiLineComment();
    480         } else if (c0_ == '=') {
    481           token = Select(Token::ASSIGN_DIV);
    482         } else {
    483           token = Token::DIV;
    484         }
    485         break;
    486 
    487       case '&':
    488         // & && &=
    489         Advance();
    490         if (c0_ == '&') {
    491           token = Select(Token::AND);
    492         } else if (c0_ == '=') {
    493           token = Select(Token::ASSIGN_BIT_AND);
    494         } else {
    495           token = Token::BIT_AND;
    496         }
    497         break;
    498 
    499       case '|':
    500         // | || |=
    501         Advance();
    502         if (c0_ == '|') {
    503           token = Select(Token::OR);
    504         } else if (c0_ == '=') {
    505           token = Select(Token::ASSIGN_BIT_OR);
    506         } else {
    507           token = Token::BIT_OR;
    508         }
    509         break;
    510 
    511       case '^':
    512         // ^ ^=
    513         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    514         break;
    515 
    516       case '.':
    517         // . Number
    518         Advance();
    519         if (IsDecimalDigit(c0_)) {
    520           token = ScanNumber(true);
    521         } else {
    522           token = Token::PERIOD;
    523         }
    524         break;
    525 
    526       case ':':
    527         token = Select(Token::COLON);
    528         break;
    529 
    530       case ';':
    531         token = Select(Token::SEMICOLON);
    532         break;
    533 
    534       case ',':
    535         token = Select(Token::COMMA);
    536         break;
    537 
    538       case '(':
    539         token = Select(Token::LPAREN);
    540         break;
    541 
    542       case ')':
    543         token = Select(Token::RPAREN);
    544         break;
    545 
    546       case '[':
    547         token = Select(Token::LBRACK);
    548         break;
    549 
    550       case ']':
    551         token = Select(Token::RBRACK);
    552         break;
    553 
    554       case '{':
    555         token = Select(Token::LBRACE);
    556         break;
    557 
    558       case '}':
    559         token = Select(Token::RBRACE);
    560         break;
    561 
    562       case '?':
    563         token = Select(Token::CONDITIONAL);
    564         break;
    565 
    566       case '~':
    567         token = Select(Token::BIT_NOT);
    568         break;
    569 
    570       default:
    571         if (unicode_cache_->IsIdentifierStart(c0_)) {
    572           token = ScanIdentifierOrKeyword();
    573         } else if (IsDecimalDigit(c0_)) {
    574           token = ScanNumber(false);
    575         } else if (SkipWhiteSpace()) {
    576           token = Token::WHITESPACE;
    577         } else if (c0_ < 0) {
    578           token = Token::EOS;
    579         } else {
    580           token = Select(Token::ILLEGAL);
    581         }
    582         break;
    583     }
    584 
    585     // Continue scanning for tokens as long as we're just skipping
    586     // whitespace.
    587   } while (token == Token::WHITESPACE);
    588 
    589   next_.location.end_pos = source_pos();
    590   next_.token = token;
    591 }
    592 
    593 
    594 void Scanner::SeekForward(int pos) {
    595   // After this call, we will have the token at the given position as
    596   // the "next" token. The "current" token will be invalid.
    597   if (pos == next_.location.beg_pos) return;
    598   int current_pos = source_pos();
    599   ASSERT_EQ(next_.location.end_pos, current_pos);
    600   // Positions inside the lookahead token aren't supported.
    601   ASSERT(pos >= current_pos);
    602   if (pos != current_pos) {
    603     source_->SeekForward(pos - source_->pos());
    604     Advance();
    605     // This function is only called to seek to the location
    606     // of the end of a function (at the "}" token). It doesn't matter
    607     // whether there was a line terminator in the part we skip.
    608     has_line_terminator_before_next_ = false;
    609     has_multiline_comment_before_next_ = false;
    610   }
    611   Scan();
    612 }
    613 
    614 
    615 bool Scanner::ScanEscape() {
    616   uc32 c = c0_;
    617   Advance();
    618 
    619   // Skip escaped newlines.
    620   if (unicode_cache_->IsLineTerminator(c)) {
    621     // Allow CR+LF newlines in multiline string literals.
    622     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
    623     // Allow LF+CR newlines in multiline string literals.
    624     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
    625     return true;
    626   }
    627 
    628   switch (c) {
    629     case '\'':  // fall through
    630     case '"' :  // fall through
    631     case '\\': break;
    632     case 'b' : c = '\b'; break;
    633     case 'f' : c = '\f'; break;
    634     case 'n' : c = '\n'; break;
    635     case 'r' : c = '\r'; break;
    636     case 't' : c = '\t'; break;
    637     case 'u' : {
    638       c = ScanHexNumber(4);
    639       if (c < 0) return false;
    640       break;
    641     }
    642     case 'v' : c = '\v'; break;
    643     case 'x' : {
    644       c = ScanHexNumber(2);
    645       if (c < 0) return false;
    646       break;
    647     }
    648     case '0' :  // fall through
    649     case '1' :  // fall through
    650     case '2' :  // fall through
    651     case '3' :  // fall through
    652     case '4' :  // fall through
    653     case '5' :  // fall through
    654     case '6' :  // fall through
    655     case '7' : c = ScanOctalEscape(c, 2); break;
    656   }
    657 
    658   // According to ECMA-262, section 7.8.4, characters not covered by the
    659   // above cases should be illegal, but they are commonly handled as
    660   // non-escaped characters by JS VMs.
    661   AddLiteralChar(c);
    662   return true;
    663 }
    664 
    665 
    666 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    667 // ECMA-262. Other JS VMs support them.
    668 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    669   uc32 x = c - '0';
    670   int i = 0;
    671   for (; i < length; i++) {
    672     int d = c0_ - '0';
    673     if (d < 0 || d > 7) break;
    674     int nx = x * 8 + d;
    675     if (nx >= 256) break;
    676     x = nx;
    677     Advance();
    678   }
    679   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
    680   // Remember the position of octal escape sequences so that an error
    681   // can be reported later (in strict mode).
    682   // We don't report the error immediately, because the octal escape can
    683   // occur before the "use strict" directive.
    684   if (c != '0' || i > 0) {
    685     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    686   }
    687   return x;
    688 }
    689 
    690 
    691 Token::Value Scanner::ScanString() {
    692   uc32 quote = c0_;
    693   Advance();  // consume quote
    694 
    695   LiteralScope literal(this);
    696   while (c0_ != quote && c0_ >= 0
    697          && !unicode_cache_->IsLineTerminator(c0_)) {
    698     uc32 c = c0_;
    699     Advance();
    700     if (c == '\\') {
    701       if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
    702     } else {
    703       AddLiteralChar(c);
    704     }
    705   }
    706   if (c0_ != quote) return Token::ILLEGAL;
    707   literal.Complete();
    708 
    709   Advance();  // consume quote
    710   return Token::STRING;
    711 }
    712 
    713 
    714 void Scanner::ScanDecimalDigits() {
    715   while (IsDecimalDigit(c0_))
    716     AddLiteralCharAdvance();
    717 }
    718 
    719 
    720 Token::Value Scanner::ScanNumber(bool seen_period) {
    721   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
    722 
    723   enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
    724 
    725   LiteralScope literal(this);
    726   if (seen_period) {
    727     // we have already seen a decimal point of the float
    728     AddLiteralChar('.');
    729     ScanDecimalDigits();  // we know we have at least one digit
    730 
    731   } else {
    732     // if the first character is '0' we must check for octals and hex
    733     if (c0_ == '0') {
    734       int start_pos = source_pos();  // For reporting octal positions.
    735       AddLiteralCharAdvance();
    736 
    737       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
    738       // an octal number.
    739       if (c0_ == 'x' || c0_ == 'X') {
    740         // hex number
    741         kind = HEX;
    742         AddLiteralCharAdvance();
    743         if (!IsHexDigit(c0_)) {
    744           // we must have at least one hex digit after 'x'/'X'
    745           return Token::ILLEGAL;
    746         }
    747         while (IsHexDigit(c0_)) {
    748           AddLiteralCharAdvance();
    749         }
    750       } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
    751         kind = OCTAL;
    752         AddLiteralCharAdvance();
    753         if (!IsOctalDigit(c0_)) {
    754           // we must have at least one octal digit after 'o'/'O'
    755           return Token::ILLEGAL;
    756         }
    757         while (IsOctalDigit(c0_)) {
    758           AddLiteralCharAdvance();
    759         }
    760       } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
    761         kind = BINARY;
    762         AddLiteralCharAdvance();
    763         if (!IsBinaryDigit(c0_)) {
    764           // we must have at least one binary digit after 'b'/'B'
    765           return Token::ILLEGAL;
    766         }
    767         while (IsBinaryDigit(c0_)) {
    768           AddLiteralCharAdvance();
    769         }
    770       } else if ('0' <= c0_ && c0_ <= '7') {
    771         // (possible) octal number
    772         kind = IMPLICIT_OCTAL;
    773         while (true) {
    774           if (c0_ == '8' || c0_ == '9') {
    775             kind = DECIMAL;
    776             break;
    777           }
    778           if (c0_  < '0' || '7'  < c0_) {
    779             // Octal literal finished.
    780             octal_pos_ = Location(start_pos, source_pos());
    781             break;
    782           }
    783           AddLiteralCharAdvance();
    784         }
    785       }
    786     }
    787 
    788     // Parse decimal digits and allow trailing fractional part.
    789     if (kind == DECIMAL) {
    790       ScanDecimalDigits();  // optional
    791       if (c0_ == '.') {
    792         AddLiteralCharAdvance();
    793         ScanDecimalDigits();  // optional
    794       }
    795     }
    796   }
    797 
    798   // scan exponent, if any
    799   if (c0_ == 'e' || c0_ == 'E') {
    800     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    801     if (kind != DECIMAL) return Token::ILLEGAL;
    802     // scan exponent
    803     AddLiteralCharAdvance();
    804     if (c0_ == '+' || c0_ == '-')
    805       AddLiteralCharAdvance();
    806     if (!IsDecimalDigit(c0_)) {
    807       // we must have at least one decimal digit after 'e'/'E'
    808       return Token::ILLEGAL;
    809     }
    810     ScanDecimalDigits();
    811   }
    812 
    813   // The source character immediately following a numeric literal must
    814   // not be an identifier start or a decimal digit; see ECMA-262
    815   // section 7.8.3, page 17 (note that we read only one decimal digit
    816   // if the value is 0).
    817   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    818     return Token::ILLEGAL;
    819 
    820   literal.Complete();
    821 
    822   return Token::NUMBER;
    823 }
    824 
    825 
    826 uc32 Scanner::ScanIdentifierUnicodeEscape() {
    827   Advance();
    828   if (c0_ != 'u') return -1;
    829   Advance();
    830   uc32 result = ScanHexNumber(4);
    831   if (result < 0) PushBack('u');
    832   return result;
    833 }
    834 
    835 
    836 // ----------------------------------------------------------------------------
    837 // Keyword Matcher
    838 
    839 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
    840   KEYWORD_GROUP('b')                                                \
    841   KEYWORD("break", Token::BREAK)                                    \
    842   KEYWORD_GROUP('c')                                                \
    843   KEYWORD("case", Token::CASE)                                      \
    844   KEYWORD("catch", Token::CATCH)                                    \
    845   KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
    846   KEYWORD("const", Token::CONST)                                    \
    847   KEYWORD("continue", Token::CONTINUE)                              \
    848   KEYWORD_GROUP('d')                                                \
    849   KEYWORD("debugger", Token::DEBUGGER)                              \
    850   KEYWORD("default", Token::DEFAULT)                                \
    851   KEYWORD("delete", Token::DELETE)                                  \
    852   KEYWORD("do", Token::DO)                                          \
    853   KEYWORD_GROUP('e')                                                \
    854   KEYWORD("else", Token::ELSE)                                      \
    855   KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
    856   KEYWORD("export", harmony_modules                                 \
    857                     ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)  \
    858   KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
    859   KEYWORD_GROUP('f')                                                \
    860   KEYWORD("false", Token::FALSE_LITERAL)                            \
    861   KEYWORD("finally", Token::FINALLY)                                \
    862   KEYWORD("for", Token::FOR)                                        \
    863   KEYWORD("function", Token::FUNCTION)                              \
    864   KEYWORD_GROUP('i')                                                \
    865   KEYWORD("if", Token::IF)                                          \
    866   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
    867   KEYWORD("import", harmony_modules                                 \
    868                     ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)  \
    869   KEYWORD("in", Token::IN)                                          \
    870   KEYWORD("instanceof", Token::INSTANCEOF)                          \
    871   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
    872   KEYWORD_GROUP('l')                                                \
    873   KEYWORD("let", harmony_scoping                                    \
    874                  ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
    875   KEYWORD_GROUP('n')                                                \
    876   KEYWORD("new", Token::NEW)                                        \
    877   KEYWORD("null", Token::NULL_LITERAL)                              \
    878   KEYWORD_GROUP('p')                                                \
    879   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
    880   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
    881   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
    882   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
    883   KEYWORD_GROUP('r')                                                \
    884   KEYWORD("return", Token::RETURN)                                  \
    885   KEYWORD_GROUP('s')                                                \
    886   KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
    887   KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
    888   KEYWORD("switch", Token::SWITCH)                                  \
    889   KEYWORD_GROUP('t')                                                \
    890   KEYWORD("this", Token::THIS)                                      \
    891   KEYWORD("throw", Token::THROW)                                    \
    892   KEYWORD("true", Token::TRUE_LITERAL)                              \
    893   KEYWORD("try", Token::TRY)                                        \
    894   KEYWORD("typeof", Token::TYPEOF)                                  \
    895   KEYWORD_GROUP('v')                                                \
    896   KEYWORD("var", Token::VAR)                                        \
    897   KEYWORD("void", Token::VOID)                                      \
    898   KEYWORD_GROUP('w')                                                \
    899   KEYWORD("while", Token::WHILE)                                    \
    900   KEYWORD("with", Token::WITH)                                      \
    901   KEYWORD_GROUP('y')                                                \
    902   KEYWORD("yield", Token::YIELD)
    903 
    904 
    905 static Token::Value KeywordOrIdentifierToken(const char* input,
    906                                              int input_length,
    907                                              bool harmony_scoping,
    908                                              bool harmony_modules) {
    909   ASSERT(input_length >= 1);
    910   const int kMinLength = 2;
    911   const int kMaxLength = 10;
    912   if (input_length < kMinLength || input_length > kMaxLength) {
    913     return Token::IDENTIFIER;
    914   }
    915   switch (input[0]) {
    916     default:
    917 #define KEYWORD_GROUP_CASE(ch)                                \
    918       break;                                                  \
    919     case ch:
    920 #define KEYWORD(keyword, token)                               \
    921     {                                                         \
    922       /* 'keyword' is a char array, so sizeof(keyword) is */  \
    923       /* strlen(keyword) plus 1 for the NUL char. */          \
    924       const int keyword_length = sizeof(keyword) - 1;         \
    925       STATIC_ASSERT(keyword_length >= kMinLength);            \
    926       STATIC_ASSERT(keyword_length <= kMaxLength);            \
    927       if (input_length == keyword_length &&                   \
    928           input[1] == keyword[1] &&                           \
    929           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
    930           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
    931           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
    932           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
    933           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
    934           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
    935           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
    936           (keyword_length <= 9 || input[9] == keyword[9])) {  \
    937         return token;                                         \
    938       }                                                       \
    939     }
    940     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
    941   }
    942   return Token::IDENTIFIER;
    943 }
    944 
    945 
    946 Token::Value Scanner::ScanIdentifierOrKeyword() {
    947   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
    948   LiteralScope literal(this);
    949   // Scan identifier start character.
    950   if (c0_ == '\\') {
    951     uc32 c = ScanIdentifierUnicodeEscape();
    952     // Only allow legal identifier start characters.
    953     if (c < 0 ||
    954         c == '\\' ||  // No recursive escapes.
    955         !unicode_cache_->IsIdentifierStart(c)) {
    956       return Token::ILLEGAL;
    957     }
    958     AddLiteralChar(c);
    959     return ScanIdentifierSuffix(&literal);
    960   }
    961 
    962   uc32 first_char = c0_;
    963   Advance();
    964   AddLiteralChar(first_char);
    965 
    966   // Scan the rest of the identifier characters.
    967   while (unicode_cache_->IsIdentifierPart(c0_)) {
    968     if (c0_ != '\\') {
    969       uc32 next_char = c0_;
    970       Advance();
    971       AddLiteralChar(next_char);
    972       continue;
    973     }
    974     // Fallthrough if no longer able to complete keyword.
    975     return ScanIdentifierSuffix(&literal);
    976   }
    977 
    978   literal.Complete();
    979 
    980   if (next_.literal_chars->is_ascii()) {
    981     Vector<const char> chars = next_.literal_chars->ascii_literal();
    982     return KeywordOrIdentifierToken(chars.start(),
    983                                     chars.length(),
    984                                     harmony_scoping_,
    985                                     harmony_modules_);
    986   }
    987 
    988   return Token::IDENTIFIER;
    989 }
    990 
    991 
    992 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
    993   // Scan the rest of the identifier characters.
    994   while (unicode_cache_->IsIdentifierPart(c0_)) {
    995     if (c0_ == '\\') {
    996       uc32 c = ScanIdentifierUnicodeEscape();
    997       // Only allow legal identifier part characters.
    998       if (c < 0 ||
    999           c == '\\' ||
   1000           !unicode_cache_->IsIdentifierPart(c)) {
   1001         return Token::ILLEGAL;
   1002       }
   1003       AddLiteralChar(c);
   1004     } else {
   1005       AddLiteralChar(c0_);
   1006       Advance();
   1007     }
   1008   }
   1009   literal->Complete();
   1010 
   1011   return Token::IDENTIFIER;
   1012 }
   1013 
   1014 
   1015 bool Scanner::ScanRegExpPattern(bool seen_equal) {
   1016   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1017   bool in_character_class = false;
   1018 
   1019   // Previous token is either '/' or '/=', in the second case, the
   1020   // pattern starts at =.
   1021   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1022   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1023 
   1024   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1025   // the scanner should pass uninterpreted bodies to the RegExp
   1026   // constructor.
   1027   LiteralScope literal(this);
   1028   if (seen_equal) {
   1029     AddLiteralChar('=');
   1030   }
   1031 
   1032   while (c0_ != '/' || in_character_class) {
   1033     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1034     if (c0_ == '\\') {  // Escape sequence.
   1035       AddLiteralCharAdvance();
   1036       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1037       AddLiteralCharAdvance();
   1038       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1039       // only "safe" characters are allowed (letters, digits, underscore),
   1040       // otherwise the escape isn't valid and the invalid character has
   1041       // its normal meaning. I.e., we can just continue scanning without
   1042       // worrying whether the following characters are part of the escape
   1043       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1044       // of the escape sequence.
   1045 
   1046       // TODO(896): At some point, parse RegExps more throughly to capture
   1047       // octal esacpes in strict mode.
   1048     } else {  // Unescaped character.
   1049       if (c0_ == '[') in_character_class = true;
   1050       if (c0_ == ']') in_character_class = false;
   1051       AddLiteralCharAdvance();
   1052     }
   1053   }
   1054   Advance();  // consume '/'
   1055 
   1056   literal.Complete();
   1057 
   1058   return true;
   1059 }
   1060 
   1061 
   1062 bool Scanner::ScanLiteralUnicodeEscape() {
   1063   ASSERT(c0_ == '\\');
   1064   uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
   1065   Advance();
   1066   int i = 1;
   1067   if (c0_ == 'u') {
   1068     i++;
   1069     while (i < 6) {
   1070       Advance();
   1071       if (!IsHexDigit(c0_)) break;
   1072       chars_read[i] = c0_;
   1073       i++;
   1074     }
   1075   }
   1076   if (i < 6) {
   1077     // Incomplete escape. Undo all advances and return false.
   1078     while (i > 0) {
   1079       i--;
   1080       PushBack(chars_read[i]);
   1081     }
   1082     return false;
   1083   }
   1084   // Complete escape. Add all chars to current literal buffer.
   1085   for (int i = 0; i < 6; i++) {
   1086     AddLiteralChar(chars_read[i]);
   1087   }
   1088   return true;
   1089 }
   1090 
   1091 
   1092 bool Scanner::ScanRegExpFlags() {
   1093   // Scan regular expression flags.
   1094   LiteralScope literal(this);
   1095   while (unicode_cache_->IsIdentifierPart(c0_)) {
   1096     if (c0_ != '\\') {
   1097       AddLiteralCharAdvance();
   1098     } else {
   1099       if (!ScanLiteralUnicodeEscape()) {
   1100         break;
   1101       }
   1102       Advance();
   1103     }
   1104   }
   1105   literal.Complete();
   1106 
   1107   next_.location.end_pos = source_pos() - 1;
   1108   return true;
   1109 }
   1110 
   1111 } }  // namespace v8::internal
   1112