Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #include <cmath>
     31 
     32 #include "scanner.h"
     33 
     34 #include "../include/v8stdint.h"
     35 #include "char-predicates-inl.h"
     36 #include "conversions-inl.h"
     37 #include "list-inl.h"
     38 
     39 namespace v8 {
     40 namespace internal {
     41 
     42 // ----------------------------------------------------------------------------
     43 // Scanner
     44 
// Creates a scanner that classifies characters through |unicode_cache|.
// No octal literal/escape has been recorded yet (octal_pos_ is the invalid
// location) and all harmony_* feature flags start out false.
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      harmony_scoping_(false),
      harmony_modules_(false),
      harmony_numeric_literals_(false) { }
     51 
     52 
// Binds the scanner to |source| and scans the first token, which Scan()
// stores as the lookahead token in next_.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // Setting the flag first is what lets SkipWhiteSpace() accept a leading
  // '-->' at the very start of the input.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
     64 
     65 
     66 uc32 Scanner::ScanHexNumber(int expected_length) {
     67   ASSERT(expected_length <= 4);  // prevent overflow
     68 
     69   uc32 digits[4] = { 0, 0, 0, 0 };
     70   uc32 x = 0;
     71   for (int i = 0; i < expected_length; i++) {
     72     digits[i] = c0_;
     73     int d = HexValue(c0_);
     74     if (d < 0) {
     75       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
     76       // should be illegal, but other JS VMs just return the
     77       // non-escaped version of the original character.
     78 
     79       // Push back digits that we have advanced past.
     80       for (int j = i-1; j >= 0; j--) {
     81         PushBack(digits[j]);
     82       }
     83       return -1;
     84     }
     85     x = x * 16 + d;
     86     Advance();
     87   }
     88 
     89   return x;
     90 }
     91 
     92 
     93 // Ensure that tokens can be stored in a byte.
     94 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
     95 
     96 // Table of one-character tokens, by character (0x00..0x7f only).
     97 static const byte one_char_tokens[] = {
     98   Token::ILLEGAL,
     99   Token::ILLEGAL,
    100   Token::ILLEGAL,
    101   Token::ILLEGAL,
    102   Token::ILLEGAL,
    103   Token::ILLEGAL,
    104   Token::ILLEGAL,
    105   Token::ILLEGAL,
    106   Token::ILLEGAL,
    107   Token::ILLEGAL,
    108   Token::ILLEGAL,
    109   Token::ILLEGAL,
    110   Token::ILLEGAL,
    111   Token::ILLEGAL,
    112   Token::ILLEGAL,
    113   Token::ILLEGAL,
    114   Token::ILLEGAL,
    115   Token::ILLEGAL,
    116   Token::ILLEGAL,
    117   Token::ILLEGAL,
    118   Token::ILLEGAL,
    119   Token::ILLEGAL,
    120   Token::ILLEGAL,
    121   Token::ILLEGAL,
    122   Token::ILLEGAL,
    123   Token::ILLEGAL,
    124   Token::ILLEGAL,
    125   Token::ILLEGAL,
    126   Token::ILLEGAL,
    127   Token::ILLEGAL,
    128   Token::ILLEGAL,
    129   Token::ILLEGAL,
    130   Token::ILLEGAL,
    131   Token::ILLEGAL,
    132   Token::ILLEGAL,
    133   Token::ILLEGAL,
    134   Token::ILLEGAL,
    135   Token::ILLEGAL,
    136   Token::ILLEGAL,
    137   Token::ILLEGAL,
    138   Token::LPAREN,       // 0x28
    139   Token::RPAREN,       // 0x29
    140   Token::ILLEGAL,
    141   Token::ILLEGAL,
    142   Token::COMMA,        // 0x2c
    143   Token::ILLEGAL,
    144   Token::ILLEGAL,
    145   Token::ILLEGAL,
    146   Token::ILLEGAL,
    147   Token::ILLEGAL,
    148   Token::ILLEGAL,
    149   Token::ILLEGAL,
    150   Token::ILLEGAL,
    151   Token::ILLEGAL,
    152   Token::ILLEGAL,
    153   Token::ILLEGAL,
    154   Token::ILLEGAL,
    155   Token::ILLEGAL,
    156   Token::COLON,        // 0x3a
    157   Token::SEMICOLON,    // 0x3b
    158   Token::ILLEGAL,
    159   Token::ILLEGAL,
    160   Token::ILLEGAL,
    161   Token::CONDITIONAL,  // 0x3f
    162   Token::ILLEGAL,
    163   Token::ILLEGAL,
    164   Token::ILLEGAL,
    165   Token::ILLEGAL,
    166   Token::ILLEGAL,
    167   Token::ILLEGAL,
    168   Token::ILLEGAL,
    169   Token::ILLEGAL,
    170   Token::ILLEGAL,
    171   Token::ILLEGAL,
    172   Token::ILLEGAL,
    173   Token::ILLEGAL,
    174   Token::ILLEGAL,
    175   Token::ILLEGAL,
    176   Token::ILLEGAL,
    177   Token::ILLEGAL,
    178   Token::ILLEGAL,
    179   Token::ILLEGAL,
    180   Token::ILLEGAL,
    181   Token::ILLEGAL,
    182   Token::ILLEGAL,
    183   Token::ILLEGAL,
    184   Token::ILLEGAL,
    185   Token::ILLEGAL,
    186   Token::ILLEGAL,
    187   Token::ILLEGAL,
    188   Token::ILLEGAL,
    189   Token::LBRACK,     // 0x5b
    190   Token::ILLEGAL,
    191   Token::RBRACK,     // 0x5d
    192   Token::ILLEGAL,
    193   Token::ILLEGAL,
    194   Token::ILLEGAL,
    195   Token::ILLEGAL,
    196   Token::ILLEGAL,
    197   Token::ILLEGAL,
    198   Token::ILLEGAL,
    199   Token::ILLEGAL,
    200   Token::ILLEGAL,
    201   Token::ILLEGAL,
    202   Token::ILLEGAL,
    203   Token::ILLEGAL,
    204   Token::ILLEGAL,
    205   Token::ILLEGAL,
    206   Token::ILLEGAL,
    207   Token::ILLEGAL,
    208   Token::ILLEGAL,
    209   Token::ILLEGAL,
    210   Token::ILLEGAL,
    211   Token::ILLEGAL,
    212   Token::ILLEGAL,
    213   Token::ILLEGAL,
    214   Token::ILLEGAL,
    215   Token::ILLEGAL,
    216   Token::ILLEGAL,
    217   Token::ILLEGAL,
    218   Token::ILLEGAL,
    219   Token::ILLEGAL,
    220   Token::ILLEGAL,
    221   Token::LBRACE,       // 0x7b
    222   Token::ILLEGAL,
    223   Token::RBRACE,       // 0x7d
    224   Token::BIT_NOT,      // 0x7e
    225   Token::ILLEGAL
    226 };
    227 
    228 
    229 Token::Value Scanner::Next() {
    230   current_ = next_;
    231   has_line_terminator_before_next_ = false;
    232   has_multiline_comment_before_next_ = false;
    233   if (static_cast<unsigned>(c0_) <= 0x7f) {
    234     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    235     if (token != Token::ILLEGAL) {
    236       int pos = source_pos();
    237       next_.token = token;
    238       next_.location.beg_pos = pos;
    239       next_.location.end_pos = pos + 1;
    240       Advance();
    241       return current_.token;
    242     }
    243   }
    244   Scan();
    245   return current_.token;
    246 }
    247 
    248 
    249 static inline bool IsByteOrderMark(uc32 c) {
    250   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    251   // Unicode character; this implies that in a Unicode context the
    252   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    253   // character expressed in little-endian byte order (since it could
    254   // not be a U+FFFE character expressed in big-endian byte
    255   // order). Nevertheless, we check for it to be compatible with
    256   // Spidermonkey.
    257   return c == 0xFEFF || c == 0xFFFE;
    258 }
    259 
    260 
    261 bool Scanner::SkipWhiteSpace() {
    262   int start_position = source_pos();
    263 
    264   while (true) {
    265     // We treat byte-order marks (BOMs) as whitespace for better
    266     // compatibility with Spidermonkey and other JavaScript engines.
    267     while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
    268       // IsWhiteSpace() includes line terminators!
    269       if (unicode_cache_->IsLineTerminator(c0_)) {
    270         // Ignore line terminators, but remember them. This is necessary
    271         // for automatic semicolon insertion.
    272         has_line_terminator_before_next_ = true;
    273       }
    274       Advance();
    275     }
    276 
    277     // If there is an HTML comment end '-->' at the beginning of a
    278     // line (with only whitespace in front of it), we treat the rest
    279     // of the line as a comment. This is in line with the way
    280     // SpiderMonkey handles it.
    281     if (c0_ == '-' && has_line_terminator_before_next_) {
    282       Advance();
    283       if (c0_ == '-') {
    284         Advance();
    285         if (c0_ == '>') {
    286           // Treat the rest of the line as a comment.
    287           SkipSingleLineComment();
    288           // Continue skipping white space after the comment.
    289           continue;
    290         }
    291         PushBack('-');  // undo Advance()
    292       }
    293       PushBack('-');  // undo Advance()
    294     }
    295     // Return whether or not we skipped any characters.
    296     return source_pos() != start_position;
    297   }
    298 }
    299 
    300 
    301 Token::Value Scanner::SkipSingleLineComment() {
    302   Advance();
    303 
    304   // The line terminator at the end of the line is not considered
    305   // to be part of the single-line comment; it is recognized
    306   // separately by the lexical grammar and becomes part of the
    307   // stream of input elements for the syntactic grammar (see
    308   // ECMA-262, section 7.4).
    309   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    310     Advance();
    311   }
    312 
    313   return Token::WHITESPACE;
    314 }
    315 
    316 
    317 Token::Value Scanner::SkipMultiLineComment() {
    318   ASSERT(c0_ == '*');
    319   Advance();
    320 
    321   while (c0_ >= 0) {
    322     uc32 ch = c0_;
    323     Advance();
    324     if (unicode_cache_->IsLineTerminator(ch)) {
    325       // Following ECMA-262, section 7.4, a comment containing
    326       // a newline will make the comment count as a line-terminator.
    327       has_multiline_comment_before_next_ = true;
    328     }
    329     // If we have reached the end of the multi-line comment, we
    330     // consume the '/' and insert a whitespace. This way all
    331     // multi-line comments are treated as whitespace.
    332     if (ch == '*' && c0_ == '/') {
    333       c0_ = ' ';
    334       return Token::WHITESPACE;
    335     }
    336   }
    337 
    338   // Unterminated multi-line comment.
    339   return Token::ILLEGAL;
    340 }
    341 
    342 
    343 Token::Value Scanner::ScanHtmlComment() {
    344   // Check for <!-- comments.
    345   ASSERT(c0_ == '!');
    346   Advance();
    347   if (c0_ == '-') {
    348     Advance();
    349     if (c0_ == '-') return SkipSingleLineComment();
    350     PushBack('-');  // undo Advance()
    351   }
    352   PushBack('!');  // undo Advance()
    353   ASSERT(c0_ == '!');
    354   return Token::LT;
    355 }
    356 
    357 
    358 void Scanner::Scan() {
    359   next_.literal_chars = NULL;
    360   Token::Value token;
    361   do {
    362     // Remember the position of the next token
    363     next_.location.beg_pos = source_pos();
    364 
    365     switch (c0_) {
    366       case ' ':
    367       case '\t':
    368         Advance();
    369         token = Token::WHITESPACE;
    370         break;
    371 
    372       case '\n':
    373         Advance();
    374         has_line_terminator_before_next_ = true;
    375         token = Token::WHITESPACE;
    376         break;
    377 
    378       case '"': case '\'':
    379         token = ScanString();
    380         break;
    381 
    382       case '<':
    383         // < <= << <<= <!--
    384         Advance();
    385         if (c0_ == '=') {
    386           token = Select(Token::LTE);
    387         } else if (c0_ == '<') {
    388           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    389         } else if (c0_ == '!') {
    390           token = ScanHtmlComment();
    391         } else {
    392           token = Token::LT;
    393         }
    394         break;
    395 
    396       case '>':
    397         // > >= >> >>= >>> >>>=
    398         Advance();
    399         if (c0_ == '=') {
    400           token = Select(Token::GTE);
    401         } else if (c0_ == '>') {
    402           // >> >>= >>> >>>=
    403           Advance();
    404           if (c0_ == '=') {
    405             token = Select(Token::ASSIGN_SAR);
    406           } else if (c0_ == '>') {
    407             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    408           } else {
    409             token = Token::SAR;
    410           }
    411         } else {
    412           token = Token::GT;
    413         }
    414         break;
    415 
    416       case '=':
    417         // = == ===
    418         Advance();
    419         if (c0_ == '=') {
    420           token = Select('=', Token::EQ_STRICT, Token::EQ);
    421         } else {
    422           token = Token::ASSIGN;
    423         }
    424         break;
    425 
    426       case '!':
    427         // ! != !==
    428         Advance();
    429         if (c0_ == '=') {
    430           token = Select('=', Token::NE_STRICT, Token::NE);
    431         } else {
    432           token = Token::NOT;
    433         }
    434         break;
    435 
    436       case '+':
    437         // + ++ +=
    438         Advance();
    439         if (c0_ == '+') {
    440           token = Select(Token::INC);
    441         } else if (c0_ == '=') {
    442           token = Select(Token::ASSIGN_ADD);
    443         } else {
    444           token = Token::ADD;
    445         }
    446         break;
    447 
    448       case '-':
    449         // - -- --> -=
    450         Advance();
    451         if (c0_ == '-') {
    452           Advance();
    453           if (c0_ == '>' && has_line_terminator_before_next_) {
    454             // For compatibility with SpiderMonkey, we skip lines that
    455             // start with an HTML comment end '-->'.
    456             token = SkipSingleLineComment();
    457           } else {
    458             token = Token::DEC;
    459           }
    460         } else if (c0_ == '=') {
    461           token = Select(Token::ASSIGN_SUB);
    462         } else {
    463           token = Token::SUB;
    464         }
    465         break;
    466 
    467       case '*':
    468         // * *=
    469         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
    470         break;
    471 
    472       case '%':
    473         // % %=
    474         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    475         break;
    476 
    477       case '/':
    478         // /  // /* /=
    479         Advance();
    480         if (c0_ == '/') {
    481           token = SkipSingleLineComment();
    482         } else if (c0_ == '*') {
    483           token = SkipMultiLineComment();
    484         } else if (c0_ == '=') {
    485           token = Select(Token::ASSIGN_DIV);
    486         } else {
    487           token = Token::DIV;
    488         }
    489         break;
    490 
    491       case '&':
    492         // & && &=
    493         Advance();
    494         if (c0_ == '&') {
    495           token = Select(Token::AND);
    496         } else if (c0_ == '=') {
    497           token = Select(Token::ASSIGN_BIT_AND);
    498         } else {
    499           token = Token::BIT_AND;
    500         }
    501         break;
    502 
    503       case '|':
    504         // | || |=
    505         Advance();
    506         if (c0_ == '|') {
    507           token = Select(Token::OR);
    508         } else if (c0_ == '=') {
    509           token = Select(Token::ASSIGN_BIT_OR);
    510         } else {
    511           token = Token::BIT_OR;
    512         }
    513         break;
    514 
    515       case '^':
    516         // ^ ^=
    517         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    518         break;
    519 
    520       case '.':
    521         // . Number
    522         Advance();
    523         if (IsDecimalDigit(c0_)) {
    524           token = ScanNumber(true);
    525         } else {
    526           token = Token::PERIOD;
    527         }
    528         break;
    529 
    530       case ':':
    531         token = Select(Token::COLON);
    532         break;
    533 
    534       case ';':
    535         token = Select(Token::SEMICOLON);
    536         break;
    537 
    538       case ',':
    539         token = Select(Token::COMMA);
    540         break;
    541 
    542       case '(':
    543         token = Select(Token::LPAREN);
    544         break;
    545 
    546       case ')':
    547         token = Select(Token::RPAREN);
    548         break;
    549 
    550       case '[':
    551         token = Select(Token::LBRACK);
    552         break;
    553 
    554       case ']':
    555         token = Select(Token::RBRACK);
    556         break;
    557 
    558       case '{':
    559         token = Select(Token::LBRACE);
    560         break;
    561 
    562       case '}':
    563         token = Select(Token::RBRACE);
    564         break;
    565 
    566       case '?':
    567         token = Select(Token::CONDITIONAL);
    568         break;
    569 
    570       case '~':
    571         token = Select(Token::BIT_NOT);
    572         break;
    573 
    574       default:
    575         if (unicode_cache_->IsIdentifierStart(c0_)) {
    576           token = ScanIdentifierOrKeyword();
    577         } else if (IsDecimalDigit(c0_)) {
    578           token = ScanNumber(false);
    579         } else if (SkipWhiteSpace()) {
    580           token = Token::WHITESPACE;
    581         } else if (c0_ < 0) {
    582           token = Token::EOS;
    583         } else {
    584           token = Select(Token::ILLEGAL);
    585         }
    586         break;
    587     }
    588 
    589     // Continue scanning for tokens as long as we're just skipping
    590     // whitespace.
    591   } while (token == Token::WHITESPACE);
    592 
    593   next_.location.end_pos = source_pos();
    594   next_.token = token;
    595 }
    596 
    597 
    598 void Scanner::SeekForward(int pos) {
    599   // After this call, we will have the token at the given position as
    600   // the "next" token. The "current" token will be invalid.
    601   if (pos == next_.location.beg_pos) return;
    602   int current_pos = source_pos();
    603   ASSERT_EQ(next_.location.end_pos, current_pos);
    604   // Positions inside the lookahead token aren't supported.
    605   ASSERT(pos >= current_pos);
    606   if (pos != current_pos) {
    607     source_->SeekForward(pos - source_->pos());
    608     Advance();
    609     // This function is only called to seek to the location
    610     // of the end of a function (at the "}" token). It doesn't matter
    611     // whether there was a line terminator in the part we skip.
    612     has_line_terminator_before_next_ = false;
    613     has_multiline_comment_before_next_ = false;
    614   }
    615   Scan();
    616 }
    617 
    618 
    619 bool Scanner::ScanEscape() {
    620   uc32 c = c0_;
    621   Advance();
    622 
    623   // Skip escaped newlines.
    624   if (unicode_cache_->IsLineTerminator(c)) {
    625     // Allow CR+LF newlines in multiline string literals.
    626     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
    627     // Allow LF+CR newlines in multiline string literals.
    628     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
    629     return true;
    630   }
    631 
    632   switch (c) {
    633     case '\'':  // fall through
    634     case '"' :  // fall through
    635     case '\\': break;
    636     case 'b' : c = '\b'; break;
    637     case 'f' : c = '\f'; break;
    638     case 'n' : c = '\n'; break;
    639     case 'r' : c = '\r'; break;
    640     case 't' : c = '\t'; break;
    641     case 'u' : {
    642       c = ScanHexNumber(4);
    643       if (c < 0) return false;
    644       break;
    645     }
    646     case 'v' : c = '\v'; break;
    647     case 'x' : {
    648       c = ScanHexNumber(2);
    649       if (c < 0) return false;
    650       break;
    651     }
    652     case '0' :  // fall through
    653     case '1' :  // fall through
    654     case '2' :  // fall through
    655     case '3' :  // fall through
    656     case '4' :  // fall through
    657     case '5' :  // fall through
    658     case '6' :  // fall through
    659     case '7' : c = ScanOctalEscape(c, 2); break;
    660   }
    661 
    662   // According to ECMA-262, section 7.8.4, characters not covered by the
    663   // above cases should be illegal, but they are commonly handled as
    664   // non-escaped characters by JS VMs.
    665   AddLiteralChar(c);
    666   return true;
    667 }
    668 
    669 
    670 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    671 // ECMA-262. Other JS VMs support them.
    672 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    673   uc32 x = c - '0';
    674   int i = 0;
    675   for (; i < length; i++) {
    676     int d = c0_ - '0';
    677     if (d < 0 || d > 7) break;
    678     int nx = x * 8 + d;
    679     if (nx >= 256) break;
    680     x = nx;
    681     Advance();
    682   }
    683   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
    684   // Remember the position of octal escape sequences so that an error
    685   // can be reported later (in strict mode).
    686   // We don't report the error immediately, because the octal escape can
    687   // occur before the "use strict" directive.
    688   if (c != '0' || i > 0) {
    689     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    690   }
    691   return x;
    692 }
    693 
    694 
    695 Token::Value Scanner::ScanString() {
    696   uc32 quote = c0_;
    697   Advance();  // consume quote
    698 
    699   LiteralScope literal(this);
    700   while (c0_ != quote && c0_ >= 0
    701          && !unicode_cache_->IsLineTerminator(c0_)) {
    702     uc32 c = c0_;
    703     Advance();
    704     if (c == '\\') {
    705       if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
    706     } else {
    707       AddLiteralChar(c);
    708     }
    709   }
    710   if (c0_ != quote) return Token::ILLEGAL;
    711   literal.Complete();
    712 
    713   Advance();  // consume quote
    714   return Token::STRING;
    715 }
    716 
    717 
    718 void Scanner::ScanDecimalDigits() {
    719   while (IsDecimalDigit(c0_))
    720     AddLiteralCharAdvance();
    721 }
    722 
    723 
    724 Token::Value Scanner::ScanNumber(bool seen_period) {
    725   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
    726 
    727   enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
    728 
    729   LiteralScope literal(this);
    730   if (seen_period) {
    731     // we have already seen a decimal point of the float
    732     AddLiteralChar('.');
    733     ScanDecimalDigits();  // we know we have at least one digit
    734 
    735   } else {
    736     // if the first character is '0' we must check for octals and hex
    737     if (c0_ == '0') {
    738       int start_pos = source_pos();  // For reporting octal positions.
    739       AddLiteralCharAdvance();
    740 
    741       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
    742       // an octal number.
    743       if (c0_ == 'x' || c0_ == 'X') {
    744         // hex number
    745         kind = HEX;
    746         AddLiteralCharAdvance();
    747         if (!IsHexDigit(c0_)) {
    748           // we must have at least one hex digit after 'x'/'X'
    749           return Token::ILLEGAL;
    750         }
    751         while (IsHexDigit(c0_)) {
    752           AddLiteralCharAdvance();
    753         }
    754       } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
    755         kind = OCTAL;
    756         AddLiteralCharAdvance();
    757         if (!IsOctalDigit(c0_)) {
    758           // we must have at least one octal digit after 'o'/'O'
    759           return Token::ILLEGAL;
    760         }
    761         while (IsOctalDigit(c0_)) {
    762           AddLiteralCharAdvance();
    763         }
    764       } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
    765         kind = BINARY;
    766         AddLiteralCharAdvance();
    767         if (!IsBinaryDigit(c0_)) {
    768           // we must have at least one binary digit after 'b'/'B'
    769           return Token::ILLEGAL;
    770         }
    771         while (IsBinaryDigit(c0_)) {
    772           AddLiteralCharAdvance();
    773         }
    774       } else if ('0' <= c0_ && c0_ <= '7') {
    775         // (possible) octal number
    776         kind = IMPLICIT_OCTAL;
    777         while (true) {
    778           if (c0_ == '8' || c0_ == '9') {
    779             kind = DECIMAL;
    780             break;
    781           }
    782           if (c0_  < '0' || '7'  < c0_) {
    783             // Octal literal finished.
    784             octal_pos_ = Location(start_pos, source_pos());
    785             break;
    786           }
    787           AddLiteralCharAdvance();
    788         }
    789       }
    790     }
    791 
    792     // Parse decimal digits and allow trailing fractional part.
    793     if (kind == DECIMAL) {
    794       ScanDecimalDigits();  // optional
    795       if (c0_ == '.') {
    796         AddLiteralCharAdvance();
    797         ScanDecimalDigits();  // optional
    798       }
    799     }
    800   }
    801 
    802   // scan exponent, if any
    803   if (c0_ == 'e' || c0_ == 'E') {
    804     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    805     if (kind != DECIMAL) return Token::ILLEGAL;
    806     // scan exponent
    807     AddLiteralCharAdvance();
    808     if (c0_ == '+' || c0_ == '-')
    809       AddLiteralCharAdvance();
    810     if (!IsDecimalDigit(c0_)) {
    811       // we must have at least one decimal digit after 'e'/'E'
    812       return Token::ILLEGAL;
    813     }
    814     ScanDecimalDigits();
    815   }
    816 
    817   // The source character immediately following a numeric literal must
    818   // not be an identifier start or a decimal digit; see ECMA-262
    819   // section 7.8.3, page 17 (note that we read only one decimal digit
    820   // if the value is 0).
    821   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    822     return Token::ILLEGAL;
    823 
    824   literal.Complete();
    825 
    826   return Token::NUMBER;
    827 }
    828 
    829 
    830 uc32 Scanner::ScanIdentifierUnicodeEscape() {
    831   Advance();
    832   if (c0_ != 'u') return -1;
    833   Advance();
    834   uc32 result = ScanHexNumber(4);
    835   if (result < 0) PushBack('u');
    836   return result;
    837 }
    838 
    839 
    840 // ----------------------------------------------------------------------------
    841 // Keyword Matcher
    842 
// Table of the keywords and reserved words known to the scanner, written
// as a macro that is parameterized by two other macros:
//   KEYWORD_GROUP(ch)       appears once before the entries that start
//                           with the character ch;
//   KEYWORD(string, token)  appears once per word and names the token for
//                           that word.
// The token expressions of "export", "import" and "let" refer to
// harmony_modules and harmony_scoping, so names with those spellings must
// be in scope wherever KEYWORDS is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
  KEYWORD_GROUP('b')                                                \
  KEYWORD("break", Token::BREAK)                                    \
  KEYWORD_GROUP('c')                                                \
  KEYWORD("case", Token::CASE)                                      \
  KEYWORD("catch", Token::CATCH)                                    \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("const", Token::CONST)                                    \
  KEYWORD("continue", Token::CONTINUE)                              \
  KEYWORD_GROUP('d')                                                \
  KEYWORD("debugger", Token::DEBUGGER)                              \
  KEYWORD("default", Token::DEFAULT)                                \
  KEYWORD("delete", Token::DELETE)                                  \
  KEYWORD("do", Token::DO)                                          \
  KEYWORD_GROUP('e')                                                \
  KEYWORD("else", Token::ELSE)                                      \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
  KEYWORD("export", harmony_modules                                 \
                    ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)  \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
  KEYWORD_GROUP('f')                                                \
  KEYWORD("false", Token::FALSE_LITERAL)                            \
  KEYWORD("finally", Token::FINALLY)                                \
  KEYWORD("for", Token::FOR)                                        \
  KEYWORD("function", Token::FUNCTION)                              \
  KEYWORD_GROUP('i')                                                \
  KEYWORD("if", Token::IF)                                          \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
  KEYWORD("import", harmony_modules                                 \
                    ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)  \
  KEYWORD("in", Token::IN)                                          \
  KEYWORD("instanceof", Token::INSTANCEOF)                          \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD_GROUP('l')                                                \
  KEYWORD("let", harmony_scoping                                    \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n')                                                \
  KEYWORD("new", Token::NEW)                                        \
  KEYWORD("null", Token::NULL_LITERAL)                              \
  KEYWORD_GROUP('p')                                                \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD_GROUP('r')                                                \
  KEYWORD("return", Token::RETURN)                                  \
  KEYWORD_GROUP('s')                                                \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("switch", Token::SWITCH)                                  \
  KEYWORD_GROUP('t')                                                \
  KEYWORD("this", Token::THIS)                                      \
  KEYWORD("throw", Token::THROW)                                    \
  KEYWORD("true", Token::TRUE_LITERAL)                              \
  KEYWORD("try", Token::TRY)                                        \
  KEYWORD("typeof", Token::TYPEOF)                                  \
  KEYWORD_GROUP('v')                                                \
  KEYWORD("var", Token::VAR)                                        \
  KEYWORD("void", Token::VOID)                                      \
  KEYWORD_GROUP('w')                                                \
  KEYWORD("while", Token::WHILE)                                    \
  KEYWORD("with", Token::WITH)                                      \
  KEYWORD_GROUP('y')                                                \
  KEYWORD("yield", Token::YIELD)
    907 
    908 
    909 static Token::Value KeywordOrIdentifierToken(const char* input,
    910                                              int input_length,
    911                                              bool harmony_scoping,
    912                                              bool harmony_modules) {
    913   ASSERT(input_length >= 1);
    914   const int kMinLength = 2;
    915   const int kMaxLength = 10;
    916   if (input_length < kMinLength || input_length > kMaxLength) {
    917     return Token::IDENTIFIER;
    918   }
    919   switch (input[0]) {
    920     default:
    921 #define KEYWORD_GROUP_CASE(ch)                                \
    922       break;                                                  \
    923     case ch:
    924 #define KEYWORD(keyword, token)                               \
    925     {                                                         \
    926       /* 'keyword' is a char array, so sizeof(keyword) is */  \
    927       /* strlen(keyword) plus 1 for the NUL char. */          \
    928       const int keyword_length = sizeof(keyword) - 1;         \
    929       STATIC_ASSERT(keyword_length >= kMinLength);            \
    930       STATIC_ASSERT(keyword_length <= kMaxLength);            \
    931       if (input_length == keyword_length &&                   \
    932           input[1] == keyword[1] &&                           \
    933           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
    934           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
    935           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
    936           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
    937           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
    938           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
    939           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
    940           (keyword_length <= 9 || input[9] == keyword[9])) {  \
    941         return token;                                         \
    942       }                                                       \
    943     }
    944     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
    945   }
    946   return Token::IDENTIFIER;
    947 }
    948 
    949 
    950 Token::Value Scanner::ScanIdentifierOrKeyword() {
    951   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
    952   LiteralScope literal(this);
    953   // Scan identifier start character.
    954   if (c0_ == '\\') {
    955     uc32 c = ScanIdentifierUnicodeEscape();
    956     // Only allow legal identifier start characters.
    957     if (c < 0 ||
    958         c == '\\' ||  // No recursive escapes.
    959         !unicode_cache_->IsIdentifierStart(c)) {
    960       return Token::ILLEGAL;
    961     }
    962     AddLiteralChar(c);
    963     return ScanIdentifierSuffix(&literal);
    964   }
    965 
    966   uc32 first_char = c0_;
    967   Advance();
    968   AddLiteralChar(first_char);
    969 
    970   // Scan the rest of the identifier characters.
    971   while (unicode_cache_->IsIdentifierPart(c0_)) {
    972     if (c0_ != '\\') {
    973       uc32 next_char = c0_;
    974       Advance();
    975       AddLiteralChar(next_char);
    976       continue;
    977     }
    978     // Fallthrough if no longer able to complete keyword.
    979     return ScanIdentifierSuffix(&literal);
    980   }
    981 
    982   literal.Complete();
    983 
    984   if (next_.literal_chars->is_ascii()) {
    985     Vector<const char> chars = next_.literal_chars->ascii_literal();
    986     return KeywordOrIdentifierToken(chars.start(),
    987                                     chars.length(),
    988                                     harmony_scoping_,
    989                                     harmony_modules_);
    990   }
    991 
    992   return Token::IDENTIFIER;
    993 }
    994 
    995 
    996 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
    997   // Scan the rest of the identifier characters.
    998   while (unicode_cache_->IsIdentifierPart(c0_)) {
    999     if (c0_ == '\\') {
   1000       uc32 c = ScanIdentifierUnicodeEscape();
   1001       // Only allow legal identifier part characters.
   1002       if (c < 0 ||
   1003           c == '\\' ||
   1004           !unicode_cache_->IsIdentifierPart(c)) {
   1005         return Token::ILLEGAL;
   1006       }
   1007       AddLiteralChar(c);
   1008     } else {
   1009       AddLiteralChar(c0_);
   1010       Advance();
   1011     }
   1012   }
   1013   literal->Complete();
   1014 
   1015   return Token::IDENTIFIER;
   1016 }
   1017 
   1018 
   1019 bool Scanner::ScanRegExpPattern(bool seen_equal) {
   1020   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1021   bool in_character_class = false;
   1022 
   1023   // Previous token is either '/' or '/=', in the second case, the
   1024   // pattern starts at =.
   1025   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1026   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1027 
   1028   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1029   // the scanner should pass uninterpreted bodies to the RegExp
   1030   // constructor.
   1031   LiteralScope literal(this);
   1032   if (seen_equal) {
   1033     AddLiteralChar('=');
   1034   }
   1035 
   1036   while (c0_ != '/' || in_character_class) {
   1037     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1038     if (c0_ == '\\') {  // Escape sequence.
   1039       AddLiteralCharAdvance();
   1040       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1041       AddLiteralCharAdvance();
   1042       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1043       // only "safe" characters are allowed (letters, digits, underscore),
   1044       // otherwise the escape isn't valid and the invalid character has
   1045       // its normal meaning. I.e., we can just continue scanning without
   1046       // worrying whether the following characters are part of the escape
   1047       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1048       // of the escape sequence.
   1049 
   1050       // TODO(896): At some point, parse RegExps more throughly to capture
   1051       // octal esacpes in strict mode.
   1052     } else {  // Unescaped character.
   1053       if (c0_ == '[') in_character_class = true;
   1054       if (c0_ == ']') in_character_class = false;
   1055       AddLiteralCharAdvance();
   1056     }
   1057   }
   1058   Advance();  // consume '/'
   1059 
   1060   literal.Complete();
   1061 
   1062   return true;
   1063 }
   1064 
   1065 
   1066 bool Scanner::ScanLiteralUnicodeEscape() {
   1067   ASSERT(c0_ == '\\');
   1068   uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
   1069   Advance();
   1070   int i = 1;
   1071   if (c0_ == 'u') {
   1072     i++;
   1073     while (i < 6) {
   1074       Advance();
   1075       if (!IsHexDigit(c0_)) break;
   1076       chars_read[i] = c0_;
   1077       i++;
   1078     }
   1079   }
   1080   if (i < 6) {
   1081     // Incomplete escape. Undo all advances and return false.
   1082     while (i > 0) {
   1083       i--;
   1084       PushBack(chars_read[i]);
   1085     }
   1086     return false;
   1087   }
   1088   // Complete escape. Add all chars to current literal buffer.
   1089   for (int i = 0; i < 6; i++) {
   1090     AddLiteralChar(chars_read[i]);
   1091   }
   1092   return true;
   1093 }
   1094 
   1095 
   1096 bool Scanner::ScanRegExpFlags() {
   1097   // Scan regular expression flags.
   1098   LiteralScope literal(this);
   1099   while (unicode_cache_->IsIdentifierPart(c0_)) {
   1100     if (c0_ != '\\') {
   1101       AddLiteralCharAdvance();
   1102     } else {
   1103       if (!ScanLiteralUnicodeEscape()) {
   1104         break;
   1105       }
   1106       Advance();
   1107     }
   1108   }
   1109   literal.Complete();
   1110 
   1111   next_.location.end_pos = source_pos() - 1;
   1112   return true;
   1113 }
   1114 
   1115 
   1116 int DuplicateFinder::AddAsciiSymbol(Vector<const char> key, int value) {
   1117   return AddSymbol(Vector<const byte>::cast(key), true, value);
   1118 }
   1119 
   1120 
   // Registers a UTF-16 symbol (viewed as raw bytes) and returns whatever
   // AddSymbol() reports for it.
   int DuplicateFinder::AddUtf16Symbol(Vector<const uint16_t> key, int value) {
     const bool kIsAscii = false;
     Vector<const byte> raw_key = Vector<const byte>::cast(key);
     return AddSymbol(raw_key, kIsAscii, value);
   }
   1124 
   1125 
   // Records 'value' for the given key and returns the value that was stored
   // for that key before this call. The new stored value is the bitwise OR of
   // the old one and 'value', so values accumulate as bit flags.
   // NOTE(review): the 'true' passed to Lookup presumably requests insertion of
   // a missing entry (with an initial value of 0) -- confirm against HashMap.
   int DuplicateFinder::AddSymbol(Vector<const byte> key,
                                  bool is_ascii,
                                  int value) {
     const uint32_t key_hash = Hash(key, is_ascii);
     // The map stores a length-prefixed copy of the key, not the caller's data.
     byte* stored_key = BackupKey(key, is_ascii);
     HashMap::Entry* slot = map_.Lookup(stored_key, key_hash, true);

     // The int payload lives directly in the entry's void* value slot.
     const intptr_t previous_bits = reinterpret_cast<intptr_t>(slot->value);
     const int previous = static_cast<int>(previous_bits);
     const intptr_t merged_bits = static_cast<intptr_t>(value | previous);
     slot->value = reinterpret_cast<void*>(merged_bits);
     return previous;
   }
   1137 
   1138 
   // Registers a numeric literal under a canonical spelling, so that different
   // spellings of the same number end up under the same key. Returns whatever
   // the underlying symbol registration returns.
   int DuplicateFinder::AddNumber(Vector<const char> key, int value) {
     ASSERT(key.length() > 0);
     // Fast path: literals already in canonical form are used verbatim.
     if (IsNumberCanonical(key)) return AddAsciiSymbol(key, value);

     // Slow path: parse the literal and print it back out.
     const int flags =
         ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
     const double parsed = StringToDouble(unicode_constants_, key, flags, 0.0);

     const char* canonical;
     int canonical_length;
     if (std::isfinite(parsed)) {
       canonical = DoubleToCString(parsed,
                                   Vector<char>(number_buffer_, kBufferSize));
       canonical_length = StrLength(canonical);
     } else {
       // Every non-finite result shares the single key "Infinity".
       canonical = "Infinity";
       canonical_length = 8;  // strlen("Infinity");
     }

     Vector<const byte> canonical_key(reinterpret_cast<const byte*>(canonical),
                                      canonical_length);
     return AddSymbol(canonical_key, true, value);
   }
   1161 
   1162 
   1163 bool DuplicateFinder::IsNumberCanonical(Vector<const char> number) {
   1164   // Test for a safe approximation of number literals that are already
   1165   // in canonical form: max 15 digits, no leading zeroes, except an
   1166   // integer part that is a single zero, and no trailing zeros below
   1167   // the decimal point.
   1168   int pos = 0;
   1169   int length = number.length();
   1170   if (number.length() > 15) return false;
   1171   if (number[pos] == '0') {
   1172     pos++;
   1173   } else {
   1174     while (pos < length &&
   1175            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
   1176   }
   1177   if (length == pos) return true;
   1178   if (number[pos] != '.') return false;
   1179   pos++;
   1180   bool invalid_last_digit = true;
   1181   while (pos < length) {
   1182     byte digit = number[pos] - '0';
   1183     if (digit > '9' - '0') return false;
   1184     invalid_last_digit = (digit == 0);
   1185     pos++;
   1186   }
   1187   return !invalid_last_digit;
   1188 }
   1189 
   1190 
   // Computes the hash of a key. This is a primitive hash, close to the one
   // used for strings, except that it is seeded with the key's length and its
   // ASCII-ness, so the same bytes hash differently for the two encodings.
   uint32_t DuplicateFinder::Hash(Vector<const byte> key, bool is_ascii) {
     const int byte_count = key.length();
     const int ascii_bit = is_ascii ? 1 : 0;
     uint32_t running = (byte_count << 1) | ascii_bit;
     for (int index = 0; index != byte_count; ++index) {
       // Mix in one byte; all arithmetic wraps modulo 2^32.
       running += static_cast<uint32_t>(key[index]);
       running *= 1025;
       running ^= running >> 6;
     }
     return running;
   }
   1203 
   1204 
   // Compares two encoded keys for equality.
   // Each key starts with a header holding (length << 1) | ascii_bit in
   // base 128, most significant heptet first, where bit 7 (0x80) of a header
   // byte is set while more header bytes follow. The header is followed by
   // 'length' payload bytes. Two keys match when both their headers and their
   // payloads are identical.
   bool DuplicateFinder::Match(void* first, void* second) {
     byte* lhs = reinterpret_cast<byte*>(first);
     byte* rhs = reinterpret_cast<byte*>(second);

     // Walk both headers in lock step, decoding the length/ASCII field.
     uint32_t header = 0;
     for (;;) {
       const byte current = *lhs++;
       if (current != *rhs++) return false;
       header = (header << 7) | (current & 0x7f);
       if ((current & 0x80) == 0) break;  // Last header byte.
     }

     // Drop the ASCII bit to obtain the number of payload bytes.
     const int payload_length = static_cast<int>(header >> 1);
     return memcmp(lhs, rhs, payload_length) == 0;
   }
   1225 
   1226 
   // Copies a key into the backing store as a single sequence and returns a
   // pointer to the start of that sequence.
   // The stored form is a header followed by the key bytes. The header holds
   // (length << 1) | ascii_bit as a base-128 number, most significant heptet
   // first, with bit 7 (0x80) set on every header byte except the last one.
   byte* DuplicateFinder::BackupKey(Vector<const byte> bytes,
                                    bool is_ascii) {
     const uint32_t header = (bytes.length() << 1) | (is_ascii ? 1 : 0);
     backing_store_.StartSequence();

     // Emit the continuation heptets that are actually needed, from the most
     // significant one (bits 28 and up) down to bits 7..13.
     for (int shift = 28; shift > 0; shift -= 7) {
       if (header >= (1u << shift)) {
         backing_store_.Add(static_cast<byte>((header >> shift) | 0x80u));
       }
     }
     // The least significant heptet always comes last, without the marker.
     backing_store_.Add(static_cast<byte>(header & 0x7f));

     backing_store_.AddBlock(bytes);
     return backing_store_.EndSequence().start();
   }
   1250 
   1251 } }  // namespace v8::internal
   1252