Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #include "../include/v8stdint.h"
     31 #include "scanner-base.h"
     32 #include "char-predicates-inl.h"
     33 
     34 namespace v8 {
     35 namespace internal {
     36 
     37 // ----------------------------------------------------------------------------
     38 // Scanner
     39 
     40 Scanner::Scanner(UnicodeCache* unicode_cache)
     41     : unicode_cache_(unicode_cache),
     42       octal_pos_(kNoOctalLocation) { }
     43 
     44 
     45 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
     46   ASSERT(length <= 4);  // prevent overflow
     47 
     48   uc32 digits[4];
     49   uc32 x = 0;
     50   for (int i = 0; i < length; i++) {
     51     digits[i] = c0_;
     52     int d = HexValue(c0_);
     53     if (d < 0) {
     54       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
     55       // should be illegal, but other JS VMs just return the
     56       // non-escaped version of the original character.
     57 
     58       // Push back digits read, except the last one (in c0_).
     59       for (int j = i-1; j >= 0; j--) {
     60         PushBack(digits[j]);
     61       }
     62       // Notice: No handling of error - treat it as "\u"->"u".
     63       return c;
     64     }
     65     x = x * 16 + d;
     66     Advance();
     67   }
     68 
     69   return x;
     70 }
     71 
     72 
     73 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
     74 // ECMA-262. Other JS VMs support them.
     75 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
     76   uc32 x = c - '0';
     77   int i = 0;
     78   for (; i < length; i++) {
     79     int d = c0_ - '0';
     80     if (d < 0 || d > 7) break;
     81     int nx = x * 8 + d;
     82     if (nx >= 256) break;
     83     x = nx;
     84     Advance();
     85   }
     86   // Anything excelt '\0' is an octal escape sequence, illegal in strict mode.
     87   // Remember the position of octal escape sequences so that better error
     88   // can be reported later (in strict mode).
     89   if (c != '0' || i > 0) {
     90     octal_pos_ = source_pos() - i - 1;     // Already advanced
     91   }
     92   return x;
     93 }
     94 
     95 
     96 // ----------------------------------------------------------------------------
     97 // JavaScriptScanner
     98 
     99 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
    100     : Scanner(scanner_contants) { }
    101 
    102 
// Makes the lookahead token the current token, scans a new lookahead token,
// and returns the (new) current token's value.
Token::Value JavaScriptScanner::Next() {
  current_ = next_;
  // Reset before scanning; Scan() sets the flag again if it passes a line
  // terminator on the way to the following token.
  has_line_terminator_before_next_ = false;
  Scan();
  return current_.token;
}
    109 
    110 
    111 static inline bool IsByteOrderMark(uc32 c) {
    112   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    113   // Unicode character; this implies that in a Unicode context the
    114   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    115   // character expressed in little-endian byte order (since it could
    116   // not be a U+FFFE character expressed in big-endian byte
    117   // order). Nevertheless, we check for it to be compatible with
    118   // Spidermonkey.
    119   return c == 0xFEFF || c == 0xFFFE;
    120 }
    121 
    122 
    123 bool JavaScriptScanner::SkipWhiteSpace() {
    124   int start_position = source_pos();
    125 
    126   while (true) {
    127     // We treat byte-order marks (BOMs) as whitespace for better
    128     // compatibility with Spidermonkey and other JavaScript engines.
    129     while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
    130       // IsWhiteSpace() includes line terminators!
    131       if (unicode_cache_->IsLineTerminator(c0_)) {
    132         // Ignore line terminators, but remember them. This is necessary
    133         // for automatic semicolon insertion.
    134         has_line_terminator_before_next_ = true;
    135       }
    136       Advance();
    137     }
    138 
    139     // If there is an HTML comment end '-->' at the beginning of a
    140     // line (with only whitespace in front of it), we treat the rest
    141     // of the line as a comment. This is in line with the way
    142     // SpiderMonkey handles it.
    143     if (c0_ == '-' && has_line_terminator_before_next_) {
    144       Advance();
    145       if (c0_ == '-') {
    146         Advance();
    147         if (c0_ == '>') {
    148           // Treat the rest of the line as a comment.
    149           SkipSingleLineComment();
    150           // Continue skipping white space after the comment.
    151           continue;
    152         }
    153         PushBack('-');  // undo Advance()
    154       }
    155       PushBack('-');  // undo Advance()
    156     }
    157     // Return whether or not we skipped any characters.
    158     return source_pos() != start_position;
    159   }
    160 }
    161 
    162 
    163 Token::Value JavaScriptScanner::SkipSingleLineComment() {
    164   Advance();
    165 
    166   // The line terminator at the end of the line is not considered
    167   // to be part of the single-line comment; it is recognized
    168   // separately by the lexical grammar and becomes part of the
    169   // stream of input elements for the syntactic grammar (see
    170   // ECMA-262, section 7.4, page 12).
    171   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    172     Advance();
    173   }
    174 
    175   return Token::WHITESPACE;
    176 }
    177 
    178 
    179 Token::Value JavaScriptScanner::SkipMultiLineComment() {
    180   ASSERT(c0_ == '*');
    181   Advance();
    182 
    183   while (c0_ >= 0) {
    184     char ch = c0_;
    185     Advance();
    186     // If we have reached the end of the multi-line comment, we
    187     // consume the '/' and insert a whitespace. This way all
    188     // multi-line comments are treated as whitespace - even the ones
    189     // containing line terminators. This contradicts ECMA-262, section
    190     // 7.4, page 12, that says that multi-line comments containing
    191     // line terminators should be treated as a line terminator, but it
    192     // matches the behaviour of SpiderMonkey and KJS.
    193     if (ch == '*' && c0_ == '/') {
    194       c0_ = ' ';
    195       return Token::WHITESPACE;
    196     }
    197   }
    198 
    199   // Unterminated multi-line comment.
    200   return Token::ILLEGAL;
    201 }
    202 
    203 
    204 Token::Value JavaScriptScanner::ScanHtmlComment() {
    205   // Check for <!-- comments.
    206   ASSERT(c0_ == '!');
    207   Advance();
    208   if (c0_ == '-') {
    209     Advance();
    210     if (c0_ == '-') return SkipSingleLineComment();
    211     PushBack('-');  // undo Advance()
    212   }
    213   PushBack('!');  // undo Advance()
    214   ASSERT(c0_ == '!');
    215   return Token::LT;
    216 }
    217 
    218 
// Scans the next token into next_: its value, its begin/end source
// positions, and (reset to NULL here) its literal characters. Whitespace and
// comments are skipped by looping until a non-WHITESPACE token is produced.
// Line terminators passed on the way set has_line_terminator_before_next_.
void JavaScriptScanner::Scan() {
  next_.literal_chars = NULL;
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == ===
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          token = SkipSingleLineComment();
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      default:
        // Order matters: identifiers and numbers are tried first, then
        // remaining white space, then end of input (negative c0_); any other
        // character is consumed as an ILLEGAL token.
        if (unicode_cache_->IsIdentifierStart(c0_)) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else if (c0_ < 0) {
          token = Token::EOS;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
    457 
    458 
    459 void JavaScriptScanner::SeekForward(int pos) {
    460   // After this call, we will have the token at the given position as
    461   // the "next" token. The "current" token will be invalid.
    462   if (pos == next_.location.beg_pos) return;
    463   int current_pos = source_pos();
    464   ASSERT_EQ(next_.location.end_pos, current_pos);
    465   // Positions inside the lookahead token aren't supported.
    466   ASSERT(pos >= current_pos);
    467   if (pos != current_pos) {
    468     source_->SeekForward(pos - source_->pos());
    469     Advance();
    470     // This function is only called to seek to the location
    471     // of the end of a function (at the "}" token). It doesn't matter
    472     // whether there was a line terminator in the part we skip.
    473     has_line_terminator_before_next_ = false;
    474   }
    475   Scan();
    476 }
    477 
    478 
    479 void JavaScriptScanner::ScanEscape() {
    480   uc32 c = c0_;
    481   Advance();
    482 
    483   // Skip escaped newlines.
    484   if (unicode_cache_->IsLineTerminator(c)) {
    485     // Allow CR+LF newlines in multiline string literals.
    486     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
    487     // Allow LF+CR newlines in multiline string literals.
    488     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
    489     return;
    490   }
    491 
    492   switch (c) {
    493     case '\'':  // fall through
    494     case '"' :  // fall through
    495     case '\\': break;
    496     case 'b' : c = '\b'; break;
    497     case 'f' : c = '\f'; break;
    498     case 'n' : c = '\n'; break;
    499     case 'r' : c = '\r'; break;
    500     case 't' : c = '\t'; break;
    501     case 'u' : c = ScanHexEscape(c, 4); break;
    502     case 'v' : c = '\v'; break;
    503     case 'x' : c = ScanHexEscape(c, 2); break;
    504     case '0' :  // fall through
    505     case '1' :  // fall through
    506     case '2' :  // fall through
    507     case '3' :  // fall through
    508     case '4' :  // fall through
    509     case '5' :  // fall through
    510     case '6' :  // fall through
    511     case '7' : c = ScanOctalEscape(c, 2); break;
    512   }
    513 
    514   // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
    515   // should be illegal, but they are commonly handled
    516   // as non-escaped characters by JS VMs.
    517   AddLiteralChar(c);
    518 }
    519 
    520 
    521 Token::Value JavaScriptScanner::ScanString() {
    522   uc32 quote = c0_;
    523   Advance();  // consume quote
    524 
    525   LiteralScope literal(this);
    526   while (c0_ != quote && c0_ >= 0
    527          && !unicode_cache_->IsLineTerminator(c0_)) {
    528     uc32 c = c0_;
    529     Advance();
    530     if (c == '\\') {
    531       if (c0_ < 0) return Token::ILLEGAL;
    532       ScanEscape();
    533     } else {
    534       AddLiteralChar(c);
    535     }
    536   }
    537   if (c0_ != quote) return Token::ILLEGAL;
    538   literal.Complete();
    539 
    540   Advance();  // consume quote
    541   return Token::STRING;
    542 }
    543 
    544 
    545 void JavaScriptScanner::ScanDecimalDigits() {
    546   while (IsDecimalDigit(c0_))
    547     AddLiteralCharAdvance();
    548 }
    549 
    550 
    551 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
    552   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
    553 
    554   enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
    555 
    556   LiteralScope literal(this);
    557   if (seen_period) {
    558     // we have already seen a decimal point of the float
    559     AddLiteralChar('.');
    560     ScanDecimalDigits();  // we know we have at least one digit
    561 
    562   } else {
    563     // if the first character is '0' we must check for octals and hex
    564     if (c0_ == '0') {
    565       AddLiteralCharAdvance();
    566 
    567       // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
    568       if (c0_ == 'x' || c0_ == 'X') {
    569         // hex number
    570         kind = HEX;
    571         AddLiteralCharAdvance();
    572         if (!IsHexDigit(c0_)) {
    573           // we must have at least one hex digit after 'x'/'X'
    574           return Token::ILLEGAL;
    575         }
    576         while (IsHexDigit(c0_)) {
    577           AddLiteralCharAdvance();
    578         }
    579       } else if ('0' <= c0_ && c0_ <= '7') {
    580         // (possible) octal number
    581         kind = OCTAL;
    582         while (true) {
    583           if (c0_ == '8' || c0_ == '9') {
    584             kind = DECIMAL;
    585             break;
    586           }
    587           if (c0_  < '0' || '7'  < c0_) {
    588             // Octal literal finished.
    589             octal_pos_ = next_.location.beg_pos;
    590             break;
    591           }
    592           AddLiteralCharAdvance();
    593         }
    594       }
    595     }
    596 
    597     // Parse decimal digits and allow trailing fractional part.
    598     if (kind == DECIMAL) {
    599       ScanDecimalDigits();  // optional
    600       if (c0_ == '.') {
    601         AddLiteralCharAdvance();
    602         ScanDecimalDigits();  // optional
    603       }
    604     }
    605   }
    606 
    607   // scan exponent, if any
    608   if (c0_ == 'e' || c0_ == 'E') {
    609     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    610     if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
    611     // scan exponent
    612     AddLiteralCharAdvance();
    613     if (c0_ == '+' || c0_ == '-')
    614       AddLiteralCharAdvance();
    615     if (!IsDecimalDigit(c0_)) {
    616       // we must have at least one decimal digit after 'e'/'E'
    617       return Token::ILLEGAL;
    618     }
    619     ScanDecimalDigits();
    620   }
    621 
    622   // The source character immediately following a numeric literal must
    623   // not be an identifier start or a decimal digit; see ECMA-262
    624   // section 7.8.3, page 17 (note that we read only one decimal digit
    625   // if the value is 0).
    626   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    627     return Token::ILLEGAL;
    628 
    629   literal.Complete();
    630 
    631   return Token::NUMBER;
    632 }
    633 
    634 
    635 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
    636   Advance();
    637   if (c0_ != 'u') return unibrow::Utf8::kBadChar;
    638   Advance();
    639   uc32 c = ScanHexEscape('u', 4);
    640   // We do not allow a unicode escape sequence to start another
    641   // unicode escape sequence.
    642   if (c == '\\') return unibrow::Utf8::kBadChar;
    643   return c;
    644 }
    645 
    646 
    647 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
    648   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
    649   LiteralScope literal(this);
    650   KeywordMatcher keyword_match;
    651   // Scan identifier start character.
    652   if (c0_ == '\\') {
    653     uc32 c = ScanIdentifierUnicodeEscape();
    654     // Only allow legal identifier start characters.
    655     if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
    656     AddLiteralChar(c);
    657     return ScanIdentifierSuffix(&literal);
    658   }
    659 
    660   uc32 first_char = c0_;
    661   Advance();
    662   AddLiteralChar(first_char);
    663   if (!keyword_match.AddChar(first_char)) {
    664     return ScanIdentifierSuffix(&literal);
    665   }
    666 
    667   // Scan the rest of the identifier characters.
    668   while (unicode_cache_->IsIdentifierPart(c0_)) {
    669     if (c0_ != '\\') {
    670       uc32 next_char = c0_;
    671       Advance();
    672       AddLiteralChar(next_char);
    673       if (keyword_match.AddChar(next_char)) continue;
    674     }
    675     // Fallthrough if no loner able to complete keyword.
    676     return ScanIdentifierSuffix(&literal);
    677   }
    678   literal.Complete();
    679 
    680   return keyword_match.token();
    681 }
    682 
    683 
    684 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
    685   // Scan the rest of the identifier characters.
    686   while (unicode_cache_->IsIdentifierPart(c0_)) {
    687     if (c0_ == '\\') {
    688       uc32 c = ScanIdentifierUnicodeEscape();
    689       // Only allow legal identifier part characters.
    690       if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
    691       AddLiteralChar(c);
    692     } else {
    693       AddLiteralChar(c0_);
    694       Advance();
    695     }
    696   }
    697   literal->Complete();
    698 
    699   return Token::IDENTIFIER;
    700 }
    701 
    702 
    703 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
    704   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
    705   bool in_character_class = false;
    706 
    707   // Previous token is either '/' or '/=', in the second case, the
    708   // pattern starts at =.
    709   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
    710   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
    711 
    712   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
    713   // the scanner should pass uninterpreted bodies to the RegExp
    714   // constructor.
    715   LiteralScope literal(this);
    716   if (seen_equal)
    717     AddLiteralChar('=');
    718 
    719   while (c0_ != '/' || in_character_class) {
    720     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
    721     if (c0_ == '\\') {  // Escape sequence.
    722       AddLiteralCharAdvance();
    723       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
    724       AddLiteralCharAdvance();
    725       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
    726       // only "safe" characters are allowed (letters, digits, underscore),
    727       // otherwise the escape isn't valid and the invalid character has
    728       // its normal meaning. I.e., we can just continue scanning without
    729       // worrying whether the following characters are part of the escape
    730       // or not, since any '/', '\\' or '[' is guaranteed to not be part
    731       // of the escape sequence.
    732     } else {  // Unescaped character.
    733       if (c0_ == '[') in_character_class = true;
    734       if (c0_ == ']') in_character_class = false;
    735       AddLiteralCharAdvance();
    736     }
    737   }
    738   Advance();  // consume '/'
    739 
    740   literal.Complete();
    741 
    742   return true;
    743 }
    744 
    745 
    746 bool JavaScriptScanner::ScanRegExpFlags() {
    747   // Scan regular expression flags.
    748   LiteralScope literal(this);
    749   while (unicode_cache_->IsIdentifierPart(c0_)) {
    750     if (c0_ == '\\') {
    751       uc32 c = ScanIdentifierUnicodeEscape();
    752       if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
    753         // We allow any escaped character, unlike the restriction on
    754         // IdentifierPart when it is used to build an IdentifierName.
    755         AddLiteralChar(c);
    756         continue;
    757       }
    758     }
    759     AddLiteralCharAdvance();
    760   }
    761   literal.Complete();
    762 
    763   next_.location.end_pos = source_pos() - 1;
    764   return true;
    765 }
    766 
    767 // ----------------------------------------------------------------------------
    768 // Keyword Matcher
    769 
// Transition table for the first character of a potential keyword.
// KeywordMatcher::Step() indexes it with (input - kFirstCharRangeMin); per
// the comment there, the covered range is the lower-case letters 'b'..'y',
// one row per letter. Each row is { keyword, state, token }:
//  - KEYWORD_PREFIX rows name the only candidate keyword for that letter,
//  - UNMATCHABLE rows mark letters that start no keyword,
//  - the remaining rows switch to a per-letter state handled in Step().
KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
  { "break",  KEYWORD_PREFIX, Token::BREAK },  // 'b'
  { NULL,     C,              Token::ILLEGAL },  // 'c'
  { NULL,     D,              Token::ILLEGAL },  // 'd'
  { NULL,     E,              Token::ILLEGAL },  // 'e'
  { NULL,     F,              Token::ILLEGAL },  // 'f'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'g'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'h'
  { NULL,     I,              Token::ILLEGAL },  // 'i'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'j'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'k'
  { "let",    KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD },  // 'l'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'm'
  { NULL,     N,              Token::ILLEGAL },  // 'n'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'o'
  { NULL,     P,              Token::ILLEGAL },  // 'p'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'q'
  { "return", KEYWORD_PREFIX, Token::RETURN },  // 'r'
  { NULL,     S,              Token::ILLEGAL },  // 's'
  { NULL,     T,              Token::ILLEGAL },  // 't'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'u'
  { NULL,     V,              Token::ILLEGAL },  // 'v'
  { NULL,     W,              Token::ILLEGAL },  // 'w'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'x'
  { "yield",  KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }  // 'y'
};
    796 
    797 
    798 void KeywordMatcher::Step(unibrow::uchar input) {
    799   switch (state_) {
    800     case INITIAL: {
    801       // matching the first character is the only state with significant fanout.
    802       // Match only lower-case letters in range 'b'..'y'.
    803       unsigned int offset = input - kFirstCharRangeMin;
    804       if (offset < kFirstCharRangeLength) {
    805         state_ = first_states_[offset].state;
    806         if (state_ == KEYWORD_PREFIX) {
    807           keyword_ = first_states_[offset].keyword;
    808           counter_ = 1;
    809           keyword_token_ = first_states_[offset].token;
    810         }
    811         return;
    812       }
    813       break;
    814     }
    815     case KEYWORD_PREFIX:
    816       if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
    817         counter_++;
    818         if (keyword_[counter_] == '\0') {
    819           state_ = KEYWORD_MATCHED;
    820           token_ = keyword_token_;
    821         }
    822         return;
    823       }
    824       break;
    825     case KEYWORD_MATCHED:
    826       token_ = Token::IDENTIFIER;
    827       break;
    828     case C:
    829       if (MatchState(input, 'a', CA)) return;
    830       if (MatchKeywordStart(input, "class", 1,
    831           Token::FUTURE_RESERVED_WORD)) return;
    832       if (MatchState(input, 'o', CO)) return;
    833       break;
    834     case CA:
    835       if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
    836       if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
    837       break;
    838     case CO:
    839       if (MatchState(input, 'n', CON)) return;
    840       break;
    841     case CON:
    842       if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
    843       if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
    844       break;
    845     case D:
    846       if (MatchState(input, 'e', DE)) return;
    847       if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
    848       break;
    849     case DE:
    850       if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
    851       if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
    852       if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
    853       break;
    854     case E:
    855       if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
    856       if (MatchKeywordStart(input, "enum", 1,
    857           Token::FUTURE_RESERVED_WORD)) return;
    858       if (MatchState(input, 'x', EX)) return;
    859       break;
    860     case EX:
    861       if (MatchKeywordStart(input, "export", 2,
    862           Token::FUTURE_RESERVED_WORD)) return;
    863       if (MatchKeywordStart(input, "extends", 2,
    864           Token::FUTURE_RESERVED_WORD)) return;
    865       break;
    866     case F:
    867       if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
    868       if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
    869       if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
    870       if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
    871       break;
    872     case I:
    873       if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
    874       if (MatchState(input, 'm', IM)) return;
    875       if (MatchKeyword(input, 'n', IN, Token::IN)) return;
    876       break;
    877     case IM:
    878       if (MatchState(input, 'p', IMP)) return;
    879       break;
    880     case IMP:
    881       if (MatchKeywordStart(input, "implements", 3,
    882          Token::FUTURE_RESERVED_WORD )) return;
    883       if (MatchKeywordStart(input, "import", 3,
    884          Token::FUTURE_RESERVED_WORD)) return;
    885       break;
    886     case IN:
    887       token_ = Token::IDENTIFIER;
    888       if (MatchKeywordStart(input, "interface", 2,
    889          Token::FUTURE_RESERVED_WORD)) return;
    890       if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
    891       break;
    892     case N:
    893       if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
    894       if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
    895       if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
    896       break;
    897     case P:
    898       if (MatchKeywordStart(input, "package", 1,
    899           Token::FUTURE_RESERVED_WORD)) return;
    900       if (MatchState(input, 'r', PR)) return;
    901       if (MatchKeywordStart(input, "public", 1,
    902           Token::FUTURE_RESERVED_WORD)) return;
    903       break;
    904     case PR:
    905       if (MatchKeywordStart(input, "private", 2,
    906           Token::FUTURE_RESERVED_WORD)) return;
    907       if (MatchKeywordStart(input, "protected", 2,
    908           Token::FUTURE_RESERVED_WORD)) return;
    909       break;
    910     case S:
    911       if (MatchKeywordStart(input, "static", 1,
    912           Token::FUTURE_RESERVED_WORD)) return;
    913       if (MatchKeywordStart(input, "super", 1,
    914           Token::FUTURE_RESERVED_WORD)) return;
    915       if (MatchKeywordStart(input, "switch", 1,
    916           Token::SWITCH)) return;
    917       break;
    918     case T:
    919       if (MatchState(input, 'h', TH)) return;
    920       if (MatchState(input, 'r', TR)) return;
    921       if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
    922       break;
    923     case TH:
    924       if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
    925       if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
    926       break;
    927     case TR:
    928       if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
    929       if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
    930       break;
    931     case V:
    932       if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
    933       if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
    934       break;
    935     case W:
    936       if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
    937       if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
    938       break;
    939     case UNMATCHABLE:
    940       break;
    941   }
    942   // Every matching case has already returned; no transition accepted the input.
    943   state_ = UNMATCHABLE;
    944 }
    945 
    946 } }  // namespace v8::internal
    947