Home | History | Annotate | Download | only in src
      1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #include "v8.h"
     29 
     30 #include "ast.h"
     31 #include "scanner.h"
     32 
     33 namespace v8 {
     34 namespace internal {
     35 
     36 // ----------------------------------------------------------------------------
     37 // Character predicates
     38 
     39 
     40 unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;  // consulted by ScanJavaScript() to recognize the start of an identifier
     41 unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;  // consulted by ScanJsonIdentifier() to reject trailing identifier characters
     42 unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;  // consulted when skipping whitespace and single-line comments
     43 unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;  // consulted by SkipJavaScriptWhiteSpace()
     44 
     45 
     46 StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;  // out-of-class definition of the static Scanner::utf8_decoder_ member
     47 
     48 
     49 // ----------------------------------------------------------------------------
     50 // UTF8Buffer
     51 
     52 UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
     53 
     54 
     55 UTF8Buffer::~UTF8Buffer() {
     56   if (data_ != NULL) DeleteArray(data_);
     57 }
     58 
     59 
     60 void UTF8Buffer::AddCharSlow(uc32 c) {
     61   static const int kCapacityGrowthLimit = 1 * MB;
     62   if (cursor_ > limit_) {
     63     int old_capacity = Capacity();
     64     int old_position = pos();
     65     int new_capacity =
     66         Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
     67     char* new_data = NewArray<char>(new_capacity);
     68     memcpy(new_data, data_, old_position);
     69     DeleteArray(data_);
     70     data_ = new_data;
     71     cursor_ = new_data + old_position;
     72     limit_ = ComputeLimit(new_data, new_capacity);
     73     ASSERT(Capacity() == new_capacity && pos() == old_position);
     74   }
     75   if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
     76     *cursor_++ = c;  // Common case: 7-bit ASCII.
     77   } else {
     78     cursor_ += unibrow::Utf8::Encode(cursor_, c);
     79   }
     80   ASSERT(pos() <= Capacity());
     81 }
     82 
     83 
     84 // ----------------------------------------------------------------------------
     85 // UTF16Buffer
     86 
     87 
     88 UTF16Buffer::UTF16Buffer()
     89     : pos_(0), size_(0) { }
     90 
     91 
     92 Handle<String> UTF16Buffer::SubString(int start, int end) {
     93   return internal::SubString(data_, start, end);
     94 }
     95 
     96 
     97 // CharacterStreamUTF16Buffer
     98 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
     99     : pushback_buffer_(0), last_(0), stream_(NULL) { }
    100 
    101 
    102 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
    103                                             unibrow::CharacterStream* input) {
    104   data_ = data;
    105   pos_ = 0;
    106   stream_ = input;
    107 }
    108 
    109 
    110 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
    111   pushback_buffer()->Add(last_);
    112   last_ = ch;
    113   pos_--;
    114 }
    115 
    116 
    117 uc32 CharacterStreamUTF16Buffer::Advance() {
    118   // NOTE: It is of importance to Persian / Farsi resources that we do
    119   // *not* strip format control characters in the scanner; see
    120   //
    121   //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
    122   //
    123   // So, even though ECMA-262, section 7.1, page 11, dictates that we
    124   // must remove Unicode format-control characters, we do not. This is
    125   // in line with how IE and SpiderMonkey handles it.
    126   if (!pushback_buffer()->is_empty()) {
    127     pos_++;
    128     return last_ = pushback_buffer()->RemoveLast();
    129   } else if (stream_->has_more()) {
    130     pos_++;
    131     uc32 next = stream_->GetNext();
    132     return last_ = next;
    133   } else {
    134     // Note: currently the following increment is necessary to avoid a
    135     // test-parser problem!
    136     pos_++;
    137     return last_ = static_cast<uc32>(-1);
    138   }
    139 }
    140 
    141 
    142 void CharacterStreamUTF16Buffer::SeekForward(int pos) {
    143   pos_ = pos;
    144   ASSERT(pushback_buffer()->is_empty());
    145   stream_->Seek(pos);
    146 }
    147 
    148 
    149 // TwoByteStringUTF16Buffer
    150 TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer()
    151     : raw_data_(NULL) { }
    152 
    153 
    154 void TwoByteStringUTF16Buffer::Initialize(
    155      Handle<ExternalTwoByteString> data) {
    156   ASSERT(!data.is_null());
    157 
    158   data_ = data;
    159   pos_ = 0;
    160 
    161   raw_data_ = data->resource()->data();
    162   size_ = data->length();
    163 }
    164 
    165 
    166 uc32 TwoByteStringUTF16Buffer::Advance() {
    167   if (pos_ < size_) {
    168     return raw_data_[pos_++];
    169   } else {
    170     // note: currently the following increment is necessary to avoid a
    171     // test-parser problem!
    172     pos_++;
    173     return static_cast<uc32>(-1);
    174   }
    175 }
    176 
    177 
    178 void TwoByteStringUTF16Buffer::PushBack(uc32 ch) {
    179   pos_--;
    180   ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
    181   ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
    182 }
    183 
    184 
    185 void TwoByteStringUTF16Buffer::SeekForward(int pos) {
    186   pos_ = pos;
    187 }
    188 
    189 
    190 // ----------------------------------------------------------------------------
    191 // Keyword Matcher
    192 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {  // indexed by (first char - kFirstCharRangeMin) in Step(); one row per letter 'b'..'w'
    193   { "break",  KEYWORD_PREFIX, Token::BREAK },  // 'b'
    194   { NULL,     C,              Token::ILLEGAL },  // 'c'
    195   { NULL,     D,              Token::ILLEGAL },  // 'd'
    196   { "else",   KEYWORD_PREFIX, Token::ELSE },  // 'e'
    197   { NULL,     F,              Token::ILLEGAL },  // 'f'
    198   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'g'
    199   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'h'
    200   { NULL,     I,              Token::ILLEGAL },  // 'i'
    201   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'j'
    202   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'k'
    203   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'l'
    204   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'm'
    205   { NULL,     N,              Token::ILLEGAL },  // 'n'
    206   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'o'
    207   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'p'
    208   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'q'
    209   { "return", KEYWORD_PREFIX, Token::RETURN },  // 'r'
    210   { "switch", KEYWORD_PREFIX, Token::SWITCH },  // 's'
    211   { NULL,     T,              Token::ILLEGAL },  // 't'
    212   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'u'
    213   { NULL,     V,              Token::ILLEGAL },  // 'v'
    214   { NULL,     W,              Token::ILLEGAL }  // 'w'
    215 };
    216 
    217 
    218 void KeywordMatcher::Step(uc32 input) {
    219   switch (state_) {
    220     case INITIAL: {
    221       // matching the first character is the only state with significant fanout.
    222       // Match only lower-case letters in range 'b'..'w'.
    223       unsigned int offset = input - kFirstCharRangeMin;
    224       if (offset < kFirstCharRangeLength) {
    225         state_ = first_states_[offset].state;
    226         if (state_ == KEYWORD_PREFIX) {
    227           keyword_ = first_states_[offset].keyword;
    228           counter_ = 1;
    229           keyword_token_ = first_states_[offset].token;
    230         }
    231         return;
    232       }
    233       break;
    234     }
    235     case KEYWORD_PREFIX:
    236       if (keyword_[counter_] == input) {
    237         ASSERT_NE(input, '\0');
    238         counter_++;
    239         if (keyword_[counter_] == '\0') {
    240           state_ = KEYWORD_MATCHED;
    241           token_ = keyword_token_;
    242         }
    243         return;
    244       }
    245       break;
    246     case KEYWORD_MATCHED:
    247       token_ = Token::IDENTIFIER;
    248       break;
    249     case C:
    250       if (MatchState(input, 'a', CA)) return;
    251       if (MatchState(input, 'o', CO)) return;
    252       break;
    253     case CA:
    254       if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
    255       if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
    256       break;
    257     case CO:
    258       if (MatchState(input, 'n', CON)) return;
    259       break;
    260     case CON:
    261       if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
    262       if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
    263       break;
    264     case D:
    265       if (MatchState(input, 'e', DE)) return;
    266       if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
    267       break;
    268     case DE:
    269       if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
    270       if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
    271       if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
    272       break;
    273     case F:
    274       if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
    275       if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
    276       if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
    277       if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
    278       break;
    279     case I:
    280       if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
    281       if (MatchKeyword(input, 'n', IN, Token::IN)) return;
    282       break;
    283     case IN:
    284       token_ = Token::IDENTIFIER;
    285       if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
    286         return;
    287       }
    288       break;
    289     case N:
    290       if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
    291       if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
    292       if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
    293       break;
    294     case T:
    295       if (MatchState(input, 'h', TH)) return;
    296       if (MatchState(input, 'r', TR)) return;
    297       if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
    298       break;
    299     case TH:
    300       if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
    301       if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
    302       break;
    303     case TR:
    304       if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
    305       if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
    306       break;
    307     case V:
    308       if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
    309       if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
    310       break;
    311     case W:
    312       if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
    313       if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
    314       break;
    315     default:
    316       UNREACHABLE();
    317   }
    318   // On fallthrough, it's a failure.
    319   state_ = UNMATCHABLE;
    320 }
    321 
    322 
    323 // ----------------------------------------------------------------------------
    324 // Scanner
    325 
    326 Scanner::Scanner(ParserMode pre)
    327     : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { }
    328 
    329 
    330 void Scanner::Init(Handle<String> source,
    331                    unibrow::CharacterStream* stream,
    332                    int position,
    333                    ParserLanguage language) {
    334   // Initialize the source buffer.
    335   if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
    336     two_byte_string_buffer_.Initialize(
    337         Handle<ExternalTwoByteString>::cast(source));
    338     source_ = &two_byte_string_buffer_;
    339   } else {
    340     char_stream_buffer_.Initialize(source, stream);
    341     source_ = &char_stream_buffer_;
    342   }
    343 
    344   position_ = position;
    345   is_parsing_json_ = (language == JSON);
    346 
    347   // Set c0_ (one character ahead)
    348   ASSERT(kCharacterLookaheadBufferSize == 1);
    349   Advance();
    350   // Initializer current_ to not refer to a literal buffer.
    351   current_.literal_buffer = NULL;
    352 
    353   // Skip initial whitespace allowing HTML comment ends just like
    354   // after a newline and scan first token.
    355   has_line_terminator_before_next_ = true;
    356   SkipWhiteSpace();
    357   Scan();
    358 }
    359 
    360 
    361 Handle<String> Scanner::SubString(int start, int end) {
    362   return source_->SubString(start - position_, end - position_);
    363 }
    364 
    365 
    366 Token::Value Scanner::Next() {
    367   // BUG 1215673: Find a thread safe way to set a stack limit in
    368   // pre-parse mode. Otherwise, we cannot safely pre-parse from other
    369   // threads.
    370   current_ = next_;
    371   // Check for stack-overflow before returning any tokens.
    372   StackLimitCheck check;
    373   if (check.HasOverflowed()) {
    374     stack_overflow_ = true;
    375     next_.token = Token::ILLEGAL;
    376   } else {
    377     Scan();
    378   }
    379   return current_.token;
    380 }
    381 
    382 
    383 void Scanner::StartLiteral() {
    384   // Use the first buffer unless it's currently in use by the current_ token.
    385   // In most cases we won't have two literals/identifiers in a row, so
    386   // the second buffer won't be used very often and is unlikely to grow much.
    387   UTF8Buffer* free_buffer =
    388       (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
    389                                                       : &literal_buffer_2_;
    390   next_.literal_buffer = free_buffer;
    391   free_buffer->Reset();
    392 }
    393 
    394 
    395 void Scanner::AddChar(uc32 c) {
    396   next_.literal_buffer->AddChar(c);
    397 }
    398 
    399 
    400 void Scanner::TerminateLiteral() {
    401   AddChar(0);
    402 }
    403 
    404 
    405 void Scanner::AddCharAdvance() {
    406   AddChar(c0_);
    407   Advance();
    408 }
    409 
    410 
    411 static inline bool IsByteOrderMark(uc32 c) {
    412   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    413   // Unicode character; this implies that in a Unicode context the
    414   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    415   // character expressed in little-endian byte order (since it could
    416   // not be a U+FFFE character expressed in big-endian byte
    417   // order). Nevertheless, we check for it to be compatible with
    418   // Spidermonkey.
    419   return c == 0xFEFF || c == 0xFFFE;
    420 }
    421 
    422 
    423 bool Scanner::SkipJsonWhiteSpace() {
    424   int start_position = source_pos();
    425   // JSON WhiteSpace is tab, carrige-return, newline and space.
    426   while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
    427     Advance();
    428   }
    429   return source_pos() != start_position;
    430 }
    431 
    432 
    433 bool Scanner::SkipJavaScriptWhiteSpace() {
    434   int start_position = source_pos();
    435 
    436   while (true) {
    437     // We treat byte-order marks (BOMs) as whitespace for better
    438     // compatibility with Spidermonkey and other JavaScript engines.
    439     while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
    440       // IsWhiteSpace() includes line terminators!
    441       if (kIsLineTerminator.get(c0_)) {
    442         // Ignore line terminators, but remember them. This is necessary
    443         // for automatic semicolon insertion.
    444         has_line_terminator_before_next_ = true;
    445       }
    446       Advance();
    447     }
    448 
    449     // If there is an HTML comment end '-->' at the beginning of a
    450     // line (with only whitespace in front of it), we treat the rest
    451     // of the line as a comment. This is in line with the way
    452     // SpiderMonkey handles it.
    453     if (c0_ == '-' && has_line_terminator_before_next_) {
    454       Advance();
    455       if (c0_ == '-') {
    456         Advance();
    457         if (c0_ == '>') {
    458           // Treat the rest of the line as a comment.
    459           SkipSingleLineComment();
    460           // Continue skipping white space after the comment.
    461           continue;
    462         }
    463         PushBack('-');  // undo Advance()
    464       }
    465       PushBack('-');  // undo Advance()
    466     }
    467     // Return whether or not we skipped any characters.
    468     return source_pos() != start_position;
    469   }
    470 }
    471 
    472 
    473 Token::Value Scanner::SkipSingleLineComment() {
    474   Advance();
    475 
    476   // The line terminator at the end of the line is not considered
    477   // to be part of the single-line comment; it is recognized
    478   // separately by the lexical grammar and becomes part of the
    479   // stream of input elements for the syntactic grammar (see
    480   // ECMA-262, section 7.4, page 12).
    481   while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
    482     Advance();
    483   }
    484 
    485   return Token::WHITESPACE;
    486 }
    487 
    488 
    489 Token::Value Scanner::SkipMultiLineComment() {
    490   ASSERT(c0_ == '*');
    491   Advance();
    492 
    493   while (c0_ >= 0) {
    494     char ch = c0_;
    495     Advance();
    496     // If we have reached the end of the multi-line comment, we
    497     // consume the '/' and insert a whitespace. This way all
    498     // multi-line comments are treated as whitespace - even the ones
    499     // containing line terminators. This contradicts ECMA-262, section
    500     // 7.4, page 12, that says that multi-line comments containing
    501     // line terminators should be treated as a line terminator, but it
    502     // matches the behaviour of SpiderMonkey and KJS.
    503     if (ch == '*' && c0_ == '/') {
    504       c0_ = ' ';
    505       return Token::WHITESPACE;
    506     }
    507   }
    508 
    509   // Unterminated multi-line comment.
    510   return Token::ILLEGAL;
    511 }
    512 
    513 
    514 Token::Value Scanner::ScanHtmlComment() {
    515   // Check for <!-- comments.
    516   ASSERT(c0_ == '!');
    517   Advance();
    518   if (c0_ == '-') {
    519     Advance();
    520     if (c0_ == '-') return SkipSingleLineComment();
    521     PushBack('-');  // undo Advance()
    522   }
    523   PushBack('!');  // undo Advance()
    524   ASSERT(c0_ == '!');
    525   return Token::LT;
    526 }
    527 
    528 
    529 
    530 void Scanner::ScanJson() {
    531   next_.literal_buffer = NULL;
    532   Token::Value token;
    533   has_line_terminator_before_next_ = false;
    534   do {
    535     // Remember the position of the next token
    536     next_.location.beg_pos = source_pos();
    537     switch (c0_) {
    538       case '\t':
    539       case '\r':
    540       case '\n':
    541       case ' ':
    542         Advance();
    543         token = Token::WHITESPACE;
    544         break;
    545       case '{':
    546         Advance();
    547         token = Token::LBRACE;
    548         break;
    549       case '}':
    550         Advance();
    551         token = Token::RBRACE;
    552         break;
    553       case '[':
    554         Advance();
    555         token = Token::LBRACK;
    556         break;
    557       case ']':
    558         Advance();
    559         token = Token::RBRACK;
    560         break;
    561       case ':':
    562         Advance();
    563         token = Token::COLON;
    564         break;
    565       case ',':
    566         Advance();
    567         token = Token::COMMA;
    568         break;
    569       case '"':
    570         token = ScanJsonString();
    571         break;
    572       case '-':
    573       case '0':
    574       case '1':
    575       case '2':
    576       case '3':
    577       case '4':
    578       case '5':
    579       case '6':
    580       case '7':
    581       case '8':
    582       case '9':
    583         token = ScanJsonNumber();
    584         break;
    585       case 't':
    586         token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
    587         break;
    588       case 'f':
    589         token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
    590         break;
    591       case 'n':
    592         token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
    593         break;
    594       default:
    595         if (c0_ < 0) {
    596           Advance();
    597           token = Token::EOS;
    598         } else {
    599           Advance();
    600           token = Select(Token::ILLEGAL);
    601         }
    602     }
    603   } while (token == Token::WHITESPACE);
    604 
    605   next_.location.end_pos = source_pos();
    606   next_.token = token;
    607 }
    608 
    609 
    610 Token::Value Scanner::ScanJsonString() {
    611   ASSERT_EQ('"', c0_);
    612   Advance();
    613   StartLiteral();
    614   while (c0_ != '"' && c0_ > 0) {
    615     // Check for control character (0x00-0x1f) or unterminated string (<0).
    616     if (c0_ < 0x20) return Token::ILLEGAL;
    617     if (c0_ != '\\') {
    618       AddCharAdvance();
    619     } else {
    620       Advance();
    621       switch (c0_) {
    622         case '"':
    623         case '\\':
    624         case '/':
    625           AddChar(c0_);
    626           break;
    627         case 'b':
    628           AddChar('\x08');
    629           break;
    630         case 'f':
    631           AddChar('\x0c');
    632           break;
    633         case 'n':
    634           AddChar('\x0a');
    635           break;
    636         case 'r':
    637           AddChar('\x0d');
    638           break;
    639         case 't':
    640           AddChar('\x09');
    641           break;
    642         case 'u': {
    643           uc32 value = 0;
    644           for (int i = 0; i < 4; i++) {
    645             Advance();
    646             int digit = HexValue(c0_);
    647             if (digit < 0) return Token::ILLEGAL;
    648             value = value * 16 + digit;
    649           }
    650           AddChar(value);
    651           break;
    652         }
    653         default:
    654           return Token::ILLEGAL;
    655       }
    656       Advance();
    657     }
    658   }
    659   if (c0_ != '"') {
    660     return Token::ILLEGAL;
    661   }
    662   TerminateLiteral();
    663   Advance();
    664   return Token::STRING;
    665 }
    666 
    667 
    668 Token::Value Scanner::ScanJsonNumber() {
    669   StartLiteral();
    670   if (c0_ == '-') AddCharAdvance();
    671   if (c0_ == '0') {
    672     AddCharAdvance();
    673     // Prefix zero is only allowed if it's the only digit before
    674     // a decimal point or exponent.
    675     if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
    676   } else {
    677     if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
    678     do {
    679       AddCharAdvance();
    680     } while (c0_ >= '0' && c0_ <= '9');
    681   }
    682   if (c0_ == '.') {
    683     AddCharAdvance();
    684     if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
    685     do {
    686       AddCharAdvance();
    687     } while (c0_ >= '0' && c0_ <= '9');
    688   }
    689   if ((c0_ | 0x20) == 'e') {
    690     AddCharAdvance();
    691     if (c0_ == '-' || c0_ == '+') AddCharAdvance();
    692     if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
    693     do {
    694       AddCharAdvance();
    695     } while (c0_ >= '0' && c0_ <= '9');
    696   }
    697   TerminateLiteral();
    698   return Token::NUMBER;
    699 }
    700 
    701 
    702 Token::Value Scanner::ScanJsonIdentifier(const char* text,
    703                                          Token::Value token) {
    704   StartLiteral();
    705   while (*text != '\0') {
    706     if (c0_ != *text) return Token::ILLEGAL;
    707     Advance();
    708     text++;
    709   }
    710   if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
    711   TerminateLiteral();
    712   return token;
    713 }
    714 
    715 
    716 void Scanner::ScanJavaScript() {
    717   next_.literal_buffer = NULL;
    718   Token::Value token;
    719   has_line_terminator_before_next_ = false;
    720   do {
    721     // Remember the position of the next token
    722     next_.location.beg_pos = source_pos();
    723 
    724     switch (c0_) {
    725       case ' ':
    726       case '\t':
    727         Advance();
    728         token = Token::WHITESPACE;
    729         break;
    730 
    731       case '\n':
    732         Advance();
    733         has_line_terminator_before_next_ = true;
    734         token = Token::WHITESPACE;
    735         break;
    736 
    737       case '"': case '\'':
    738         token = ScanString();
    739         break;
    740 
    741       case '<':
    742         // < <= << <<= <!--
    743         Advance();
    744         if (c0_ == '=') {
    745           token = Select(Token::LTE);
    746         } else if (c0_ == '<') {
    747           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
    748         } else if (c0_ == '!') {
    749           token = ScanHtmlComment();
    750         } else {
    751           token = Token::LT;
    752         }
    753         break;
    754 
    755       case '>':
    756         // > >= >> >>= >>> >>>=
    757         Advance();
    758         if (c0_ == '=') {
    759           token = Select(Token::GTE);
    760         } else if (c0_ == '>') {
    761           // >> >>= >>> >>>=
    762           Advance();
    763           if (c0_ == '=') {
    764             token = Select(Token::ASSIGN_SAR);
    765           } else if (c0_ == '>') {
    766             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
    767           } else {
    768             token = Token::SAR;
    769           }
    770         } else {
    771           token = Token::GT;
    772         }
    773         break;
    774 
    775       case '=':
    776         // = == ===
    777         Advance();
    778         if (c0_ == '=') {
    779           token = Select('=', Token::EQ_STRICT, Token::EQ);
    780         } else {
    781           token = Token::ASSIGN;
    782         }
    783         break;
    784 
    785       case '!':
    786         // ! != !==
    787         Advance();
    788         if (c0_ == '=') {
    789           token = Select('=', Token::NE_STRICT, Token::NE);
    790         } else {
    791           token = Token::NOT;
    792         }
    793         break;
    794 
    795       case '+':
    796         // + ++ +=
    797         Advance();
    798         if (c0_ == '+') {
    799           token = Select(Token::INC);
    800         } else if (c0_ == '=') {
    801           token = Select(Token::ASSIGN_ADD);
    802         } else {
    803           token = Token::ADD;
    804         }
    805         break;
    806 
    807       case '-':
    808         // - -- --> -=
    809         Advance();
    810         if (c0_ == '-') {
    811           Advance();
    812           if (c0_ == '>' && has_line_terminator_before_next_) {
    813             // For compatibility with SpiderMonkey, we skip lines that
    814             // start with an HTML comment end '-->'.
    815             token = SkipSingleLineComment();
    816           } else {
    817             token = Token::DEC;
    818           }
    819         } else if (c0_ == '=') {
    820           token = Select(Token::ASSIGN_SUB);
    821         } else {
    822           token = Token::SUB;
    823         }
    824         break;
    825 
    826       case '*':
    827         // * *=
    828         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
    829         break;
    830 
    831       case '%':
    832         // % %=
    833         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
    834         break;
    835 
    836       case '/':
    837         // /  // /* /=
    838         Advance();
    839         if (c0_ == '/') {
    840           token = SkipSingleLineComment();
    841         } else if (c0_ == '*') {
    842           token = SkipMultiLineComment();
    843         } else if (c0_ == '=') {
    844           token = Select(Token::ASSIGN_DIV);
    845         } else {
    846           token = Token::DIV;
    847         }
    848         break;
    849 
    850       case '&':
    851         // & && &=
    852         Advance();
    853         if (c0_ == '&') {
    854           token = Select(Token::AND);
    855         } else if (c0_ == '=') {
    856           token = Select(Token::ASSIGN_BIT_AND);
    857         } else {
    858           token = Token::BIT_AND;
    859         }
    860         break;
    861 
    862       case '|':
    863         // | || |=
    864         Advance();
    865         if (c0_ == '|') {
    866           token = Select(Token::OR);
    867         } else if (c0_ == '=') {
    868           token = Select(Token::ASSIGN_BIT_OR);
    869         } else {
    870           token = Token::BIT_OR;
    871         }
    872         break;
    873 
    874       case '^':
    875         // ^ ^=
    876         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
    877         break;
    878 
    879       case '.':
    880         // . Number
    881         Advance();
    882         if (IsDecimalDigit(c0_)) {
    883           token = ScanNumber(true);
    884         } else {
    885           token = Token::PERIOD;
    886         }
    887         break;
    888 
    889       case ':':
    890         token = Select(Token::COLON);
    891         break;
    892 
    893       case ';':
    894         token = Select(Token::SEMICOLON);
    895         break;
    896 
    897       case ',':
    898         token = Select(Token::COMMA);
    899         break;
    900 
    901       case '(':
    902         token = Select(Token::LPAREN);
    903         break;
    904 
    905       case ')':
    906         token = Select(Token::RPAREN);
    907         break;
    908 
    909       case '[':
    910         token = Select(Token::LBRACK);
    911         break;
    912 
    913       case ']':
    914         token = Select(Token::RBRACK);
    915         break;
    916 
    917       case '{':
    918         token = Select(Token::LBRACE);
    919         break;
    920 
    921       case '}':
    922         token = Select(Token::RBRACE);
    923         break;
    924 
    925       case '?':
    926         token = Select(Token::CONDITIONAL);
    927         break;
    928 
    929       case '~':
    930         token = Select(Token::BIT_NOT);
    931         break;
    932 
    933       default:
    934         if (kIsIdentifierStart.get(c0_)) {
    935           token = ScanIdentifier();
    936         } else if (IsDecimalDigit(c0_)) {
    937           token = ScanNumber(false);
    938         } else if (SkipWhiteSpace()) {
    939           token = Token::WHITESPACE;
    940         } else if (c0_ < 0) {
    941           token = Token::EOS;
    942         } else {
    943           token = Select(Token::ILLEGAL);
    944         }
    945         break;
    946     }
    947 
    948     // Continue scanning for tokens as long as we're just skipping
    949     // whitespace.
    950   } while (token == Token::WHITESPACE);
    951 
    952   next_.location.end_pos = source_pos();
    953   next_.token = token;
    954 }
    955 
    956 
    957 void Scanner::SeekForward(int pos) {
    958   source_->SeekForward(pos - 1);
    959   Advance();
    960   Scan();
    961 }
    962 
    963 
    964 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
    965   ASSERT(length <= 4);  // prevent overflow
    966 
    967   uc32 digits[4];
    968   uc32 x = 0;
    969   for (int i = 0; i < length; i++) {
    970     digits[i] = c0_;
    971     int d = HexValue(c0_);
    972     if (d < 0) {
    973       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
    974       // should be illegal, but other JS VMs just return the
    975       // non-escaped version of the original character.
    976 
    977       // Push back digits read, except the last one (in c0_).
    978       for (int j = i-1; j >= 0; j--) {
    979         PushBack(digits[j]);
    980       }
    981       // Notice: No handling of error - treat it as "\u"->"u".
    982       return c;
    983     }
    984     x = x * 16 + d;
    985     Advance();
    986   }
    987 
    988   return x;
    989 }
    990 
    991 
    992 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    993 // ECMA-262. Other JS VMs support them.
    994 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    995   uc32 x = c - '0';
    996   for (int i = 0; i < length; i++) {
    997     int d = c0_ - '0';
    998     if (d < 0 || d > 7) break;
    999     int nx = x * 8 + d;
   1000     if (nx >= 256) break;
   1001     x = nx;
   1002     Advance();
   1003   }
   1004   return x;
   1005 }
   1006 
   1007 
   1008 void Scanner::ScanEscape() {
   1009   uc32 c = c0_;
   1010   Advance();
   1011 
   1012   // Skip escaped newlines.
   1013   if (kIsLineTerminator.get(c)) {
   1014     // Allow CR+LF newlines in multiline string literals.
   1015     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
   1016     // Allow LF+CR newlines in multiline string literals.
   1017     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
   1018     return;
   1019   }
   1020 
   1021   switch (c) {
   1022     case '\'':  // fall through
   1023     case '"' :  // fall through
   1024     case '\\': break;
   1025     case 'b' : c = '\b'; break;
   1026     case 'f' : c = '\f'; break;
   1027     case 'n' : c = '\n'; break;
   1028     case 'r' : c = '\r'; break;
   1029     case 't' : c = '\t'; break;
   1030     case 'u' : c = ScanHexEscape(c, 4); break;
   1031     case 'v' : c = '\v'; break;
   1032     case 'x' : c = ScanHexEscape(c, 2); break;
   1033     case '0' :  // fall through
   1034     case '1' :  // fall through
   1035     case '2' :  // fall through
   1036     case '3' :  // fall through
   1037     case '4' :  // fall through
   1038     case '5' :  // fall through
   1039     case '6' :  // fall through
   1040     case '7' : c = ScanOctalEscape(c, 2); break;
   1041   }
   1042 
   1043   // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
   1044   // should be illegal, but they are commonly handled
   1045   // as non-escaped characters by JS VMs.
   1046   AddChar(c);
   1047 }
   1048 
   1049 
   1050 Token::Value Scanner::ScanString() {
   1051   uc32 quote = c0_;
   1052   Advance();  // consume quote
   1053 
   1054   StartLiteral();
   1055   while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
   1056     uc32 c = c0_;
   1057     Advance();
   1058     if (c == '\\') {
   1059       if (c0_ < 0) return Token::ILLEGAL;
   1060       ScanEscape();
   1061     } else {
   1062       AddChar(c);
   1063     }
   1064   }
   1065   if (c0_ != quote) {
   1066     return Token::ILLEGAL;
   1067   }
   1068   TerminateLiteral();
   1069 
   1070   Advance();  // consume quote
   1071   return Token::STRING;
   1072 }
   1073 
   1074 
   1075 Token::Value Scanner::Select(Token::Value tok) {
   1076   Advance();
   1077   return tok;
   1078 }
   1079 
   1080 
   1081 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
   1082   Advance();
   1083   if (c0_ == next) {
   1084     Advance();
   1085     return then;
   1086   } else {
   1087     return else_;
   1088   }
   1089 }
   1090 
   1091 
   1092 // Returns true if any decimal digits were scanned, returns false otherwise.
   1093 void Scanner::ScanDecimalDigits() {
   1094   while (IsDecimalDigit(c0_))
   1095     AddCharAdvance();
   1096 }
   1097 
   1098 
   1099 Token::Value Scanner::ScanNumber(bool seen_period) {
   1100   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
   1101 
   1102   enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
   1103 
   1104   StartLiteral();
   1105   if (seen_period) {
   1106     // we have already seen a decimal point of the float
   1107     AddChar('.');
   1108     ScanDecimalDigits();  // we know we have at least one digit
   1109 
   1110   } else {
   1111     // if the first character is '0' we must check for octals and hex
   1112     if (c0_ == '0') {
   1113       AddCharAdvance();
   1114 
   1115       // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
   1116       if (c0_ == 'x' || c0_ == 'X') {
   1117         // hex number
   1118         kind = HEX;
   1119         AddCharAdvance();
   1120         if (!IsHexDigit(c0_))
   1121           // we must have at least one hex digit after 'x'/'X'
   1122           return Token::ILLEGAL;
   1123         while (IsHexDigit(c0_))
   1124           AddCharAdvance();
   1125 
   1126       } else if ('0' <= c0_ && c0_ <= '7') {
   1127         // (possible) octal number
   1128         kind = OCTAL;
   1129         while (true) {
   1130           if (c0_ == '8' || c0_ == '9') {
   1131             kind = DECIMAL;
   1132             break;
   1133           }
   1134           if (c0_  < '0' || '7'  < c0_) break;
   1135           AddCharAdvance();
   1136         }
   1137       }
   1138     }
   1139 
   1140     // Parse decimal digits and allow trailing fractional part.
   1141     if (kind == DECIMAL) {
   1142       ScanDecimalDigits();  // optional
   1143       if (c0_ == '.') {
   1144         AddCharAdvance();
   1145         ScanDecimalDigits();  // optional
   1146       }
   1147     }
   1148   }
   1149 
   1150   // scan exponent, if any
   1151   if (c0_ == 'e' || c0_ == 'E') {
   1152     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
   1153     if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
   1154     // scan exponent
   1155     AddCharAdvance();
   1156     if (c0_ == '+' || c0_ == '-')
   1157       AddCharAdvance();
   1158     if (!IsDecimalDigit(c0_))
   1159       // we must have at least one decimal digit after 'e'/'E'
   1160       return Token::ILLEGAL;
   1161     ScanDecimalDigits();
   1162   }
   1163   TerminateLiteral();
   1164 
   1165   // The source character immediately following a numeric literal must
   1166   // not be an identifier start or a decimal digit; see ECMA-262
   1167   // section 7.8.3, page 17 (note that we read only one decimal digit
   1168   // if the value is 0).
   1169   if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
   1170     return Token::ILLEGAL;
   1171 
   1172   return Token::NUMBER;
   1173 }
   1174 
   1175 
   1176 uc32 Scanner::ScanIdentifierUnicodeEscape() {
   1177   Advance();
   1178   if (c0_ != 'u') return unibrow::Utf8::kBadChar;
   1179   Advance();
   1180   uc32 c = ScanHexEscape('u', 4);
   1181   // We do not allow a unicode escape sequence to start another
   1182   // unicode escape sequence.
   1183   if (c == '\\') return unibrow::Utf8::kBadChar;
   1184   return c;
   1185 }
   1186 
   1187 
   1188 Token::Value Scanner::ScanIdentifier() {
   1189   ASSERT(kIsIdentifierStart.get(c0_));
   1190 
   1191   StartLiteral();
   1192   KeywordMatcher keyword_match;
   1193 
   1194   // Scan identifier start character.
   1195   if (c0_ == '\\') {
   1196     uc32 c = ScanIdentifierUnicodeEscape();
   1197     // Only allow legal identifier start characters.
   1198     if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
   1199     AddChar(c);
   1200     keyword_match.Fail();
   1201   } else {
   1202     AddChar(c0_);
   1203     keyword_match.AddChar(c0_);
   1204     Advance();
   1205   }
   1206 
   1207   // Scan the rest of the identifier characters.
   1208   while (kIsIdentifierPart.get(c0_)) {
   1209     if (c0_ == '\\') {
   1210       uc32 c = ScanIdentifierUnicodeEscape();
   1211       // Only allow legal identifier part characters.
   1212       if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
   1213       AddChar(c);
   1214       keyword_match.Fail();
   1215     } else {
   1216       AddChar(c0_);
   1217       keyword_match.AddChar(c0_);
   1218       Advance();
   1219     }
   1220   }
   1221   TerminateLiteral();
   1222 
   1223   return keyword_match.token();
   1224 }
   1225 
   1226 
   1227 
   1228 bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
   1229   // Checks whether the buffer contains an identifier (no escape).
   1230   if (!buffer->has_more()) return false;
   1231   if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
   1232   while (buffer->has_more()) {
   1233     if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
   1234   }
   1235   return true;
   1236 }
   1237 
   1238 
   1239 bool Scanner::ScanRegExpPattern(bool seen_equal) {
   1240   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1241   bool in_character_class = false;
   1242 
   1243   // Previous token is either '/' or '/=', in the second case, the
   1244   // pattern starts at =.
   1245   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1246   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1247 
   1248   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1249   // the scanner should pass uninterpreted bodies to the RegExp
   1250   // constructor.
   1251   StartLiteral();
   1252   if (seen_equal)
   1253     AddChar('=');
   1254 
   1255   while (c0_ != '/' || in_character_class) {
   1256     if (kIsLineTerminator.get(c0_) || c0_ < 0)
   1257       return false;
   1258     if (c0_ == '\\') {  // escaped character
   1259       AddCharAdvance();
   1260       if (kIsLineTerminator.get(c0_) || c0_ < 0)
   1261         return false;
   1262       AddCharAdvance();
   1263     } else {  // unescaped character
   1264       if (c0_ == '[')
   1265         in_character_class = true;
   1266       if (c0_ == ']')
   1267         in_character_class = false;
   1268       AddCharAdvance();
   1269     }
   1270   }
   1271   Advance();  // consume '/'
   1272 
   1273   TerminateLiteral();
   1274 
   1275   return true;
   1276 }
   1277 
   1278 bool Scanner::ScanRegExpFlags() {
   1279   // Scan regular expression flags.
   1280   StartLiteral();
   1281   while (kIsIdentifierPart.get(c0_)) {
   1282     if (c0_ == '\\') {
   1283       uc32 c = ScanIdentifierUnicodeEscape();
   1284       if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
   1285         // We allow any escaped character, unlike the restriction on
   1286         // IdentifierPart when it is used to build an IdentifierName.
   1287         AddChar(c);
   1288         continue;
   1289       }
   1290     }
   1291     AddCharAdvance();
   1292   }
   1293   TerminateLiteral();
   1294 
   1295   next_.location.end_pos = source_pos() - 1;
   1296   return true;
   1297 }
   1298 
   1299 } }  // namespace v8::internal
   1300