// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "scanner.h" 32 33 namespace v8 { 34 namespace internal { 35 36 // ---------------------------------------------------------------------------- 37 // Character predicates 38 39 40 unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart; 41 unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart; 42 unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator; 43 unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace; 44 45 46 StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_; 47 48 49 // ---------------------------------------------------------------------------- 50 // UTF8Buffer 51 52 UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { } 53 54 55 UTF8Buffer::~UTF8Buffer() { 56 if (data_ != NULL) DeleteArray(data_); 57 } 58 59 60 void UTF8Buffer::AddCharSlow(uc32 c) { 61 static const int kCapacityGrowthLimit = 1 * MB; 62 if (cursor_ > limit_) { 63 int old_capacity = Capacity(); 64 int old_position = pos(); 65 int new_capacity = 66 Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit); 67 char* new_data = NewArray<char>(new_capacity); 68 memcpy(new_data, data_, old_position); 69 DeleteArray(data_); 70 data_ = new_data; 71 cursor_ = new_data + old_position; 72 limit_ = ComputeLimit(new_data, new_capacity); 73 ASSERT(Capacity() == new_capacity && pos() == old_position); 74 } 75 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 76 *cursor_++ = c; // Common case: 7-bit ASCII. 
77 } else { 78 cursor_ += unibrow::Utf8::Encode(cursor_, c); 79 } 80 ASSERT(pos() <= Capacity()); 81 } 82 83 84 // ---------------------------------------------------------------------------- 85 // UTF16Buffer 86 87 88 UTF16Buffer::UTF16Buffer() 89 : pos_(0), size_(0) { } 90 91 92 Handle<String> UTF16Buffer::SubString(int start, int end) { 93 return internal::SubString(data_, start, end); 94 } 95 96 97 // CharacterStreamUTF16Buffer 98 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() 99 : pushback_buffer_(0), last_(0), stream_(NULL) { } 100 101 102 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data, 103 unibrow::CharacterStream* input) { 104 data_ = data; 105 pos_ = 0; 106 stream_ = input; 107 } 108 109 110 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { 111 pushback_buffer()->Add(last_); 112 last_ = ch; 113 pos_--; 114 } 115 116 117 uc32 CharacterStreamUTF16Buffer::Advance() { 118 // NOTE: It is of importance to Persian / Farsi resources that we do 119 // *not* strip format control characters in the scanner; see 120 // 121 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 122 // 123 // So, even though ECMA-262, section 7.1, page 11, dictates that we 124 // must remove Unicode format-control characters, we do not. This is 125 // in line with how IE and SpiderMonkey handles it. 126 if (!pushback_buffer()->is_empty()) { 127 pos_++; 128 return last_ = pushback_buffer()->RemoveLast(); 129 } else if (stream_->has_more()) { 130 pos_++; 131 uc32 next = stream_->GetNext(); 132 return last_ = next; 133 } else { 134 // Note: currently the following increment is necessary to avoid a 135 // test-parser problem! 
136 pos_++; 137 return last_ = static_cast<uc32>(-1); 138 } 139 } 140 141 142 void CharacterStreamUTF16Buffer::SeekForward(int pos) { 143 pos_ = pos; 144 ASSERT(pushback_buffer()->is_empty()); 145 stream_->Seek(pos); 146 } 147 148 149 // TwoByteStringUTF16Buffer 150 TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer() 151 : raw_data_(NULL) { } 152 153 154 void TwoByteStringUTF16Buffer::Initialize( 155 Handle<ExternalTwoByteString> data) { 156 ASSERT(!data.is_null()); 157 158 data_ = data; 159 pos_ = 0; 160 161 raw_data_ = data->resource()->data(); 162 size_ = data->length(); 163 } 164 165 166 uc32 TwoByteStringUTF16Buffer::Advance() { 167 if (pos_ < size_) { 168 return raw_data_[pos_++]; 169 } else { 170 // note: currently the following increment is necessary to avoid a 171 // test-parser problem! 172 pos_++; 173 return static_cast<uc32>(-1); 174 } 175 } 176 177 178 void TwoByteStringUTF16Buffer::PushBack(uc32 ch) { 179 pos_--; 180 ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize); 181 ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch); 182 } 183 184 185 void TwoByteStringUTF16Buffer::SeekForward(int pos) { 186 pos_ = pos; 187 } 188 189 190 // ---------------------------------------------------------------------------- 191 // Keyword Matcher 192 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = { 193 { "break", KEYWORD_PREFIX, Token::BREAK }, 194 { NULL, C, Token::ILLEGAL }, 195 { NULL, D, Token::ILLEGAL }, 196 { "else", KEYWORD_PREFIX, Token::ELSE }, 197 { NULL, F, Token::ILLEGAL }, 198 { NULL, UNMATCHABLE, Token::ILLEGAL }, 199 { NULL, UNMATCHABLE, Token::ILLEGAL }, 200 { NULL, I, Token::ILLEGAL }, 201 { NULL, UNMATCHABLE, Token::ILLEGAL }, 202 { NULL, UNMATCHABLE, Token::ILLEGAL }, 203 { NULL, UNMATCHABLE, Token::ILLEGAL }, 204 { NULL, UNMATCHABLE, Token::ILLEGAL }, 205 { NULL, N, Token::ILLEGAL }, 206 { NULL, UNMATCHABLE, Token::ILLEGAL }, 207 { NULL, UNMATCHABLE, Token::ILLEGAL }, 208 { NULL, UNMATCHABLE, Token::ILLEGAL }, 
209 { "return", KEYWORD_PREFIX, Token::RETURN }, 210 { "switch", KEYWORD_PREFIX, Token::SWITCH }, 211 { NULL, T, Token::ILLEGAL }, 212 { NULL, UNMATCHABLE, Token::ILLEGAL }, 213 { NULL, V, Token::ILLEGAL }, 214 { NULL, W, Token::ILLEGAL } 215 }; 216 217 218 void KeywordMatcher::Step(uc32 input) { 219 switch (state_) { 220 case INITIAL: { 221 // matching the first character is the only state with significant fanout. 222 // Match only lower-case letters in range 'b'..'w'. 223 unsigned int offset = input - kFirstCharRangeMin; 224 if (offset < kFirstCharRangeLength) { 225 state_ = first_states_[offset].state; 226 if (state_ == KEYWORD_PREFIX) { 227 keyword_ = first_states_[offset].keyword; 228 counter_ = 1; 229 keyword_token_ = first_states_[offset].token; 230 } 231 return; 232 } 233 break; 234 } 235 case KEYWORD_PREFIX: 236 if (keyword_[counter_] == input) { 237 ASSERT_NE(input, '\0'); 238 counter_++; 239 if (keyword_[counter_] == '\0') { 240 state_ = KEYWORD_MATCHED; 241 token_ = keyword_token_; 242 } 243 return; 244 } 245 break; 246 case KEYWORD_MATCHED: 247 token_ = Token::IDENTIFIER; 248 break; 249 case C: 250 if (MatchState(input, 'a', CA)) return; 251 if (MatchState(input, 'o', CO)) return; 252 break; 253 case CA: 254 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return; 255 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return; 256 break; 257 case CO: 258 if (MatchState(input, 'n', CON)) return; 259 break; 260 case CON: 261 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return; 262 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return; 263 break; 264 case D: 265 if (MatchState(input, 'e', DE)) return; 266 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return; 267 break; 268 case DE: 269 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return; 270 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return; 271 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return; 272 break; 273 
case F: 274 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return; 275 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return; 276 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return; 277 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return; 278 break; 279 case I: 280 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return; 281 if (MatchKeyword(input, 'n', IN, Token::IN)) return; 282 break; 283 case IN: 284 token_ = Token::IDENTIFIER; 285 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) { 286 return; 287 } 288 break; 289 case N: 290 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return; 291 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return; 292 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return; 293 break; 294 case T: 295 if (MatchState(input, 'h', TH)) return; 296 if (MatchState(input, 'r', TR)) return; 297 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return; 298 break; 299 case TH: 300 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return; 301 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return; 302 break; 303 case TR: 304 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return; 305 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return; 306 break; 307 case V: 308 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return; 309 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return; 310 break; 311 case W: 312 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return; 313 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return; 314 break; 315 default: 316 UNREACHABLE(); 317 } 318 // On fallthrough, it's a failure. 
319 state_ = UNMATCHABLE; 320 } 321 322 323 // ---------------------------------------------------------------------------- 324 // Scanner 325 326 Scanner::Scanner(ParserMode pre) 327 : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { } 328 329 330 void Scanner::Init(Handle<String> source, 331 unibrow::CharacterStream* stream, 332 int position, 333 ParserLanguage language) { 334 // Initialize the source buffer. 335 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { 336 two_byte_string_buffer_.Initialize( 337 Handle<ExternalTwoByteString>::cast(source)); 338 source_ = &two_byte_string_buffer_; 339 } else { 340 char_stream_buffer_.Initialize(source, stream); 341 source_ = &char_stream_buffer_; 342 } 343 344 position_ = position; 345 is_parsing_json_ = (language == JSON); 346 347 // Set c0_ (one character ahead) 348 ASSERT(kCharacterLookaheadBufferSize == 1); 349 Advance(); 350 // Initializer current_ to not refer to a literal buffer. 351 current_.literal_buffer = NULL; 352 353 // Skip initial whitespace allowing HTML comment ends just like 354 // after a newline and scan first token. 355 has_line_terminator_before_next_ = true; 356 SkipWhiteSpace(); 357 Scan(); 358 } 359 360 361 Handle<String> Scanner::SubString(int start, int end) { 362 return source_->SubString(start - position_, end - position_); 363 } 364 365 366 Token::Value Scanner::Next() { 367 // BUG 1215673: Find a thread safe way to set a stack limit in 368 // pre-parse mode. Otherwise, we cannot safely pre-parse from other 369 // threads. 370 current_ = next_; 371 // Check for stack-overflow before returning any tokens. 372 StackLimitCheck check; 373 if (check.HasOverflowed()) { 374 stack_overflow_ = true; 375 next_.token = Token::ILLEGAL; 376 } else { 377 Scan(); 378 } 379 return current_.token; 380 } 381 382 383 void Scanner::StartLiteral() { 384 // Use the first buffer unless it's currently in use by the current_ token. 
385 // In most cases we won't have two literals/identifiers in a row, so 386 // the second buffer won't be used very often and is unlikely to grow much. 387 UTF8Buffer* free_buffer = 388 (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_ 389 : &literal_buffer_2_; 390 next_.literal_buffer = free_buffer; 391 free_buffer->Reset(); 392 } 393 394 395 void Scanner::AddChar(uc32 c) { 396 next_.literal_buffer->AddChar(c); 397 } 398 399 400 void Scanner::TerminateLiteral() { 401 AddChar(0); 402 } 403 404 405 void Scanner::AddCharAdvance() { 406 AddChar(c0_); 407 Advance(); 408 } 409 410 411 static inline bool IsByteOrderMark(uc32 c) { 412 // The Unicode value U+FFFE is guaranteed never to be assigned as a 413 // Unicode character; this implies that in a Unicode context the 414 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 415 // character expressed in little-endian byte order (since it could 416 // not be a U+FFFE character expressed in big-endian byte 417 // order). Nevertheless, we check for it to be compatible with 418 // Spidermonkey. 419 return c == 0xFEFF || c == 0xFFFE; 420 } 421 422 423 bool Scanner::SkipJsonWhiteSpace() { 424 int start_position = source_pos(); 425 // JSON WhiteSpace is tab, carrige-return, newline and space. 426 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') { 427 Advance(); 428 } 429 return source_pos() != start_position; 430 } 431 432 433 bool Scanner::SkipJavaScriptWhiteSpace() { 434 int start_position = source_pos(); 435 436 while (true) { 437 // We treat byte-order marks (BOMs) as whitespace for better 438 // compatibility with Spidermonkey and other JavaScript engines. 439 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { 440 // IsWhiteSpace() includes line terminators! 441 if (kIsLineTerminator.get(c0_)) { 442 // Ignore line terminators, but remember them. This is necessary 443 // for automatic semicolon insertion. 
444 has_line_terminator_before_next_ = true; 445 } 446 Advance(); 447 } 448 449 // If there is an HTML comment end '-->' at the beginning of a 450 // line (with only whitespace in front of it), we treat the rest 451 // of the line as a comment. This is in line with the way 452 // SpiderMonkey handles it. 453 if (c0_ == '-' && has_line_terminator_before_next_) { 454 Advance(); 455 if (c0_ == '-') { 456 Advance(); 457 if (c0_ == '>') { 458 // Treat the rest of the line as a comment. 459 SkipSingleLineComment(); 460 // Continue skipping white space after the comment. 461 continue; 462 } 463 PushBack('-'); // undo Advance() 464 } 465 PushBack('-'); // undo Advance() 466 } 467 // Return whether or not we skipped any characters. 468 return source_pos() != start_position; 469 } 470 } 471 472 473 Token::Value Scanner::SkipSingleLineComment() { 474 Advance(); 475 476 // The line terminator at the end of the line is not considered 477 // to be part of the single-line comment; it is recognized 478 // separately by the lexical grammar and becomes part of the 479 // stream of input elements for the syntactic grammar (see 480 // ECMA-262, section 7.4, page 12). 481 while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) { 482 Advance(); 483 } 484 485 return Token::WHITESPACE; 486 } 487 488 489 Token::Value Scanner::SkipMultiLineComment() { 490 ASSERT(c0_ == '*'); 491 Advance(); 492 493 while (c0_ >= 0) { 494 char ch = c0_; 495 Advance(); 496 // If we have reached the end of the multi-line comment, we 497 // consume the '/' and insert a whitespace. This way all 498 // multi-line comments are treated as whitespace - even the ones 499 // containing line terminators. This contradicts ECMA-262, section 500 // 7.4, page 12, that says that multi-line comments containing 501 // line terminators should be treated as a line terminator, but it 502 // matches the behaviour of SpiderMonkey and KJS. 
503 if (ch == '*' && c0_ == '/') { 504 c0_ = ' '; 505 return Token::WHITESPACE; 506 } 507 } 508 509 // Unterminated multi-line comment. 510 return Token::ILLEGAL; 511 } 512 513 514 Token::Value Scanner::ScanHtmlComment() { 515 // Check for <!-- comments. 516 ASSERT(c0_ == '!'); 517 Advance(); 518 if (c0_ == '-') { 519 Advance(); 520 if (c0_ == '-') return SkipSingleLineComment(); 521 PushBack('-'); // undo Advance() 522 } 523 PushBack('!'); // undo Advance() 524 ASSERT(c0_ == '!'); 525 return Token::LT; 526 } 527 528 529 530 void Scanner::ScanJson() { 531 next_.literal_buffer = NULL; 532 Token::Value token; 533 has_line_terminator_before_next_ = false; 534 do { 535 // Remember the position of the next token 536 next_.location.beg_pos = source_pos(); 537 switch (c0_) { 538 case '\t': 539 case '\r': 540 case '\n': 541 case ' ': 542 Advance(); 543 token = Token::WHITESPACE; 544 break; 545 case '{': 546 Advance(); 547 token = Token::LBRACE; 548 break; 549 case '}': 550 Advance(); 551 token = Token::RBRACE; 552 break; 553 case '[': 554 Advance(); 555 token = Token::LBRACK; 556 break; 557 case ']': 558 Advance(); 559 token = Token::RBRACK; 560 break; 561 case ':': 562 Advance(); 563 token = Token::COLON; 564 break; 565 case ',': 566 Advance(); 567 token = Token::COMMA; 568 break; 569 case '"': 570 token = ScanJsonString(); 571 break; 572 case '-': 573 case '0': 574 case '1': 575 case '2': 576 case '3': 577 case '4': 578 case '5': 579 case '6': 580 case '7': 581 case '8': 582 case '9': 583 token = ScanJsonNumber(); 584 break; 585 case 't': 586 token = ScanJsonIdentifier("true", Token::TRUE_LITERAL); 587 break; 588 case 'f': 589 token = ScanJsonIdentifier("false", Token::FALSE_LITERAL); 590 break; 591 case 'n': 592 token = ScanJsonIdentifier("null", Token::NULL_LITERAL); 593 break; 594 default: 595 if (c0_ < 0) { 596 Advance(); 597 token = Token::EOS; 598 } else { 599 Advance(); 600 token = Select(Token::ILLEGAL); 601 } 602 } 603 } while (token == Token::WHITESPACE); 604 
605 next_.location.end_pos = source_pos(); 606 next_.token = token; 607 } 608 609 610 Token::Value Scanner::ScanJsonString() { 611 ASSERT_EQ('"', c0_); 612 Advance(); 613 StartLiteral(); 614 while (c0_ != '"' && c0_ > 0) { 615 // Check for control character (0x00-0x1f) or unterminated string (<0). 616 if (c0_ < 0x20) return Token::ILLEGAL; 617 if (c0_ != '\\') { 618 AddCharAdvance(); 619 } else { 620 Advance(); 621 switch (c0_) { 622 case '"': 623 case '\\': 624 case '/': 625 AddChar(c0_); 626 break; 627 case 'b': 628 AddChar('\x08'); 629 break; 630 case 'f': 631 AddChar('\x0c'); 632 break; 633 case 'n': 634 AddChar('\x0a'); 635 break; 636 case 'r': 637 AddChar('\x0d'); 638 break; 639 case 't': 640 AddChar('\x09'); 641 break; 642 case 'u': { 643 uc32 value = 0; 644 for (int i = 0; i < 4; i++) { 645 Advance(); 646 int digit = HexValue(c0_); 647 if (digit < 0) return Token::ILLEGAL; 648 value = value * 16 + digit; 649 } 650 AddChar(value); 651 break; 652 } 653 default: 654 return Token::ILLEGAL; 655 } 656 Advance(); 657 } 658 } 659 if (c0_ != '"') { 660 return Token::ILLEGAL; 661 } 662 TerminateLiteral(); 663 Advance(); 664 return Token::STRING; 665 } 666 667 668 Token::Value Scanner::ScanJsonNumber() { 669 StartLiteral(); 670 if (c0_ == '-') AddCharAdvance(); 671 if (c0_ == '0') { 672 AddCharAdvance(); 673 // Prefix zero is only allowed if it's the only digit before 674 // a decimal point or exponent. 
675 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL; 676 } else { 677 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL; 678 do { 679 AddCharAdvance(); 680 } while (c0_ >= '0' && c0_ <= '9'); 681 } 682 if (c0_ == '.') { 683 AddCharAdvance(); 684 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 685 do { 686 AddCharAdvance(); 687 } while (c0_ >= '0' && c0_ <= '9'); 688 } 689 if ((c0_ | 0x20) == 'e') { 690 AddCharAdvance(); 691 if (c0_ == '-' || c0_ == '+') AddCharAdvance(); 692 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 693 do { 694 AddCharAdvance(); 695 } while (c0_ >= '0' && c0_ <= '9'); 696 } 697 TerminateLiteral(); 698 return Token::NUMBER; 699 } 700 701 702 Token::Value Scanner::ScanJsonIdentifier(const char* text, 703 Token::Value token) { 704 StartLiteral(); 705 while (*text != '\0') { 706 if (c0_ != *text) return Token::ILLEGAL; 707 Advance(); 708 text++; 709 } 710 if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL; 711 TerminateLiteral(); 712 return token; 713 } 714 715 716 void Scanner::ScanJavaScript() { 717 next_.literal_buffer = NULL; 718 Token::Value token; 719 has_line_terminator_before_next_ = false; 720 do { 721 // Remember the position of the next token 722 next_.location.beg_pos = source_pos(); 723 724 switch (c0_) { 725 case ' ': 726 case '\t': 727 Advance(); 728 token = Token::WHITESPACE; 729 break; 730 731 case '\n': 732 Advance(); 733 has_line_terminator_before_next_ = true; 734 token = Token::WHITESPACE; 735 break; 736 737 case '"': case '\'': 738 token = ScanString(); 739 break; 740 741 case '<': 742 // < <= << <<= <!-- 743 Advance(); 744 if (c0_ == '=') { 745 token = Select(Token::LTE); 746 } else if (c0_ == '<') { 747 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 748 } else if (c0_ == '!') { 749 token = ScanHtmlComment(); 750 } else { 751 token = Token::LT; 752 } 753 break; 754 755 case '>': 756 // > >= >> >>= >>> >>>= 757 Advance(); 758 if (c0_ == '=') { 759 token = Select(Token::GTE); 760 } else if (c0_ == '>') { 
761 // >> >>= >>> >>>= 762 Advance(); 763 if (c0_ == '=') { 764 token = Select(Token::ASSIGN_SAR); 765 } else if (c0_ == '>') { 766 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 767 } else { 768 token = Token::SAR; 769 } 770 } else { 771 token = Token::GT; 772 } 773 break; 774 775 case '=': 776 // = == === 777 Advance(); 778 if (c0_ == '=') { 779 token = Select('=', Token::EQ_STRICT, Token::EQ); 780 } else { 781 token = Token::ASSIGN; 782 } 783 break; 784 785 case '!': 786 // ! != !== 787 Advance(); 788 if (c0_ == '=') { 789 token = Select('=', Token::NE_STRICT, Token::NE); 790 } else { 791 token = Token::NOT; 792 } 793 break; 794 795 case '+': 796 // + ++ += 797 Advance(); 798 if (c0_ == '+') { 799 token = Select(Token::INC); 800 } else if (c0_ == '=') { 801 token = Select(Token::ASSIGN_ADD); 802 } else { 803 token = Token::ADD; 804 } 805 break; 806 807 case '-': 808 // - -- --> -= 809 Advance(); 810 if (c0_ == '-') { 811 Advance(); 812 if (c0_ == '>' && has_line_terminator_before_next_) { 813 // For compatibility with SpiderMonkey, we skip lines that 814 // start with an HTML comment end '-->'. 
815 token = SkipSingleLineComment(); 816 } else { 817 token = Token::DEC; 818 } 819 } else if (c0_ == '=') { 820 token = Select(Token::ASSIGN_SUB); 821 } else { 822 token = Token::SUB; 823 } 824 break; 825 826 case '*': 827 // * *= 828 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 829 break; 830 831 case '%': 832 // % %= 833 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 834 break; 835 836 case '/': 837 // / // /* /= 838 Advance(); 839 if (c0_ == '/') { 840 token = SkipSingleLineComment(); 841 } else if (c0_ == '*') { 842 token = SkipMultiLineComment(); 843 } else if (c0_ == '=') { 844 token = Select(Token::ASSIGN_DIV); 845 } else { 846 token = Token::DIV; 847 } 848 break; 849 850 case '&': 851 // & && &= 852 Advance(); 853 if (c0_ == '&') { 854 token = Select(Token::AND); 855 } else if (c0_ == '=') { 856 token = Select(Token::ASSIGN_BIT_AND); 857 } else { 858 token = Token::BIT_AND; 859 } 860 break; 861 862 case '|': 863 // | || |= 864 Advance(); 865 if (c0_ == '|') { 866 token = Select(Token::OR); 867 } else if (c0_ == '=') { 868 token = Select(Token::ASSIGN_BIT_OR); 869 } else { 870 token = Token::BIT_OR; 871 } 872 break; 873 874 case '^': 875 // ^ ^= 876 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 877 break; 878 879 case '.': 880 // . 
Number 881 Advance(); 882 if (IsDecimalDigit(c0_)) { 883 token = ScanNumber(true); 884 } else { 885 token = Token::PERIOD; 886 } 887 break; 888 889 case ':': 890 token = Select(Token::COLON); 891 break; 892 893 case ';': 894 token = Select(Token::SEMICOLON); 895 break; 896 897 case ',': 898 token = Select(Token::COMMA); 899 break; 900 901 case '(': 902 token = Select(Token::LPAREN); 903 break; 904 905 case ')': 906 token = Select(Token::RPAREN); 907 break; 908 909 case '[': 910 token = Select(Token::LBRACK); 911 break; 912 913 case ']': 914 token = Select(Token::RBRACK); 915 break; 916 917 case '{': 918 token = Select(Token::LBRACE); 919 break; 920 921 case '}': 922 token = Select(Token::RBRACE); 923 break; 924 925 case '?': 926 token = Select(Token::CONDITIONAL); 927 break; 928 929 case '~': 930 token = Select(Token::BIT_NOT); 931 break; 932 933 default: 934 if (kIsIdentifierStart.get(c0_)) { 935 token = ScanIdentifier(); 936 } else if (IsDecimalDigit(c0_)) { 937 token = ScanNumber(false); 938 } else if (SkipWhiteSpace()) { 939 token = Token::WHITESPACE; 940 } else if (c0_ < 0) { 941 token = Token::EOS; 942 } else { 943 token = Select(Token::ILLEGAL); 944 } 945 break; 946 } 947 948 // Continue scanning for tokens as long as we're just skipping 949 // whitespace. 950 } while (token == Token::WHITESPACE); 951 952 next_.location.end_pos = source_pos(); 953 next_.token = token; 954 } 955 956 957 void Scanner::SeekForward(int pos) { 958 source_->SeekForward(pos - 1); 959 Advance(); 960 Scan(); 961 } 962 963 964 uc32 Scanner::ScanHexEscape(uc32 c, int length) { 965 ASSERT(length <= 4); // prevent overflow 966 967 uc32 digits[4]; 968 uc32 x = 0; 969 for (int i = 0; i < length; i++) { 970 digits[i] = c0_; 971 int d = HexValue(c0_); 972 if (d < 0) { 973 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 974 // should be illegal, but other JS VMs just return the 975 // non-escaped version of the original character. 
976 977 // Push back digits read, except the last one (in c0_). 978 for (int j = i-1; j >= 0; j--) { 979 PushBack(digits[j]); 980 } 981 // Notice: No handling of error - treat it as "\u"->"u". 982 return c; 983 } 984 x = x * 16 + d; 985 Advance(); 986 } 987 988 return x; 989 } 990 991 992 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 993 // ECMA-262. Other JS VMs support them. 994 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 995 uc32 x = c - '0'; 996 for (int i = 0; i < length; i++) { 997 int d = c0_ - '0'; 998 if (d < 0 || d > 7) break; 999 int nx = x * 8 + d; 1000 if (nx >= 256) break; 1001 x = nx; 1002 Advance(); 1003 } 1004 return x; 1005 } 1006 1007 1008 void Scanner::ScanEscape() { 1009 uc32 c = c0_; 1010 Advance(); 1011 1012 // Skip escaped newlines. 1013 if (kIsLineTerminator.get(c)) { 1014 // Allow CR+LF newlines in multiline string literals. 1015 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 1016 // Allow LF+CR newlines in multiline string literals. 1017 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 1018 return; 1019 } 1020 1021 switch (c) { 1022 case '\'': // fall through 1023 case '"' : // fall through 1024 case '\\': break; 1025 case 'b' : c = '\b'; break; 1026 case 'f' : c = '\f'; break; 1027 case 'n' : c = '\n'; break; 1028 case 'r' : c = '\r'; break; 1029 case 't' : c = '\t'; break; 1030 case 'u' : c = ScanHexEscape(c, 4); break; 1031 case 'v' : c = '\v'; break; 1032 case 'x' : c = ScanHexEscape(c, 2); break; 1033 case '0' : // fall through 1034 case '1' : // fall through 1035 case '2' : // fall through 1036 case '3' : // fall through 1037 case '4' : // fall through 1038 case '5' : // fall through 1039 case '6' : // fall through 1040 case '7' : c = ScanOctalEscape(c, 2); break; 1041 } 1042 1043 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 1044 // should be illegal, but they are commonly handled 1045 // as non-escaped characters by JS VMs. 
1046 AddChar(c); 1047 } 1048 1049 1050 Token::Value Scanner::ScanString() { 1051 uc32 quote = c0_; 1052 Advance(); // consume quote 1053 1054 StartLiteral(); 1055 while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) { 1056 uc32 c = c0_; 1057 Advance(); 1058 if (c == '\\') { 1059 if (c0_ < 0) return Token::ILLEGAL; 1060 ScanEscape(); 1061 } else { 1062 AddChar(c); 1063 } 1064 } 1065 if (c0_ != quote) { 1066 return Token::ILLEGAL; 1067 } 1068 TerminateLiteral(); 1069 1070 Advance(); // consume quote 1071 return Token::STRING; 1072 } 1073 1074 1075 Token::Value Scanner::Select(Token::Value tok) { 1076 Advance(); 1077 return tok; 1078 } 1079 1080 1081 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) { 1082 Advance(); 1083 if (c0_ == next) { 1084 Advance(); 1085 return then; 1086 } else { 1087 return else_; 1088 } 1089 } 1090 1091 1092 // Returns true if any decimal digits were scanned, returns false otherwise. 1093 void Scanner::ScanDecimalDigits() { 1094 while (IsDecimalDigit(c0_)) 1095 AddCharAdvance(); 1096 } 1097 1098 1099 Token::Value Scanner::ScanNumber(bool seen_period) { 1100 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 1101 1102 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 1103 1104 StartLiteral(); 1105 if (seen_period) { 1106 // we have already seen a decimal point of the float 1107 AddChar('.'); 1108 ScanDecimalDigits(); // we know we have at least one digit 1109 1110 } else { 1111 // if the first character is '0' we must check for octals and hex 1112 if (c0_ == '0') { 1113 AddCharAdvance(); 1114 1115 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 1116 if (c0_ == 'x' || c0_ == 'X') { 1117 // hex number 1118 kind = HEX; 1119 AddCharAdvance(); 1120 if (!IsHexDigit(c0_)) 1121 // we must have at least one hex digit after 'x'/'X' 1122 return Token::ILLEGAL; 1123 while (IsHexDigit(c0_)) 1124 AddCharAdvance(); 1125 1126 } else if ('0' <= c0_ && c0_ <= '7') { 1127 // 
(possible) octal number 1128 kind = OCTAL; 1129 while (true) { 1130 if (c0_ == '8' || c0_ == '9') { 1131 kind = DECIMAL; 1132 break; 1133 } 1134 if (c0_ < '0' || '7' < c0_) break; 1135 AddCharAdvance(); 1136 } 1137 } 1138 } 1139 1140 // Parse decimal digits and allow trailing fractional part. 1141 if (kind == DECIMAL) { 1142 ScanDecimalDigits(); // optional 1143 if (c0_ == '.') { 1144 AddCharAdvance(); 1145 ScanDecimalDigits(); // optional 1146 } 1147 } 1148 } 1149 1150 // scan exponent, if any 1151 if (c0_ == 'e' || c0_ == 'E') { 1152 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 1153 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 1154 // scan exponent 1155 AddCharAdvance(); 1156 if (c0_ == '+' || c0_ == '-') 1157 AddCharAdvance(); 1158 if (!IsDecimalDigit(c0_)) 1159 // we must have at least one decimal digit after 'e'/'E' 1160 return Token::ILLEGAL; 1161 ScanDecimalDigits(); 1162 } 1163 TerminateLiteral(); 1164 1165 // The source character immediately following a numeric literal must 1166 // not be an identifier start or a decimal digit; see ECMA-262 1167 // section 7.8.3, page 17 (note that we read only one decimal digit 1168 // if the value is 0). 1169 if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_)) 1170 return Token::ILLEGAL; 1171 1172 return Token::NUMBER; 1173 } 1174 1175 1176 uc32 Scanner::ScanIdentifierUnicodeEscape() { 1177 Advance(); 1178 if (c0_ != 'u') return unibrow::Utf8::kBadChar; 1179 Advance(); 1180 uc32 c = ScanHexEscape('u', 4); 1181 // We do not allow a unicode escape sequence to start another 1182 // unicode escape sequence. 1183 if (c == '\\') return unibrow::Utf8::kBadChar; 1184 return c; 1185 } 1186 1187 1188 Token::Value Scanner::ScanIdentifier() { 1189 ASSERT(kIsIdentifierStart.get(c0_)); 1190 1191 StartLiteral(); 1192 KeywordMatcher keyword_match; 1193 1194 // Scan identifier start character. 
1195 if (c0_ == '\\') { 1196 uc32 c = ScanIdentifierUnicodeEscape(); 1197 // Only allow legal identifier start characters. 1198 if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL; 1199 AddChar(c); 1200 keyword_match.Fail(); 1201 } else { 1202 AddChar(c0_); 1203 keyword_match.AddChar(c0_); 1204 Advance(); 1205 } 1206 1207 // Scan the rest of the identifier characters. 1208 while (kIsIdentifierPart.get(c0_)) { 1209 if (c0_ == '\\') { 1210 uc32 c = ScanIdentifierUnicodeEscape(); 1211 // Only allow legal identifier part characters. 1212 if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL; 1213 AddChar(c); 1214 keyword_match.Fail(); 1215 } else { 1216 AddChar(c0_); 1217 keyword_match.AddChar(c0_); 1218 Advance(); 1219 } 1220 } 1221 TerminateLiteral(); 1222 1223 return keyword_match.token(); 1224 } 1225 1226 1227 1228 bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) { 1229 // Checks whether the buffer contains an identifier (no escape). 1230 if (!buffer->has_more()) return false; 1231 if (!kIsIdentifierStart.get(buffer->GetNext())) return false; 1232 while (buffer->has_more()) { 1233 if (!kIsIdentifierPart.get(buffer->GetNext())) return false; 1234 } 1235 return true; 1236 } 1237 1238 1239 bool Scanner::ScanRegExpPattern(bool seen_equal) { 1240 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1241 bool in_character_class = false; 1242 1243 // Previous token is either '/' or '/=', in the second case, the 1244 // pattern starts at =. 1245 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1246 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1247 1248 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1249 // the scanner should pass uninterpreted bodies to the RegExp 1250 // constructor. 
1251 StartLiteral(); 1252 if (seen_equal) 1253 AddChar('='); 1254 1255 while (c0_ != '/' || in_character_class) { 1256 if (kIsLineTerminator.get(c0_) || c0_ < 0) 1257 return false; 1258 if (c0_ == '\\') { // escaped character 1259 AddCharAdvance(); 1260 if (kIsLineTerminator.get(c0_) || c0_ < 0) 1261 return false; 1262 AddCharAdvance(); 1263 } else { // unescaped character 1264 if (c0_ == '[') 1265 in_character_class = true; 1266 if (c0_ == ']') 1267 in_character_class = false; 1268 AddCharAdvance(); 1269 } 1270 } 1271 Advance(); // consume '/' 1272 1273 TerminateLiteral(); 1274 1275 return true; 1276 } 1277 1278 bool Scanner::ScanRegExpFlags() { 1279 // Scan regular expression flags. 1280 StartLiteral(); 1281 while (kIsIdentifierPart.get(c0_)) { 1282 if (c0_ == '\\') { 1283 uc32 c = ScanIdentifierUnicodeEscape(); 1284 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { 1285 // We allow any escaped character, unlike the restriction on 1286 // IdentifierPart when it is used to build an IdentifierName. 1287 AddChar(c); 1288 continue; 1289 } 1290 } 1291 AddCharAdvance(); 1292 } 1293 TerminateLiteral(); 1294 1295 next_.location.end_pos = source_pos() - 1; 1296 return true; 1297 } 1298 1299 } } // namespace v8::internal 1300