1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #include "src/parsing/scanner.h" 8 9 #include <stdint.h> 10 11 #include <cmath> 12 13 #include "src/ast/ast-value-factory.h" 14 #include "src/char-predicates-inl.h" 15 #include "src/conversions-inl.h" 16 #include "src/list-inl.h" 17 #include "src/parsing/duplicate-finder.h" // For Scanner::FindSymbol 18 19 namespace v8 { 20 namespace internal { 21 22 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const { 23 if (is_one_byte()) { 24 return isolate->factory()->InternalizeOneByteString(one_byte_literal()); 25 } 26 return isolate->factory()->InternalizeTwoByteString(two_byte_literal()); 27 } 28 29 // ---------------------------------------------------------------------------- 30 // Scanner::BookmarkScope 31 32 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos = 33 std::numeric_limits<size_t>::max() - 2; 34 const size_t Scanner::BookmarkScope::kNoBookmark = 35 std::numeric_limits<size_t>::max() - 1; 36 const size_t Scanner::BookmarkScope::kBookmarkWasApplied = 37 std::numeric_limits<size_t>::max(); 38 39 void Scanner::BookmarkScope::Set() { 40 DCHECK_EQ(bookmark_, kNoBookmark); 41 DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED); 42 43 // The first token is a bit special, since current_ will still be 44 // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it 45 // when 46 // applying the bookmark. 47 DCHECK_IMPLIES( 48 scanner_->current_.token == Token::UNINITIALIZED, 49 scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos); 50 bookmark_ = (scanner_->current_.token == Token::UNINITIALIZED) 51 ? kBookmarkAtFirstPos 52 : scanner_->location().beg_pos; 53 } 54 55 void Scanner::BookmarkScope::Apply() { 56 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark. 57 if (bookmark_ == kBookmarkAtFirstPos) { 58 scanner_->SeekNext(0); 59 } else { 60 scanner_->SeekNext(bookmark_); 61 scanner_->Next(); 62 DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_)); 63 } 64 bookmark_ = kBookmarkWasApplied; 65 } 66 67 bool Scanner::BookmarkScope::HasBeenSet() { 68 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied; 69 } 70 71 bool Scanner::BookmarkScope::HasBeenApplied() { 72 return bookmark_ == kBookmarkWasApplied; 73 } 74 75 // ---------------------------------------------------------------------------- 76 // Scanner 77 78 Scanner::Scanner(UnicodeCache* unicode_cache) 79 : unicode_cache_(unicode_cache), 80 octal_pos_(Location::invalid()), 81 decimal_with_leading_zero_pos_(Location::invalid()), 82 found_html_comment_(false) { 83 } 84 85 86 void Scanner::Initialize(Utf16CharacterStream* source) { 87 source_ = source; 88 // Need to capture identifiers in order to recognize "get" and "set" 89 // in object literals. 90 Init(); 91 // Skip initial whitespace allowing HTML comment ends just like 92 // after a newline and scan first token. 93 has_line_terminator_before_next_ = true; 94 SkipWhiteSpace(); 95 Scan(); 96 } 97 98 template <bool capture_raw, bool unicode> 99 uc32 Scanner::ScanHexNumber(int expected_length) { 100 DCHECK(expected_length <= 4); // prevent overflow 101 102 int begin = source_pos() - 2; 103 uc32 x = 0; 104 for (int i = 0; i < expected_length; i++) { 105 int d = HexValue(c0_); 106 if (d < 0) { 107 ReportScannerError(Location(begin, begin + expected_length + 2), 108 unicode 109 ? MessageTemplate::kInvalidUnicodeEscapeSequence 110 : MessageTemplate::kInvalidHexEscapeSequence); 111 return -1; 112 } 113 x = x * 16 + d; 114 Advance<capture_raw>(); 115 } 116 117 return x; 118 } 119 120 template <bool capture_raw> 121 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) { 122 uc32 x = 0; 123 int d = HexValue(c0_); 124 if (d < 0) return -1; 125 126 while (d >= 0) { 127 x = x * 16 + d; 128 if (x > max_value) { 129 ReportScannerError(Location(beg_pos, source_pos() + 1), 130 MessageTemplate::kUndefinedUnicodeCodePoint); 131 return -1; 132 } 133 Advance<capture_raw>(); 134 d = HexValue(c0_); 135 } 136 137 return x; 138 } 139 140 141 // Ensure that tokens can be stored in a byte. 142 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 143 144 // Table of one-character tokens, by character (0x00..0x7f only). 145 static const byte one_char_tokens[] = { 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::ILLEGAL, 152 Token::ILLEGAL, 153 Token::ILLEGAL, 154 Token::ILLEGAL, 155 Token::ILLEGAL, 156 Token::ILLEGAL, 157 Token::ILLEGAL, 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::ILLEGAL, 183 Token::ILLEGAL, 184 Token::ILLEGAL, 185 Token::ILLEGAL, 186 Token::LPAREN, // 0x28 187 Token::RPAREN, // 0x29 188 Token::ILLEGAL, 189 Token::ILLEGAL, 190 Token::COMMA, // 0x2c 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::COLON, // 0x3a 205 Token::SEMICOLON, // 0x3b 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::CONDITIONAL, // 0x3f 210 Token::ILLEGAL, 211 Token::ILLEGAL, 212 Token::ILLEGAL, 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::ILLEGAL, 216 Token::ILLEGAL, 217 Token::ILLEGAL, 218 Token::ILLEGAL, 219 Token::ILLEGAL, 220 Token::ILLEGAL, 221 Token::ILLEGAL, 222 Token::ILLEGAL, 223 Token::ILLEGAL, 224 Token::ILLEGAL, 225 Token::ILLEGAL, 226 Token::ILLEGAL, 227 Token::ILLEGAL, 228 Token::ILLEGAL, 229 Token::ILLEGAL, 230 Token::ILLEGAL, 231 Token::ILLEGAL, 232 Token::ILLEGAL, 233 Token::ILLEGAL, 234 Token::ILLEGAL, 235 Token::ILLEGAL, 236 Token::ILLEGAL, 237 Token::LBRACK, // 0x5b 238 Token::ILLEGAL, 239 Token::RBRACK, // 0x5d 240 Token::ILLEGAL, 241 Token::ILLEGAL, 242 Token::ILLEGAL, 243 Token::ILLEGAL, 244 Token::ILLEGAL, 245 Token::ILLEGAL, 246 Token::ILLEGAL, 247 Token::ILLEGAL, 248 Token::ILLEGAL, 249 Token::ILLEGAL, 250 Token::ILLEGAL, 251 Token::ILLEGAL, 252 Token::ILLEGAL, 253 Token::ILLEGAL, 254 Token::ILLEGAL, 255 Token::ILLEGAL, 256 Token::ILLEGAL, 257 Token::ILLEGAL, 258 Token::ILLEGAL, 259 Token::ILLEGAL, 260 Token::ILLEGAL, 261 Token::ILLEGAL, 262 Token::ILLEGAL, 263 Token::ILLEGAL, 264 Token::ILLEGAL, 265 Token::ILLEGAL, 266 Token::ILLEGAL, 267 Token::ILLEGAL, 268 Token::ILLEGAL, 269 Token::LBRACE, // 0x7b 270 Token::ILLEGAL, 271 Token::RBRACE, // 0x7d 272 Token::BIT_NOT, // 0x7e 273 Token::ILLEGAL 274 }; 275 276 277 Token::Value Scanner::Next() { 278 if (next_.token == Token::EOS) { 279 next_.location.beg_pos = current_.location.beg_pos; 280 next_.location.end_pos = current_.location.end_pos; 281 } 282 current_ = next_; 283 if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) { 284 next_ = next_next_; 285 next_next_.token = Token::UNINITIALIZED; 286 has_line_terminator_before_next_ = has_line_terminator_after_next_; 287 return current_.token; 288 } 289 has_line_terminator_before_next_ = false; 290 has_multiline_comment_before_next_ = false; 291 if (static_cast<unsigned>(c0_) <= 0x7f) { 292 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 293 if (token != Token::ILLEGAL) { 294 int pos = source_pos(); 295 next_.token = token; 296 next_.location.beg_pos = pos; 297 next_.location.end_pos = pos + 1; 298 next_.literal_chars = nullptr; 299 next_.raw_literal_chars = nullptr; 300 Advance(); 301 return current_.token; 302 } 303 } 304 Scan(); 305 return current_.token; 306 } 307 308 309 Token::Value Scanner::PeekAhead() { 310 DCHECK(next_.token != Token::DIV); 311 DCHECK(next_.token != Token::ASSIGN_DIV); 312 313 if (next_next_.token != Token::UNINITIALIZED) { 314 return next_next_.token; 315 } 316 TokenDesc prev = current_; 317 bool has_line_terminator_before_next = 318 has_line_terminator_before_next_ || has_multiline_comment_before_next_; 319 Next(); 320 has_line_terminator_after_next_ = 321 has_line_terminator_before_next_ || has_multiline_comment_before_next_; 322 has_line_terminator_before_next_ = has_line_terminator_before_next; 323 Token::Value ret = next_.token; 324 next_next_ = next_; 325 next_ = current_; 326 current_ = prev; 327 return ret; 328 } 329 330 331 // TODO(yangguo): check whether this is actually necessary. 332 static inline bool IsLittleEndianByteOrderMark(uc32 c) { 333 // The Unicode value U+FFFE is guaranteed never to be assigned as a 334 // Unicode character; this implies that in a Unicode context the 335 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 336 // character expressed in little-endian byte order (since it could 337 // not be a U+FFFE character expressed in big-endian byte 338 // order). Nevertheless, we check for it to be compatible with 339 // Spidermonkey. 340 return c == 0xFFFE; 341 } 342 343 bool Scanner::SkipWhiteSpace() { 344 int start_position = source_pos(); 345 346 while (true) { 347 while (true) { 348 // Don't skip behind the end of input. 349 if (c0_ == kEndOfInput) break; 350 351 // Advance as long as character is a WhiteSpace or LineTerminator. 352 // Remember if the latter is the case. 353 if (unicode_cache_->IsLineTerminator(c0_)) { 354 has_line_terminator_before_next_ = true; 355 } else if (!unicode_cache_->IsWhiteSpace(c0_) && 356 !IsLittleEndianByteOrderMark(c0_)) { 357 break; 358 } 359 Advance(); 360 } 361 362 // If there is an HTML comment end '-->' at the beginning of a 363 // line (with only whitespace in front of it), we treat the rest 364 // of the line as a comment. This is in line with the way 365 // SpiderMonkey handles it. 366 if (c0_ != '-' || !has_line_terminator_before_next_) break; 367 368 Advance(); 369 if (c0_ != '-') { 370 PushBack('-'); // undo Advance() 371 break; 372 } 373 374 Advance(); 375 if (c0_ != '>') { 376 PushBack2('-', '-'); // undo 2x Advance(); 377 break; 378 } 379 380 // Treat the rest of the line as a comment. 381 SkipSingleLineComment(); 382 } 383 384 // Return whether or not we skipped any characters. 385 return source_pos() != start_position; 386 } 387 388 Token::Value Scanner::SkipSingleLineComment() { 389 Advance(); 390 391 // The line terminator at the end of the line is not considered 392 // to be part of the single-line comment; it is recognized 393 // separately by the lexical grammar and becomes part of the 394 // stream of input elements for the syntactic grammar (see 395 // ECMA-262, section 7.4). 396 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 397 Advance(); 398 } 399 400 return Token::WHITESPACE; 401 } 402 403 404 Token::Value Scanner::SkipSourceURLComment() { 405 TryToParseSourceURLComment(); 406 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 407 Advance(); 408 } 409 410 return Token::WHITESPACE; 411 } 412 413 414 void Scanner::TryToParseSourceURLComment() { 415 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this 416 // function will just return if it cannot parse a magic comment. 417 if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return; 418 Advance(); 419 LiteralBuffer name; 420 while (c0_ != kEndOfInput && 421 !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') { 422 name.AddChar(c0_); 423 Advance(); 424 } 425 if (!name.is_one_byte()) return; 426 Vector<const uint8_t> name_literal = name.one_byte_literal(); 427 LiteralBuffer* value; 428 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) { 429 value = &source_url_; 430 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) { 431 value = &source_mapping_url_; 432 } else { 433 return; 434 } 435 if (c0_ != '=') 436 return; 437 Advance(); 438 value->Reset(); 439 while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) { 440 Advance(); 441 } 442 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 443 // Disallowed characters. 444 if (c0_ == '"' || c0_ == '\'') { 445 value->Reset(); 446 return; 447 } 448 if (unicode_cache_->IsWhiteSpace(c0_)) { 449 break; 450 } 451 value->AddChar(c0_); 452 Advance(); 453 } 454 // Allow whitespace at the end. 455 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 456 if (!unicode_cache_->IsWhiteSpace(c0_)) { 457 value->Reset(); 458 break; 459 } 460 Advance(); 461 } 462 } 463 464 465 Token::Value Scanner::SkipMultiLineComment() { 466 DCHECK(c0_ == '*'); 467 Advance(); 468 469 while (c0_ != kEndOfInput) { 470 uc32 ch = c0_; 471 Advance(); 472 if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(ch)) { 473 // Following ECMA-262, section 7.4, a comment containing 474 // a newline will make the comment count as a line-terminator. 475 has_multiline_comment_before_next_ = true; 476 } 477 // If we have reached the end of the multi-line comment, we 478 // consume the '/' and insert a whitespace. This way all 479 // multi-line comments are treated as whitespace. 480 if (ch == '*' && c0_ == '/') { 481 c0_ = ' '; 482 return Token::WHITESPACE; 483 } 484 } 485 486 // Unterminated multi-line comment. 487 return Token::ILLEGAL; 488 } 489 490 Token::Value Scanner::ScanHtmlComment() { 491 // Check for <!-- comments. 492 DCHECK(c0_ == '!'); 493 Advance(); 494 if (c0_ != '-') { 495 PushBack('!'); // undo Advance() 496 return Token::LT; 497 } 498 499 Advance(); 500 if (c0_ != '-') { 501 PushBack2('-', '!'); // undo 2x Advance() 502 return Token::LT; 503 } 504 505 found_html_comment_ = true; 506 return SkipSingleLineComment(); 507 } 508 509 void Scanner::Scan() { 510 next_.literal_chars = NULL; 511 next_.raw_literal_chars = NULL; 512 Token::Value token; 513 do { 514 // Remember the position of the next token 515 next_.location.beg_pos = source_pos(); 516 517 switch (c0_) { 518 case ' ': 519 case '\t': 520 Advance(); 521 token = Token::WHITESPACE; 522 break; 523 524 case '\n': 525 Advance(); 526 has_line_terminator_before_next_ = true; 527 token = Token::WHITESPACE; 528 break; 529 530 case '"': case '\'': 531 token = ScanString(); 532 break; 533 534 case '<': 535 // < <= << <<= <!-- 536 Advance(); 537 if (c0_ == '=') { 538 token = Select(Token::LTE); 539 } else if (c0_ == '<') { 540 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 541 } else if (c0_ == '!') { 542 token = ScanHtmlComment(); 543 } else { 544 token = Token::LT; 545 } 546 break; 547 548 case '>': 549 // > >= >> >>= >>> >>>= 550 Advance(); 551 if (c0_ == '=') { 552 token = Select(Token::GTE); 553 } else if (c0_ == '>') { 554 // >> >>= >>> >>>= 555 Advance(); 556 if (c0_ == '=') { 557 token = Select(Token::ASSIGN_SAR); 558 } else if (c0_ == '>') { 559 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 560 } else { 561 token = Token::SAR; 562 } 563 } else { 564 token = Token::GT; 565 } 566 break; 567 568 case '=': 569 // = == === => 570 Advance(); 571 if (c0_ == '=') { 572 token = Select('=', Token::EQ_STRICT, Token::EQ); 573 } else if (c0_ == '>') { 574 token = Select(Token::ARROW); 575 } else { 576 token = Token::ASSIGN; 577 } 578 break; 579 580 case '!': 581 // ! != !== 582 Advance(); 583 if (c0_ == '=') { 584 token = Select('=', Token::NE_STRICT, Token::NE); 585 } else { 586 token = Token::NOT; 587 } 588 break; 589 590 case '+': 591 // + ++ += 592 Advance(); 593 if (c0_ == '+') { 594 token = Select(Token::INC); 595 } else if (c0_ == '=') { 596 token = Select(Token::ASSIGN_ADD); 597 } else { 598 token = Token::ADD; 599 } 600 break; 601 602 case '-': 603 // - -- --> -= 604 Advance(); 605 if (c0_ == '-') { 606 Advance(); 607 if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) { 608 // For compatibility with SpiderMonkey, we skip lines that 609 // start with an HTML comment end '-->'. 610 token = SkipSingleLineComment(); 611 } else { 612 token = Token::DEC; 613 } 614 } else if (c0_ == '=') { 615 token = Select(Token::ASSIGN_SUB); 616 } else { 617 token = Token::SUB; 618 } 619 break; 620 621 case '*': 622 // * *= 623 Advance(); 624 if (c0_ == '*') { 625 token = Select('=', Token::ASSIGN_EXP, Token::EXP); 626 } else if (c0_ == '=') { 627 token = Select(Token::ASSIGN_MUL); 628 } else { 629 token = Token::MUL; 630 } 631 break; 632 633 case '%': 634 // % %= 635 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 636 break; 637 638 case '/': 639 // / // /* /= 640 Advance(); 641 if (c0_ == '/') { 642 Advance(); 643 if (c0_ == '#' || c0_ == '@') { 644 Advance(); 645 token = SkipSourceURLComment(); 646 } else { 647 PushBack(c0_); 648 token = SkipSingleLineComment(); 649 } 650 } else if (c0_ == '*') { 651 token = SkipMultiLineComment(); 652 } else if (c0_ == '=') { 653 token = Select(Token::ASSIGN_DIV); 654 } else { 655 token = Token::DIV; 656 } 657 break; 658 659 case '&': 660 // & && &= 661 Advance(); 662 if (c0_ == '&') { 663 token = Select(Token::AND); 664 } else if (c0_ == '=') { 665 token = Select(Token::ASSIGN_BIT_AND); 666 } else { 667 token = Token::BIT_AND; 668 } 669 break; 670 671 case '|': 672 // | || |= 673 Advance(); 674 if (c0_ == '|') { 675 token = Select(Token::OR); 676 } else if (c0_ == '=') { 677 token = Select(Token::ASSIGN_BIT_OR); 678 } else { 679 token = Token::BIT_OR; 680 } 681 break; 682 683 case '^': 684 // ^ ^= 685 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 686 break; 687 688 case '.': 689 // . Number 690 Advance(); 691 if (IsDecimalDigit(c0_)) { 692 token = ScanNumber(true); 693 } else { 694 token = Token::PERIOD; 695 if (c0_ == '.') { 696 Advance(); 697 if (c0_ == '.') { 698 Advance(); 699 token = Token::ELLIPSIS; 700 } else { 701 PushBack('.'); 702 } 703 } 704 } 705 break; 706 707 case ':': 708 token = Select(Token::COLON); 709 break; 710 711 case ';': 712 token = Select(Token::SEMICOLON); 713 break; 714 715 case ',': 716 token = Select(Token::COMMA); 717 break; 718 719 case '(': 720 token = Select(Token::LPAREN); 721 break; 722 723 case ')': 724 token = Select(Token::RPAREN); 725 break; 726 727 case '[': 728 token = Select(Token::LBRACK); 729 break; 730 731 case ']': 732 token = Select(Token::RBRACK); 733 break; 734 735 case '{': 736 token = Select(Token::LBRACE); 737 break; 738 739 case '}': 740 token = Select(Token::RBRACE); 741 break; 742 743 case '?': 744 token = Select(Token::CONDITIONAL); 745 break; 746 747 case '~': 748 token = Select(Token::BIT_NOT); 749 break; 750 751 case '`': 752 token = ScanTemplateStart(); 753 break; 754 755 default: 756 if (c0_ == kEndOfInput) { 757 token = Token::EOS; 758 } else if (unicode_cache_->IsIdentifierStart(c0_)) { 759 token = ScanIdentifierOrKeyword(); 760 } else if (IsDecimalDigit(c0_)) { 761 token = ScanNumber(false); 762 } else if (SkipWhiteSpace()) { 763 token = Token::WHITESPACE; 764 } else { 765 token = Select(Token::ILLEGAL); 766 } 767 break; 768 } 769 770 // Continue scanning for tokens as long as we're just skipping 771 // whitespace. 772 } while (token == Token::WHITESPACE); 773 774 next_.location.end_pos = source_pos(); 775 next_.token = token; 776 777 #ifdef DEBUG 778 SanityCheckTokenDesc(current_); 779 SanityCheckTokenDesc(next_); 780 SanityCheckTokenDesc(next_next_); 781 #endif 782 } 783 784 #ifdef DEBUG 785 void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const { 786 // Most tokens should not have literal_chars or even raw_literal chars. 787 // The rules are: 788 // - UNINITIALIZED: we don't care. 789 // - TEMPLATE_*: need both literal + raw literal chars. 790 // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal. 791 // - all others: should have neither. 792 793 switch (token.token) { 794 case Token::UNINITIALIZED: 795 // token.literal_chars & other members might be garbage. That's ok. 796 break; 797 case Token::TEMPLATE_SPAN: 798 case Token::TEMPLATE_TAIL: 799 DCHECK_NOT_NULL(token.raw_literal_chars); 800 DCHECK_NOT_NULL(token.literal_chars); 801 break; 802 case Token::ESCAPED_KEYWORD: 803 case Token::ESCAPED_STRICT_RESERVED_WORD: 804 case Token::FUTURE_STRICT_RESERVED_WORD: 805 case Token::IDENTIFIER: 806 case Token::NUMBER: 807 case Token::REGEXP_LITERAL: 808 case Token::SMI: 809 case Token::STRING: 810 DCHECK_NOT_NULL(token.literal_chars); 811 DCHECK_NULL(token.raw_literal_chars); 812 break; 813 default: 814 DCHECK_NULL(token.literal_chars); 815 DCHECK_NULL(token.raw_literal_chars); 816 break; 817 } 818 } 819 #endif // DEBUG 820 821 void Scanner::SeekForward(int pos) { 822 // After this call, we will have the token at the given position as 823 // the "next" token. The "current" token will be invalid. 824 if (pos == next_.location.beg_pos) return; 825 int current_pos = source_pos(); 826 DCHECK_EQ(next_.location.end_pos, current_pos); 827 // Positions inside the lookahead token aren't supported. 828 DCHECK(pos >= current_pos); 829 if (pos != current_pos) { 830 source_->Seek(pos); 831 Advance(); 832 // This function is only called to seek to the location 833 // of the end of a function (at the "}" token). It doesn't matter 834 // whether there was a line terminator in the part we skip. 835 has_line_terminator_before_next_ = false; 836 has_multiline_comment_before_next_ = false; 837 } 838 Scan(); 839 } 840 841 842 template <bool capture_raw, bool in_template_literal> 843 bool Scanner::ScanEscape() { 844 uc32 c = c0_; 845 Advance<capture_raw>(); 846 847 // Skip escaped newlines. 848 if (!in_template_literal && c0_ != kEndOfInput && 849 unicode_cache_->IsLineTerminator(c)) { 850 // Allow CR+LF newlines in multiline string literals. 851 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>(); 852 // Allow LF+CR newlines in multiline string literals. 853 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>(); 854 return true; 855 } 856 857 switch (c) { 858 case '\'': // fall through 859 case '"' : // fall through 860 case '\\': break; 861 case 'b' : c = '\b'; break; 862 case 'f' : c = '\f'; break; 863 case 'n' : c = '\n'; break; 864 case 'r' : c = '\r'; break; 865 case 't' : c = '\t'; break; 866 case 'u' : { 867 c = ScanUnicodeEscape<capture_raw>(); 868 if (c < 0) return false; 869 break; 870 } 871 case 'v': 872 c = '\v'; 873 break; 874 case 'x': { 875 c = ScanHexNumber<capture_raw>(2); 876 if (c < 0) return false; 877 break; 878 } 879 case '0': // Fall through. 880 case '1': // fall through 881 case '2': // fall through 882 case '3': // fall through 883 case '4': // fall through 884 case '5': // fall through 885 case '6': // fall through 886 case '7': 887 c = ScanOctalEscape<capture_raw>(c, 2); 888 break; 889 } 890 891 // According to ECMA-262, section 7.8.4, characters not covered by the 892 // above cases should be illegal, but they are commonly handled as 893 // non-escaped characters by JS VMs. 894 AddLiteralChar(c); 895 return true; 896 } 897 898 899 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 900 // ECMA-262. Other JS VMs support them. 901 template <bool capture_raw> 902 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 903 uc32 x = c - '0'; 904 int i = 0; 905 for (; i < length; i++) { 906 int d = c0_ - '0'; 907 if (d < 0 || d > 7) break; 908 int nx = x * 8 + d; 909 if (nx >= 256) break; 910 x = nx; 911 Advance<capture_raw>(); 912 } 913 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 914 // Remember the position of octal escape sequences so that an error 915 // can be reported later (in strict mode). 916 // We don't report the error immediately, because the octal escape can 917 // occur before the "use strict" directive. 918 if (c != '0' || i > 0) { 919 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 920 } 921 return x; 922 } 923 924 925 Token::Value Scanner::ScanString() { 926 uc32 quote = c0_; 927 Advance<false, false>(); // consume quote 928 929 LiteralScope literal(this); 930 while (true) { 931 if (c0_ > kMaxAscii) { 932 HandleLeadSurrogate(); 933 break; 934 } 935 if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL; 936 if (c0_ == quote) { 937 literal.Complete(); 938 Advance<false, false>(); 939 return Token::STRING; 940 } 941 char c = static_cast<char>(c0_); 942 if (c == '\\') break; 943 Advance<false, false>(); 944 AddLiteralChar(c); 945 } 946 947 while (c0_ != quote && c0_ != kEndOfInput && 948 !unicode_cache_->IsLineTerminator(c0_)) { 949 uc32 c = c0_; 950 Advance(); 951 if (c == '\\') { 952 if (c0_ == kEndOfInput || !ScanEscape<false, false>()) { 953 return Token::ILLEGAL; 954 } 955 } else { 956 AddLiteralChar(c); 957 } 958 } 959 if (c0_ != quote) return Token::ILLEGAL; 960 literal.Complete(); 961 962 Advance(); // consume quote 963 return Token::STRING; 964 } 965 966 967 Token::Value Scanner::ScanTemplateSpan() { 968 // When scanning a TemplateSpan, we are looking for the following construct: 969 // TEMPLATE_SPAN :: 970 // ` LiteralChars* ${ 971 // | } LiteralChars* ${ 972 // 973 // TEMPLATE_TAIL :: 974 // ` LiteralChars* ` 975 // | } LiteralChar* ` 976 // 977 // A TEMPLATE_SPAN should always be followed by an Expression, while a 978 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be 979 // followed by an Expression. 980 981 Token::Value result = Token::TEMPLATE_SPAN; 982 LiteralScope literal(this); 983 StartRawLiteral(); 984 const bool capture_raw = true; 985 const bool in_template_literal = true; 986 while (true) { 987 uc32 c = c0_; 988 Advance<capture_raw>(); 989 if (c == '`') { 990 result = Token::TEMPLATE_TAIL; 991 ReduceRawLiteralLength(1); 992 break; 993 } else if (c == '$' && c0_ == '{') { 994 Advance<capture_raw>(); // Consume '{' 995 ReduceRawLiteralLength(2); 996 break; 997 } else if (c == '\\') { 998 if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) { 999 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty 1000 // code unit sequence. 1001 uc32 lastChar = c0_; 1002 Advance<capture_raw>(); 1003 if (lastChar == '\r') { 1004 ReduceRawLiteralLength(1); // Remove \r 1005 if (c0_ == '\n') { 1006 Advance<capture_raw>(); // Adds \n 1007 } else { 1008 AddRawLiteralChar('\n'); 1009 } 1010 } 1011 } else if (!ScanEscape<capture_raw, in_template_literal>()) { 1012 return Token::ILLEGAL; 1013 } 1014 } else if (c < 0) { 1015 // Unterminated template literal 1016 PushBack(c); 1017 break; 1018 } else { 1019 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A. 1020 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence 1021 // consisting of the CV 0x000A. 1022 if (c == '\r') { 1023 ReduceRawLiteralLength(1); // Remove \r 1024 if (c0_ == '\n') { 1025 Advance<capture_raw>(); // Adds \n 1026 } else { 1027 AddRawLiteralChar('\n'); 1028 } 1029 c = '\n'; 1030 } 1031 AddLiteralChar(c); 1032 } 1033 } 1034 literal.Complete(); 1035 next_.location.end_pos = source_pos(); 1036 next_.token = result; 1037 return result; 1038 } 1039 1040 1041 Token::Value Scanner::ScanTemplateStart() { 1042 DCHECK(next_next_.token == Token::UNINITIALIZED); 1043 DCHECK(c0_ == '`'); 1044 next_.location.beg_pos = source_pos(); 1045 Advance(); // Consume ` 1046 return ScanTemplateSpan(); 1047 } 1048 1049 1050 Token::Value Scanner::ScanTemplateContinuation() { 1051 DCHECK_EQ(next_.token, Token::RBRACE); 1052 next_.location.beg_pos = source_pos() - 1; // We already consumed } 1053 return ScanTemplateSpan(); 1054 } 1055 1056 1057 void Scanner::ScanDecimalDigits() { 1058 while (IsDecimalDigit(c0_)) 1059 AddLiteralCharAdvance(); 1060 } 1061 1062 1063 Token::Value Scanner::ScanNumber(bool seen_period) { 1064 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 1065 1066 enum { 1067 DECIMAL, 1068 DECIMAL_WITH_LEADING_ZERO, 1069 HEX, 1070 OCTAL, 1071 IMPLICIT_OCTAL, 1072 BINARY 1073 } kind = DECIMAL; 1074 1075 LiteralScope literal(this); 1076 bool at_start = !seen_period; 1077 int start_pos = source_pos(); // For reporting octal positions. 1078 if (seen_period) { 1079 // we have already seen a decimal point of the float 1080 AddLiteralChar('.'); 1081 ScanDecimalDigits(); // we know we have at least one digit 1082 1083 } else { 1084 // if the first character is '0' we must check for octals and hex 1085 if (c0_ == '0') { 1086 AddLiteralCharAdvance(); 1087 1088 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or 1089 // an octal number. 1090 if (c0_ == 'x' || c0_ == 'X') { 1091 // hex number 1092 kind = HEX; 1093 AddLiteralCharAdvance(); 1094 if (!IsHexDigit(c0_)) { 1095 // we must have at least one hex digit after 'x'/'X' 1096 return Token::ILLEGAL; 1097 } 1098 while (IsHexDigit(c0_)) { 1099 AddLiteralCharAdvance(); 1100 } 1101 } else if (c0_ == 'o' || c0_ == 'O') { 1102 kind = OCTAL; 1103 AddLiteralCharAdvance(); 1104 if (!IsOctalDigit(c0_)) { 1105 // we must have at least one octal digit after 'o'/'O' 1106 return Token::ILLEGAL; 1107 } 1108 while (IsOctalDigit(c0_)) { 1109 AddLiteralCharAdvance(); 1110 } 1111 } else if (c0_ == 'b' || c0_ == 'B') { 1112 kind = BINARY; 1113 AddLiteralCharAdvance(); 1114 if (!IsBinaryDigit(c0_)) { 1115 // we must have at least one binary digit after 'b'/'B' 1116 return Token::ILLEGAL; 1117 } 1118 while (IsBinaryDigit(c0_)) { 1119 AddLiteralCharAdvance(); 1120 } 1121 } else if ('0' <= c0_ && c0_ <= '7') { 1122 // (possible) octal number 1123 kind = IMPLICIT_OCTAL; 1124 while (true) { 1125 if (c0_ == '8' || c0_ == '9') { 1126 at_start = false; 1127 kind = DECIMAL_WITH_LEADING_ZERO; 1128 break; 1129 } 1130 if (c0_ < '0' || '7' < c0_) { 1131 // Octal literal finished. 1132 octal_pos_ = Location(start_pos, source_pos()); 1133 break; 1134 } 1135 AddLiteralCharAdvance(); 1136 } 1137 } else if (c0_ == '8' || c0_ == '9') { 1138 kind = DECIMAL_WITH_LEADING_ZERO; 1139 } 1140 } 1141 1142 // Parse decimal digits and allow trailing fractional part. 1143 if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) { 1144 if (at_start) { 1145 uint64_t value = 0; 1146 while (IsDecimalDigit(c0_)) { 1147 value = 10 * value + (c0_ - '0'); 1148 1149 uc32 first_char = c0_; 1150 Advance<false, false>(); 1151 AddLiteralChar(first_char); 1152 } 1153 1154 if (next_.literal_chars->one_byte_literal().length() <= 10 && 1155 value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') { 1156 next_.smi_value_ = static_cast<uint32_t>(value); 1157 literal.Complete(); 1158 HandleLeadSurrogate(); 1159 1160 if (kind == DECIMAL_WITH_LEADING_ZERO) 1161 decimal_with_leading_zero_pos_ = Location(start_pos, source_pos()); 1162 return Token::SMI; 1163 } 1164 HandleLeadSurrogate(); 1165 } 1166 1167 ScanDecimalDigits(); // optional 1168 if (c0_ == '.') { 1169 AddLiteralCharAdvance(); 1170 ScanDecimalDigits(); // optional 1171 } 1172 } 1173 } 1174 1175 // scan exponent, if any 1176 if (c0_ == 'e' || c0_ == 'E') { 1177 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 1178 if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO)) 1179 return Token::ILLEGAL; 1180 // scan exponent 1181 AddLiteralCharAdvance(); 1182 if (c0_ == '+' || c0_ == '-') 1183 AddLiteralCharAdvance(); 1184 if (!IsDecimalDigit(c0_)) { 1185 // we must have at least one decimal digit after 'e'/'E' 1186 return Token::ILLEGAL; 1187 } 1188 ScanDecimalDigits(); 1189 } 1190 1191 // The source character immediately following a numeric literal must 1192 // not be an identifier start or a decimal digit; see ECMA-262 1193 // section 7.8.3, page 17 (note that we read only one decimal digit 1194 // if the value is 0). 1195 if (IsDecimalDigit(c0_) || 1196 (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_))) 1197 return Token::ILLEGAL; 1198 1199 literal.Complete(); 1200 1201 if (kind == DECIMAL_WITH_LEADING_ZERO) 1202 decimal_with_leading_zero_pos_ = Location(start_pos, source_pos()); 1203 return Token::NUMBER; 1204 } 1205 1206 1207 uc32 Scanner::ScanIdentifierUnicodeEscape() { 1208 Advance(); 1209 if (c0_ != 'u') return -1; 1210 Advance(); 1211 return ScanUnicodeEscape<false>(); 1212 } 1213 1214 1215 template <bool capture_raw> 1216 uc32 Scanner::ScanUnicodeEscape() { 1217 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of 1218 // hex digits between { } is arbitrary. \ and u have already been read. 1219 if (c0_ == '{') { 1220 int begin = source_pos() - 2; 1221 Advance<capture_raw>(); 1222 uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin); 1223 if (cp < 0 || c0_ != '}') { 1224 ReportScannerError(source_pos(), 1225 MessageTemplate::kInvalidUnicodeEscapeSequence); 1226 return -1; 1227 } 1228 Advance<capture_raw>(); 1229 return cp; 1230 } 1231 const bool unicode = true; 1232 return ScanHexNumber<capture_raw, unicode>(4); 1233 } 1234 1235 1236 // ---------------------------------------------------------------------------- 1237 // Keyword Matcher 1238 1239 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 1240 KEYWORD_GROUP('a') \ 1241 KEYWORD("async", Token::ASYNC) \ 1242 KEYWORD("await", Token::AWAIT) \ 1243 KEYWORD_GROUP('b') \ 1244 KEYWORD("break", Token::BREAK) \ 1245 KEYWORD_GROUP('c') \ 1246 KEYWORD("case", Token::CASE) \ 1247 KEYWORD("catch", Token::CATCH) \ 1248 KEYWORD("class", Token::CLASS) \ 1249 KEYWORD("const", Token::CONST) \ 1250 KEYWORD("continue", Token::CONTINUE) \ 1251 KEYWORD_GROUP('d') \ 1252 KEYWORD("debugger", Token::DEBUGGER) \ 1253 KEYWORD("default", Token::DEFAULT) \ 1254 KEYWORD("delete", Token::DELETE) \ 1255 KEYWORD("do", Token::DO) \ 1256 KEYWORD_GROUP('e') \ 1257 KEYWORD("else", Token::ELSE) \ 1258 KEYWORD("enum", Token::ENUM) \ 1259 KEYWORD("export", Token::EXPORT) \ 1260 KEYWORD("extends", Token::EXTENDS) \ 1261 KEYWORD_GROUP('f') \ 1262 KEYWORD("false", Token::FALSE_LITERAL) \ 1263 KEYWORD("finally", Token::FINALLY) \ 1264 KEYWORD("for", Token::FOR) \ 1265 KEYWORD("function", Token::FUNCTION) \ 1266 KEYWORD_GROUP('i') \ 1267 KEYWORD("if", Token::IF) \ 1268 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 1269 KEYWORD("import", Token::IMPORT) \ 1270 KEYWORD("in", Token::IN) \ 1271 KEYWORD("instanceof", Token::INSTANCEOF) \ 1272 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 1273 KEYWORD_GROUP('l') \ 1274 KEYWORD("let", Token::LET) \ 1275 KEYWORD_GROUP('n') \ 1276 KEYWORD("new", Token::NEW) \ 1277 KEYWORD("null", Token::NULL_LITERAL) \ 1278 KEYWORD_GROUP('p') \ 1279 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 1280 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 1281 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 1282 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 1283 KEYWORD_GROUP('r') \ 1284 KEYWORD("return", Token::RETURN) \ 1285 KEYWORD_GROUP('s') \ 1286 KEYWORD("static", Token::STATIC) \ 1287 KEYWORD("super", Token::SUPER) \ 1288 KEYWORD("switch", Token::SWITCH) \ 1289 KEYWORD_GROUP('t') \ 1290 KEYWORD("this", Token::THIS) \ 1291 KEYWORD("throw", Token::THROW) \ 1292 KEYWORD("true", Token::TRUE_LITERAL) \ 1293 KEYWORD("try", Token::TRY) \ 1294 KEYWORD("typeof", Token::TYPEOF) \ 1295 KEYWORD_GROUP('v') \ 1296 KEYWORD("var", Token::VAR) \ 1297 KEYWORD("void", Token::VOID) \ 1298 KEYWORD_GROUP('w') \ 1299 KEYWORD("while", Token::WHILE) \ 1300 KEYWORD("with", Token::WITH) \ 1301 KEYWORD_GROUP('y') \ 1302 KEYWORD("yield", Token::YIELD) 1303 1304 static Token::Value KeywordOrIdentifierToken(const uint8_t* input, 1305 int input_length) { 1306 DCHECK(input_length >= 1); 1307 const int kMinLength = 2; 1308 const int kMaxLength = 10; 1309 if (input_length < kMinLength || input_length > kMaxLength) { 1310 return Token::IDENTIFIER; 1311 } 1312 switch (input[0]) { 1313 default: 1314 #define KEYWORD_GROUP_CASE(ch) \ 1315 break; \ 1316 case ch: 1317 #define KEYWORD(keyword, token) \ 1318 { \ 1319 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 1320 /* strlen(keyword) plus 1 for the NUL char. */ \ 1321 const int keyword_length = sizeof(keyword) - 1; \ 1322 STATIC_ASSERT(keyword_length >= kMinLength); \ 1323 STATIC_ASSERT(keyword_length <= kMaxLength); \ 1324 if (input_length == keyword_length && input[1] == keyword[1] && \ 1325 (keyword_length <= 2 || input[2] == keyword[2]) && \ 1326 (keyword_length <= 3 || input[3] == keyword[3]) && \ 1327 (keyword_length <= 4 || input[4] == keyword[4]) && \ 1328 (keyword_length <= 5 || input[5] == keyword[5]) && \ 1329 (keyword_length <= 6 || input[6] == keyword[6]) && \ 1330 (keyword_length <= 7 || input[7] == keyword[7]) && \ 1331 (keyword_length <= 8 || input[8] == keyword[8]) && \ 1332 (keyword_length <= 9 || input[9] == keyword[9])) { \ 1333 return token; \ 1334 } \ 1335 } 1336 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 1337 } 1338 return Token::IDENTIFIER; 1339 } 1340 1341 1342 bool Scanner::IdentifierIsFutureStrictReserved( 1343 const AstRawString* string) const { 1344 // Keywords are always 1-byte strings. 1345 if (!string->is_one_byte()) return false; 1346 if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") || 1347 string->IsOneByteEqualTo("yield")) { 1348 return true; 1349 } 1350 return Token::FUTURE_STRICT_RESERVED_WORD == 1351 KeywordOrIdentifierToken(string->raw_data(), string->length()); 1352 } 1353 1354 1355 Token::Value Scanner::ScanIdentifierOrKeyword() { 1356 DCHECK(unicode_cache_->IsIdentifierStart(c0_)); 1357 LiteralScope literal(this); 1358 if (IsInRange(c0_, 'a', 'z')) { 1359 do { 1360 char first_char = static_cast<char>(c0_); 1361 Advance<false, false>(); 1362 AddLiteralChar(first_char); 1363 } while (IsInRange(c0_, 'a', 'z')); 1364 1365 if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' || 1366 c0_ == '$') { 1367 // Identifier starting with lowercase. 1368 char first_char = static_cast<char>(c0_); 1369 Advance<false, false>(); 1370 AddLiteralChar(first_char); 1371 while (IsAsciiIdentifier(c0_)) { 1372 char first_char = static_cast<char>(c0_); 1373 Advance<false, false>(); 1374 AddLiteralChar(first_char); 1375 } 1376 if (c0_ <= kMaxAscii && c0_ != '\\') { 1377 literal.Complete(); 1378 return Token::IDENTIFIER; 1379 } 1380 } else if (c0_ <= kMaxAscii && c0_ != '\\') { 1381 // Only a-z+: could be a keyword or identifier. 1382 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1383 Token::Value token = 1384 KeywordOrIdentifierToken(chars.start(), chars.length()); 1385 if (token == Token::IDENTIFIER || 1386 token == Token::FUTURE_STRICT_RESERVED_WORD) 1387 literal.Complete(); 1388 return token; 1389 } 1390 1391 HandleLeadSurrogate(); 1392 } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') { 1393 do { 1394 char first_char = static_cast<char>(c0_); 1395 Advance<false, false>(); 1396 AddLiteralChar(first_char); 1397 } while (IsAsciiIdentifier(c0_)); 1398 1399 if (c0_ <= kMaxAscii && c0_ != '\\') { 1400 literal.Complete(); 1401 return Token::IDENTIFIER; 1402 } 1403 1404 HandleLeadSurrogate(); 1405 } else if (c0_ == '\\') { 1406 // Scan identifier start character. 1407 uc32 c = ScanIdentifierUnicodeEscape(); 1408 // Only allow legal identifier start characters. 1409 if (c < 0 || 1410 c == '\\' || // No recursive escapes. 1411 !unicode_cache_->IsIdentifierStart(c)) { 1412 return Token::ILLEGAL; 1413 } 1414 AddLiteralChar(c); 1415 return ScanIdentifierSuffix(&literal, true); 1416 } else { 1417 uc32 first_char = c0_; 1418 Advance(); 1419 AddLiteralChar(first_char); 1420 } 1421 1422 // Scan the rest of the identifier characters. 1423 while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) { 1424 if (c0_ != '\\') { 1425 uc32 next_char = c0_; 1426 Advance(); 1427 AddLiteralChar(next_char); 1428 continue; 1429 } 1430 // Fallthrough if no longer able to complete keyword. 1431 return ScanIdentifierSuffix(&literal, false); 1432 } 1433 1434 if (next_.literal_chars->is_one_byte()) { 1435 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1436 Token::Value token = 1437 KeywordOrIdentifierToken(chars.start(), chars.length()); 1438 if (token == Token::IDENTIFIER) literal.Complete(); 1439 return token; 1440 } 1441 literal.Complete(); 1442 return Token::IDENTIFIER; 1443 } 1444 1445 1446 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal, 1447 bool escaped) { 1448 // Scan the rest of the identifier characters. 1449 while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) { 1450 if (c0_ == '\\') { 1451 uc32 c = ScanIdentifierUnicodeEscape(); 1452 escaped = true; 1453 // Only allow legal identifier part characters. 1454 if (c < 0 || 1455 c == '\\' || 1456 !unicode_cache_->IsIdentifierPart(c)) { 1457 return Token::ILLEGAL; 1458 } 1459 AddLiteralChar(c); 1460 } else { 1461 AddLiteralChar(c0_); 1462 Advance(); 1463 } 1464 } 1465 literal->Complete(); 1466 1467 if (escaped && next_.literal_chars->is_one_byte()) { 1468 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1469 Token::Value token = 1470 KeywordOrIdentifierToken(chars.start(), chars.length()); 1471 /* TODO(adamk): YIELD should be handled specially. */ 1472 if (token == Token::IDENTIFIER) { 1473 return Token::IDENTIFIER; 1474 } else if (token == Token::FUTURE_STRICT_RESERVED_WORD || 1475 token == Token::LET || token == Token::STATIC) { 1476 return Token::ESCAPED_STRICT_RESERVED_WORD; 1477 } else { 1478 return Token::ESCAPED_KEYWORD; 1479 } 1480 } 1481 return Token::IDENTIFIER; 1482 } 1483 1484 bool Scanner::ScanRegExpPattern() { 1485 DCHECK(next_next_.token == Token::UNINITIALIZED); 1486 DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV); 1487 1488 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1489 bool in_character_class = false; 1490 bool seen_equal = (next_.token == Token::ASSIGN_DIV); 1491 1492 // Previous token is either '/' or '/=', in the second case, the 1493 // pattern starts at =. 1494 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1495 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1496 1497 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1498 // the scanner should pass uninterpreted bodies to the RegExp 1499 // constructor. 1500 LiteralScope literal(this); 1501 if (seen_equal) { 1502 AddLiteralChar('='); 1503 } 1504 1505 while (c0_ != '/' || in_character_class) { 1506 if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_)) 1507 return false; 1508 if (c0_ == '\\') { // Escape sequence. 1509 AddLiteralCharAdvance(); 1510 if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_)) 1511 return false; 1512 AddLiteralCharAdvance(); 1513 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1514 // only "safe" characters are allowed (letters, digits, underscore), 1515 // otherwise the escape isn't valid and the invalid character has 1516 // its normal meaning. I.e., we can just continue scanning without 1517 // worrying whether the following characters are part of the escape 1518 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1519 // of the escape sequence. 1520 1521 // TODO(896): At some point, parse RegExps more throughly to capture 1522 // octal esacpes in strict mode. 1523 } else { // Unescaped character. 1524 if (c0_ == '[') in_character_class = true; 1525 if (c0_ == ']') in_character_class = false; 1526 AddLiteralCharAdvance(); 1527 } 1528 } 1529 Advance(); // consume '/' 1530 1531 literal.Complete(); 1532 next_.token = Token::REGEXP_LITERAL; 1533 return true; 1534 } 1535 1536 1537 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() { 1538 DCHECK(next_.token == Token::REGEXP_LITERAL); 1539 1540 // Scan regular expression flags. 1541 int flags = 0; 1542 while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) { 1543 RegExp::Flags flag = RegExp::kNone; 1544 switch (c0_) { 1545 case 'g': 1546 flag = RegExp::kGlobal; 1547 break; 1548 case 'i': 1549 flag = RegExp::kIgnoreCase; 1550 break; 1551 case 'm': 1552 flag = RegExp::kMultiline; 1553 break; 1554 case 'u': 1555 flag = RegExp::kUnicode; 1556 break; 1557 case 'y': 1558 flag = RegExp::kSticky; 1559 break; 1560 default: 1561 return Nothing<RegExp::Flags>(); 1562 } 1563 if (flags & flag) { 1564 return Nothing<RegExp::Flags>(); 1565 } 1566 Advance(); 1567 flags |= flag; 1568 } 1569 1570 next_.location.end_pos = source_pos(); 1571 return Just(RegExp::Flags(flags)); 1572 } 1573 1574 1575 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) { 1576 if (is_literal_one_byte()) { 1577 return ast_value_factory->GetOneByteString(literal_one_byte_string()); 1578 } 1579 return ast_value_factory->GetTwoByteString(literal_two_byte_string()); 1580 } 1581 1582 1583 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) { 1584 if (is_next_literal_one_byte()) { 1585 return ast_value_factory->GetOneByteString(next_literal_one_byte_string()); 1586 } 1587 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string()); 1588 } 1589 1590 1591 const AstRawString* Scanner::CurrentRawSymbol( 1592 AstValueFactory* ast_value_factory) { 1593 if (is_raw_literal_one_byte()) { 1594 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string()); 1595 } 1596 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string()); 1597 } 1598 1599 1600 double Scanner::DoubleValue() { 1601 DCHECK(is_literal_one_byte()); 1602 return StringToDouble( 1603 unicode_cache_, 1604 literal_one_byte_string(), 1605 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY); 1606 } 1607 1608 1609 bool Scanner::ContainsDot() { 1610 DCHECK(is_literal_one_byte()); 1611 Vector<const uint8_t> str = literal_one_byte_string(); 1612 return std::find(str.begin(), str.end(), '.') != str.end(); 1613 } 1614 1615 1616 int Scanner::FindSymbol(DuplicateFinder* finder, int value) { 1617 // TODO(vogelheim): Move this logic into the calling class; this can be fully 1618 // implemented using the public interface. 1619 if (is_literal_one_byte()) { 1620 return finder->AddOneByteSymbol(literal_one_byte_string(), value); 1621 } 1622 return finder->AddTwoByteSymbol(literal_two_byte_string(), value); 1623 } 1624 1625 void Scanner::SeekNext(size_t position) { 1626 // Use with care: This cleanly resets most, but not all scanner state. 1627 // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions. 1628 1629 // To re-scan from a given character position, we need to: 1630 // 1, Reset the current_, next_ and next_next_ tokens 1631 // (next_ + next_next_ will be overwrittem by Next(), 1632 // current_ will remain unchanged, so overwrite it fully.) 1633 current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED}; 1634 next_.token = Token::UNINITIALIZED; 1635 next_next_.token = Token::UNINITIALIZED; 1636 // 2, reset the source to the desired position, 1637 source_->Seek(position); 1638 // 3, re-scan, by scanning the look-ahead char + 1 token (next_). 1639 c0_ = source_->Advance(); 1640 Next(); 1641 DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position)); 1642 } 1643 1644 } // namespace internal 1645 } // namespace v8 1646