1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #include "src/parsing/scanner.h" 8 9 #include <stdint.h> 10 11 #include <cmath> 12 13 #include "src/ast/ast-value-factory.h" 14 #include "src/char-predicates-inl.h" 15 #include "src/conversions-inl.h" 16 #include "src/list-inl.h" 17 #include "src/parsing/duplicate-finder.h" // For Scanner::FindSymbol 18 19 namespace v8 { 20 namespace internal { 21 22 // Scoped helper for saving & restoring scanner error state. 23 // This is used for tagged template literals, in which normally forbidden 24 // escape sequences are allowed. 25 class ErrorState { 26 public: 27 ErrorState(MessageTemplate::Template* message_stack, 28 Scanner::Location* location_stack) 29 : message_stack_(message_stack), 30 old_message_(*message_stack), 31 location_stack_(location_stack), 32 old_location_(*location_stack) { 33 *message_stack_ = MessageTemplate::kNone; 34 *location_stack_ = Scanner::Location::invalid(); 35 } 36 37 ~ErrorState() { 38 *message_stack_ = old_message_; 39 *location_stack_ = old_location_; 40 } 41 42 void MoveErrorTo(MessageTemplate::Template* message_dest, 43 Scanner::Location* location_dest) { 44 if (*message_stack_ == MessageTemplate::kNone) { 45 return; 46 } 47 if (*message_dest == MessageTemplate::kNone) { 48 *message_dest = *message_stack_; 49 *location_dest = *location_stack_; 50 } 51 *message_stack_ = MessageTemplate::kNone; 52 *location_stack_ = Scanner::Location::invalid(); 53 } 54 55 private: 56 MessageTemplate::Template* const message_stack_; 57 MessageTemplate::Template const old_message_; 58 Scanner::Location* const location_stack_; 59 Scanner::Location const old_location_; 60 }; 61 62 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const { 63 if (is_one_byte()) { 64 return 
isolate->factory()->InternalizeOneByteString(one_byte_literal()); 65 } 66 return isolate->factory()->InternalizeTwoByteString(two_byte_literal()); 67 } 68 69 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) { 70 int capacity = Max(min_capacity, backing_store_.length()); 71 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); 72 return new_capacity; 73 } 74 75 void Scanner::LiteralBuffer::ExpandBuffer() { 76 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); 77 MemCopy(new_store.start(), backing_store_.start(), position_); 78 backing_store_.Dispose(); 79 backing_store_ = new_store; 80 } 81 82 void Scanner::LiteralBuffer::ConvertToTwoByte() { 83 DCHECK(is_one_byte_); 84 Vector<byte> new_store; 85 int new_content_size = position_ * kUC16Size; 86 if (new_content_size >= backing_store_.length()) { 87 // Ensure room for all currently read code units as UC16 as well 88 // as the code unit about to be stored. 89 new_store = Vector<byte>::New(NewCapacity(new_content_size)); 90 } else { 91 new_store = backing_store_; 92 } 93 uint8_t* src = backing_store_.start(); 94 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start()); 95 for (int i = position_ - 1; i >= 0; i--) { 96 dst[i] = src[i]; 97 } 98 if (new_store.start() != backing_store_.start()) { 99 backing_store_.Dispose(); 100 backing_store_ = new_store; 101 } 102 position_ = new_content_size; 103 is_one_byte_ = false; 104 } 105 106 void Scanner::LiteralBuffer::AddCharSlow(uc32 code_unit) { 107 if (position_ >= backing_store_.length()) ExpandBuffer(); 108 if (is_one_byte_) { 109 if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) { 110 backing_store_[position_] = static_cast<byte>(code_unit); 111 position_ += kOneByteSize; 112 return; 113 } 114 ConvertToTwoByte(); 115 } 116 if (code_unit <= 117 static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 118 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit; 119 position_ += 
kUC16Size; 120 } else { 121 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = 122 unibrow::Utf16::LeadSurrogate(code_unit); 123 position_ += kUC16Size; 124 if (position_ >= backing_store_.length()) ExpandBuffer(); 125 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = 126 unibrow::Utf16::TrailSurrogate(code_unit); 127 position_ += kUC16Size; 128 } 129 } 130 131 // ---------------------------------------------------------------------------- 132 // Scanner::BookmarkScope 133 134 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos = 135 std::numeric_limits<size_t>::max() - 2; 136 const size_t Scanner::BookmarkScope::kNoBookmark = 137 std::numeric_limits<size_t>::max() - 1; 138 const size_t Scanner::BookmarkScope::kBookmarkWasApplied = 139 std::numeric_limits<size_t>::max(); 140 141 void Scanner::BookmarkScope::Set() { 142 DCHECK_EQ(bookmark_, kNoBookmark); 143 DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED); 144 145 // The first token is a bit special, since current_ will still be 146 // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it 147 // when 148 // applying the bookmark. 149 DCHECK_IMPLIES( 150 scanner_->current_.token == Token::UNINITIALIZED, 151 scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos); 152 bookmark_ = (scanner_->current_.token == Token::UNINITIALIZED) 153 ? kBookmarkAtFirstPos 154 : scanner_->location().beg_pos; 155 } 156 157 void Scanner::BookmarkScope::Apply() { 158 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark. 
159 if (bookmark_ == kBookmarkAtFirstPos) { 160 scanner_->SeekNext(0); 161 } else { 162 scanner_->SeekNext(bookmark_); 163 scanner_->Next(); 164 DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_)); 165 } 166 bookmark_ = kBookmarkWasApplied; 167 } 168 169 bool Scanner::BookmarkScope::HasBeenSet() { 170 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied; 171 } 172 173 bool Scanner::BookmarkScope::HasBeenApplied() { 174 return bookmark_ == kBookmarkWasApplied; 175 } 176 177 // ---------------------------------------------------------------------------- 178 // Scanner 179 180 Scanner::Scanner(UnicodeCache* unicode_cache) 181 : unicode_cache_(unicode_cache), 182 octal_pos_(Location::invalid()), 183 octal_message_(MessageTemplate::kNone), 184 found_html_comment_(false) {} 185 186 void Scanner::Initialize(Utf16CharacterStream* source) { 187 source_ = source; 188 // Need to capture identifiers in order to recognize "get" and "set" 189 // in object literals. 190 Init(); 191 // Skip initial whitespace allowing HTML comment ends just like 192 // after a newline and scan first token. 193 has_line_terminator_before_next_ = true; 194 SkipWhiteSpace(); 195 Scan(); 196 } 197 198 template <bool capture_raw, bool unicode> 199 uc32 Scanner::ScanHexNumber(int expected_length) { 200 DCHECK(expected_length <= 4); // prevent overflow 201 202 int begin = source_pos() - 2; 203 uc32 x = 0; 204 for (int i = 0; i < expected_length; i++) { 205 int d = HexValue(c0_); 206 if (d < 0) { 207 ReportScannerError(Location(begin, begin + expected_length + 2), 208 unicode 209 ? 
MessageTemplate::kInvalidUnicodeEscapeSequence 210 : MessageTemplate::kInvalidHexEscapeSequence); 211 return -1; 212 } 213 x = x * 16 + d; 214 Advance<capture_raw>(); 215 } 216 217 return x; 218 } 219 220 template <bool capture_raw> 221 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) { 222 uc32 x = 0; 223 int d = HexValue(c0_); 224 if (d < 0) return -1; 225 226 while (d >= 0) { 227 x = x * 16 + d; 228 if (x > max_value) { 229 ReportScannerError(Location(beg_pos, source_pos() + 1), 230 MessageTemplate::kUndefinedUnicodeCodePoint); 231 return -1; 232 } 233 Advance<capture_raw>(); 234 d = HexValue(c0_); 235 } 236 237 return x; 238 } 239 240 241 // Ensure that tokens can be stored in a byte. 242 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 243 244 // Table of one-character tokens, by character (0x00..0x7f only). 245 static const byte one_char_tokens[] = { 246 Token::ILLEGAL, 247 Token::ILLEGAL, 248 Token::ILLEGAL, 249 Token::ILLEGAL, 250 Token::ILLEGAL, 251 Token::ILLEGAL, 252 Token::ILLEGAL, 253 Token::ILLEGAL, 254 Token::ILLEGAL, 255 Token::ILLEGAL, 256 Token::ILLEGAL, 257 Token::ILLEGAL, 258 Token::ILLEGAL, 259 Token::ILLEGAL, 260 Token::ILLEGAL, 261 Token::ILLEGAL, 262 Token::ILLEGAL, 263 Token::ILLEGAL, 264 Token::ILLEGAL, 265 Token::ILLEGAL, 266 Token::ILLEGAL, 267 Token::ILLEGAL, 268 Token::ILLEGAL, 269 Token::ILLEGAL, 270 Token::ILLEGAL, 271 Token::ILLEGAL, 272 Token::ILLEGAL, 273 Token::ILLEGAL, 274 Token::ILLEGAL, 275 Token::ILLEGAL, 276 Token::ILLEGAL, 277 Token::ILLEGAL, 278 Token::ILLEGAL, 279 Token::ILLEGAL, 280 Token::ILLEGAL, 281 Token::ILLEGAL, 282 Token::ILLEGAL, 283 Token::ILLEGAL, 284 Token::ILLEGAL, 285 Token::ILLEGAL, 286 Token::LPAREN, // 0x28 287 Token::RPAREN, // 0x29 288 Token::ILLEGAL, 289 Token::ILLEGAL, 290 Token::COMMA, // 0x2c 291 Token::ILLEGAL, 292 Token::ILLEGAL, 293 Token::ILLEGAL, 294 Token::ILLEGAL, 295 Token::ILLEGAL, 296 Token::ILLEGAL, 297 Token::ILLEGAL, 298 Token::ILLEGAL, 299 Token::ILLEGAL, 300 
Token::ILLEGAL, 301 Token::ILLEGAL, 302 Token::ILLEGAL, 303 Token::ILLEGAL, 304 Token::COLON, // 0x3a 305 Token::SEMICOLON, // 0x3b 306 Token::ILLEGAL, 307 Token::ILLEGAL, 308 Token::ILLEGAL, 309 Token::CONDITIONAL, // 0x3f 310 Token::ILLEGAL, 311 Token::ILLEGAL, 312 Token::ILLEGAL, 313 Token::ILLEGAL, 314 Token::ILLEGAL, 315 Token::ILLEGAL, 316 Token::ILLEGAL, 317 Token::ILLEGAL, 318 Token::ILLEGAL, 319 Token::ILLEGAL, 320 Token::ILLEGAL, 321 Token::ILLEGAL, 322 Token::ILLEGAL, 323 Token::ILLEGAL, 324 Token::ILLEGAL, 325 Token::ILLEGAL, 326 Token::ILLEGAL, 327 Token::ILLEGAL, 328 Token::ILLEGAL, 329 Token::ILLEGAL, 330 Token::ILLEGAL, 331 Token::ILLEGAL, 332 Token::ILLEGAL, 333 Token::ILLEGAL, 334 Token::ILLEGAL, 335 Token::ILLEGAL, 336 Token::ILLEGAL, 337 Token::LBRACK, // 0x5b 338 Token::ILLEGAL, 339 Token::RBRACK, // 0x5d 340 Token::ILLEGAL, 341 Token::ILLEGAL, 342 Token::ILLEGAL, 343 Token::ILLEGAL, 344 Token::ILLEGAL, 345 Token::ILLEGAL, 346 Token::ILLEGAL, 347 Token::ILLEGAL, 348 Token::ILLEGAL, 349 Token::ILLEGAL, 350 Token::ILLEGAL, 351 Token::ILLEGAL, 352 Token::ILLEGAL, 353 Token::ILLEGAL, 354 Token::ILLEGAL, 355 Token::ILLEGAL, 356 Token::ILLEGAL, 357 Token::ILLEGAL, 358 Token::ILLEGAL, 359 Token::ILLEGAL, 360 Token::ILLEGAL, 361 Token::ILLEGAL, 362 Token::ILLEGAL, 363 Token::ILLEGAL, 364 Token::ILLEGAL, 365 Token::ILLEGAL, 366 Token::ILLEGAL, 367 Token::ILLEGAL, 368 Token::ILLEGAL, 369 Token::LBRACE, // 0x7b 370 Token::ILLEGAL, 371 Token::RBRACE, // 0x7d 372 Token::BIT_NOT, // 0x7e 373 Token::ILLEGAL 374 }; 375 376 377 Token::Value Scanner::Next() { 378 if (next_.token == Token::EOS) { 379 next_.location.beg_pos = current_.location.beg_pos; 380 next_.location.end_pos = current_.location.end_pos; 381 } 382 current_ = next_; 383 if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) { 384 next_ = next_next_; 385 next_next_.token = Token::UNINITIALIZED; 386 has_line_terminator_before_next_ = has_line_terminator_after_next_; 387 return current_.token; 
388 } 389 has_line_terminator_before_next_ = false; 390 has_multiline_comment_before_next_ = false; 391 if (static_cast<unsigned>(c0_) <= 0x7f) { 392 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 393 if (token != Token::ILLEGAL) { 394 int pos = source_pos(); 395 next_.token = token; 396 next_.location.beg_pos = pos; 397 next_.location.end_pos = pos + 1; 398 next_.literal_chars = nullptr; 399 next_.raw_literal_chars = nullptr; 400 Advance(); 401 return current_.token; 402 } 403 } 404 Scan(); 405 return current_.token; 406 } 407 408 409 Token::Value Scanner::PeekAhead() { 410 DCHECK(next_.token != Token::DIV); 411 DCHECK(next_.token != Token::ASSIGN_DIV); 412 413 if (next_next_.token != Token::UNINITIALIZED) { 414 return next_next_.token; 415 } 416 TokenDesc prev = current_; 417 bool has_line_terminator_before_next = 418 has_line_terminator_before_next_ || has_multiline_comment_before_next_; 419 Next(); 420 has_line_terminator_after_next_ = 421 has_line_terminator_before_next_ || has_multiline_comment_before_next_; 422 has_line_terminator_before_next_ = has_line_terminator_before_next; 423 Token::Value ret = next_.token; 424 next_next_ = next_; 425 next_ = current_; 426 current_ = prev; 427 return ret; 428 } 429 430 431 // TODO(yangguo): check whether this is actually necessary. 432 static inline bool IsLittleEndianByteOrderMark(uc32 c) { 433 // The Unicode value U+FFFE is guaranteed never to be assigned as a 434 // Unicode character; this implies that in a Unicode context the 435 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 436 // character expressed in little-endian byte order (since it could 437 // not be a U+FFFE character expressed in big-endian byte 438 // order). Nevertheless, we check for it to be compatible with 439 // Spidermonkey. 
440 return c == 0xFFFE; 441 } 442 443 bool Scanner::SkipWhiteSpace() { 444 int start_position = source_pos(); 445 446 while (true) { 447 while (true) { 448 // Don't skip behind the end of input. 449 if (c0_ == kEndOfInput) break; 450 451 // Advance as long as character is a WhiteSpace or LineTerminator. 452 // Remember if the latter is the case. 453 if (unicode_cache_->IsLineTerminator(c0_)) { 454 has_line_terminator_before_next_ = true; 455 } else if (!unicode_cache_->IsWhiteSpace(c0_) && 456 !IsLittleEndianByteOrderMark(c0_)) { 457 break; 458 } 459 Advance(); 460 } 461 462 // If there is an HTML comment end '-->' at the beginning of a 463 // line (with only whitespace in front of it), we treat the rest 464 // of the line as a comment. This is in line with the way 465 // SpiderMonkey handles it. 466 if (c0_ != '-' || !has_line_terminator_before_next_) break; 467 468 Advance(); 469 if (c0_ != '-') { 470 PushBack('-'); // undo Advance() 471 break; 472 } 473 474 Advance(); 475 if (c0_ != '>') { 476 PushBack2('-', '-'); // undo 2x Advance(); 477 break; 478 } 479 480 // Treat the rest of the line as a comment. 481 SkipSingleLineComment(); 482 } 483 484 // Return whether or not we skipped any characters. 485 return source_pos() != start_position; 486 } 487 488 Token::Value Scanner::SkipSingleLineComment() { 489 Advance(); 490 491 // The line terminator at the end of the line is not considered 492 // to be part of the single-line comment; it is recognized 493 // separately by the lexical grammar and becomes part of the 494 // stream of input elements for the syntactic grammar (see 495 // ECMA-262, section 7.4). 
496 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 497 Advance(); 498 } 499 500 return Token::WHITESPACE; 501 } 502 503 504 Token::Value Scanner::SkipSourceURLComment() { 505 TryToParseSourceURLComment(); 506 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 507 Advance(); 508 } 509 510 return Token::WHITESPACE; 511 } 512 513 514 void Scanner::TryToParseSourceURLComment() { 515 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this 516 // function will just return if it cannot parse a magic comment. 517 if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return; 518 Advance(); 519 LiteralBuffer name; 520 while (c0_ != kEndOfInput && 521 !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') { 522 name.AddChar(c0_); 523 Advance(); 524 } 525 if (!name.is_one_byte()) return; 526 Vector<const uint8_t> name_literal = name.one_byte_literal(); 527 LiteralBuffer* value; 528 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) { 529 value = &source_url_; 530 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) { 531 value = &source_mapping_url_; 532 } else { 533 return; 534 } 535 if (c0_ != '=') 536 return; 537 Advance(); 538 value->Reset(); 539 while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) { 540 Advance(); 541 } 542 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 543 // Disallowed characters. 544 if (c0_ == '"' || c0_ == '\'') { 545 value->Reset(); 546 return; 547 } 548 if (unicode_cache_->IsWhiteSpace(c0_)) { 549 break; 550 } 551 value->AddChar(c0_); 552 Advance(); 553 } 554 // Allow whitespace at the end. 
555 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) { 556 if (!unicode_cache_->IsWhiteSpace(c0_)) { 557 value->Reset(); 558 break; 559 } 560 Advance(); 561 } 562 } 563 564 565 Token::Value Scanner::SkipMultiLineComment() { 566 DCHECK(c0_ == '*'); 567 Advance(); 568 569 while (c0_ != kEndOfInput) { 570 uc32 ch = c0_; 571 Advance(); 572 if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(ch)) { 573 // Following ECMA-262, section 7.4, a comment containing 574 // a newline will make the comment count as a line-terminator. 575 has_multiline_comment_before_next_ = true; 576 } 577 // If we have reached the end of the multi-line comment, we 578 // consume the '/' and insert a whitespace. This way all 579 // multi-line comments are treated as whitespace. 580 if (ch == '*' && c0_ == '/') { 581 c0_ = ' '; 582 return Token::WHITESPACE; 583 } 584 } 585 586 // Unterminated multi-line comment. 587 return Token::ILLEGAL; 588 } 589 590 Token::Value Scanner::ScanHtmlComment() { 591 // Check for <!-- comments. 
592 DCHECK(c0_ == '!'); 593 Advance(); 594 if (c0_ != '-') { 595 PushBack('!'); // undo Advance() 596 return Token::LT; 597 } 598 599 Advance(); 600 if (c0_ != '-') { 601 PushBack2('-', '!'); // undo 2x Advance() 602 return Token::LT; 603 } 604 605 found_html_comment_ = true; 606 return SkipSingleLineComment(); 607 } 608 609 void Scanner::Scan() { 610 next_.literal_chars = NULL; 611 next_.raw_literal_chars = NULL; 612 Token::Value token; 613 do { 614 // Remember the position of the next token 615 next_.location.beg_pos = source_pos(); 616 617 switch (c0_) { 618 case ' ': 619 case '\t': 620 Advance(); 621 token = Token::WHITESPACE; 622 break; 623 624 case '\n': 625 Advance(); 626 has_line_terminator_before_next_ = true; 627 token = Token::WHITESPACE; 628 break; 629 630 case '"': case '\'': 631 token = ScanString(); 632 break; 633 634 case '<': 635 // < <= << <<= <!-- 636 Advance(); 637 if (c0_ == '=') { 638 token = Select(Token::LTE); 639 } else if (c0_ == '<') { 640 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 641 } else if (c0_ == '!') { 642 token = ScanHtmlComment(); 643 } else { 644 token = Token::LT; 645 } 646 break; 647 648 case '>': 649 // > >= >> >>= >>> >>>= 650 Advance(); 651 if (c0_ == '=') { 652 token = Select(Token::GTE); 653 } else if (c0_ == '>') { 654 // >> >>= >>> >>>= 655 Advance(); 656 if (c0_ == '=') { 657 token = Select(Token::ASSIGN_SAR); 658 } else if (c0_ == '>') { 659 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 660 } else { 661 token = Token::SAR; 662 } 663 } else { 664 token = Token::GT; 665 } 666 break; 667 668 case '=': 669 // = == === => 670 Advance(); 671 if (c0_ == '=') { 672 token = Select('=', Token::EQ_STRICT, Token::EQ); 673 } else if (c0_ == '>') { 674 token = Select(Token::ARROW); 675 } else { 676 token = Token::ASSIGN; 677 } 678 break; 679 680 case '!': 681 // ! 
!= !== 682 Advance(); 683 if (c0_ == '=') { 684 token = Select('=', Token::NE_STRICT, Token::NE); 685 } else { 686 token = Token::NOT; 687 } 688 break; 689 690 case '+': 691 // + ++ += 692 Advance(); 693 if (c0_ == '+') { 694 token = Select(Token::INC); 695 } else if (c0_ == '=') { 696 token = Select(Token::ASSIGN_ADD); 697 } else { 698 token = Token::ADD; 699 } 700 break; 701 702 case '-': 703 // - -- --> -= 704 Advance(); 705 if (c0_ == '-') { 706 Advance(); 707 if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) { 708 // For compatibility with SpiderMonkey, we skip lines that 709 // start with an HTML comment end '-->'. 710 token = SkipSingleLineComment(); 711 } else { 712 token = Token::DEC; 713 } 714 } else if (c0_ == '=') { 715 token = Select(Token::ASSIGN_SUB); 716 } else { 717 token = Token::SUB; 718 } 719 break; 720 721 case '*': 722 // * *= 723 Advance(); 724 if (c0_ == '*') { 725 token = Select('=', Token::ASSIGN_EXP, Token::EXP); 726 } else if (c0_ == '=') { 727 token = Select(Token::ASSIGN_MUL); 728 } else { 729 token = Token::MUL; 730 } 731 break; 732 733 case '%': 734 // % %= 735 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 736 break; 737 738 case '/': 739 // / // /* /= 740 Advance(); 741 if (c0_ == '/') { 742 Advance(); 743 if (c0_ == '#' || c0_ == '@') { 744 Advance(); 745 token = SkipSourceURLComment(); 746 } else { 747 PushBack(c0_); 748 token = SkipSingleLineComment(); 749 } 750 } else if (c0_ == '*') { 751 token = SkipMultiLineComment(); 752 } else if (c0_ == '=') { 753 token = Select(Token::ASSIGN_DIV); 754 } else { 755 token = Token::DIV; 756 } 757 break; 758 759 case '&': 760 // & && &= 761 Advance(); 762 if (c0_ == '&') { 763 token = Select(Token::AND); 764 } else if (c0_ == '=') { 765 token = Select(Token::ASSIGN_BIT_AND); 766 } else { 767 token = Token::BIT_AND; 768 } 769 break; 770 771 case '|': 772 // | || |= 773 Advance(); 774 if (c0_ == '|') { 775 token = Select(Token::OR); 776 } else if (c0_ == '=') { 777 token = 
Select(Token::ASSIGN_BIT_OR); 778 } else { 779 token = Token::BIT_OR; 780 } 781 break; 782 783 case '^': 784 // ^ ^= 785 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 786 break; 787 788 case '.': 789 // . Number 790 Advance(); 791 if (IsDecimalDigit(c0_)) { 792 token = ScanNumber(true); 793 } else { 794 token = Token::PERIOD; 795 if (c0_ == '.') { 796 Advance(); 797 if (c0_ == '.') { 798 Advance(); 799 token = Token::ELLIPSIS; 800 } else { 801 PushBack('.'); 802 } 803 } 804 } 805 break; 806 807 case ':': 808 token = Select(Token::COLON); 809 break; 810 811 case ';': 812 token = Select(Token::SEMICOLON); 813 break; 814 815 case ',': 816 token = Select(Token::COMMA); 817 break; 818 819 case '(': 820 token = Select(Token::LPAREN); 821 break; 822 823 case ')': 824 token = Select(Token::RPAREN); 825 break; 826 827 case '[': 828 token = Select(Token::LBRACK); 829 break; 830 831 case ']': 832 token = Select(Token::RBRACK); 833 break; 834 835 case '{': 836 token = Select(Token::LBRACE); 837 break; 838 839 case '}': 840 token = Select(Token::RBRACE); 841 break; 842 843 case '?': 844 token = Select(Token::CONDITIONAL); 845 break; 846 847 case '~': 848 token = Select(Token::BIT_NOT); 849 break; 850 851 case '`': 852 token = ScanTemplateStart(); 853 break; 854 855 default: 856 if (c0_ == kEndOfInput) { 857 token = Token::EOS; 858 } else if (unicode_cache_->IsIdentifierStart(c0_)) { 859 token = ScanIdentifierOrKeyword(); 860 } else if (IsDecimalDigit(c0_)) { 861 token = ScanNumber(false); 862 } else if (SkipWhiteSpace()) { 863 token = Token::WHITESPACE; 864 } else { 865 token = Select(Token::ILLEGAL); 866 } 867 break; 868 } 869 870 // Continue scanning for tokens as long as we're just skipping 871 // whitespace. 
872 } while (token == Token::WHITESPACE); 873 874 next_.location.end_pos = source_pos(); 875 next_.token = token; 876 877 #ifdef DEBUG 878 SanityCheckTokenDesc(current_); 879 SanityCheckTokenDesc(next_); 880 SanityCheckTokenDesc(next_next_); 881 #endif 882 } 883 884 #ifdef DEBUG 885 void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const { 886 // Most tokens should not have literal_chars or even raw_literal chars. 887 // The rules are: 888 // - UNINITIALIZED: we don't care. 889 // - TEMPLATE_*: need both literal + raw literal chars. 890 // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal. 891 // - all others: should have neither. 892 893 switch (token.token) { 894 case Token::UNINITIALIZED: 895 // token.literal_chars & other members might be garbage. That's ok. 896 break; 897 case Token::TEMPLATE_SPAN: 898 case Token::TEMPLATE_TAIL: 899 DCHECK_NOT_NULL(token.raw_literal_chars); 900 DCHECK_NOT_NULL(token.literal_chars); 901 break; 902 case Token::ESCAPED_KEYWORD: 903 case Token::ESCAPED_STRICT_RESERVED_WORD: 904 case Token::FUTURE_STRICT_RESERVED_WORD: 905 case Token::IDENTIFIER: 906 case Token::NUMBER: 907 case Token::REGEXP_LITERAL: 908 case Token::SMI: 909 case Token::STRING: 910 DCHECK_NOT_NULL(token.literal_chars); 911 DCHECK_NULL(token.raw_literal_chars); 912 break; 913 default: 914 DCHECK_NULL(token.literal_chars); 915 DCHECK_NULL(token.raw_literal_chars); 916 break; 917 } 918 } 919 #endif // DEBUG 920 921 void Scanner::SeekForward(int pos) { 922 // After this call, we will have the token at the given position as 923 // the "next" token. The "current" token will be invalid. 924 if (pos == next_.location.beg_pos) return; 925 int current_pos = source_pos(); 926 DCHECK_EQ(next_.location.end_pos, current_pos); 927 // Positions inside the lookahead token aren't supported. 
928 DCHECK(pos >= current_pos); 929 if (pos != current_pos) { 930 source_->Seek(pos); 931 Advance(); 932 // This function is only called to seek to the location 933 // of the end of a function (at the "}" token). It doesn't matter 934 // whether there was a line terminator in the part we skip. 935 has_line_terminator_before_next_ = false; 936 has_multiline_comment_before_next_ = false; 937 } 938 Scan(); 939 } 940 941 942 template <bool capture_raw, bool in_template_literal> 943 bool Scanner::ScanEscape() { 944 uc32 c = c0_; 945 Advance<capture_raw>(); 946 947 // Skip escaped newlines. 948 if (!in_template_literal && c0_ != kEndOfInput && 949 unicode_cache_->IsLineTerminator(c)) { 950 // Allow CR+LF newlines in multiline string literals. 951 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>(); 952 // Allow LF+CR newlines in multiline string literals. 953 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>(); 954 return true; 955 } 956 957 switch (c) { 958 case '\'': // fall through 959 case '"' : // fall through 960 case '\\': break; 961 case 'b' : c = '\b'; break; 962 case 'f' : c = '\f'; break; 963 case 'n' : c = '\n'; break; 964 case 'r' : c = '\r'; break; 965 case 't' : c = '\t'; break; 966 case 'u' : { 967 c = ScanUnicodeEscape<capture_raw>(); 968 if (c < 0) return false; 969 break; 970 } 971 case 'v': 972 c = '\v'; 973 break; 974 case 'x': { 975 c = ScanHexNumber<capture_raw>(2); 976 if (c < 0) return false; 977 break; 978 } 979 case '0': // Fall through. 980 case '1': // fall through 981 case '2': // fall through 982 case '3': // fall through 983 case '4': // fall through 984 case '5': // fall through 985 case '6': // fall through 986 case '7': 987 c = ScanOctalEscape<capture_raw>(c, 2); 988 break; 989 } 990 991 // Other escaped characters are interpreted as their non-escaped version. 
992 AddLiteralChar(c); 993 return true; 994 } 995 996 997 template <bool capture_raw> 998 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 999 uc32 x = c - '0'; 1000 int i = 0; 1001 for (; i < length; i++) { 1002 int d = c0_ - '0'; 1003 if (d < 0 || d > 7) break; 1004 int nx = x * 8 + d; 1005 if (nx >= 256) break; 1006 x = nx; 1007 Advance<capture_raw>(); 1008 } 1009 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 1010 // Remember the position of octal escape sequences so that an error 1011 // can be reported later (in strict mode). 1012 // We don't report the error immediately, because the octal escape can 1013 // occur before the "use strict" directive. 1014 if (c != '0' || i > 0) { 1015 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 1016 octal_message_ = MessageTemplate::kStrictOctalEscape; 1017 } 1018 return x; 1019 } 1020 1021 1022 Token::Value Scanner::ScanString() { 1023 uc32 quote = c0_; 1024 Advance<false, false>(); // consume quote 1025 1026 LiteralScope literal(this); 1027 while (true) { 1028 if (c0_ > kMaxAscii) { 1029 HandleLeadSurrogate(); 1030 break; 1031 } 1032 if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL; 1033 if (c0_ == quote) { 1034 literal.Complete(); 1035 Advance<false, false>(); 1036 return Token::STRING; 1037 } 1038 char c = static_cast<char>(c0_); 1039 if (c == '\\') break; 1040 Advance<false, false>(); 1041 AddLiteralChar(c); 1042 } 1043 1044 while (c0_ != quote && c0_ != kEndOfInput && 1045 !unicode_cache_->IsLineTerminator(c0_)) { 1046 uc32 c = c0_; 1047 Advance(); 1048 if (c == '\\') { 1049 if (c0_ == kEndOfInput || !ScanEscape<false, false>()) { 1050 return Token::ILLEGAL; 1051 } 1052 } else { 1053 AddLiteralChar(c); 1054 } 1055 } 1056 if (c0_ != quote) return Token::ILLEGAL; 1057 literal.Complete(); 1058 1059 Advance(); // consume quote 1060 return Token::STRING; 1061 } 1062 1063 1064 Token::Value Scanner::ScanTemplateSpan() { 1065 // When scanning a 
TemplateSpan, we are looking for the following construct: 1066 // TEMPLATE_SPAN :: 1067 // ` LiteralChars* ${ 1068 // | } LiteralChars* ${ 1069 // 1070 // TEMPLATE_TAIL :: 1071 // ` LiteralChars* ` 1072 // | } LiteralChar* ` 1073 // 1074 // A TEMPLATE_SPAN should always be followed by an Expression, while a 1075 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be 1076 // followed by an Expression. 1077 1078 // These scoped helpers save and restore the original error state, so that we 1079 // can specially treat invalid escape sequences in templates (which are 1080 // handled by the parser). 1081 ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_); 1082 ErrorState octal_error_state(&octal_message_, &octal_pos_); 1083 1084 Token::Value result = Token::TEMPLATE_SPAN; 1085 LiteralScope literal(this); 1086 StartRawLiteral(); 1087 const bool capture_raw = true; 1088 const bool in_template_literal = true; 1089 while (true) { 1090 uc32 c = c0_; 1091 Advance<capture_raw>(); 1092 if (c == '`') { 1093 result = Token::TEMPLATE_TAIL; 1094 ReduceRawLiteralLength(1); 1095 break; 1096 } else if (c == '$' && c0_ == '{') { 1097 Advance<capture_raw>(); // Consume '{' 1098 ReduceRawLiteralLength(2); 1099 break; 1100 } else if (c == '\\') { 1101 if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) { 1102 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty 1103 // code unit sequence. 1104 uc32 lastChar = c0_; 1105 Advance<capture_raw>(); 1106 if (lastChar == '\r') { 1107 ReduceRawLiteralLength(1); // Remove \r 1108 if (c0_ == '\n') { 1109 Advance<capture_raw>(); // Adds \n 1110 } else { 1111 AddRawLiteralChar('\n'); 1112 } 1113 } 1114 } else { 1115 bool success = ScanEscape<capture_raw, in_template_literal>(); 1116 USE(success); 1117 DCHECK_EQ(!success, has_error()); 1118 // For templates, invalid escape sequence checking is handled in the 1119 // parser. 
// NOTE(review): this span opens inside a function whose signature precedes it
// (presumably Scanner::ScanTemplateSpan, which the two callers below invoke).
// The statements that follow finish the escape-sequence branch of its loop.
        // Hand any error recorded while scanning the escape over to the
        // invalid-template-escape slots (an already stored error is kept) and
        // clear the scanner/octal error slots again.
        scanner_error_state.MoveErrorTo(&invalid_template_escape_message_,
                                        &invalid_template_escape_location_);
        octal_error_state.MoveErrorTo(&invalid_template_escape_message_,
                                      &invalid_template_escape_location_);
      }
    } else if (c < 0) {
      // Unterminated template literal
      PushBack(c);
      break;
    } else {
      // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
      // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
      // consisting of the CV 0x000A.
      if (c == '\r') {
        ReduceRawLiteralLength(1);  // Remove \r
        if (c0_ == '\n') {
          Advance<capture_raw>();  // Adds \n
        } else {
          AddRawLiteralChar('\n');
        }
        c = '\n';
      }
      AddLiteralChar(c);
    }
  }
  literal.Complete();
  next_.location.end_pos = source_pos();
  next_.token = result;

  return result;
}


// Starts scanning a template literal at its opening backtick.
// Expects that no second look-ahead token is pending and that c0_ is '`'.
// Records the token start position, consumes the backtick and delegates the
// rest of the work to ScanTemplateSpan(), whose result is returned.
Token::Value Scanner::ScanTemplateStart() {
  DCHECK(next_next_.token == Token::UNINITIALIZED);
  DCHECK(c0_ == '`');
  next_.location.beg_pos = source_pos();
  Advance();  // Consume `
  return ScanTemplateSpan();
}


// Resumes scanning a template literal after an embedded expression.
// Expects next_ to be the '}' that closed the expression; the token start is
// set one position back because that brace has already been consumed.
// Returns the result of ScanTemplateSpan().
Token::Value Scanner::ScanTemplateContinuation() {
  DCHECK_EQ(next_.token, Token::RBRACE);
  next_.location.beg_pos = source_pos() - 1;  // We already consumed }
  return ScanTemplateSpan();
}


// Consumes a (possibly empty) run of decimal digits starting at c0_, passing
// each one through AddLiteralCharAdvance().
void Scanner::ScanDecimalDigits() {
  while (IsDecimalDigit(c0_))
    AddLiteralCharAdvance();
}


// Scans a numeric literal.
//
// seen_period: true when the caller has already consumed a leading '.', in
//              which case c0_ is the first digit of the fraction.
//
// Returns Token::SMI when the literal is a short plain decimal integer that
// fits in a Smi (its value is stored in next_.smi_value_), Token::NUMBER for
// every other well-formed literal, and Token::ILLEGAL when the literal is
// malformed (missing digits after a radix prefix or exponent, an exponent on
// a non-decimal literal, or a digit / identifier start directly after the
// literal).
//
// Side effect: for implicit octal literals and for decimals with a leading
// zero, octal_pos_ and octal_message_ are set to describe the literal.
Token::Value Scanner::ScanNumber(bool seen_period) {
  DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum {
    DECIMAL,
    DECIMAL_WITH_LEADING_ZERO,
    HEX,
    OCTAL,
    IMPLICIT_OCTAL,
    BINARY
  } kind = DECIMAL;

  LiteralScope literal(this);
  // at_start stays true only while nothing but an optional leading '0' has
  // been consumed; it gates the Smi fast path below.
  bool at_start = !seen_period;
  int start_pos = source_pos();  // For reporting octal positions.
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
      // an octal number.
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (c0_ == 'o' || c0_ == 'O') {
        kind = OCTAL;
        AddLiteralCharAdvance();
        if (!IsOctalDigit(c0_)) {
          // we must have at least one octal digit after 'o'/'O'
          return Token::ILLEGAL;
        }
        while (IsOctalDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (c0_ == 'b' || c0_ == 'B') {
        kind = BINARY;
        AddLiteralCharAdvance();
        if (!IsBinaryDigit(c0_)) {
          // we must have at least one binary digit after 'b'/'B'
          return Token::ILLEGAL;
        }
        while (IsBinaryDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = IMPLICIT_OCTAL;
        while (true) {
          if (c0_ == '8' || c0_ == '9') {
            // An 8 or 9 turns the literal into a decimal with a leading zero;
            // digits were already consumed, so the Smi fast path is skipped.
            at_start = false;
            kind = DECIMAL_WITH_LEADING_ZERO;
            break;
          }
          if (c0_ < '0' || '7' < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            octal_message_ = MessageTemplate::kStrictOctalLiteral;
            break;
          }
          AddLiteralCharAdvance();
        }
      } else if (c0_ == '8' || c0_ == '9') {
        kind = DECIMAL_WITH_LEADING_ZERO;
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
      if (at_start) {
        // Fast path: accumulate the integer value while scanning so that a
        // small integer can be returned as a Smi without further parsing.
        uint64_t value = 0;
        while (IsDecimalDigit(c0_)) {
          value = 10 * value + (c0_ - '0');

          uc32 first_char = c0_;
          Advance<false, false>();
          AddLiteralChar(first_char);
        }

        // Accept as Smi only if the literal has at most 10 characters, the
        // value fits, no fraction follows, and the next character is either
        // the end of input or not an identifier start.
        if (next_.literal_chars->one_byte_literal().length() <= 10 &&
            value <= Smi::kMaxValue && c0_ != '.' &&
            (c0_ == kEndOfInput || !unicode_cache_->IsIdentifierStart(c0_))) {
          next_.smi_value_ = static_cast<uint32_t>(value);
          literal.Complete();
          HandleLeadSurrogate();

          if (kind == DECIMAL_WITH_LEADING_ZERO) {
            octal_pos_ = Location(start_pos, source_pos());
            octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
          }
          return Token::SMI;
        }
        HandleLeadSurrogate();
      }

      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
      return Token::ILLEGAL;
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) ||
      (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_)))
    return Token::ILLEGAL;

  literal.Complete();

  if (kind == DECIMAL_WITH_LEADING_ZERO) {
    octal_pos_ = Location(start_pos, source_pos());
    octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
  }
  return Token::NUMBER;
}


// Scans a unicode escape inside an identifier. The visible callers invoke
// this with c0_ on the backslash: it is skipped, a following 'u' is required
// and skipped, and the escape value is read without raw capture.
// Returns the scanned code point, or -1 if no 'u' follows or the escape is
// rejected by ScanUnicodeEscape().
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ != 'u') return -1;
  Advance();
  return ScanUnicodeEscape<false>();
}


// Scans the value part of a unicode escape and returns its code point.
// For the braced form, a value rejected by ScanUnlimitedLengthHexNumber()
// (called with limit 0x10ffff) or a missing '}' reports
// kInvalidUnicodeEscapeSequence and yields -1. The four-digit form returns
// whatever ScanHexNumber() produces.
template <bool capture_raw>
uc32 Scanner::ScanUnicodeEscape() {
  // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
  // hex digits between { } is arbitrary. \ and u have already been read.
  if (c0_ == '{') {
    int begin = source_pos() - 2;  // Position of the already consumed '\'.
    Advance<capture_raw>();
    uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
    if (cp < 0 || c0_ != '}') {
      ReportScannerError(source_pos(),
                         MessageTemplate::kInvalidUnicodeEscapeSequence);
      return -1;
    }
    Advance<capture_raw>();
    return cp;
  }
  const bool unicode = true;
  return ScanHexNumber<capture_raw, unicode>(4);
}


// ----------------------------------------------------------------------------
// Keyword Matcher

// X-macro listing every keyword together with its token. Entries are grouped
// by their first character: KEYWORD_GROUP(ch) opens the group for 'ch' and
// each KEYWORD(text, token) that follows belongs to that group.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
  KEYWORD_GROUP('a')                                        \
  KEYWORD("async", Token::ASYNC)                            \
  KEYWORD("await", Token::AWAIT)                            \
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
  KEYWORD("enum", Token::ENUM)                              \
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
  KEYWORD("function", Token::FUNCTION)                      \
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
  KEYWORD_GROUP('n')                                        \
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
  KEYWORD("yield", Token::YIELD)

// Maps a one-byte character sequence to its keyword token.
//
// input:        the characters to classify (at least one).
// input_length: number of characters in input.
//
// Returns the token of the KEYWORDS entry that equals the input, or
// Token::IDENTIFIER if there is none. Inputs shorter than 2 or longer than
// 10 characters are identifiers without further inspection.
//
// The switch body is generated from KEYWORDS: every group becomes a
// "break; case ch:" label and every keyword an unrolled comparison. The
// comparison starts at index 1 because index 0 was matched by the case label.
static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
                                             int input_length) {
  DCHECK(input_length >= 1);
  const int kMinLength = 2;
  const int kMaxLength = 10;
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
#define KEYWORD(keyword, token)                                     \
  {                                                                 \
    /* 'keyword' is a char array, so sizeof(keyword) is */          \
    /* strlen(keyword) plus 1 for the NUL char. */                  \
    const int keyword_length = sizeof(keyword) - 1;                 \
    STATIC_ASSERT(keyword_length >= kMinLength);                    \
    STATIC_ASSERT(keyword_length <= kMaxLength);                    \
    if (input_length == keyword_length && input[1] == keyword[1] && \
        (keyword_length <= 2 || input[2] == keyword[2]) &&          \
        (keyword_length <= 3 || input[3] == keyword[3]) &&          \
        (keyword_length <= 4 || input[4] == keyword[4]) &&          \
        (keyword_length <= 5 || input[5] == keyword[5]) &&          \
        (keyword_length <= 6 || input[6] == keyword[6]) &&          \
        (keyword_length <= 7 || input[7] == keyword[7]) &&          \
        (keyword_length <= 8 || input[8] == keyword[8]) &&          \
        (keyword_length <= 9 || input[9] == keyword[9])) {          \
      return token;                                                 \
    }                                                               \
  }
      KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  }
  return Token::IDENTIFIER;
}


// Scans an identifier or keyword starting at c0_, which must be an
// identifier start character.
//
// Returns Token::IDENTIFIER, a keyword token from KeywordOrIdentifierToken(),
// Token::ILLEGAL for an invalid leading escape, or the result of
// ScanIdentifierSuffix() once an escape sequence is involved.
//
// The literal is completed only for Token::IDENTIFIER and
// Token::FUTURE_STRICT_RESERVED_WORD results; other keyword tokens are
// returned without completing it.
Token::Value Scanner::ScanIdentifierOrKeyword() {
  DCHECK(unicode_cache_->IsIdentifierStart(c0_));
  LiteralScope literal(this);
  if (IsInRange(c0_, 'a', 'z')) {
    // Fast path for a leading run of lowercase ASCII letters, the only shape
    // a keyword can have.
    do {
      char first_char = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(first_char);
    } while (IsInRange(c0_, 'a', 'z'));

    if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
        c0_ == '$') {
      // Identifier starting with lowercase.
      char first_char = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(first_char);
      while (IsAsciiIdentifier(c0_)) {
        char first_char = static_cast<char>(c0_);
        Advance<false, false>();
        AddLiteralChar(first_char);
      }
      // Ended on an ASCII character that does not start an escape: done.
      if (c0_ <= kMaxAscii && c0_ != '\\') {
        literal.Complete();
        return Token::IDENTIFIER;
      }
    } else if (c0_ <= kMaxAscii && c0_ != '\\') {
      // Only a-z+: could be a keyword or identifier.
      Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
      Token::Value token =
          KeywordOrIdentifierToken(chars.start(), chars.length());
      if (token == Token::IDENTIFIER ||
          token == Token::FUTURE_STRICT_RESERVED_WORD)
        literal.Complete();
      return token;
    }

    // A non-ASCII character or a backslash follows; continue in the general
    // loop below.
    HandleLeadSurrogate();
  } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
    // Fast path for identifiers that start with an uppercase letter, '_' or
    // '$'; no keyword lookup is done on this path.
    do {
      char first_char = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(first_char);
    } while (IsAsciiIdentifier(c0_));

    if (c0_ <= kMaxAscii && c0_ != '\\') {
      literal.Complete();
      return Token::IDENTIFIER;
    }

    HandleLeadSurrogate();
  } else if (c0_ == '\\') {
    // Scan identifier start character.
    uc32 c = ScanIdentifierUnicodeEscape();
    // Only allow legal identifier start characters.
    if (c < 0 ||
        c == '\\' ||  // No recursive escapes.
        !unicode_cache_->IsIdentifierStart(c)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(c);
    return ScanIdentifierSuffix(&literal, true);
  } else {
    uc32 first_char = c0_;
    Advance();
    AddLiteralChar(first_char);
  }

  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ != '\\') {
      uc32 next_char = c0_;
      Advance();
      AddLiteralChar(next_char);
      continue;
    }
    // Fallthrough if no longer able to complete keyword.
    return ScanIdentifierSuffix(&literal, false);
  }

  if (next_.literal_chars->is_one_byte()) {
    Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
    Token::Value token =
        KeywordOrIdentifierToken(chars.start(), chars.length());
    if (token == Token::IDENTIFIER ||
        token == Token::FUTURE_STRICT_RESERVED_WORD)
      literal.Complete();
    return token;
  }
  literal.Complete();
  return Token::IDENTIFIER;
}


// Scans the remainder of an identifier that may contain unicode escapes.
//
// literal: scope of the identifier literal being built; completed here once
//          the identifier ends.
// escaped: true if an escape sequence was already seen in this identifier.
//
// Returns Token::ILLEGAL for an invalid escape. When an escape occurred and
// the one-byte text spells a keyword, returns
// Token::ESCAPED_STRICT_RESERVED_WORD (future strict reserved words, 'let',
// 'static') or Token::ESCAPED_KEYWORD (any other keyword). In every other
// case returns Token::IDENTIFIER.
Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
                                           bool escaped) {
  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ == '\\') {
      uc32 c = ScanIdentifierUnicodeEscape();
      escaped = true;
      // Only allow legal identifier part characters.
      if (c < 0 ||
          c == '\\' ||
          !unicode_cache_->IsIdentifierPart(c)) {
        return Token::ILLEGAL;
      }
      AddLiteralChar(c);
    } else {
      AddLiteralChar(c0_);
      Advance();
    }
  }
  literal->Complete();

  if (escaped && next_.literal_chars->is_one_byte()) {
    Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
    Token::Value token =
        KeywordOrIdentifierToken(chars.start(), chars.length());
    /* TODO(adamk): YIELD should be handled specially. */
    if (token == Token::IDENTIFIER) {
      return Token::IDENTIFIER;
    } else if (token == Token::FUTURE_STRICT_RESERVED_WORD ||
               token == Token::LET || token == Token::STATIC) {
      return Token::ESCAPED_STRICT_RESERVED_WORD;
    } else {
      return Token::ESCAPED_KEYWORD;
    }
  }
  return Token::IDENTIFIER;
}

// Re-scans the current '/' or '/=' token as the start of a regular expression
// literal and collects the uninterpreted pattern body as the literal.
// Returns false if the end of input or a line terminator is reached before
// the closing '/'; on success sets next_.token to Token::REGEXP_LITERAL and
// returns true.
bool Scanner::ScanRegExpPattern() {
  DCHECK(next_next_.token == Token::UNINITIALIZED);
  DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);

  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
  bool in_character_class = false;
  bool seen_equal = (next_.token == Token::ASSIGN_DIV);

  // Previous token is either '/' or '/=', in the second case, the
  // pattern starts at =.
  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
  // the scanner should pass uninterpreted bodies to the RegExp
  // constructor.
  LiteralScope literal(this);
  if (seen_equal) {
    AddLiteralChar('=');
  }

  // A '/' only terminates the body outside of a [...] character class.
  while (c0_ != '/' || in_character_class) {
    if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
      return false;
    if (c0_ == '\\') {  // Escape sequence.
      AddLiteralCharAdvance();
      if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
        return false;
      AddLiteralCharAdvance();
      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
      // only "safe" characters are allowed (letters, digits, underscore),
      // otherwise the escape isn't valid and the invalid character has
      // its normal meaning. I.e., we can just continue scanning without
      // worrying whether the following characters are part of the escape
      // or not, since any '/', '\\' or '[' is guaranteed to not be part
      // of the escape sequence.

      // TODO(896): At some point, parse RegExps more thoroughly to capture
      // octal escapes in strict mode.
    } else {  // Unescaped character.
      if (c0_ == '[') in_character_class = true;
      if (c0_ == ']') in_character_class = false;
      AddLiteralCharAdvance();
    }
  }
  Advance();  // consume '/'

  literal.Complete();
  next_.token = Token::REGEXP_LITERAL;
  return true;
}


// Scans the flags that follow a regular expression pattern.
// Accepts 'g', 'i', 'm', 'u' and 'y', each at most once. Returns Nothing if
// an identifier-part character is not one of these flags or if a flag is
// repeated; otherwise updates next_.location.end_pos and returns the
// combined flags.
Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
  DCHECK(next_.token == Token::REGEXP_LITERAL);

  // Scan regular expression flags.
  int flags = 0;
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    RegExp::Flags flag = RegExp::kNone;
    switch (c0_) {
      case 'g':
        flag = RegExp::kGlobal;
        break;
      case 'i':
        flag = RegExp::kIgnoreCase;
        break;
      case 'm':
        flag = RegExp::kMultiline;
        break;
      case 'u':
        flag = RegExp::kUnicode;
        break;
      case 'y':
        flag = RegExp::kSticky;
        break;
      default:
        return Nothing<RegExp::Flags>();
    }
    // Reject a flag that was already seen.
    if (flags & flag) {
      return Nothing<RegExp::Flags>();
    }
    Advance();
    flags |= flag;
  }

  next_.location.end_pos = source_pos();
  return Just(RegExp::Flags(flags));
}


// Returns the AstRawString for the current token's literal, obtained from
// ast_value_factory with the one-byte or two-byte getter as appropriate.
const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
  if (is_literal_one_byte()) {
    return ast_value_factory->GetOneByteString(literal_one_byte_string());
  }
  return ast_value_factory->GetTwoByteString(literal_two_byte_string());
}


// Same as CurrentSymbol(), but for the literal of the next token.
const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
  if (is_next_literal_one_byte()) {
    return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
  }
  return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
}


// Same as CurrentSymbol(), but for the raw literal of the current token.
const AstRawString* Scanner::CurrentRawSymbol(
    AstValueFactory* ast_value_factory) {
  if (is_raw_literal_one_byte()) {
    return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
  }
  return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
}


// Converts the current (one-byte) literal to a double via StringToDouble(),
// allowing hex, octal, implicit octal and binary notation.
double Scanner::DoubleValue() {
  DCHECK(is_literal_one_byte());
  return StringToDouble(
      unicode_cache_,
      literal_one_byte_string(),
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
}


// Returns true if the current (one-byte) literal contains a '.' character.
bool Scanner::ContainsDot() {
  DCHECK(is_literal_one_byte());
  Vector<const uint8_t> str = literal_one_byte_string();
  return std::find(str.begin(), str.end(), '.') != str.end();
}

// Adds the current literal to finder as a one-byte or two-byte symbol and
// returns the finder's result.
bool Scanner::FindSymbol(DuplicateFinder* finder) {
  // TODO(vogelheim): Move this logic into the calling class; this can be fully
  // implemented using the public interface.
  if (is_literal_one_byte()) {
    return finder->AddOneByteSymbol(literal_one_byte_string());
  }
  return finder->AddTwoByteSymbol(literal_two_byte_string());
}

// Repositions the scanner so that scanning resumes at the given character
// position: the token state is reset, the source is seeked, and one
// look-ahead character plus one token (next_) are scanned.
void Scanner::SeekNext(size_t position) {
  // Use with care: This cleanly resets most, but not all scanner state.
  // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.

  // To re-scan from a given character position, we need to:
  // 1, Reset the current_, next_ and next_next_ tokens
  //    (next_ + next_next_ will be overwritten by Next(),
  //     current_ will remain unchanged, so overwrite it fully.)
  current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED};
  next_.token = Token::UNINITIALIZED;
  next_next_.token = Token::UNINITIALIZED;
  // 2, reset the source to the desired position,
  source_->Seek(position);
  // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
  c0_ = source_->Advance();
  Next();
  DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
}

}  // namespace internal
}  // namespace v8