1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #include "src/parsing/scanner.h" 8 9 #include <stdint.h> 10 11 #include <cmath> 12 13 #include "src/ast/ast-value-factory.h" 14 #include "src/char-predicates-inl.h" 15 #include "src/conversions-inl.h" 16 #include "src/objects/bigint.h" 17 #include "src/parsing/duplicate-finder.h" // For Scanner::FindSymbol 18 #include "src/parsing/scanner-inl.h" 19 20 namespace v8 { 21 namespace internal { 22 23 class Scanner::ErrorState { 24 public: 25 ErrorState(MessageTemplate::Template* message_stack, 26 Scanner::Location* location_stack) 27 : message_stack_(message_stack), 28 old_message_(*message_stack), 29 location_stack_(location_stack), 30 old_location_(*location_stack) { 31 *message_stack_ = MessageTemplate::kNone; 32 *location_stack_ = Location::invalid(); 33 } 34 35 ~ErrorState() { 36 *message_stack_ = old_message_; 37 *location_stack_ = old_location_; 38 } 39 40 void MoveErrorTo(TokenDesc* dest) { 41 if (*message_stack_ == MessageTemplate::kNone) { 42 return; 43 } 44 if (dest->invalid_template_escape_message == MessageTemplate::kNone) { 45 dest->invalid_template_escape_message = *message_stack_; 46 dest->invalid_template_escape_location = *location_stack_; 47 } 48 *message_stack_ = MessageTemplate::kNone; 49 *location_stack_ = Location::invalid(); 50 } 51 52 private: 53 MessageTemplate::Template* const message_stack_; 54 MessageTemplate::Template const old_message_; 55 Scanner::Location* const location_stack_; 56 Scanner::Location const old_location_; 57 }; 58 59 // ---------------------------------------------------------------------------- 60 // Scanner::LiteralBuffer 61 62 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const { 63 DCHECK(is_used_); 64 if (is_one_byte()) { 65 return 
isolate->factory()->InternalizeOneByteString(one_byte_literal()); 66 } 67 return isolate->factory()->InternalizeTwoByteString(two_byte_literal()); 68 } 69 70 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) { 71 int capacity = Max(min_capacity, backing_store_.length()); 72 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); 73 return new_capacity; 74 } 75 76 void Scanner::LiteralBuffer::ExpandBuffer() { 77 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); 78 MemCopy(new_store.start(), backing_store_.start(), position_); 79 backing_store_.Dispose(); 80 backing_store_ = new_store; 81 } 82 83 void Scanner::LiteralBuffer::ConvertToTwoByte() { 84 DCHECK(is_one_byte_); 85 Vector<byte> new_store; 86 int new_content_size = position_ * kUC16Size; 87 if (new_content_size >= backing_store_.length()) { 88 // Ensure room for all currently read code units as UC16 as well 89 // as the code unit about to be stored. 90 new_store = Vector<byte>::New(NewCapacity(new_content_size)); 91 } else { 92 new_store = backing_store_; 93 } 94 uint8_t* src = backing_store_.start(); 95 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start()); 96 for (int i = position_ - 1; i >= 0; i--) { 97 dst[i] = src[i]; 98 } 99 if (new_store.start() != backing_store_.start()) { 100 backing_store_.Dispose(); 101 backing_store_ = new_store; 102 } 103 position_ = new_content_size; 104 is_one_byte_ = false; 105 } 106 107 void Scanner::LiteralBuffer::AddTwoByteChar(uc32 code_unit) { 108 DCHECK(!is_one_byte_); 109 if (position_ >= backing_store_.length()) ExpandBuffer(); 110 if (code_unit <= 111 static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 112 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit; 113 position_ += kUC16Size; 114 } else { 115 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = 116 unibrow::Utf16::LeadSurrogate(code_unit); 117 position_ += kUC16Size; 118 if (position_ >= backing_store_.length()) 
ExpandBuffer(); 119 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = 120 unibrow::Utf16::TrailSurrogate(code_unit); 121 position_ += kUC16Size; 122 } 123 } 124 125 // ---------------------------------------------------------------------------- 126 // Scanner::BookmarkScope 127 128 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos = 129 std::numeric_limits<size_t>::max() - 2; 130 const size_t Scanner::BookmarkScope::kNoBookmark = 131 std::numeric_limits<size_t>::max() - 1; 132 const size_t Scanner::BookmarkScope::kBookmarkWasApplied = 133 std::numeric_limits<size_t>::max(); 134 135 void Scanner::BookmarkScope::Set() { 136 DCHECK_EQ(bookmark_, kNoBookmark); 137 DCHECK_EQ(scanner_->next_next().token, Token::UNINITIALIZED); 138 139 // The first token is a bit special, since current_ will still be 140 // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it 141 // when 142 // applying the bookmark. 143 DCHECK_IMPLIES(scanner_->current().token == Token::UNINITIALIZED, 144 scanner_->current().location.beg_pos == 145 scanner_->next().location.beg_pos); 146 bookmark_ = (scanner_->current().token == Token::UNINITIALIZED) 147 ? kBookmarkAtFirstPos 148 : scanner_->location().beg_pos; 149 } 150 151 void Scanner::BookmarkScope::Apply() { 152 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark. 
153 if (bookmark_ == kBookmarkAtFirstPos) { 154 scanner_->SeekNext(0); 155 } else { 156 scanner_->SeekNext(bookmark_); 157 scanner_->Next(); 158 DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_)); 159 } 160 bookmark_ = kBookmarkWasApplied; 161 } 162 163 bool Scanner::BookmarkScope::HasBeenSet() { 164 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied; 165 } 166 167 bool Scanner::BookmarkScope::HasBeenApplied() { 168 return bookmark_ == kBookmarkWasApplied; 169 } 170 171 // ---------------------------------------------------------------------------- 172 // Scanner 173 174 Scanner::Scanner(UnicodeCache* unicode_cache, Utf16CharacterStream* source, 175 bool is_module) 176 : unicode_cache_(unicode_cache), 177 source_(source), 178 octal_pos_(Location::invalid()), 179 octal_message_(MessageTemplate::kNone), 180 found_html_comment_(false), 181 allow_harmony_bigint_(false), 182 allow_harmony_numeric_separator_(false), 183 is_module_(is_module) { 184 DCHECK_NOT_NULL(source); 185 } 186 187 void Scanner::Initialize() { 188 // Need to capture identifiers in order to recognize "get" and "set" 189 // in object literals. 190 Init(); 191 next().after_line_terminator = true; 192 Scan(); 193 } 194 195 template <bool capture_raw, bool unicode> 196 uc32 Scanner::ScanHexNumber(int expected_length) { 197 DCHECK_LE(expected_length, 4); // prevent overflow 198 199 int begin = source_pos() - 2; 200 uc32 x = 0; 201 for (int i = 0; i < expected_length; i++) { 202 int d = HexValue(c0_); 203 if (d < 0) { 204 ReportScannerError(Location(begin, begin + expected_length + 2), 205 unicode 206 ? 
MessageTemplate::kInvalidUnicodeEscapeSequence 207 : MessageTemplate::kInvalidHexEscapeSequence); 208 return -1; 209 } 210 x = x * 16 + d; 211 Advance<capture_raw>(); 212 } 213 214 return x; 215 } 216 217 template <bool capture_raw> 218 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) { 219 uc32 x = 0; 220 int d = HexValue(c0_); 221 if (d < 0) return -1; 222 223 while (d >= 0) { 224 x = x * 16 + d; 225 if (x > max_value) { 226 ReportScannerError(Location(beg_pos, source_pos() + 1), 227 MessageTemplate::kUndefinedUnicodeCodePoint); 228 return -1; 229 } 230 Advance<capture_raw>(); 231 d = HexValue(c0_); 232 } 233 234 return x; 235 } 236 237 238 // Ensure that tokens can be stored in a byte. 239 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 240 241 // Table of one-character tokens, by character (0x00..0x7F only). 242 // clang-format off 243 static const byte one_char_tokens[] = { 244 Token::ILLEGAL, 245 Token::ILLEGAL, 246 Token::ILLEGAL, 247 Token::ILLEGAL, 248 Token::ILLEGAL, 249 Token::ILLEGAL, 250 Token::ILLEGAL, 251 Token::ILLEGAL, 252 Token::ILLEGAL, 253 Token::ILLEGAL, 254 Token::ILLEGAL, 255 Token::ILLEGAL, 256 Token::ILLEGAL, 257 Token::ILLEGAL, 258 Token::ILLEGAL, 259 Token::ILLEGAL, 260 Token::ILLEGAL, 261 Token::ILLEGAL, 262 Token::ILLEGAL, 263 Token::ILLEGAL, 264 Token::ILLEGAL, 265 Token::ILLEGAL, 266 Token::ILLEGAL, 267 Token::ILLEGAL, 268 Token::ILLEGAL, 269 Token::ILLEGAL, 270 Token::ILLEGAL, 271 Token::ILLEGAL, 272 Token::ILLEGAL, 273 Token::ILLEGAL, 274 Token::ILLEGAL, 275 Token::ILLEGAL, 276 Token::ILLEGAL, 277 Token::ILLEGAL, 278 Token::ILLEGAL, 279 Token::ILLEGAL, 280 Token::ILLEGAL, 281 Token::ILLEGAL, 282 Token::ILLEGAL, 283 Token::ILLEGAL, 284 Token::LPAREN, // 0x28 285 Token::RPAREN, // 0x29 286 Token::ILLEGAL, 287 Token::ILLEGAL, 288 Token::COMMA, // 0x2C 289 Token::ILLEGAL, 290 Token::ILLEGAL, 291 Token::ILLEGAL, 292 Token::ILLEGAL, 293 Token::ILLEGAL, 294 Token::ILLEGAL, 295 Token::ILLEGAL, 296 Token::ILLEGAL, 297 
Token::ILLEGAL, 298 Token::ILLEGAL, 299 Token::ILLEGAL, 300 Token::ILLEGAL, 301 Token::ILLEGAL, 302 Token::COLON, // 0x3A 303 Token::SEMICOLON, // 0x3B 304 Token::ILLEGAL, 305 Token::ILLEGAL, 306 Token::ILLEGAL, 307 Token::CONDITIONAL, // 0x3F 308 Token::ILLEGAL, 309 Token::ILLEGAL, 310 Token::ILLEGAL, 311 Token::ILLEGAL, 312 Token::ILLEGAL, 313 Token::ILLEGAL, 314 Token::ILLEGAL, 315 Token::ILLEGAL, 316 Token::ILLEGAL, 317 Token::ILLEGAL, 318 Token::ILLEGAL, 319 Token::ILLEGAL, 320 Token::ILLEGAL, 321 Token::ILLEGAL, 322 Token::ILLEGAL, 323 Token::ILLEGAL, 324 Token::ILLEGAL, 325 Token::ILLEGAL, 326 Token::ILLEGAL, 327 Token::ILLEGAL, 328 Token::ILLEGAL, 329 Token::ILLEGAL, 330 Token::ILLEGAL, 331 Token::ILLEGAL, 332 Token::ILLEGAL, 333 Token::ILLEGAL, 334 Token::ILLEGAL, 335 Token::LBRACK, // 0x5B 336 Token::ILLEGAL, 337 Token::RBRACK, // 0x5D 338 Token::ILLEGAL, 339 Token::ILLEGAL, 340 Token::ILLEGAL, 341 Token::ILLEGAL, 342 Token::ILLEGAL, 343 Token::ILLEGAL, 344 Token::ILLEGAL, 345 Token::ILLEGAL, 346 Token::ILLEGAL, 347 Token::ILLEGAL, 348 Token::ILLEGAL, 349 Token::ILLEGAL, 350 Token::ILLEGAL, 351 Token::ILLEGAL, 352 Token::ILLEGAL, 353 Token::ILLEGAL, 354 Token::ILLEGAL, 355 Token::ILLEGAL, 356 Token::ILLEGAL, 357 Token::ILLEGAL, 358 Token::ILLEGAL, 359 Token::ILLEGAL, 360 Token::ILLEGAL, 361 Token::ILLEGAL, 362 Token::ILLEGAL, 363 Token::ILLEGAL, 364 Token::ILLEGAL, 365 Token::ILLEGAL, 366 Token::ILLEGAL, 367 Token::LBRACE, // 0x7B 368 Token::ILLEGAL, 369 Token::RBRACE, // 0x7D 370 Token::BIT_NOT, // 0x7E 371 Token::ILLEGAL 372 }; 373 // clang-format on 374 375 Token::Value Scanner::Next() { 376 if (next().token == Token::EOS) next().location = current().location; 377 // Rotate through tokens. 378 TokenDesc* previous = current_; 379 current_ = next_; 380 // Either we already have the next token lined up, in which case next_next_ 381 // simply becomes next_. 
In that case we use current_ as new next_next_ and 382 // clear its token to indicate that it wasn't scanned yet. Otherwise we use 383 // current_ as next_ and scan into it, leaving next_next_ uninitialized. 384 if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) { 385 next_ = previous; 386 next().after_line_terminator = false; 387 Scan(); 388 } else { 389 next_ = next_next_; 390 next_next_ = previous; 391 previous->token = Token::UNINITIALIZED; 392 previous->contextual_token = Token::UNINITIALIZED; 393 DCHECK_NE(Token::UNINITIALIZED, current().token); 394 } 395 return current().token; 396 } 397 398 399 Token::Value Scanner::PeekAhead() { 400 DCHECK(next().token != Token::DIV); 401 DCHECK(next().token != Token::ASSIGN_DIV); 402 403 if (next_next().token != Token::UNINITIALIZED) { 404 return next_next().token; 405 } 406 TokenDesc* temp = next_; 407 next_ = next_next_; 408 next().after_line_terminator = false; 409 Scan(); 410 next_next_ = next_; 411 next_ = temp; 412 return next_next().token; 413 } 414 415 Token::Value Scanner::SkipSingleHTMLComment() { 416 if (is_module_) { 417 ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule); 418 return Token::ILLEGAL; 419 } 420 return SkipSingleLineComment(); 421 } 422 423 Token::Value Scanner::SkipSingleLineComment() { 424 // The line terminator at the end of the line is not considered 425 // to be part of the single-line comment; it is recognized 426 // separately by the lexical grammar and becomes part of the 427 // stream of input elements for the syntactic grammar (see 428 // ECMA-262, section 7.4). 
// Advance until the predicate reports a line terminator.
// NOTE(review): AdvanceUntil is defined elsewhere; presumably it also stops
// at end of input -- confirm.
  AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });

  return Token::WHITESPACE;
}

// Handles a "//#" or "//@" comment: first tries to interpret it as a
// sourceURL / sourceMappingURL magic comment, then skips whatever is left of
// the line. Always yields WHITESPACE.
Token::Value Scanner::SkipSourceURLComment() {
  TryToParseSourceURLComment();
  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
    Advance();
  }

  return Token::WHITESPACE;
}

// Parses "<ws><name>=<ws>*<value>" from the current position. When <name> is
// "sourceURL" or "sourceMappingURL", <value> is stored in source_url_ or
// source_mapping_url_ respectively; otherwise the function returns early and
// leaves both untouched.
void Scanner::TryToParseSourceURLComment() {
  // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
  // function will just return if it cannot parse a magic comment.
  DCHECK(!unicode_cache_->IsWhiteSpaceOrLineTerminator(kEndOfInput));
  if (!unicode_cache_->IsWhiteSpace(c0_)) return;
  Advance();
  LiteralBuffer name;
  name.Start();

  // Collect the name: everything up to whitespace, a line terminator, '=' or
  // end of input.
  while (c0_ != kEndOfInput &&
         !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
    name.AddChar(c0_);
    Advance();
  }
  // Both recognized names are one-byte strings, so a two-byte name cannot
  // match.
  if (!name.is_one_byte()) return;
  Vector<const uint8_t> name_literal = name.one_byte_literal();
  LiteralBuffer* value;
  if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
    value = &source_url_;
  } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
    value = &source_mapping_url_;
  } else {
    return;
  }
  if (c0_ != '=')
    return;
  // From here on any previously stored value is discarded, even if parsing
  // the new value fails below.
  value->Drop();
  value->Start();
  Advance();
  // Skip whitespace between '=' and the value.
  while (unicode_cache_->IsWhiteSpace(c0_)) {
    Advance();
  }
  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
    // Disallowed characters.
    if (c0_ == '"' || c0_ == '\'') {
      value->Drop();
      return;
    }
    // The value ends at the first whitespace character.
    if (unicode_cache_->IsWhiteSpace(c0_)) {
      break;
    }
    value->AddChar(c0_);
    Advance();
  }
  // Allow whitespace at the end.
488 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) { 489 if (!unicode_cache_->IsWhiteSpace(c0_)) { 490 value->Drop(); 491 break; 492 } 493 Advance(); 494 } 495 } 496 497 Token::Value Scanner::SkipMultiLineComment() { 498 DCHECK_EQ(c0_, '*'); 499 Advance(); 500 501 while (c0_ != kEndOfInput) { 502 DCHECK(!unibrow::IsLineTerminator(kEndOfInput)); 503 if (!HasLineTerminatorBeforeNext() && unibrow::IsLineTerminator(c0_)) { 504 // Following ECMA-262, section 7.4, a comment containing 505 // a newline will make the comment count as a line-terminator. 506 next().after_line_terminator = true; 507 } 508 509 while (V8_UNLIKELY(c0_ == '*')) { 510 Advance(); 511 if (c0_ == '/') { 512 Advance(); 513 return Token::WHITESPACE; 514 } 515 } 516 Advance(); 517 } 518 519 // Unterminated multi-line comment. 520 return Token::ILLEGAL; 521 } 522 523 Token::Value Scanner::ScanHtmlComment() { 524 // Check for <!-- comments. 525 DCHECK_EQ(c0_, '!'); 526 Advance(); 527 if (c0_ != '-' || Peek() != '-') { 528 PushBack('!'); // undo Advance() 529 return Token::LT; 530 } 531 Advance(); 532 533 found_html_comment_ = true; 534 return SkipSingleHTMLComment(); 535 } 536 537 void Scanner::Scan() { 538 next().literal_chars.Drop(); 539 next().raw_literal_chars.Drop(); 540 next().invalid_template_escape_message = MessageTemplate::kNone; 541 542 Token::Value token; 543 do { 544 if (static_cast<unsigned>(c0_) <= 0x7F) { 545 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 546 if (token != Token::ILLEGAL) { 547 int pos = source_pos(); 548 next().token = token; 549 next().contextual_token = Token::UNINITIALIZED; 550 next().location.beg_pos = pos; 551 next().location.end_pos = pos + 1; 552 Advance(); 553 return; 554 } 555 } 556 557 // Remember the position of the next token 558 next().location.beg_pos = source_pos(); 559 560 switch (c0_) { 561 case '"': 562 case '\'': 563 token = ScanString(); 564 break; 565 566 case '<': 567 // < <= << <<= <!-- 568 Advance(); 569 if (c0_ 
== '=') { 570 token = Select(Token::LTE); 571 } else if (c0_ == '<') { 572 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 573 } else if (c0_ == '!') { 574 token = ScanHtmlComment(); 575 } else { 576 token = Token::LT; 577 } 578 break; 579 580 case '>': 581 // > >= >> >>= >>> >>>= 582 Advance(); 583 if (c0_ == '=') { 584 token = Select(Token::GTE); 585 } else if (c0_ == '>') { 586 // >> >>= >>> >>>= 587 Advance(); 588 if (c0_ == '=') { 589 token = Select(Token::ASSIGN_SAR); 590 } else if (c0_ == '>') { 591 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 592 } else { 593 token = Token::SAR; 594 } 595 } else { 596 token = Token::GT; 597 } 598 break; 599 600 case '=': 601 // = == === => 602 Advance(); 603 if (c0_ == '=') { 604 token = Select('=', Token::EQ_STRICT, Token::EQ); 605 } else if (c0_ == '>') { 606 token = Select(Token::ARROW); 607 } else { 608 token = Token::ASSIGN; 609 } 610 break; 611 612 case '!': 613 // ! != !== 614 Advance(); 615 if (c0_ == '=') { 616 token = Select('=', Token::NE_STRICT, Token::NE); 617 } else { 618 token = Token::NOT; 619 } 620 break; 621 622 case '+': 623 // + ++ += 624 Advance(); 625 if (c0_ == '+') { 626 token = Select(Token::INC); 627 } else if (c0_ == '=') { 628 token = Select(Token::ASSIGN_ADD); 629 } else { 630 token = Token::ADD; 631 } 632 break; 633 634 case '-': 635 // - -- --> -= 636 Advance(); 637 if (c0_ == '-') { 638 Advance(); 639 if (c0_ == '>' && HasLineTerminatorBeforeNext()) { 640 // For compatibility with SpiderMonkey, we skip lines that 641 // start with an HTML comment end '-->'. 
// SkipSingleHTMLComment() reports an error instead when scanning a module.
            token = SkipSingleHTMLComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *= ** **=
        Advance();
        if (c0_ == '*') {
          token = Select('=', Token::ASSIGN_EXP, Token::EXP);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_MUL);
        } else {
          token = Token::MUL;
        }
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          uc32 c = Peek();
          if (c == '#' || c == '@') {
            // "//#" or "//@": consume the second '/' and the marker, then
            // look for a sourceURL / sourceMappingURL magic comment.
            Advance();
            Advance();
            token = SkipSourceURLComment();
          } else {
            token = SkipSingleLineComment();
          }
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number ...
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
          // Only a full "..." becomes ELLIPSIS; for ".." nothing beyond the
          // first '.' is consumed here.
          if (c0_ == '.') {
            if (Peek() == '.') {
              Advance();
              Advance();
              token = Token::ELLIPSIS;
            }
          }
        }
        break;

      case '`':
        token = ScanTemplateStart();
        break;

      case '#':
        token = ScanPrivateName();
        break;

      default:
        if (unicode_cache_->IsIdentifierStart(c0_) ||
            (CombineSurrogatePair() &&
             unicode_cache_->IsIdentifierStart(c0_))) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (c0_ == kEndOfInput) {
          token = Token::EOS;
        } else {
          token = SkipWhiteSpace();
          // Step over the offending character so that scanning can go on
          // after the ILLEGAL token.
          if (token == Token::ILLEGAL) Advance();
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next().location.end_pos = source_pos();
  // Contextual keywords are surfaced as IDENTIFIER, with the keyword kept in
  // contextual_token.
  if (Token::IsContextualKeyword(token)) {
    next().token = Token::IDENTIFIER;
    next().contextual_token = token;
  } else {
    next().token = token;
    next().contextual_token = Token::UNINITIALIZED;
  }

#ifdef DEBUG
  SanityCheckTokenDesc(current());
  SanityCheckTokenDesc(next());
  SanityCheckTokenDesc(next_next());
#endif
}

#ifdef DEBUG
// Debug-only consistency check of a token descriptor: verifies that the
// literal buffers and the template-escape message are populated exactly for
// the token kinds that are supposed to carry them.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Most tokens should not have literal_chars or even raw_literal chars.
  // The rules are:
  // - UNINITIALIZED: we don't care.
  // - TEMPLATE_*: need both literal + raw literal chars.
  // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
  // - all others: should have neither.
  // Furthermore, only TEMPLATE_* tokens can have a
  // invalid_template_escape_message.

  switch (token.token) {
    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
// Nothing is checked for an unscanned descriptor.
      break;
    case Token::TEMPLATE_SPAN:
    case Token::TEMPLATE_TAIL:
      // Template tokens carry both the cooked and the raw literal.
      DCHECK(token.raw_literal_chars.is_used());
      DCHECK(token.literal_chars.is_used());
      break;
    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::IDENTIFIER:
    case Token::NUMBER:
    case Token::BIGINT:
    case Token::REGEXP_LITERAL:
    case Token::SMI:
    case Token::STRING:
    case Token::PRIVATE_NAME:
      // Literal-bearing tokens: a literal, but no raw literal and no
      // template-escape error.
      DCHECK(token.literal_chars.is_used());
      DCHECK(!token.raw_literal_chars.is_used());
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
      break;
    default:
      // Everything else carries no literal data at all.
      DCHECK(!token.literal_chars.is_used());
      DCHECK(!token.raw_literal_chars.is_used());
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
      break;
  }

  // contextual_token is only ever set on IDENTIFIER tokens, always holds a
  // contextual keyword when set, and contextual keywords never appear as the
  // primary token.
  DCHECK_IMPLIES(token.token != Token::IDENTIFIER,
                 token.contextual_token == Token::UNINITIALIZED);
  DCHECK_IMPLIES(token.contextual_token != Token::UNINITIALIZED,
                 token.token == Token::IDENTIFIER &&
                     Token::IsContextualKeyword(token.contextual_token));
  DCHECK(!Token::IsContextualKeyword(token.token));
}
#endif  // DEBUG

// Repositions the scanner so that the token starting at |pos| becomes the
// "next" token. |pos| must not lie inside the current lookahead token.
void Scanner::SeekForward(int pos) {
  // After this call, we will have the token at the given position as
  // the "next" token. The "current" token will be invalid.
  if (pos == next().location.beg_pos) return;
  int current_pos = source_pos();
  DCHECK_EQ(next().location.end_pos, current_pos);
  // Positions inside the lookahead token aren't supported.
  DCHECK(pos >= current_pos);
  if (pos != current_pos) {
    source_->Seek(pos);
    Advance();
    // This function is only called to seek to the location
    // of the end of a function (at the "}" token). It doesn't matter
    // whether there was a line terminator in the part we skip.
// Treat the skipped range as containing no line terminator.
    next().after_line_terminator = false;
  }
  Scan();
}

// Scans the part of an escape sequence that follows the backslash (the
// backslash itself has already been consumed by the caller) and appends the
// resulting character to the current literal. Returns false if a \u or \x
// escape is malformed. With capture_raw == false, an escaped line terminator
// (including CR+LF) is consumed and contributes nothing to the literal.
template <bool capture_raw>
bool Scanner::ScanEscape() {
  uc32 c = c0_;
  Advance<capture_raw>();

  // Skip escaped newlines.
  DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
  if (!capture_raw && unibrow::IsLineTerminator(c)) {
    // Allow escaped CR+LF newlines in multiline string literals.
    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
    return true;
  }

  switch (c) {
    case '\'':  // fall through
    case '"' :  // fall through
    case '\\': break;
    case 'b' : c = '\b'; break;
    case 'f' : c = '\f'; break;
    case 'n' : c = '\n'; break;
    case 'r' : c = '\r'; break;
    case 't' : c = '\t'; break;
    case 'u' : {
      // A negative result signals a malformed unicode escape.
      c = ScanUnicodeEscape<capture_raw>();
      if (c < 0) return false;
      break;
    }
    case 'v':
      c = '\v';
      break;
    case 'x': {
      // \x must be followed by exactly two hex digits.
      c = ScanHexNumber<capture_raw>(2);
      if (c < 0) return false;
      break;
    }
    case '0':  // Fall through.
    case '1':  // fall through
    case '2':  // fall through
    case '3':  // fall through
    case '4':  // fall through
    case '5':  // fall through
    case '6':  // fall through
    case '7':
      // The first octal digit is already in c; allow up to two more.
      c = ScanOctalEscape<capture_raw>(c, 2);
      break;
  }

  // Other escaped characters are interpreted as their non-escaped version.
  AddLiteralChar(c);
  return true;
}

// Finishes an octal escape whose first digit |c| has already been consumed:
// reads up to |length| further octal digits, stopping early at a non-octal
// character or when the value would reach 256, and returns the value.
template <bool capture_raw>
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
  uc32 x = c - '0';
  int i = 0;
  for (; i < length; i++) {
    int d = c0_ - '0';
    if (d < 0 || d > 7) break;
    int nx = x * 8 + d;
    if (nx >= 256) break;
    x = nx;
    Advance<capture_raw>();
  }
  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
  // Remember the position of octal escape sequences so that an error
  // can be reported later (in strict mode).
917 // We don't report the error immediately, because the octal escape can 918 // occur before the "use strict" directive. 919 if (c != '0' || i > 0 || c0_ == '8' || c0_ == '9') { 920 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 921 octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral 922 : MessageTemplate::kStrictOctalEscape; 923 } 924 return x; 925 } 926 927 Token::Value Scanner::ScanString() { 928 uc32 quote = c0_; 929 Advance(); // consume quote 930 931 LiteralScope literal(this); 932 while (true) { 933 if (c0_ == quote) { 934 literal.Complete(); 935 Advance(); 936 return Token::STRING; 937 } 938 if (c0_ == kEndOfInput || unibrow::IsStringLiteralLineTerminator(c0_)) { 939 return Token::ILLEGAL; 940 } 941 if (c0_ == '\\') { 942 Advance(); 943 // TODO(verwaest): Check whether we can remove the additional check. 944 if (c0_ == kEndOfInput || !ScanEscape<false>()) { 945 return Token::ILLEGAL; 946 } 947 continue; 948 } 949 AddLiteralCharAdvance(); 950 } 951 } 952 953 Token::Value Scanner::ScanPrivateName() { 954 if (!allow_harmony_private_fields()) { 955 ReportScannerError(source_pos(), 956 MessageTemplate::kInvalidOrUnexpectedToken); 957 return Token::ILLEGAL; 958 } 959 960 LiteralScope literal(this); 961 DCHECK_EQ(c0_, '#'); 962 DCHECK(!unicode_cache_->IsIdentifierStart(kEndOfInput)); 963 if (!unicode_cache_->IsIdentifierStart(Peek())) { 964 ReportScannerError(source_pos(), 965 MessageTemplate::kInvalidOrUnexpectedToken); 966 return Token::ILLEGAL; 967 } 968 969 AddLiteralCharAdvance(); 970 Token::Value token = ScanIdentifierOrKeywordInner(&literal); 971 return token == Token::ILLEGAL ? 
Token::ILLEGAL : Token::PRIVATE_NAME; 972 } 973 974 Token::Value Scanner::ScanTemplateSpan() { 975 // When scanning a TemplateSpan, we are looking for the following construct: 976 // TEMPLATE_SPAN :: 977 // ` LiteralChars* ${ 978 // | } LiteralChars* ${ 979 // 980 // TEMPLATE_TAIL :: 981 // ` LiteralChars* ` 982 // | } LiteralChar* ` 983 // 984 // A TEMPLATE_SPAN should always be followed by an Expression, while a 985 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be 986 // followed by an Expression. 987 988 // These scoped helpers save and restore the original error state, so that we 989 // can specially treat invalid escape sequences in templates (which are 990 // handled by the parser). 991 ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_); 992 ErrorState octal_error_state(&octal_message_, &octal_pos_); 993 994 Token::Value result = Token::TEMPLATE_SPAN; 995 LiteralScope literal(this); 996 StartRawLiteral(); 997 const bool capture_raw = true; 998 while (true) { 999 uc32 c = c0_; 1000 if (c == '`') { 1001 Advance(); // Consume '`' 1002 result = Token::TEMPLATE_TAIL; 1003 break; 1004 } else if (c == '$' && Peek() == '{') { 1005 Advance(); // Consume '$' 1006 Advance(); // Consume '{' 1007 break; 1008 } else if (c == '\\') { 1009 Advance(); // Consume '\\' 1010 DCHECK(!unibrow::IsLineTerminator(kEndOfInput)); 1011 if (capture_raw) AddRawLiteralChar('\\'); 1012 if (unibrow::IsLineTerminator(c0_)) { 1013 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty 1014 // code unit sequence. 1015 uc32 lastChar = c0_; 1016 Advance(); 1017 if (lastChar == '\r') { 1018 // Also skip \n. 1019 if (c0_ == '\n') Advance(); 1020 lastChar = '\n'; 1021 } 1022 if (capture_raw) AddRawLiteralChar(lastChar); 1023 } else { 1024 bool success = ScanEscape<capture_raw>(); 1025 USE(success); 1026 DCHECK_EQ(!success, has_error()); 1027 // For templates, invalid escape sequence checking is handled in the 1028 // parser. 
1029 scanner_error_state.MoveErrorTo(next_); 1030 octal_error_state.MoveErrorTo(next_); 1031 } 1032 } else if (c < 0) { 1033 // Unterminated template literal 1034 break; 1035 } else { 1036 Advance(); // Consume c. 1037 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A. 1038 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence 1039 // consisting of the CV 0x000A. 1040 if (c == '\r') { 1041 if (c0_ == '\n') Advance(); // Consume '\n' 1042 c = '\n'; 1043 } 1044 if (capture_raw) AddRawLiteralChar(c); 1045 AddLiteralChar(c); 1046 } 1047 } 1048 literal.Complete(); 1049 next().location.end_pos = source_pos(); 1050 next().token = result; 1051 next().contextual_token = Token::UNINITIALIZED; 1052 1053 return result; 1054 } 1055 1056 Token::Value Scanner::ScanTemplateStart() { 1057 DCHECK_EQ(next_next().token, Token::UNINITIALIZED); 1058 DCHECK_EQ(c0_, '`'); 1059 next().location.beg_pos = source_pos(); 1060 Advance(); // Consume ` 1061 return ScanTemplateSpan(); 1062 } 1063 1064 Handle<String> Scanner::SourceUrl(Isolate* isolate) const { 1065 Handle<String> tmp; 1066 if (source_url_.length() > 0) { 1067 DCHECK(source_url_.is_used()); 1068 tmp = source_url_.Internalize(isolate); 1069 } 1070 return tmp; 1071 } 1072 1073 Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const { 1074 Handle<String> tmp; 1075 if (source_mapping_url_.length() > 0) { 1076 DCHECK(source_mapping_url_.is_used()); 1077 tmp = source_mapping_url_.Internalize(isolate); 1078 } 1079 return tmp; 1080 } 1081 1082 bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch), 1083 bool is_check_first_digit) { 1084 // we must have at least one digit after 'x'/'b'/'o' 1085 if (is_check_first_digit && !predicate(c0_)) return false; 1086 1087 bool separator_seen = false; 1088 while (predicate(c0_) || c0_ == '_') { 1089 if (c0_ == '_') { 1090 Advance(); 1091 if (c0_ == '_') { 1092 ReportScannerError(Location(source_pos(), source_pos() + 1), 1093 
MessageTemplate::kContinuousNumericSeparator); 1094 return false; 1095 } 1096 separator_seen = true; 1097 continue; 1098 } 1099 separator_seen = false; 1100 AddLiteralCharAdvance(); 1101 } 1102 1103 if (separator_seen) { 1104 ReportScannerError(Location(source_pos(), source_pos() + 1), 1105 MessageTemplate::kTrailingNumericSeparator); 1106 return false; 1107 } 1108 1109 return true; 1110 } 1111 1112 bool Scanner::ScanDecimalDigits() { 1113 if (allow_harmony_numeric_separator()) { 1114 return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false); 1115 } 1116 while (IsDecimalDigit(c0_)) { 1117 AddLiteralCharAdvance(); 1118 } 1119 return true; 1120 } 1121 1122 bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) { 1123 bool separator_seen = false; 1124 while (IsDecimalDigit(c0_) || c0_ == '_') { 1125 if (c0_ == '_') { 1126 Advance(); 1127 if (c0_ == '_') { 1128 ReportScannerError(Location(source_pos(), source_pos() + 1), 1129 MessageTemplate::kContinuousNumericSeparator); 1130 return false; 1131 } 1132 separator_seen = true; 1133 continue; 1134 } 1135 separator_seen = false; 1136 *value = 10 * *value + (c0_ - '0'); 1137 uc32 first_char = c0_; 1138 Advance(); 1139 AddLiteralChar(first_char); 1140 } 1141 1142 if (separator_seen) { 1143 ReportScannerError(Location(source_pos(), source_pos() + 1), 1144 MessageTemplate::kTrailingNumericSeparator); 1145 return false; 1146 } 1147 1148 return true; 1149 } 1150 1151 bool Scanner::ScanDecimalAsSmi(uint64_t* value) { 1152 if (allow_harmony_numeric_separator()) { 1153 return ScanDecimalAsSmiWithNumericSeparators(value); 1154 } 1155 1156 while (IsDecimalDigit(c0_)) { 1157 *value = 10 * *value + (c0_ - '0'); 1158 uc32 first_char = c0_; 1159 Advance(); 1160 AddLiteralChar(first_char); 1161 } 1162 return true; 1163 } 1164 1165 bool Scanner::ScanBinaryDigits() { 1166 if (allow_harmony_numeric_separator()) { 1167 return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true); 1168 } 1169 1170 // we must have at least 
one binary digit after 'b'/'B' 1171 if (!IsBinaryDigit(c0_)) { 1172 return false; 1173 } 1174 1175 while (IsBinaryDigit(c0_)) { 1176 AddLiteralCharAdvance(); 1177 } 1178 return true; 1179 } 1180 1181 bool Scanner::ScanOctalDigits() { 1182 if (allow_harmony_numeric_separator()) { 1183 return ScanDigitsWithNumericSeparators(&IsOctalDigit, true); 1184 } 1185 1186 // we must have at least one octal digit after 'o'/'O' 1187 if (!IsOctalDigit(c0_)) { 1188 return false; 1189 } 1190 1191 while (IsOctalDigit(c0_)) { 1192 AddLiteralCharAdvance(); 1193 } 1194 return true; 1195 } 1196 1197 bool Scanner::ScanImplicitOctalDigits(int start_pos, 1198 Scanner::NumberKind* kind) { 1199 *kind = IMPLICIT_OCTAL; 1200 1201 while (true) { 1202 // (possible) octal number 1203 if (c0_ == '8' || c0_ == '9') { 1204 *kind = DECIMAL_WITH_LEADING_ZERO; 1205 return true; 1206 } 1207 if (c0_ < '0' || '7' < c0_) { 1208 // Octal literal finished. 1209 octal_pos_ = Location(start_pos, source_pos()); 1210 octal_message_ = MessageTemplate::kStrictOctalLiteral; 1211 return true; 1212 } 1213 AddLiteralCharAdvance(); 1214 } 1215 } 1216 1217 bool Scanner::ScanHexDigits() { 1218 if (allow_harmony_numeric_separator()) { 1219 return ScanDigitsWithNumericSeparators(&IsHexDigit, true); 1220 } 1221 1222 // we must have at least one hex digit after 'x'/'X' 1223 if (!IsHexDigit(c0_)) { 1224 return false; 1225 } 1226 1227 while (IsHexDigit(c0_)) { 1228 AddLiteralCharAdvance(); 1229 } 1230 return true; 1231 } 1232 1233 bool Scanner::ScanSignedInteger() { 1234 if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance(); 1235 // we must have at least one decimal digit after 'e'/'E' 1236 if (!IsDecimalDigit(c0_)) return false; 1237 return ScanDecimalDigits(); 1238 } 1239 1240 Token::Value Scanner::ScanNumber(bool seen_period) { 1241 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 1242 1243 NumberKind kind = DECIMAL; 1244 1245 LiteralScope literal(this); 1246 bool at_start = !seen_period; 1247 
int start_pos = source_pos(); // For reporting octal positions. 1248 if (seen_period) { 1249 // we have already seen a decimal point of the float 1250 AddLiteralChar('.'); 1251 if (allow_harmony_numeric_separator() && c0_ == '_') { 1252 return Token::ILLEGAL; 1253 } 1254 // we know we have at least one digit 1255 if (!ScanDecimalDigits()) return Token::ILLEGAL; 1256 } else { 1257 // if the first character is '0' we must check for octals and hex 1258 if (c0_ == '0') { 1259 AddLiteralCharAdvance(); 1260 1261 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or 1262 // an octal number. 1263 if (c0_ == 'x' || c0_ == 'X') { 1264 AddLiteralCharAdvance(); 1265 kind = HEX; 1266 if (!ScanHexDigits()) return Token::ILLEGAL; 1267 } else if (c0_ == 'o' || c0_ == 'O') { 1268 AddLiteralCharAdvance(); 1269 kind = OCTAL; 1270 if (!ScanOctalDigits()) return Token::ILLEGAL; 1271 } else if (c0_ == 'b' || c0_ == 'B') { 1272 AddLiteralCharAdvance(); 1273 kind = BINARY; 1274 if (!ScanBinaryDigits()) return Token::ILLEGAL; 1275 } else if ('0' <= c0_ && c0_ <= '7') { 1276 kind = IMPLICIT_OCTAL; 1277 if (!ScanImplicitOctalDigits(start_pos, &kind)) { 1278 return Token::ILLEGAL; 1279 } 1280 if (kind == DECIMAL_WITH_LEADING_ZERO) { 1281 at_start = false; 1282 } 1283 } else if (c0_ == '8' || c0_ == '9') { 1284 kind = DECIMAL_WITH_LEADING_ZERO; 1285 } else if (allow_harmony_numeric_separator() && c0_ == '_') { 1286 ReportScannerError(Location(source_pos(), source_pos() + 1), 1287 MessageTemplate::kZeroDigitNumericSeparator); 1288 return Token::ILLEGAL; 1289 } 1290 } 1291 1292 // Parse decimal digits and allow trailing fractional part. 1293 if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) { 1294 // This is an optimization for parsing Decimal numbers as Smi's. 
1295 if (at_start) { 1296 uint64_t value = 0; 1297 // scan subsequent decimal digits 1298 if (!ScanDecimalAsSmi(&value)) { 1299 return Token::ILLEGAL; 1300 } 1301 1302 if (next().literal_chars.one_byte_literal().length() <= 10 && 1303 value <= Smi::kMaxValue && c0_ != '.' && 1304 !unicode_cache_->IsIdentifierStart(c0_)) { 1305 next().smi_value_ = static_cast<uint32_t>(value); 1306 literal.Complete(); 1307 1308 if (kind == DECIMAL_WITH_LEADING_ZERO) { 1309 octal_pos_ = Location(start_pos, source_pos()); 1310 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero; 1311 } 1312 return Token::SMI; 1313 } 1314 } 1315 1316 if (!ScanDecimalDigits()) return Token::ILLEGAL; 1317 if (c0_ == '.') { 1318 seen_period = true; 1319 AddLiteralCharAdvance(); 1320 if (allow_harmony_numeric_separator() && c0_ == '_') { 1321 return Token::ILLEGAL; 1322 } 1323 if (!ScanDecimalDigits()) return Token::ILLEGAL; 1324 } 1325 } 1326 } 1327 1328 bool is_bigint = false; 1329 if (allow_harmony_bigint() && c0_ == 'n' && !seen_period && 1330 (kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY)) { 1331 // Check that the literal is within our limits for BigInt length. 1332 // For simplicity, use 4 bits per character to calculate the maximum 1333 // allowed literal length. 1334 static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4; 1335 int length = source_pos() - start_pos - (kind != DECIMAL ? 
2 : 0); 1336 if (length > kMaxBigIntCharacters) { 1337 ReportScannerError(Location(start_pos, source_pos()), 1338 MessageTemplate::kBigIntTooBig); 1339 return Token::ILLEGAL; 1340 } 1341 1342 is_bigint = true; 1343 Advance(); 1344 } else if (c0_ == 'e' || c0_ == 'E') { 1345 // scan exponent, if any 1346 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 1347 1348 if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO)) 1349 return Token::ILLEGAL; 1350 1351 // scan exponent 1352 AddLiteralCharAdvance(); 1353 1354 if (!ScanSignedInteger()) return Token::ILLEGAL; 1355 } 1356 1357 // The source character immediately following a numeric literal must 1358 // not be an identifier start or a decimal digit; see ECMA-262 1359 // section 7.8.3, page 17 (note that we read only one decimal digit 1360 // if the value is 0). 1361 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) { 1362 return Token::ILLEGAL; 1363 } 1364 1365 literal.Complete(); 1366 1367 if (kind == DECIMAL_WITH_LEADING_ZERO) { 1368 octal_pos_ = Location(start_pos, source_pos()); 1369 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero; 1370 } 1371 1372 return is_bigint ? Token::BIGINT : Token::NUMBER; 1373 } 1374 1375 uc32 Scanner::ScanIdentifierUnicodeEscape() { 1376 Advance(); 1377 if (c0_ != 'u') return -1; 1378 Advance(); 1379 return ScanUnicodeEscape<false>(); 1380 } 1381 1382 template <bool capture_raw> 1383 uc32 Scanner::ScanUnicodeEscape() { 1384 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of 1385 // hex digits between { } is arbitrary. \ and u have already been read. 
1386 if (c0_ == '{') { 1387 int begin = source_pos() - 2; 1388 Advance<capture_raw>(); 1389 uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin); 1390 if (cp < 0 || c0_ != '}') { 1391 ReportScannerError(source_pos(), 1392 MessageTemplate::kInvalidUnicodeEscapeSequence); 1393 return -1; 1394 } 1395 Advance<capture_raw>(); 1396 return cp; 1397 } 1398 const bool unicode = true; 1399 return ScanHexNumber<capture_raw, unicode>(4); 1400 } 1401 1402 1403 // ---------------------------------------------------------------------------- 1404 // Keyword Matcher 1405 1406 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 1407 KEYWORD_GROUP('a') \ 1408 KEYWORD("arguments", Token::ARGUMENTS) \ 1409 KEYWORD("as", Token::AS) \ 1410 KEYWORD("async", Token::ASYNC) \ 1411 KEYWORD("await", Token::AWAIT) \ 1412 KEYWORD("anonymous", Token::ANONYMOUS) \ 1413 KEYWORD_GROUP('b') \ 1414 KEYWORD("break", Token::BREAK) \ 1415 KEYWORD_GROUP('c') \ 1416 KEYWORD("case", Token::CASE) \ 1417 KEYWORD("catch", Token::CATCH) \ 1418 KEYWORD("class", Token::CLASS) \ 1419 KEYWORD("const", Token::CONST) \ 1420 KEYWORD("constructor", Token::CONSTRUCTOR) \ 1421 KEYWORD("continue", Token::CONTINUE) \ 1422 KEYWORD_GROUP('d') \ 1423 KEYWORD("debugger", Token::DEBUGGER) \ 1424 KEYWORD("default", Token::DEFAULT) \ 1425 KEYWORD("delete", Token::DELETE) \ 1426 KEYWORD("do", Token::DO) \ 1427 KEYWORD_GROUP('e') \ 1428 KEYWORD("else", Token::ELSE) \ 1429 KEYWORD("enum", Token::ENUM) \ 1430 KEYWORD("eval", Token::EVAL) \ 1431 KEYWORD("export", Token::EXPORT) \ 1432 KEYWORD("extends", Token::EXTENDS) \ 1433 KEYWORD_GROUP('f') \ 1434 KEYWORD("false", Token::FALSE_LITERAL) \ 1435 KEYWORD("finally", Token::FINALLY) \ 1436 KEYWORD("for", Token::FOR) \ 1437 KEYWORD("from", Token::FROM) \ 1438 KEYWORD("function", Token::FUNCTION) \ 1439 KEYWORD_GROUP('g') \ 1440 KEYWORD("get", Token::GET) \ 1441 KEYWORD_GROUP('i') \ 1442 KEYWORD("if", Token::IF) \ 1443 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 
1444 KEYWORD("import", Token::IMPORT) \ 1445 KEYWORD("in", Token::IN) \ 1446 KEYWORD("instanceof", Token::INSTANCEOF) \ 1447 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 1448 KEYWORD_GROUP('l') \ 1449 KEYWORD("let", Token::LET) \ 1450 KEYWORD_GROUP('m') \ 1451 KEYWORD("meta", Token::META) \ 1452 KEYWORD_GROUP('n') \ 1453 KEYWORD("name", Token::NAME) \ 1454 KEYWORD("new", Token::NEW) \ 1455 KEYWORD("null", Token::NULL_LITERAL) \ 1456 KEYWORD_GROUP('o') \ 1457 KEYWORD("of", Token::OF) \ 1458 KEYWORD_GROUP('p') \ 1459 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 1460 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 1461 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 1462 KEYWORD("prototype", Token::PROTOTYPE) \ 1463 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 1464 KEYWORD_GROUP('r') \ 1465 KEYWORD("return", Token::RETURN) \ 1466 KEYWORD_GROUP('s') \ 1467 KEYWORD("set", Token::SET) \ 1468 KEYWORD("static", Token::STATIC) \ 1469 KEYWORD("super", Token::SUPER) \ 1470 KEYWORD("switch", Token::SWITCH) \ 1471 KEYWORD_GROUP('t') \ 1472 KEYWORD("target", Token::TARGET) \ 1473 KEYWORD("this", Token::THIS) \ 1474 KEYWORD("throw", Token::THROW) \ 1475 KEYWORD("true", Token::TRUE_LITERAL) \ 1476 KEYWORD("try", Token::TRY) \ 1477 KEYWORD("typeof", Token::TYPEOF) \ 1478 KEYWORD_GROUP('u') \ 1479 KEYWORD("undefined", Token::UNDEFINED) \ 1480 KEYWORD_GROUP('v') \ 1481 KEYWORD("var", Token::VAR) \ 1482 KEYWORD("void", Token::VOID) \ 1483 KEYWORD_GROUP('w') \ 1484 KEYWORD("while", Token::WHILE) \ 1485 KEYWORD("with", Token::WITH) \ 1486 KEYWORD_GROUP('y') \ 1487 KEYWORD("yield", Token::YIELD) \ 1488 KEYWORD_GROUP('_') \ 1489 KEYWORD("__proto__", Token::PROTO_UNDERSCORED) \ 1490 KEYWORD_GROUP('#') \ 1491 KEYWORD("#constructor", Token::PRIVATE_CONSTRUCTOR) 1492 1493 static Token::Value KeywordOrIdentifierToken(const uint8_t* input, 1494 int input_length) { 1495 DCHECK_GE(input_length, 1); 1496 const int kMinLength = 2; 1497 const 
int kMaxLength = 12; 1498 if (input_length < kMinLength || input_length > kMaxLength) { 1499 return Token::IDENTIFIER; 1500 } 1501 switch (input[0]) { 1502 default: 1503 #define KEYWORD_GROUP_CASE(ch) \ 1504 break; \ 1505 case ch: 1506 #define KEYWORD(keyword, token) \ 1507 { \ 1508 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 1509 /* strlen(keyword) plus 1 for the NUL char. */ \ 1510 const int keyword_length = sizeof(keyword) - 1; \ 1511 STATIC_ASSERT(keyword_length >= kMinLength); \ 1512 STATIC_ASSERT(keyword_length <= kMaxLength); \ 1513 DCHECK_EQ(input[0], keyword[0]); \ 1514 DCHECK(token == Token::FUTURE_STRICT_RESERVED_WORD || \ 1515 0 == strncmp(keyword, Token::String(token), sizeof(keyword))); \ 1516 if (input_length == keyword_length && input[1] == keyword[1] && \ 1517 (keyword_length <= 2 || input[2] == keyword[2]) && \ 1518 (keyword_length <= 3 || input[3] == keyword[3]) && \ 1519 (keyword_length <= 4 || input[4] == keyword[4]) && \ 1520 (keyword_length <= 5 || input[5] == keyword[5]) && \ 1521 (keyword_length <= 6 || input[6] == keyword[6]) && \ 1522 (keyword_length <= 7 || input[7] == keyword[7]) && \ 1523 (keyword_length <= 8 || input[8] == keyword[8]) && \ 1524 (keyword_length <= 9 || input[9] == keyword[9]) && \ 1525 (keyword_length <= 10 || input[10] == keyword[10])) { \ 1526 return token; \ 1527 } \ 1528 } 1529 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 1530 } 1531 return Token::IDENTIFIER; 1532 #undef KEYWORDS 1533 #undef KEYWORD 1534 #undef KEYWORD_GROUP_CASE 1535 } 1536 1537 Token::Value Scanner::ScanIdentifierOrKeyword() { 1538 LiteralScope literal(this); 1539 return ScanIdentifierOrKeywordInner(&literal); 1540 } 1541 1542 Token::Value Scanner::ScanIdentifierOrKeywordInner(LiteralScope* literal) { 1543 DCHECK(unicode_cache_->IsIdentifierStart(c0_)); 1544 bool escaped = false; 1545 if (IsInRange(c0_, 'a', 'z') || c0_ == '_') { 1546 do { 1547 AddLiteralChar(static_cast<char>(c0_)); 1548 Advance(); 1549 } while (IsInRange(c0_, 'a', 'z') || 
c0_ == '_'); 1550 1551 if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '$') { 1552 // Identifier starting with lowercase or _. 1553 do { 1554 AddLiteralChar(static_cast<char>(c0_)); 1555 Advance(); 1556 } while (IsAsciiIdentifier(c0_)); 1557 1558 if (c0_ <= kMaxAscii && c0_ != '\\') { 1559 literal->Complete(); 1560 return Token::IDENTIFIER; 1561 } 1562 } else if (c0_ <= kMaxAscii && c0_ != '\\') { 1563 // Only a-z+ or _: could be a keyword or identifier. 1564 Vector<const uint8_t> chars = next().literal_chars.one_byte_literal(); 1565 Token::Value token = 1566 KeywordOrIdentifierToken(chars.start(), chars.length()); 1567 if (token == Token::IDENTIFIER || 1568 token == Token::FUTURE_STRICT_RESERVED_WORD || 1569 Token::IsContextualKeyword(token)) 1570 literal->Complete(); 1571 return token; 1572 } 1573 } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '$') { 1574 do { 1575 AddLiteralChar(static_cast<char>(c0_)); 1576 Advance(); 1577 } while (IsAsciiIdentifier(c0_)); 1578 1579 if (c0_ <= kMaxAscii && c0_ != '\\') { 1580 literal->Complete(); 1581 return Token::IDENTIFIER; 1582 } 1583 } else if (c0_ == '\\') { 1584 escaped = true; 1585 uc32 c = ScanIdentifierUnicodeEscape(); 1586 DCHECK(!unicode_cache_->IsIdentifierStart(-1)); 1587 if (c == '\\' || !unicode_cache_->IsIdentifierStart(c)) { 1588 return Token::ILLEGAL; 1589 } 1590 AddLiteralChar(c); 1591 } 1592 1593 while (true) { 1594 if (c0_ == '\\') { 1595 escaped = true; 1596 uc32 c = ScanIdentifierUnicodeEscape(); 1597 // Only allow legal identifier part characters. 1598 // TODO(verwaest): Make this true. 
1599 // DCHECK(!unicode_cache_->IsIdentifierPart('\\')); 1600 DCHECK(!unicode_cache_->IsIdentifierPart(-1)); 1601 if (c == '\\' || !unicode_cache_->IsIdentifierPart(c)) { 1602 return Token::ILLEGAL; 1603 } 1604 AddLiteralChar(c); 1605 } else if (unicode_cache_->IsIdentifierPart(c0_) || 1606 (CombineSurrogatePair() && 1607 unicode_cache_->IsIdentifierPart(c0_))) { 1608 AddLiteralCharAdvance(); 1609 } else { 1610 break; 1611 } 1612 } 1613 1614 if (next().literal_chars.is_one_byte()) { 1615 Vector<const uint8_t> chars = next().literal_chars.one_byte_literal(); 1616 Token::Value token = 1617 KeywordOrIdentifierToken(chars.start(), chars.length()); 1618 /* TODO(adamk): YIELD should be handled specially. */ 1619 if (token == Token::FUTURE_STRICT_RESERVED_WORD) { 1620 literal->Complete(); 1621 if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD; 1622 return token; 1623 } 1624 if (token == Token::IDENTIFIER || Token::IsContextualKeyword(token)) { 1625 literal->Complete(); 1626 return token; 1627 } 1628 1629 if (!escaped) return token; 1630 1631 literal->Complete(); 1632 if (token == Token::LET || token == Token::STATIC) { 1633 return Token::ESCAPED_STRICT_RESERVED_WORD; 1634 } 1635 return Token::ESCAPED_KEYWORD; 1636 } 1637 1638 literal->Complete(); 1639 return Token::IDENTIFIER; 1640 } 1641 1642 bool Scanner::ScanRegExpPattern() { 1643 DCHECK_EQ(Token::UNINITIALIZED, next_next().token); 1644 DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV); 1645 1646 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1647 bool in_character_class = false; 1648 bool seen_equal = (next().token == Token::ASSIGN_DIV); 1649 1650 // Previous token is either '/' or '/=', in the second case, the 1651 // pattern starts at =. 1652 next().location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1653 next().location.end_pos = source_pos() - (seen_equal ? 
1 : 0); 1654 1655 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1656 // the scanner should pass uninterpreted bodies to the RegExp 1657 // constructor. 1658 LiteralScope literal(this); 1659 if (seen_equal) { 1660 AddLiteralChar('='); 1661 } 1662 1663 while (c0_ != '/' || in_character_class) { 1664 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) { 1665 return false; 1666 } 1667 if (c0_ == '\\') { // Escape sequence. 1668 AddLiteralCharAdvance(); 1669 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) { 1670 return false; 1671 } 1672 AddLiteralCharAdvance(); 1673 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1674 // only "safe" characters are allowed (letters, digits, underscore), 1675 // otherwise the escape isn't valid and the invalid character has 1676 // its normal meaning. I.e., we can just continue scanning without 1677 // worrying whether the following characters are part of the escape 1678 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1679 // of the escape sequence. 1680 1681 // TODO(896): At some point, parse RegExps more thoroughly to capture 1682 // octal esacpes in strict mode. 1683 } else { // Unescaped character. 1684 if (c0_ == '[') in_character_class = true; 1685 if (c0_ == ']') in_character_class = false; 1686 AddLiteralCharAdvance(); 1687 } 1688 } 1689 Advance(); // consume '/' 1690 1691 literal.Complete(); 1692 next().token = Token::REGEXP_LITERAL; 1693 next().contextual_token = Token::UNINITIALIZED; 1694 return true; 1695 } 1696 1697 1698 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() { 1699 DCHECK_EQ(Token::REGEXP_LITERAL, next().token); 1700 1701 // Scan regular expression flags. 
1702 int flags = 0; 1703 while (unicode_cache_->IsIdentifierPart(c0_)) { 1704 RegExp::Flags flag = RegExp::kNone; 1705 switch (c0_) { 1706 case 'g': 1707 flag = RegExp::kGlobal; 1708 break; 1709 case 'i': 1710 flag = RegExp::kIgnoreCase; 1711 break; 1712 case 'm': 1713 flag = RegExp::kMultiline; 1714 break; 1715 case 's': 1716 flag = RegExp::kDotAll; 1717 break; 1718 case 'u': 1719 flag = RegExp::kUnicode; 1720 break; 1721 case 'y': 1722 flag = RegExp::kSticky; 1723 break; 1724 default: 1725 return Nothing<RegExp::Flags>(); 1726 } 1727 if (flags & flag) { 1728 return Nothing<RegExp::Flags>(); 1729 } 1730 Advance(); 1731 flags |= flag; 1732 } 1733 1734 next().location.end_pos = source_pos(); 1735 return Just(RegExp::Flags(flags)); 1736 } 1737 1738 const AstRawString* Scanner::CurrentSymbol( 1739 AstValueFactory* ast_value_factory) const { 1740 if (is_literal_one_byte()) { 1741 return ast_value_factory->GetOneByteString(literal_one_byte_string()); 1742 } 1743 return ast_value_factory->GetTwoByteString(literal_two_byte_string()); 1744 } 1745 1746 const AstRawString* Scanner::NextSymbol( 1747 AstValueFactory* ast_value_factory) const { 1748 if (is_next_literal_one_byte()) { 1749 return ast_value_factory->GetOneByteString(next_literal_one_byte_string()); 1750 } 1751 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string()); 1752 } 1753 1754 const AstRawString* Scanner::CurrentRawSymbol( 1755 AstValueFactory* ast_value_factory) const { 1756 if (is_raw_literal_one_byte()) { 1757 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string()); 1758 } 1759 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string()); 1760 } 1761 1762 1763 double Scanner::DoubleValue() { 1764 DCHECK(is_literal_one_byte()); 1765 return StringToDouble( 1766 unicode_cache_, 1767 literal_one_byte_string(), 1768 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY); 1769 } 1770 1771 const char* Scanner::CurrentLiteralAsCString(Zone* zone) const { 
1772 DCHECK(is_literal_one_byte()); 1773 Vector<const uint8_t> vector = literal_one_byte_string(); 1774 int length = vector.length(); 1775 char* buffer = zone->NewArray<char>(length + 1); 1776 memcpy(buffer, vector.start(), length); 1777 buffer[length] = '\0'; 1778 return buffer; 1779 } 1780 1781 bool Scanner::IsDuplicateSymbol(DuplicateFinder* duplicate_finder, 1782 AstValueFactory* ast_value_factory) const { 1783 DCHECK_NOT_NULL(duplicate_finder); 1784 DCHECK_NOT_NULL(ast_value_factory); 1785 const AstRawString* string = CurrentSymbol(ast_value_factory); 1786 return !duplicate_finder->known_symbols_.insert(string).second; 1787 } 1788 1789 void Scanner::SeekNext(size_t position) { 1790 // Use with care: This cleanly resets most, but not all scanner state. 1791 // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions. 1792 1793 // To re-scan from a given character position, we need to: 1794 // 1, Reset the current_, next_ and next_next_ tokens 1795 // (next_ + next_next_ will be overwrittem by Next(), 1796 // current_ will remain unchanged, so overwrite it fully.) 1797 for (TokenDesc& token : token_storage_) { 1798 token.token = Token::UNINITIALIZED; 1799 token.contextual_token = Token::UNINITIALIZED; 1800 } 1801 // 2, reset the source to the desired position, 1802 source_->Seek(position); 1803 // 3, re-scan, by scanning the look-ahead char + 1 token (next_). 1804 c0_ = source_->Advance(); 1805 next().after_line_terminator = false; 1806 Scan(); 1807 DCHECK_EQ(next().location.beg_pos, static_cast<int>(position)); 1808 } 1809 1810 } // namespace internal 1811 } // namespace v8 1812