1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #include "src/parsing/scanner.h" 8 9 #include <stdint.h> 10 11 #include <cmath> 12 13 #include "src/ast/ast-value-factory.h" 14 #include "src/char-predicates-inl.h" 15 #include "src/conversions-inl.h" 16 #include "src/list-inl.h" 17 #include "src/parsing/parser.h" 18 19 namespace v8 { 20 namespace internal { 21 22 23 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const { 24 if (is_one_byte()) { 25 return isolate->factory()->InternalizeOneByteString(one_byte_literal()); 26 } 27 return isolate->factory()->InternalizeTwoByteString(two_byte_literal()); 28 } 29 30 31 // Default implementation for streams that do not support bookmarks. 32 bool Utf16CharacterStream::SetBookmark() { return false; } 33 void Utf16CharacterStream::ResetToBookmark() { UNREACHABLE(); } 34 35 36 // ---------------------------------------------------------------------------- 37 // Scanner 38 39 Scanner::Scanner(UnicodeCache* unicode_cache) 40 : unicode_cache_(unicode_cache), 41 bookmark_c0_(kNoBookmark), 42 octal_pos_(Location::invalid()), 43 decimal_with_leading_zero_pos_(Location::invalid()), 44 found_html_comment_(false), 45 allow_harmony_exponentiation_operator_(false) { 46 bookmark_current_.literal_chars = &bookmark_current_literal_; 47 bookmark_current_.raw_literal_chars = &bookmark_current_raw_literal_; 48 bookmark_next_.literal_chars = &bookmark_next_literal_; 49 bookmark_next_.raw_literal_chars = &bookmark_next_raw_literal_; 50 } 51 52 53 void Scanner::Initialize(Utf16CharacterStream* source) { 54 source_ = source; 55 // Need to capture identifiers in order to recognize "get" and "set" 56 // in object literals. 57 Init(); 58 // Skip initial whitespace allowing HTML comment ends just like 59 // after a newline and scan first token. 60 has_line_terminator_before_next_ = true; 61 SkipWhiteSpace(); 62 Scan(); 63 } 64 65 template <bool capture_raw, bool unicode> 66 uc32 Scanner::ScanHexNumber(int expected_length) { 67 DCHECK(expected_length <= 4); // prevent overflow 68 69 int begin = source_pos() - 2; 70 uc32 x = 0; 71 for (int i = 0; i < expected_length; i++) { 72 int d = HexValue(c0_); 73 if (d < 0) { 74 ReportScannerError(Location(begin, begin + expected_length + 2), 75 unicode 76 ? MessageTemplate::kInvalidUnicodeEscapeSequence 77 : MessageTemplate::kInvalidHexEscapeSequence); 78 return -1; 79 } 80 x = x * 16 + d; 81 Advance<capture_raw>(); 82 } 83 84 return x; 85 } 86 87 template <bool capture_raw> 88 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) { 89 uc32 x = 0; 90 int d = HexValue(c0_); 91 if (d < 0) return -1; 92 93 while (d >= 0) { 94 x = x * 16 + d; 95 if (x > max_value) { 96 ReportScannerError(Location(beg_pos, source_pos() + 1), 97 MessageTemplate::kUndefinedUnicodeCodePoint); 98 return -1; 99 } 100 Advance<capture_raw>(); 101 d = HexValue(c0_); 102 } 103 104 return x; 105 } 106 107 108 // Ensure that tokens can be stored in a byte. 109 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 110 111 // Table of one-character tokens, by character (0x00..0x7f only). 112 static const byte one_char_tokens[] = { 113 Token::ILLEGAL, 114 Token::ILLEGAL, 115 Token::ILLEGAL, 116 Token::ILLEGAL, 117 Token::ILLEGAL, 118 Token::ILLEGAL, 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::ILLEGAL, 129 Token::ILLEGAL, 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::ILLEGAL, 133 Token::ILLEGAL, 134 Token::ILLEGAL, 135 Token::ILLEGAL, 136 Token::ILLEGAL, 137 Token::ILLEGAL, 138 Token::ILLEGAL, 139 Token::ILLEGAL, 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::ILLEGAL, 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::ILLEGAL, 152 Token::ILLEGAL, 153 Token::LPAREN, // 0x28 154 Token::RPAREN, // 0x29 155 Token::ILLEGAL, 156 Token::ILLEGAL, 157 Token::COMMA, // 0x2c 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::COLON, // 0x3a 172 Token::SEMICOLON, // 0x3b 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::CONDITIONAL, // 0x3f 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::ILLEGAL, 183 Token::ILLEGAL, 184 Token::ILLEGAL, 185 Token::ILLEGAL, 186 Token::ILLEGAL, 187 Token::ILLEGAL, 188 Token::ILLEGAL, 189 Token::ILLEGAL, 190 Token::ILLEGAL, 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::LBRACK, // 0x5b 205 Token::ILLEGAL, 206 Token::RBRACK, // 0x5d 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::ILLEGAL, 211 Token::ILLEGAL, 212 Token::ILLEGAL, 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::ILLEGAL, 216 Token::ILLEGAL, 217 Token::ILLEGAL, 218 Token::ILLEGAL, 219 Token::ILLEGAL, 220 Token::ILLEGAL, 221 Token::ILLEGAL, 222 Token::ILLEGAL, 223 Token::ILLEGAL, 224 Token::ILLEGAL, 225 Token::ILLEGAL, 226 Token::ILLEGAL, 227 Token::ILLEGAL, 228 Token::ILLEGAL, 229 Token::ILLEGAL, 230 Token::ILLEGAL, 231 Token::ILLEGAL, 232 Token::ILLEGAL, 233 Token::ILLEGAL, 234 Token::ILLEGAL, 235 Token::ILLEGAL, 236 Token::LBRACE, // 0x7b 237 Token::ILLEGAL, 238 Token::RBRACE, // 0x7d 239 Token::BIT_NOT, // 0x7e 240 Token::ILLEGAL 241 }; 242 243 244 Token::Value Scanner::Next() { 245 if (next_.token == Token::EOS) { 246 next_.location.beg_pos = current_.location.beg_pos; 247 next_.location.end_pos = current_.location.end_pos; 248 } 249 current_ = next_; 250 if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) { 251 next_ = next_next_; 252 next_next_.token = Token::UNINITIALIZED; 253 has_line_terminator_before_next_ = has_line_terminator_after_next_; 254 return current_.token; 255 } 256 has_line_terminator_before_next_ = false; 257 has_multiline_comment_before_next_ = false; 258 if (static_cast<unsigned>(c0_) <= 0x7f) { 259 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 260 if (token != Token::ILLEGAL) { 261 int pos = source_pos(); 262 next_.token = token; 263 next_.location.beg_pos = pos; 264 next_.location.end_pos = pos + 1; 265 Advance(); 266 return current_.token; 267 } 268 } 269 Scan(); 270 return current_.token; 271 } 272 273 274 Token::Value Scanner::PeekAhead() { 275 if (next_next_.token != Token::UNINITIALIZED) { 276 return next_next_.token; 277 } 278 TokenDesc prev = current_; 279 bool has_line_terminator_before_next = 280 has_line_terminator_before_next_ || has_multiline_comment_before_next_; 281 Next(); 282 has_line_terminator_after_next_ = 283 has_line_terminator_before_next_ || has_multiline_comment_before_next_; 284 has_line_terminator_before_next_ = has_line_terminator_before_next; 285 Token::Value ret = next_.token; 286 next_next_ = next_; 287 next_ = current_; 288 current_ = prev; 289 return ret; 290 } 291 292 293 // TODO(yangguo): check whether this is actually necessary. 294 static inline bool IsLittleEndianByteOrderMark(uc32 c) { 295 // The Unicode value U+FFFE is guaranteed never to be assigned as a 296 // Unicode character; this implies that in a Unicode context the 297 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 298 // character expressed in little-endian byte order (since it could 299 // not be a U+FFFE character expressed in big-endian byte 300 // order). Nevertheless, we check for it to be compatible with 301 // Spidermonkey. 302 return c == 0xFFFE; 303 } 304 305 306 bool Scanner::SkipWhiteSpace() { 307 int start_position = source_pos(); 308 309 while (true) { 310 while (true) { 311 // The unicode cache accepts unsigned inputs. 312 if (c0_ < 0) break; 313 // Advance as long as character is a WhiteSpace or LineTerminator. 314 // Remember if the latter is the case. 315 if (unicode_cache_->IsLineTerminator(c0_)) { 316 has_line_terminator_before_next_ = true; 317 } else if (!unicode_cache_->IsWhiteSpace(c0_) && 318 !IsLittleEndianByteOrderMark(c0_)) { 319 break; 320 } 321 Advance(); 322 } 323 324 // If there is an HTML comment end '-->' at the beginning of a 325 // line (with only whitespace in front of it), we treat the rest 326 // of the line as a comment. This is in line with the way 327 // SpiderMonkey handles it. 328 if (c0_ == '-' && has_line_terminator_before_next_) { 329 Advance(); 330 if (c0_ == '-') { 331 Advance(); 332 if (c0_ == '>') { 333 // Treat the rest of the line as a comment. 334 SkipSingleLineComment(); 335 // Continue skipping white space after the comment. 336 continue; 337 } 338 PushBack('-'); // undo Advance() 339 } 340 PushBack('-'); // undo Advance() 341 } 342 // Return whether or not we skipped any characters. 343 return source_pos() != start_position; 344 } 345 } 346 347 348 Token::Value Scanner::SkipSingleLineComment() { 349 Advance(); 350 351 // The line terminator at the end of the line is not considered 352 // to be part of the single-line comment; it is recognized 353 // separately by the lexical grammar and becomes part of the 354 // stream of input elements for the syntactic grammar (see 355 // ECMA-262, section 7.4). 356 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 357 Advance(); 358 } 359 360 return Token::WHITESPACE; 361 } 362 363 364 Token::Value Scanner::SkipSourceURLComment() { 365 TryToParseSourceURLComment(); 366 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 367 Advance(); 368 } 369 370 return Token::WHITESPACE; 371 } 372 373 374 void Scanner::TryToParseSourceURLComment() { 375 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this 376 // function will just return if it cannot parse a magic comment. 377 if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return; 378 Advance(); 379 LiteralBuffer name; 380 while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && 381 c0_ != '=') { 382 name.AddChar(c0_); 383 Advance(); 384 } 385 if (!name.is_one_byte()) return; 386 Vector<const uint8_t> name_literal = name.one_byte_literal(); 387 LiteralBuffer* value; 388 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) { 389 value = &source_url_; 390 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) { 391 value = &source_mapping_url_; 392 } else { 393 return; 394 } 395 if (c0_ != '=') 396 return; 397 Advance(); 398 value->Reset(); 399 while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) { 400 Advance(); 401 } 402 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 403 // Disallowed characters. 404 if (c0_ == '"' || c0_ == '\'') { 405 value->Reset(); 406 return; 407 } 408 if (unicode_cache_->IsWhiteSpace(c0_)) { 409 break; 410 } 411 value->AddChar(c0_); 412 Advance(); 413 } 414 // Allow whitespace at the end. 415 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 416 if (!unicode_cache_->IsWhiteSpace(c0_)) { 417 value->Reset(); 418 break; 419 } 420 Advance(); 421 } 422 } 423 424 425 Token::Value Scanner::SkipMultiLineComment() { 426 DCHECK(c0_ == '*'); 427 Advance(); 428 429 while (c0_ >= 0) { 430 uc32 ch = c0_; 431 Advance(); 432 if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) { 433 // Following ECMA-262, section 7.4, a comment containing 434 // a newline will make the comment count as a line-terminator. 435 has_multiline_comment_before_next_ = true; 436 } 437 // If we have reached the end of the multi-line comment, we 438 // consume the '/' and insert a whitespace. This way all 439 // multi-line comments are treated as whitespace. 440 if (ch == '*' && c0_ == '/') { 441 c0_ = ' '; 442 return Token::WHITESPACE; 443 } 444 } 445 446 // Unterminated multi-line comment. 447 return Token::ILLEGAL; 448 } 449 450 451 Token::Value Scanner::ScanHtmlComment() { 452 // Check for <!-- comments. 453 DCHECK(c0_ == '!'); 454 Advance(); 455 if (c0_ == '-') { 456 Advance(); 457 if (c0_ == '-') { 458 found_html_comment_ = true; 459 return SkipSingleLineComment(); 460 } 461 PushBack('-'); // undo Advance() 462 } 463 PushBack('!'); // undo Advance() 464 DCHECK(c0_ == '!'); 465 return Token::LT; 466 } 467 468 469 void Scanner::Scan() { 470 next_.literal_chars = NULL; 471 next_.raw_literal_chars = NULL; 472 Token::Value token; 473 do { 474 // Remember the position of the next token 475 next_.location.beg_pos = source_pos(); 476 477 switch (c0_) { 478 case ' ': 479 case '\t': 480 Advance(); 481 token = Token::WHITESPACE; 482 break; 483 484 case '\n': 485 Advance(); 486 has_line_terminator_before_next_ = true; 487 token = Token::WHITESPACE; 488 break; 489 490 case '"': case '\'': 491 token = ScanString(); 492 break; 493 494 case '<': 495 // < <= << <<= <!-- 496 Advance(); 497 if (c0_ == '=') { 498 token = Select(Token::LTE); 499 } else if (c0_ == '<') { 500 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 501 } else if (c0_ == '!') { 502 token = ScanHtmlComment(); 503 } else { 504 token = Token::LT; 505 } 506 break; 507 508 case '>': 509 // > >= >> >>= >>> >>>= 510 Advance(); 511 if (c0_ == '=') { 512 token = Select(Token::GTE); 513 } else if (c0_ == '>') { 514 // >> >>= >>> >>>= 515 Advance(); 516 if (c0_ == '=') { 517 token = Select(Token::ASSIGN_SAR); 518 } else if (c0_ == '>') { 519 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 520 } else { 521 token = Token::SAR; 522 } 523 } else { 524 token = Token::GT; 525 } 526 break; 527 528 case '=': 529 // = == === => 530 Advance(); 531 if (c0_ == '=') { 532 token = Select('=', Token::EQ_STRICT, Token::EQ); 533 } else if (c0_ == '>') { 534 token = Select(Token::ARROW); 535 } else { 536 token = Token::ASSIGN; 537 } 538 break; 539 540 case '!': 541 // ! != !== 542 Advance(); 543 if (c0_ == '=') { 544 token = Select('=', Token::NE_STRICT, Token::NE); 545 } else { 546 token = Token::NOT; 547 } 548 break; 549 550 case '+': 551 // + ++ += 552 Advance(); 553 if (c0_ == '+') { 554 token = Select(Token::INC); 555 } else if (c0_ == '=') { 556 token = Select(Token::ASSIGN_ADD); 557 } else { 558 token = Token::ADD; 559 } 560 break; 561 562 case '-': 563 // - -- --> -= 564 Advance(); 565 if (c0_ == '-') { 566 Advance(); 567 if (c0_ == '>' && has_line_terminator_before_next_) { 568 // For compatibility with SpiderMonkey, we skip lines that 569 // start with an HTML comment end '-->'. 570 token = SkipSingleLineComment(); 571 } else { 572 token = Token::DEC; 573 } 574 } else if (c0_ == '=') { 575 token = Select(Token::ASSIGN_SUB); 576 } else { 577 token = Token::SUB; 578 } 579 break; 580 581 case '*': 582 // * *= 583 Advance(); 584 if (c0_ == '*' && allow_harmony_exponentiation_operator()) { 585 token = Select('=', Token::ASSIGN_EXP, Token::EXP); 586 } else if (c0_ == '=') { 587 token = Select(Token::ASSIGN_MUL); 588 } else { 589 token = Token::MUL; 590 } 591 break; 592 593 case '%': 594 // % %= 595 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 596 break; 597 598 case '/': 599 // / // /* /= 600 Advance(); 601 if (c0_ == '/') { 602 Advance(); 603 if (c0_ == '#' || c0_ == '@') { 604 Advance(); 605 token = SkipSourceURLComment(); 606 } else { 607 PushBack(c0_); 608 token = SkipSingleLineComment(); 609 } 610 } else if (c0_ == '*') { 611 token = SkipMultiLineComment(); 612 } else if (c0_ == '=') { 613 token = Select(Token::ASSIGN_DIV); 614 } else { 615 token = Token::DIV; 616 } 617 break; 618 619 case '&': 620 // & && &= 621 Advance(); 622 if (c0_ == '&') { 623 token = Select(Token::AND); 624 } else if (c0_ == '=') { 625 token = Select(Token::ASSIGN_BIT_AND); 626 } else { 627 token = Token::BIT_AND; 628 } 629 break; 630 631 case '|': 632 // | || |= 633 Advance(); 634 if (c0_ == '|') { 635 token = Select(Token::OR); 636 } else if (c0_ == '=') { 637 token = Select(Token::ASSIGN_BIT_OR); 638 } else { 639 token = Token::BIT_OR; 640 } 641 break; 642 643 case '^': 644 // ^ ^= 645 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 646 break; 647 648 case '.': 649 // . Number 650 Advance(); 651 if (IsDecimalDigit(c0_)) { 652 token = ScanNumber(true); 653 } else { 654 token = Token::PERIOD; 655 if (c0_ == '.') { 656 Advance(); 657 if (c0_ == '.') { 658 Advance(); 659 token = Token::ELLIPSIS; 660 } else { 661 PushBack('.'); 662 } 663 } 664 } 665 break; 666 667 case ':': 668 token = Select(Token::COLON); 669 break; 670 671 case ';': 672 token = Select(Token::SEMICOLON); 673 break; 674 675 case ',': 676 token = Select(Token::COMMA); 677 break; 678 679 case '(': 680 token = Select(Token::LPAREN); 681 break; 682 683 case ')': 684 token = Select(Token::RPAREN); 685 break; 686 687 case '[': 688 token = Select(Token::LBRACK); 689 break; 690 691 case ']': 692 token = Select(Token::RBRACK); 693 break; 694 695 case '{': 696 token = Select(Token::LBRACE); 697 break; 698 699 case '}': 700 token = Select(Token::RBRACE); 701 break; 702 703 case '?': 704 token = Select(Token::CONDITIONAL); 705 break; 706 707 case '~': 708 token = Select(Token::BIT_NOT); 709 break; 710 711 case '`': 712 token = ScanTemplateStart(); 713 break; 714 715 default: 716 if (c0_ < 0) { 717 token = Token::EOS; 718 } else if (unicode_cache_->IsIdentifierStart(c0_)) { 719 token = ScanIdentifierOrKeyword(); 720 } else if (IsDecimalDigit(c0_)) { 721 token = ScanNumber(false); 722 } else if (SkipWhiteSpace()) { 723 token = Token::WHITESPACE; 724 } else { 725 token = Select(Token::ILLEGAL); 726 } 727 break; 728 } 729 730 // Continue scanning for tokens as long as we're just skipping 731 // whitespace. 732 } while (token == Token::WHITESPACE); 733 734 next_.location.end_pos = source_pos(); 735 next_.token = token; 736 } 737 738 739 void Scanner::SeekForward(int pos) { 740 // After this call, we will have the token at the given position as 741 // the "next" token. The "current" token will be invalid. 742 if (pos == next_.location.beg_pos) return; 743 int current_pos = source_pos(); 744 DCHECK_EQ(next_.location.end_pos, current_pos); 745 // Positions inside the lookahead token aren't supported. 746 DCHECK(pos >= current_pos); 747 if (pos != current_pos) { 748 source_->SeekForward(pos - source_->pos()); 749 Advance(); 750 // This function is only called to seek to the location 751 // of the end of a function (at the "}" token). It doesn't matter 752 // whether there was a line terminator in the part we skip. 753 has_line_terminator_before_next_ = false; 754 has_multiline_comment_before_next_ = false; 755 } 756 Scan(); 757 } 758 759 760 template <bool capture_raw, bool in_template_literal> 761 bool Scanner::ScanEscape() { 762 uc32 c = c0_; 763 Advance<capture_raw>(); 764 765 // Skip escaped newlines. 766 if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) { 767 // Allow CR+LF newlines in multiline string literals. 768 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>(); 769 // Allow LF+CR newlines in multiline string literals. 770 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>(); 771 return true; 772 } 773 774 switch (c) { 775 case '\'': // fall through 776 case '"' : // fall through 777 case '\\': break; 778 case 'b' : c = '\b'; break; 779 case 'f' : c = '\f'; break; 780 case 'n' : c = '\n'; break; 781 case 'r' : c = '\r'; break; 782 case 't' : c = '\t'; break; 783 case 'u' : { 784 c = ScanUnicodeEscape<capture_raw>(); 785 if (c < 0) return false; 786 break; 787 } 788 case 'v': 789 c = '\v'; 790 break; 791 case 'x': { 792 c = ScanHexNumber<capture_raw>(2); 793 if (c < 0) return false; 794 break; 795 } 796 case '0': // Fall through. 797 case '1': // fall through 798 case '2': // fall through 799 case '3': // fall through 800 case '4': // fall through 801 case '5': // fall through 802 case '6': // fall through 803 case '7': 804 c = ScanOctalEscape<capture_raw>(c, 2); 805 break; 806 } 807 808 // According to ECMA-262, section 7.8.4, characters not covered by the 809 // above cases should be illegal, but they are commonly handled as 810 // non-escaped characters by JS VMs. 811 AddLiteralChar(c); 812 return true; 813 } 814 815 816 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 817 // ECMA-262. Other JS VMs support them. 818 template <bool capture_raw> 819 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 820 uc32 x = c - '0'; 821 int i = 0; 822 for (; i < length; i++) { 823 int d = c0_ - '0'; 824 if (d < 0 || d > 7) break; 825 int nx = x * 8 + d; 826 if (nx >= 256) break; 827 x = nx; 828 Advance<capture_raw>(); 829 } 830 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 831 // Remember the position of octal escape sequences so that an error 832 // can be reported later (in strict mode). 833 // We don't report the error immediately, because the octal escape can 834 // occur before the "use strict" directive. 835 if (c != '0' || i > 0) { 836 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 837 } 838 return x; 839 } 840 841 842 Token::Value Scanner::ScanString() { 843 uc32 quote = c0_; 844 Advance<false, false>(); // consume quote 845 846 LiteralScope literal(this); 847 while (true) { 848 if (c0_ > kMaxAscii) { 849 HandleLeadSurrogate(); 850 break; 851 } 852 if (c0_ < 0 || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL; 853 if (c0_ == quote) { 854 literal.Complete(); 855 Advance<false, false>(); 856 return Token::STRING; 857 } 858 char c = static_cast<char>(c0_); 859 if (c == '\\') break; 860 Advance<false, false>(); 861 AddLiteralChar(c); 862 } 863 864 while (c0_ != quote && c0_ >= 0 865 && !unicode_cache_->IsLineTerminator(c0_)) { 866 uc32 c = c0_; 867 Advance(); 868 if (c == '\\') { 869 if (c0_ < 0 || !ScanEscape<false, false>()) { 870 return Token::ILLEGAL; 871 } 872 } else { 873 AddLiteralChar(c); 874 } 875 } 876 if (c0_ != quote) return Token::ILLEGAL; 877 literal.Complete(); 878 879 Advance(); // consume quote 880 return Token::STRING; 881 } 882 883 884 Token::Value Scanner::ScanTemplateSpan() { 885 // When scanning a TemplateSpan, we are looking for the following construct: 886 // TEMPLATE_SPAN :: 887 // ` LiteralChars* ${ 888 // | } LiteralChars* ${ 889 // 890 // TEMPLATE_TAIL :: 891 // ` LiteralChars* ` 892 // | } LiteralChar* ` 893 // 894 // A TEMPLATE_SPAN should always be followed by an Expression, while a 895 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be 896 // followed by an Expression. 897 898 Token::Value result = Token::TEMPLATE_SPAN; 899 LiteralScope literal(this); 900 StartRawLiteral(); 901 const bool capture_raw = true; 902 const bool in_template_literal = true; 903 while (true) { 904 uc32 c = c0_; 905 Advance<capture_raw>(); 906 if (c == '`') { 907 result = Token::TEMPLATE_TAIL; 908 ReduceRawLiteralLength(1); 909 break; 910 } else if (c == '$' && c0_ == '{') { 911 Advance<capture_raw>(); // Consume '{' 912 ReduceRawLiteralLength(2); 913 break; 914 } else if (c == '\\') { 915 if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) { 916 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty 917 // code unit sequence. 918 uc32 lastChar = c0_; 919 Advance<capture_raw>(); 920 if (lastChar == '\r') { 921 ReduceRawLiteralLength(1); // Remove \r 922 if (c0_ == '\n') { 923 Advance<capture_raw>(); // Adds \n 924 } else { 925 AddRawLiteralChar('\n'); 926 } 927 } 928 } else if (!ScanEscape<capture_raw, in_template_literal>()) { 929 return Token::ILLEGAL; 930 } 931 } else if (c < 0) { 932 // Unterminated template literal 933 PushBack(c); 934 break; 935 } else { 936 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A. 937 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence 938 // consisting of the CV 0x000A. 939 if (c == '\r') { 940 ReduceRawLiteralLength(1); // Remove \r 941 if (c0_ == '\n') { 942 Advance<capture_raw>(); // Adds \n 943 } else { 944 AddRawLiteralChar('\n'); 945 } 946 c = '\n'; 947 } 948 AddLiteralChar(c); 949 } 950 } 951 literal.Complete(); 952 next_.location.end_pos = source_pos(); 953 next_.token = result; 954 return result; 955 } 956 957 958 Token::Value Scanner::ScanTemplateStart() { 959 DCHECK(c0_ == '`'); 960 next_.location.beg_pos = source_pos(); 961 Advance(); // Consume ` 962 return ScanTemplateSpan(); 963 } 964 965 966 Token::Value Scanner::ScanTemplateContinuation() { 967 DCHECK_EQ(next_.token, Token::RBRACE); 968 next_.location.beg_pos = source_pos() - 1; // We already consumed } 969 return ScanTemplateSpan(); 970 } 971 972 973 void Scanner::ScanDecimalDigits() { 974 while (IsDecimalDigit(c0_)) 975 AddLiteralCharAdvance(); 976 } 977 978 979 Token::Value Scanner::ScanNumber(bool seen_period) { 980 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 981 982 enum { 983 DECIMAL, 984 DECIMAL_WITH_LEADING_ZERO, 985 HEX, 986 OCTAL, 987 IMPLICIT_OCTAL, 988 BINARY 989 } kind = DECIMAL; 990 991 LiteralScope literal(this); 992 bool at_start = !seen_period; 993 int start_pos = source_pos(); // For reporting octal positions. 994 if (seen_period) { 995 // we have already seen a decimal point of the float 996 AddLiteralChar('.'); 997 ScanDecimalDigits(); // we know we have at least one digit 998 999 } else { 1000 // if the first character is '0' we must check for octals and hex 1001 if (c0_ == '0') { 1002 AddLiteralCharAdvance(); 1003 1004 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or 1005 // an octal number. 1006 if (c0_ == 'x' || c0_ == 'X') { 1007 // hex number 1008 kind = HEX; 1009 AddLiteralCharAdvance(); 1010 if (!IsHexDigit(c0_)) { 1011 // we must have at least one hex digit after 'x'/'X' 1012 return Token::ILLEGAL; 1013 } 1014 while (IsHexDigit(c0_)) { 1015 AddLiteralCharAdvance(); 1016 } 1017 } else if (c0_ == 'o' || c0_ == 'O') { 1018 kind = OCTAL; 1019 AddLiteralCharAdvance(); 1020 if (!IsOctalDigit(c0_)) { 1021 // we must have at least one octal digit after 'o'/'O' 1022 return Token::ILLEGAL; 1023 } 1024 while (IsOctalDigit(c0_)) { 1025 AddLiteralCharAdvance(); 1026 } 1027 } else if (c0_ == 'b' || c0_ == 'B') { 1028 kind = BINARY; 1029 AddLiteralCharAdvance(); 1030 if (!IsBinaryDigit(c0_)) { 1031 // we must have at least one binary digit after 'b'/'B' 1032 return Token::ILLEGAL; 1033 } 1034 while (IsBinaryDigit(c0_)) { 1035 AddLiteralCharAdvance(); 1036 } 1037 } else if ('0' <= c0_ && c0_ <= '7') { 1038 // (possible) octal number 1039 kind = IMPLICIT_OCTAL; 1040 while (true) { 1041 if (c0_ == '8' || c0_ == '9') { 1042 at_start = false; 1043 kind = DECIMAL_WITH_LEADING_ZERO; 1044 break; 1045 } 1046 if (c0_ < '0' || '7' < c0_) { 1047 // Octal literal finished. 1048 octal_pos_ = Location(start_pos, source_pos()); 1049 break; 1050 } 1051 AddLiteralCharAdvance(); 1052 } 1053 } else if (c0_ == '8' || c0_ == '9') { 1054 kind = DECIMAL_WITH_LEADING_ZERO; 1055 } 1056 } 1057 1058 // Parse decimal digits and allow trailing fractional part. 1059 if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) { 1060 if (at_start) { 1061 uint64_t value = 0; 1062 while (IsDecimalDigit(c0_)) { 1063 value = 10 * value + (c0_ - '0'); 1064 1065 uc32 first_char = c0_; 1066 Advance<false, false>(); 1067 AddLiteralChar(first_char); 1068 } 1069 1070 if (next_.literal_chars->one_byte_literal().length() <= 10 && 1071 value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') { 1072 next_.smi_value_ = static_cast<int>(value); 1073 literal.Complete(); 1074 HandleLeadSurrogate(); 1075 1076 if (kind == DECIMAL_WITH_LEADING_ZERO) 1077 decimal_with_leading_zero_pos_ = Location(start_pos, source_pos()); 1078 return Token::SMI; 1079 } 1080 HandleLeadSurrogate(); 1081 } 1082 1083 ScanDecimalDigits(); // optional 1084 if (c0_ == '.') { 1085 AddLiteralCharAdvance(); 1086 ScanDecimalDigits(); // optional 1087 } 1088 } 1089 } 1090 1091 // scan exponent, if any 1092 if (c0_ == 'e' || c0_ == 'E') { 1093 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 1094 if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO)) 1095 return Token::ILLEGAL; 1096 // scan exponent 1097 AddLiteralCharAdvance(); 1098 if (c0_ == '+' || c0_ == '-') 1099 AddLiteralCharAdvance(); 1100 if (!IsDecimalDigit(c0_)) { 1101 // we must have at least one decimal digit after 'e'/'E' 1102 return Token::ILLEGAL; 1103 } 1104 ScanDecimalDigits(); 1105 } 1106 1107 // The source character immediately following a numeric literal must 1108 // not be an identifier start or a decimal digit; see ECMA-262 1109 // section 7.8.3, page 17 (note that we read only one decimal digit 1110 // if the value is 0). 1111 if (IsDecimalDigit(c0_) || 1112 (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_))) 1113 return Token::ILLEGAL; 1114 1115 literal.Complete(); 1116 1117 if (kind == DECIMAL_WITH_LEADING_ZERO) 1118 decimal_with_leading_zero_pos_ = Location(start_pos, source_pos()); 1119 return Token::NUMBER; 1120 } 1121 1122 1123 uc32 Scanner::ScanIdentifierUnicodeEscape() { 1124 Advance(); 1125 if (c0_ != 'u') return -1; 1126 Advance(); 1127 return ScanUnicodeEscape<false>(); 1128 } 1129 1130 1131 template <bool capture_raw> 1132 uc32 Scanner::ScanUnicodeEscape() { 1133 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of 1134 // hex digits between { } is arbitrary. \ and u have already been read. 1135 if (c0_ == '{') { 1136 int begin = source_pos() - 2; 1137 Advance<capture_raw>(); 1138 uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin); 1139 if (cp < 0 || c0_ != '}') { 1140 ReportScannerError(source_pos(), 1141 MessageTemplate::kInvalidUnicodeEscapeSequence); 1142 return -1; 1143 } 1144 Advance<capture_raw>(); 1145 return cp; 1146 } 1147 const bool unicode = true; 1148 return ScanHexNumber<capture_raw, unicode>(4); 1149 } 1150 1151 1152 // ---------------------------------------------------------------------------- 1153 // Keyword Matcher 1154 1155 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 1156 KEYWORD_GROUP('a') \ 1157 KEYWORD("async", Token::ASYNC) \ 1158 KEYWORD("await", Token::AWAIT) \ 1159 KEYWORD_GROUP('b') \ 1160 KEYWORD("break", Token::BREAK) \ 1161 KEYWORD_GROUP('c') \ 1162 KEYWORD("case", Token::CASE) \ 1163 KEYWORD("catch", Token::CATCH) \ 1164 KEYWORD("class", Token::CLASS) \ 1165 KEYWORD("const", Token::CONST) \ 1166 KEYWORD("continue", Token::CONTINUE) \ 1167 KEYWORD_GROUP('d') \ 1168 KEYWORD("debugger", Token::DEBUGGER) \ 1169 KEYWORD("default", Token::DEFAULT) \ 1170 KEYWORD("delete", Token::DELETE) \ 1171 KEYWORD("do", Token::DO) \ 1172 KEYWORD_GROUP('e') \ 1173 KEYWORD("else", Token::ELSE) \ 1174 KEYWORD("enum", Token::ENUM) \ 1175 KEYWORD("export", Token::EXPORT) \ 1176 KEYWORD("extends", Token::EXTENDS) \ 1177 KEYWORD_GROUP('f') \ 1178 KEYWORD("false", Token::FALSE_LITERAL) \ 1179 KEYWORD("finally", Token::FINALLY) \ 1180 KEYWORD("for", Token::FOR) \ 1181 KEYWORD("function", Token::FUNCTION) \ 1182 KEYWORD_GROUP('i') \ 1183 KEYWORD("if", Token::IF) \ 1184 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 1185 KEYWORD("import", Token::IMPORT) \ 1186 KEYWORD("in", Token::IN) \ 1187 KEYWORD("instanceof", Token::INSTANCEOF) \ 1188 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 1189 KEYWORD_GROUP('l') \ 1190 KEYWORD("let", Token::LET) \ 1191 KEYWORD_GROUP('n') \ 1192 KEYWORD("new", Token::NEW) \ 1193 KEYWORD("null", Token::NULL_LITERAL) \ 1194 KEYWORD_GROUP('p') \ 1195 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 1196 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 1197 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 1198 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 1199 KEYWORD_GROUP('r') \ 1200 KEYWORD("return", Token::RETURN) \ 1201 KEYWORD_GROUP('s') \ 1202 KEYWORD("static", Token::STATIC) \ 1203 KEYWORD("super", Token::SUPER) \ 1204 KEYWORD("switch", Token::SWITCH) \ 1205 KEYWORD_GROUP('t') \ 1206 KEYWORD("this", Token::THIS) \ 1207 KEYWORD("throw", Token::THROW) \ 1208 KEYWORD("true", Token::TRUE_LITERAL) \ 1209 KEYWORD("try", Token::TRY) \ 1210 KEYWORD("typeof", Token::TYPEOF) \ 1211 KEYWORD_GROUP('v') \ 1212 KEYWORD("var", Token::VAR) \ 1213 KEYWORD("void", Token::VOID) \ 1214 KEYWORD_GROUP('w') \ 1215 KEYWORD("while", Token::WHILE) \ 1216 KEYWORD("with", Token::WITH) \ 1217 KEYWORD_GROUP('y') \ 1218 KEYWORD("yield", Token::YIELD) 1219 1220 static Token::Value KeywordOrIdentifierToken(const uint8_t* input, 1221 int input_length, bool escaped) { 1222 DCHECK(input_length >= 1); 1223 const int kMinLength = 2; 1224 const int kMaxLength = 10; 1225 if (input_length < kMinLength || input_length > kMaxLength) { 1226 return Token::IDENTIFIER; 1227 } 1228 switch (input[0]) { 1229 default: 1230 #define KEYWORD_GROUP_CASE(ch) \ 1231 break; \ 1232 case ch: 1233 #define KEYWORD(keyword, token) \ 1234 { \ 1235 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 1236 /* strlen(keyword) plus 1 for the NUL char. */ \ 1237 const int keyword_length = sizeof(keyword) - 1; \ 1238 STATIC_ASSERT(keyword_length >= kMinLength); \ 1239 STATIC_ASSERT(keyword_length <= kMaxLength); \ 1240 if (input_length == keyword_length && input[1] == keyword[1] && \ 1241 (keyword_length <= 2 || input[2] == keyword[2]) && \ 1242 (keyword_length <= 3 || input[3] == keyword[3]) && \ 1243 (keyword_length <= 4 || input[4] == keyword[4]) && \ 1244 (keyword_length <= 5 || input[5] == keyword[5]) && \ 1245 (keyword_length <= 6 || input[6] == keyword[6]) && \ 1246 (keyword_length <= 7 || input[7] == keyword[7]) && \ 1247 (keyword_length <= 8 || input[8] == keyword[8]) && \ 1248 (keyword_length <= 9 || input[9] == keyword[9])) { \ 1249 if (escaped) { \ 1250 /* TODO(adamk): YIELD should be handled specially. */ \ 1251 return (token == Token::FUTURE_STRICT_RESERVED_WORD || \ 1252 token == Token::LET || token == Token::STATIC) \ 1253 ? Token::ESCAPED_STRICT_RESERVED_WORD \ 1254 : Token::ESCAPED_KEYWORD; \ 1255 } \ 1256 return token; \ 1257 } \ 1258 } 1259 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 1260 } 1261 return Token::IDENTIFIER; 1262 } 1263 1264 1265 bool Scanner::IdentifierIsFutureStrictReserved( 1266 const AstRawString* string) const { 1267 // Keywords are always 1-byte strings. 1268 if (!string->is_one_byte()) return false; 1269 if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") || 1270 string->IsOneByteEqualTo("yield")) { 1271 return true; 1272 } 1273 return Token::FUTURE_STRICT_RESERVED_WORD == 1274 KeywordOrIdentifierToken(string->raw_data(), string->length(), false); 1275 } 1276 1277 1278 Token::Value Scanner::ScanIdentifierOrKeyword() { 1279 DCHECK(unicode_cache_->IsIdentifierStart(c0_)); 1280 LiteralScope literal(this); 1281 if (IsInRange(c0_, 'a', 'z')) { 1282 do { 1283 char first_char = static_cast<char>(c0_); 1284 Advance<false, false>(); 1285 AddLiteralChar(first_char); 1286 } while (IsInRange(c0_, 'a', 'z')); 1287 1288 if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' || 1289 c0_ == '$') { 1290 // Identifier starting with lowercase. 1291 char first_char = static_cast<char>(c0_); 1292 Advance<false, false>(); 1293 AddLiteralChar(first_char); 1294 while (IsAsciiIdentifier(c0_)) { 1295 char first_char = static_cast<char>(c0_); 1296 Advance<false, false>(); 1297 AddLiteralChar(first_char); 1298 } 1299 if (c0_ <= kMaxAscii && c0_ != '\\') { 1300 literal.Complete(); 1301 return Token::IDENTIFIER; 1302 } 1303 } else if (c0_ <= kMaxAscii && c0_ != '\\') { 1304 // Only a-z+: could be a keyword or identifier. 1305 literal.Complete(); 1306 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1307 return KeywordOrIdentifierToken(chars.start(), chars.length(), false); 1308 } 1309 1310 HandleLeadSurrogate(); 1311 } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') { 1312 do { 1313 char first_char = static_cast<char>(c0_); 1314 Advance<false, false>(); 1315 AddLiteralChar(first_char); 1316 } while (IsAsciiIdentifier(c0_)); 1317 1318 if (c0_ <= kMaxAscii && c0_ != '\\') { 1319 literal.Complete(); 1320 return Token::IDENTIFIER; 1321 } 1322 1323 HandleLeadSurrogate(); 1324 } else if (c0_ == '\\') { 1325 // Scan identifier start character. 1326 uc32 c = ScanIdentifierUnicodeEscape(); 1327 // Only allow legal identifier start characters. 1328 if (c < 0 || 1329 c == '\\' || // No recursive escapes. 1330 !unicode_cache_->IsIdentifierStart(c)) { 1331 return Token::ILLEGAL; 1332 } 1333 AddLiteralChar(c); 1334 return ScanIdentifierSuffix(&literal, true); 1335 } else { 1336 uc32 first_char = c0_; 1337 Advance(); 1338 AddLiteralChar(first_char); 1339 } 1340 1341 // Scan the rest of the identifier characters. 1342 while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) { 1343 if (c0_ != '\\') { 1344 uc32 next_char = c0_; 1345 Advance(); 1346 AddLiteralChar(next_char); 1347 continue; 1348 } 1349 // Fallthrough if no longer able to complete keyword. 1350 return ScanIdentifierSuffix(&literal, false); 1351 } 1352 1353 literal.Complete(); 1354 1355 if (next_.literal_chars->is_one_byte()) { 1356 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1357 return KeywordOrIdentifierToken(chars.start(), chars.length(), false); 1358 } 1359 return Token::IDENTIFIER; 1360 } 1361 1362 1363 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal, 1364 bool escaped) { 1365 // Scan the rest of the identifier characters. 1366 while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) { 1367 if (c0_ == '\\') { 1368 uc32 c = ScanIdentifierUnicodeEscape(); 1369 escaped = true; 1370 // Only allow legal identifier part characters. 1371 if (c < 0 || 1372 c == '\\' || 1373 !unicode_cache_->IsIdentifierPart(c)) { 1374 return Token::ILLEGAL; 1375 } 1376 AddLiteralChar(c); 1377 } else { 1378 AddLiteralChar(c0_); 1379 Advance(); 1380 } 1381 } 1382 literal->Complete(); 1383 1384 if (escaped && next_.literal_chars->is_one_byte()) { 1385 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1386 return KeywordOrIdentifierToken(chars.start(), chars.length(), true); 1387 } 1388 return Token::IDENTIFIER; 1389 } 1390 1391 1392 bool Scanner::ScanRegExpPattern(bool seen_equal) { 1393 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1394 bool in_character_class = false; 1395 1396 // Previous token is either '/' or '/=', in the second case, the 1397 // pattern starts at =. 1398 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1399 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1400 1401 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1402 // the scanner should pass uninterpreted bodies to the RegExp 1403 // constructor. 1404 LiteralScope literal(this); 1405 if (seen_equal) { 1406 AddLiteralChar('='); 1407 } 1408 1409 while (c0_ != '/' || in_character_class) { 1410 if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false; 1411 if (c0_ == '\\') { // Escape sequence. 1412 AddLiteralCharAdvance(); 1413 if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false; 1414 AddLiteralCharAdvance(); 1415 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1416 // only "safe" characters are allowed (letters, digits, underscore), 1417 // otherwise the escape isn't valid and the invalid character has 1418 // its normal meaning. I.e., we can just continue scanning without 1419 // worrying whether the following characters are part of the escape 1420 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1421 // of the escape sequence. 1422 1423 // TODO(896): At some point, parse RegExps more throughly to capture 1424 // octal esacpes in strict mode. 1425 } else { // Unescaped character. 1426 if (c0_ == '[') in_character_class = true; 1427 if (c0_ == ']') in_character_class = false; 1428 AddLiteralCharAdvance(); 1429 } 1430 } 1431 Advance(); // consume '/' 1432 1433 literal.Complete(); 1434 1435 return true; 1436 } 1437 1438 1439 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() { 1440 // Scan regular expression flags. 1441 LiteralScope literal(this); 1442 int flags = 0; 1443 while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) { 1444 RegExp::Flags flag = RegExp::kNone; 1445 switch (c0_) { 1446 case 'g': 1447 flag = RegExp::kGlobal; 1448 break; 1449 case 'i': 1450 flag = RegExp::kIgnoreCase; 1451 break; 1452 case 'm': 1453 flag = RegExp::kMultiline; 1454 break; 1455 case 'u': 1456 flag = RegExp::kUnicode; 1457 break; 1458 case 'y': 1459 flag = RegExp::kSticky; 1460 break; 1461 default: 1462 return Nothing<RegExp::Flags>(); 1463 } 1464 if (flags & flag) return Nothing<RegExp::Flags>(); 1465 AddLiteralCharAdvance(); 1466 flags |= flag; 1467 } 1468 literal.Complete(); 1469 1470 next_.location.end_pos = source_pos(); 1471 return Just(RegExp::Flags(flags)); 1472 } 1473 1474 1475 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) { 1476 if (is_literal_one_byte()) { 1477 return ast_value_factory->GetOneByteString(literal_one_byte_string()); 1478 } 1479 return ast_value_factory->GetTwoByteString(literal_two_byte_string()); 1480 } 1481 1482 1483 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) { 1484 if (is_next_literal_one_byte()) { 1485 return ast_value_factory->GetOneByteString(next_literal_one_byte_string()); 1486 } 1487 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string()); 1488 } 1489 1490 1491 const AstRawString* Scanner::CurrentRawSymbol( 1492 AstValueFactory* ast_value_factory) { 1493 if (is_raw_literal_one_byte()) { 1494 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string()); 1495 } 1496 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string()); 1497 } 1498 1499 1500 double Scanner::DoubleValue() { 1501 DCHECK(is_literal_one_byte()); 1502 return StringToDouble( 1503 unicode_cache_, 1504 literal_one_byte_string(), 1505 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY); 1506 } 1507 1508 1509 bool Scanner::ContainsDot() { 1510 DCHECK(is_literal_one_byte()); 1511 Vector<const uint8_t> str = literal_one_byte_string(); 1512 return std::find(str.begin(), str.end(), '.') != str.end(); 1513 } 1514 1515 1516 int Scanner::FindSymbol(DuplicateFinder* finder, int value) { 1517 if (is_literal_one_byte()) { 1518 return finder->AddOneByteSymbol(literal_one_byte_string(), value); 1519 } 1520 return finder->AddTwoByteSymbol(literal_two_byte_string(), value); 1521 } 1522 1523 1524 bool Scanner::SetBookmark() { 1525 if (c0_ != kNoBookmark && bookmark_c0_ == kNoBookmark && 1526 next_next_.token == Token::UNINITIALIZED && source_->SetBookmark()) { 1527 bookmark_c0_ = c0_; 1528 CopyTokenDesc(&bookmark_current_, ¤t_); 1529 CopyTokenDesc(&bookmark_next_, &next_); 1530 return true; 1531 } 1532 return false; 1533 } 1534 1535 1536 void Scanner::ResetToBookmark() { 1537 DCHECK(BookmarkHasBeenSet()); // Caller hasn't called SetBookmark. 1538 1539 source_->ResetToBookmark(); 1540 c0_ = bookmark_c0_; 1541 StartLiteral(); 1542 StartRawLiteral(); 1543 CopyTokenDesc(&next_, &bookmark_current_); 1544 current_ = next_; 1545 StartLiteral(); 1546 StartRawLiteral(); 1547 CopyTokenDesc(&next_, &bookmark_next_); 1548 1549 bookmark_c0_ = kBookmarkWasApplied; 1550 } 1551 1552 1553 bool Scanner::BookmarkHasBeenSet() { return bookmark_c0_ >= 0; } 1554 1555 1556 bool Scanner::BookmarkHasBeenReset() { 1557 return bookmark_c0_ == kBookmarkWasApplied; 1558 } 1559 1560 1561 void Scanner::DropBookmark() { bookmark_c0_ = kNoBookmark; } 1562 1563 1564 void Scanner::CopyTokenDesc(TokenDesc* to, TokenDesc* from) { 1565 DCHECK_NOT_NULL(to); 1566 DCHECK_NOT_NULL(from); 1567 to->token = from->token; 1568 to->location = from->location; 1569 to->literal_chars->CopyFrom(from->literal_chars); 1570 to->raw_literal_chars->CopyFrom(from->raw_literal_chars); 1571 } 1572 1573 1574 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) { 1575 return AddSymbol(key, true, value); 1576 } 1577 1578 1579 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) { 1580 return AddSymbol(Vector<const uint8_t>::cast(key), false, value); 1581 } 1582 1583 1584 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key, 1585 bool is_one_byte, 1586 int value) { 1587 uint32_t hash = Hash(key, is_one_byte); 1588 byte* encoding = BackupKey(key, is_one_byte); 1589 base::HashMap::Entry* entry = map_.LookupOrInsert(encoding, hash); 1590 int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value)); 1591 entry->value = 1592 reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value)); 1593 return old_value; 1594 } 1595 1596 1597 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) { 1598 DCHECK(key.length() > 0); 1599 // Quick check for already being in canonical form. 1600 if (IsNumberCanonical(key)) { 1601 return AddOneByteSymbol(key, value); 1602 } 1603 1604 int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY; 1605 double double_value = StringToDouble( 1606 unicode_constants_, key, flags, 0.0); 1607 int length; 1608 const char* string; 1609 if (!std::isfinite(double_value)) { 1610 string = "Infinity"; 1611 length = 8; // strlen("Infinity"); 1612 } else { 1613 string = DoubleToCString(double_value, 1614 Vector<char>(number_buffer_, kBufferSize)); 1615 length = StrLength(string); 1616 } 1617 return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string), 1618 length), true, value); 1619 } 1620 1621 1622 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) { 1623 // Test for a safe approximation of number literals that are already 1624 // in canonical form: max 15 digits, no leading zeroes, except an 1625 // integer part that is a single zero, and no trailing zeros below 1626 // the decimal point. 1627 int pos = 0; 1628 int length = number.length(); 1629 if (number.length() > 15) return false; 1630 if (number[pos] == '0') { 1631 pos++; 1632 } else { 1633 while (pos < length && 1634 static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++; 1635 } 1636 if (length == pos) return true; 1637 if (number[pos] != '.') return false; 1638 pos++; 1639 bool invalid_last_digit = true; 1640 while (pos < length) { 1641 uint8_t digit = number[pos] - '0'; 1642 if (digit > '9' - '0') return false; 1643 invalid_last_digit = (digit == 0); 1644 pos++; 1645 } 1646 return !invalid_last_digit; 1647 } 1648 1649 1650 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) { 1651 // Primitive hash function, almost identical to the one used 1652 // for strings (except that it's seeded by the length and representation). 1653 int length = key.length(); 1654 uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0); 1655 for (int i = 0; i < length; i++) { 1656 uint32_t c = key[i]; 1657 hash = (hash + c) * 1025; 1658 hash ^= (hash >> 6); 1659 } 1660 return hash; 1661 } 1662 1663 1664 bool DuplicateFinder::Match(void* first, void* second) { 1665 // Decode lengths. 1666 // Length + representation is encoded as base 128, most significant heptet 1667 // first, with a 8th bit being non-zero while there are more heptets. 1668 // The value encodes the number of bytes following, and whether the original 1669 // was Latin1. 1670 byte* s1 = reinterpret_cast<byte*>(first); 1671 byte* s2 = reinterpret_cast<byte*>(second); 1672 uint32_t length_one_byte_field = 0; 1673 byte c1; 1674 do { 1675 c1 = *s1; 1676 if (c1 != *s2) return false; 1677 length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f); 1678 s1++; 1679 s2++; 1680 } while ((c1 & 0x80) != 0); 1681 int length = static_cast<int>(length_one_byte_field >> 1); 1682 return memcmp(s1, s2, length) == 0; 1683 } 1684 1685 1686 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes, 1687 bool is_one_byte) { 1688 uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0); 1689 backing_store_.StartSequence(); 1690 // Emit one_byte_length as base-128 encoded number, with the 7th bit set 1691 // on the byte of every heptet except the last, least significant, one. 1692 if (one_byte_length >= (1 << 7)) { 1693 if (one_byte_length >= (1 << 14)) { 1694 if (one_byte_length >= (1 << 21)) { 1695 if (one_byte_length >= (1 << 28)) { 1696 backing_store_.Add( 1697 static_cast<uint8_t>((one_byte_length >> 28) | 0x80)); 1698 } 1699 backing_store_.Add( 1700 static_cast<uint8_t>((one_byte_length >> 21) | 0x80u)); 1701 } 1702 backing_store_.Add( 1703 static_cast<uint8_t>((one_byte_length >> 14) | 0x80u)); 1704 } 1705 backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u)); 1706 } 1707 backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f)); 1708 1709 backing_store_.AddBlock(bytes); 1710 return backing_store_.EndSequence().start(); 1711 } 1712 1713 } // namespace internal 1714 } // namespace v8 1715