// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29 30 #include <cmath> 31 32 #include "scanner.h" 33 34 #include "../include/v8stdint.h" 35 #include "char-predicates-inl.h" 36 #include "conversions-inl.h" 37 #include "list-inl.h" 38 39 namespace v8 { 40 namespace internal { 41 42 // ---------------------------------------------------------------------------- 43 // Scanner 44 45 Scanner::Scanner(UnicodeCache* unicode_cache) 46 : unicode_cache_(unicode_cache), 47 octal_pos_(Location::invalid()), 48 harmony_scoping_(false), 49 harmony_modules_(false), 50 harmony_numeric_literals_(false) { } 51 52 53 void Scanner::Initialize(Utf16CharacterStream* source) { 54 source_ = source; 55 // Need to capture identifiers in order to recognize "get" and "set" 56 // in object literals. 57 Init(); 58 // Skip initial whitespace allowing HTML comment ends just like 59 // after a newline and scan first token. 60 has_line_terminator_before_next_ = true; 61 SkipWhiteSpace(); 62 Scan(); 63 } 64 65 66 uc32 Scanner::ScanHexNumber(int expected_length) { 67 ASSERT(expected_length <= 4); // prevent overflow 68 69 uc32 digits[4] = { 0, 0, 0, 0 }; 70 uc32 x = 0; 71 for (int i = 0; i < expected_length; i++) { 72 digits[i] = c0_; 73 int d = HexValue(c0_); 74 if (d < 0) { 75 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 76 // should be illegal, but other JS VMs just return the 77 // non-escaped version of the original character. 78 79 // Push back digits that we have advanced past. 80 for (int j = i-1; j >= 0; j--) { 81 PushBack(digits[j]); 82 } 83 return -1; 84 } 85 x = x * 16 + d; 86 Advance(); 87 } 88 89 return x; 90 } 91 92 93 // Ensure that tokens can be stored in a byte. 94 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 95 96 // Table of one-character tokens, by character (0x00..0x7f only). 
97 static const byte one_char_tokens[] = { 98 Token::ILLEGAL, 99 Token::ILLEGAL, 100 Token::ILLEGAL, 101 Token::ILLEGAL, 102 Token::ILLEGAL, 103 Token::ILLEGAL, 104 Token::ILLEGAL, 105 Token::ILLEGAL, 106 Token::ILLEGAL, 107 Token::ILLEGAL, 108 Token::ILLEGAL, 109 Token::ILLEGAL, 110 Token::ILLEGAL, 111 Token::ILLEGAL, 112 Token::ILLEGAL, 113 Token::ILLEGAL, 114 Token::ILLEGAL, 115 Token::ILLEGAL, 116 Token::ILLEGAL, 117 Token::ILLEGAL, 118 Token::ILLEGAL, 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::ILLEGAL, 129 Token::ILLEGAL, 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::ILLEGAL, 133 Token::ILLEGAL, 134 Token::ILLEGAL, 135 Token::ILLEGAL, 136 Token::ILLEGAL, 137 Token::ILLEGAL, 138 Token::LPAREN, // 0x28 139 Token::RPAREN, // 0x29 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::COMMA, // 0x2c 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::ILLEGAL, 152 Token::ILLEGAL, 153 Token::ILLEGAL, 154 Token::ILLEGAL, 155 Token::ILLEGAL, 156 Token::COLON, // 0x3a 157 Token::SEMICOLON, // 0x3b 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::CONDITIONAL, // 0x3f 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::ILLEGAL, 183 Token::ILLEGAL, 184 Token::ILLEGAL, 185 Token::ILLEGAL, 186 Token::ILLEGAL, 187 Token::ILLEGAL, 188 Token::ILLEGAL, 189 Token::LBRACK, // 0x5b 190 Token::ILLEGAL, 191 Token::RBRACK, // 0x5d 192 
Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::ILLEGAL, 205 Token::ILLEGAL, 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::ILLEGAL, 211 Token::ILLEGAL, 212 Token::ILLEGAL, 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::ILLEGAL, 216 Token::ILLEGAL, 217 Token::ILLEGAL, 218 Token::ILLEGAL, 219 Token::ILLEGAL, 220 Token::ILLEGAL, 221 Token::LBRACE, // 0x7b 222 Token::ILLEGAL, 223 Token::RBRACE, // 0x7d 224 Token::BIT_NOT, // 0x7e 225 Token::ILLEGAL 226 }; 227 228 229 Token::Value Scanner::Next() { 230 current_ = next_; 231 has_line_terminator_before_next_ = false; 232 has_multiline_comment_before_next_ = false; 233 if (static_cast<unsigned>(c0_) <= 0x7f) { 234 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 235 if (token != Token::ILLEGAL) { 236 int pos = source_pos(); 237 next_.token = token; 238 next_.location.beg_pos = pos; 239 next_.location.end_pos = pos + 1; 240 Advance(); 241 return current_.token; 242 } 243 } 244 Scan(); 245 return current_.token; 246 } 247 248 249 static inline bool IsByteOrderMark(uc32 c) { 250 // The Unicode value U+FFFE is guaranteed never to be assigned as a 251 // Unicode character; this implies that in a Unicode context the 252 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 253 // character expressed in little-endian byte order (since it could 254 // not be a U+FFFE character expressed in big-endian byte 255 // order). Nevertheless, we check for it to be compatible with 256 // Spidermonkey. 
257 return c == 0xFEFF || c == 0xFFFE; 258 } 259 260 261 bool Scanner::SkipWhiteSpace() { 262 int start_position = source_pos(); 263 264 while (true) { 265 // We treat byte-order marks (BOMs) as whitespace for better 266 // compatibility with Spidermonkey and other JavaScript engines. 267 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 268 // IsWhiteSpace() includes line terminators! 269 if (unicode_cache_->IsLineTerminator(c0_)) { 270 // Ignore line terminators, but remember them. This is necessary 271 // for automatic semicolon insertion. 272 has_line_terminator_before_next_ = true; 273 } 274 Advance(); 275 } 276 277 // If there is an HTML comment end '-->' at the beginning of a 278 // line (with only whitespace in front of it), we treat the rest 279 // of the line as a comment. This is in line with the way 280 // SpiderMonkey handles it. 281 if (c0_ == '-' && has_line_terminator_before_next_) { 282 Advance(); 283 if (c0_ == '-') { 284 Advance(); 285 if (c0_ == '>') { 286 // Treat the rest of the line as a comment. 287 SkipSingleLineComment(); 288 // Continue skipping white space after the comment. 289 continue; 290 } 291 PushBack('-'); // undo Advance() 292 } 293 PushBack('-'); // undo Advance() 294 } 295 // Return whether or not we skipped any characters. 296 return source_pos() != start_position; 297 } 298 } 299 300 301 Token::Value Scanner::SkipSingleLineComment() { 302 Advance(); 303 304 // The line terminator at the end of the line is not considered 305 // to be part of the single-line comment; it is recognized 306 // separately by the lexical grammar and becomes part of the 307 // stream of input elements for the syntactic grammar (see 308 // ECMA-262, section 7.4). 
309 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 310 Advance(); 311 } 312 313 return Token::WHITESPACE; 314 } 315 316 317 Token::Value Scanner::SkipMultiLineComment() { 318 ASSERT(c0_ == '*'); 319 Advance(); 320 321 while (c0_ >= 0) { 322 uc32 ch = c0_; 323 Advance(); 324 if (unicode_cache_->IsLineTerminator(ch)) { 325 // Following ECMA-262, section 7.4, a comment containing 326 // a newline will make the comment count as a line-terminator. 327 has_multiline_comment_before_next_ = true; 328 } 329 // If we have reached the end of the multi-line comment, we 330 // consume the '/' and insert a whitespace. This way all 331 // multi-line comments are treated as whitespace. 332 if (ch == '*' && c0_ == '/') { 333 c0_ = ' '; 334 return Token::WHITESPACE; 335 } 336 } 337 338 // Unterminated multi-line comment. 339 return Token::ILLEGAL; 340 } 341 342 343 Token::Value Scanner::ScanHtmlComment() { 344 // Check for <!-- comments. 345 ASSERT(c0_ == '!'); 346 Advance(); 347 if (c0_ == '-') { 348 Advance(); 349 if (c0_ == '-') return SkipSingleLineComment(); 350 PushBack('-'); // undo Advance() 351 } 352 PushBack('!'); // undo Advance() 353 ASSERT(c0_ == '!'); 354 return Token::LT; 355 } 356 357 358 void Scanner::Scan() { 359 next_.literal_chars = NULL; 360 Token::Value token; 361 do { 362 // Remember the position of the next token 363 next_.location.beg_pos = source_pos(); 364 365 switch (c0_) { 366 case ' ': 367 case '\t': 368 Advance(); 369 token = Token::WHITESPACE; 370 break; 371 372 case '\n': 373 Advance(); 374 has_line_terminator_before_next_ = true; 375 token = Token::WHITESPACE; 376 break; 377 378 case '"': case '\'': 379 token = ScanString(); 380 break; 381 382 case '<': 383 // < <= << <<= <!-- 384 Advance(); 385 if (c0_ == '=') { 386 token = Select(Token::LTE); 387 } else if (c0_ == '<') { 388 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 389 } else if (c0_ == '!') { 390 token = ScanHtmlComment(); 391 } else { 392 token = Token::LT; 393 } 394 
break; 395 396 case '>': 397 // > >= >> >>= >>> >>>= 398 Advance(); 399 if (c0_ == '=') { 400 token = Select(Token::GTE); 401 } else if (c0_ == '>') { 402 // >> >>= >>> >>>= 403 Advance(); 404 if (c0_ == '=') { 405 token = Select(Token::ASSIGN_SAR); 406 } else if (c0_ == '>') { 407 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 408 } else { 409 token = Token::SAR; 410 } 411 } else { 412 token = Token::GT; 413 } 414 break; 415 416 case '=': 417 // = == === 418 Advance(); 419 if (c0_ == '=') { 420 token = Select('=', Token::EQ_STRICT, Token::EQ); 421 } else { 422 token = Token::ASSIGN; 423 } 424 break; 425 426 case '!': 427 // ! != !== 428 Advance(); 429 if (c0_ == '=') { 430 token = Select('=', Token::NE_STRICT, Token::NE); 431 } else { 432 token = Token::NOT; 433 } 434 break; 435 436 case '+': 437 // + ++ += 438 Advance(); 439 if (c0_ == '+') { 440 token = Select(Token::INC); 441 } else if (c0_ == '=') { 442 token = Select(Token::ASSIGN_ADD); 443 } else { 444 token = Token::ADD; 445 } 446 break; 447 448 case '-': 449 // - -- --> -= 450 Advance(); 451 if (c0_ == '-') { 452 Advance(); 453 if (c0_ == '>' && has_line_terminator_before_next_) { 454 // For compatibility with SpiderMonkey, we skip lines that 455 // start with an HTML comment end '-->'. 
456 token = SkipSingleLineComment(); 457 } else { 458 token = Token::DEC; 459 } 460 } else if (c0_ == '=') { 461 token = Select(Token::ASSIGN_SUB); 462 } else { 463 token = Token::SUB; 464 } 465 break; 466 467 case '*': 468 // * *= 469 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 470 break; 471 472 case '%': 473 // % %= 474 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 475 break; 476 477 case '/': 478 // / // /* /= 479 Advance(); 480 if (c0_ == '/') { 481 token = SkipSingleLineComment(); 482 } else if (c0_ == '*') { 483 token = SkipMultiLineComment(); 484 } else if (c0_ == '=') { 485 token = Select(Token::ASSIGN_DIV); 486 } else { 487 token = Token::DIV; 488 } 489 break; 490 491 case '&': 492 // & && &= 493 Advance(); 494 if (c0_ == '&') { 495 token = Select(Token::AND); 496 } else if (c0_ == '=') { 497 token = Select(Token::ASSIGN_BIT_AND); 498 } else { 499 token = Token::BIT_AND; 500 } 501 break; 502 503 case '|': 504 // | || |= 505 Advance(); 506 if (c0_ == '|') { 507 token = Select(Token::OR); 508 } else if (c0_ == '=') { 509 token = Select(Token::ASSIGN_BIT_OR); 510 } else { 511 token = Token::BIT_OR; 512 } 513 break; 514 515 case '^': 516 // ^ ^= 517 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 518 break; 519 520 case '.': 521 // . 
Number 522 Advance(); 523 if (IsDecimalDigit(c0_)) { 524 token = ScanNumber(true); 525 } else { 526 token = Token::PERIOD; 527 } 528 break; 529 530 case ':': 531 token = Select(Token::COLON); 532 break; 533 534 case ';': 535 token = Select(Token::SEMICOLON); 536 break; 537 538 case ',': 539 token = Select(Token::COMMA); 540 break; 541 542 case '(': 543 token = Select(Token::LPAREN); 544 break; 545 546 case ')': 547 token = Select(Token::RPAREN); 548 break; 549 550 case '[': 551 token = Select(Token::LBRACK); 552 break; 553 554 case ']': 555 token = Select(Token::RBRACK); 556 break; 557 558 case '{': 559 token = Select(Token::LBRACE); 560 break; 561 562 case '}': 563 token = Select(Token::RBRACE); 564 break; 565 566 case '?': 567 token = Select(Token::CONDITIONAL); 568 break; 569 570 case '~': 571 token = Select(Token::BIT_NOT); 572 break; 573 574 default: 575 if (unicode_cache_->IsIdentifierStart(c0_)) { 576 token = ScanIdentifierOrKeyword(); 577 } else if (IsDecimalDigit(c0_)) { 578 token = ScanNumber(false); 579 } else if (SkipWhiteSpace()) { 580 token = Token::WHITESPACE; 581 } else if (c0_ < 0) { 582 token = Token::EOS; 583 } else { 584 token = Select(Token::ILLEGAL); 585 } 586 break; 587 } 588 589 // Continue scanning for tokens as long as we're just skipping 590 // whitespace. 591 } while (token == Token::WHITESPACE); 592 593 next_.location.end_pos = source_pos(); 594 next_.token = token; 595 } 596 597 598 void Scanner::SeekForward(int pos) { 599 // After this call, we will have the token at the given position as 600 // the "next" token. The "current" token will be invalid. 601 if (pos == next_.location.beg_pos) return; 602 int current_pos = source_pos(); 603 ASSERT_EQ(next_.location.end_pos, current_pos); 604 // Positions inside the lookahead token aren't supported. 
605 ASSERT(pos >= current_pos); 606 if (pos != current_pos) { 607 source_->SeekForward(pos - source_->pos()); 608 Advance(); 609 // This function is only called to seek to the location 610 // of the end of a function (at the "}" token). It doesn't matter 611 // whether there was a line terminator in the part we skip. 612 has_line_terminator_before_next_ = false; 613 has_multiline_comment_before_next_ = false; 614 } 615 Scan(); 616 } 617 618 619 bool Scanner::ScanEscape() { 620 uc32 c = c0_; 621 Advance(); 622 623 // Skip escaped newlines. 624 if (unicode_cache_->IsLineTerminator(c)) { 625 // Allow CR+LF newlines in multiline string literals. 626 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 627 // Allow LF+CR newlines in multiline string literals. 628 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 629 return true; 630 } 631 632 switch (c) { 633 case '\'': // fall through 634 case '"' : // fall through 635 case '\\': break; 636 case 'b' : c = '\b'; break; 637 case 'f' : c = '\f'; break; 638 case 'n' : c = '\n'; break; 639 case 'r' : c = '\r'; break; 640 case 't' : c = '\t'; break; 641 case 'u' : { 642 c = ScanHexNumber(4); 643 if (c < 0) return false; 644 break; 645 } 646 case 'v' : c = '\v'; break; 647 case 'x' : { 648 c = ScanHexNumber(2); 649 if (c < 0) return false; 650 break; 651 } 652 case '0' : // fall through 653 case '1' : // fall through 654 case '2' : // fall through 655 case '3' : // fall through 656 case '4' : // fall through 657 case '5' : // fall through 658 case '6' : // fall through 659 case '7' : c = ScanOctalEscape(c, 2); break; 660 } 661 662 // According to ECMA-262, section 7.8.4, characters not covered by the 663 // above cases should be illegal, but they are commonly handled as 664 // non-escaped characters by JS VMs. 665 AddLiteralChar(c); 666 return true; 667 } 668 669 670 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 671 // ECMA-262. Other JS VMs support them. 
672 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 673 uc32 x = c - '0'; 674 int i = 0; 675 for (; i < length; i++) { 676 int d = c0_ - '0'; 677 if (d < 0 || d > 7) break; 678 int nx = x * 8 + d; 679 if (nx >= 256) break; 680 x = nx; 681 Advance(); 682 } 683 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 684 // Remember the position of octal escape sequences so that an error 685 // can be reported later (in strict mode). 686 // We don't report the error immediately, because the octal escape can 687 // occur before the "use strict" directive. 688 if (c != '0' || i > 0) { 689 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 690 } 691 return x; 692 } 693 694 695 Token::Value Scanner::ScanString() { 696 uc32 quote = c0_; 697 Advance(); // consume quote 698 699 LiteralScope literal(this); 700 while (c0_ != quote && c0_ >= 0 701 && !unicode_cache_->IsLineTerminator(c0_)) { 702 uc32 c = c0_; 703 Advance(); 704 if (c == '\\') { 705 if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL; 706 } else { 707 AddLiteralChar(c); 708 } 709 } 710 if (c0_ != quote) return Token::ILLEGAL; 711 literal.Complete(); 712 713 Advance(); // consume quote 714 return Token::STRING; 715 } 716 717 718 void Scanner::ScanDecimalDigits() { 719 while (IsDecimalDigit(c0_)) 720 AddLiteralCharAdvance(); 721 } 722 723 724 Token::Value Scanner::ScanNumber(bool seen_period) { 725 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 726 727 enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL; 728 729 LiteralScope literal(this); 730 if (seen_period) { 731 // we have already seen a decimal point of the float 732 AddLiteralChar('.'); 733 ScanDecimalDigits(); // we know we have at least one digit 734 735 } else { 736 // if the first character is '0' we must check for octals and hex 737 if (c0_ == '0') { 738 int start_pos = source_pos(); // For reporting octal positions. 
739 AddLiteralCharAdvance(); 740 741 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or 742 // an octal number. 743 if (c0_ == 'x' || c0_ == 'X') { 744 // hex number 745 kind = HEX; 746 AddLiteralCharAdvance(); 747 if (!IsHexDigit(c0_)) { 748 // we must have at least one hex digit after 'x'/'X' 749 return Token::ILLEGAL; 750 } 751 while (IsHexDigit(c0_)) { 752 AddLiteralCharAdvance(); 753 } 754 } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) { 755 kind = OCTAL; 756 AddLiteralCharAdvance(); 757 if (!IsOctalDigit(c0_)) { 758 // we must have at least one octal digit after 'o'/'O' 759 return Token::ILLEGAL; 760 } 761 while (IsOctalDigit(c0_)) { 762 AddLiteralCharAdvance(); 763 } 764 } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) { 765 kind = BINARY; 766 AddLiteralCharAdvance(); 767 if (!IsBinaryDigit(c0_)) { 768 // we must have at least one binary digit after 'b'/'B' 769 return Token::ILLEGAL; 770 } 771 while (IsBinaryDigit(c0_)) { 772 AddLiteralCharAdvance(); 773 } 774 } else if ('0' <= c0_ && c0_ <= '7') { 775 // (possible) octal number 776 kind = IMPLICIT_OCTAL; 777 while (true) { 778 if (c0_ == '8' || c0_ == '9') { 779 kind = DECIMAL; 780 break; 781 } 782 if (c0_ < '0' || '7' < c0_) { 783 // Octal literal finished. 784 octal_pos_ = Location(start_pos, source_pos()); 785 break; 786 } 787 AddLiteralCharAdvance(); 788 } 789 } 790 } 791 792 // Parse decimal digits and allow trailing fractional part. 
793 if (kind == DECIMAL) { 794 ScanDecimalDigits(); // optional 795 if (c0_ == '.') { 796 AddLiteralCharAdvance(); 797 ScanDecimalDigits(); // optional 798 } 799 } 800 } 801 802 // scan exponent, if any 803 if (c0_ == 'e' || c0_ == 'E') { 804 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 805 if (kind != DECIMAL) return Token::ILLEGAL; 806 // scan exponent 807 AddLiteralCharAdvance(); 808 if (c0_ == '+' || c0_ == '-') 809 AddLiteralCharAdvance(); 810 if (!IsDecimalDigit(c0_)) { 811 // we must have at least one decimal digit after 'e'/'E' 812 return Token::ILLEGAL; 813 } 814 ScanDecimalDigits(); 815 } 816 817 // The source character immediately following a numeric literal must 818 // not be an identifier start or a decimal digit; see ECMA-262 819 // section 7.8.3, page 17 (note that we read only one decimal digit 820 // if the value is 0). 821 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 822 return Token::ILLEGAL; 823 824 literal.Complete(); 825 826 return Token::NUMBER; 827 } 828 829 830 uc32 Scanner::ScanIdentifierUnicodeEscape() { 831 Advance(); 832 if (c0_ != 'u') return -1; 833 Advance(); 834 uc32 result = ScanHexNumber(4); 835 if (result < 0) PushBack('u'); 836 return result; 837 } 838 839 840 // ---------------------------------------------------------------------------- 841 // Keyword Matcher 842 843 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 844 KEYWORD_GROUP('b') \ 845 KEYWORD("break", Token::BREAK) \ 846 KEYWORD_GROUP('c') \ 847 KEYWORD("case", Token::CASE) \ 848 KEYWORD("catch", Token::CATCH) \ 849 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \ 850 KEYWORD("const", Token::CONST) \ 851 KEYWORD("continue", Token::CONTINUE) \ 852 KEYWORD_GROUP('d') \ 853 KEYWORD("debugger", Token::DEBUGGER) \ 854 KEYWORD("default", Token::DEFAULT) \ 855 KEYWORD("delete", Token::DELETE) \ 856 KEYWORD("do", Token::DO) \ 857 KEYWORD_GROUP('e') \ 858 KEYWORD("else", Token::ELSE) \ 859 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) 
\ 860 KEYWORD("export", harmony_modules \ 861 ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \ 862 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \ 863 KEYWORD_GROUP('f') \ 864 KEYWORD("false", Token::FALSE_LITERAL) \ 865 KEYWORD("finally", Token::FINALLY) \ 866 KEYWORD("for", Token::FOR) \ 867 KEYWORD("function", Token::FUNCTION) \ 868 KEYWORD_GROUP('i') \ 869 KEYWORD("if", Token::IF) \ 870 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 871 KEYWORD("import", harmony_modules \ 872 ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \ 873 KEYWORD("in", Token::IN) \ 874 KEYWORD("instanceof", Token::INSTANCEOF) \ 875 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 876 KEYWORD_GROUP('l') \ 877 KEYWORD("let", harmony_scoping \ 878 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ 879 KEYWORD_GROUP('n') \ 880 KEYWORD("new", Token::NEW) \ 881 KEYWORD("null", Token::NULL_LITERAL) \ 882 KEYWORD_GROUP('p') \ 883 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 884 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 885 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 886 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 887 KEYWORD_GROUP('r') \ 888 KEYWORD("return", Token::RETURN) \ 889 KEYWORD_GROUP('s') \ 890 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \ 891 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \ 892 KEYWORD("switch", Token::SWITCH) \ 893 KEYWORD_GROUP('t') \ 894 KEYWORD("this", Token::THIS) \ 895 KEYWORD("throw", Token::THROW) \ 896 KEYWORD("true", Token::TRUE_LITERAL) \ 897 KEYWORD("try", Token::TRY) \ 898 KEYWORD("typeof", Token::TYPEOF) \ 899 KEYWORD_GROUP('v') \ 900 KEYWORD("var", Token::VAR) \ 901 KEYWORD("void", Token::VOID) \ 902 KEYWORD_GROUP('w') \ 903 KEYWORD("while", Token::WHILE) \ 904 KEYWORD("with", Token::WITH) \ 905 KEYWORD_GROUP('y') \ 906 KEYWORD("yield", Token::YIELD) 907 908 909 static Token::Value KeywordOrIdentifierToken(const char* input, 910 int input_length, 911 bool 
harmony_scoping, 912 bool harmony_modules) { 913 ASSERT(input_length >= 1); 914 const int kMinLength = 2; 915 const int kMaxLength = 10; 916 if (input_length < kMinLength || input_length > kMaxLength) { 917 return Token::IDENTIFIER; 918 } 919 switch (input[0]) { 920 default: 921 #define KEYWORD_GROUP_CASE(ch) \ 922 break; \ 923 case ch: 924 #define KEYWORD(keyword, token) \ 925 { \ 926 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 927 /* strlen(keyword) plus 1 for the NUL char. */ \ 928 const int keyword_length = sizeof(keyword) - 1; \ 929 STATIC_ASSERT(keyword_length >= kMinLength); \ 930 STATIC_ASSERT(keyword_length <= kMaxLength); \ 931 if (input_length == keyword_length && \ 932 input[1] == keyword[1] && \ 933 (keyword_length <= 2 || input[2] == keyword[2]) && \ 934 (keyword_length <= 3 || input[3] == keyword[3]) && \ 935 (keyword_length <= 4 || input[4] == keyword[4]) && \ 936 (keyword_length <= 5 || input[5] == keyword[5]) && \ 937 (keyword_length <= 6 || input[6] == keyword[6]) && \ 938 (keyword_length <= 7 || input[7] == keyword[7]) && \ 939 (keyword_length <= 8 || input[8] == keyword[8]) && \ 940 (keyword_length <= 9 || input[9] == keyword[9])) { \ 941 return token; \ 942 } \ 943 } 944 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 945 } 946 return Token::IDENTIFIER; 947 } 948 949 950 Token::Value Scanner::ScanIdentifierOrKeyword() { 951 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); 952 LiteralScope literal(this); 953 // Scan identifier start character. 954 if (c0_ == '\\') { 955 uc32 c = ScanIdentifierUnicodeEscape(); 956 // Only allow legal identifier start characters. 957 if (c < 0 || 958 c == '\\' || // No recursive escapes. 959 !unicode_cache_->IsIdentifierStart(c)) { 960 return Token::ILLEGAL; 961 } 962 AddLiteralChar(c); 963 return ScanIdentifierSuffix(&literal); 964 } 965 966 uc32 first_char = c0_; 967 Advance(); 968 AddLiteralChar(first_char); 969 970 // Scan the rest of the identifier characters. 
971 while (unicode_cache_->IsIdentifierPart(c0_)) { 972 if (c0_ != '\\') { 973 uc32 next_char = c0_; 974 Advance(); 975 AddLiteralChar(next_char); 976 continue; 977 } 978 // Fallthrough if no longer able to complete keyword. 979 return ScanIdentifierSuffix(&literal); 980 } 981 982 literal.Complete(); 983 984 if (next_.literal_chars->is_ascii()) { 985 Vector<const char> chars = next_.literal_chars->ascii_literal(); 986 return KeywordOrIdentifierToken(chars.start(), 987 chars.length(), 988 harmony_scoping_, 989 harmony_modules_); 990 } 991 992 return Token::IDENTIFIER; 993 } 994 995 996 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) { 997 // Scan the rest of the identifier characters. 998 while (unicode_cache_->IsIdentifierPart(c0_)) { 999 if (c0_ == '\\') { 1000 uc32 c = ScanIdentifierUnicodeEscape(); 1001 // Only allow legal identifier part characters. 1002 if (c < 0 || 1003 c == '\\' || 1004 !unicode_cache_->IsIdentifierPart(c)) { 1005 return Token::ILLEGAL; 1006 } 1007 AddLiteralChar(c); 1008 } else { 1009 AddLiteralChar(c0_); 1010 Advance(); 1011 } 1012 } 1013 literal->Complete(); 1014 1015 return Token::IDENTIFIER; 1016 } 1017 1018 1019 bool Scanner::ScanRegExpPattern(bool seen_equal) { 1020 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1021 bool in_character_class = false; 1022 1023 // Previous token is either '/' or '/=', in the second case, the 1024 // pattern starts at =. 1025 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1026 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1027 1028 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1029 // the scanner should pass uninterpreted bodies to the RegExp 1030 // constructor. 
1031 LiteralScope literal(this); 1032 if (seen_equal) { 1033 AddLiteralChar('='); 1034 } 1035 1036 while (c0_ != '/' || in_character_class) { 1037 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1038 if (c0_ == '\\') { // Escape sequence. 1039 AddLiteralCharAdvance(); 1040 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1041 AddLiteralCharAdvance(); 1042 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1043 // only "safe" characters are allowed (letters, digits, underscore), 1044 // otherwise the escape isn't valid and the invalid character has 1045 // its normal meaning. I.e., we can just continue scanning without 1046 // worrying whether the following characters are part of the escape 1047 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1048 // of the escape sequence. 1049 1050 // TODO(896): At some point, parse RegExps more throughly to capture 1051 // octal esacpes in strict mode. 1052 } else { // Unescaped character. 1053 if (c0_ == '[') in_character_class = true; 1054 if (c0_ == ']') in_character_class = false; 1055 AddLiteralCharAdvance(); 1056 } 1057 } 1058 Advance(); // consume '/' 1059 1060 literal.Complete(); 1061 1062 return true; 1063 } 1064 1065 1066 bool Scanner::ScanLiteralUnicodeEscape() { 1067 ASSERT(c0_ == '\\'); 1068 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; 1069 Advance(); 1070 int i = 1; 1071 if (c0_ == 'u') { 1072 i++; 1073 while (i < 6) { 1074 Advance(); 1075 if (!IsHexDigit(c0_)) break; 1076 chars_read[i] = c0_; 1077 i++; 1078 } 1079 } 1080 if (i < 6) { 1081 // Incomplete escape. Undo all advances and return false. 1082 while (i > 0) { 1083 i--; 1084 PushBack(chars_read[i]); 1085 } 1086 return false; 1087 } 1088 // Complete escape. Add all chars to current literal buffer. 1089 for (int i = 0; i < 6; i++) { 1090 AddLiteralChar(chars_read[i]); 1091 } 1092 return true; 1093 } 1094 1095 1096 bool Scanner::ScanRegExpFlags() { 1097 // Scan regular expression flags. 
1098 LiteralScope literal(this); 1099 while (unicode_cache_->IsIdentifierPart(c0_)) { 1100 if (c0_ != '\\') { 1101 AddLiteralCharAdvance(); 1102 } else { 1103 if (!ScanLiteralUnicodeEscape()) { 1104 break; 1105 } 1106 Advance(); 1107 } 1108 } 1109 literal.Complete(); 1110 1111 next_.location.end_pos = source_pos() - 1; 1112 return true; 1113 } 1114 1115 1116 int DuplicateFinder::AddAsciiSymbol(Vector<const char> key, int value) { 1117 return AddSymbol(Vector<const byte>::cast(key), true, value); 1118 } 1119 1120 1121 int DuplicateFinder::AddUtf16Symbol(Vector<const uint16_t> key, int value) { 1122 return AddSymbol(Vector<const byte>::cast(key), false, value); 1123 } 1124 1125 1126 int DuplicateFinder::AddSymbol(Vector<const byte> key, 1127 bool is_ascii, 1128 int value) { 1129 uint32_t hash = Hash(key, is_ascii); 1130 byte* encoding = BackupKey(key, is_ascii); 1131 HashMap::Entry* entry = map_.Lookup(encoding, hash, true); 1132 int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value)); 1133 entry->value = 1134 reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value)); 1135 return old_value; 1136 } 1137 1138 1139 int DuplicateFinder::AddNumber(Vector<const char> key, int value) { 1140 ASSERT(key.length() > 0); 1141 // Quick check for already being in canonical form. 
1142 if (IsNumberCanonical(key)) { 1143 return AddAsciiSymbol(key, value); 1144 } 1145 1146 int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY; 1147 double double_value = StringToDouble(unicode_constants_, key, flags, 0.0); 1148 int length; 1149 const char* string; 1150 if (!std::isfinite(double_value)) { 1151 string = "Infinity"; 1152 length = 8; // strlen("Infinity"); 1153 } else { 1154 string = DoubleToCString(double_value, 1155 Vector<char>(number_buffer_, kBufferSize)); 1156 length = StrLength(string); 1157 } 1158 return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string), 1159 length), true, value); 1160 } 1161 1162 1163 bool DuplicateFinder::IsNumberCanonical(Vector<const char> number) { 1164 // Test for a safe approximation of number literals that are already 1165 // in canonical form: max 15 digits, no leading zeroes, except an 1166 // integer part that is a single zero, and no trailing zeros below 1167 // the decimal point. 1168 int pos = 0; 1169 int length = number.length(); 1170 if (number.length() > 15) return false; 1171 if (number[pos] == '0') { 1172 pos++; 1173 } else { 1174 while (pos < length && 1175 static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++; 1176 } 1177 if (length == pos) return true; 1178 if (number[pos] != '.') return false; 1179 pos++; 1180 bool invalid_last_digit = true; 1181 while (pos < length) { 1182 byte digit = number[pos] - '0'; 1183 if (digit > '9' - '0') return false; 1184 invalid_last_digit = (digit == 0); 1185 pos++; 1186 } 1187 return !invalid_last_digit; 1188 } 1189 1190 1191 uint32_t DuplicateFinder::Hash(Vector<const byte> key, bool is_ascii) { 1192 // Primitive hash function, almost identical to the one used 1193 // for strings (except that it's seeded by the length and ASCII-ness). 1194 int length = key.length(); 1195 uint32_t hash = (length << 1) | (is_ascii ? 
1 : 0) ; 1196 for (int i = 0; i < length; i++) { 1197 uint32_t c = key[i]; 1198 hash = (hash + c) * 1025; 1199 hash ^= (hash >> 6); 1200 } 1201 return hash; 1202 } 1203 1204 1205 bool DuplicateFinder::Match(void* first, void* second) { 1206 // Decode lengths. 1207 // Length + ASCII-bit is encoded as base 128, most significant heptet first, 1208 // with a 8th bit being non-zero while there are more heptets. 1209 // The value encodes the number of bytes following, and whether the original 1210 // was ASCII. 1211 byte* s1 = reinterpret_cast<byte*>(first); 1212 byte* s2 = reinterpret_cast<byte*>(second); 1213 uint32_t length_ascii_field = 0; 1214 byte c1; 1215 do { 1216 c1 = *s1; 1217 if (c1 != *s2) return false; 1218 length_ascii_field = (length_ascii_field << 7) | (c1 & 0x7f); 1219 s1++; 1220 s2++; 1221 } while ((c1 & 0x80) != 0); 1222 int length = static_cast<int>(length_ascii_field >> 1); 1223 return memcmp(s1, s2, length) == 0; 1224 } 1225 1226 1227 byte* DuplicateFinder::BackupKey(Vector<const byte> bytes, 1228 bool is_ascii) { 1229 uint32_t ascii_length = (bytes.length() << 1) | (is_ascii ? 1 : 0); 1230 backing_store_.StartSequence(); 1231 // Emit ascii_length as base-128 encoded number, with the 7th bit set 1232 // on the byte of every heptet except the last, least significant, one. 1233 if (ascii_length >= (1 << 7)) { 1234 if (ascii_length >= (1 << 14)) { 1235 if (ascii_length >= (1 << 21)) { 1236 if (ascii_length >= (1 << 28)) { 1237 backing_store_.Add(static_cast<byte>((ascii_length >> 28) | 0x80)); 1238 } 1239 backing_store_.Add(static_cast<byte>((ascii_length >> 21) | 0x80u)); 1240 } 1241 backing_store_.Add(static_cast<byte>((ascii_length >> 14) | 0x80u)); 1242 } 1243 backing_store_.Add(static_cast<byte>((ascii_length >> 7) | 0x80u)); 1244 } 1245 backing_store_.Add(static_cast<byte>(ascii_length & 0x7f)); 1246 1247 backing_store_.AddBlock(bytes); 1248 return backing_store_.EndSequence().start(); 1249 } 1250 1251 } } // namespace v8::internal 1252