// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29 30 #include "scanner.h" 31 32 #include "../include/v8stdint.h" 33 #include "char-predicates-inl.h" 34 35 namespace v8 { 36 namespace internal { 37 38 // ---------------------------------------------------------------------------- 39 // Scanner 40 41 Scanner::Scanner(UnicodeCache* unicode_cache) 42 : unicode_cache_(unicode_cache), 43 octal_pos_(Location::invalid()), 44 harmony_scoping_(false), 45 harmony_modules_(false) { } 46 47 48 void Scanner::Initialize(Utf16CharacterStream* source) { 49 source_ = source; 50 // Need to capture identifiers in order to recognize "get" and "set" 51 // in object literals. 52 Init(); 53 // Skip initial whitespace allowing HTML comment ends just like 54 // after a newline and scan first token. 55 has_line_terminator_before_next_ = true; 56 SkipWhiteSpace(); 57 Scan(); 58 } 59 60 61 uc32 Scanner::ScanHexNumber(int expected_length) { 62 ASSERT(expected_length <= 4); // prevent overflow 63 64 uc32 digits[4] = { 0, 0, 0, 0 }; 65 uc32 x = 0; 66 for (int i = 0; i < expected_length; i++) { 67 digits[i] = c0_; 68 int d = HexValue(c0_); 69 if (d < 0) { 70 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 71 // should be illegal, but other JS VMs just return the 72 // non-escaped version of the original character. 73 74 // Push back digits that we have advanced past. 75 for (int j = i-1; j >= 0; j--) { 76 PushBack(digits[j]); 77 } 78 return -1; 79 } 80 x = x * 16 + d; 81 Advance(); 82 } 83 84 return x; 85 } 86 87 88 // Ensure that tokens can be stored in a byte. 89 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 90 91 // Table of one-character tokens, by character (0x00..0x7f only). 
92 static const byte one_char_tokens[] = { 93 Token::ILLEGAL, 94 Token::ILLEGAL, 95 Token::ILLEGAL, 96 Token::ILLEGAL, 97 Token::ILLEGAL, 98 Token::ILLEGAL, 99 Token::ILLEGAL, 100 Token::ILLEGAL, 101 Token::ILLEGAL, 102 Token::ILLEGAL, 103 Token::ILLEGAL, 104 Token::ILLEGAL, 105 Token::ILLEGAL, 106 Token::ILLEGAL, 107 Token::ILLEGAL, 108 Token::ILLEGAL, 109 Token::ILLEGAL, 110 Token::ILLEGAL, 111 Token::ILLEGAL, 112 Token::ILLEGAL, 113 Token::ILLEGAL, 114 Token::ILLEGAL, 115 Token::ILLEGAL, 116 Token::ILLEGAL, 117 Token::ILLEGAL, 118 Token::ILLEGAL, 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::ILLEGAL, 129 Token::ILLEGAL, 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::ILLEGAL, 133 Token::LPAREN, // 0x28 134 Token::RPAREN, // 0x29 135 Token::ILLEGAL, 136 Token::ILLEGAL, 137 Token::COMMA, // 0x2c 138 Token::ILLEGAL, 139 Token::ILLEGAL, 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::ILLEGAL, 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::COLON, // 0x3a 152 Token::SEMICOLON, // 0x3b 153 Token::ILLEGAL, 154 Token::ILLEGAL, 155 Token::ILLEGAL, 156 Token::CONDITIONAL, // 0x3f 157 Token::ILLEGAL, 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::ILLEGAL, 183 Token::ILLEGAL, 184 Token::LBRACK, // 0x5b 185 Token::ILLEGAL, 186 Token::RBRACK, // 0x5d 187 Token::ILLEGAL, 
188 Token::ILLEGAL, 189 Token::ILLEGAL, 190 Token::ILLEGAL, 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::ILLEGAL, 205 Token::ILLEGAL, 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::ILLEGAL, 211 Token::ILLEGAL, 212 Token::ILLEGAL, 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::ILLEGAL, 216 Token::LBRACE, // 0x7b 217 Token::ILLEGAL, 218 Token::RBRACE, // 0x7d 219 Token::BIT_NOT, // 0x7e 220 Token::ILLEGAL 221 }; 222 223 224 Token::Value Scanner::Next() { 225 current_ = next_; 226 has_line_terminator_before_next_ = false; 227 has_multiline_comment_before_next_ = false; 228 if (static_cast<unsigned>(c0_) <= 0x7f) { 229 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 230 if (token != Token::ILLEGAL) { 231 int pos = source_pos(); 232 next_.token = token; 233 next_.location.beg_pos = pos; 234 next_.location.end_pos = pos + 1; 235 Advance(); 236 return current_.token; 237 } 238 } 239 Scan(); 240 return current_.token; 241 } 242 243 244 static inline bool IsByteOrderMark(uc32 c) { 245 // The Unicode value U+FFFE is guaranteed never to be assigned as a 246 // Unicode character; this implies that in a Unicode context the 247 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 248 // character expressed in little-endian byte order (since it could 249 // not be a U+FFFE character expressed in big-endian byte 250 // order). Nevertheless, we check for it to be compatible with 251 // Spidermonkey. 252 return c == 0xFEFF || c == 0xFFFE; 253 } 254 255 256 bool Scanner::SkipWhiteSpace() { 257 int start_position = source_pos(); 258 259 while (true) { 260 // We treat byte-order marks (BOMs) as whitespace for better 261 // compatibility with Spidermonkey and other JavaScript engines. 
262 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 263 // IsWhiteSpace() includes line terminators! 264 if (unicode_cache_->IsLineTerminator(c0_)) { 265 // Ignore line terminators, but remember them. This is necessary 266 // for automatic semicolon insertion. 267 has_line_terminator_before_next_ = true; 268 } 269 Advance(); 270 } 271 272 // If there is an HTML comment end '-->' at the beginning of a 273 // line (with only whitespace in front of it), we treat the rest 274 // of the line as a comment. This is in line with the way 275 // SpiderMonkey handles it. 276 if (c0_ == '-' && has_line_terminator_before_next_) { 277 Advance(); 278 if (c0_ == '-') { 279 Advance(); 280 if (c0_ == '>') { 281 // Treat the rest of the line as a comment. 282 SkipSingleLineComment(); 283 // Continue skipping white space after the comment. 284 continue; 285 } 286 PushBack('-'); // undo Advance() 287 } 288 PushBack('-'); // undo Advance() 289 } 290 // Return whether or not we skipped any characters. 291 return source_pos() != start_position; 292 } 293 } 294 295 296 Token::Value Scanner::SkipSingleLineComment() { 297 Advance(); 298 299 // The line terminator at the end of the line is not considered 300 // to be part of the single-line comment; it is recognized 301 // separately by the lexical grammar and becomes part of the 302 // stream of input elements for the syntactic grammar (see 303 // ECMA-262, section 7.4). 304 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 305 Advance(); 306 } 307 308 return Token::WHITESPACE; 309 } 310 311 312 Token::Value Scanner::SkipMultiLineComment() { 313 ASSERT(c0_ == '*'); 314 Advance(); 315 316 while (c0_ >= 0) { 317 uc32 ch = c0_; 318 Advance(); 319 if (unicode_cache_->IsLineTerminator(ch)) { 320 // Following ECMA-262, section 7.4, a comment containing 321 // a newline will make the comment count as a line-terminator. 
322 has_multiline_comment_before_next_ = true; 323 } 324 // If we have reached the end of the multi-line comment, we 325 // consume the '/' and insert a whitespace. This way all 326 // multi-line comments are treated as whitespace. 327 if (ch == '*' && c0_ == '/') { 328 c0_ = ' '; 329 return Token::WHITESPACE; 330 } 331 } 332 333 // Unterminated multi-line comment. 334 return Token::ILLEGAL; 335 } 336 337 338 Token::Value Scanner::ScanHtmlComment() { 339 // Check for <!-- comments. 340 ASSERT(c0_ == '!'); 341 Advance(); 342 if (c0_ == '-') { 343 Advance(); 344 if (c0_ == '-') return SkipSingleLineComment(); 345 PushBack('-'); // undo Advance() 346 } 347 PushBack('!'); // undo Advance() 348 ASSERT(c0_ == '!'); 349 return Token::LT; 350 } 351 352 353 void Scanner::Scan() { 354 next_.literal_chars = NULL; 355 Token::Value token; 356 do { 357 // Remember the position of the next token 358 next_.location.beg_pos = source_pos(); 359 360 switch (c0_) { 361 case ' ': 362 case '\t': 363 Advance(); 364 token = Token::WHITESPACE; 365 break; 366 367 case '\n': 368 Advance(); 369 has_line_terminator_before_next_ = true; 370 token = Token::WHITESPACE; 371 break; 372 373 case '"': case '\'': 374 token = ScanString(); 375 break; 376 377 case '<': 378 // < <= << <<= <!-- 379 Advance(); 380 if (c0_ == '=') { 381 token = Select(Token::LTE); 382 } else if (c0_ == '<') { 383 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 384 } else if (c0_ == '!') { 385 token = ScanHtmlComment(); 386 } else { 387 token = Token::LT; 388 } 389 break; 390 391 case '>': 392 // > >= >> >>= >>> >>>= 393 Advance(); 394 if (c0_ == '=') { 395 token = Select(Token::GTE); 396 } else if (c0_ == '>') { 397 // >> >>= >>> >>>= 398 Advance(); 399 if (c0_ == '=') { 400 token = Select(Token::ASSIGN_SAR); 401 } else if (c0_ == '>') { 402 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 403 } else { 404 token = Token::SAR; 405 } 406 } else { 407 token = Token::GT; 408 } 409 break; 410 411 case '=': 412 // = == === 
413 Advance(); 414 if (c0_ == '=') { 415 token = Select('=', Token::EQ_STRICT, Token::EQ); 416 } else { 417 token = Token::ASSIGN; 418 } 419 break; 420 421 case '!': 422 // ! != !== 423 Advance(); 424 if (c0_ == '=') { 425 token = Select('=', Token::NE_STRICT, Token::NE); 426 } else { 427 token = Token::NOT; 428 } 429 break; 430 431 case '+': 432 // + ++ += 433 Advance(); 434 if (c0_ == '+') { 435 token = Select(Token::INC); 436 } else if (c0_ == '=') { 437 token = Select(Token::ASSIGN_ADD); 438 } else { 439 token = Token::ADD; 440 } 441 break; 442 443 case '-': 444 // - -- --> -= 445 Advance(); 446 if (c0_ == '-') { 447 Advance(); 448 if (c0_ == '>' && has_line_terminator_before_next_) { 449 // For compatibility with SpiderMonkey, we skip lines that 450 // start with an HTML comment end '-->'. 451 token = SkipSingleLineComment(); 452 } else { 453 token = Token::DEC; 454 } 455 } else if (c0_ == '=') { 456 token = Select(Token::ASSIGN_SUB); 457 } else { 458 token = Token::SUB; 459 } 460 break; 461 462 case '*': 463 // * *= 464 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 465 break; 466 467 case '%': 468 // % %= 469 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 470 break; 471 472 case '/': 473 // / // /* /= 474 Advance(); 475 if (c0_ == '/') { 476 token = SkipSingleLineComment(); 477 } else if (c0_ == '*') { 478 token = SkipMultiLineComment(); 479 } else if (c0_ == '=') { 480 token = Select(Token::ASSIGN_DIV); 481 } else { 482 token = Token::DIV; 483 } 484 break; 485 486 case '&': 487 // & && &= 488 Advance(); 489 if (c0_ == '&') { 490 token = Select(Token::AND); 491 } else if (c0_ == '=') { 492 token = Select(Token::ASSIGN_BIT_AND); 493 } else { 494 token = Token::BIT_AND; 495 } 496 break; 497 498 case '|': 499 // | || |= 500 Advance(); 501 if (c0_ == '|') { 502 token = Select(Token::OR); 503 } else if (c0_ == '=') { 504 token = Select(Token::ASSIGN_BIT_OR); 505 } else { 506 token = Token::BIT_OR; 507 } 508 break; 509 510 case '^': 511 // ^ ^= 512 token 
= Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 513 break; 514 515 case '.': 516 // . Number 517 Advance(); 518 if (IsDecimalDigit(c0_)) { 519 token = ScanNumber(true); 520 } else { 521 token = Token::PERIOD; 522 } 523 break; 524 525 case ':': 526 token = Select(Token::COLON); 527 break; 528 529 case ';': 530 token = Select(Token::SEMICOLON); 531 break; 532 533 case ',': 534 token = Select(Token::COMMA); 535 break; 536 537 case '(': 538 token = Select(Token::LPAREN); 539 break; 540 541 case ')': 542 token = Select(Token::RPAREN); 543 break; 544 545 case '[': 546 token = Select(Token::LBRACK); 547 break; 548 549 case ']': 550 token = Select(Token::RBRACK); 551 break; 552 553 case '{': 554 token = Select(Token::LBRACE); 555 break; 556 557 case '}': 558 token = Select(Token::RBRACE); 559 break; 560 561 case '?': 562 token = Select(Token::CONDITIONAL); 563 break; 564 565 case '~': 566 token = Select(Token::BIT_NOT); 567 break; 568 569 default: 570 if (unicode_cache_->IsIdentifierStart(c0_)) { 571 token = ScanIdentifierOrKeyword(); 572 } else if (IsDecimalDigit(c0_)) { 573 token = ScanNumber(false); 574 } else if (SkipWhiteSpace()) { 575 token = Token::WHITESPACE; 576 } else if (c0_ < 0) { 577 token = Token::EOS; 578 } else { 579 token = Select(Token::ILLEGAL); 580 } 581 break; 582 } 583 584 // Continue scanning for tokens as long as we're just skipping 585 // whitespace. 586 } while (token == Token::WHITESPACE); 587 588 next_.location.end_pos = source_pos(); 589 next_.token = token; 590 } 591 592 593 void Scanner::SeekForward(int pos) { 594 // After this call, we will have the token at the given position as 595 // the "next" token. The "current" token will be invalid. 596 if (pos == next_.location.beg_pos) return; 597 int current_pos = source_pos(); 598 ASSERT_EQ(next_.location.end_pos, current_pos); 599 // Positions inside the lookahead token aren't supported. 
600 ASSERT(pos >= current_pos); 601 if (pos != current_pos) { 602 source_->SeekForward(pos - source_->pos()); 603 Advance(); 604 // This function is only called to seek to the location 605 // of the end of a function (at the "}" token). It doesn't matter 606 // whether there was a line terminator in the part we skip. 607 has_line_terminator_before_next_ = false; 608 has_multiline_comment_before_next_ = false; 609 } 610 Scan(); 611 } 612 613 614 void Scanner::ScanEscape() { 615 uc32 c = c0_; 616 Advance(); 617 618 // Skip escaped newlines. 619 if (unicode_cache_->IsLineTerminator(c)) { 620 // Allow CR+LF newlines in multiline string literals. 621 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 622 // Allow LF+CR newlines in multiline string literals. 623 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 624 return; 625 } 626 627 switch (c) { 628 case '\'': // fall through 629 case '"' : // fall through 630 case '\\': break; 631 case 'b' : c = '\b'; break; 632 case 'f' : c = '\f'; break; 633 case 'n' : c = '\n'; break; 634 case 'r' : c = '\r'; break; 635 case 't' : c = '\t'; break; 636 case 'u' : { 637 c = ScanHexNumber(4); 638 if (c < 0) c = 'u'; 639 break; 640 } 641 case 'v' : c = '\v'; break; 642 case 'x' : { 643 c = ScanHexNumber(2); 644 if (c < 0) c = 'x'; 645 break; 646 } 647 case '0' : // fall through 648 case '1' : // fall through 649 case '2' : // fall through 650 case '3' : // fall through 651 case '4' : // fall through 652 case '5' : // fall through 653 case '6' : // fall through 654 case '7' : c = ScanOctalEscape(c, 2); break; 655 } 656 657 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 658 // should be illegal, but they are commonly handled 659 // as non-escaped characters by JS VMs. 660 AddLiteralChar(c); 661 } 662 663 664 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 665 // ECMA-262. Other JS VMs support them. 
666 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 667 uc32 x = c - '0'; 668 int i = 0; 669 for (; i < length; i++) { 670 int d = c0_ - '0'; 671 if (d < 0 || d > 7) break; 672 int nx = x * 8 + d; 673 if (nx >= 256) break; 674 x = nx; 675 Advance(); 676 } 677 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 678 // Remember the position of octal escape sequences so that an error 679 // can be reported later (in strict mode). 680 // We don't report the error immediately, because the octal escape can 681 // occur before the "use strict" directive. 682 if (c != '0' || i > 0) { 683 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 684 } 685 return x; 686 } 687 688 689 Token::Value Scanner::ScanString() { 690 uc32 quote = c0_; 691 Advance(); // consume quote 692 693 LiteralScope literal(this); 694 while (c0_ != quote && c0_ >= 0 695 && !unicode_cache_->IsLineTerminator(c0_)) { 696 uc32 c = c0_; 697 Advance(); 698 if (c == '\\') { 699 if (c0_ < 0) return Token::ILLEGAL; 700 ScanEscape(); 701 } else { 702 AddLiteralChar(c); 703 } 704 } 705 if (c0_ != quote) return Token::ILLEGAL; 706 literal.Complete(); 707 708 Advance(); // consume quote 709 return Token::STRING; 710 } 711 712 713 void Scanner::ScanDecimalDigits() { 714 while (IsDecimalDigit(c0_)) 715 AddLiteralCharAdvance(); 716 } 717 718 719 Token::Value Scanner::ScanNumber(bool seen_period) { 720 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 721 722 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 723 724 LiteralScope literal(this); 725 if (seen_period) { 726 // we have already seen a decimal point of the float 727 AddLiteralChar('.'); 728 ScanDecimalDigits(); // we know we have at least one digit 729 730 } else { 731 // if the first character is '0' we must check for octals and hex 732 if (c0_ == '0') { 733 int start_pos = source_pos(); // For reporting octal positions. 
734 AddLiteralCharAdvance(); 735 736 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 737 if (c0_ == 'x' || c0_ == 'X') { 738 // hex number 739 kind = HEX; 740 AddLiteralCharAdvance(); 741 if (!IsHexDigit(c0_)) { 742 // we must have at least one hex digit after 'x'/'X' 743 return Token::ILLEGAL; 744 } 745 while (IsHexDigit(c0_)) { 746 AddLiteralCharAdvance(); 747 } 748 } else if ('0' <= c0_ && c0_ <= '7') { 749 // (possible) octal number 750 kind = OCTAL; 751 while (true) { 752 if (c0_ == '8' || c0_ == '9') { 753 kind = DECIMAL; 754 break; 755 } 756 if (c0_ < '0' || '7' < c0_) { 757 // Octal literal finished. 758 octal_pos_ = Location(start_pos, source_pos()); 759 break; 760 } 761 AddLiteralCharAdvance(); 762 } 763 } 764 } 765 766 // Parse decimal digits and allow trailing fractional part. 767 if (kind == DECIMAL) { 768 ScanDecimalDigits(); // optional 769 if (c0_ == '.') { 770 AddLiteralCharAdvance(); 771 ScanDecimalDigits(); // optional 772 } 773 } 774 } 775 776 // scan exponent, if any 777 if (c0_ == 'e' || c0_ == 'E') { 778 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 779 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 780 // scan exponent 781 AddLiteralCharAdvance(); 782 if (c0_ == '+' || c0_ == '-') 783 AddLiteralCharAdvance(); 784 if (!IsDecimalDigit(c0_)) { 785 // we must have at least one decimal digit after 'e'/'E' 786 return Token::ILLEGAL; 787 } 788 ScanDecimalDigits(); 789 } 790 791 // The source character immediately following a numeric literal must 792 // not be an identifier start or a decimal digit; see ECMA-262 793 // section 7.8.3, page 17 (note that we read only one decimal digit 794 // if the value is 0). 
795 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 796 return Token::ILLEGAL; 797 798 literal.Complete(); 799 800 return Token::NUMBER; 801 } 802 803 804 uc32 Scanner::ScanIdentifierUnicodeEscape() { 805 Advance(); 806 if (c0_ != 'u') return -1; 807 Advance(); 808 uc32 result = ScanHexNumber(4); 809 if (result < 0) PushBack('u'); 810 return result; 811 } 812 813 814 // ---------------------------------------------------------------------------- 815 // Keyword Matcher 816 817 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 818 KEYWORD_GROUP('b') \ 819 KEYWORD("break", Token::BREAK) \ 820 KEYWORD_GROUP('c') \ 821 KEYWORD("case", Token::CASE) \ 822 KEYWORD("catch", Token::CATCH) \ 823 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \ 824 KEYWORD("const", Token::CONST) \ 825 KEYWORD("continue", Token::CONTINUE) \ 826 KEYWORD_GROUP('d') \ 827 KEYWORD("debugger", Token::DEBUGGER) \ 828 KEYWORD("default", Token::DEFAULT) \ 829 KEYWORD("delete", Token::DELETE) \ 830 KEYWORD("do", Token::DO) \ 831 KEYWORD_GROUP('e') \ 832 KEYWORD("else", Token::ELSE) \ 833 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \ 834 KEYWORD("export", harmony_modules \ 835 ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \ 836 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \ 837 KEYWORD_GROUP('f') \ 838 KEYWORD("false", Token::FALSE_LITERAL) \ 839 KEYWORD("finally", Token::FINALLY) \ 840 KEYWORD("for", Token::FOR) \ 841 KEYWORD("function", Token::FUNCTION) \ 842 KEYWORD_GROUP('i') \ 843 KEYWORD("if", Token::IF) \ 844 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 845 KEYWORD("import", harmony_modules \ 846 ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \ 847 KEYWORD("in", Token::IN) \ 848 KEYWORD("instanceof", Token::INSTANCEOF) \ 849 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 850 KEYWORD_GROUP('l') \ 851 KEYWORD("let", harmony_scoping \ 852 ? 
Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ 853 KEYWORD_GROUP('n') \ 854 KEYWORD("new", Token::NEW) \ 855 KEYWORD("null", Token::NULL_LITERAL) \ 856 KEYWORD_GROUP('p') \ 857 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 858 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 859 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 860 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 861 KEYWORD_GROUP('r') \ 862 KEYWORD("return", Token::RETURN) \ 863 KEYWORD_GROUP('s') \ 864 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \ 865 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \ 866 KEYWORD("switch", Token::SWITCH) \ 867 KEYWORD_GROUP('t') \ 868 KEYWORD("this", Token::THIS) \ 869 KEYWORD("throw", Token::THROW) \ 870 KEYWORD("true", Token::TRUE_LITERAL) \ 871 KEYWORD("try", Token::TRY) \ 872 KEYWORD("typeof", Token::TYPEOF) \ 873 KEYWORD_GROUP('v') \ 874 KEYWORD("var", Token::VAR) \ 875 KEYWORD("void", Token::VOID) \ 876 KEYWORD_GROUP('w') \ 877 KEYWORD("while", Token::WHILE) \ 878 KEYWORD("with", Token::WITH) \ 879 KEYWORD_GROUP('y') \ 880 KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD) 881 882 883 static Token::Value KeywordOrIdentifierToken(const char* input, 884 int input_length, 885 bool harmony_scoping, 886 bool harmony_modules) { 887 ASSERT(input_length >= 1); 888 const int kMinLength = 2; 889 const int kMaxLength = 10; 890 if (input_length < kMinLength || input_length > kMaxLength) { 891 return Token::IDENTIFIER; 892 } 893 switch (input[0]) { 894 default: 895 #define KEYWORD_GROUP_CASE(ch) \ 896 break; \ 897 case ch: 898 #define KEYWORD(keyword, token) \ 899 { \ 900 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 901 /* strlen(keyword) plus 1 for the NUL char. 
*/ \ 902 const int keyword_length = sizeof(keyword) - 1; \ 903 STATIC_ASSERT(keyword_length >= kMinLength); \ 904 STATIC_ASSERT(keyword_length <= kMaxLength); \ 905 if (input_length == keyword_length && \ 906 input[1] == keyword[1] && \ 907 (keyword_length <= 2 || input[2] == keyword[2]) && \ 908 (keyword_length <= 3 || input[3] == keyword[3]) && \ 909 (keyword_length <= 4 || input[4] == keyword[4]) && \ 910 (keyword_length <= 5 || input[5] == keyword[5]) && \ 911 (keyword_length <= 6 || input[6] == keyword[6]) && \ 912 (keyword_length <= 7 || input[7] == keyword[7]) && \ 913 (keyword_length <= 8 || input[8] == keyword[8]) && \ 914 (keyword_length <= 9 || input[9] == keyword[9])) { \ 915 return token; \ 916 } \ 917 } 918 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 919 } 920 return Token::IDENTIFIER; 921 } 922 923 924 Token::Value Scanner::ScanIdentifierOrKeyword() { 925 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); 926 LiteralScope literal(this); 927 // Scan identifier start character. 928 if (c0_ == '\\') { 929 uc32 c = ScanIdentifierUnicodeEscape(); 930 // Only allow legal identifier start characters. 931 if (c < 0 || 932 c == '\\' || // No recursive escapes. 933 !unicode_cache_->IsIdentifierStart(c)) { 934 return Token::ILLEGAL; 935 } 936 AddLiteralChar(c); 937 return ScanIdentifierSuffix(&literal); 938 } 939 940 uc32 first_char = c0_; 941 Advance(); 942 AddLiteralChar(first_char); 943 944 // Scan the rest of the identifier characters. 945 while (unicode_cache_->IsIdentifierPart(c0_)) { 946 if (c0_ != '\\') { 947 uc32 next_char = c0_; 948 Advance(); 949 AddLiteralChar(next_char); 950 continue; 951 } 952 // Fallthrough if no longer able to complete keyword. 
953 return ScanIdentifierSuffix(&literal); 954 } 955 956 literal.Complete(); 957 958 if (next_.literal_chars->is_ascii()) { 959 Vector<const char> chars = next_.literal_chars->ascii_literal(); 960 return KeywordOrIdentifierToken(chars.start(), 961 chars.length(), 962 harmony_scoping_, 963 harmony_modules_); 964 } 965 966 return Token::IDENTIFIER; 967 } 968 969 970 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) { 971 // Scan the rest of the identifier characters. 972 while (unicode_cache_->IsIdentifierPart(c0_)) { 973 if (c0_ == '\\') { 974 uc32 c = ScanIdentifierUnicodeEscape(); 975 // Only allow legal identifier part characters. 976 if (c < 0 || 977 c == '\\' || 978 !unicode_cache_->IsIdentifierPart(c)) { 979 return Token::ILLEGAL; 980 } 981 AddLiteralChar(c); 982 } else { 983 AddLiteralChar(c0_); 984 Advance(); 985 } 986 } 987 literal->Complete(); 988 989 return Token::IDENTIFIER; 990 } 991 992 993 bool Scanner::ScanRegExpPattern(bool seen_equal) { 994 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 995 bool in_character_class = false; 996 997 // Previous token is either '/' or '/=', in the second case, the 998 // pattern starts at =. 999 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1000 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1001 1002 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1003 // the scanner should pass uninterpreted bodies to the RegExp 1004 // constructor. 1005 LiteralScope literal(this); 1006 if (seen_equal) { 1007 AddLiteralChar('='); 1008 } 1009 1010 while (c0_ != '/' || in_character_class) { 1011 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1012 if (c0_ == '\\') { // Escape sequence. 
1013 AddLiteralCharAdvance(); 1014 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1015 AddLiteralCharAdvance(); 1016 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1017 // only "safe" characters are allowed (letters, digits, underscore), 1018 // otherwise the escape isn't valid and the invalid character has 1019 // its normal meaning. I.e., we can just continue scanning without 1020 // worrying whether the following characters are part of the escape 1021 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1022 // of the escape sequence. 1023 1024 // TODO(896): At some point, parse RegExps more throughly to capture 1025 // octal esacpes in strict mode. 1026 } else { // Unescaped character. 1027 if (c0_ == '[') in_character_class = true; 1028 if (c0_ == ']') in_character_class = false; 1029 AddLiteralCharAdvance(); 1030 } 1031 } 1032 Advance(); // consume '/' 1033 1034 literal.Complete(); 1035 1036 return true; 1037 } 1038 1039 1040 bool Scanner::ScanLiteralUnicodeEscape() { 1041 ASSERT(c0_ == '\\'); 1042 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; 1043 Advance(); 1044 int i = 1; 1045 if (c0_ == 'u') { 1046 i++; 1047 while (i < 6) { 1048 Advance(); 1049 if (!IsHexDigit(c0_)) break; 1050 chars_read[i] = c0_; 1051 i++; 1052 } 1053 } 1054 if (i < 6) { 1055 // Incomplete escape. Undo all advances and return false. 1056 while (i > 0) { 1057 i--; 1058 PushBack(chars_read[i]); 1059 } 1060 return false; 1061 } 1062 // Complete escape. Add all chars to current literal buffer. 1063 for (int i = 0; i < 6; i++) { 1064 AddLiteralChar(chars_read[i]); 1065 } 1066 return true; 1067 } 1068 1069 1070 bool Scanner::ScanRegExpFlags() { 1071 // Scan regular expression flags. 
1072 LiteralScope literal(this); 1073 while (unicode_cache_->IsIdentifierPart(c0_)) { 1074 if (c0_ != '\\') { 1075 AddLiteralCharAdvance(); 1076 } else { 1077 if (!ScanLiteralUnicodeEscape()) { 1078 break; 1079 } 1080 } 1081 } 1082 literal.Complete(); 1083 1084 next_.location.end_pos = source_pos() - 1; 1085 return true; 1086 } 1087 1088 } } // namespace v8::internal 1089