// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29 30 #include "scanner.h" 31 32 #include "../include/v8stdint.h" 33 #include "char-predicates-inl.h" 34 35 namespace v8 { 36 namespace internal { 37 38 // ---------------------------------------------------------------------------- 39 // Scanner 40 41 Scanner::Scanner(UnicodeCache* unicode_cache) 42 : unicode_cache_(unicode_cache), 43 octal_pos_(Location::invalid()), 44 harmony_scoping_(false), 45 harmony_modules_(false), 46 harmony_numeric_literals_(false) { } 47 48 49 void Scanner::Initialize(Utf16CharacterStream* source) { 50 source_ = source; 51 // Need to capture identifiers in order to recognize "get" and "set" 52 // in object literals. 53 Init(); 54 // Skip initial whitespace allowing HTML comment ends just like 55 // after a newline and scan first token. 56 has_line_terminator_before_next_ = true; 57 SkipWhiteSpace(); 58 Scan(); 59 } 60 61 62 uc32 Scanner::ScanHexNumber(int expected_length) { 63 ASSERT(expected_length <= 4); // prevent overflow 64 65 uc32 digits[4] = { 0, 0, 0, 0 }; 66 uc32 x = 0; 67 for (int i = 0; i < expected_length; i++) { 68 digits[i] = c0_; 69 int d = HexValue(c0_); 70 if (d < 0) { 71 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 72 // should be illegal, but other JS VMs just return the 73 // non-escaped version of the original character. 74 75 // Push back digits that we have advanced past. 76 for (int j = i-1; j >= 0; j--) { 77 PushBack(digits[j]); 78 } 79 return -1; 80 } 81 x = x * 16 + d; 82 Advance(); 83 } 84 85 return x; 86 } 87 88 89 // Ensure that tokens can be stored in a byte. 90 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 91 92 // Table of one-character tokens, by character (0x00..0x7f only). 
93 static const byte one_char_tokens[] = { 94 Token::ILLEGAL, 95 Token::ILLEGAL, 96 Token::ILLEGAL, 97 Token::ILLEGAL, 98 Token::ILLEGAL, 99 Token::ILLEGAL, 100 Token::ILLEGAL, 101 Token::ILLEGAL, 102 Token::ILLEGAL, 103 Token::ILLEGAL, 104 Token::ILLEGAL, 105 Token::ILLEGAL, 106 Token::ILLEGAL, 107 Token::ILLEGAL, 108 Token::ILLEGAL, 109 Token::ILLEGAL, 110 Token::ILLEGAL, 111 Token::ILLEGAL, 112 Token::ILLEGAL, 113 Token::ILLEGAL, 114 Token::ILLEGAL, 115 Token::ILLEGAL, 116 Token::ILLEGAL, 117 Token::ILLEGAL, 118 Token::ILLEGAL, 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::ILLEGAL, 129 Token::ILLEGAL, 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::ILLEGAL, 133 Token::ILLEGAL, 134 Token::LPAREN, // 0x28 135 Token::RPAREN, // 0x29 136 Token::ILLEGAL, 137 Token::ILLEGAL, 138 Token::COMMA, // 0x2c 139 Token::ILLEGAL, 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::ILLEGAL, 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::ILLEGAL, 152 Token::COLON, // 0x3a 153 Token::SEMICOLON, // 0x3b 154 Token::ILLEGAL, 155 Token::ILLEGAL, 156 Token::ILLEGAL, 157 Token::CONDITIONAL, // 0x3f 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::ILLEGAL, 183 Token::ILLEGAL, 184 Token::ILLEGAL, 185 Token::LBRACK, // 0x5b 186 Token::ILLEGAL, 187 Token::RBRACK, // 0x5d 188 Token::ILLEGAL, 
189 Token::ILLEGAL, 190 Token::ILLEGAL, 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::ILLEGAL, 205 Token::ILLEGAL, 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::ILLEGAL, 211 Token::ILLEGAL, 212 Token::ILLEGAL, 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::ILLEGAL, 216 Token::ILLEGAL, 217 Token::LBRACE, // 0x7b 218 Token::ILLEGAL, 219 Token::RBRACE, // 0x7d 220 Token::BIT_NOT, // 0x7e 221 Token::ILLEGAL 222 }; 223 224 225 Token::Value Scanner::Next() { 226 current_ = next_; 227 has_line_terminator_before_next_ = false; 228 has_multiline_comment_before_next_ = false; 229 if (static_cast<unsigned>(c0_) <= 0x7f) { 230 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 231 if (token != Token::ILLEGAL) { 232 int pos = source_pos(); 233 next_.token = token; 234 next_.location.beg_pos = pos; 235 next_.location.end_pos = pos + 1; 236 Advance(); 237 return current_.token; 238 } 239 } 240 Scan(); 241 return current_.token; 242 } 243 244 245 static inline bool IsByteOrderMark(uc32 c) { 246 // The Unicode value U+FFFE is guaranteed never to be assigned as a 247 // Unicode character; this implies that in a Unicode context the 248 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 249 // character expressed in little-endian byte order (since it could 250 // not be a U+FFFE character expressed in big-endian byte 251 // order). Nevertheless, we check for it to be compatible with 252 // Spidermonkey. 253 return c == 0xFEFF || c == 0xFFFE; 254 } 255 256 257 bool Scanner::SkipWhiteSpace() { 258 int start_position = source_pos(); 259 260 while (true) { 261 // We treat byte-order marks (BOMs) as whitespace for better 262 // compatibility with Spidermonkey and other JavaScript engines. 
263 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 264 // IsWhiteSpace() includes line terminators! 265 if (unicode_cache_->IsLineTerminator(c0_)) { 266 // Ignore line terminators, but remember them. This is necessary 267 // for automatic semicolon insertion. 268 has_line_terminator_before_next_ = true; 269 } 270 Advance(); 271 } 272 273 // If there is an HTML comment end '-->' at the beginning of a 274 // line (with only whitespace in front of it), we treat the rest 275 // of the line as a comment. This is in line with the way 276 // SpiderMonkey handles it. 277 if (c0_ == '-' && has_line_terminator_before_next_) { 278 Advance(); 279 if (c0_ == '-') { 280 Advance(); 281 if (c0_ == '>') { 282 // Treat the rest of the line as a comment. 283 SkipSingleLineComment(); 284 // Continue skipping white space after the comment. 285 continue; 286 } 287 PushBack('-'); // undo Advance() 288 } 289 PushBack('-'); // undo Advance() 290 } 291 // Return whether or not we skipped any characters. 292 return source_pos() != start_position; 293 } 294 } 295 296 297 Token::Value Scanner::SkipSingleLineComment() { 298 Advance(); 299 300 // The line terminator at the end of the line is not considered 301 // to be part of the single-line comment; it is recognized 302 // separately by the lexical grammar and becomes part of the 303 // stream of input elements for the syntactic grammar (see 304 // ECMA-262, section 7.4). 305 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 306 Advance(); 307 } 308 309 return Token::WHITESPACE; 310 } 311 312 313 Token::Value Scanner::SkipMultiLineComment() { 314 ASSERT(c0_ == '*'); 315 Advance(); 316 317 while (c0_ >= 0) { 318 uc32 ch = c0_; 319 Advance(); 320 if (unicode_cache_->IsLineTerminator(ch)) { 321 // Following ECMA-262, section 7.4, a comment containing 322 // a newline will make the comment count as a line-terminator. 
323 has_multiline_comment_before_next_ = true; 324 } 325 // If we have reached the end of the multi-line comment, we 326 // consume the '/' and insert a whitespace. This way all 327 // multi-line comments are treated as whitespace. 328 if (ch == '*' && c0_ == '/') { 329 c0_ = ' '; 330 return Token::WHITESPACE; 331 } 332 } 333 334 // Unterminated multi-line comment. 335 return Token::ILLEGAL; 336 } 337 338 339 Token::Value Scanner::ScanHtmlComment() { 340 // Check for <!-- comments. 341 ASSERT(c0_ == '!'); 342 Advance(); 343 if (c0_ == '-') { 344 Advance(); 345 if (c0_ == '-') return SkipSingleLineComment(); 346 PushBack('-'); // undo Advance() 347 } 348 PushBack('!'); // undo Advance() 349 ASSERT(c0_ == '!'); 350 return Token::LT; 351 } 352 353 354 void Scanner::Scan() { 355 next_.literal_chars = NULL; 356 Token::Value token; 357 do { 358 // Remember the position of the next token 359 next_.location.beg_pos = source_pos(); 360 361 switch (c0_) { 362 case ' ': 363 case '\t': 364 Advance(); 365 token = Token::WHITESPACE; 366 break; 367 368 case '\n': 369 Advance(); 370 has_line_terminator_before_next_ = true; 371 token = Token::WHITESPACE; 372 break; 373 374 case '"': case '\'': 375 token = ScanString(); 376 break; 377 378 case '<': 379 // < <= << <<= <!-- 380 Advance(); 381 if (c0_ == '=') { 382 token = Select(Token::LTE); 383 } else if (c0_ == '<') { 384 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 385 } else if (c0_ == '!') { 386 token = ScanHtmlComment(); 387 } else { 388 token = Token::LT; 389 } 390 break; 391 392 case '>': 393 // > >= >> >>= >>> >>>= 394 Advance(); 395 if (c0_ == '=') { 396 token = Select(Token::GTE); 397 } else if (c0_ == '>') { 398 // >> >>= >>> >>>= 399 Advance(); 400 if (c0_ == '=') { 401 token = Select(Token::ASSIGN_SAR); 402 } else if (c0_ == '>') { 403 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 404 } else { 405 token = Token::SAR; 406 } 407 } else { 408 token = Token::GT; 409 } 410 break; 411 412 case '=': 413 // = == === 
414 Advance(); 415 if (c0_ == '=') { 416 token = Select('=', Token::EQ_STRICT, Token::EQ); 417 } else { 418 token = Token::ASSIGN; 419 } 420 break; 421 422 case '!': 423 // ! != !== 424 Advance(); 425 if (c0_ == '=') { 426 token = Select('=', Token::NE_STRICT, Token::NE); 427 } else { 428 token = Token::NOT; 429 } 430 break; 431 432 case '+': 433 // + ++ += 434 Advance(); 435 if (c0_ == '+') { 436 token = Select(Token::INC); 437 } else if (c0_ == '=') { 438 token = Select(Token::ASSIGN_ADD); 439 } else { 440 token = Token::ADD; 441 } 442 break; 443 444 case '-': 445 // - -- --> -= 446 Advance(); 447 if (c0_ == '-') { 448 Advance(); 449 if (c0_ == '>' && has_line_terminator_before_next_) { 450 // For compatibility with SpiderMonkey, we skip lines that 451 // start with an HTML comment end '-->'. 452 token = SkipSingleLineComment(); 453 } else { 454 token = Token::DEC; 455 } 456 } else if (c0_ == '=') { 457 token = Select(Token::ASSIGN_SUB); 458 } else { 459 token = Token::SUB; 460 } 461 break; 462 463 case '*': 464 // * *= 465 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 466 break; 467 468 case '%': 469 // % %= 470 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 471 break; 472 473 case '/': 474 // / // /* /= 475 Advance(); 476 if (c0_ == '/') { 477 token = SkipSingleLineComment(); 478 } else if (c0_ == '*') { 479 token = SkipMultiLineComment(); 480 } else if (c0_ == '=') { 481 token = Select(Token::ASSIGN_DIV); 482 } else { 483 token = Token::DIV; 484 } 485 break; 486 487 case '&': 488 // & && &= 489 Advance(); 490 if (c0_ == '&') { 491 token = Select(Token::AND); 492 } else if (c0_ == '=') { 493 token = Select(Token::ASSIGN_BIT_AND); 494 } else { 495 token = Token::BIT_AND; 496 } 497 break; 498 499 case '|': 500 // | || |= 501 Advance(); 502 if (c0_ == '|') { 503 token = Select(Token::OR); 504 } else if (c0_ == '=') { 505 token = Select(Token::ASSIGN_BIT_OR); 506 } else { 507 token = Token::BIT_OR; 508 } 509 break; 510 511 case '^': 512 // ^ ^= 513 token 
= Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 514 break; 515 516 case '.': 517 // . Number 518 Advance(); 519 if (IsDecimalDigit(c0_)) { 520 token = ScanNumber(true); 521 } else { 522 token = Token::PERIOD; 523 } 524 break; 525 526 case ':': 527 token = Select(Token::COLON); 528 break; 529 530 case ';': 531 token = Select(Token::SEMICOLON); 532 break; 533 534 case ',': 535 token = Select(Token::COMMA); 536 break; 537 538 case '(': 539 token = Select(Token::LPAREN); 540 break; 541 542 case ')': 543 token = Select(Token::RPAREN); 544 break; 545 546 case '[': 547 token = Select(Token::LBRACK); 548 break; 549 550 case ']': 551 token = Select(Token::RBRACK); 552 break; 553 554 case '{': 555 token = Select(Token::LBRACE); 556 break; 557 558 case '}': 559 token = Select(Token::RBRACE); 560 break; 561 562 case '?': 563 token = Select(Token::CONDITIONAL); 564 break; 565 566 case '~': 567 token = Select(Token::BIT_NOT); 568 break; 569 570 default: 571 if (unicode_cache_->IsIdentifierStart(c0_)) { 572 token = ScanIdentifierOrKeyword(); 573 } else if (IsDecimalDigit(c0_)) { 574 token = ScanNumber(false); 575 } else if (SkipWhiteSpace()) { 576 token = Token::WHITESPACE; 577 } else if (c0_ < 0) { 578 token = Token::EOS; 579 } else { 580 token = Select(Token::ILLEGAL); 581 } 582 break; 583 } 584 585 // Continue scanning for tokens as long as we're just skipping 586 // whitespace. 587 } while (token == Token::WHITESPACE); 588 589 next_.location.end_pos = source_pos(); 590 next_.token = token; 591 } 592 593 594 void Scanner::SeekForward(int pos) { 595 // After this call, we will have the token at the given position as 596 // the "next" token. The "current" token will be invalid. 597 if (pos == next_.location.beg_pos) return; 598 int current_pos = source_pos(); 599 ASSERT_EQ(next_.location.end_pos, current_pos); 600 // Positions inside the lookahead token aren't supported. 
601 ASSERT(pos >= current_pos); 602 if (pos != current_pos) { 603 source_->SeekForward(pos - source_->pos()); 604 Advance(); 605 // This function is only called to seek to the location 606 // of the end of a function (at the "}" token). It doesn't matter 607 // whether there was a line terminator in the part we skip. 608 has_line_terminator_before_next_ = false; 609 has_multiline_comment_before_next_ = false; 610 } 611 Scan(); 612 } 613 614 615 bool Scanner::ScanEscape() { 616 uc32 c = c0_; 617 Advance(); 618 619 // Skip escaped newlines. 620 if (unicode_cache_->IsLineTerminator(c)) { 621 // Allow CR+LF newlines in multiline string literals. 622 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 623 // Allow LF+CR newlines in multiline string literals. 624 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 625 return true; 626 } 627 628 switch (c) { 629 case '\'': // fall through 630 case '"' : // fall through 631 case '\\': break; 632 case 'b' : c = '\b'; break; 633 case 'f' : c = '\f'; break; 634 case 'n' : c = '\n'; break; 635 case 'r' : c = '\r'; break; 636 case 't' : c = '\t'; break; 637 case 'u' : { 638 c = ScanHexNumber(4); 639 if (c < 0) return false; 640 break; 641 } 642 case 'v' : c = '\v'; break; 643 case 'x' : { 644 c = ScanHexNumber(2); 645 if (c < 0) return false; 646 break; 647 } 648 case '0' : // fall through 649 case '1' : // fall through 650 case '2' : // fall through 651 case '3' : // fall through 652 case '4' : // fall through 653 case '5' : // fall through 654 case '6' : // fall through 655 case '7' : c = ScanOctalEscape(c, 2); break; 656 } 657 658 // According to ECMA-262, section 7.8.4, characters not covered by the 659 // above cases should be illegal, but they are commonly handled as 660 // non-escaped characters by JS VMs. 661 AddLiteralChar(c); 662 return true; 663 } 664 665 666 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 667 // ECMA-262. Other JS VMs support them. 
668 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 669 uc32 x = c - '0'; 670 int i = 0; 671 for (; i < length; i++) { 672 int d = c0_ - '0'; 673 if (d < 0 || d > 7) break; 674 int nx = x * 8 + d; 675 if (nx >= 256) break; 676 x = nx; 677 Advance(); 678 } 679 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 680 // Remember the position of octal escape sequences so that an error 681 // can be reported later (in strict mode). 682 // We don't report the error immediately, because the octal escape can 683 // occur before the "use strict" directive. 684 if (c != '0' || i > 0) { 685 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 686 } 687 return x; 688 } 689 690 691 Token::Value Scanner::ScanString() { 692 uc32 quote = c0_; 693 Advance(); // consume quote 694 695 LiteralScope literal(this); 696 while (c0_ != quote && c0_ >= 0 697 && !unicode_cache_->IsLineTerminator(c0_)) { 698 uc32 c = c0_; 699 Advance(); 700 if (c == '\\') { 701 if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL; 702 } else { 703 AddLiteralChar(c); 704 } 705 } 706 if (c0_ != quote) return Token::ILLEGAL; 707 literal.Complete(); 708 709 Advance(); // consume quote 710 return Token::STRING; 711 } 712 713 714 void Scanner::ScanDecimalDigits() { 715 while (IsDecimalDigit(c0_)) 716 AddLiteralCharAdvance(); 717 } 718 719 720 Token::Value Scanner::ScanNumber(bool seen_period) { 721 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 722 723 enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL; 724 725 LiteralScope literal(this); 726 if (seen_period) { 727 // we have already seen a decimal point of the float 728 AddLiteralChar('.'); 729 ScanDecimalDigits(); // we know we have at least one digit 730 731 } else { 732 // if the first character is '0' we must check for octals and hex 733 if (c0_ == '0') { 734 int start_pos = source_pos(); // For reporting octal positions. 
735 AddLiteralCharAdvance(); 736 737 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or 738 // an octal number. 739 if (c0_ == 'x' || c0_ == 'X') { 740 // hex number 741 kind = HEX; 742 AddLiteralCharAdvance(); 743 if (!IsHexDigit(c0_)) { 744 // we must have at least one hex digit after 'x'/'X' 745 return Token::ILLEGAL; 746 } 747 while (IsHexDigit(c0_)) { 748 AddLiteralCharAdvance(); 749 } 750 } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) { 751 kind = OCTAL; 752 AddLiteralCharAdvance(); 753 if (!IsOctalDigit(c0_)) { 754 // we must have at least one octal digit after 'o'/'O' 755 return Token::ILLEGAL; 756 } 757 while (IsOctalDigit(c0_)) { 758 AddLiteralCharAdvance(); 759 } 760 } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) { 761 kind = BINARY; 762 AddLiteralCharAdvance(); 763 if (!IsBinaryDigit(c0_)) { 764 // we must have at least one binary digit after 'b'/'B' 765 return Token::ILLEGAL; 766 } 767 while (IsBinaryDigit(c0_)) { 768 AddLiteralCharAdvance(); 769 } 770 } else if ('0' <= c0_ && c0_ <= '7') { 771 // (possible) octal number 772 kind = IMPLICIT_OCTAL; 773 while (true) { 774 if (c0_ == '8' || c0_ == '9') { 775 kind = DECIMAL; 776 break; 777 } 778 if (c0_ < '0' || '7' < c0_) { 779 // Octal literal finished. 780 octal_pos_ = Location(start_pos, source_pos()); 781 break; 782 } 783 AddLiteralCharAdvance(); 784 } 785 } 786 } 787 788 // Parse decimal digits and allow trailing fractional part. 
789 if (kind == DECIMAL) { 790 ScanDecimalDigits(); // optional 791 if (c0_ == '.') { 792 AddLiteralCharAdvance(); 793 ScanDecimalDigits(); // optional 794 } 795 } 796 } 797 798 // scan exponent, if any 799 if (c0_ == 'e' || c0_ == 'E') { 800 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 801 if (kind != DECIMAL) return Token::ILLEGAL; 802 // scan exponent 803 AddLiteralCharAdvance(); 804 if (c0_ == '+' || c0_ == '-') 805 AddLiteralCharAdvance(); 806 if (!IsDecimalDigit(c0_)) { 807 // we must have at least one decimal digit after 'e'/'E' 808 return Token::ILLEGAL; 809 } 810 ScanDecimalDigits(); 811 } 812 813 // The source character immediately following a numeric literal must 814 // not be an identifier start or a decimal digit; see ECMA-262 815 // section 7.8.3, page 17 (note that we read only one decimal digit 816 // if the value is 0). 817 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 818 return Token::ILLEGAL; 819 820 literal.Complete(); 821 822 return Token::NUMBER; 823 } 824 825 826 uc32 Scanner::ScanIdentifierUnicodeEscape() { 827 Advance(); 828 if (c0_ != 'u') return -1; 829 Advance(); 830 uc32 result = ScanHexNumber(4); 831 if (result < 0) PushBack('u'); 832 return result; 833 } 834 835 836 // ---------------------------------------------------------------------------- 837 // Keyword Matcher 838 839 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 840 KEYWORD_GROUP('b') \ 841 KEYWORD("break", Token::BREAK) \ 842 KEYWORD_GROUP('c') \ 843 KEYWORD("case", Token::CASE) \ 844 KEYWORD("catch", Token::CATCH) \ 845 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \ 846 KEYWORD("const", Token::CONST) \ 847 KEYWORD("continue", Token::CONTINUE) \ 848 KEYWORD_GROUP('d') \ 849 KEYWORD("debugger", Token::DEBUGGER) \ 850 KEYWORD("default", Token::DEFAULT) \ 851 KEYWORD("delete", Token::DELETE) \ 852 KEYWORD("do", Token::DO) \ 853 KEYWORD_GROUP('e') \ 854 KEYWORD("else", Token::ELSE) \ 855 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) 
\ 856 KEYWORD("export", harmony_modules \ 857 ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \ 858 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \ 859 KEYWORD_GROUP('f') \ 860 KEYWORD("false", Token::FALSE_LITERAL) \ 861 KEYWORD("finally", Token::FINALLY) \ 862 KEYWORD("for", Token::FOR) \ 863 KEYWORD("function", Token::FUNCTION) \ 864 KEYWORD_GROUP('i') \ 865 KEYWORD("if", Token::IF) \ 866 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 867 KEYWORD("import", harmony_modules \ 868 ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \ 869 KEYWORD("in", Token::IN) \ 870 KEYWORD("instanceof", Token::INSTANCEOF) \ 871 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 872 KEYWORD_GROUP('l') \ 873 KEYWORD("let", harmony_scoping \ 874 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ 875 KEYWORD_GROUP('n') \ 876 KEYWORD("new", Token::NEW) \ 877 KEYWORD("null", Token::NULL_LITERAL) \ 878 KEYWORD_GROUP('p') \ 879 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 880 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 881 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 882 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 883 KEYWORD_GROUP('r') \ 884 KEYWORD("return", Token::RETURN) \ 885 KEYWORD_GROUP('s') \ 886 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \ 887 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \ 888 KEYWORD("switch", Token::SWITCH) \ 889 KEYWORD_GROUP('t') \ 890 KEYWORD("this", Token::THIS) \ 891 KEYWORD("throw", Token::THROW) \ 892 KEYWORD("true", Token::TRUE_LITERAL) \ 893 KEYWORD("try", Token::TRY) \ 894 KEYWORD("typeof", Token::TYPEOF) \ 895 KEYWORD_GROUP('v') \ 896 KEYWORD("var", Token::VAR) \ 897 KEYWORD("void", Token::VOID) \ 898 KEYWORD_GROUP('w') \ 899 KEYWORD("while", Token::WHILE) \ 900 KEYWORD("with", Token::WITH) \ 901 KEYWORD_GROUP('y') \ 902 KEYWORD("yield", Token::YIELD) 903 904 905 static Token::Value KeywordOrIdentifierToken(const char* input, 906 int input_length, 907 bool 
harmony_scoping, 908 bool harmony_modules) { 909 ASSERT(input_length >= 1); 910 const int kMinLength = 2; 911 const int kMaxLength = 10; 912 if (input_length < kMinLength || input_length > kMaxLength) { 913 return Token::IDENTIFIER; 914 } 915 switch (input[0]) { 916 default: 917 #define KEYWORD_GROUP_CASE(ch) \ 918 break; \ 919 case ch: 920 #define KEYWORD(keyword, token) \ 921 { \ 922 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 923 /* strlen(keyword) plus 1 for the NUL char. */ \ 924 const int keyword_length = sizeof(keyword) - 1; \ 925 STATIC_ASSERT(keyword_length >= kMinLength); \ 926 STATIC_ASSERT(keyword_length <= kMaxLength); \ 927 if (input_length == keyword_length && \ 928 input[1] == keyword[1] && \ 929 (keyword_length <= 2 || input[2] == keyword[2]) && \ 930 (keyword_length <= 3 || input[3] == keyword[3]) && \ 931 (keyword_length <= 4 || input[4] == keyword[4]) && \ 932 (keyword_length <= 5 || input[5] == keyword[5]) && \ 933 (keyword_length <= 6 || input[6] == keyword[6]) && \ 934 (keyword_length <= 7 || input[7] == keyword[7]) && \ 935 (keyword_length <= 8 || input[8] == keyword[8]) && \ 936 (keyword_length <= 9 || input[9] == keyword[9])) { \ 937 return token; \ 938 } \ 939 } 940 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 941 } 942 return Token::IDENTIFIER; 943 } 944 945 946 Token::Value Scanner::ScanIdentifierOrKeyword() { 947 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); 948 LiteralScope literal(this); 949 // Scan identifier start character. 950 if (c0_ == '\\') { 951 uc32 c = ScanIdentifierUnicodeEscape(); 952 // Only allow legal identifier start characters. 953 if (c < 0 || 954 c == '\\' || // No recursive escapes. 955 !unicode_cache_->IsIdentifierStart(c)) { 956 return Token::ILLEGAL; 957 } 958 AddLiteralChar(c); 959 return ScanIdentifierSuffix(&literal); 960 } 961 962 uc32 first_char = c0_; 963 Advance(); 964 AddLiteralChar(first_char); 965 966 // Scan the rest of the identifier characters. 
967 while (unicode_cache_->IsIdentifierPart(c0_)) { 968 if (c0_ != '\\') { 969 uc32 next_char = c0_; 970 Advance(); 971 AddLiteralChar(next_char); 972 continue; 973 } 974 // Fallthrough if no longer able to complete keyword. 975 return ScanIdentifierSuffix(&literal); 976 } 977 978 literal.Complete(); 979 980 if (next_.literal_chars->is_ascii()) { 981 Vector<const char> chars = next_.literal_chars->ascii_literal(); 982 return KeywordOrIdentifierToken(chars.start(), 983 chars.length(), 984 harmony_scoping_, 985 harmony_modules_); 986 } 987 988 return Token::IDENTIFIER; 989 } 990 991 992 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) { 993 // Scan the rest of the identifier characters. 994 while (unicode_cache_->IsIdentifierPart(c0_)) { 995 if (c0_ == '\\') { 996 uc32 c = ScanIdentifierUnicodeEscape(); 997 // Only allow legal identifier part characters. 998 if (c < 0 || 999 c == '\\' || 1000 !unicode_cache_->IsIdentifierPart(c)) { 1001 return Token::ILLEGAL; 1002 } 1003 AddLiteralChar(c); 1004 } else { 1005 AddLiteralChar(c0_); 1006 Advance(); 1007 } 1008 } 1009 literal->Complete(); 1010 1011 return Token::IDENTIFIER; 1012 } 1013 1014 1015 bool Scanner::ScanRegExpPattern(bool seen_equal) { 1016 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1017 bool in_character_class = false; 1018 1019 // Previous token is either '/' or '/=', in the second case, the 1020 // pattern starts at =. 1021 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1022 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1023 1024 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1025 // the scanner should pass uninterpreted bodies to the RegExp 1026 // constructor. 
1027 LiteralScope literal(this); 1028 if (seen_equal) { 1029 AddLiteralChar('='); 1030 } 1031 1032 while (c0_ != '/' || in_character_class) { 1033 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1034 if (c0_ == '\\') { // Escape sequence. 1035 AddLiteralCharAdvance(); 1036 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1037 AddLiteralCharAdvance(); 1038 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1039 // only "safe" characters are allowed (letters, digits, underscore), 1040 // otherwise the escape isn't valid and the invalid character has 1041 // its normal meaning. I.e., we can just continue scanning without 1042 // worrying whether the following characters are part of the escape 1043 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1044 // of the escape sequence. 1045 1046 // TODO(896): At some point, parse RegExps more throughly to capture 1047 // octal esacpes in strict mode. 1048 } else { // Unescaped character. 1049 if (c0_ == '[') in_character_class = true; 1050 if (c0_ == ']') in_character_class = false; 1051 AddLiteralCharAdvance(); 1052 } 1053 } 1054 Advance(); // consume '/' 1055 1056 literal.Complete(); 1057 1058 return true; 1059 } 1060 1061 1062 bool Scanner::ScanLiteralUnicodeEscape() { 1063 ASSERT(c0_ == '\\'); 1064 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; 1065 Advance(); 1066 int i = 1; 1067 if (c0_ == 'u') { 1068 i++; 1069 while (i < 6) { 1070 Advance(); 1071 if (!IsHexDigit(c0_)) break; 1072 chars_read[i] = c0_; 1073 i++; 1074 } 1075 } 1076 if (i < 6) { 1077 // Incomplete escape. Undo all advances and return false. 1078 while (i > 0) { 1079 i--; 1080 PushBack(chars_read[i]); 1081 } 1082 return false; 1083 } 1084 // Complete escape. Add all chars to current literal buffer. 1085 for (int i = 0; i < 6; i++) { 1086 AddLiteralChar(chars_read[i]); 1087 } 1088 return true; 1089 } 1090 1091 1092 bool Scanner::ScanRegExpFlags() { 1093 // Scan regular expression flags. 
1094 LiteralScope literal(this); 1095 while (unicode_cache_->IsIdentifierPart(c0_)) { 1096 if (c0_ != '\\') { 1097 AddLiteralCharAdvance(); 1098 } else { 1099 if (!ScanLiteralUnicodeEscape()) { 1100 break; 1101 } 1102 Advance(); 1103 } 1104 } 1105 literal.Complete(); 1106 1107 next_.location.end_pos = source_pos() - 1; 1108 return true; 1109 } 1110 1111 } } // namespace v8::internal 1112