1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 // Features shared by parsing and pre-parsing scanners. 29 30 #include "../include/v8stdint.h" 31 #include "scanner-base.h" 32 #include "char-predicates-inl.h" 33 34 namespace v8 { 35 namespace internal { 36 37 // ---------------------------------------------------------------------------- 38 // Scanner 39 40 Scanner::Scanner(UnicodeCache* unicode_cache) 41 : unicode_cache_(unicode_cache), 42 octal_pos_(kNoOctalLocation) { } 43 44 45 uc32 Scanner::ScanHexEscape(uc32 c, int length) { 46 ASSERT(length <= 4); // prevent overflow 47 48 uc32 digits[4]; 49 uc32 x = 0; 50 for (int i = 0; i < length; i++) { 51 digits[i] = c0_; 52 int d = HexValue(c0_); 53 if (d < 0) { 54 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 55 // should be illegal, but other JS VMs just return the 56 // non-escaped version of the original character. 57 58 // Push back digits read, except the last one (in c0_). 59 for (int j = i-1; j >= 0; j--) { 60 PushBack(digits[j]); 61 } 62 // Notice: No handling of error - treat it as "\u"->"u". 63 return c; 64 } 65 x = x * 16 + d; 66 Advance(); 67 } 68 69 return x; 70 } 71 72 73 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of 74 // ECMA-262. Other JS VMs support them. 75 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 76 uc32 x = c - '0'; 77 int i = 0; 78 for (; i < length; i++) { 79 int d = c0_ - '0'; 80 if (d < 0 || d > 7) break; 81 int nx = x * 8 + d; 82 if (nx >= 256) break; 83 x = nx; 84 Advance(); 85 } 86 // Anything excelt '\0' is an octal escape sequence, illegal in strict mode. 87 // Remember the position of octal escape sequences so that better error 88 // can be reported later (in strict mode). 89 if (c != '0' || i > 0) { 90 octal_pos_ = source_pos() - i - 1; // Already advanced 91 } 92 return x; 93 } 94 95 96 // ---------------------------------------------------------------------------- 97 // JavaScriptScanner 98 99 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants) 100 : Scanner(scanner_contants) { } 101 102 103 Token::Value JavaScriptScanner::Next() { 104 current_ = next_; 105 has_line_terminator_before_next_ = false; 106 Scan(); 107 return current_.token; 108 } 109 110 111 static inline bool IsByteOrderMark(uc32 c) { 112 // The Unicode value U+FFFE is guaranteed never to be assigned as a 113 // Unicode character; this implies that in a Unicode context the 114 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 115 // character expressed in little-endian byte order (since it could 116 // not be a U+FFFE character expressed in big-endian byte 117 // order). Nevertheless, we check for it to be compatible with 118 // Spidermonkey. 119 return c == 0xFEFF || c == 0xFFFE; 120 } 121 122 123 bool JavaScriptScanner::SkipWhiteSpace() { 124 int start_position = source_pos(); 125 126 while (true) { 127 // We treat byte-order marks (BOMs) as whitespace for better 128 // compatibility with Spidermonkey and other JavaScript engines. 129 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 130 // IsWhiteSpace() includes line terminators! 131 if (unicode_cache_->IsLineTerminator(c0_)) { 132 // Ignore line terminators, but remember them. This is necessary 133 // for automatic semicolon insertion. 134 has_line_terminator_before_next_ = true; 135 } 136 Advance(); 137 } 138 139 // If there is an HTML comment end '-->' at the beginning of a 140 // line (with only whitespace in front of it), we treat the rest 141 // of the line as a comment. This is in line with the way 142 // SpiderMonkey handles it. 143 if (c0_ == '-' && has_line_terminator_before_next_) { 144 Advance(); 145 if (c0_ == '-') { 146 Advance(); 147 if (c0_ == '>') { 148 // Treat the rest of the line as a comment. 149 SkipSingleLineComment(); 150 // Continue skipping white space after the comment. 151 continue; 152 } 153 PushBack('-'); // undo Advance() 154 } 155 PushBack('-'); // undo Advance() 156 } 157 // Return whether or not we skipped any characters. 158 return source_pos() != start_position; 159 } 160 } 161 162 163 Token::Value JavaScriptScanner::SkipSingleLineComment() { 164 Advance(); 165 166 // The line terminator at the end of the line is not considered 167 // to be part of the single-line comment; it is recognized 168 // separately by the lexical grammar and becomes part of the 169 // stream of input elements for the syntactic grammar (see 170 // ECMA-262, section 7.4, page 12). 171 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 172 Advance(); 173 } 174 175 return Token::WHITESPACE; 176 } 177 178 179 Token::Value JavaScriptScanner::SkipMultiLineComment() { 180 ASSERT(c0_ == '*'); 181 Advance(); 182 183 while (c0_ >= 0) { 184 char ch = c0_; 185 Advance(); 186 // If we have reached the end of the multi-line comment, we 187 // consume the '/' and insert a whitespace. This way all 188 // multi-line comments are treated as whitespace - even the ones 189 // containing line terminators. This contradicts ECMA-262, section 190 // 7.4, page 12, that says that multi-line comments containing 191 // line terminators should be treated as a line terminator, but it 192 // matches the behaviour of SpiderMonkey and KJS. 193 if (ch == '*' && c0_ == '/') { 194 c0_ = ' '; 195 return Token::WHITESPACE; 196 } 197 } 198 199 // Unterminated multi-line comment. 200 return Token::ILLEGAL; 201 } 202 203 204 Token::Value JavaScriptScanner::ScanHtmlComment() { 205 // Check for <!-- comments. 206 ASSERT(c0_ == '!'); 207 Advance(); 208 if (c0_ == '-') { 209 Advance(); 210 if (c0_ == '-') return SkipSingleLineComment(); 211 PushBack('-'); // undo Advance() 212 } 213 PushBack('!'); // undo Advance() 214 ASSERT(c0_ == '!'); 215 return Token::LT; 216 } 217 218 219 void JavaScriptScanner::Scan() { 220 next_.literal_chars = NULL; 221 Token::Value token; 222 do { 223 // Remember the position of the next token 224 next_.location.beg_pos = source_pos(); 225 226 switch (c0_) { 227 case ' ': 228 case '\t': 229 Advance(); 230 token = Token::WHITESPACE; 231 break; 232 233 case '\n': 234 Advance(); 235 has_line_terminator_before_next_ = true; 236 token = Token::WHITESPACE; 237 break; 238 239 case '"': case '\'': 240 token = ScanString(); 241 break; 242 243 case '<': 244 // < <= << <<= <!-- 245 Advance(); 246 if (c0_ == '=') { 247 token = Select(Token::LTE); 248 } else if (c0_ == '<') { 249 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 250 } else if (c0_ == '!') { 251 token = ScanHtmlComment(); 252 } else { 253 token = Token::LT; 254 } 255 break; 256 257 case '>': 258 // > >= >> >>= >>> >>>= 259 Advance(); 260 if (c0_ == '=') { 261 token = Select(Token::GTE); 262 } else if (c0_ == '>') { 263 // >> >>= >>> >>>= 264 Advance(); 265 if (c0_ == '=') { 266 token = Select(Token::ASSIGN_SAR); 267 } else if (c0_ == '>') { 268 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 269 } else { 270 token = Token::SAR; 271 } 272 } else { 273 token = Token::GT; 274 } 275 break; 276 277 case '=': 278 // = == === 279 Advance(); 280 if (c0_ == '=') { 281 token = Select('=', Token::EQ_STRICT, Token::EQ); 282 } else { 283 token = Token::ASSIGN; 284 } 285 break; 286 287 case '!': 288 // ! != !== 289 Advance(); 290 if (c0_ == '=') { 291 token = Select('=', Token::NE_STRICT, Token::NE); 292 } else { 293 token = Token::NOT; 294 } 295 break; 296 297 case '+': 298 // + ++ += 299 Advance(); 300 if (c0_ == '+') { 301 token = Select(Token::INC); 302 } else if (c0_ == '=') { 303 token = Select(Token::ASSIGN_ADD); 304 } else { 305 token = Token::ADD; 306 } 307 break; 308 309 case '-': 310 // - -- --> -= 311 Advance(); 312 if (c0_ == '-') { 313 Advance(); 314 if (c0_ == '>' && has_line_terminator_before_next_) { 315 // For compatibility with SpiderMonkey, we skip lines that 316 // start with an HTML comment end '-->'. 317 token = SkipSingleLineComment(); 318 } else { 319 token = Token::DEC; 320 } 321 } else if (c0_ == '=') { 322 token = Select(Token::ASSIGN_SUB); 323 } else { 324 token = Token::SUB; 325 } 326 break; 327 328 case '*': 329 // * *= 330 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 331 break; 332 333 case '%': 334 // % %= 335 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 336 break; 337 338 case '/': 339 // / // /* /= 340 Advance(); 341 if (c0_ == '/') { 342 token = SkipSingleLineComment(); 343 } else if (c0_ == '*') { 344 token = SkipMultiLineComment(); 345 } else if (c0_ == '=') { 346 token = Select(Token::ASSIGN_DIV); 347 } else { 348 token = Token::DIV; 349 } 350 break; 351 352 case '&': 353 // & && &= 354 Advance(); 355 if (c0_ == '&') { 356 token = Select(Token::AND); 357 } else if (c0_ == '=') { 358 token = Select(Token::ASSIGN_BIT_AND); 359 } else { 360 token = Token::BIT_AND; 361 } 362 break; 363 364 case '|': 365 // | || |= 366 Advance(); 367 if (c0_ == '|') { 368 token = Select(Token::OR); 369 } else if (c0_ == '=') { 370 token = Select(Token::ASSIGN_BIT_OR); 371 } else { 372 token = Token::BIT_OR; 373 } 374 break; 375 376 case '^': 377 // ^ ^= 378 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 379 break; 380 381 case '.': 382 // . Number 383 Advance(); 384 if (IsDecimalDigit(c0_)) { 385 token = ScanNumber(true); 386 } else { 387 token = Token::PERIOD; 388 } 389 break; 390 391 case ':': 392 token = Select(Token::COLON); 393 break; 394 395 case ';': 396 token = Select(Token::SEMICOLON); 397 break; 398 399 case ',': 400 token = Select(Token::COMMA); 401 break; 402 403 case '(': 404 token = Select(Token::LPAREN); 405 break; 406 407 case ')': 408 token = Select(Token::RPAREN); 409 break; 410 411 case '[': 412 token = Select(Token::LBRACK); 413 break; 414 415 case ']': 416 token = Select(Token::RBRACK); 417 break; 418 419 case '{': 420 token = Select(Token::LBRACE); 421 break; 422 423 case '}': 424 token = Select(Token::RBRACE); 425 break; 426 427 case '?': 428 token = Select(Token::CONDITIONAL); 429 break; 430 431 case '~': 432 token = Select(Token::BIT_NOT); 433 break; 434 435 default: 436 if (unicode_cache_->IsIdentifierStart(c0_)) { 437 token = ScanIdentifierOrKeyword(); 438 } else if (IsDecimalDigit(c0_)) { 439 token = ScanNumber(false); 440 } else if (SkipWhiteSpace()) { 441 token = Token::WHITESPACE; 442 } else if (c0_ < 0) { 443 token = Token::EOS; 444 } else { 445 token = Select(Token::ILLEGAL); 446 } 447 break; 448 } 449 450 // Continue scanning for tokens as long as we're just skipping 451 // whitespace. 452 } while (token == Token::WHITESPACE); 453 454 next_.location.end_pos = source_pos(); 455 next_.token = token; 456 } 457 458 459 void JavaScriptScanner::SeekForward(int pos) { 460 // After this call, we will have the token at the given position as 461 // the "next" token. The "current" token will be invalid. 462 if (pos == next_.location.beg_pos) return; 463 int current_pos = source_pos(); 464 ASSERT_EQ(next_.location.end_pos, current_pos); 465 // Positions inside the lookahead token aren't supported. 466 ASSERT(pos >= current_pos); 467 if (pos != current_pos) { 468 source_->SeekForward(pos - source_->pos()); 469 Advance(); 470 // This function is only called to seek to the location 471 // of the end of a function (at the "}" token). It doesn't matter 472 // whether there was a line terminator in the part we skip. 473 has_line_terminator_before_next_ = false; 474 } 475 Scan(); 476 } 477 478 479 void JavaScriptScanner::ScanEscape() { 480 uc32 c = c0_; 481 Advance(); 482 483 // Skip escaped newlines. 484 if (unicode_cache_->IsLineTerminator(c)) { 485 // Allow CR+LF newlines in multiline string literals. 486 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 487 // Allow LF+CR newlines in multiline string literals. 488 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 489 return; 490 } 491 492 switch (c) { 493 case '\'': // fall through 494 case '"' : // fall through 495 case '\\': break; 496 case 'b' : c = '\b'; break; 497 case 'f' : c = '\f'; break; 498 case 'n' : c = '\n'; break; 499 case 'r' : c = '\r'; break; 500 case 't' : c = '\t'; break; 501 case 'u' : c = ScanHexEscape(c, 4); break; 502 case 'v' : c = '\v'; break; 503 case 'x' : c = ScanHexEscape(c, 2); break; 504 case '0' : // fall through 505 case '1' : // fall through 506 case '2' : // fall through 507 case '3' : // fall through 508 case '4' : // fall through 509 case '5' : // fall through 510 case '6' : // fall through 511 case '7' : c = ScanOctalEscape(c, 2); break; 512 } 513 514 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 515 // should be illegal, but they are commonly handled 516 // as non-escaped characters by JS VMs. 517 AddLiteralChar(c); 518 } 519 520 521 Token::Value JavaScriptScanner::ScanString() { 522 uc32 quote = c0_; 523 Advance(); // consume quote 524 525 LiteralScope literal(this); 526 while (c0_ != quote && c0_ >= 0 527 && !unicode_cache_->IsLineTerminator(c0_)) { 528 uc32 c = c0_; 529 Advance(); 530 if (c == '\\') { 531 if (c0_ < 0) return Token::ILLEGAL; 532 ScanEscape(); 533 } else { 534 AddLiteralChar(c); 535 } 536 } 537 if (c0_ != quote) return Token::ILLEGAL; 538 literal.Complete(); 539 540 Advance(); // consume quote 541 return Token::STRING; 542 } 543 544 545 void JavaScriptScanner::ScanDecimalDigits() { 546 while (IsDecimalDigit(c0_)) 547 AddLiteralCharAdvance(); 548 } 549 550 551 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) { 552 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 553 554 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 555 556 LiteralScope literal(this); 557 if (seen_period) { 558 // we have already seen a decimal point of the float 559 AddLiteralChar('.'); 560 ScanDecimalDigits(); // we know we have at least one digit 561 562 } else { 563 // if the first character is '0' we must check for octals and hex 564 if (c0_ == '0') { 565 AddLiteralCharAdvance(); 566 567 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 568 if (c0_ == 'x' || c0_ == 'X') { 569 // hex number 570 kind = HEX; 571 AddLiteralCharAdvance(); 572 if (!IsHexDigit(c0_)) { 573 // we must have at least one hex digit after 'x'/'X' 574 return Token::ILLEGAL; 575 } 576 while (IsHexDigit(c0_)) { 577 AddLiteralCharAdvance(); 578 } 579 } else if ('0' <= c0_ && c0_ <= '7') { 580 // (possible) octal number 581 kind = OCTAL; 582 while (true) { 583 if (c0_ == '8' || c0_ == '9') { 584 kind = DECIMAL; 585 break; 586 } 587 if (c0_ < '0' || '7' < c0_) { 588 // Octal literal finished. 589 octal_pos_ = next_.location.beg_pos; 590 break; 591 } 592 AddLiteralCharAdvance(); 593 } 594 } 595 } 596 597 // Parse decimal digits and allow trailing fractional part. 598 if (kind == DECIMAL) { 599 ScanDecimalDigits(); // optional 600 if (c0_ == '.') { 601 AddLiteralCharAdvance(); 602 ScanDecimalDigits(); // optional 603 } 604 } 605 } 606 607 // scan exponent, if any 608 if (c0_ == 'e' || c0_ == 'E') { 609 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 610 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 611 // scan exponent 612 AddLiteralCharAdvance(); 613 if (c0_ == '+' || c0_ == '-') 614 AddLiteralCharAdvance(); 615 if (!IsDecimalDigit(c0_)) { 616 // we must have at least one decimal digit after 'e'/'E' 617 return Token::ILLEGAL; 618 } 619 ScanDecimalDigits(); 620 } 621 622 // The source character immediately following a numeric literal must 623 // not be an identifier start or a decimal digit; see ECMA-262 624 // section 7.8.3, page 17 (note that we read only one decimal digit 625 // if the value is 0). 626 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 627 return Token::ILLEGAL; 628 629 literal.Complete(); 630 631 return Token::NUMBER; 632 } 633 634 635 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() { 636 Advance(); 637 if (c0_ != 'u') return unibrow::Utf8::kBadChar; 638 Advance(); 639 uc32 c = ScanHexEscape('u', 4); 640 // We do not allow a unicode escape sequence to start another 641 // unicode escape sequence. 642 if (c == '\\') return unibrow::Utf8::kBadChar; 643 return c; 644 } 645 646 647 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() { 648 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); 649 LiteralScope literal(this); 650 KeywordMatcher keyword_match; 651 // Scan identifier start character. 652 if (c0_ == '\\') { 653 uc32 c = ScanIdentifierUnicodeEscape(); 654 // Only allow legal identifier start characters. 655 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL; 656 AddLiteralChar(c); 657 return ScanIdentifierSuffix(&literal); 658 } 659 660 uc32 first_char = c0_; 661 Advance(); 662 AddLiteralChar(first_char); 663 if (!keyword_match.AddChar(first_char)) { 664 return ScanIdentifierSuffix(&literal); 665 } 666 667 // Scan the rest of the identifier characters. 668 while (unicode_cache_->IsIdentifierPart(c0_)) { 669 if (c0_ != '\\') { 670 uc32 next_char = c0_; 671 Advance(); 672 AddLiteralChar(next_char); 673 if (keyword_match.AddChar(next_char)) continue; 674 } 675 // Fallthrough if no loner able to complete keyword. 676 return ScanIdentifierSuffix(&literal); 677 } 678 literal.Complete(); 679 680 return keyword_match.token(); 681 } 682 683 684 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) { 685 // Scan the rest of the identifier characters. 686 while (unicode_cache_->IsIdentifierPart(c0_)) { 687 if (c0_ == '\\') { 688 uc32 c = ScanIdentifierUnicodeEscape(); 689 // Only allow legal identifier part characters. 690 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL; 691 AddLiteralChar(c); 692 } else { 693 AddLiteralChar(c0_); 694 Advance(); 695 } 696 } 697 literal->Complete(); 698 699 return Token::IDENTIFIER; 700 } 701 702 703 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { 704 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 705 bool in_character_class = false; 706 707 // Previous token is either '/' or '/=', in the second case, the 708 // pattern starts at =. 709 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 710 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 711 712 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 713 // the scanner should pass uninterpreted bodies to the RegExp 714 // constructor. 715 LiteralScope literal(this); 716 if (seen_equal) 717 AddLiteralChar('='); 718 719 while (c0_ != '/' || in_character_class) { 720 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 721 if (c0_ == '\\') { // Escape sequence. 722 AddLiteralCharAdvance(); 723 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 724 AddLiteralCharAdvance(); 725 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 726 // only "safe" characters are allowed (letters, digits, underscore), 727 // otherwise the escape isn't valid and the invalid character has 728 // its normal meaning. I.e., we can just continue scanning without 729 // worrying whether the following characters are part of the escape 730 // or not, since any '/', '\\' or '[' is guaranteed to not be part 731 // of the escape sequence. 732 } else { // Unescaped character. 733 if (c0_ == '[') in_character_class = true; 734 if (c0_ == ']') in_character_class = false; 735 AddLiteralCharAdvance(); 736 } 737 } 738 Advance(); // consume '/' 739 740 literal.Complete(); 741 742 return true; 743 } 744 745 746 bool JavaScriptScanner::ScanRegExpFlags() { 747 // Scan regular expression flags. 748 LiteralScope literal(this); 749 while (unicode_cache_->IsIdentifierPart(c0_)) { 750 if (c0_ == '\\') { 751 uc32 c = ScanIdentifierUnicodeEscape(); 752 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { 753 // We allow any escaped character, unlike the restriction on 754 // IdentifierPart when it is used to build an IdentifierName. 755 AddLiteralChar(c); 756 continue; 757 } 758 } 759 AddLiteralCharAdvance(); 760 } 761 literal.Complete(); 762 763 next_.location.end_pos = source_pos() - 1; 764 return true; 765 } 766 767 // ---------------------------------------------------------------------------- 768 // Keyword Matcher 769 770 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = { 771 { "break", KEYWORD_PREFIX, Token::BREAK }, 772 { NULL, C, Token::ILLEGAL }, 773 { NULL, D, Token::ILLEGAL }, 774 { NULL, E, Token::ILLEGAL }, 775 { NULL, F, Token::ILLEGAL }, 776 { NULL, UNMATCHABLE, Token::ILLEGAL }, 777 { NULL, UNMATCHABLE, Token::ILLEGAL }, 778 { NULL, I, Token::ILLEGAL }, 779 { NULL, UNMATCHABLE, Token::ILLEGAL }, 780 { NULL, UNMATCHABLE, Token::ILLEGAL }, 781 { "let", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }, 782 { NULL, UNMATCHABLE, Token::ILLEGAL }, 783 { NULL, N, Token::ILLEGAL }, 784 { NULL, UNMATCHABLE, Token::ILLEGAL }, 785 { NULL, P, Token::ILLEGAL }, 786 { NULL, UNMATCHABLE, Token::ILLEGAL }, 787 { "return", KEYWORD_PREFIX, Token::RETURN }, 788 { NULL, S, Token::ILLEGAL }, 789 { NULL, T, Token::ILLEGAL }, 790 { NULL, UNMATCHABLE, Token::ILLEGAL }, 791 { NULL, V, Token::ILLEGAL }, 792 { NULL, W, Token::ILLEGAL }, 793 { NULL, UNMATCHABLE, Token::ILLEGAL }, 794 { "yield", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD } 795 }; 796 797 798 void KeywordMatcher::Step(unibrow::uchar input) { 799 switch (state_) { 800 case INITIAL: { 801 // matching the first character is the only state with significant fanout. 802 // Match only lower-case letters in range 'b'..'y'. 803 unsigned int offset = input - kFirstCharRangeMin; 804 if (offset < kFirstCharRangeLength) { 805 state_ = first_states_[offset].state; 806 if (state_ == KEYWORD_PREFIX) { 807 keyword_ = first_states_[offset].keyword; 808 counter_ = 1; 809 keyword_token_ = first_states_[offset].token; 810 } 811 return; 812 } 813 break; 814 } 815 case KEYWORD_PREFIX: 816 if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) { 817 counter_++; 818 if (keyword_[counter_] == '\0') { 819 state_ = KEYWORD_MATCHED; 820 token_ = keyword_token_; 821 } 822 return; 823 } 824 break; 825 case KEYWORD_MATCHED: 826 token_ = Token::IDENTIFIER; 827 break; 828 case C: 829 if (MatchState(input, 'a', CA)) return; 830 if (MatchKeywordStart(input, "class", 1, 831 Token::FUTURE_RESERVED_WORD)) return; 832 if (MatchState(input, 'o', CO)) return; 833 break; 834 case CA: 835 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return; 836 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return; 837 break; 838 case CO: 839 if (MatchState(input, 'n', CON)) return; 840 break; 841 case CON: 842 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return; 843 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return; 844 break; 845 case D: 846 if (MatchState(input, 'e', DE)) return; 847 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return; 848 break; 849 case DE: 850 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return; 851 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return; 852 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return; 853 break; 854 case E: 855 if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return; 856 if (MatchKeywordStart(input, "enum", 1, 857 Token::FUTURE_RESERVED_WORD)) return; 858 if (MatchState(input, 'x', EX)) return; 859 break; 860 case EX: 861 if (MatchKeywordStart(input, "export", 2, 862 Token::FUTURE_RESERVED_WORD)) return; 863 if (MatchKeywordStart(input, "extends", 2, 864 Token::FUTURE_RESERVED_WORD)) return; 865 break; 866 case F: 867 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return; 868 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return; 869 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return; 870 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return; 871 break; 872 case I: 873 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return; 874 if (MatchState(input, 'm', IM)) return; 875 if (MatchKeyword(input, 'n', IN, Token::IN)) return; 876 break; 877 case IM: 878 if (MatchState(input, 'p', IMP)) return; 879 break; 880 case IMP: 881 if (MatchKeywordStart(input, "implements", 3, 882 Token::FUTURE_RESERVED_WORD )) return; 883 if (MatchKeywordStart(input, "import", 3, 884 Token::FUTURE_RESERVED_WORD)) return; 885 break; 886 case IN: 887 token_ = Token::IDENTIFIER; 888 if (MatchKeywordStart(input, "interface", 2, 889 Token::FUTURE_RESERVED_WORD)) return; 890 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return; 891 break; 892 case N: 893 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return; 894 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return; 895 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return; 896 break; 897 case P: 898 if (MatchKeywordStart(input, "package", 1, 899 Token::FUTURE_RESERVED_WORD)) return; 900 if (MatchState(input, 'r', PR)) return; 901 if (MatchKeywordStart(input, "public", 1, 902 Token::FUTURE_RESERVED_WORD)) return; 903 break; 904 case PR: 905 if (MatchKeywordStart(input, "private", 2, 906 Token::FUTURE_RESERVED_WORD)) return; 907 if (MatchKeywordStart(input, "protected", 2, 908 Token::FUTURE_RESERVED_WORD)) return; 909 break; 910 case S: 911 if (MatchKeywordStart(input, "static", 1, 912 Token::FUTURE_RESERVED_WORD)) return; 913 if (MatchKeywordStart(input, "super", 1, 914 Token::FUTURE_RESERVED_WORD)) return; 915 if (MatchKeywordStart(input, "switch", 1, 916 Token::SWITCH)) return; 917 break; 918 case T: 919 if (MatchState(input, 'h', TH)) return; 920 if (MatchState(input, 'r', TR)) return; 921 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return; 922 break; 923 case TH: 924 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return; 925 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return; 926 break; 927 case TR: 928 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return; 929 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return; 930 break; 931 case V: 932 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return; 933 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return; 934 break; 935 case W: 936 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return; 937 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return; 938 break; 939 case UNMATCHABLE: 940 break; 941 } 942 // On fallthrough, it's a failure. 943 state_ = UNMATCHABLE; 944 } 945 946 } } // namespace v8::internal 947