/*
 *  Copyright (C) 1999-2000 Harri Porten (porten (at) kde.org)
 *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
 *  Copyright (C) 2007 Cameron Zwarich (cwzwarich (at) uwaterloo.ca)
 *  Copyright (C) 2010 Zoltan Herczeg (zherczeg (at) inf.u-szeged.hu)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 *
 */

#include "config.h"
#include "Lexer.h"

#include "JSFunction.h"

#include "JSGlobalObjectFunctions.h"
#include "Identifier.h"
#include "NodeInfo.h"
#include "Nodes.h"
#include "dtoa.h"
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wtf/Assertions.h>

using namespace WTF;
using namespace Unicode;

#include "JSParser.h"
#include "Lookup.h"
#include "Lexer.lut.h"

namespace JSC {


// Classification of a source character, used to dispatch in Lexer::lex().
// The numeric order of the first three enumerators matters: isIdentStart()
// tests for equality with CharacterIdentifierStart and isIdentPart() tests
// for "<= CharacterNumber".
enum CharacterType {
    // Types for the main switch

    // The first three types are fixed, and also used for identifying
    // ASCII alpha and alphanumeric characters (see isIdentStart and isIdentPart).
    CharacterIdentifierStart,
    CharacterZero,
    CharacterNumber,

    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types (only one so far)
    CharacterWhiteSpace,
};

// Lookup table from a 7-bit ASCII code to its CharacterType.
// Indexed directly by the character value, so callers must check isASCII() first.
// 128 ASCII codes
static const unsigned short typesOfASCIICharacters[128] = {
/*   0 - Null               */ CharacterInvalid,
/*   1 - Start of Heading   */ CharacterInvalid,
/*   2 - Start of Text      */ CharacterInvalid,
/*   3 - End of Text        */ CharacterInvalid,
/*   4 - End of Transm.     */ CharacterInvalid,
/*   5 - Enquiry            */ CharacterInvalid,
/*   6 - Acknowledgment     */ CharacterInvalid,
/*   7 - Bell               */ CharacterInvalid,
/*   8 - Back Space         */ CharacterInvalid,
/*   9 - Horizontal Tab     */ CharacterWhiteSpace,
/*  10 - Line Feed          */ CharacterLineTerminator,
/*  11 - Vertical Tab       */ CharacterWhiteSpace,
/*  12 - Form Feed          */ CharacterWhiteSpace,
/*  13 - Carriage Return    */ CharacterLineTerminator,
/*  14 - Shift Out          */ CharacterInvalid,
/*  15 - Shift In           */ CharacterInvalid,
/*  16 - Data Line Escape   */ CharacterInvalid,
/*  17 - Device Control 1   */ CharacterInvalid,
/*  18 - Device Control 2   */ CharacterInvalid,
/*  19 - Device Control 3   */ CharacterInvalid,
/*  20 - Device Control 4   */ CharacterInvalid,
/*  21 - Negative Ack.      */ CharacterInvalid,
/*  22 - Synchronous Idle   */ CharacterInvalid,
/*  23 - End of Transmit    */ CharacterInvalid,
/*  24 - Cancel             */ CharacterInvalid,
/*  25 - End of Medium      */ CharacterInvalid,
/*  26 - Substitute         */ CharacterInvalid,
/*  27 - Escape             */ CharacterInvalid,
/*  28 - File Separator     */ CharacterInvalid,
/*  29 - Group Separator    */ CharacterInvalid,
/*  30 - Record Separator   */ CharacterInvalid,
/*  31 - Unit Separator     */ CharacterInvalid,
/*  32 - Space              */ CharacterWhiteSpace,
/*  33 - !                  */ CharacterExclamationMark,
/*  34 - "                  */ CharacterQuote,
/*  35 - #                  */ CharacterInvalid,
/*  36 - $                  */ CharacterIdentifierStart,
/*  37 - %                  */ CharacterModulo,
/*  38 - &                  */ CharacterAnd,
/*  39 - '                  */ CharacterQuote,
/*  40 - (                  */ CharacterOpenParen,
/*  41 - )                  */ CharacterCloseParen,
/*  42 - *                  */ CharacterMultiply,
/*  43 - +                  */ CharacterAdd,
/*  44 - ,                  */ CharacterComma,
/*  45 - -                  */ CharacterSub,
/*  46 - .                  */ CharacterDot,
/*  47 - /                  */ CharacterSlash,
/*  48 - 0                  */ CharacterZero,
/*  49 - 1                  */ CharacterNumber,
/*  50 - 2                  */ CharacterNumber,
/*  51 - 3                  */ CharacterNumber,
/*  52 - 4                  */ CharacterNumber,
/*  53 - 5                  */ CharacterNumber,
/*  54 - 6                  */ CharacterNumber,
/*  55 - 7                  */ CharacterNumber,
/*  56 - 8                  */ CharacterNumber,
/*  57 - 9                  */ CharacterNumber,
/*  58 - :                  */ CharacterColon,
/*  59 - ;                  */ CharacterSemicolon,
/*  60 - <                  */ CharacterLess,
/*  61 - =                  */ CharacterEqual,
/*  62 - >                  */ CharacterGreater,
/*  63 - ?                  */ CharacterQuestion,
/*  64 - @                  */ CharacterInvalid,
/*  65 - A                  */ CharacterIdentifierStart,
/*  66 - B                  */ CharacterIdentifierStart,
/*  67 - C                  */ CharacterIdentifierStart,
/*  68 - D                  */ CharacterIdentifierStart,
/*  69 - E                  */ CharacterIdentifierStart,
/*  70 - F                  */ CharacterIdentifierStart,
/*  71 - G                  */ CharacterIdentifierStart,
/*  72 - H                  */ CharacterIdentifierStart,
/*  73 - I                  */ CharacterIdentifierStart,
/*  74 - J                  */ CharacterIdentifierStart,
/*  75 - K                  */ CharacterIdentifierStart,
/*  76 - L                  */ CharacterIdentifierStart,
/*  77 - M                  */ CharacterIdentifierStart,
/*  78 - N                  */ CharacterIdentifierStart,
/*  79 - O                  */ CharacterIdentifierStart,
/*  80 - P                  */ CharacterIdentifierStart,
/*  81 - Q                  */ CharacterIdentifierStart,
/*  82 - R                  */ CharacterIdentifierStart,
/*  83 - S                  */ CharacterIdentifierStart,
/*  84 - T                  */ CharacterIdentifierStart,
/*  85 - U                  */ CharacterIdentifierStart,
/*  86 - V                  */ CharacterIdentifierStart,
/*  87 - W                  */ CharacterIdentifierStart,
/*  88 - X                  */ CharacterIdentifierStart,
/*  89 - Y                  */ CharacterIdentifierStart,
/*  90 - Z                  */ CharacterIdentifierStart,
/*  91 - [                  */ CharacterOpenBracket,
/*  92 - \                  */ CharacterBackSlash,
/*  93 - ]                  */ CharacterCloseBracket,
/*  94 - ^                  */ CharacterXor,
/*  95 - _                  */ CharacterIdentifierStart,
/*  96 - `                  */ CharacterInvalid,
/*  97 - a                  */ CharacterIdentifierStart,
/*  98 - b                  */ CharacterIdentifierStart,
/*  99 - c                  */ CharacterIdentifierStart,
/* 100 - d                  */ CharacterIdentifierStart,
/* 101 - e                  */ CharacterIdentifierStart,
/* 102 - f                  */ CharacterIdentifierStart,
/* 103 - g                  */ CharacterIdentifierStart,
/* 104 - h                  */ CharacterIdentifierStart,
/* 105 - i                  */ CharacterIdentifierStart,
/* 106 - j                  */ CharacterIdentifierStart,
/* 107 - k                  */ CharacterIdentifierStart,
/* 108 - l                  */ CharacterIdentifierStart,
/* 109 - m                  */ CharacterIdentifierStart,
/* 110 - n                  */ CharacterIdentifierStart,
/* 111 - o                  */ CharacterIdentifierStart,
/* 112 - p                  */ CharacterIdentifierStart,
/* 113 - q                  */ CharacterIdentifierStart,
/* 114 - r                  */ CharacterIdentifierStart,
/* 115 - s                  */ CharacterIdentifierStart,
/* 116 - t                  */ CharacterIdentifierStart,
/* 117 - u                  */ CharacterIdentifierStart,
/* 118 - v                  */ CharacterIdentifierStart,
/* 119 - w                  */ CharacterIdentifierStart,
/* 120 - x                  */ CharacterIdentifierStart,
/* 121 - y                  */ CharacterIdentifierStart,
/* 122 - z                  */ CharacterIdentifierStart,
/* 123 - {                  */ CharacterOpenBrace,
/* 124 - |                  */ CharacterOr,
/* 125 - }                  */ CharacterCloseBrace,
/* 126 - ~                  */ CharacterTilde,
/* 127 - Delete             */ CharacterInvalid,
};

// Constructs a lexer bound to globalData. The keyword table is initialized
// from JSC::mainTable (declared outside this file; presumably via Lexer.lut.h).
Lexer::Lexer(JSGlobalData* globalData)
    : m_isReparsing(false)
    , m_globalData(globalData)
    , m_keywordTable(JSC::mainTable)
{
}

// Releases the keyword table's storage.
Lexer::~Lexer()
{
    m_keywordTable.deleteTable();
}

// Returns the pointer to the character currently being examined.
// It may equal m_codeEnd (end of input) but must never be past it.
ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
{
    ASSERT(m_code <= m_codeEnd);
    return m_code;
}

// Returns the current position as an offset from the start of the
// provider's data (m_codeStart), not from the start of this SourceCode range.
ALWAYS_INLINE int Lexer::currentOffset() const
{
    return currentCharacter() - m_codeStart;
}

// Points the lexer at a new source range and resets the per-source state.
// Identifiers created while lexing are allocated from arena's identifier arena.
// A pointer to source is retained in m_source.
void Lexer::setCode(const SourceCode& source, ParserArena& arena)
{
    m_arena = &arena.identifierArena();

    m_lineNumber = source.firstLine();
    m_delimited = false;
    m_lastToken = -1;

    const UChar* data = source.provider()->data();

    m_source = &source;
    m_codeStart = data;
    m_code = data + source.startOffset();
    m_codeEnd = data + source.endOffset();
    m_error = false;
    m_atLineStart = true;

    m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
    m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);

    // m_current caches the character at m_code; -1 is the end-of-input sentinel.
    if (LIKELY(m_code < m_codeEnd))
        m_current = *m_code;
    else
        m_current = -1;
    ASSERT(currentOffset() == source.startOffset());
}

// Advances to the next character. m_current becomes -1 once the end of the
// input is reached. Must not be called when already at the end of input.
ALWAYS_INLINE void Lexer::shift()
{
    // Faster than an if-else sequence
    ASSERT(m_current != -1);
    m_current = -1;
    ++m_code;
    if (LIKELY(m_code < m_codeEnd))
        m_current = *m_code;
}

// Returns the character 'offset' positions ahead of the current one without
// consuming anything, or -1 if that position is at or past the end of input.
// The assertion restricts offset to the range 1..4.
ALWAYS_INLINE int Lexer::peek(int offset)
{
    // Only use if necessary
    ASSERT(offset > 0 && offset < 5);
    const UChar* code = m_code + offset;
    return (code < m_codeEnd) ? *code : -1;
}

// Decodes the four hex digits of a \uXXXX escape, starting at m_current.
// On success all four digits are consumed and the value produced by
// convertUnicode() is returned. If any of the four characters is not a hex
// digit, -1 is returned and nothing is consumed.
int Lexer::getUnicodeCharacter()
{
    int char1 = peek(1);
    int char2 = peek(2);
    int char3 = peek(3);

    if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
        return -1;

    int result = convertUnicode(m_current, char1, char2, char3);
    shift();
    shift();
    shift();
    shift();
    return result;
}

// Consumes one line terminator and increments the line counter.
// A "\r\n" or "\n\r" pair is consumed as a single terminator.
void Lexer::shiftLineTerminator()
{
    ASSERT(isLineTerminator(m_current));

    // Note: despite the m_ prefix this is a local variable, not a member.
    int m_prev = m_current;
    shift();

    // Allow both CRLF and LFCR.
    // The sum of the two characters equals '\n' + '\r' only for those two pairs.
    if (m_prev + m_current == '\n' + '\r')
        shift();

    ++m_lineNumber;
}

// Creates an Identifier from the given characters in the identifier arena
// that was installed by setCode().
ALWAYS_INLINE const Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length)
{
    return &m_arena->makeIdentifier(m_globalData, characters, length);
}

// Returns true if the previously returned token was continue, break, return
// or throw. lex() uses this after a single-line comment to decide whether to
// emit a SEMICOLON token.
ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
{
    return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
}

// Slow path of isIdentStart(): a non-ASCII character may start an identifier
// if its Unicode category is one of the letter categories.
static NEVER_INLINE bool isNonASCIIIdentStart(int c)
{
    return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
}

// Returns true if c may be the first character of an identifier.
// ASCII characters are answered from the lookup table.
static inline bool isIdentStart(int c)
{
    return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c);
}

// Slow path of isIdentPart(): in addition to the letter categories, marks,
// decimal digits and connector punctuation are accepted.
static NEVER_INLINE bool isNonASCIIIdentPart(int c)
{
    return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
        | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
}

// Returns true if c may appear inside an identifier (after the first character).
static inline bool isIdentPart(int c)
{
    // Character types are divided into two groups depending on whether they can be part of an
    // identifier or not. Those whose type value is less or equal than CharacterNumber can be
    // part of an identifier. (See the CharacterType definition for more details.)
    return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c);
}

// Maps the character following a backslash in a string literal to the
// character it denotes. Returns 0 if c is not one of the single-character
// escapes handled here, so callers use 0 as the "not an escape" result.
static inline int singleEscape(int c)
{
    switch (c) {
    case 'b':
        return 0x08;
    case 't':
        return 0x09;
    case 'n':
        return 0x0A;
    case 'v':
        return 0x0B;
    case 'f':
        return 0x0C;
    case 'r':
        return 0x0D;
    case '\\':
        return '\\';
    case '\'':
        return '\'';
    case '"':
        return '"';
    default:
        return 0;
    }
}

// Appends one byte (0..0xFF) to the 8-bit scratch buffer used for numeric literals.
inline void Lexer::record8(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= 0xFF);
    m_buffer8.append(static_cast<char>(c));
}

// Appends one UTF-16 code unit to the 16-bit scratch buffer.
inline void Lexer::record16(UChar c)
{
    m_buffer16.append(c);
}

// Overload taking an int in the range 0..USHRT_MAX; forwards to the UChar version.
inline void Lexer::record16(int c)
{
    ASSERT(c >= 0);
    ASSERT(c <= USHRT_MAX);
    record16(UChar(static_cast<unsigned short>(c)));
}

// Lexes an identifier or keyword starting at the current character and stores
// the resulting Identifier in lvalp->ident.
//
// If the identifier contains no \uXXXX escape it is built directly from the
// source characters and, when lexType is IdentifyReservedWords, looked up in
// the keyword table; the keyword's token is returned on a hit, IDENT otherwise.
// If an escape is present the text is assembled in m_buffer16 and IDENT is
// always returned.
//
// Returns ERRORTOK for a malformed escape or an escaped character that is not
// valid at its position. Note that on those early returns m_buffer16 is not
// cleared.
ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* lvalp, LexType lexType)
{
    bool bufferRequired = false;
    // Start of the run of unescaped source characters not yet copied to m_buffer16.
    const UChar* identifierStart = currentCharacter();
    int identifierLength;

    while (true) {
        if (LIKELY(isIdentPart(m_current))) {
            shift();
            continue;
        }
        if (LIKELY(m_current != '\\'))
            break;

        // \uXXXX unicode characters.
        bufferRequired = true;
        // Flush the pending run of plain characters before handling the escape.
        if (identifierStart != currentCharacter())
            m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
        shift();
        if (UNLIKELY(m_current != 'u'))
            return ERRORTOK;
        shift();
        int character = getUnicodeCharacter();
        if (UNLIKELY(character == -1))
            return ERRORTOK;
        // An empty buffer means the escape is the first character of the identifier.
        if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character)))
            return ERRORTOK;
        record16(character);
        identifierStart = currentCharacter();
    }

    if (!bufferRequired)
        identifierLength = currentCharacter() - identifierStart;
    else {
        if (identifierStart != currentCharacter())
            m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
        identifierStart = m_buffer16.data();
        identifierLength = m_buffer16.size();
    }

    const Identifier* ident = makeIdentifier(identifierStart, identifierLength);
    lvalp->ident = ident;
    m_delimited = false;

    if (LIKELY(!bufferRequired && lexType == IdentifyReservedWords)) {
        // Keywords must not be recognized if there was an \uXXXX in the identifier.
        const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident);
        return entry ? static_cast<JSTokenType>(entry->lexerValue()) : IDENT;
    }

    m_buffer16.resize(0);
    return IDENT;
}

// Lexes a string literal. On entry m_current is the opening quote character;
// on successful return m_current is the matching closing quote, which the
// caller is expected to consume. The decoded contents are stored as an
// Identifier in lvalp->ident.
//
// Returns false if the literal contains an unescaped line terminator, reaches
// the end of input, or contains an escape that is rejected (see below). On
// failure m_buffer16 is not cleared.
ALWAYS_INLINE bool Lexer::parseString(JSTokenData* lvalp, bool strictMode)
{
    int stringQuoteCharacter = m_current;
    shift();

    // Start of the run of plain characters not yet copied to m_buffer16.
    const UChar* stringStart = currentCharacter();

    while (m_current != stringQuoteCharacter) {
        if (UNLIKELY(m_current == '\\')) {
            if (stringStart != currentCharacter())
                m_buffer16.append(stringStart, currentCharacter() - stringStart);
            shift();

            int escape = singleEscape(m_current);

            // Most common escape sequences first
            if (escape) {
                record16(escape);
                shift();
            } else if (UNLIKELY(isLineTerminator(m_current)))
                // Backslash followed by a line terminator: nothing is recorded.
                shiftLineTerminator();
            else if (m_current == 'x') {
                shift();
                if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) {
                    int prev = m_current;
                    shift();
                    record16(convertHex(prev, m_current));
                    shift();
                } else
                    // Not followed by two hex digits: a literal 'x' is recorded.
                    record16('x');
            } else if (m_current == 'u') {
                shift();
                int character = getUnicodeCharacter();
                if (character != -1)
                    record16(character);
                else if (m_current == stringQuoteCharacter)
                    record16('u');
                else // Only stringQuoteCharacter allowed after \u
                    return false;
            } else if (strictMode && isASCIIDigit(m_current)) {
                // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
                int character1 = m_current;
                shift();
                if (character1 != '0' || isASCIIDigit(m_current))
                    return false;
                record16(0);
            } else if (!strictMode && isASCIIOctalDigit(m_current)) {
                // Octal character sequences
                int character1 = m_current;
                shift();
                if (isASCIIOctalDigit(m_current)) {
                    // Two octal characters
                    int character2 = m_current;
                    shift();
                    // A third digit is only taken when the first is 0..3.
                    if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
                        record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
                        shift();
                    } else
                        record16((character1 - '0') * 8 + character2 - '0');
                } else
                    record16(character1 - '0');
            } else if (m_current != -1) {
                // Any other escaped character stands for itself.
                record16(m_current);
                shift();
            } else
                return false;

            stringStart = currentCharacter();
            continue;
        }
        // Fast check for characters that require special handling.
        // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
        // as possible, and lets through all common ASCII characters.
        if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
            // New-line or end of input is not allowed
            if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1))
                return false;
            // Anything else is just a normal character
        }
        shift();
    }

    if (currentCharacter() != stringStart)
        m_buffer16.append(stringStart, currentCharacter() - stringStart);
    lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    m_buffer16.resize(0);
    return true;
}

// Parses a hexadecimal literal. On entry m_current is the 'x'/'X' of the
// prefix and the caller has already verified that a hex digit follows it.
// Literals of up to seven digits are accumulated in a 32-bit integer and
// returned directly; longer ones are copied into m_buffer8 and converted with
// parseIntOverflow(). m_buffer8 is left for the caller to reset.
ALWAYS_INLINE void Lexer::parseHex(double& returnValue)
{
    // Optimization: most hexadecimal values fit into 4 bytes.
    uint32_t hexValue = 0;
    int maximumDigits = 7;

    // Shift out the 'x' prefix.
    shift();

    do {
        hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
        shift();
        --maximumDigits;
    } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);

    if (maximumDigits >= 0) {
        returnValue = hexValue;
        return;
    }

    // No more place in the hexValue buffer.
    // The values are shifted out and placed into the m_buffer8 vector.
    for (int i = 0; i < 8; ++i) {
        // Emit the eight accumulated digits, most significant nibble first.
        int digit = hexValue >> 28;
        if (digit < 10)
            record8(digit + '0');
        else
            record8(digit - 10 + 'a');
        hexValue <<= 4;
    }

    while (isASCIIHexDigit(m_current)) {
        record8(m_current);
        shift();
    }

    returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
}

// Parses the digits of an octal literal; the leading '0' has already been
// consumed (and recorded in m_buffer8) by the caller.
//
// Returns true and sets returnValue if the digits form an octal literal.
// Returns false if a decimal digit (8 or 9) follows the octal digits; in that
// case the digits consumed so far are left in m_buffer8 so that parseDecimal()
// can continue from them.
ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue)
{
    // Optimization: most octal values fit into 4 bytes.
    uint32_t octalValue = 0;
    int maximumDigits = 9;
    // Temporary buffer for the digits. Makes easier
    // to reconstruct the input characters when needed.
    // It is filled from index 9 downwards.
    char digits[10];

    do {
        octalValue = octalValue * 8 + (m_current - '0');
        digits[maximumDigits] = m_current;
        shift();
        --maximumDigits;
    } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);

    if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
        returnValue = octalValue;
        return true;
    }

    // Slow path: replay the consumed digits into m_buffer8 in source order.
    for (int i = 9; i > maximumDigits; --i)
        record8(digits[i]);

    while (isASCIIOctalDigit(m_current)) {
        record8(m_current);
        shift();
    }

    if (isASCIIDigit(m_current))
        return false;

    returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
    return true;
}

// Parses the integer part of a decimal literal.
//
// Returns true and sets returnValue when the literal is a plain integer of at
// most nine digits that is not followed by '.' or an exponent indicator.
// Otherwise returns false with the digits appended to m_buffer8; the caller
// then handles any fraction/exponent and converts the buffer itself.
ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue)
{
    // Optimization: most decimal values fit into 4 bytes.
    uint32_t decimalValue = 0;

    // Since parseOctal may be executed before parseDecimal,
    // the m_buffer8 may hold ascii digits.
    if (!m_buffer8.size()) {
        int maximumDigits = 9;
        // Temporary buffer for the digits. Makes easier
        // to reconstruct the input characters when needed.
        // It is filled from index 9 downwards.
        char digits[10];

        do {
            decimalValue = decimalValue * 10 + (m_current - '0');
            digits[maximumDigits] = m_current;
            shift();
            --maximumDigits;
        } while (isASCIIDigit(m_current) && maximumDigits >= 0);

        // (m_current | 0x20) folds 'E' to 'e'.
        if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
            returnValue = decimalValue;
            return true;
        }

        // Slow path: replay the consumed digits into m_buffer8 in source order.
        for (int i = 9; i > maximumDigits; --i)
            record8(digits[i]);
    }

    while (isASCIIDigit(m_current)) {
        record8(m_current);
        shift();
    }

    return false;
}

// Records a '.' followed by any decimal digits at the current position into
// m_buffer8. The '.' itself must already have been consumed by the caller.
ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint()
{
    record8('.');
    while (isASCIIDigit(m_current)) {
        record8(m_current);
        shift();
    }
}

// Consumes an exponent part (indicator, optional sign, digits) and records it
// in m_buffer8, normalizing the indicator to 'e'. On entry m_current is the
// exponent indicator. Returns false if no digit follows.
ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator()
{
    record8('e');
    shift();
    if (m_current == '+' || m_current == '-') {
        record8(m_current);
        shift();
    }

    if (!isASCIIDigit(m_current))
        return false;

    do {
        record8(m_current);
        shift();
    } while (isASCIIDigit(m_current));
    return true;
}

// Skips the body of a multi-line comment; the opening "/*" has already been
// consumed. Line terminators inside the comment advance the line counter.
// Returns true once the closing "*/" has been consumed, false if the end of
// input is reached first.
ALWAYS_INLINE bool Lexer::parseMultilineComment()
{
    while (true) {
        while (UNLIKELY(m_current == '*')) {
            shift();
            if (m_current == '/') {
                shift();
                return true;
            }
        }

        if (UNLIKELY(m_current == -1))
            return false;

        if (isLineTerminator(m_current))
            shiftLineTerminator();
        else
            shift();
    }
}

// Returns true if the first character at or after the current position that
// is neither white space nor a line terminator is ':'. Nothing is consumed.
bool Lexer::nextTokenIsColon()
{
    const UChar* code = m_code;
    while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
        code++;

    return code < m_codeEnd && *code == ':';
}

// Scans and returns the next token.
//
// lvalp receives the token's payload where there is one: ident for
// identifiers and strings, doubleValue for numbers, and intValue (the source
// offset of the brace) for OPENBRACE / CLOSEBRACE. llocp receives the line
// number and the start/end offsets of the token.
//
// Returns EOFTOK at the end of input; on that path llocp and m_lastToken are
// left untouched. Returns ERRORTOK on a lexical error.
//
// NOTE(review): errors that go through the returnError label set m_error, but
// an ERRORTOK returned by parseIdentifier() leaves through returnToken and
// does not set m_error.
JSTokenType Lexer::lex(JSTokenData* lvalp, JSTokenInfo* llocp, LexType lexType, bool strictMode)
{
    ASSERT(!m_error);
    ASSERT(m_buffer8.isEmpty());
    ASSERT(m_buffer16.isEmpty());

    JSTokenType token = ERRORTOK;
    // Set to true below when a line terminator is skipped before the token.
    m_terminator = false;

start:
    while (isWhiteSpace(m_current))
        shift();

    int startOffset = currentOffset();

    if (UNLIKELY(m_current == -1))
        return EOFTOK;

    m_delimited = false;

    CharacterType type;
    if (LIKELY(isASCII(m_current)))
        type = static_cast<CharacterType>(typesOfASCIICharacters[m_current]);
    else if (isNonASCIIIdentStart(m_current))
        type = CharacterIdentifierStart;
    else if (isLineTerminator(m_current))
        type = CharacterLineTerminator;
    else
        type = CharacterInvalid;

    switch (type) {
    case CharacterGreater:
        shift();
        if (m_current == '>') {
            shift();
            if (m_current == '>') {
                shift();
                if (m_current == '=') {
                    shift();
                    token = URSHIFTEQUAL;
                    break;
                }
                token = URSHIFT;
                break;
            }
            if (m_current == '=') {
                shift();
                token = RSHIFTEQUAL;
                break;
            }
            token = RSHIFT;
            break;
        }
        if (m_current == '=') {
            shift();
            token = GE;
            break;
        }
        token = GT;
        break;
    case CharacterEqual:
        shift();
        if (m_current == '=') {
            shift();
            if (m_current == '=') {
                shift();
                token = STREQ;
                break;
            }
            token = EQEQ;
            break;
        }
        token = EQUAL;
        break;
    case CharacterLess:
        shift();
        if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
            // <!-- marks the beginning of a line comment (for www usage)
            goto inSingleLineComment;
        }
        if (m_current == '<') {
            shift();
            if (m_current == '=') {
                shift();
                token = LSHIFTEQUAL;
                break;
            }
            token = LSHIFT;
            break;
        }
        if (m_current == '=') {
            shift();
            token = LE;
            break;
        }
        token = LT;
        break;
    case CharacterExclamationMark:
        shift();
        if (m_current == '=') {
            shift();
            if (m_current == '=') {
                shift();
                token = STRNEQ;
                break;
            }
            token = NE;
            break;
        }
        token = EXCLAMATION;
        break;
    case CharacterAdd:
        shift();
        if (m_current == '+') {
            shift();
            // A distinct token is produced when a line terminator preceded "++".
            token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
            break;
        }
        if (m_current == '=') {
            shift();
            token = PLUSEQUAL;
            break;
        }
        token = PLUS;
        break;
    case CharacterSub:
        shift();
        if (m_current == '-') {
            shift();
            // "-->" at the start of a line begins a single-line comment.
            if (m_atLineStart && m_current == '>') {
                shift();
                goto inSingleLineComment;
            }
            // A distinct token is produced when a line terminator preceded "--".
            token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
            break;
        }
        if (m_current == '=') {
            shift();
            token = MINUSEQUAL;
            break;
        }
        token = MINUS;
        break;
    case CharacterMultiply:
        shift();
        if (m_current == '=') {
            shift();
            token = MULTEQUAL;
            break;
        }
        token = TIMES;
        break;
    case CharacterSlash:
        shift();
        if (m_current == '/') {
            shift();
            goto inSingleLineComment;
        }
        if (m_current == '*') {
            shift();
            if (parseMultilineComment())
                goto start;
            // Unterminated multi-line comment.
            goto returnError;
        }
        if (m_current == '=') {
            shift();
            token = DIVEQUAL;
            break;
        }
        token = DIVIDE;
        break;
    case CharacterAnd:
        shift();
        if (m_current == '&') {
            shift();
            token = AND;
            break;
        }
        if (m_current == '=') {
            shift();
            token = ANDEQUAL;
            break;
        }
        token = BITAND;
        break;
    case CharacterXor:
        shift();
        if (m_current == '=') {
            shift();
            token = XOREQUAL;
            break;
        }
        token = BITXOR;
        break;
    case CharacterModulo:
        shift();
        if (m_current == '=') {
            shift();
            token = MODEQUAL;
            break;
        }
        token = MOD;
        break;
    case CharacterOr:
        shift();
        if (m_current == '=') {
            shift();
            token = OREQUAL;
            break;
        }
        if (m_current == '|') {
            shift();
            token = OR;
            break;
        }
        token = BITOR;
        break;
    case CharacterOpenParen:
        token = OPENPAREN;
        shift();
        break;
    case CharacterCloseParen:
        token = CLOSEPAREN;
        shift();
        break;
    case CharacterOpenBracket:
        token = OPENBRACKET;
        shift();
        break;
    case CharacterCloseBracket:
        token = CLOSEBRACKET;
        shift();
        break;
    case CharacterComma:
        token = COMMA;
        shift();
        break;
    case CharacterColon:
        token = COLON;
        shift();
        break;
    case CharacterQuestion:
        token = QUESTION;
        shift();
        break;
    case CharacterTilde:
        token = TILDE;
        shift();
        break;
    case CharacterSemicolon:
        m_delimited = true;
        shift();
        token = SEMICOLON;
        break;
    case CharacterOpenBrace:
        // The brace's source offset is handed to the parser through intValue.
        lvalp->intValue = currentOffset();
        shift();
        token = OPENBRACE;
        break;
    case CharacterCloseBrace:
        lvalp->intValue = currentOffset();
        m_delimited = true;
        shift();
        token = CLOSEBRACE;
        break;
    case CharacterDot:
        shift();
        if (!isASCIIDigit(m_current)) {
            token = DOT;
            break;
        }
        // ".5"-style literal: jump into the middle of the number handling below.
        goto inNumberAfterDecimalPoint;
    case CharacterZero:
        shift();
        if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
            parseHex(lvalp->doubleValue);
            token = NUMBER;
        } else {
            record8('0');
            if (isASCIIOctalDigit(m_current)) {
                if (parseOctal(lvalp->doubleValue)) {
                    // Octal literals are rejected in strict mode.
                    if (strictMode)
                        goto returnError;
                    token = NUMBER;
                }
            }
        }
        // Fall through into CharacterNumber
    case CharacterNumber:
        // token is already NUMBER if the hex or octal path above produced a value.
        if (LIKELY(token != NUMBER)) {
            if (!parseDecimal(lvalp->doubleValue)) {
                if (m_current == '.') {
                    shift();
inNumberAfterDecimalPoint:
                    parseNumberAfterDecimalPoint();
                }
                if ((m_current | 0x20) == 'e')
                    if (!parseNumberAfterExponentIndicator())
                        goto returnError;
                // Null-terminate string for strtod.
                m_buffer8.append('\0');
                lvalp->doubleValue = WTF::strtod(m_buffer8.data(), 0);
            }
            token = NUMBER;
        }

        // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
        if (UNLIKELY(isIdentStart(m_current)))
            goto returnError;
        m_buffer8.resize(0);
        m_delimited = false;
        break;
    case CharacterQuote:
        if (UNLIKELY(!parseString(lvalp, strictMode)))
            goto returnError;
        // Consume the closing quote left in place by parseString().
        shift();
        m_delimited = false;
        token = STRING;
        break;
    case CharacterIdentifierStart:
        ASSERT(isIdentStart(m_current));
        // Fall through into CharacterBackSlash.
    case CharacterBackSlash:
        token = parseIdentifier(lvalp, lexType);
        break;
    case CharacterLineTerminator:
        ASSERT(isLineTerminator(m_current));
        shiftLineTerminator();
        m_atLineStart = true;
        m_terminator = true;
        goto start;
    case CharacterInvalid:
        goto returnError;
    default:
        ASSERT_NOT_REACHED();
        goto returnError;
    }

    m_atLineStart = false;
    goto returnToken;

inSingleLineComment:
    while (!isLineTerminator(m_current)) {
        if (UNLIKELY(m_current == -1))
            return EOFTOK;
        shift();
    }
    shiftLineTerminator();
    m_atLineStart = true;
    m_terminator = true;
    if (!lastTokenWasRestrKeyword())
        goto start;

    // The previous token was continue/break/return/throw: report a SEMICOLON
    // token for the line end that terminated the comment.
    token = SEMICOLON;
    m_delimited = true;
    // Fall through into returnToken.

returnToken:
    llocp->line = m_lineNumber;
    llocp->startOffset = startOffset;
    llocp->endOffset = currentOffset();
    m_lastToken = token;
    return token;

returnError:
    m_error = true;
    return ERRORTOK;
}

// Scans a regular expression literal whose opening '/' has already been
// consumed. If patternPrefix is non-zero it is recorded as the first character
// of the pattern.
//
// On success the text up to the terminating '/' is stored in pattern, the
// identifier-part characters that follow it are stored in flags, and true is
// returned. Returns false (with m_buffer16 cleared) if a line terminator or
// the end of input is reached before the terminating '/'.
bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
{
    ASSERT(m_buffer16.isEmpty());

    bool lastWasEscape = false;
    bool inBrackets = false;

    if (patternPrefix) {
        ASSERT(!isLineTerminator(patternPrefix));
        ASSERT(patternPrefix != '/');
        ASSERT(patternPrefix != '[');
        record16(patternPrefix);
    }

    while (true) {
        int current = m_current;

        if (isLineTerminator(current) || current == -1) {
            m_buffer16.resize(0);
            return false;
        }

        shift();

        // A '/' ends the pattern unless it is escaped or inside [...].
        if (current == '/' && !lastWasEscape && !inBrackets)
            break;

        record16(current);

        if (lastWasEscape) {
            lastWasEscape = false;
            continue;
        }

        switch (current) {
        case '[':
            inBrackets = true;
            break;
        case ']':
            inBrackets = false;
            break;
        case '\\':
            lastWasEscape = true;
            break;
        }
    }

    pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    m_buffer16.resize(0);

    while (isIdentPart(m_current)) {
        record16(m_current);
        shift();
    }

    flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    m_buffer16.resize(0);

    return true;
}

// Same scanning logic as scanRegExp(), but consumes the regular expression
// literal and its flags without recording them. Returns false if a line
// terminator or the end of input is reached before the terminating '/'.
bool Lexer::skipRegExp()
{
    bool lastWasEscape = false;
    bool inBrackets = false;

    while (true) {
        int current = m_current;

        if (isLineTerminator(current) || current == -1)
            return false;

        shift();

        // A '/' ends the pattern unless it is escaped or inside [...].
        if (current == '/' && !lastWasEscape && !inBrackets)
            break;

        if (lastWasEscape) {
            lastWasEscape = false;
            continue;
        }

        switch (current) {
        case '[':
            inBrackets = true;
            break;
        case ']':
            inBrackets = false;
            break;
        case '\\':
            lastWasEscape = true;
            break;
        }
    }

    while (isIdentPart(m_current))
        shift();

    return true;
}

// Drops the identifier arena pointer, replaces both scratch buffers with
// freshly constructed empty vectors, and resets the reparsing flag.
void Lexer::clear()
{
    m_arena = 0;

    // Swapping with an empty local hands the old contents to the local,
    // which is destroyed at the end of this function.
    Vector<char> newBuffer8;
    m_buffer8.swap(newBuffer8);

    Vector<UChar> newBuffer16;
    m_buffer16.swap(newBuffer16);

    m_isReparsing = false;
}

// Builds a SourceCode covering the text from the '{' at offset openBrace up to
// and including the '}' at offset closeBrace, using the current source's
// provider. Both offsets index into the provider's data.
SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
{
    ASSERT(m_source->provider()->data()[openBrace] == '{');
    ASSERT(m_source->provider()->data()[closeBrace] == '}');
    return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
}

} // namespace JSC