Home | History | Annotate | Download | only in parser
      1 /*
      2  *  Copyright (C) 1999-2000 Harri Porten (porten (at) kde.org)
      3  *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
      4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich (at) uwaterloo.ca)
      5  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg (at) inf.u-szeged.hu)
      6  *
      7  *  This library is free software; you can redistribute it and/or
      8  *  modify it under the terms of the GNU Library General Public
      9  *  License as published by the Free Software Foundation; either
     10  *  version 2 of the License, or (at your option) any later version.
     11  *
     12  *  This library is distributed in the hope that it will be useful,
     13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15  *  Library General Public License for more details.
     16  *
     17  *  You should have received a copy of the GNU Library General Public License
     18  *  along with this library; see the file COPYING.LIB.  If not, write to
     19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     20  *  Boston, MA 02110-1301, USA.
     21  *
     22  */
     23 
     24 #include "config.h"
     25 #include "Lexer.h"
     26 
     27 #include "JSFunction.h"
     28 
     29 #include "JSGlobalObjectFunctions.h"
     30 #include "Identifier.h"
     31 #include "NodeInfo.h"
     32 #include "Nodes.h"
     33 #include "dtoa.h"
     34 #include <ctype.h>
     35 #include <limits.h>
     36 #include <string.h>
     37 #include <wtf/Assertions.h>
     38 
     39 using namespace WTF;
     40 using namespace Unicode;
     41 
     42 #include "JSParser.h"
     43 #include "Lookup.h"
     44 #include "Lexer.lut.h"
     45 
     46 namespace JSC {
     47 
     48 
     49 enum CharacterType {
     50     // Types for the main switch
     51 
     52     // The first three types are fixed, and also used for identifying
     53     // ASCII alpha and alphanumeric characters (see isIdentStart and isIdentPart).
     54     CharacterIdentifierStart,
     55     CharacterZero,
     56     CharacterNumber,
     57 
     58     CharacterInvalid,
     59     CharacterLineTerminator,
     60     CharacterExclamationMark,
     61     CharacterOpenParen,
     62     CharacterCloseParen,
     63     CharacterOpenBracket,
     64     CharacterCloseBracket,
     65     CharacterComma,
     66     CharacterColon,
     67     CharacterQuestion,
     68     CharacterTilde,
     69     CharacterQuote,
     70     CharacterDot,
     71     CharacterSlash,
     72     CharacterBackSlash,
     73     CharacterSemicolon,
     74     CharacterOpenBrace,
     75     CharacterCloseBrace,
     76 
     77     CharacterAdd,
     78     CharacterSub,
     79     CharacterMultiply,
     80     CharacterModulo,
     81     CharacterAnd,
     82     CharacterXor,
     83     CharacterOr,
     84     CharacterLess,
     85     CharacterGreater,
     86     CharacterEqual,
     87 
     88     // Other types (only one so far)
     89     CharacterWhiteSpace,
     90 };
     91 
     92 // 128 ASCII codes
     93 static const unsigned short typesOfASCIICharacters[128] = {
     94 /*   0 - Null               */ CharacterInvalid,
     95 /*   1 - Start of Heading   */ CharacterInvalid,
     96 /*   2 - Start of Text      */ CharacterInvalid,
     97 /*   3 - End of Text        */ CharacterInvalid,
     98 /*   4 - End of Transm.     */ CharacterInvalid,
     99 /*   5 - Enquiry            */ CharacterInvalid,
    100 /*   6 - Acknowledgment     */ CharacterInvalid,
    101 /*   7 - Bell               */ CharacterInvalid,
    102 /*   8 - Back Space         */ CharacterInvalid,
    103 /*   9 - Horizontal Tab     */ CharacterWhiteSpace,
    104 /*  10 - Line Feed          */ CharacterLineTerminator,
    105 /*  11 - Vertical Tab       */ CharacterWhiteSpace,
    106 /*  12 - Form Feed          */ CharacterWhiteSpace,
    107 /*  13 - Carriage Return    */ CharacterLineTerminator,
    108 /*  14 - Shift Out          */ CharacterInvalid,
    109 /*  15 - Shift In           */ CharacterInvalid,
    110 /*  16 - Data Line Escape   */ CharacterInvalid,
    111 /*  17 - Device Control 1   */ CharacterInvalid,
    112 /*  18 - Device Control 2   */ CharacterInvalid,
    113 /*  19 - Device Control 3   */ CharacterInvalid,
    114 /*  20 - Device Control 4   */ CharacterInvalid,
    115 /*  21 - Negative Ack.      */ CharacterInvalid,
    116 /*  22 - Synchronous Idle   */ CharacterInvalid,
    117 /*  23 - End of Transmit    */ CharacterInvalid,
    118 /*  24 - Cancel             */ CharacterInvalid,
    119 /*  25 - End of Medium      */ CharacterInvalid,
    120 /*  26 - Substitute         */ CharacterInvalid,
    121 /*  27 - Escape             */ CharacterInvalid,
    122 /*  28 - File Separator     */ CharacterInvalid,
    123 /*  29 - Group Separator    */ CharacterInvalid,
    124 /*  30 - Record Separator   */ CharacterInvalid,
    125 /*  31 - Unit Separator     */ CharacterInvalid,
    126 /*  32 - Space              */ CharacterWhiteSpace,
    127 /*  33 - !                  */ CharacterExclamationMark,
    128 /*  34 - "                  */ CharacterQuote,
    129 /*  35 - #                  */ CharacterInvalid,
    130 /*  36 - $                  */ CharacterIdentifierStart,
    131 /*  37 - %                  */ CharacterModulo,
    132 /*  38 - &                  */ CharacterAnd,
    133 /*  39 - '                  */ CharacterQuote,
    134 /*  40 - (                  */ CharacterOpenParen,
    135 /*  41 - )                  */ CharacterCloseParen,
    136 /*  42 - *                  */ CharacterMultiply,
    137 /*  43 - +                  */ CharacterAdd,
    138 /*  44 - ,                  */ CharacterComma,
    139 /*  45 - -                  */ CharacterSub,
    140 /*  46 - .                  */ CharacterDot,
    141 /*  47 - /                  */ CharacterSlash,
    142 /*  48 - 0                  */ CharacterZero,
    143 /*  49 - 1                  */ CharacterNumber,
    144 /*  50 - 2                  */ CharacterNumber,
    145 /*  51 - 3                  */ CharacterNumber,
    146 /*  52 - 4                  */ CharacterNumber,
    147 /*  53 - 5                  */ CharacterNumber,
    148 /*  54 - 6                  */ CharacterNumber,
    149 /*  55 - 7                  */ CharacterNumber,
    150 /*  56 - 8                  */ CharacterNumber,
    151 /*  57 - 9                  */ CharacterNumber,
    152 /*  58 - :                  */ CharacterColon,
    153 /*  59 - ;                  */ CharacterSemicolon,
    154 /*  60 - <                  */ CharacterLess,
    155 /*  61 - =                  */ CharacterEqual,
    156 /*  62 - >                  */ CharacterGreater,
    157 /*  63 - ?                  */ CharacterQuestion,
    158 /*  64 - @                  */ CharacterInvalid,
    159 /*  65 - A                  */ CharacterIdentifierStart,
    160 /*  66 - B                  */ CharacterIdentifierStart,
    161 /*  67 - C                  */ CharacterIdentifierStart,
    162 /*  68 - D                  */ CharacterIdentifierStart,
    163 /*  69 - E                  */ CharacterIdentifierStart,
    164 /*  70 - F                  */ CharacterIdentifierStart,
    165 /*  71 - G                  */ CharacterIdentifierStart,
    166 /*  72 - H                  */ CharacterIdentifierStart,
    167 /*  73 - I                  */ CharacterIdentifierStart,
    168 /*  74 - J                  */ CharacterIdentifierStart,
    169 /*  75 - K                  */ CharacterIdentifierStart,
    170 /*  76 - L                  */ CharacterIdentifierStart,
    171 /*  77 - M                  */ CharacterIdentifierStart,
    172 /*  78 - N                  */ CharacterIdentifierStart,
    173 /*  79 - O                  */ CharacterIdentifierStart,
    174 /*  80 - P                  */ CharacterIdentifierStart,
    175 /*  81 - Q                  */ CharacterIdentifierStart,
    176 /*  82 - R                  */ CharacterIdentifierStart,
    177 /*  83 - S                  */ CharacterIdentifierStart,
    178 /*  84 - T                  */ CharacterIdentifierStart,
    179 /*  85 - U                  */ CharacterIdentifierStart,
    180 /*  86 - V                  */ CharacterIdentifierStart,
    181 /*  87 - W                  */ CharacterIdentifierStart,
    182 /*  88 - X                  */ CharacterIdentifierStart,
    183 /*  89 - Y                  */ CharacterIdentifierStart,
    184 /*  90 - Z                  */ CharacterIdentifierStart,
    185 /*  91 - [                  */ CharacterOpenBracket,
    186 /*  92 - \                  */ CharacterBackSlash,
    187 /*  93 - ]                  */ CharacterCloseBracket,
    188 /*  94 - ^                  */ CharacterXor,
    189 /*  95 - _                  */ CharacterIdentifierStart,
    190 /*  96 - `                  */ CharacterInvalid,
    191 /*  97 - a                  */ CharacterIdentifierStart,
    192 /*  98 - b                  */ CharacterIdentifierStart,
    193 /*  99 - c                  */ CharacterIdentifierStart,
    194 /* 100 - d                  */ CharacterIdentifierStart,
    195 /* 101 - e                  */ CharacterIdentifierStart,
    196 /* 102 - f                  */ CharacterIdentifierStart,
    197 /* 103 - g                  */ CharacterIdentifierStart,
    198 /* 104 - h                  */ CharacterIdentifierStart,
    199 /* 105 - i                  */ CharacterIdentifierStart,
    200 /* 106 - j                  */ CharacterIdentifierStart,
    201 /* 107 - k                  */ CharacterIdentifierStart,
    202 /* 108 - l                  */ CharacterIdentifierStart,
    203 /* 109 - m                  */ CharacterIdentifierStart,
    204 /* 110 - n                  */ CharacterIdentifierStart,
    205 /* 111 - o                  */ CharacterIdentifierStart,
    206 /* 112 - p                  */ CharacterIdentifierStart,
    207 /* 113 - q                  */ CharacterIdentifierStart,
    208 /* 114 - r                  */ CharacterIdentifierStart,
    209 /* 115 - s                  */ CharacterIdentifierStart,
    210 /* 116 - t                  */ CharacterIdentifierStart,
    211 /* 117 - u                  */ CharacterIdentifierStart,
    212 /* 118 - v                  */ CharacterIdentifierStart,
    213 /* 119 - w                  */ CharacterIdentifierStart,
    214 /* 120 - x                  */ CharacterIdentifierStart,
    215 /* 121 - y                  */ CharacterIdentifierStart,
    216 /* 122 - z                  */ CharacterIdentifierStart,
    217 /* 123 - {                  */ CharacterOpenBrace,
    218 /* 124 - |                  */ CharacterOr,
    219 /* 125 - }                  */ CharacterCloseBrace,
    220 /* 126 - ~                  */ CharacterTilde,
    221 /* 127 - Delete             */ CharacterInvalid,
    222 };
    223 
    224 Lexer::Lexer(JSGlobalData* globalData)
    225     : m_isReparsing(false)
    226     , m_globalData(globalData)
    227     , m_keywordTable(JSC::mainTable)
    228 {
    229 }
    230 
    231 Lexer::~Lexer()
    232 {
    233     m_keywordTable.deleteTable();
    234 }
    235 
    236 ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
    237 {
    238     ASSERT(m_code <= m_codeEnd);
    239     return m_code;
    240 }
    241 
    242 ALWAYS_INLINE int Lexer::currentOffset() const
    243 {
    244     return currentCharacter() - m_codeStart;
    245 }
    246 
    247 void Lexer::setCode(const SourceCode& source, ParserArena& arena)
    248 {
    249     m_arena = &arena.identifierArena();
    250 
    251     m_lineNumber = source.firstLine();
    252     m_delimited = false;
    253     m_lastToken = -1;
    254 
    255     const UChar* data = source.provider()->data();
    256 
    257     m_source = &source;
    258     m_codeStart = data;
    259     m_code = data + source.startOffset();
    260     m_codeEnd = data + source.endOffset();
    261     m_error = false;
    262     m_atLineStart = true;
    263 
    264     m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
    265     m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
    266 
    267     if (LIKELY(m_code < m_codeEnd))
    268         m_current = *m_code;
    269     else
    270         m_current = -1;
    271     ASSERT(currentOffset() == source.startOffset());
    272 }
    273 
    274 ALWAYS_INLINE void Lexer::shift()
    275 {
    276     // Faster than an if-else sequence
    277     ASSERT(m_current != -1);
    278     m_current = -1;
    279     ++m_code;
    280     if (LIKELY(m_code < m_codeEnd))
    281         m_current = *m_code;
    282 }
    283 
    284 ALWAYS_INLINE int Lexer::peek(int offset)
    285 {
    286     // Only use if necessary
    287     ASSERT(offset > 0 && offset < 5);
    288     const UChar* code = m_code + offset;
    289     return (code < m_codeEnd) ? *code : -1;
    290 }
    291 
    292 int Lexer::getUnicodeCharacter()
    293 {
    294     int char1 = peek(1);
    295     int char2 = peek(2);
    296     int char3 = peek(3);
    297 
    298     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
    299         return -1;
    300 
    301     int result = convertUnicode(m_current, char1, char2, char3);
    302     shift();
    303     shift();
    304     shift();
    305     shift();
    306     return result;
    307 }
    308 
    309 void Lexer::shiftLineTerminator()
    310 {
    311     ASSERT(isLineTerminator(m_current));
    312 
    313     int m_prev = m_current;
    314     shift();
    315 
    316     // Allow both CRLF and LFCR.
    317     if (m_prev + m_current == '\n' + '\r')
    318         shift();
    319 
    320     ++m_lineNumber;
    321 }
    322 
    323 ALWAYS_INLINE const Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length)
    324 {
    325     return &m_arena->makeIdentifier(m_globalData, characters, length);
    326 }
    327 
    328 ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
    329 {
    330     return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
    331 }
    332 
    333 static NEVER_INLINE bool isNonASCIIIdentStart(int c)
    334 {
    335     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
    336 }
    337 
    338 static inline bool isIdentStart(int c)
    339 {
    340     return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c);
    341 }
    342 
    343 static NEVER_INLINE bool isNonASCIIIdentPart(int c)
    344 {
    345     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
    346         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
    347 }
    348 
    349 static inline bool isIdentPart(int c)
    350 {
    351     // Character types are divided into two groups depending on whether they can be part of an
    352     // identifier or not. Those whose type value is less or equal than CharacterNumber can be
    353     // part of an identifier. (See the CharacterType definition for more details.)
    354     return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c);
    355 }
    356 
    357 static inline int singleEscape(int c)
    358 {
    359     switch (c) {
    360     case 'b':
    361         return 0x08;
    362     case 't':
    363         return 0x09;
    364     case 'n':
    365         return 0x0A;
    366     case 'v':
    367         return 0x0B;
    368     case 'f':
    369         return 0x0C;
    370     case 'r':
    371         return 0x0D;
    372     case '\\':
    373         return '\\';
    374     case '\'':
    375         return '\'';
    376     case '"':
    377         return '"';
    378     default:
    379         return 0;
    380     }
    381 }
    382 
    383 inline void Lexer::record8(int c)
    384 {
    385     ASSERT(c >= 0);
    386     ASSERT(c <= 0xFF);
    387     m_buffer8.append(static_cast<char>(c));
    388 }
    389 
    390 inline void Lexer::record16(UChar c)
    391 {
    392     m_buffer16.append(c);
    393 }
    394 
    395 inline void Lexer::record16(int c)
    396 {
    397     ASSERT(c >= 0);
    398     ASSERT(c <= USHRT_MAX);
    399     record16(UChar(static_cast<unsigned short>(c)));
    400 }
    401 
    402 ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* lvalp, LexType lexType)
    403 {
    404     bool bufferRequired = false;
    405     const UChar* identifierStart = currentCharacter();
    406     int identifierLength;
    407 
    408     while (true) {
    409         if (LIKELY(isIdentPart(m_current))) {
    410             shift();
    411             continue;
    412         }
    413         if (LIKELY(m_current != '\\'))
    414             break;
    415 
    416         // \uXXXX unicode characters.
    417         bufferRequired = true;
    418         if (identifierStart != currentCharacter())
    419             m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
    420         shift();
    421         if (UNLIKELY(m_current != 'u'))
    422             return ERRORTOK;
    423         shift();
    424         int character = getUnicodeCharacter();
    425         if (UNLIKELY(character == -1))
    426             return ERRORTOK;
    427         if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character)))
    428             return ERRORTOK;
    429         record16(character);
    430         identifierStart = currentCharacter();
    431     }
    432 
    433     if (!bufferRequired)
    434         identifierLength = currentCharacter() - identifierStart;
    435     else {
    436         if (identifierStart != currentCharacter())
    437             m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
    438         identifierStart = m_buffer16.data();
    439         identifierLength = m_buffer16.size();
    440     }
    441 
    442     const Identifier* ident = makeIdentifier(identifierStart, identifierLength);
    443     lvalp->ident = ident;
    444     m_delimited = false;
    445 
    446     if (LIKELY(!bufferRequired && lexType == IdentifyReservedWords)) {
    447         // Keywords must not be recognized if there was an \uXXXX in the identifier.
    448         const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident);
    449         return entry ? static_cast<JSTokenType>(entry->lexerValue()) : IDENT;
    450     }
    451 
    452     m_buffer16.resize(0);
    453     return IDENT;
    454 }
    455 
    456 ALWAYS_INLINE bool Lexer::parseString(JSTokenData* lvalp, bool strictMode)
    457 {
    458     int stringQuoteCharacter = m_current;
    459     shift();
    460 
    461     const UChar* stringStart = currentCharacter();
    462 
    463     while (m_current != stringQuoteCharacter) {
    464         if (UNLIKELY(m_current == '\\')) {
    465             if (stringStart != currentCharacter())
    466                 m_buffer16.append(stringStart, currentCharacter() - stringStart);
    467             shift();
    468 
    469             int escape = singleEscape(m_current);
    470 
    471             // Most common escape sequences first
    472             if (escape) {
    473                 record16(escape);
    474                 shift();
    475             } else if (UNLIKELY(isLineTerminator(m_current)))
    476                 shiftLineTerminator();
    477             else if (m_current == 'x') {
    478                 shift();
    479                 if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) {
    480                     int prev = m_current;
    481                     shift();
    482                     record16(convertHex(prev, m_current));
    483                     shift();
    484                 } else
    485                     record16('x');
    486             } else if (m_current == 'u') {
    487                 shift();
    488                 int character = getUnicodeCharacter();
    489                 if (character != -1)
    490                     record16(character);
    491                 else if (m_current == stringQuoteCharacter)
    492                     record16('u');
    493                 else // Only stringQuoteCharacter allowed after \u
    494                     return false;
    495             } else if (strictMode && isASCIIDigit(m_current)) {
    496                 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
    497                 int character1 = m_current;
    498                 shift();
    499                 if (character1 != '0' || isASCIIDigit(m_current))
    500                     return false;
    501                 record16(0);
    502             } else if (!strictMode && isASCIIOctalDigit(m_current)) {
    503                 // Octal character sequences
    504                 int character1 = m_current;
    505                 shift();
    506                 if (isASCIIOctalDigit(m_current)) {
    507                     // Two octal characters
    508                     int character2 = m_current;
    509                     shift();
    510                     if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
    511                         record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
    512                         shift();
    513                     } else
    514                         record16((character1 - '0') * 8 + character2 - '0');
    515                 } else
    516                     record16(character1 - '0');
    517             } else if (m_current != -1) {
    518                 record16(m_current);
    519                 shift();
    520             } else
    521                 return false;
    522 
    523             stringStart = currentCharacter();
    524             continue;
    525         }
    526         // Fast check for characters that require special handling.
    527         // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
    528         // as possible, and lets through all common ASCII characters.
    529         if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
    530             // New-line or end of input is not allowed
    531             if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1))
    532                 return false;
    533             // Anything else is just a normal character
    534         }
    535         shift();
    536     }
    537 
    538     if (currentCharacter() != stringStart)
    539         m_buffer16.append(stringStart, currentCharacter() - stringStart);
    540     lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    541     m_buffer16.resize(0);
    542     return true;
    543 }
    544 
    545 ALWAYS_INLINE void Lexer::parseHex(double& returnValue)
    546 {
    547     // Optimization: most hexadecimal values fit into 4 bytes.
    548     uint32_t hexValue = 0;
    549     int maximumDigits = 7;
    550 
    551     // Shift out the 'x' prefix.
    552     shift();
    553 
    554     do {
    555         hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
    556         shift();
    557         --maximumDigits;
    558     } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
    559 
    560     if (maximumDigits >= 0) {
    561         returnValue = hexValue;
    562         return;
    563     }
    564 
    565     // No more place in the hexValue buffer.
    566     // The values are shifted out and placed into the m_buffer8 vector.
    567     for (int i = 0; i < 8; ++i) {
    568          int digit = hexValue >> 28;
    569          if (digit < 10)
    570              record8(digit + '0');
    571          else
    572              record8(digit - 10 + 'a');
    573          hexValue <<= 4;
    574     }
    575 
    576     while (isASCIIHexDigit(m_current)) {
    577         record8(m_current);
    578         shift();
    579     }
    580 
    581     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
    582 }
    583 
    584 ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue)
    585 {
    586     // Optimization: most octal values fit into 4 bytes.
    587     uint32_t octalValue = 0;
    588     int maximumDigits = 9;
    589     // Temporary buffer for the digits. Makes easier
    590     // to reconstruct the input characters when needed.
    591     char digits[10];
    592 
    593     do {
    594         octalValue = octalValue * 8 + (m_current - '0');
    595         digits[maximumDigits] = m_current;
    596         shift();
    597         --maximumDigits;
    598     } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
    599 
    600     if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
    601         returnValue = octalValue;
    602         return true;
    603     }
    604 
    605     for (int i = 9; i > maximumDigits; --i)
    606          record8(digits[i]);
    607 
    608     while (isASCIIOctalDigit(m_current)) {
    609         record8(m_current);
    610         shift();
    611     }
    612 
    613     if (isASCIIDigit(m_current))
    614         return false;
    615 
    616     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
    617     return true;
    618 }
    619 
    620 ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue)
    621 {
    622     // Optimization: most decimal values fit into 4 bytes.
    623     uint32_t decimalValue = 0;
    624 
    625     // Since parseOctal may be executed before parseDecimal,
    626     // the m_buffer8 may hold ascii digits.
    627     if (!m_buffer8.size()) {
    628         int maximumDigits = 9;
    629         // Temporary buffer for the digits. Makes easier
    630         // to reconstruct the input characters when needed.
    631         char digits[10];
    632 
    633         do {
    634             decimalValue = decimalValue * 10 + (m_current - '0');
    635             digits[maximumDigits] = m_current;
    636             shift();
    637             --maximumDigits;
    638         } while (isASCIIDigit(m_current) && maximumDigits >= 0);
    639 
    640         if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
    641             returnValue = decimalValue;
    642             return true;
    643         }
    644 
    645         for (int i = 9; i > maximumDigits; --i)
    646             record8(digits[i]);
    647     }
    648 
    649     while (isASCIIDigit(m_current)) {
    650         record8(m_current);
    651         shift();
    652     }
    653 
    654     return false;
    655 }
    656 
    657 ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint()
    658 {
    659     record8('.');
    660     while (isASCIIDigit(m_current)) {
    661         record8(m_current);
    662         shift();
    663     }
    664 }
    665 
    666 ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator()
    667 {
    668     record8('e');
    669     shift();
    670     if (m_current == '+' || m_current == '-') {
    671         record8(m_current);
    672         shift();
    673     }
    674 
    675     if (!isASCIIDigit(m_current))
    676         return false;
    677 
    678     do {
    679         record8(m_current);
    680         shift();
    681     } while (isASCIIDigit(m_current));
    682     return true;
    683 }
    684 
    685 ALWAYS_INLINE bool Lexer::parseMultilineComment()
    686 {
    687     while (true) {
    688         while (UNLIKELY(m_current == '*')) {
    689             shift();
    690             if (m_current == '/') {
    691                 shift();
    692                 return true;
    693             }
    694         }
    695 
    696         if (UNLIKELY(m_current == -1))
    697             return false;
    698 
    699         if (isLineTerminator(m_current))
    700             shiftLineTerminator();
    701         else
    702             shift();
    703     }
    704 }
    705 
    706 bool Lexer::nextTokenIsColon()
    707 {
    708     const UChar* code = m_code;
    709     while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
    710         code++;
    711 
    712     return code < m_codeEnd && *code == ':';
    713 }
    714 
    715 JSTokenType Lexer::lex(JSTokenData* lvalp, JSTokenInfo* llocp, LexType lexType, bool strictMode)
    716 {
    717     ASSERT(!m_error);
    718     ASSERT(m_buffer8.isEmpty());
    719     ASSERT(m_buffer16.isEmpty());
    720 
    721     JSTokenType token = ERRORTOK;
    722     m_terminator = false;
    723 
    724 start:
    725     while (isWhiteSpace(m_current))
    726         shift();
    727 
    728     int startOffset = currentOffset();
    729 
    730     if (UNLIKELY(m_current == -1))
    731         return EOFTOK;
    732 
    733     m_delimited = false;
    734 
    735     CharacterType type;
    736     if (LIKELY(isASCII(m_current)))
    737         type = static_cast<CharacterType>(typesOfASCIICharacters[m_current]);
    738     else if (isNonASCIIIdentStart(m_current))
    739         type = CharacterIdentifierStart;
    740     else if (isLineTerminator(m_current))
    741         type = CharacterLineTerminator;
    742     else
    743         type = CharacterInvalid;
    744 
    745     switch (type) {
    746     case CharacterGreater:
    747         shift();
    748         if (m_current == '>') {
    749             shift();
    750             if (m_current == '>') {
    751                 shift();
    752                 if (m_current == '=') {
    753                     shift();
    754                     token = URSHIFTEQUAL;
    755                     break;
    756                 }
    757                 token = URSHIFT;
    758                 break;
    759             }
    760             if (m_current == '=') {
    761                 shift();
    762                 token = RSHIFTEQUAL;
    763                 break;
    764             }
    765             token = RSHIFT;
    766             break;
    767         }
    768         if (m_current == '=') {
    769             shift();
    770             token = GE;
    771             break;
    772         }
    773         token = GT;
    774         break;
    775     case CharacterEqual:
    776         shift();
    777         if (m_current == '=') {
    778             shift();
    779             if (m_current == '=') {
    780                 shift();
    781                 token = STREQ;
    782                 break;
    783             }
    784             token = EQEQ;
    785             break;
    786         }
    787         token = EQUAL;
    788         break;
    789     case CharacterLess:
    790         shift();
    791         if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
    792             // <!-- marks the beginning of a line comment (for www usage)
    793             goto inSingleLineComment;
    794         }
    795         if (m_current == '<') {
    796             shift();
    797             if (m_current == '=') {
    798                 shift();
    799                 token = LSHIFTEQUAL;
    800                 break;
    801             }
    802             token = LSHIFT;
    803             break;
    804         }
    805         if (m_current == '=') {
    806             shift();
    807             token = LE;
    808             break;
    809         }
    810         token = LT;
    811         break;
    812     case CharacterExclamationMark:
    813         shift();
    814         if (m_current == '=') {
    815             shift();
    816             if (m_current == '=') {
    817                 shift();
    818                 token = STRNEQ;
    819                 break;
    820             }
    821             token = NE;
    822             break;
    823         }
    824         token = EXCLAMATION;
    825         break;
    826     case CharacterAdd:
    827         shift();
    828         if (m_current == '+') {
    829             shift();
    830             token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
    831             break;
    832         }
    833         if (m_current == '=') {
    834             shift();
    835             token = PLUSEQUAL;
    836             break;
    837         }
    838         token = PLUS;
    839         break;
    840     case CharacterSub:
    841         shift();
    842         if (m_current == '-') {
    843             shift();
    844             if (m_atLineStart && m_current == '>') {
    845                 shift();
    846                 goto inSingleLineComment;
    847             }
    848             token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
    849             break;
    850         }
    851         if (m_current == '=') {
    852             shift();
    853             token = MINUSEQUAL;
    854             break;
    855         }
    856         token = MINUS;
    857         break;
    858     case CharacterMultiply:
    859         shift();
    860         if (m_current == '=') {
    861             shift();
    862             token = MULTEQUAL;
    863             break;
    864         }
    865         token = TIMES;
    866         break;
    867     case CharacterSlash:
    868         shift();
    869         if (m_current == '/') {
    870             shift();
    871             goto inSingleLineComment;
    872         }
    873         if (m_current == '*') {
    874             shift();
    875             if (parseMultilineComment())
    876                 goto start;
    877             goto returnError;
    878         }
    879         if (m_current == '=') {
    880             shift();
    881             token = DIVEQUAL;
    882             break;
    883         }
    884         token = DIVIDE;
    885         break;
    886     case CharacterAnd:
    887         shift();
    888         if (m_current == '&') {
    889             shift();
    890             token = AND;
    891             break;
    892         }
    893         if (m_current == '=') {
    894             shift();
    895             token = ANDEQUAL;
    896             break;
    897         }
    898         token = BITAND;
    899         break;
    900     case CharacterXor:
    901         shift();
    902         if (m_current == '=') {
    903             shift();
    904             token = XOREQUAL;
    905             break;
    906         }
    907         token = BITXOR;
    908         break;
    909     case CharacterModulo:
    910         shift();
    911         if (m_current == '=') {
    912             shift();
    913             token = MODEQUAL;
    914             break;
    915         }
    916         token = MOD;
    917         break;
    918     case CharacterOr:
    919         shift();
    920         if (m_current == '=') {
    921             shift();
    922             token = OREQUAL;
    923             break;
    924         }
    925         if (m_current == '|') {
    926             shift();
    927             token = OR;
    928             break;
    929         }
    930         token = BITOR;
    931         break;
    932     case CharacterOpenParen:
    933         token = OPENPAREN;
    934         shift();
    935         break;
    936     case CharacterCloseParen:
    937         token = CLOSEPAREN;
    938         shift();
    939         break;
    940     case CharacterOpenBracket:
    941         token = OPENBRACKET;
    942         shift();
    943         break;
    944     case CharacterCloseBracket:
    945         token = CLOSEBRACKET;
    946         shift();
    947         break;
    948     case CharacterComma:
    949         token = COMMA;
    950         shift();
    951         break;
    952     case CharacterColon:
    953         token = COLON;
    954         shift();
    955         break;
    956     case CharacterQuestion:
    957         token = QUESTION;
    958         shift();
    959         break;
    960     case CharacterTilde:
    961         token = TILDE;
    962         shift();
    963         break;
    964     case CharacterSemicolon:
    965         m_delimited = true;
    966         shift();
    967         token = SEMICOLON;
    968         break;
    969     case CharacterOpenBrace:
    970         lvalp->intValue = currentOffset();
    971         shift();
    972         token = OPENBRACE;
    973         break;
    974     case CharacterCloseBrace:
    975         lvalp->intValue = currentOffset();
    976         m_delimited = true;
    977         shift();
    978         token = CLOSEBRACE;
    979         break;
    980     case CharacterDot:
    981         shift();
    982         if (!isASCIIDigit(m_current)) {
    983             token = DOT;
    984             break;
    985         }
    986         goto inNumberAfterDecimalPoint;
    987     case CharacterZero:
    988         shift();
    989         if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
    990             parseHex(lvalp->doubleValue);
    991             token = NUMBER;
    992         } else {
    993             record8('0');
    994             if (isASCIIOctalDigit(m_current)) {
    995                 if (parseOctal(lvalp->doubleValue)) {
    996                     if (strictMode)
    997                         goto returnError;
    998                     token = NUMBER;
    999                 }
   1000             }
   1001         }
   1002         // Fall through into CharacterNumber
   1003     case CharacterNumber:
   1004         if (LIKELY(token != NUMBER)) {
   1005             if (!parseDecimal(lvalp->doubleValue)) {
   1006                 if (m_current == '.') {
   1007                     shift();
   1008 inNumberAfterDecimalPoint:
   1009                     parseNumberAfterDecimalPoint();
   1010                 }
   1011                 if ((m_current | 0x20) == 'e')
   1012                     if (!parseNumberAfterExponentIndicator())
   1013                         goto returnError;
   1014                 // Null-terminate string for strtod.
   1015                 m_buffer8.append('\0');
   1016                 lvalp->doubleValue = WTF::strtod(m_buffer8.data(), 0);
   1017             }
   1018             token = NUMBER;
   1019         }
   1020 
   1021         // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
   1022         if (UNLIKELY(isIdentStart(m_current)))
   1023             goto returnError;
   1024         m_buffer8.resize(0);
   1025         m_delimited = false;
   1026         break;
   1027     case CharacterQuote:
   1028         if (UNLIKELY(!parseString(lvalp, strictMode)))
   1029             goto returnError;
   1030         shift();
   1031         m_delimited = false;
   1032         token = STRING;
   1033         break;
   1034     case CharacterIdentifierStart:
   1035         ASSERT(isIdentStart(m_current));
   1036         // Fall through into CharacterBackSlash.
   1037     case CharacterBackSlash:
   1038         token = parseIdentifier(lvalp, lexType);
   1039         break;
   1040     case CharacterLineTerminator:
   1041         ASSERT(isLineTerminator(m_current));
   1042         shiftLineTerminator();
   1043         m_atLineStart = true;
   1044         m_terminator = true;
   1045         goto start;
   1046     case CharacterInvalid:
   1047         goto returnError;
   1048     default:
   1049         ASSERT_NOT_REACHED();
   1050         goto returnError;
   1051     }
   1052 
   1053     m_atLineStart = false;
   1054     goto returnToken;
   1055 
   1056 inSingleLineComment:
   1057     while (!isLineTerminator(m_current)) {
   1058         if (UNLIKELY(m_current == -1))
   1059             return EOFTOK;
   1060         shift();
   1061     }
   1062     shiftLineTerminator();
   1063     m_atLineStart = true;
   1064     m_terminator = true;
   1065     if (!lastTokenWasRestrKeyword())
   1066         goto start;
   1067 
   1068     token = SEMICOLON;
   1069     m_delimited = true;
   1070     // Fall through into returnToken.
   1071 
   1072 returnToken:
   1073     llocp->line = m_lineNumber;
   1074     llocp->startOffset = startOffset;
   1075     llocp->endOffset = currentOffset();
   1076     m_lastToken = token;
   1077     return token;
   1078 
   1079 returnError:
   1080     m_error = true;
   1081     return ERRORTOK;
   1082 }
   1083 
   1084 bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
   1085 {
   1086     ASSERT(m_buffer16.isEmpty());
   1087 
   1088     bool lastWasEscape = false;
   1089     bool inBrackets = false;
   1090 
   1091     if (patternPrefix) {
   1092         ASSERT(!isLineTerminator(patternPrefix));
   1093         ASSERT(patternPrefix != '/');
   1094         ASSERT(patternPrefix != '[');
   1095         record16(patternPrefix);
   1096     }
   1097 
   1098     while (true) {
   1099         int current = m_current;
   1100 
   1101         if (isLineTerminator(current) || current == -1) {
   1102             m_buffer16.resize(0);
   1103             return false;
   1104         }
   1105 
   1106         shift();
   1107 
   1108         if (current == '/' && !lastWasEscape && !inBrackets)
   1109             break;
   1110 
   1111         record16(current);
   1112 
   1113         if (lastWasEscape) {
   1114             lastWasEscape = false;
   1115             continue;
   1116         }
   1117 
   1118         switch (current) {
   1119         case '[':
   1120             inBrackets = true;
   1121             break;
   1122         case ']':
   1123             inBrackets = false;
   1124             break;
   1125         case '\\':
   1126             lastWasEscape = true;
   1127             break;
   1128         }
   1129     }
   1130 
   1131     pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
   1132     m_buffer16.resize(0);
   1133 
   1134     while (isIdentPart(m_current)) {
   1135         record16(m_current);
   1136         shift();
   1137     }
   1138 
   1139     flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
   1140     m_buffer16.resize(0);
   1141 
   1142     return true;
   1143 }
   1144 
   1145 bool Lexer::skipRegExp()
   1146 {
   1147     bool lastWasEscape = false;
   1148     bool inBrackets = false;
   1149 
   1150     while (true) {
   1151         int current = m_current;
   1152 
   1153         if (isLineTerminator(current) || current == -1)
   1154             return false;
   1155 
   1156         shift();
   1157 
   1158         if (current == '/' && !lastWasEscape && !inBrackets)
   1159             break;
   1160 
   1161         if (lastWasEscape) {
   1162             lastWasEscape = false;
   1163             continue;
   1164         }
   1165 
   1166         switch (current) {
   1167         case '[':
   1168             inBrackets = true;
   1169             break;
   1170         case ']':
   1171             inBrackets = false;
   1172             break;
   1173         case '\\':
   1174             lastWasEscape = true;
   1175             break;
   1176         }
   1177     }
   1178 
   1179     while (isIdentPart(m_current))
   1180         shift();
   1181 
   1182     return true;
   1183 }
   1184 
   1185 void Lexer::clear()
   1186 {
   1187     m_arena = 0;
   1188 
   1189     Vector<char> newBuffer8;
   1190     m_buffer8.swap(newBuffer8);
   1191 
   1192     Vector<UChar> newBuffer16;
   1193     m_buffer16.swap(newBuffer16);
   1194 
   1195     m_isReparsing = false;
   1196 }
   1197 
   1198 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
   1199 {
   1200     ASSERT(m_source->provider()->data()[openBrace] == '{');
   1201     ASSERT(m_source->provider()->data()[closeBrace] == '}');
   1202     return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
   1203 }
   1204 
   1205 } // namespace JSC
   1206