Home | History | Annotate | Download | only in src
      1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #ifndef V8_SCANNER_H_
     29 #define V8_SCANNER_H_
     30 
     31 #include "token.h"
     32 #include "char-predicates-inl.h"
     33 
     34 namespace v8 {
     35 namespace internal {
     36 
     37 
     38 class UTF8Buffer {
        // Character buffer addressed through three raw pointers: data_ (start of
        // the storage), cursor_ (next write position) and limit_ (the highest
        // cursor position that still leaves unibrow::Utf8::kMaxEncodedSize bytes
        // free, see ComputeLimit).  Scanner keeps two of these as its literal
        // buffers.
     39  public:
     40   UTF8Buffer();
     41   ~UTF8Buffer();
     42 
          // Appends the character c.  Fast path: while the cursor has not passed
          // limit_ and c fits in a single byte (c <= Utf8::kMaxOneByteChar) the
          // byte is stored directly.  Every other case is delegated to
          // AddCharSlow().  Asserts that data_ is non-NULL; presumably that means
          // Reset() must have been called first (constructor not visible here).
     43   void AddChar(uc32 c) {
     44     ASSERT_NOT_NULL(data_);
     45     if (cursor_ <= limit_ &&
     46         static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
     47       *cursor_++ = static_cast<char>(c);
     48     } else {
     49       AddCharSlow(c);
     50     }
     51   }
     52 
          // Rewinds the cursor to the start of the buffer.  Storage for
          // kInitialCapacity chars is allocated only when data_ is still NULL;
          // otherwise the existing storage is kept and reused.
     53   void Reset() {
     54     if (data_ == NULL) {
     55       data_ = NewArray<char>(kInitialCapacity);
     56       limit_ = ComputeLimit(data_, kInitialCapacity);
     57     }
     58     cursor_ = data_;
     59   }
     60 
          // Number of chars between the start of the buffer and the cursor.
     61   int pos() const {
     62     ASSERT_NOT_NULL(data_);
     63     return static_cast<int>(cursor_ - data_);
     64   }
     65 
          // Start of the buffer.  Unlike pos(), this does not assert that data_ is
          // non-NULL.
     66   char* data() const { return data_; }
     67 
     68  private:
          // Size, in chars, of the allocation made by Reset().
     69   static const int kInitialCapacity = 256;
     70   char* data_;    // Start of the storage.
     71   char* cursor_;  // Next position to write to.
     72   char* limit_;   // See ComputeLimit().
     73 
          // Size of the storage in chars; the inverse of ComputeLimit().
     74   int Capacity() const {
     75     ASSERT_NOT_NULL(data_);
     76     return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
     77   }
     78 
          // The limit sits kMaxEncodedSize chars before the end of the storage, so
          // a cursor at or below the limit always has kMaxEncodedSize chars left.
     79   static char* ComputeLimit(char* data, int capacity) {
     80     return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
     81   }
     82 
          // Out-of-line path of AddChar(); defined elsewhere.  Presumably handles
          // multi-byte encodings and a full buffer -- confirm in the .cc file.
     83   void AddCharSlow(uc32 c);
     84 };
     85 
     86 
     87 class UTF16Buffer {
        // Abstract character source.  Subclasses implement PushBack(), Advance()
        // and SeekForward(); this base class only holds the shared state (data_,
        // pos_, size_) and its accessors.
     88  public:
     89   UTF16Buffer();
     90   virtual ~UTF16Buffer() {}
     91 
          // Hands the character ch back to the buffer.
     92   virtual void PushBack(uc32 ch) = 0;
     93   // returns a value < 0 when the buffer end is reached
     94   virtual uc32 Advance() = 0;
          // Moves forward to position pos.
     95   virtual void SeekForward(int pos) = 0;
     96 
          // Accessors for the protected pos_ and size_ fields.
     97   int pos() const { return pos_; }
     98   int size() const { return size_; }
          // Defined elsewhere; presumably returns the [start, end) part of data_
          // -- confirm whether end is inclusive.
     99   Handle<String> SubString(int start, int end);
    100 
    101  protected:
    102   Handle<String> data_;  // String handle associated with this buffer.
    103   int pos_;              // Value reported by pos().
    104   int size_;             // Value reported by size().
    105 };
    106 
    107 
    108 class CharacterStreamUTF16Buffer: public UTF16Buffer {
        // UTF16Buffer implementation that is initialized with a string handle and
        // a unibrow::CharacterStream.
    109  public:
    110   CharacterStreamUTF16Buffer();
    111   virtual ~CharacterStreamUTF16Buffer() {}
          // Sets up the buffer from the given string handle and character stream.
          // NOTE(review): stream is a raw pointer; ownership and required lifetime
          // are not visible here -- confirm against the caller.
    112   void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
          // Implementations of the UTF16Buffer interface.
    113   virtual void PushBack(uc32 ch);
    114   virtual uc32 Advance();
    115   virtual void SeekForward(int pos);
    116 
    117  private:
          // Presumably holds characters handed back through PushBack() -- confirm
          // in the .cc file.
    118   List<uc32> pushback_buffer_;
          // NOTE(review): the exact meaning of last_ is not visible in this header.
    119   uc32 last_;
    120   unibrow::CharacterStream* stream_;
    121 
          // Pointer accessor for pushback_buffer_.
    122   List<uc32>* pushback_buffer() { return &pushback_buffer_; }
    123 };
    124 
    125 
    126 class TwoByteStringUTF16Buffer: public UTF16Buffer {
        // UTF16Buffer implementation that is initialized with an
        // ExternalTwoByteString handle.
    127  public:
    128   TwoByteStringUTF16Buffer();
    129   virtual ~TwoByteStringUTF16Buffer() {}
          // Sets up the buffer from the given external two-byte string.
    130   void Initialize(Handle<ExternalTwoByteString> data);
          // Implementations of the UTF16Buffer interface.
    131   virtual void PushBack(uc32 ch);
    132   virtual uc32 Advance();
    133   virtual void SeekForward(int pos);
    134 
    135  private:
          // Presumably points at the 16-bit code units of the string passed to
          // Initialize() -- confirm in the .cc file.
    136   const uint16_t* raw_data_;
    137 };
    138 
    139 
    140 class KeywordMatcher {
    141 //  Incrementally recognize keywords.
    142 //
    143 //  Recognized keywords:
    144 //      break case catch const* continue debugger* default delete do else
    145 //      finally false for function if in instanceof native* new null
    146 //      return switch this throw true try typeof var void while with
    147 //
    148 //  *: Actually "future reserved keywords". These are the only ones we
    149 //     recognize, the remaining are allowed as identifiers.
    150  public:
          // Starts in the INITIAL state with the token set to IDENTIFIER.
          // NOTE(review): keyword_, counter_ and keyword_token_ are not initialized
          // here; they are assigned by MatchKeywordStart().  Confirm that Step()
          // never reads them before that.
    151   KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
    152 
          // Token for the characters added so far.
    153   Token::Value token() { return token_; }
    154 
          // Feeds one character to the matcher.  Input is ignored once the matcher
          // is in the UNMATCHABLE state.
    155   inline void AddChar(uc32 input) {
    156     if (state_ != UNMATCHABLE) {
    157       Step(input);
    158     }
    159   }
    160 
          // Gives up on matching: the token becomes IDENTIFIER and all later
          // AddChar() calls are no-ops.
    161   void Fail() {
    162     token_ = Token::IDENTIFIER;
    163     state_ = UNMATCHABLE;
    164   }
    165 
    166  private:
          // Matcher states.  UNMATCHABLE is entered by Fail(); KEYWORD_PREFIX is
          // entered by MatchKeywordStart().  The letter-named states (C, CA, ...)
          // are presumably named after the keyword prefix consumed so far --
          // confirm in Step().
    167   enum State {
    168     UNMATCHABLE,
    169     INITIAL,
    170     KEYWORD_PREFIX,
    171     KEYWORD_MATCHED,
    172     C,
    173     CA,
    174     CO,
    175     CON,
    176     D,
    177     DE,
    178     F,
    179     I,
    180     IN,
    181     N,
    182     T,
    183     TH,
    184     TR,
    185     V,
    186     W
    187   };
    188 
          // Element type of the first_states_ table below.
    189   struct FirstState {
    190     const char* keyword;
    191     State state;
    192     Token::Value token;
    193   };
    194 
    195   // Range of possible first characters of a keyword.
          // ('b' for "break" through 'w' for "while"/"with" in the list above.)
    196   static const unsigned int kFirstCharRangeMin = 'b';
    197   static const unsigned int kFirstCharRangeMax = 'w';
    198   static const unsigned int kFirstCharRangeLength =
    199       kFirstCharRangeMax - kFirstCharRangeMin + 1;
    200   // State map for first keyword character range.
          // Presumably indexed by (first character - kFirstCharRangeMin) -- confirm
          // in Step().
    201   static FirstState first_states_[kFirstCharRangeLength];
    202 
    203   // Current state.
    204   State state_;
    205   // Token for currently added characters.
    206   Token::Value token_;
    207 
    208   // Matching a specific keyword string (there is only one possible valid
    209   // keyword with the current prefix).
          // All three fields are written together by MatchKeywordStart().
    210   const char* keyword_;
    211   int counter_;
    212   Token::Value keyword_token_;
    213 
    214   // If input equals keyword's character at position, continue matching keyword
    215   // from that position.
          // On a match the state becomes KEYWORD_PREFIX, counter_ is set to
          // position + 1, and true is returned; otherwise nothing changes and false
          // is returned.
    216   inline bool MatchKeywordStart(uc32 input,
    217                                 const char* keyword,
    218                                 int position,
    219                                 Token::Value token_if_match) {
    220     if (input == keyword[position]) {
    221       state_ = KEYWORD_PREFIX;
    222       this->keyword_ = keyword;
    223       this->counter_ = position + 1;
    224       this->keyword_token_ = token_if_match;
    225       return true;
    226     }
    227     return false;
    228   }
    229 
    230   // If input equals match character, transition to new state and return true.
    231   inline bool MatchState(uc32 input, char match, State new_state) {
    232     if (input == match) {
    233       state_ = new_state;
    234       return true;
    235     }
    236     return false;
    237   }
    238 
          // If input equals match character, transition to new state, set the
          // current token to keyword_token and return true.
    239   inline bool MatchKeyword(uc32 input,
    240                            char match,
    241                            State new_state,
    242                            Token::Value keyword_token) {
    243     if (input == match) {  // Input completes a keyword.
    244       state_ = new_state;
    245       token_ = keyword_token;
    246       return true;
    247     }
    248     return false;
    249   }
    250 
          // Processes one input character; called from AddChar().  Defined
          // elsewhere.
    251   void Step(uc32 input);
    252 };
    253 
    254 
    255 enum ParserMode { PARSE, PREPARSE };  // Argument of Scanner's constructor.
    256 enum ParserLanguage { JAVASCRIPT, JSON };  // Argument of Scanner::Init.
    257 
    258 
    259 class Scanner {
        // Tokenizer with one token of look-ahead: current_ describes the token
        // last returned by Next(), next_ the token peek() reports.  Scans either
        // JavaScript or JSON, selected by the ParserLanguage passed to Init().
    260  public:
    261   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
    262 
    263   // Construction
    264   explicit Scanner(ParserMode parse_mode);
    265 
    266   // Initialize the Scanner to scan source:
    267   void Init(Handle<String> source,
    268             unibrow::CharacterStream* stream,
    269             int position,
    270             ParserLanguage language);
    271 
    272   // Returns the next token.
    273   Token::Value Next();
    274 
    275   // One token look-ahead (past the token returned by Next()).
    276   Token::Value peek() const  { return next_.token; }
    277 
    278   // Returns true if there was a line terminator before the peek'ed token.
    279   bool has_line_terminator_before_next() const {
    280     return has_line_terminator_before_next_;
    281   }
    282 
          // Pair of source positions; default-constructed as (0, 0).
    283   struct Location {
    284     Location(int b, int e) : beg_pos(b), end_pos(e) { }
    285     Location() : beg_pos(0), end_pos(0) { }
    286     int beg_pos;
    287     int end_pos;
    288   };
    289 
    290   // Returns the location information for the current token
    291   // (the token returned by Next()).
    292   Location location() const  { return current_.location; }
          // Same, for the look-ahead token reported by peek().
    293   Location peek_location() const  { return next_.location; }
    294 
    295   // Returns the literal string, if any, for the current token (the
    296   // token returned by Next()). The string is 0-terminated and in
    297   // UTF-8 format; they may contain 0-characters. Literal strings are
    298   // collected for identifiers, strings, and numbers.
    299   // These functions only give the correct result if the literal
    300   // was scanned between calls to StartLiteral() and TerminateLiteral().
    301   const char* literal_string() const {
    302     return current_.literal_buffer->data();
    303   }
    304   int literal_length() const {
    305     // Excluding terminal '\0' added by TerminateLiteral().
    306     return current_.literal_buffer->pos() - 1;
    307   }
    308 
    309   // Returns the literal string for the next token (the token that
    310   // would be returned if Next() were called).
    311   const char* next_literal_string() const {
    312     return next_.literal_buffer->data();
    313   }
    314   // Returns the length of the next token (that would be returned if
    315   // Next() were called).
          // As in literal_length(), one is subtracted from the buffer position.
    316   int next_literal_length() const {
    317     return next_.literal_buffer->pos() - 1;
    318   }
    319 
          // next_literal_string() and next_literal_length() bundled in a Vector.
    320   Vector<const char> next_literal() const {
    321     return Vector<const char>(next_literal_string(),
    322                               next_literal_length());
    323   }
    324 
    325   // Scans the input as a regular expression pattern, previous
    326   // character(s) must be /(=). Returns true if a pattern is scanned.
    327   bool ScanRegExpPattern(bool seen_equal);
    328   // Returns true if regexp flags are scanned (always since flags can
    329   // be empty).
    330   bool ScanRegExpFlags();
    331 
    332   // Seek forward to the given position.  This operation does not
    333   // work in general, for instance when there are pushed back
    334   // characters, but works for seeking forward until simple delimiter
    335   // tokens, which is what it is used for.
    336   void SeekForward(int pos);
    337 
          // Defined elsewhere; presumably forwards to the source buffer's
          // SubString() -- confirm in the .cc file.
    338   Handle<String> SubString(int start_pos, int end_pos);
          // Returns the stack_overflow_ flag.
    339   bool stack_overflow() { return stack_overflow_; }
    340 
          // Accessor for the decoder resource shared by all Scanner instances.
    341   static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
    342 
    343   // Tells whether the buffer contains an identifier (no escapes).
    344   // Used for checking if a property name is an identifier.
    345   static bool IsIdentifier(unibrow::CharacterStream* buffer);
    346 
          // Shared character-class predicates.
    347   static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
    348   static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
    349   static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
    350   static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
    351 
          // Subtracted in source_pos(); matches the single look-ahead character
          // held in c0_.
    352   static const int kCharacterLookaheadBufferSize = 1;
    353 
    354  private:
          // The two concrete input buffers owned by the scanner.
    355   CharacterStreamUTF16Buffer char_stream_buffer_;
    356   TwoByteStringUTF16Buffer two_byte_string_buffer_;
    357 
    358   // Source.
          // Advance() and PushBack() read through source_.  Presumably it points at
          // one of the two buffers above -- confirm in Init().  position_ is the
          // offset added to the buffer position in source_pos().
    359   UTF16Buffer* source_;
    360   int position_;
    361 
    362   // Buffer to hold literal values (identifiers, strings, numbers)
    363   // using 0-terminated UTF-8 encoding.
    364   UTF8Buffer literal_buffer_1_;
    365   UTF8Buffer literal_buffer_2_;
    366 
    367   bool stack_overflow_;
    368   static StaticResource<Utf8Decoder> utf8_decoder_;
    369 
    370   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
    371   uc32 c0_;
    372 
    373   // The current and look-ahead token.
    374   struct TokenDesc {
    375     Token::Value token;
    376     Location location;
    377     UTF8Buffer* literal_buffer;
    378   };
    379 
    380   TokenDesc current_;  // desc for current token (as returned by Next())
    381   TokenDesc next_;     // desc for next token (one token look-ahead)
    382   bool has_line_terminator_before_next_;
          // is_parsing_json_ selects the JSON variants in SkipWhiteSpace() and
          // Scan().  Presumably both flags are derived from the ParserMode and
          // ParserLanguage arguments -- confirm in the .cc file.
    383   bool is_pre_parsing_;
    384   bool is_parsing_json_;
    385 
    386   // Literal buffer support
    387   void StartLiteral();
    388   void AddChar(uc32 ch);
    389   void AddCharAdvance();
    390   void TerminateLiteral();
    391 
    392   // Low-level scanning support.
          // Advance() loads the next character of the source into c0_.  PushBack()
          // hands ch back to the source and makes it the look-ahead character.
    393   void Advance() { c0_ = source_->Advance(); }
    394   void PushBack(uc32 ch) {
    395     source_->PushBack(ch);
    396     c0_ = ch;
    397   }
    398 
          // Dispatches to the JSON or JavaScript whitespace skipper and returns
          // its result.
    399   bool SkipWhiteSpace() {
    400     if (is_parsing_json_) {
    401       return SkipJsonWhiteSpace();
    402     } else {
    403       return SkipJavaScriptWhiteSpace();
    404     }
    405   }
    406   bool SkipJavaScriptWhiteSpace();
    407   bool SkipJsonWhiteSpace();
    408   Token::Value SkipSingleLineComment();
    409   Token::Value SkipMultiLineComment();
    410 
    411   inline Token::Value Select(Token::Value tok);
    412   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
    413 
          // Scans one token with the scanner for the language being parsed.
    414   inline void Scan() {
    415     if (is_parsing_json_) {
    416       ScanJson();
    417     } else {
    418       ScanJavaScript();
    419     }
    420   }
    421 
    422   // Scans a single JavaScript token.
    423   void ScanJavaScript();
    424 
    425   // Scan a single JSON token. The JSON lexical grammar is specified in the
    426   // ECMAScript 5 standard, section 15.12.1.1.
    427   // Recognizes all of the single-character tokens directly, or calls a function
    428   // to scan a number, string or identifier literal.
    429   // The only allowed whitespace characters between tokens are tab,
    430   // carriage-return, newline and space.
    431   void ScanJson();
    432 
    433   // A JSON number (production JSONNumber) is a subset of the valid JavaScript
    434   // decimal number literals.
    435   // It includes an optional minus sign, must have at least one
    436   // digit before and after a decimal point, may not have prefixed zeros (unless
    437   // the integer part is zero), and may include an exponent part (e.g., "e-10").
    438   // Hexadecimal and octal numbers are not allowed.
    439   Token::Value ScanJsonNumber();
    440   // A JSON string (production JSONString) is a subset of valid JavaScript
    441   // string literals. The string must only be double-quoted (not
    442   // single-quoted), and the only allowed backslash-escapes are ", /, \, b, f,
    443   // n, r, t and four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
    444   Token::Value ScanJsonString();
    445   // Used to recognize one of the literals "true", "false", or "null". These
    446   // are the only valid JSON identifiers (productions JSONBooleanLiteral,
    447   // JSONNullLiteral).
    448   Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
    449 
          // JavaScript literal scanners; defined elsewhere.
    450   void ScanDecimalDigits();
    451   Token::Value ScanNumber(bool seen_period);
    452   Token::Value ScanIdentifier();
    453   uc32 ScanHexEscape(uc32 c, int length);
    454   uc32 ScanOctalEscape(uc32 c, int length);
    455   void ScanEscape();
    456   Token::Value ScanString();
    457 
    458   // Scans a possible HTML comment -- begins with '<!'.
    459   Token::Value ScanHtmlComment();
    460 
    461   // Return the current source position.
          // Computed as the buffer position, minus the look-ahead character, plus
          // the position_ offset.
    462   int source_pos() {
    463     return source_->pos() - kCharacterLookaheadBufferSize + position_;
    464   }
    465 
    466   // Decodes a unicode escape-sequence which is part of an identifier.
    467   // If the escape sequence cannot be decoded the result is kBadRune.
    468   uc32 ScanIdentifierUnicodeEscape();
    469 };
    470 
    471 } }  // namespace v8::internal
    472 
    473 #endif  // V8_SCANNER_H_
    474