Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #ifndef V8_SCANNER_BASE_H_
     31 #define V8_SCANNER_BASE_H_
     32 
     33 #include "globals.h"
     34 #include "checks.h"
     35 #include "allocation.h"
     36 #include "token.h"
     37 #include "unicode-inl.h"
     38 #include "char-predicates.h"
     39 #include "utils.h"
     40 #include "list-inl.h"
     41 
     42 namespace v8 {
     43 namespace internal {
     44 
     45 // Returns the value (0 .. 15) of a hexadecimal character c.
     46 // If c is not a legal hexadecimal character, returns a value < 0.
     47 inline int HexValue(uc32 c) {
     48   c -= '0';
     49   if (static_cast<unsigned>(c) <= 9) return c;
     50   c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
     51   if (static_cast<unsigned>(c) <= 5) return c + 10;
     52   return -1;
     53 }
     54 
     55 
     56 // ---------------------------------------------------------------------
     57 // Buffered stream of characters, using an internal UC16 buffer.
     58 
     59 class UC16CharacterStream {
     60  public:
     61   UC16CharacterStream() : pos_(0) { }
     62   virtual ~UC16CharacterStream() { }
     63 
     64   // Returns and advances past the next UC16 character in the input
     65   // stream. If there are no more characters, it returns a negative
     66   // value.
     67   inline uc32 Advance() {
     68     if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
     69       pos_++;
     70       return static_cast<uc32>(*(buffer_cursor_++));
     71     }
     72     // Note: currently the following increment is necessary to avoid a
     73     // parser problem! The scanner treats the final kEndOfInput as
     74     // a character with a position, and does math relative to that
     75     // position.
     76     pos_++;
     77 
     78     return kEndOfInput;
     79   }
     80 
     81   // Return the current position in the character stream.
     82   // Starts at zero.
     83   inline unsigned pos() const { return pos_; }
     84 
     85   // Skips forward past the next character_count UC16 characters
     86   // in the input, or until the end of input if that comes sooner.
     87   // Returns the number of characters actually skipped. If less
     88   // than character_count,
     89   inline unsigned SeekForward(unsigned character_count) {
     90     unsigned buffered_chars =
     91         static_cast<unsigned>(buffer_end_ - buffer_cursor_);
     92     if (character_count <= buffered_chars) {
     93       buffer_cursor_ += character_count;
     94       pos_ += character_count;
     95       return character_count;
     96     }
     97     return SlowSeekForward(character_count);
     98   }
     99 
    100   // Pushes back the most recently read UC16 character (or negative
    101   // value if at end of input), i.e., the value returned by the most recent
    102   // call to Advance.
    103   // Must not be used right after calling SeekForward.
    104   virtual void PushBack(int32_t character) = 0;
    105 
    106  protected:
    107   static const uc32 kEndOfInput = -1;
    108 
    109   // Ensures that the buffer_cursor_ points to the character at
    110   // position pos_ of the input, if possible. If the position
    111   // is at or after the end of the input, return false. If there
    112   // are more characters available, return true.
    113   virtual bool ReadBlock() = 0;
    114   virtual unsigned SlowSeekForward(unsigned character_count) = 0;
    115 
    116   const uc16* buffer_cursor_;
    117   const uc16* buffer_end_;
    118   unsigned pos_;
    119 };
    120 
    121 
    122 class UnicodeCache {
    123 // ---------------------------------------------------------------------
    124 // Caching predicates used by scanners.
    125  public:
    126   UnicodeCache() {}
    127   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
    128 
    129   StaticResource<Utf8Decoder>* utf8_decoder() {
    130     return &utf8_decoder_;
    131   }
    132 
    133   bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
    134   bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
    135   bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
    136   bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
    137 
    138  private:
    139 
    140   unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
    141   unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
    142   unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
    143   unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
    144   StaticResource<Utf8Decoder> utf8_decoder_;
    145 
    146   DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
    147 };
    148 
    149 
    150 // ----------------------------------------------------------------------------
    151 // LiteralBuffer -  Collector of chars of literals.
    152 
    153 class LiteralBuffer {
    154  public:
    155   LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
    156 
    157   ~LiteralBuffer() {
    158     if (backing_store_.length() > 0) {
    159       backing_store_.Dispose();
    160     }
    161   }
    162 
    163   inline void AddChar(uc16 character) {
    164     if (position_ >= backing_store_.length()) ExpandBuffer();
    165     if (is_ascii_) {
    166       if (character < kMaxAsciiCharCodeU) {
    167         backing_store_[position_] = static_cast<byte>(character);
    168         position_ += kASCIISize;
    169         return;
    170       }
    171       ConvertToUC16();
    172     }
    173     *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
    174     position_ += kUC16Size;
    175   }
    176 
    177   bool is_ascii() { return is_ascii_; }
    178 
    179   Vector<const uc16> uc16_literal() {
    180     ASSERT(!is_ascii_);
    181     ASSERT((position_ & 0x1) == 0);
    182     return Vector<const uc16>(
    183         reinterpret_cast<const uc16*>(backing_store_.start()),
    184         position_ >> 1);
    185   }
    186 
    187   Vector<const char> ascii_literal() {
    188     ASSERT(is_ascii_);
    189     return Vector<const char>(
    190         reinterpret_cast<const char*>(backing_store_.start()),
    191         position_);
    192   }
    193 
    194   int length() {
    195     return is_ascii_ ? position_ : (position_ >> 1);
    196   }
    197 
    198   void Reset() {
    199     position_ = 0;
    200     is_ascii_ = true;
    201   }
    202  private:
    203   static const int kInitialCapacity = 16;
    204   static const int kGrowthFactory = 4;
    205   static const int kMinConversionSlack = 256;
    206   static const int kMaxGrowth = 1 * MB;
    207   inline int NewCapacity(int min_capacity) {
    208     int capacity = Max(min_capacity, backing_store_.length());
    209     int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
    210     return new_capacity;
    211   }
    212 
    213   void ExpandBuffer() {
    214     Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
    215     memcpy(new_store.start(), backing_store_.start(), position_);
    216     backing_store_.Dispose();
    217     backing_store_ = new_store;
    218   }
    219 
    220   void ConvertToUC16() {
    221     ASSERT(is_ascii_);
    222     Vector<byte> new_store;
    223     int new_content_size = position_ * kUC16Size;
    224     if (new_content_size >= backing_store_.length()) {
    225       // Ensure room for all currently read characters as UC16 as well
    226       // as the character about to be stored.
    227       new_store = Vector<byte>::New(NewCapacity(new_content_size));
    228     } else {
    229       new_store = backing_store_;
    230     }
    231     char* src = reinterpret_cast<char*>(backing_store_.start());
    232     uc16* dst = reinterpret_cast<uc16*>(new_store.start());
    233     for (int i = position_ - 1; i >= 0; i--) {
    234       dst[i] = src[i];
    235     }
    236     if (new_store.start() != backing_store_.start()) {
    237       backing_store_.Dispose();
    238       backing_store_ = new_store;
    239     }
    240     position_ = new_content_size;
    241     is_ascii_ = false;
    242   }
    243 
    244   bool is_ascii_;
    245   int position_;
    246   Vector<byte> backing_store_;
    247 
    248   DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
    249 };
    250 
    251 
    252 // ----------------------------------------------------------------------------
    253 // Scanner base-class.
    254 
    255 // Generic functionality used by both JSON and JavaScript scanners.
    256 class Scanner {
    257  public:
    258   // -1 is outside of the range of any real source code.
    259   static const int kNoOctalLocation = -1;
    260 
    261   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
    262 
    263   class LiteralScope {
    264    public:
    265     explicit LiteralScope(Scanner* self);
    266     ~LiteralScope();
    267     void Complete();
    268 
    269    private:
    270     Scanner* scanner_;
    271     bool complete_;
    272   };
    273 
    274   explicit Scanner(UnicodeCache* scanner_contants);
    275 
    276   // Returns the current token again.
    277   Token::Value current_token() { return current_.token; }
    278 
    279   // One token look-ahead (past the token returned by Next()).
    280   Token::Value peek() const { return next_.token; }
    281 
    282   struct Location {
    283     Location(int b, int e) : beg_pos(b), end_pos(e) { }
    284     Location() : beg_pos(0), end_pos(0) { }
    285 
    286     bool IsValid() const {
    287       return beg_pos >= 0 && end_pos >= beg_pos;
    288     }
    289 
    290     int beg_pos;
    291     int end_pos;
    292   };
    293 
    294   static Location NoLocation() {
    295     return Location(-1, -1);
    296   }
    297 
    298   // Returns the location information for the current token
    299   // (the token returned by Next()).
    300   Location location() const { return current_.location; }
    301   Location peek_location() const { return next_.location; }
    302 
    303   // Returns the location of the last seen octal literal
    304   int octal_position() const { return octal_pos_; }
    305   void clear_octal_position() { octal_pos_ = -1; }
    306 
    307   // Returns the literal string, if any, for the current token (the
    308   // token returned by Next()). The string is 0-terminated and in
    309   // UTF-8 format; they may contain 0-characters. Literal strings are
    310   // collected for identifiers, strings, and numbers.
    311   // These functions only give the correct result if the literal
    312   // was scanned between calls to StartLiteral() and TerminateLiteral().
    313   bool is_literal_ascii() {
    314     ASSERT_NOT_NULL(current_.literal_chars);
    315     return current_.literal_chars->is_ascii();
    316   }
    317   Vector<const char> literal_ascii_string() {
    318     ASSERT_NOT_NULL(current_.literal_chars);
    319     return current_.literal_chars->ascii_literal();
    320   }
    321   Vector<const uc16> literal_uc16_string() {
    322     ASSERT_NOT_NULL(current_.literal_chars);
    323     return current_.literal_chars->uc16_literal();
    324   }
    325   int literal_length() const {
    326     ASSERT_NOT_NULL(current_.literal_chars);
    327     return current_.literal_chars->length();
    328   }
    329 
    330   // Returns the literal string for the next token (the token that
    331   // would be returned if Next() were called).
    332   bool is_next_literal_ascii() {
    333     ASSERT_NOT_NULL(next_.literal_chars);
    334     return next_.literal_chars->is_ascii();
    335   }
    336   Vector<const char> next_literal_ascii_string() {
    337     ASSERT_NOT_NULL(next_.literal_chars);
    338     return next_.literal_chars->ascii_literal();
    339   }
    340   Vector<const uc16> next_literal_uc16_string() {
    341     ASSERT_NOT_NULL(next_.literal_chars);
    342     return next_.literal_chars->uc16_literal();
    343   }
    344   int next_literal_length() const {
    345     ASSERT_NOT_NULL(next_.literal_chars);
    346     return next_.literal_chars->length();
    347   }
    348 
    349   static const int kCharacterLookaheadBufferSize = 1;
    350 
    351  protected:
    352   // The current and look-ahead token.
    353   struct TokenDesc {
    354     Token::Value token;
    355     Location location;
    356     LiteralBuffer* literal_chars;
    357   };
    358 
    359   // Call this after setting source_ to the input.
    360   void Init() {
    361     // Set c0_ (one character ahead)
    362     ASSERT(kCharacterLookaheadBufferSize == 1);
    363     Advance();
    364     // Initialize current_ to not refer to a literal.
    365     current_.literal_chars = NULL;
    366   }
    367 
    368   // Literal buffer support
    369   inline void StartLiteral() {
    370     LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
    371             &literal_buffer2_ : &literal_buffer1_;
    372     free_buffer->Reset();
    373     next_.literal_chars = free_buffer;
    374   }
    375 
    376   inline void AddLiteralChar(uc32 c) {
    377     ASSERT_NOT_NULL(next_.literal_chars);
    378     next_.literal_chars->AddChar(c);
    379   }
    380 
    381   // Complete scanning of a literal.
    382   inline void TerminateLiteral() {
    383     // Does nothing in the current implementation.
    384   }
    385 
    386   // Stops scanning of a literal and drop the collected characters,
    387   // e.g., due to an encountered error.
    388   inline void DropLiteral() {
    389     next_.literal_chars = NULL;
    390   }
    391 
    392   inline void AddLiteralCharAdvance() {
    393     AddLiteralChar(c0_);
    394     Advance();
    395   }
    396 
    397   // Low-level scanning support.
    398   void Advance() { c0_ = source_->Advance(); }
    399   void PushBack(uc32 ch) {
    400     source_->PushBack(c0_);
    401     c0_ = ch;
    402   }
    403 
    404   inline Token::Value Select(Token::Value tok) {
    405     Advance();
    406     return tok;
    407   }
    408 
    409   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    410     Advance();
    411     if (c0_ == next) {
    412       Advance();
    413       return then;
    414     } else {
    415       return else_;
    416     }
    417   }
    418 
    419   uc32 ScanHexEscape(uc32 c, int length);
    420 
    421   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
    422   uc32 ScanOctalEscape(uc32 c, int length);
    423 
    424   // Return the current source position.
    425   int source_pos() {
    426     return source_->pos() - kCharacterLookaheadBufferSize;
    427   }
    428 
    429   UnicodeCache* unicode_cache_;
    430 
    431   // Buffers collecting literal strings, numbers, etc.
    432   LiteralBuffer literal_buffer1_;
    433   LiteralBuffer literal_buffer2_;
    434 
    435   TokenDesc current_;  // desc for current token (as returned by Next())
    436   TokenDesc next_;     // desc for next token (one token look-ahead)
    437 
    438   // Input stream. Must be initialized to an UC16CharacterStream.
    439   UC16CharacterStream* source_;
    440 
    441   // Start position of the octal literal last scanned.
    442   int octal_pos_;
    443 
    444   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
    445   uc32 c0_;
    446 };
    447 
    448 // ----------------------------------------------------------------------------
    449 // JavaScriptScanner - base logic for JavaScript scanning.
    450 
    451 class JavaScriptScanner : public Scanner {
    452  public:
    453   // A LiteralScope that disables recording of some types of JavaScript
    454   // literals. If the scanner is configured to not record the specific
    455   // type of literal, the scope will not call StartLiteral.
    456   class LiteralScope {
    457    public:
    458     explicit LiteralScope(JavaScriptScanner* self)
    459         : scanner_(self), complete_(false) {
    460       scanner_->StartLiteral();
    461     }
    462      ~LiteralScope() {
    463        if (!complete_) scanner_->DropLiteral();
    464      }
    465     void Complete() {
    466       scanner_->TerminateLiteral();
    467       complete_ = true;
    468     }
    469 
    470    private:
    471     JavaScriptScanner* scanner_;
    472     bool complete_;
    473   };
    474 
    475   explicit JavaScriptScanner(UnicodeCache* scanner_contants);
    476 
    477   // Returns the next token.
    478   Token::Value Next();
    479 
    480   // Returns true if there was a line terminator before the peek'ed token.
    481   bool has_line_terminator_before_next() const {
    482     return has_line_terminator_before_next_;
    483   }
    484 
    485   // Scans the input as a regular expression pattern, previous
    486   // character(s) must be /(=). Returns true if a pattern is scanned.
    487   bool ScanRegExpPattern(bool seen_equal);
    488   // Returns true if regexp flags are scanned (always since flags can
    489   // be empty).
    490   bool ScanRegExpFlags();
    491 
    492   // Tells whether the buffer contains an identifier (no escapes).
    493   // Used for checking if a property name is an identifier.
    494   static bool IsIdentifier(unibrow::CharacterStream* buffer);
    495 
    496   // Seek forward to the given position.  This operation does not
    497   // work in general, for instance when there are pushed back
    498   // characters, but works for seeking forward until simple delimiter
    499   // tokens, which is what it is used for.
    500   void SeekForward(int pos);
    501 
    502  protected:
    503   bool SkipWhiteSpace();
    504   Token::Value SkipSingleLineComment();
    505   Token::Value SkipMultiLineComment();
    506 
    507   // Scans a single JavaScript token.
    508   void Scan();
    509 
    510   void ScanDecimalDigits();
    511   Token::Value ScanNumber(bool seen_period);
    512   Token::Value ScanIdentifierOrKeyword();
    513   Token::Value ScanIdentifierSuffix(LiteralScope* literal);
    514 
    515   void ScanEscape();
    516   Token::Value ScanString();
    517 
    518   // Scans a possible HTML comment -- begins with '<!'.
    519   Token::Value ScanHtmlComment();
    520 
    521   // Decodes a unicode escape-sequence which is part of an identifier.
    522   // If the escape sequence cannot be decoded the result is kBadChar.
    523   uc32 ScanIdentifierUnicodeEscape();
    524 
    525   bool has_line_terminator_before_next_;
    526 };
    527 
    528 
    529 // ----------------------------------------------------------------------------
    530 // Keyword matching state machine.
    531 
    532 class KeywordMatcher {
    533 //  Incrementally recognize keywords.
    534 //
    535 //  Recognized keywords:
    536 //      break case catch const* continue debugger* default delete do else
    537 //      finally false for function if in instanceof native* new null
    538 //      return switch this throw true try typeof var void while with
    539 //
    540 //  *: Actually "future reserved keywords". These are the only ones we
    541 //     recognize, the remaining are allowed as identifiers.
    542 //     In ES5 strict mode, we should disallow all reserved keywords.
    543  public:
    544   KeywordMatcher()
    545       : state_(INITIAL),
    546         token_(Token::IDENTIFIER),
    547         keyword_(NULL),
    548         counter_(0),
    549         keyword_token_(Token::ILLEGAL) {}
    550 
    551   Token::Value token() { return token_; }
    552 
    553   inline bool AddChar(unibrow::uchar input) {
    554     if (state_ != UNMATCHABLE) {
    555       Step(input);
    556     }
    557     return state_ != UNMATCHABLE;
    558   }
    559 
    560   void Fail() {
    561     token_ = Token::IDENTIFIER;
    562     state_ = UNMATCHABLE;
    563   }
    564 
    565  private:
    566   enum State {
    567     UNMATCHABLE,
    568     INITIAL,
    569     KEYWORD_PREFIX,
    570     KEYWORD_MATCHED,
    571     C,
    572     CA,
    573     CO,
    574     CON,
    575     D,
    576     DE,
    577     E,
    578     EX,
    579     F,
    580     I,
    581     IM,
    582     IMP,
    583     IN,
    584     N,
    585     P,
    586     PR,
    587     S,
    588     T,
    589     TH,
    590     TR,
    591     V,
    592     W
    593   };
    594 
    595   struct FirstState {
    596     const char* keyword;
    597     State state;
    598     Token::Value token;
    599   };
    600 
    601   // Range of possible first characters of a keyword.
    602   static const unsigned int kFirstCharRangeMin = 'b';
    603   static const unsigned int kFirstCharRangeMax = 'y';
    604   static const unsigned int kFirstCharRangeLength =
    605       kFirstCharRangeMax - kFirstCharRangeMin + 1;
    606   // State map for first keyword character range.
    607   static FirstState first_states_[kFirstCharRangeLength];
    608 
    609   // If input equals keyword's character at position, continue matching keyword
    610   // from that position.
    611   inline bool MatchKeywordStart(unibrow::uchar input,
    612                                 const char* keyword,
    613                                 int position,
    614                                 Token::Value token_if_match) {
    615     if (input != static_cast<unibrow::uchar>(keyword[position])) {
    616       return false;
    617     }
    618     state_ = KEYWORD_PREFIX;
    619     this->keyword_ = keyword;
    620     this->counter_ = position + 1;
    621     this->keyword_token_ = token_if_match;
    622     return true;
    623   }
    624 
    625   // If input equals match character, transition to new state and return true.
    626   inline bool MatchState(unibrow::uchar input, char match, State new_state) {
    627     if (input != static_cast<unibrow::uchar>(match)) {
    628       return false;
    629     }
    630     state_ = new_state;
    631     return true;
    632   }
    633 
    634   inline bool MatchKeyword(unibrow::uchar input,
    635                            char match,
    636                            State new_state,
    637                            Token::Value keyword_token) {
    638     if (input != static_cast<unibrow::uchar>(match)) {
    639       return false;
    640     }
    641     state_ = new_state;
    642     token_ = keyword_token;
    643     return true;
    644   }
    645 
    646   void Step(unibrow::uchar input);
    647 
    648   // Current state.
    649   State state_;
    650   // Token for currently added characters.
    651   Token::Value token_;
    652 
    653   // Matching a specific keyword string (there is only one possible valid
    654   // keyword with the current prefix).
    655   const char* keyword_;
    656   int counter_;
    657   Token::Value keyword_token_;
    658 };
    659 
    660 
    661 } }  // namespace v8::internal
    662 
    663 #endif  // V8_SCANNER_BASE_H_
    664