Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #ifndef V8_SCANNER_H_
     31 #define V8_SCANNER_H_
     32 
     33 #include "allocation.h"
     34 #include "char-predicates.h"
     35 #include "checks.h"
     36 #include "globals.h"
     37 #include "token.h"
     38 #include "unicode-inl.h"
     39 #include "utils.h"
     40 
     41 namespace v8 {
     42 namespace internal {
     43 
     44 
     45 // Returns the value (0 .. 15) of a hexadecimal character c.
     46 // If c is not a legal hexadecimal character, returns a value < 0.
     47 inline int HexValue(uc32 c) {
     48   c -= '0';
     49   if (static_cast<unsigned>(c) <= 9) return c;
     50   c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
     51   if (static_cast<unsigned>(c) <= 5) return c + 10;
     52   return -1;
     53 }
     54 
     55 
     56 // ---------------------------------------------------------------------
     57 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
     58 // A code unit is a 16 bit value representing either a 16 bit code point
     59 // or one part of a surrogate pair that make a single 21 bit code point.
     60 
     61 class Utf16CharacterStream {
     62  public:
     63   Utf16CharacterStream() : pos_(0) { }
     64   virtual ~Utf16CharacterStream() { }
     65 
     66   // Returns and advances past the next UTF-16 code unit in the input
     67   // stream. If there are no more code units, it returns a negative
     68   // value.
     69   inline uc32 Advance() {
     70     if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
     71       pos_++;
     72       return static_cast<uc32>(*(buffer_cursor_++));
     73     }
     74     // Note: currently the following increment is necessary to avoid a
     75     // parser problem! The scanner treats the final kEndOfInput as
     76     // a code unit with a position, and does math relative to that
     77     // position.
     78     pos_++;
     79 
     80     return kEndOfInput;
     81   }
     82 
     83   // Return the current position in the code unit stream.
     84   // Starts at zero.
     85   inline unsigned pos() const { return pos_; }
     86 
     87   // Skips forward past the next code_unit_count UTF-16 code units
     88   // in the input, or until the end of input if that comes sooner.
     89   // Returns the number of code units actually skipped. If less
     90   // than code_unit_count,
     91   inline unsigned SeekForward(unsigned code_unit_count) {
     92     unsigned buffered_chars =
     93         static_cast<unsigned>(buffer_end_ - buffer_cursor_);
     94     if (code_unit_count <= buffered_chars) {
     95       buffer_cursor_ += code_unit_count;
     96       pos_ += code_unit_count;
     97       return code_unit_count;
     98     }
     99     return SlowSeekForward(code_unit_count);
    100   }
    101 
    102   // Pushes back the most recently read UTF-16 code unit (or negative
    103   // value if at end of input), i.e., the value returned by the most recent
    104   // call to Advance.
    105   // Must not be used right after calling SeekForward.
    106   virtual void PushBack(int32_t code_unit) = 0;
    107 
    108  protected:
    109   static const uc32 kEndOfInput = -1;
    110 
    111   // Ensures that the buffer_cursor_ points to the code_unit at
    112   // position pos_ of the input, if possible. If the position
    113   // is at or after the end of the input, return false. If there
    114   // are more code_units available, return true.
    115   virtual bool ReadBlock() = 0;
    116   virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
    117 
    118   const uc16* buffer_cursor_;
    119   const uc16* buffer_end_;
    120   unsigned pos_;
    121 };
    122 
    123 
    124 class UnicodeCache {
    125 // ---------------------------------------------------------------------
    126 // Caching predicates used by scanners.
    127  public:
    128   UnicodeCache() {}
    129   typedef unibrow::Utf8Decoder<512> Utf8Decoder;
    130 
    131   StaticResource<Utf8Decoder>* utf8_decoder() {
    132     return &utf8_decoder_;
    133   }
    134 
    135   bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
    136   bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
    137   bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
    138   bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
    139 
    140  private:
    141   unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
    142   unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
    143   unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
    144   unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
    145   StaticResource<Utf8Decoder> utf8_decoder_;
    146 
    147   DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
    148 };
    149 
    150 
    151 // ----------------------------------------------------------------------------
    152 // LiteralBuffer -  Collector of chars of literals.
    153 
    154 class LiteralBuffer {
    155  public:
    156   LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
    157 
    158   ~LiteralBuffer() {
    159     if (backing_store_.length() > 0) {
    160       backing_store_.Dispose();
    161     }
    162   }
    163 
    164   INLINE(void AddChar(uint32_t code_unit)) {
    165     if (position_ >= backing_store_.length()) ExpandBuffer();
    166     if (is_ascii_) {
    167       if (code_unit <= unibrow::Latin1::kMaxChar) {
    168         backing_store_[position_] = static_cast<byte>(code_unit);
    169         position_ += kOneByteSize;
    170         return;
    171       }
    172       ConvertToUtf16();
    173     }
    174     ASSERT(code_unit < 0x10000u);
    175     *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
    176     position_ += kUC16Size;
    177   }
    178 
    179   bool is_ascii() { return is_ascii_; }
    180 
    181   bool is_contextual_keyword(Vector<const char> keyword) {
    182     return is_ascii() && keyword.length() == position_ &&
    183         (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
    184   }
    185 
    186   Vector<const uc16> utf16_literal() {
    187     ASSERT(!is_ascii_);
    188     ASSERT((position_ & 0x1) == 0);
    189     return Vector<const uc16>(
    190         reinterpret_cast<const uc16*>(backing_store_.start()),
    191         position_ >> 1);
    192   }
    193 
    194   Vector<const char> ascii_literal() {
    195     ASSERT(is_ascii_);
    196     return Vector<const char>(
    197         reinterpret_cast<const char*>(backing_store_.start()),
    198         position_);
    199   }
    200 
    201   int length() {
    202     return is_ascii_ ? position_ : (position_ >> 1);
    203   }
    204 
    205   void Reset() {
    206     position_ = 0;
    207     is_ascii_ = true;
    208   }
    209 
    210  private:
    211   static const int kInitialCapacity = 16;
    212   static const int kGrowthFactory = 4;
    213   static const int kMinConversionSlack = 256;
    214   static const int kMaxGrowth = 1 * MB;
    215   inline int NewCapacity(int min_capacity) {
    216     int capacity = Max(min_capacity, backing_store_.length());
    217     int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
    218     return new_capacity;
    219   }
    220 
    221   void ExpandBuffer() {
    222     Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
    223     OS::MemCopy(new_store.start(), backing_store_.start(), position_);
    224     backing_store_.Dispose();
    225     backing_store_ = new_store;
    226   }
    227 
    228   void ConvertToUtf16() {
    229     ASSERT(is_ascii_);
    230     Vector<byte> new_store;
    231     int new_content_size = position_ * kUC16Size;
    232     if (new_content_size >= backing_store_.length()) {
    233       // Ensure room for all currently read code units as UC16 as well
    234       // as the code unit about to be stored.
    235       new_store = Vector<byte>::New(NewCapacity(new_content_size));
    236     } else {
    237       new_store = backing_store_;
    238     }
    239     uint8_t* src = backing_store_.start();
    240     uc16* dst = reinterpret_cast<uc16*>(new_store.start());
    241     for (int i = position_ - 1; i >= 0; i--) {
    242       dst[i] = src[i];
    243     }
    244     if (new_store.start() != backing_store_.start()) {
    245       backing_store_.Dispose();
    246       backing_store_ = new_store;
    247     }
    248     position_ = new_content_size;
    249     is_ascii_ = false;
    250   }
    251 
    252   bool is_ascii_;
    253   int position_;
    254   Vector<byte> backing_store_;
    255 
    256   DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
    257 };
    258 
    259 
    260 // ----------------------------------------------------------------------------
    261 // JavaScript Scanner.
    262 
    263 class Scanner {
    264  public:
    265   // Scoped helper for literal recording. Automatically drops the literal
    266   // if aborting the scanning before it's complete.
    267   class LiteralScope {
    268    public:
    269     explicit LiteralScope(Scanner* self)
    270         : scanner_(self), complete_(false) {
    271       scanner_->StartLiteral();
    272     }
    273      ~LiteralScope() {
    274        if (!complete_) scanner_->DropLiteral();
    275      }
    276     void Complete() {
    277       scanner_->TerminateLiteral();
    278       complete_ = true;
    279     }
    280 
    281    private:
    282     Scanner* scanner_;
    283     bool complete_;
    284   };
    285 
    286   // Representation of an interval of source positions.
    287   struct Location {
    288     Location(int b, int e) : beg_pos(b), end_pos(e) { }
    289     Location() : beg_pos(0), end_pos(0) { }
    290 
    291     bool IsValid() const {
    292       return beg_pos >= 0 && end_pos >= beg_pos;
    293     }
    294 
    295     static Location invalid() { return Location(-1, -1); }
    296 
    297     int beg_pos;
    298     int end_pos;
    299   };
    300 
    301   // -1 is outside of the range of any real source code.
    302   static const int kNoOctalLocation = -1;
    303 
    304   explicit Scanner(UnicodeCache* scanner_contants);
    305 
    306   void Initialize(Utf16CharacterStream* source);
    307 
    308   // Returns the next token and advances input.
    309   Token::Value Next();
    310   // Returns the current token again.
    311   Token::Value current_token() { return current_.token; }
    312   // Returns the location information for the current token
    313   // (the token last returned by Next()).
    314   Location location() const { return current_.location; }
    315   // Returns the literal string, if any, for the current token (the
    316   // token last returned by Next()). The string is 0-terminated.
    317   // Literal strings are collected for identifiers, strings, and
    318   // numbers.
    319   // These functions only give the correct result if the literal
    320   // was scanned between calls to StartLiteral() and TerminateLiteral().
    321   Vector<const char> literal_ascii_string() {
    322     ASSERT_NOT_NULL(current_.literal_chars);
    323     return current_.literal_chars->ascii_literal();
    324   }
    325   Vector<const uc16> literal_utf16_string() {
    326     ASSERT_NOT_NULL(current_.literal_chars);
    327     return current_.literal_chars->utf16_literal();
    328   }
    329   bool is_literal_ascii() {
    330     ASSERT_NOT_NULL(current_.literal_chars);
    331     return current_.literal_chars->is_ascii();
    332   }
    333   bool is_literal_contextual_keyword(Vector<const char> keyword) {
    334     ASSERT_NOT_NULL(current_.literal_chars);
    335     return current_.literal_chars->is_contextual_keyword(keyword);
    336   }
    337   int literal_length() const {
    338     ASSERT_NOT_NULL(current_.literal_chars);
    339     return current_.literal_chars->length();
    340   }
    341 
    342   bool literal_contains_escapes() const {
    343     Location location = current_.location;
    344     int source_length = (location.end_pos - location.beg_pos);
    345     if (current_.token == Token::STRING) {
    346       // Subtract delimiters.
    347       source_length -= 2;
    348     }
    349     return current_.literal_chars->length() != source_length;
    350   }
    351 
    352   // Similar functions for the upcoming token.
    353 
    354   // One token look-ahead (past the token returned by Next()).
    355   Token::Value peek() const { return next_.token; }
    356 
    357   Location peek_location() const { return next_.location; }
    358 
    359   // Returns the literal string for the next token (the token that
    360   // would be returned if Next() were called).
    361   Vector<const char> next_literal_ascii_string() {
    362     ASSERT_NOT_NULL(next_.literal_chars);
    363     return next_.literal_chars->ascii_literal();
    364   }
    365   Vector<const uc16> next_literal_utf16_string() {
    366     ASSERT_NOT_NULL(next_.literal_chars);
    367     return next_.literal_chars->utf16_literal();
    368   }
    369   bool is_next_literal_ascii() {
    370     ASSERT_NOT_NULL(next_.literal_chars);
    371     return next_.literal_chars->is_ascii();
    372   }
    373   bool is_next_contextual_keyword(Vector<const char> keyword) {
    374     ASSERT_NOT_NULL(next_.literal_chars);
    375     return next_.literal_chars->is_contextual_keyword(keyword);
    376   }
    377   int next_literal_length() const {
    378     ASSERT_NOT_NULL(next_.literal_chars);
    379     return next_.literal_chars->length();
    380   }
    381 
    382   UnicodeCache* unicode_cache() { return unicode_cache_; }
    383 
    384   static const int kCharacterLookaheadBufferSize = 1;
    385 
    386   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
    387   uc32 ScanOctalEscape(uc32 c, int length);
    388 
    389   // Returns the location of the last seen octal literal.
    390   Location octal_position() const { return octal_pos_; }
    391   void clear_octal_position() { octal_pos_ = Location::invalid(); }
    392 
    393   // Seek forward to the given position.  This operation does not
    394   // work in general, for instance when there are pushed back
    395   // characters, but works for seeking forward until simple delimiter
    396   // tokens, which is what it is used for.
    397   void SeekForward(int pos);
    398 
    399   bool HarmonyScoping() const {
    400     return harmony_scoping_;
    401   }
    402   void SetHarmonyScoping(bool scoping) {
    403     harmony_scoping_ = scoping;
    404   }
    405   bool HarmonyModules() const {
    406     return harmony_modules_;
    407   }
    408   void SetHarmonyModules(bool modules) {
    409     harmony_modules_ = modules;
    410   }
    411   bool HarmonyNumericLiterals() const {
    412     return harmony_numeric_literals_;
    413   }
    414   void SetHarmonyNumericLiterals(bool numeric_literals) {
    415     harmony_numeric_literals_ = numeric_literals;
    416   }
    417 
    418   // Returns true if there was a line terminator before the peek'ed token,
    419   // possibly inside a multi-line comment.
    420   bool HasAnyLineTerminatorBeforeNext() const {
    421     return has_line_terminator_before_next_ ||
    422            has_multiline_comment_before_next_;
    423   }
    424 
    425   // Scans the input as a regular expression pattern, previous
    426   // character(s) must be /(=). Returns true if a pattern is scanned.
    427   bool ScanRegExpPattern(bool seen_equal);
    428   // Returns true if regexp flags are scanned (always since flags can
    429   // be empty).
    430   bool ScanRegExpFlags();
    431 
    432  private:
    433   // The current and look-ahead token.
    434   struct TokenDesc {
    435     Token::Value token;
    436     Location location;
    437     LiteralBuffer* literal_chars;
    438   };
    439 
    440   // Call this after setting source_ to the input.
    441   void Init() {
    442     // Set c0_ (one character ahead)
    443     STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
    444     Advance();
    445     // Initialize current_ to not refer to a literal.
    446     current_.literal_chars = NULL;
    447   }
    448 
    449   // Literal buffer support
    450   inline void StartLiteral() {
    451     LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
    452             &literal_buffer2_ : &literal_buffer1_;
    453     free_buffer->Reset();
    454     next_.literal_chars = free_buffer;
    455   }
    456 
    457   INLINE(void AddLiteralChar(uc32 c)) {
    458     ASSERT_NOT_NULL(next_.literal_chars);
    459     next_.literal_chars->AddChar(c);
    460   }
    461 
    462   // Complete scanning of a literal.
    463   inline void TerminateLiteral() {
    464     // Does nothing in the current implementation.
    465   }
    466 
    467   // Stops scanning of a literal and drop the collected characters,
    468   // e.g., due to an encountered error.
    469   inline void DropLiteral() {
    470     next_.literal_chars = NULL;
    471   }
    472 
    473   inline void AddLiteralCharAdvance() {
    474     AddLiteralChar(c0_);
    475     Advance();
    476   }
    477 
    478   // Low-level scanning support.
    479   void Advance() { c0_ = source_->Advance(); }
    480   void PushBack(uc32 ch) {
    481     source_->PushBack(c0_);
    482     c0_ = ch;
    483   }
    484 
    485   inline Token::Value Select(Token::Value tok) {
    486     Advance();
    487     return tok;
    488   }
    489 
    490   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    491     Advance();
    492     if (c0_ == next) {
    493       Advance();
    494       return then;
    495     } else {
    496       return else_;
    497     }
    498   }
    499 
    500   uc32 ScanHexNumber(int expected_length);
    501 
    502   // Scans a single JavaScript token.
    503   void Scan();
    504 
    505   bool SkipWhiteSpace();
    506   Token::Value SkipSingleLineComment();
    507   Token::Value SkipMultiLineComment();
    508   // Scans a possible HTML comment -- begins with '<!'.
    509   Token::Value ScanHtmlComment();
    510 
    511   void ScanDecimalDigits();
    512   Token::Value ScanNumber(bool seen_period);
    513   Token::Value ScanIdentifierOrKeyword();
    514   Token::Value ScanIdentifierSuffix(LiteralScope* literal);
    515 
    516   Token::Value ScanString();
    517 
    518   // Scans an escape-sequence which is part of a string and adds the
    519   // decoded character to the current literal. Returns true if a pattern
    520   // is scanned.
    521   bool ScanEscape();
    522   // Decodes a Unicode escape-sequence which is part of an identifier.
    523   // If the escape sequence cannot be decoded the result is kBadChar.
    524   uc32 ScanIdentifierUnicodeEscape();
    525   // Scans a Unicode escape-sequence and adds its characters,
    526   // uninterpreted, to the current literal. Used for parsing RegExp
    527   // flags.
    528   bool ScanLiteralUnicodeEscape();
    529 
    530   // Return the current source position.
    531   int source_pos() {
    532     return source_->pos() - kCharacterLookaheadBufferSize;
    533   }
    534 
    535   UnicodeCache* unicode_cache_;
    536 
    537   // Buffers collecting literal strings, numbers, etc.
    538   LiteralBuffer literal_buffer1_;
    539   LiteralBuffer literal_buffer2_;
    540 
    541   TokenDesc current_;  // desc for current token (as returned by Next())
    542   TokenDesc next_;     // desc for next token (one token look-ahead)
    543 
    544   // Input stream. Must be initialized to an Utf16CharacterStream.
    545   Utf16CharacterStream* source_;
    546 
    547 
    548   // Start position of the octal literal last scanned.
    549   Location octal_pos_;
    550 
    551   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
    552   uc32 c0_;
    553 
    554   // Whether there is a line terminator whitespace character after
    555   // the current token, and  before the next. Does not count newlines
    556   // inside multiline comments.
    557   bool has_line_terminator_before_next_;
    558   // Whether there is a multi-line comment that contains a
    559   // line-terminator after the current token, and before the next.
    560   bool has_multiline_comment_before_next_;
    561   // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.
    562   bool harmony_scoping_;
    563   // Whether we scan 'module', 'import', 'export' as keywords.
    564   bool harmony_modules_;
    565   // Whether we scan 0o777 and 0b111 as numbers.
    566   bool harmony_numeric_literals_;
    567 };
    568 
    569 } }  // namespace v8::internal
    570 
    571 #endif  // V8_SCANNER_H_
    572