Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 // Features shared by parsing and pre-parsing scanners.
     29 
     30 #ifndef V8_SCANNER_H_
     31 #define V8_SCANNER_H_
     32 
     33 #include "allocation.h"
     34 #include "char-predicates.h"
     35 #include "checks.h"
     36 #include "globals.h"
     37 #include "token.h"
     38 #include "unicode-inl.h"
     39 #include "utils.h"
     40 
     41 namespace v8 {
     42 namespace internal {
     43 
     44 
     45 // General collection of (multi-)bit-flags that can be passed to scanners and
     46 // parsers to signify their (initial) mode of operation.
     47 enum ParsingFlags {
     48   kNoParsingFlags = 0,
     49   // Embed LanguageMode values in parsing flags, i.e., equivalent to:
     50   // CLASSIC_MODE = 0,
     51   // STRICT_MODE,
     52   // EXTENDED_MODE,
     53   kLanguageModeMask = 0x03,
     54   kAllowLazy = 0x04,
     55   kAllowNativesSyntax = 0x08,
     56   kAllowModules = 0x10
     57 };
     58 
     59 STATIC_ASSERT((kLanguageModeMask & CLASSIC_MODE) == CLASSIC_MODE);
     60 STATIC_ASSERT((kLanguageModeMask & STRICT_MODE) == STRICT_MODE);
     61 STATIC_ASSERT((kLanguageModeMask & EXTENDED_MODE) == EXTENDED_MODE);
     62 
     63 
     64 // Returns the value (0 .. 15) of a hexadecimal character c.
     65 // If c is not a legal hexadecimal character, returns a value < 0.
     66 inline int HexValue(uc32 c) {
     67   c -= '0';
     68   if (static_cast<unsigned>(c) <= 9) return c;
     69   c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
     70   if (static_cast<unsigned>(c) <= 5) return c + 10;
     71   return -1;
     72 }
     73 
     74 
     75 // ---------------------------------------------------------------------
     76 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
     77 // A code unit is a 16 bit value representing either a 16 bit code point
     78 // or one part of a surrogate pair that make a single 21 bit code point.
     79 
     80 class Utf16CharacterStream {
     81  public:
     82   Utf16CharacterStream() : pos_(0) { }
     83   virtual ~Utf16CharacterStream() { }
     84 
     85   // Returns and advances past the next UTF-16 code unit in the input
     86   // stream. If there are no more code units, it returns a negative
     87   // value.
     88   inline uc32 Advance() {
     89     if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
     90       pos_++;
     91       return static_cast<uc32>(*(buffer_cursor_++));
     92     }
     93     // Note: currently the following increment is necessary to avoid a
     94     // parser problem! The scanner treats the final kEndOfInput as
     95     // a code unit with a position, and does math relative to that
     96     // position.
     97     pos_++;
     98 
     99     return kEndOfInput;
    100   }
    101 
    102   // Return the current position in the code unit stream.
    103   // Starts at zero.
    104   inline unsigned pos() const { return pos_; }
    105 
    106   // Skips forward past the next code_unit_count UTF-16 code units
    107   // in the input, or until the end of input if that comes sooner.
    108   // Returns the number of code units actually skipped. If less
    109   // than code_unit_count,
    110   inline unsigned SeekForward(unsigned code_unit_count) {
    111     unsigned buffered_chars =
    112         static_cast<unsigned>(buffer_end_ - buffer_cursor_);
    113     if (code_unit_count <= buffered_chars) {
    114       buffer_cursor_ += code_unit_count;
    115       pos_ += code_unit_count;
    116       return code_unit_count;
    117     }
    118     return SlowSeekForward(code_unit_count);
    119   }
    120 
    121   // Pushes back the most recently read UTF-16 code unit (or negative
    122   // value if at end of input), i.e., the value returned by the most recent
    123   // call to Advance.
    124   // Must not be used right after calling SeekForward.
    125   virtual void PushBack(int32_t code_unit) = 0;
    126 
    127  protected:
    128   static const uc32 kEndOfInput = -1;
    129 
    130   // Ensures that the buffer_cursor_ points to the code_unit at
    131   // position pos_ of the input, if possible. If the position
    132   // is at or after the end of the input, return false. If there
    133   // are more code_units available, return true.
    134   virtual bool ReadBlock() = 0;
    135   virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
    136 
    137   const uc16* buffer_cursor_;
    138   const uc16* buffer_end_;
    139   unsigned pos_;
    140 };
    141 
    142 
    143 class UnicodeCache {
    144 // ---------------------------------------------------------------------
    145 // Caching predicates used by scanners.
    146  public:
    147   UnicodeCache() {}
    148   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
    149 
    150   StaticResource<Utf8Decoder>* utf8_decoder() {
    151     return &utf8_decoder_;
    152   }
    153 
    154   bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
    155   bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
    156   bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
    157   bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
    158 
    159  private:
    160   unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
    161   unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
    162   unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
    163   unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
    164   StaticResource<Utf8Decoder> utf8_decoder_;
    165 
    166   DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
    167 };
    168 
    169 
    170 // ----------------------------------------------------------------------------
    171 // LiteralBuffer -  Collector of chars of literals.
    172 
    173 class LiteralBuffer {
    174  public:
    175   LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
    176 
    177   ~LiteralBuffer() {
    178     if (backing_store_.length() > 0) {
    179       backing_store_.Dispose();
    180     }
    181   }
    182 
    183   INLINE(void AddChar(uint32_t code_unit)) {
    184     if (position_ >= backing_store_.length()) ExpandBuffer();
    185     if (is_ascii_) {
    186       if (code_unit < kMaxAsciiCharCodeU) {
    187         backing_store_[position_] = static_cast<byte>(code_unit);
    188         position_ += kASCIISize;
    189         return;
    190       }
    191       ConvertToUtf16();
    192     }
    193     ASSERT(code_unit < 0x10000u);
    194     *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
    195     position_ += kUC16Size;
    196   }
    197 
    198   bool is_ascii() { return is_ascii_; }
    199 
    200   Vector<const uc16> utf16_literal() {
    201     ASSERT(!is_ascii_);
    202     ASSERT((position_ & 0x1) == 0);
    203     return Vector<const uc16>(
    204         reinterpret_cast<const uc16*>(backing_store_.start()),
    205         position_ >> 1);
    206   }
    207 
    208   Vector<const char> ascii_literal() {
    209     ASSERT(is_ascii_);
    210     return Vector<const char>(
    211         reinterpret_cast<const char*>(backing_store_.start()),
    212         position_);
    213   }
    214 
    215   int length() {
    216     return is_ascii_ ? position_ : (position_ >> 1);
    217   }
    218 
    219   void Reset() {
    220     position_ = 0;
    221     is_ascii_ = true;
    222   }
    223 
    224  private:
    225   static const int kInitialCapacity = 16;
    226   static const int kGrowthFactory = 4;
    227   static const int kMinConversionSlack = 256;
    228   static const int kMaxGrowth = 1 * MB;
    229   inline int NewCapacity(int min_capacity) {
    230     int capacity = Max(min_capacity, backing_store_.length());
    231     int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
    232     return new_capacity;
    233   }
    234 
    235   void ExpandBuffer() {
    236     Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
    237     memcpy(new_store.start(), backing_store_.start(), position_);
    238     backing_store_.Dispose();
    239     backing_store_ = new_store;
    240   }
    241 
    242   void ConvertToUtf16() {
    243     ASSERT(is_ascii_);
    244     Vector<byte> new_store;
    245     int new_content_size = position_ * kUC16Size;
    246     if (new_content_size >= backing_store_.length()) {
    247       // Ensure room for all currently read code units as UC16 as well
    248       // as the code unit about to be stored.
    249       new_store = Vector<byte>::New(NewCapacity(new_content_size));
    250     } else {
    251       new_store = backing_store_;
    252     }
    253     char* src = reinterpret_cast<char*>(backing_store_.start());
    254     uc16* dst = reinterpret_cast<uc16*>(new_store.start());
    255     for (int i = position_ - 1; i >= 0; i--) {
    256       dst[i] = src[i];
    257     }
    258     if (new_store.start() != backing_store_.start()) {
    259       backing_store_.Dispose();
    260       backing_store_ = new_store;
    261     }
    262     position_ = new_content_size;
    263     is_ascii_ = false;
    264   }
    265 
    266   bool is_ascii_;
    267   int position_;
    268   Vector<byte> backing_store_;
    269 
    270   DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
    271 };
    272 
    273 
    274 // ----------------------------------------------------------------------------
    275 // JavaScript Scanner.
    276 
    277 class Scanner {
    278  public:
    279   // Scoped helper for literal recording. Automatically drops the literal
    280   // if aborting the scanning before it's complete.
    281   class LiteralScope {
    282    public:
    283     explicit LiteralScope(Scanner* self)
    284         : scanner_(self), complete_(false) {
    285       scanner_->StartLiteral();
    286     }
    287      ~LiteralScope() {
    288        if (!complete_) scanner_->DropLiteral();
    289      }
    290     void Complete() {
    291       scanner_->TerminateLiteral();
    292       complete_ = true;
    293     }
    294 
    295    private:
    296     Scanner* scanner_;
    297     bool complete_;
    298   };
    299 
    300   // Representation of an interval of source positions.
    301   struct Location {
    302     Location(int b, int e) : beg_pos(b), end_pos(e) { }
    303     Location() : beg_pos(0), end_pos(0) { }
    304 
    305     bool IsValid() const {
    306       return beg_pos >= 0 && end_pos >= beg_pos;
    307     }
    308 
    309     static Location invalid() { return Location(-1, -1); }
    310 
    311     int beg_pos;
    312     int end_pos;
    313   };
    314 
    315   // -1 is outside of the range of any real source code.
    316   static const int kNoOctalLocation = -1;
    317 
    318   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
    319 
    320   explicit Scanner(UnicodeCache* scanner_contants);
    321 
    322   void Initialize(Utf16CharacterStream* source);
    323 
    324   // Returns the next token and advances input.
    325   Token::Value Next();
    326   // Returns the current token again.
    327   Token::Value current_token() { return current_.token; }
    328   // Returns the location information for the current token
    329   // (the token last returned by Next()).
    330   Location location() const { return current_.location; }
    331   // Returns the literal string, if any, for the current token (the
    332   // token last returned by Next()). The string is 0-terminated.
    333   // Literal strings are collected for identifiers, strings, and
    334   // numbers.
    335   // These functions only give the correct result if the literal
    336   // was scanned between calls to StartLiteral() and TerminateLiteral().
    337   Vector<const char> literal_ascii_string() {
    338     ASSERT_NOT_NULL(current_.literal_chars);
    339     return current_.literal_chars->ascii_literal();
    340   }
    341   Vector<const uc16> literal_utf16_string() {
    342     ASSERT_NOT_NULL(current_.literal_chars);
    343     return current_.literal_chars->utf16_literal();
    344   }
    345   bool is_literal_ascii() {
    346     ASSERT_NOT_NULL(current_.literal_chars);
    347     return current_.literal_chars->is_ascii();
    348   }
    349   int literal_length() const {
    350     ASSERT_NOT_NULL(current_.literal_chars);
    351     return current_.literal_chars->length();
    352   }
    353 
    354   bool literal_contains_escapes() const {
    355     Location location = current_.location;
    356     int source_length = (location.end_pos - location.beg_pos);
    357     if (current_.token == Token::STRING) {
    358       // Subtract delimiters.
    359       source_length -= 2;
    360     }
    361     return current_.literal_chars->length() != source_length;
    362   }
    363 
    364   // Similar functions for the upcoming token.
    365 
    366   // One token look-ahead (past the token returned by Next()).
    367   Token::Value peek() const { return next_.token; }
    368 
    369   Location peek_location() const { return next_.location; }
    370 
    371   // Returns the literal string for the next token (the token that
    372   // would be returned if Next() were called).
    373   Vector<const char> next_literal_ascii_string() {
    374     ASSERT_NOT_NULL(next_.literal_chars);
    375     return next_.literal_chars->ascii_literal();
    376   }
    377   Vector<const uc16> next_literal_utf16_string() {
    378     ASSERT_NOT_NULL(next_.literal_chars);
    379     return next_.literal_chars->utf16_literal();
    380   }
    381   bool is_next_literal_ascii() {
    382     ASSERT_NOT_NULL(next_.literal_chars);
    383     return next_.literal_chars->is_ascii();
    384   }
    385   int next_literal_length() const {
    386     ASSERT_NOT_NULL(next_.literal_chars);
    387     return next_.literal_chars->length();
    388   }
    389 
    390   UnicodeCache* unicode_cache() { return unicode_cache_; }
    391 
    392   static const int kCharacterLookaheadBufferSize = 1;
    393 
    394   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
    395   uc32 ScanOctalEscape(uc32 c, int length);
    396 
    397   // Returns the location of the last seen octal literal.
    398   Location octal_position() const { return octal_pos_; }
    399   void clear_octal_position() { octal_pos_ = Location::invalid(); }
    400 
    401   // Seek forward to the given position.  This operation does not
    402   // work in general, for instance when there are pushed back
    403   // characters, but works for seeking forward until simple delimiter
    404   // tokens, which is what it is used for.
    405   void SeekForward(int pos);
    406 
    407   bool HarmonyScoping() const {
    408     return harmony_scoping_;
    409   }
    410   void SetHarmonyScoping(bool scoping) {
    411     harmony_scoping_ = scoping;
    412   }
    413   bool HarmonyModules() const {
    414     return harmony_modules_;
    415   }
    416   void SetHarmonyModules(bool modules) {
    417     harmony_modules_ = modules;
    418   }
    419 
    420 
    421   // Returns true if there was a line terminator before the peek'ed token,
    422   // possibly inside a multi-line comment.
    423   bool HasAnyLineTerminatorBeforeNext() const {
    424     return has_line_terminator_before_next_ ||
    425            has_multiline_comment_before_next_;
    426   }
    427 
    428   // Scans the input as a regular expression pattern, previous
    429   // character(s) must be /(=). Returns true if a pattern is scanned.
    430   bool ScanRegExpPattern(bool seen_equal);
    431   // Returns true if regexp flags are scanned (always since flags can
    432   // be empty).
    433   bool ScanRegExpFlags();
    434 
    435   // Tells whether the buffer contains an identifier (no escapes).
    436   // Used for checking if a property name is an identifier.
    437   static bool IsIdentifier(unibrow::CharacterStream* buffer);
    438 
    439  private:
    440   // The current and look-ahead token.
    441   struct TokenDesc {
    442     Token::Value token;
    443     Location location;
    444     LiteralBuffer* literal_chars;
    445   };
    446 
    447   // Call this after setting source_ to the input.
    448   void Init() {
    449     // Set c0_ (one character ahead)
    450     STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
    451     Advance();
    452     // Initialize current_ to not refer to a literal.
    453     current_.literal_chars = NULL;
    454   }
    455 
    456   // Literal buffer support
    457   inline void StartLiteral() {
    458     LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
    459             &literal_buffer2_ : &literal_buffer1_;
    460     free_buffer->Reset();
    461     next_.literal_chars = free_buffer;
    462   }
    463 
    464   INLINE(void AddLiteralChar(uc32 c)) {
    465     ASSERT_NOT_NULL(next_.literal_chars);
    466     next_.literal_chars->AddChar(c);
    467   }
    468 
    469   // Complete scanning of a literal.
    470   inline void TerminateLiteral() {
    471     // Does nothing in the current implementation.
    472   }
    473 
    474   // Stops scanning of a literal and drop the collected characters,
    475   // e.g., due to an encountered error.
    476   inline void DropLiteral() {
    477     next_.literal_chars = NULL;
    478   }
    479 
    480   inline void AddLiteralCharAdvance() {
    481     AddLiteralChar(c0_);
    482     Advance();
    483   }
    484 
    485   // Low-level scanning support.
    486   void Advance() { c0_ = source_->Advance(); }
    487   void PushBack(uc32 ch) {
    488     source_->PushBack(c0_);
    489     c0_ = ch;
    490   }
    491 
    492   inline Token::Value Select(Token::Value tok) {
    493     Advance();
    494     return tok;
    495   }
    496 
    497   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    498     Advance();
    499     if (c0_ == next) {
    500       Advance();
    501       return then;
    502     } else {
    503       return else_;
    504     }
    505   }
    506 
    507   uc32 ScanHexNumber(int expected_length);
    508 
    509   // Scans a single JavaScript token.
    510   void Scan();
    511 
    512   bool SkipWhiteSpace();
    513   Token::Value SkipSingleLineComment();
    514   Token::Value SkipMultiLineComment();
    515   // Scans a possible HTML comment -- begins with '<!'.
    516   Token::Value ScanHtmlComment();
    517 
    518   void ScanDecimalDigits();
    519   Token::Value ScanNumber(bool seen_period);
    520   Token::Value ScanIdentifierOrKeyword();
    521   Token::Value ScanIdentifierSuffix(LiteralScope* literal);
    522 
    523   void ScanEscape();
    524   Token::Value ScanString();
    525 
    526   // Decodes a unicode escape-sequence which is part of an identifier.
    527   // If the escape sequence cannot be decoded the result is kBadChar.
    528   uc32 ScanIdentifierUnicodeEscape();
    529   // Recognizes a uniocde escape-sequence and adds its characters,
    530   // uninterpreted, to the current literal. Used for parsing RegExp
    531   // flags.
    532   bool ScanLiteralUnicodeEscape();
    533 
    534   // Return the current source position.
    535   int source_pos() {
    536     return source_->pos() - kCharacterLookaheadBufferSize;
    537   }
    538 
    539   UnicodeCache* unicode_cache_;
    540 
    541   // Buffers collecting literal strings, numbers, etc.
    542   LiteralBuffer literal_buffer1_;
    543   LiteralBuffer literal_buffer2_;
    544 
    545   TokenDesc current_;  // desc for current token (as returned by Next())
    546   TokenDesc next_;     // desc for next token (one token look-ahead)
    547 
    548   // Input stream. Must be initialized to an Utf16CharacterStream.
    549   Utf16CharacterStream* source_;
    550 
    551 
    552   // Start position of the octal literal last scanned.
    553   Location octal_pos_;
    554 
    555   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
    556   uc32 c0_;
    557 
    558   // Whether there is a line terminator whitespace character after
    559   // the current token, and  before the next. Does not count newlines
    560   // inside multiline comments.
    561   bool has_line_terminator_before_next_;
    562   // Whether there is a multi-line comment that contains a
    563   // line-terminator after the current token, and before the next.
    564   bool has_multiline_comment_before_next_;
    565   // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.
    566   bool harmony_scoping_;
    567   // Whether we scan 'module', 'import', 'export' as keywords.
    568   bool harmony_modules_;
    569 };
    570 
    571 } }  // namespace v8::internal
    572 
    573 #endif  // V8_SCANNER_H_
    574