Home | History | Annotate | Download | only in parsing
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Features shared by parsing and pre-parsing scanners.
      6 
      7 #ifndef V8_PARSING_SCANNER_H_
      8 #define V8_PARSING_SCANNER_H_
      9 
     10 #include "src/allocation.h"
     11 #include "src/base/logging.h"
     12 #include "src/char-predicates.h"
     13 #include "src/globals.h"
     14 #include "src/messages.h"
     15 #include "src/parsing/token.h"
     16 #include "src/unicode-decoder.h"
     17 #include "src/unicode.h"
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 
     23 class AstRawString;
     24 class AstValueFactory;
     25 class DuplicateFinder;
     26 class ExternalOneByteString;
     27 class ExternalTwoByteString;
     28 class ParserRecorder;
     29 class UnicodeCache;
     30 
     31 // ---------------------------------------------------------------------
     32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
     33 // A code unit is a 16 bit value representing either a 16 bit code point
     34 // or one part of a surrogate pair that make a single 21 bit code point.
     35 class Utf16CharacterStream {
     36  public:
     37   static const uc32 kEndOfInput = -1;
     38 
     39   virtual ~Utf16CharacterStream() { }
     40 
     41   // Returns and advances past the next UTF-16 code unit in the input
     42   // stream. If there are no more code units it returns kEndOfInput.
     43   inline uc32 Advance() {
     44     if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {
     45       return static_cast<uc32>(*(buffer_cursor_++));
     46     } else if (ReadBlock()) {
     47       return static_cast<uc32>(*(buffer_cursor_++));
     48     } else {
     49       // Note: currently the following increment is necessary to avoid a
     50       // parser problem! The scanner treats the final kEndOfInput as
     51       // a code unit with a position, and does math relative to that
     52       // position.
     53       buffer_cursor_++;
     54       return kEndOfInput;
     55     }
     56   }
     57 
     58   // Go back one by one character in the input stream.
     59   // This undoes the most recent Advance().
     60   inline void Back() {
     61     // The common case - if the previous character is within
     62     // buffer_start_ .. buffer_end_ will be handles locally.
     63     // Otherwise, a new block is requested.
     64     if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
     65       buffer_cursor_--;
     66     } else {
     67       ReadBlockAt(pos() - 1);
     68     }
     69   }
     70 
     71   // Go back one by two characters in the input stream. (This is the same as
     72   // calling Back() twice. But Back() may - in some instances - do substantial
     73   // work. Back2() guarantees this work will be done only once.)
     74   inline void Back2() {
     75     if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) {
     76       buffer_cursor_ -= 2;
     77     } else {
     78       ReadBlockAt(pos() - 2);
     79     }
     80   }
     81 
     82   inline size_t pos() const {
     83     return buffer_pos_ + (buffer_cursor_ - buffer_start_);
     84   }
     85 
     86   inline void Seek(size_t pos) {
     87     if (V8_LIKELY(pos >= buffer_pos_ &&
     88                   pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {
     89       buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);
     90     } else {
     91       ReadBlockAt(pos);
     92     }
     93   }
     94 
     95  protected:
     96   Utf16CharacterStream(const uint16_t* buffer_start,
     97                        const uint16_t* buffer_cursor,
     98                        const uint16_t* buffer_end, size_t buffer_pos)
     99       : buffer_start_(buffer_start),
    100         buffer_cursor_(buffer_cursor),
    101         buffer_end_(buffer_end),
    102         buffer_pos_(buffer_pos) {}
    103   Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}
    104 
    105   void ReadBlockAt(size_t new_pos) {
    106     // The callers of this method (Back/Back2/Seek) should handle the easy
    107     // case (seeking within the current buffer), and we should only get here
    108     // if we actually require new data.
    109     // (This is really an efficiency check, not a correctness invariant.)
    110     DCHECK(new_pos < buffer_pos_ ||
    111            new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));
    112 
    113     // Change pos() to point to new_pos.
    114     buffer_pos_ = new_pos;
    115     buffer_cursor_ = buffer_start_;
    116     bool success = ReadBlock();
    117     USE(success);
    118 
    119     // Post-conditions: 1, on success, we should be at the right position.
    120     //                  2, success == we should have more characters available.
    121     DCHECK_IMPLIES(success, pos() == new_pos);
    122     DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
    123     DCHECK_EQ(success, buffer_start_ < buffer_end_);
    124   }
    125 
    126   // Read more data, and update buffer_*_ to point to it.
    127   // Returns true if more data was available.
    128   //
    129   // ReadBlock() may modify any of the buffer_*_ members, but must sure that
    130   // the result of pos() remains unaffected.
    131   //
    132   // Examples:
    133   // - a stream could either fill a separate buffer. Then buffer_start_ and
    134   //   buffer_cursor_ would point to the beginning of the buffer, and
    135   //   buffer_pos would be the old pos().
    136   // - a stream with existing buffer chunks would set buffer_start_ and
    137   //   buffer_end_ to cover the full chunk, and then buffer_cursor_ would
    138   //   point into the middle of the buffer, while buffer_pos_ would describe
    139   //   the start of the buffer.
    140   virtual bool ReadBlock() = 0;
    141 
    142   const uint16_t* buffer_start_;
    143   const uint16_t* buffer_cursor_;
    144   const uint16_t* buffer_end_;
    145   size_t buffer_pos_;
    146 };
    147 
    148 
    149 // ----------------------------------------------------------------------------
    150 // JavaScript Scanner.
    151 
    152 class Scanner {
    153  public:
    154   // Scoped helper for a re-settable bookmark.
    155   class BookmarkScope {
    156    public:
    157     explicit BookmarkScope(Scanner* scanner)
    158         : scanner_(scanner), bookmark_(kNoBookmark) {
    159       DCHECK_NOT_NULL(scanner_);
    160     }
    161     ~BookmarkScope() {}
    162 
    163     void Set();
    164     void Apply();
    165     bool HasBeenSet();
    166     bool HasBeenApplied();
    167 
    168    private:
    169     static const size_t kNoBookmark;
    170     static const size_t kBookmarkWasApplied;
    171     static const size_t kBookmarkAtFirstPos;
    172 
    173     Scanner* scanner_;
    174     size_t bookmark_;
    175 
    176     DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
    177   };
    178 
    // Representation of an interval of source positions.
    struct Location {
      // The default location is the empty interval at position 0.
      Location() : beg_pos(0), end_pos(0) { }
      Location(int b, int e) : beg_pos(b), end_pos(e) { }

      // A location for which IsValid() is false.
      static Location invalid() { return Location(-1, -1); }

      // A location is valid when it starts at a non-negative position and
      // does not end before it begins.
      bool IsValid() const {
        if (beg_pos < 0) return false;
        return beg_pos <= end_pos;
      }

      int beg_pos;
      int end_pos;
    };
    193 
    194   // -1 is outside of the range of any real source code.
    195   static const int kNoOctalLocation = -1;
    196   static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
    197 
    198   explicit Scanner(UnicodeCache* scanner_contants);
    199 
    200   void Initialize(Utf16CharacterStream* source);
    201 
    202   // Returns the next token and advances input.
    203   Token::Value Next();
    204   // Returns the token following peek()
    205   Token::Value PeekAhead();
    206   // Returns the current token again.
    207   Token::Value current_token() { return current_.token; }
    208   // Returns the location information for the current token
    209   // (the token last returned by Next()).
    210   Location location() const { return current_.location; }
    211 
    212   bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
    213   MessageTemplate::Template error() const { return scanner_error_; }
    214   Location error_location() const { return scanner_error_location_; }
    215 
    216   // Similar functions for the upcoming token.
    217 
    218   // One token look-ahead (past the token returned by Next()).
    219   Token::Value peek() const { return next_.token; }
    220 
    221   Location peek_location() const { return next_.location; }
    222 
    223   bool literal_contains_escapes() const {
    224     return LiteralContainsEscapes(current_);
    225   }
    226   bool is_literal_contextual_keyword(Vector<const char> keyword) {
    227     DCHECK(current_.token == Token::IDENTIFIER ||
    228            current_.token == Token::ESCAPED_STRICT_RESERVED_WORD);
    229     DCHECK_NOT_NULL(current_.literal_chars);
    230     return current_.literal_chars->is_contextual_keyword(keyword);
    231   }
    232   bool is_next_contextual_keyword(Vector<const char> keyword) {
    233     DCHECK_NOT_NULL(next_.literal_chars);
    234     return next_.literal_chars->is_contextual_keyword(keyword);
    235   }
    236 
    237   const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
    238   const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
    239   const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
    240 
    241   double DoubleValue();
    242   bool ContainsDot();
    243   bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
    244     if (!current_.literal_chars) {
    245       return !strncmp(Token::Name(current_.token), data, length);
    246     } else if (is_literal_one_byte() && literal_length() == length &&
    247                (allow_escapes || !literal_contains_escapes())) {
    248       const char* token =
    249           reinterpret_cast<const char*>(literal_one_byte_string().start());
    250       return !strncmp(token, data, length);
    251     }
    252     return false;
    253   }
    254   inline bool UnescapedLiteralMatches(const char* data, int length) {
    255     return LiteralMatches(data, length, false);
    256   }
    257 
    258   bool IsGetOrSet(bool* is_get, bool* is_set) {
    259     if (is_literal_one_byte() &&
    260         literal_length() == 3 &&
    261         !literal_contains_escapes()) {
    262       const char* token =
    263           reinterpret_cast<const char*>(literal_one_byte_string().start());
    264       *is_get = strncmp(token, "get", 3) == 0;
    265       *is_set = !*is_get && strncmp(token, "set", 3) == 0;
    266       return *is_get || *is_set;
    267     }
    268     return false;
    269   }
    270 
    271   int FindSymbol(DuplicateFinder* finder, int value);
    272 
    273   UnicodeCache* unicode_cache() { return unicode_cache_; }
    274 
    275   // Returns the location of the last seen octal literal.
    276   Location octal_position() const { return octal_pos_; }
    277   void clear_octal_position() { octal_pos_ = Location::invalid(); }
    278   // Returns the location of the last seen decimal literal with a leading zero.
    279   Location decimal_with_leading_zero_position() const {
    280     return decimal_with_leading_zero_pos_;
    281   }
    282   void clear_decimal_with_leading_zero_position() {
    283     decimal_with_leading_zero_pos_ = Location::invalid();
    284   }
    285 
    286   // Returns the value of the last smi that was scanned.
    287   uint32_t smi_value() const { return current_.smi_value_; }
    288 
    289   // Seek forward to the given position.  This operation does not
    290   // work in general, for instance when there are pushed back
    291   // characters, but works for seeking forward until simple delimiter
    292   // tokens, which is what it is used for.
    293   void SeekForward(int pos);
    294 
    295   // Returns true if there was a line terminator before the peek'ed token,
    296   // possibly inside a multi-line comment.
    297   bool HasAnyLineTerminatorBeforeNext() const {
    298     return has_line_terminator_before_next_ ||
    299            has_multiline_comment_before_next_;
    300   }
    301 
    302   bool HasAnyLineTerminatorAfterNext() {
    303     Token::Value ensure_next_next = PeekAhead();
    304     USE(ensure_next_next);
    305     return has_line_terminator_after_next_;
    306   }
    307 
    308   // Scans the input as a regular expression pattern, next token must be /(=).
    309   // Returns true if a pattern is scanned.
    310   bool ScanRegExpPattern();
    311   // Scans the input as regular expression flags. Returns the flags on success.
    312   Maybe<RegExp::Flags> ScanRegExpFlags();
    313 
    314   // Scans the input as a template literal
    315   Token::Value ScanTemplateStart();
    316   Token::Value ScanTemplateContinuation();
    317 
    318   Handle<String> SourceUrl(Isolate* isolate) const {
    319     Handle<String> tmp;
    320     if (source_url_.length() > 0) tmp = source_url_.Internalize(isolate);
    321     return tmp;
    322   }
    323 
    324   Handle<String> SourceMappingUrl(Isolate* isolate) const {
    325     Handle<String> tmp;
    326     if (source_mapping_url_.length() > 0)
    327       tmp = source_mapping_url_.Internalize(isolate);
    328     return tmp;
    329   }
    330 
    331   bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;
    332 
    333   bool FoundHtmlComment() const { return found_html_comment_; }
    334 
    335  private:
    336   // Scoped helper for literal recording. Automatically drops the literal
    337   // if aborting the scanning before it's complete.
    338   class LiteralScope {
    339    public:
    340     explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
    341       scanner_->StartLiteral();
    342     }
    343     ~LiteralScope() {
    344       if (!complete_) scanner_->DropLiteral();
    345     }
    346     void Complete() { complete_ = true; }
    347 
    348    private:
    349     Scanner* scanner_;
    350     bool complete_;
    351   };
    352 
    353   // LiteralBuffer -  Collector of chars of literals.
    354   class LiteralBuffer {
    355    public:
    356     LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() {}
    357 
    358     ~LiteralBuffer() { backing_store_.Dispose(); }
    359 
    360     INLINE(void AddChar(char code_unit)) {
    361       if (position_ >= backing_store_.length()) ExpandBuffer();
    362       DCHECK(is_one_byte_);
    363       DCHECK(IsValidAscii(code_unit));
    364       backing_store_[position_] = static_cast<byte>(code_unit);
    365       position_ += kOneByteSize;
    366       return;
    367     }
    368 
    369     INLINE(void AddChar(uc32 code_unit)) {
    370       if (position_ >= backing_store_.length()) ExpandBuffer();
    371       if (is_one_byte_) {
    372         if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
    373           backing_store_[position_] = static_cast<byte>(code_unit);
    374           position_ += kOneByteSize;
    375           return;
    376         }
    377         ConvertToTwoByte();
    378       }
    379       if (code_unit <=
    380           static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    381         *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
    382         position_ += kUC16Size;
    383       } else {
    384         *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
    385             unibrow::Utf16::LeadSurrogate(code_unit);
    386         position_ += kUC16Size;
    387         if (position_ >= backing_store_.length()) ExpandBuffer();
    388         *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
    389             unibrow::Utf16::TrailSurrogate(code_unit);
    390         position_ += kUC16Size;
    391       }
    392     }
    393 
    394     bool is_one_byte() const { return is_one_byte_; }
    395 
    396     bool is_contextual_keyword(Vector<const char> keyword) const {
    397       return is_one_byte() && keyword.length() == position_ &&
    398              (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
    399     }
    400 
    401     Vector<const uint16_t> two_byte_literal() const {
    402       DCHECK(!is_one_byte_);
    403       DCHECK((position_ & 0x1) == 0);
    404       return Vector<const uint16_t>(
    405           reinterpret_cast<const uint16_t*>(backing_store_.start()),
    406           position_ >> 1);
    407     }
    408 
    409     Vector<const uint8_t> one_byte_literal() const {
    410       DCHECK(is_one_byte_);
    411       return Vector<const uint8_t>(
    412           reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
    413     }
    414 
    415     int length() const { return is_one_byte_ ? position_ : (position_ >> 1); }
    416 
    417     void ReduceLength(int delta) {
    418       position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
    419     }
    420 
    421     void Reset() {
    422       position_ = 0;
    423       is_one_byte_ = true;
    424     }
    425 
    426     Handle<String> Internalize(Isolate* isolate) const;
    427 
    428    private:
    429     static const int kInitialCapacity = 16;
    430     static const int kGrowthFactory = 4;
    431     static const int kMinConversionSlack = 256;
    432     static const int kMaxGrowth = 1 * MB;
    433 
    // Returns true iff |code_unit| is a 7-bit ASCII character (0-127).
    // The value is compared as unsigned char, so the check gives the same
    // answer whether plain char is signed or unsigned, and there is no
    // always-true ">= 0" test to trigger compiler warnings. Unlike
    // iscntrl()/isprint(), this does not depend on the current locale and is
    // well defined for char values outside 0-127.
    inline bool IsValidAscii(char code_unit) {
      return static_cast<unsigned char>(code_unit) < 0x80;
    }
    441 
    442     inline int NewCapacity(int min_capacity) {
    443       int capacity = Max(min_capacity, backing_store_.length());
    444       int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
    445       return new_capacity;
    446     }
    447 
    448     void ExpandBuffer() {
    449       Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
    450       MemCopy(new_store.start(), backing_store_.start(), position_);
    451       backing_store_.Dispose();
    452       backing_store_ = new_store;
    453     }
    454 
    455     void ConvertToTwoByte() {
    456       DCHECK(is_one_byte_);
    457       Vector<byte> new_store;
    458       int new_content_size = position_ * kUC16Size;
    459       if (new_content_size >= backing_store_.length()) {
    460         // Ensure room for all currently read code units as UC16 as well
    461         // as the code unit about to be stored.
    462         new_store = Vector<byte>::New(NewCapacity(new_content_size));
    463       } else {
    464         new_store = backing_store_;
    465       }
    466       uint8_t* src = backing_store_.start();
    467       uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
    468       for (int i = position_ - 1; i >= 0; i--) {
    469         dst[i] = src[i];
    470       }
    471       if (new_store.start() != backing_store_.start()) {
    472         backing_store_.Dispose();
    473         backing_store_ = new_store;
    474       }
    475       position_ = new_content_size;
    476       is_one_byte_ = false;
    477     }
    478 
    479     bool is_one_byte_;
    480     int position_;
    481     Vector<byte> backing_store_;
    482 
    483     DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
    484   };
    485 
    486   // The current and look-ahead token.
    487   struct TokenDesc {
    488     Location location;
    489     LiteralBuffer* literal_chars;
    490     LiteralBuffer* raw_literal_chars;
    491     uint32_t smi_value_;
    492     Token::Value token;
    493   };
    494 
    495   static const int kCharacterLookaheadBufferSize = 1;
    496   const int kMaxAscii = 127;
    497 
    498   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
    499   template <bool capture_raw>
    500   uc32 ScanOctalEscape(uc32 c, int length);
    501 
    502   // Call this after setting source_ to the input.
    503   void Init() {
    504     // Set c0_ (one character ahead)
    505     STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
    506     Advance();
    507     // Initialize current_ to not refer to a literal.
    508     current_.token = Token::UNINITIALIZED;
    509     current_.literal_chars = NULL;
    510     current_.raw_literal_chars = NULL;
    511     next_.token = Token::UNINITIALIZED;
    512     next_.literal_chars = NULL;
    513     next_.raw_literal_chars = NULL;
    514     next_next_.token = Token::UNINITIALIZED;
    515     next_next_.literal_chars = NULL;
    516     next_next_.raw_literal_chars = NULL;
    517     found_html_comment_ = false;
    518     scanner_error_ = MessageTemplate::kNone;
    519   }
    520 
    521   void ReportScannerError(const Location& location,
    522                           MessageTemplate::Template error) {
    523     if (has_error()) return;
    524     scanner_error_ = error;
    525     scanner_error_location_ = location;
    526   }
    527 
    528   void ReportScannerError(int pos, MessageTemplate::Template error) {
    529     if (has_error()) return;
    530     scanner_error_ = error;
    531     scanner_error_location_ = Location(pos, pos + 1);
    532   }
    533 
    534   // Seek to the next_ token at the given position.
    535   void SeekNext(size_t position);
    536 
    537   // Literal buffer support
    538   inline void StartLiteral() {
    539     LiteralBuffer* free_buffer =
    540         (current_.literal_chars == &literal_buffer0_)
    541             ? &literal_buffer1_
    542             : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
    543                                                             : &literal_buffer0_;
    544     free_buffer->Reset();
    545     next_.literal_chars = free_buffer;
    546   }
    547 
    548   inline void StartRawLiteral() {
    549     LiteralBuffer* free_buffer =
    550         (current_.raw_literal_chars == &raw_literal_buffer0_)
    551             ? &raw_literal_buffer1_
    552             : (current_.raw_literal_chars == &raw_literal_buffer1_)
    553                   ? &raw_literal_buffer2_
    554                   : &raw_literal_buffer0_;
    555     free_buffer->Reset();
    556     next_.raw_literal_chars = free_buffer;
    557   }
    558 
    559   INLINE(void AddLiteralChar(uc32 c)) {
    560     DCHECK_NOT_NULL(next_.literal_chars);
    561     next_.literal_chars->AddChar(c);
    562   }
    563 
    564   INLINE(void AddLiteralChar(char c)) {
    565     DCHECK_NOT_NULL(next_.literal_chars);
    566     next_.literal_chars->AddChar(c);
    567   }
    568 
    569   INLINE(void AddRawLiteralChar(uc32 c)) {
    570     DCHECK_NOT_NULL(next_.raw_literal_chars);
    571     next_.raw_literal_chars->AddChar(c);
    572   }
    573 
    574   INLINE(void ReduceRawLiteralLength(int delta)) {
    575     DCHECK_NOT_NULL(next_.raw_literal_chars);
    576     next_.raw_literal_chars->ReduceLength(delta);
    577   }
    578 
    579   // Stops scanning of a literal and drop the collected characters,
    580   // e.g., due to an encountered error.
    581   inline void DropLiteral() {
    582     next_.literal_chars = NULL;
    583     next_.raw_literal_chars = NULL;
    584   }
    585 
    586   inline void AddLiteralCharAdvance() {
    587     AddLiteralChar(c0_);
    588     Advance();
    589   }
    590 
    591   // Low-level scanning support.
    592   template <bool capture_raw = false, bool check_surrogate = true>
    593   void Advance() {
    594     if (capture_raw) {
    595       AddRawLiteralChar(c0_);
    596     }
    597     c0_ = source_->Advance();
    598     if (check_surrogate) HandleLeadSurrogate();
    599   }
    600 
    601   void HandleLeadSurrogate() {
    602     if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
    603       uc32 c1 = source_->Advance();
    604       if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
    605         source_->Back();
    606       } else {
    607         c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
    608       }
    609     }
    610   }
    611 
    612   void PushBack(uc32 ch) {
    613     if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    614       source_->Back2();
    615     } else {
    616       source_->Back();
    617     }
    618     c0_ = ch;
    619   }
    620 
    621   // Same as PushBack(ch1); PushBack(ch2).
    622   // - Potentially more efficient as it uses Back2() on the stream.
    623   // - Uses char as parameters, since we're only calling it with ASCII chars in
    624   //   practice. This way, we can avoid a few edge cases.
    625   void PushBack2(char ch1, char ch2) {
    626     source_->Back2();
    627     c0_ = ch2;
    628   }
    629 
    630   inline Token::Value Select(Token::Value tok) {
    631     Advance();
    632     return tok;
    633   }
    634 
    635   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    636     Advance();
    637     if (c0_ == next) {
    638       Advance();
    639       return then;
    640     } else {
    641       return else_;
    642     }
    643   }
    644 
    645   // Returns the literal string, if any, for the current token (the
    646   // token last returned by Next()). The string is 0-terminated.
    647   // Literal strings are collected for identifiers, strings, numbers as well
    648   // as for template literals. For template literals we also collect the raw
    649   // form.
    650   // These functions only give the correct result if the literal was scanned
    651   // when a LiteralScope object is alive.
    652   //
    653   // Current usage of these functions is unfortunately a little undisciplined,
    654   // and is_literal_one_byte() + is_literal_one_byte_string() is also
    655   // requested for tokens that do not have a literal. Hence, we treat any
    656   // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
    657   // literal "function".
    658   Vector<const uint8_t> literal_one_byte_string() {
    659     if (current_.literal_chars)
    660       return current_.literal_chars->one_byte_literal();
    661     const char* str = Token::String(current_.token);
    662     const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str);
    663     return Vector<const uint8_t>(str_as_uint8,
    664                                  Token::StringLength(current_.token));
    665   }
    666   Vector<const uint16_t> literal_two_byte_string() {
    667     DCHECK_NOT_NULL(current_.literal_chars);
    668     return current_.literal_chars->two_byte_literal();
    669   }
    670   bool is_literal_one_byte() {
    671     return !current_.literal_chars || current_.literal_chars->is_one_byte();
    672   }
    673   int literal_length() const {
    674     if (current_.literal_chars) return current_.literal_chars->length();
    675     return Token::StringLength(current_.token);
    676   }
    677   // Returns the literal string for the next token (the token that
    678   // would be returned if Next() were called).
    679   Vector<const uint8_t> next_literal_one_byte_string() {
    680     DCHECK_NOT_NULL(next_.literal_chars);
    681     return next_.literal_chars->one_byte_literal();
    682   }
    683   Vector<const uint16_t> next_literal_two_byte_string() {
    684     DCHECK_NOT_NULL(next_.literal_chars);
    685     return next_.literal_chars->two_byte_literal();
    686   }
    687   bool is_next_literal_one_byte() {
    688     DCHECK_NOT_NULL(next_.literal_chars);
    689     return next_.literal_chars->is_one_byte();
    690   }
    691   Vector<const uint8_t> raw_literal_one_byte_string() {
    692     DCHECK_NOT_NULL(current_.raw_literal_chars);
    693     return current_.raw_literal_chars->one_byte_literal();
    694   }
    695   Vector<const uint16_t> raw_literal_two_byte_string() {
    696     DCHECK_NOT_NULL(current_.raw_literal_chars);
    697     return current_.raw_literal_chars->two_byte_literal();
    698   }
    699   bool is_raw_literal_one_byte() {
    700     DCHECK_NOT_NULL(current_.raw_literal_chars);
    701     return current_.raw_literal_chars->is_one_byte();
    702   }
    703 
    704   template <bool capture_raw, bool unicode = false>
    705   uc32 ScanHexNumber(int expected_length);
    706   // Scan a number of any length but not bigger than max_value. For example, the
    707   // number can be 000000001, so it's very long in characters but its value is
    708   // small.
    709   template <bool capture_raw>
    710   uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
    711 
    712   // Scans a single JavaScript token.
    713   void Scan();
    714 
    715   bool SkipWhiteSpace();
    716   Token::Value SkipSingleLineComment();
    717   Token::Value SkipSourceURLComment();
    718   void TryToParseSourceURLComment();
    719   Token::Value SkipMultiLineComment();
    720   // Scans a possible HTML comment -- begins with '<!'.
    721   Token::Value ScanHtmlComment();
    722 
    723   void ScanDecimalDigits();
    724   Token::Value ScanNumber(bool seen_period);
    725   Token::Value ScanIdentifierOrKeyword();
    726   Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped);
    727 
    728   Token::Value ScanString();
    729 
    730   // Scans an escape-sequence which is part of a string and adds the
    731   // decoded character to the current literal. Returns true if a pattern
    732   // is scanned.
    733   template <bool capture_raw, bool in_template_literal>
    734   bool ScanEscape();
    735 
    736   // Decodes a Unicode escape-sequence which is part of an identifier.
    737   // If the escape sequence cannot be decoded the result is kBadChar.
    738   uc32 ScanIdentifierUnicodeEscape();
    739   // Helper for the above functions.
    740   template <bool capture_raw>
    741   uc32 ScanUnicodeEscape();
    742 
    743   Token::Value ScanTemplateSpan();
    744 
    745   // Return the current source position.
    746   int source_pos() {
    747     return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
    748   }
    749 
    750   static bool LiteralContainsEscapes(const TokenDesc& token) {
    751     Location location = token.location;
    752     int source_length = (location.end_pos - location.beg_pos);
    753     if (token.token == Token::STRING) {
    754       // Subtract delimiters.
    755       source_length -= 2;
    756     }
    757     return token.literal_chars &&
    758            (token.literal_chars->length() != source_length);
    759   }
    760 
    761 #ifdef DEBUG
    762   void SanityCheckTokenDesc(const TokenDesc&) const;
    763 #endif
    764 
    765   UnicodeCache* unicode_cache_;
    766 
    767   // Buffers collecting literal strings, numbers, etc.
    768   LiteralBuffer literal_buffer0_;
    769   LiteralBuffer literal_buffer1_;
    770   LiteralBuffer literal_buffer2_;
    771 
    772   // Values parsed from magic comments.
    773   LiteralBuffer source_url_;
    774   LiteralBuffer source_mapping_url_;
    775 
    776   // Buffer to store raw string values
    777   LiteralBuffer raw_literal_buffer0_;
    778   LiteralBuffer raw_literal_buffer1_;
    779   LiteralBuffer raw_literal_buffer2_;
    780 
    781   TokenDesc current_;    // desc for current token (as returned by Next())
    782   TokenDesc next_;       // desc for next token (one token look-ahead)
    783   TokenDesc next_next_;  // desc for the token after next (after PeakAhead())
    784 
    785   // Input stream. Must be initialized to an Utf16CharacterStream.
    786   Utf16CharacterStream* source_;
    787 
    788   // Last-seen positions of potentially problematic tokens.
    789   Location octal_pos_;
    790   Location decimal_with_leading_zero_pos_;
    791 
    792   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
    793   uc32 c0_;
    794 
    795   // Whether there is a line terminator whitespace character after
    796   // the current token, and  before the next. Does not count newlines
    797   // inside multiline comments.
    798   bool has_line_terminator_before_next_;
    799   // Whether there is a multi-line comment that contains a
    800   // line-terminator after the current token, and before the next.
    801   bool has_multiline_comment_before_next_;
    802   bool has_line_terminator_after_next_;
    803 
    804   // Whether this scanner encountered an HTML comment.
    805   bool found_html_comment_;
    806 
    807   MessageTemplate::Template scanner_error_;
    808   Location scanner_error_location_;
    809 };
    810 
    811 }  // namespace internal
    812 }  // namespace v8
    813 
    814 #endif  // V8_PARSING_SCANNER_H_
    815