Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Features shared by parsing and pre-parsing scanners.
      6 
      7 #include <cmath>
      8 
      9 #include "src/v8.h"
     10 
     11 #include "include/v8stdint.h"
     12 #include "src/ast-value-factory.h"
     13 #include "src/char-predicates-inl.h"
     14 #include "src/conversions-inl.h"
     15 #include "src/list-inl.h"
     16 #include "src/parser.h"
     17 #include "src/scanner.h"
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 
     23 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const {
     24   if (is_one_byte()) {
     25     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
     26   }
     27   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
     28 }
     29 
     30 
     31 // ----------------------------------------------------------------------------
     32 // Scanner
     33 
// Constructs a scanner that uses |unicode_cache| for character
// classification. No octal literal position is recorded yet
// (Location::invalid()) and every harmony feature flag starts out disabled.
// The character source is attached separately via Initialize().
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      harmony_scoping_(false),
      harmony_modules_(false),
      harmony_numeric_literals_(false),
      harmony_classes_(false) { }
     41 
     42 
// Attaches |source| as the character stream and primes the scanner: leading
// whitespace is skipped and the first token is scanned into the lookahead
// ("next") slot.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // SkipWhiteSpace() only treats '-->' as a comment start when this flag is
  // set, so the start of input is handled as if it followed a line
  // terminator.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
     54 
     55 
     56 uc32 Scanner::ScanHexNumber(int expected_length) {
     57   DCHECK(expected_length <= 4);  // prevent overflow
     58 
     59   uc32 digits[4] = { 0, 0, 0, 0 };
     60   uc32 x = 0;
     61   for (int i = 0; i < expected_length; i++) {
     62     digits[i] = c0_;
     63     int d = HexValue(c0_);
     64     if (d < 0) {
     65       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
     66       // should be illegal, but other JS VMs just return the
     67       // non-escaped version of the original character.
     68 
     69       // Push back digits that we have advanced past.
     70       for (int j = i-1; j >= 0; j--) {
     71         PushBack(digits[j]);
     72       }
     73       return -1;
     74     }
     75     x = x * 16 + d;
     76     Advance();
     77   }
     78 
     79   return x;
     80 }
     81 
     82 
     83 // Ensure that tokens can be stored in a byte.
     84 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
     85 
     86 // Table of one-character tokens, by character (0x00..0x7f only).
     87 static const byte one_char_tokens[] = {
     88   Token::ILLEGAL,
     89   Token::ILLEGAL,
     90   Token::ILLEGAL,
     91   Token::ILLEGAL,
     92   Token::ILLEGAL,
     93   Token::ILLEGAL,
     94   Token::ILLEGAL,
     95   Token::ILLEGAL,
     96   Token::ILLEGAL,
     97   Token::ILLEGAL,
     98   Token::ILLEGAL,
     99   Token::ILLEGAL,
    100   Token::ILLEGAL,
    101   Token::ILLEGAL,
    102   Token::ILLEGAL,
    103   Token::ILLEGAL,
    104   Token::ILLEGAL,
    105   Token::ILLEGAL,
    106   Token::ILLEGAL,
    107   Token::ILLEGAL,
    108   Token::ILLEGAL,
    109   Token::ILLEGAL,
    110   Token::ILLEGAL,
    111   Token::ILLEGAL,
    112   Token::ILLEGAL,
    113   Token::ILLEGAL,
    114   Token::ILLEGAL,
    115   Token::ILLEGAL,
    116   Token::ILLEGAL,
    117   Token::ILLEGAL,
    118   Token::ILLEGAL,
    119   Token::ILLEGAL,
    120   Token::ILLEGAL,
    121   Token::ILLEGAL,
    122   Token::ILLEGAL,
    123   Token::ILLEGAL,
    124   Token::ILLEGAL,
    125   Token::ILLEGAL,
    126   Token::ILLEGAL,
    127   Token::ILLEGAL,
    128   Token::LPAREN,       // 0x28
    129   Token::RPAREN,       // 0x29
    130   Token::ILLEGAL,
    131   Token::ILLEGAL,
    132   Token::COMMA,        // 0x2c
    133   Token::ILLEGAL,
    134   Token::ILLEGAL,
    135   Token::ILLEGAL,
    136   Token::ILLEGAL,
    137   Token::ILLEGAL,
    138   Token::ILLEGAL,
    139   Token::ILLEGAL,
    140   Token::ILLEGAL,
    141   Token::ILLEGAL,
    142   Token::ILLEGAL,
    143   Token::ILLEGAL,
    144   Token::ILLEGAL,
    145   Token::ILLEGAL,
    146   Token::COLON,        // 0x3a
    147   Token::SEMICOLON,    // 0x3b
    148   Token::ILLEGAL,
    149   Token::ILLEGAL,
    150   Token::ILLEGAL,
    151   Token::CONDITIONAL,  // 0x3f
    152   Token::ILLEGAL,
    153   Token::ILLEGAL,
    154   Token::ILLEGAL,
    155   Token::ILLEGAL,
    156   Token::ILLEGAL,
    157   Token::ILLEGAL,
    158   Token::ILLEGAL,
    159   Token::ILLEGAL,
    160   Token::ILLEGAL,
    161   Token::ILLEGAL,
    162   Token::ILLEGAL,
    163   Token::ILLEGAL,
    164   Token::ILLEGAL,
    165   Token::ILLEGAL,
    166   Token::ILLEGAL,
    167   Token::ILLEGAL,
    168   Token::ILLEGAL,
    169   Token::ILLEGAL,
    170   Token::ILLEGAL,
    171   Token::ILLEGAL,
    172   Token::ILLEGAL,
    173   Token::ILLEGAL,
    174   Token::ILLEGAL,
    175   Token::ILLEGAL,
    176   Token::ILLEGAL,
    177   Token::ILLEGAL,
    178   Token::ILLEGAL,
    179   Token::LBRACK,     // 0x5b
    180   Token::ILLEGAL,
    181   Token::RBRACK,     // 0x5d
    182   Token::ILLEGAL,
    183   Token::ILLEGAL,
    184   Token::ILLEGAL,
    185   Token::ILLEGAL,
    186   Token::ILLEGAL,
    187   Token::ILLEGAL,
    188   Token::ILLEGAL,
    189   Token::ILLEGAL,
    190   Token::ILLEGAL,
    191   Token::ILLEGAL,
    192   Token::ILLEGAL,
    193   Token::ILLEGAL,
    194   Token::ILLEGAL,
    195   Token::ILLEGAL,
    196   Token::ILLEGAL,
    197   Token::ILLEGAL,
    198   Token::ILLEGAL,
    199   Token::ILLEGAL,
    200   Token::ILLEGAL,
    201   Token::ILLEGAL,
    202   Token::ILLEGAL,
    203   Token::ILLEGAL,
    204   Token::ILLEGAL,
    205   Token::ILLEGAL,
    206   Token::ILLEGAL,
    207   Token::ILLEGAL,
    208   Token::ILLEGAL,
    209   Token::ILLEGAL,
    210   Token::ILLEGAL,
    211   Token::LBRACE,       // 0x7b
    212   Token::ILLEGAL,
    213   Token::RBRACE,       // 0x7d
    214   Token::BIT_NOT,      // 0x7e
    215   Token::ILLEGAL
    216 };
    217 
    218 
    219 Token::Value Scanner::Next() {
    220   current_ = next_;
    221   has_line_terminator_before_next_ = false;
    222   has_multiline_comment_before_next_ = false;
    223   if (static_cast<unsigned>(c0_) <= 0x7f) {
    224     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    225     if (token != Token::ILLEGAL) {
    226       int pos = source_pos();
    227       next_.token = token;
    228       next_.location.beg_pos = pos;
    229       next_.location.end_pos = pos + 1;
    230       Advance();
    231       return current_.token;
    232     }
    233   }
    234   Scan();
    235   return current_.token;
    236 }
    237 
    238 
    239 // TODO(yangguo): check whether this is actually necessary.
    240 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
    241   // The Unicode value U+FFFE is guaranteed never to be assigned as a
    242   // Unicode character; this implies that in a Unicode context the
    243   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
    244   // character expressed in little-endian byte order (since it could
    245   // not be a U+FFFE character expressed in big-endian byte
    246   // order). Nevertheless, we check for it to be compatible with
    247   // Spidermonkey.
    248   return c == 0xFFFE;
    249 }
    250 
    251 
    252 bool Scanner::SkipWhiteSpace() {
    253   int start_position = source_pos();
    254 
    255   while (true) {
    256     while (true) {
    257       // Advance as long as character is a WhiteSpace or LineTerminator.
    258       // Remember if the latter is the case.
    259       if (unicode_cache_->IsLineTerminator(c0_)) {
    260         has_line_terminator_before_next_ = true;
    261       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
    262                  !IsLittleEndianByteOrderMark(c0_)) {
    263         break;
    264       }
    265       Advance();
    266     }
    267 
    268     // If there is an HTML comment end '-->' at the beginning of a
    269     // line (with only whitespace in front of it), we treat the rest
    270     // of the line as a comment. This is in line with the way
    271     // SpiderMonkey handles it.
    272     if (c0_ == '-' && has_line_terminator_before_next_) {
    273       Advance();
    274       if (c0_ == '-') {
    275         Advance();
    276         if (c0_ == '>') {
    277           // Treat the rest of the line as a comment.
    278           SkipSingleLineComment();
    279           // Continue skipping white space after the comment.
    280           continue;
    281         }
    282         PushBack('-');  // undo Advance()
    283       }
    284       PushBack('-');  // undo Advance()
    285     }
    286     // Return whether or not we skipped any characters.
    287     return source_pos() != start_position;
    288   }
    289 }
    290 
    291 
    292 Token::Value Scanner::SkipSingleLineComment() {
    293   Advance();
    294 
    295   // The line terminator at the end of the line is not considered
    296   // to be part of the single-line comment; it is recognized
    297   // separately by the lexical grammar and becomes part of the
    298   // stream of input elements for the syntactic grammar (see
    299   // ECMA-262, section 7.4).
    300   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    301     Advance();
    302   }
    303 
    304   return Token::WHITESPACE;
    305 }
    306 
    307 
    308 Token::Value Scanner::SkipSourceURLComment() {
    309   TryToParseSourceURLComment();
    310   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    311     Advance();
    312   }
    313 
    314   return Token::WHITESPACE;
    315 }
    316 
    317 
    318 void Scanner::TryToParseSourceURLComment() {
    319   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
    320   // function will just return if it cannot parse a magic comment.
    321   if (!unicode_cache_->IsWhiteSpace(c0_))
    322     return;
    323   Advance();
    324   LiteralBuffer name;
    325   while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) &&
    326          c0_ != '=') {
    327     name.AddChar(c0_);
    328     Advance();
    329   }
    330   if (!name.is_one_byte()) return;
    331   Vector<const uint8_t> name_literal = name.one_byte_literal();
    332   LiteralBuffer* value;
    333   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
    334     value = &source_url_;
    335   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
    336     value = &source_mapping_url_;
    337   } else {
    338     return;
    339   }
    340   if (c0_ != '=')
    341     return;
    342   Advance();
    343   value->Reset();
    344   while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) {
    345     Advance();
    346   }
    347   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    348     // Disallowed characters.
    349     if (c0_ == '"' || c0_ == '\'') {
    350       value->Reset();
    351       return;
    352     }
    353     if (unicode_cache_->IsWhiteSpace(c0_)) {
    354       break;
    355     }
    356     value->AddChar(c0_);
    357     Advance();
    358   }
    359   // Allow whitespace at the end.
    360   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
    361     if (!unicode_cache_->IsWhiteSpace(c0_)) {
    362       value->Reset();
    363       break;
    364     }
    365     Advance();
    366   }
    367 }
    368 
    369 
    370 Token::Value Scanner::SkipMultiLineComment() {
    371   DCHECK(c0_ == '*');
    372   Advance();
    373 
    374   while (c0_ >= 0) {
    375     uc32 ch = c0_;
    376     Advance();
    377     if (unicode_cache_->IsLineTerminator(ch)) {
    378       // Following ECMA-262, section 7.4, a comment containing
    379       // a newline will make the comment count as a line-terminator.
    380       has_multiline_comment_before_next_ = true;
    381     }
    382     // If we have reached the end of the multi-line comment, we
    383     // consume the '/' and insert a whitespace. This way all
    384     // multi-line comments are treated as whitespace.
    385     if (ch == '*' && c0_ == '/') {
    386       c0_ = ' ';
    387       return Token::WHITESPACE;
    388     }
    389   }
    390 
    391   // Unterminated multi-line comment.
    392   return Token::ILLEGAL;
    393 }
    394 
    395 
    396 Token::Value Scanner::ScanHtmlComment() {
    397   // Check for <!-- comments.
    398   DCHECK(c0_ == '!');
    399   Advance();
    400   if (c0_ == '-') {
    401     Advance();
    402     if (c0_ == '-') return SkipSingleLineComment();
    403     PushBack('-');  // undo Advance()
    404   }
    405   PushBack('!');  // undo Advance()
    406   DCHECK(c0_ == '!');
    407   return Token::LT;
    408 }
    409 
    410 
// Scans the next token from the input into next_. Whitespace and comments
// are reported internally as Token::WHITESPACE and skipped by looping, so
// next_ always ends up holding a non-whitespace token (possibly Token::EOS or
// Token::ILLEGAL). On return next_.location covers the token's begin and end
// source positions and next_.token holds its kind; next_.literal_chars is
// reset to NULL on entry.
void Scanner::Scan() {
  next_.literal_chars = NULL;
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          // Either an HTML comment start ("<!--") or a plain '<' followed
          // by '!'; ScanHtmlComment() decides and restores the lookahead.
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == === =>
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else if (c0_ == '>') {
          token = Select(Token::ARROW);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          Advance();
          if (c0_ == '@' || c0_ == '#') {
            // "//@" or "//#": possibly a sourceURL / sourceMappingURL
            // magic comment.
            Advance();
            token = SkipSourceURLComment();
          } else {
            // SkipSingleLineComment() begins with an Advance(); push the
            // current character back so that it is not lost.
            PushBack(c0_);
            token = SkipSingleLineComment();
          }
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          // A '.' directly followed by a digit starts a fractional number.
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      default:
        // Everything without a dedicated case: identifiers and keywords,
        // numbers, other white space, end of input (c0_ < 0), and finally
        // characters that cannot start any token.
        if (unicode_cache_->IsIdentifierStart(c0_)) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else if (c0_ < 0) {
          token = Token::EOS;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
    658 
    659 
    660 void Scanner::SeekForward(int pos) {
    661   // After this call, we will have the token at the given position as
    662   // the "next" token. The "current" token will be invalid.
    663   if (pos == next_.location.beg_pos) return;
    664   int current_pos = source_pos();
    665   DCHECK_EQ(next_.location.end_pos, current_pos);
    666   // Positions inside the lookahead token aren't supported.
    667   DCHECK(pos >= current_pos);
    668   if (pos != current_pos) {
    669     source_->SeekForward(pos - source_->pos());
    670     Advance();
    671     // This function is only called to seek to the location
    672     // of the end of a function (at the "}" token). It doesn't matter
    673     // whether there was a line terminator in the part we skip.
    674     has_line_terminator_before_next_ = false;
    675     has_multiline_comment_before_next_ = false;
    676   }
    677   Scan();
    678 }
    679 
    680 
    681 bool Scanner::ScanEscape() {
    682   uc32 c = c0_;
    683   Advance();
    684 
    685   // Skip escaped newlines.
    686   if (unicode_cache_->IsLineTerminator(c)) {
    687     // Allow CR+LF newlines in multiline string literals.
    688     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
    689     // Allow LF+CR newlines in multiline string literals.
    690     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
    691     return true;
    692   }
    693 
    694   switch (c) {
    695     case '\'':  // fall through
    696     case '"' :  // fall through
    697     case '\\': break;
    698     case 'b' : c = '\b'; break;
    699     case 'f' : c = '\f'; break;
    700     case 'n' : c = '\n'; break;
    701     case 'r' : c = '\r'; break;
    702     case 't' : c = '\t'; break;
    703     case 'u' : {
    704       c = ScanHexNumber(4);
    705       if (c < 0) return false;
    706       break;
    707     }
    708     case 'v' : c = '\v'; break;
    709     case 'x' : {
    710       c = ScanHexNumber(2);
    711       if (c < 0) return false;
    712       break;
    713     }
    714     case '0' :  // fall through
    715     case '1' :  // fall through
    716     case '2' :  // fall through
    717     case '3' :  // fall through
    718     case '4' :  // fall through
    719     case '5' :  // fall through
    720     case '6' :  // fall through
    721     case '7' : c = ScanOctalEscape(c, 2); break;
    722   }
    723 
    724   // According to ECMA-262, section 7.8.4, characters not covered by the
    725   // above cases should be illegal, but they are commonly handled as
    726   // non-escaped characters by JS VMs.
    727   AddLiteralChar(c);
    728   return true;
    729 }
    730 
    731 
    732 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
    733 // ECMA-262. Other JS VMs support them.
    734 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
    735   uc32 x = c - '0';
    736   int i = 0;
    737   for (; i < length; i++) {
    738     int d = c0_ - '0';
    739     if (d < 0 || d > 7) break;
    740     int nx = x * 8 + d;
    741     if (nx >= 256) break;
    742     x = nx;
    743     Advance();
    744   }
    745   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
    746   // Remember the position of octal escape sequences so that an error
    747   // can be reported later (in strict mode).
    748   // We don't report the error immediately, because the octal escape can
    749   // occur before the "use strict" directive.
    750   if (c != '0' || i > 0) {
    751     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    752   }
    753   return x;
    754 }
    755 
    756 
    757 Token::Value Scanner::ScanString() {
    758   uc32 quote = c0_;
    759   Advance();  // consume quote
    760 
    761   LiteralScope literal(this);
    762   while (c0_ != quote && c0_ >= 0
    763          && !unicode_cache_->IsLineTerminator(c0_)) {
    764     uc32 c = c0_;
    765     Advance();
    766     if (c == '\\') {
    767       if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
    768     } else {
    769       AddLiteralChar(c);
    770     }
    771   }
    772   if (c0_ != quote) return Token::ILLEGAL;
    773   literal.Complete();
    774 
    775   Advance();  // consume quote
    776   return Token::STRING;
    777 }
    778 
    779 
    780 void Scanner::ScanDecimalDigits() {
    781   while (IsDecimalDigit(c0_))
    782     AddLiteralCharAdvance();
    783 }
    784 
    785 
// Scans a numeric literal and collects its characters as the current
// literal. |seen_period| is true when the caller has already consumed a
// leading '.', in which case the current character is the first digit of the
// fraction. Handles decimal literals (with optional fraction and exponent),
// hex literals, implicit octal literals (leading '0'), and, when
// harmony_numeric_literals_ is set, explicit octal ("0o") and binary ("0b")
// literals. Returns Token::NUMBER on success and Token::ILLEGAL for a
// malformed literal. For an implicit octal literal the source range is
// recorded in octal_pos_.
Token::Value Scanner::ScanNumber(bool seen_period) {
  DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;

  LiteralScope literal(this);
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      int start_pos = source_pos();  // For reporting octal positions.
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
      // an octal number.
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
        kind = OCTAL;
        AddLiteralCharAdvance();
        if (!IsOctalDigit(c0_)) {
          // we must have at least one octal digit after 'o'/'O'
          return Token::ILLEGAL;
        }
        while (IsOctalDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
        kind = BINARY;
        AddLiteralCharAdvance();
        if (!IsBinaryDigit(c0_)) {
          // we must have at least one binary digit after 'b'/'B'
          return Token::ILLEGAL;
        }
        while (IsBinaryDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = IMPLICIT_OCTAL;
        while (true) {
          // An '8' or '9' means this is not octal after all; fall back to
          // decimal and let the decimal path below consume the rest.
          if (c0_ == '8' || c0_ == '9') {
            kind = DECIMAL;
            break;
          }
          if (c0_  < '0' || '7'  < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            break;
          }
          AddLiteralCharAdvance();
        }
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL) {
      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    // Only decimal literals may carry an exponent.
    if (kind != DECIMAL) return Token::ILLEGAL;
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    return Token::ILLEGAL;

  literal.Complete();

  return Token::NUMBER;
}
    890 
    891 
    892 uc32 Scanner::ScanIdentifierUnicodeEscape() {
    893   Advance();
    894   if (c0_ != 'u') return -1;
    895   Advance();
    896   uc32 result = ScanHexNumber(4);
    897   if (result < 0) PushBack('u');
    898   return result;
    899 }
    900 
    901 
    902 // ----------------------------------------------------------------------------
    903 // Keyword Matcher
    904 
    905 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                                     \
    906   KEYWORD_GROUP('b')                                                         \
    907   KEYWORD("break", Token::BREAK)                                             \
    908   KEYWORD_GROUP('c')                                                         \
    909   KEYWORD("case", Token::CASE)                                               \
    910   KEYWORD("catch", Token::CATCH)                                             \
    911   KEYWORD("class",                                                           \
    912           harmony_classes ? Token::CLASS : Token::FUTURE_RESERVED_WORD)      \
    913   KEYWORD("const", Token::CONST)                                             \
    914   KEYWORD("continue", Token::CONTINUE)                                       \
    915   KEYWORD_GROUP('d')                                                         \
    916   KEYWORD("debugger", Token::DEBUGGER)                                       \
    917   KEYWORD("default", Token::DEFAULT)                                         \
    918   KEYWORD("delete", Token::DELETE)                                           \
    919   KEYWORD("do", Token::DO)                                                   \
    920   KEYWORD_GROUP('e')                                                         \
    921   KEYWORD("else", Token::ELSE)                                               \
    922   KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                               \
    923   KEYWORD("export",                                                          \
    924           harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)     \
    925   KEYWORD("extends",                                                         \
    926           harmony_classes ? Token::EXTENDS : Token::FUTURE_RESERVED_WORD)    \
    927   KEYWORD_GROUP('f')                                                         \
    928   KEYWORD("false", Token::FALSE_LITERAL)                                     \
    929   KEYWORD("finally", Token::FINALLY)                                         \
    930   KEYWORD("for", Token::FOR)                                                 \
    931   KEYWORD("function", Token::FUNCTION)                                       \
    932   KEYWORD_GROUP('i')                                                         \
    933   KEYWORD("if", Token::IF)                                                   \
    934   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)                  \
    935   KEYWORD("import",                                                          \
    936           harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)     \
    937   KEYWORD("in", Token::IN)                                                   \
    938   KEYWORD("instanceof", Token::INSTANCEOF)                                   \
    939   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)                   \
    940   KEYWORD_GROUP('l')                                                         \
    941   KEYWORD("let",                                                             \
    942           harmony_scoping ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
    943   KEYWORD_GROUP('n')                                                         \
    944   KEYWORD("new", Token::NEW)                                                 \
    945   KEYWORD("null", Token::NULL_LITERAL)                                       \
    946   KEYWORD_GROUP('p')                                                         \
    947   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)                     \
    948   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)                     \
    949   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)                   \
    950   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)                      \
    951   KEYWORD_GROUP('r')                                                         \
    952   KEYWORD("return", Token::RETURN)                                           \
    953   KEYWORD_GROUP('s')                                                         \
    954   KEYWORD("static", harmony_classes ? Token::STATIC                          \
    955                                     : Token::FUTURE_STRICT_RESERVED_WORD)    \
    956   KEYWORD("super",                                                           \
    957           harmony_classes ? Token::SUPER : Token::FUTURE_RESERVED_WORD)      \
    958   KEYWORD("switch", Token::SWITCH)                                           \
    959   KEYWORD_GROUP('t')                                                         \
    960   KEYWORD("this", Token::THIS)                                               \
    961   KEYWORD("throw", Token::THROW)                                             \
    962   KEYWORD("true", Token::TRUE_LITERAL)                                       \
    963   KEYWORD("try", Token::TRY)                                                 \
    964   KEYWORD("typeof", Token::TYPEOF)                                           \
    965   KEYWORD_GROUP('v')                                                         \
    966   KEYWORD("var", Token::VAR)                                                 \
    967   KEYWORD("void", Token::VOID)                                               \
    968   KEYWORD_GROUP('w')                                                         \
    969   KEYWORD("while", Token::WHILE)                                             \
    970   KEYWORD("with", Token::WITH)                                               \
    971   KEYWORD_GROUP('y')                                                         \
    972   KEYWORD("yield", Token::YIELD)
    973 
    974 
    975 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
    976                                              int input_length,
    977                                              bool harmony_scoping,
    978                                              bool harmony_modules,
    979                                              bool harmony_classes) {
    980   DCHECK(input_length >= 1);
    981   const int kMinLength = 2;
    982   const int kMaxLength = 10;
    983   if (input_length < kMinLength || input_length > kMaxLength) {
    984     return Token::IDENTIFIER;
    985   }
    986   switch (input[0]) {
    987     default:
    988 #define KEYWORD_GROUP_CASE(ch)                                \
    989       break;                                                  \
    990     case ch:
    991 #define KEYWORD(keyword, token)                               \
    992     {                                                         \
    993       /* 'keyword' is a char array, so sizeof(keyword) is */  \
    994       /* strlen(keyword) plus 1 for the NUL char. */          \
    995       const int keyword_length = sizeof(keyword) - 1;         \
    996       STATIC_ASSERT(keyword_length >= kMinLength);            \
    997       STATIC_ASSERT(keyword_length <= kMaxLength);            \
    998       if (input_length == keyword_length &&                   \
    999           input[1] == keyword[1] &&                           \
   1000           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
   1001           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
   1002           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
   1003           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
   1004           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
   1005           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
   1006           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
   1007           (keyword_length <= 9 || input[9] == keyword[9])) {  \
   1008         return token;                                         \
   1009       }                                                       \
   1010     }
   1011     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
   1012   }
   1013   return Token::IDENTIFIER;
   1014 }
   1015 
   1016 
   1017 bool Scanner::IdentifierIsFutureStrictReserved(
   1018     const AstRawString* string) const {
   1019   // Keywords are always 1-byte strings.
   1020   return string->is_one_byte() &&
   1021          Token::FUTURE_STRICT_RESERVED_WORD ==
   1022              KeywordOrIdentifierToken(string->raw_data(), string->length(),
   1023                                       harmony_scoping_, harmony_modules_,
   1024                                       harmony_classes_);
   1025 }
   1026 
   1027 
   1028 Token::Value Scanner::ScanIdentifierOrKeyword() {
   1029   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
   1030   LiteralScope literal(this);
   1031   // Scan identifier start character.
   1032   if (c0_ == '\\') {
   1033     uc32 c = ScanIdentifierUnicodeEscape();
   1034     // Only allow legal identifier start characters.
   1035     if (c < 0 ||
   1036         c == '\\' ||  // No recursive escapes.
   1037         !unicode_cache_->IsIdentifierStart(c)) {
   1038       return Token::ILLEGAL;
   1039     }
   1040     AddLiteralChar(c);
   1041     return ScanIdentifierSuffix(&literal);
   1042   }
   1043 
   1044   uc32 first_char = c0_;
   1045   Advance();
   1046   AddLiteralChar(first_char);
   1047 
   1048   // Scan the rest of the identifier characters.
   1049   while (unicode_cache_->IsIdentifierPart(c0_)) {
   1050     if (c0_ != '\\') {
   1051       uc32 next_char = c0_;
   1052       Advance();
   1053       AddLiteralChar(next_char);
   1054       continue;
   1055     }
   1056     // Fallthrough if no longer able to complete keyword.
   1057     return ScanIdentifierSuffix(&literal);
   1058   }
   1059 
   1060   literal.Complete();
   1061 
   1062   if (next_.literal_chars->is_one_byte()) {
   1063     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
   1064     return KeywordOrIdentifierToken(chars.start(),
   1065                                     chars.length(),
   1066                                     harmony_scoping_,
   1067                                     harmony_modules_,
   1068                                     harmony_classes_);
   1069   }
   1070 
   1071   return Token::IDENTIFIER;
   1072 }
   1073 
   1074 
   1075 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
   1076   // Scan the rest of the identifier characters.
   1077   while (unicode_cache_->IsIdentifierPart(c0_)) {
   1078     if (c0_ == '\\') {
   1079       uc32 c = ScanIdentifierUnicodeEscape();
   1080       // Only allow legal identifier part characters.
   1081       if (c < 0 ||
   1082           c == '\\' ||
   1083           !unicode_cache_->IsIdentifierPart(c)) {
   1084         return Token::ILLEGAL;
   1085       }
   1086       AddLiteralChar(c);
   1087     } else {
   1088       AddLiteralChar(c0_);
   1089       Advance();
   1090     }
   1091   }
   1092   literal->Complete();
   1093 
   1094   return Token::IDENTIFIER;
   1095 }
   1096 
   1097 
   1098 bool Scanner::ScanRegExpPattern(bool seen_equal) {
   1099   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
   1100   bool in_character_class = false;
   1101 
   1102   // Previous token is either '/' or '/=', in the second case, the
   1103   // pattern starts at =.
   1104   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
   1105   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
   1106 
   1107   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
   1108   // the scanner should pass uninterpreted bodies to the RegExp
   1109   // constructor.
   1110   LiteralScope literal(this);
   1111   if (seen_equal) {
   1112     AddLiteralChar('=');
   1113   }
   1114 
   1115   while (c0_ != '/' || in_character_class) {
   1116     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1117     if (c0_ == '\\') {  // Escape sequence.
   1118       AddLiteralCharAdvance();
   1119       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
   1120       AddLiteralCharAdvance();
   1121       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
   1122       // only "safe" characters are allowed (letters, digits, underscore),
   1123       // otherwise the escape isn't valid and the invalid character has
   1124       // its normal meaning. I.e., we can just continue scanning without
   1125       // worrying whether the following characters are part of the escape
   1126       // or not, since any '/', '\\' or '[' is guaranteed to not be part
   1127       // of the escape sequence.
   1128 
   1129       // TODO(896): At some point, parse RegExps more throughly to capture
   1130       // octal esacpes in strict mode.
   1131     } else {  // Unescaped character.
   1132       if (c0_ == '[') in_character_class = true;
   1133       if (c0_ == ']') in_character_class = false;
   1134       AddLiteralCharAdvance();
   1135     }
   1136   }
   1137   Advance();  // consume '/'
   1138 
   1139   literal.Complete();
   1140 
   1141   return true;
   1142 }
   1143 
   1144 
   1145 bool Scanner::ScanLiteralUnicodeEscape() {
   1146   DCHECK(c0_ == '\\');
   1147   uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
   1148   Advance();
   1149   int i = 1;
   1150   if (c0_ == 'u') {
   1151     i++;
   1152     while (i < 6) {
   1153       Advance();
   1154       if (!IsHexDigit(c0_)) break;
   1155       chars_read[i] = c0_;
   1156       i++;
   1157     }
   1158   }
   1159   if (i < 6) {
   1160     // Incomplete escape. Undo all advances and return false.
   1161     while (i > 0) {
   1162       i--;
   1163       PushBack(chars_read[i]);
   1164     }
   1165     return false;
   1166   }
   1167   // Complete escape. Add all chars to current literal buffer.
   1168   for (int i = 0; i < 6; i++) {
   1169     AddLiteralChar(chars_read[i]);
   1170   }
   1171   return true;
   1172 }
   1173 
   1174 
   1175 bool Scanner::ScanRegExpFlags() {
   1176   // Scan regular expression flags.
   1177   LiteralScope literal(this);
   1178   while (unicode_cache_->IsIdentifierPart(c0_)) {
   1179     if (c0_ != '\\') {
   1180       AddLiteralCharAdvance();
   1181     } else {
   1182       if (!ScanLiteralUnicodeEscape()) {
   1183         break;
   1184       }
   1185       Advance();
   1186     }
   1187   }
   1188   literal.Complete();
   1189 
   1190   next_.location.end_pos = source_pos() - 1;
   1191   return true;
   1192 }
   1193 
   1194 
   1195 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
   1196   if (is_literal_one_byte()) {
   1197     return ast_value_factory->GetOneByteString(literal_one_byte_string());
   1198   }
   1199   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
   1200 }
   1201 
   1202 
   1203 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
   1204   if (is_next_literal_one_byte()) {
   1205     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
   1206   }
   1207   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
   1208 }
   1209 
   1210 
   1211 double Scanner::DoubleValue() {
   1212   DCHECK(is_literal_one_byte());
   1213   return StringToDouble(
   1214       unicode_cache_,
   1215       literal_one_byte_string(),
   1216       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
   1217 }
   1218 
   1219 
   1220 int Scanner::FindNumber(DuplicateFinder* finder, int value) {
   1221   return finder->AddNumber(literal_one_byte_string(), value);
   1222 }
   1223 
   1224 
   1225 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
   1226   if (is_literal_one_byte()) {
   1227     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
   1228   }
   1229   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
   1230 }
   1231 
   1232 
   1233 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
   1234   return AddSymbol(key, true, value);
   1235 }
   1236 
   1237 
   1238 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
   1239   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
   1240 }
   1241 
   1242 
   1243 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
   1244                                bool is_one_byte,
   1245                                int value) {
   1246   uint32_t hash = Hash(key, is_one_byte);
   1247   byte* encoding = BackupKey(key, is_one_byte);
   1248   HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
   1249   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
   1250   entry->value =
   1251     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
   1252   return old_value;
   1253 }
   1254 
   1255 
   1256 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
   1257   DCHECK(key.length() > 0);
   1258   // Quick check for already being in canonical form.
   1259   if (IsNumberCanonical(key)) {
   1260     return AddOneByteSymbol(key, value);
   1261   }
   1262 
   1263   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
   1264   double double_value = StringToDouble(
   1265       unicode_constants_, key, flags, 0.0);
   1266   int length;
   1267   const char* string;
   1268   if (!std::isfinite(double_value)) {
   1269     string = "Infinity";
   1270     length = 8;  // strlen("Infinity");
   1271   } else {
   1272     string = DoubleToCString(double_value,
   1273                              Vector<char>(number_buffer_, kBufferSize));
   1274     length = StrLength(string);
   1275   }
   1276   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
   1277                                       length), true, value);
   1278 }
   1279 
   1280 
   1281 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
   1282   // Test for a safe approximation of number literals that are already
   1283   // in canonical form: max 15 digits, no leading zeroes, except an
   1284   // integer part that is a single zero, and no trailing zeros below
   1285   // the decimal point.
   1286   int pos = 0;
   1287   int length = number.length();
   1288   if (number.length() > 15) return false;
   1289   if (number[pos] == '0') {
   1290     pos++;
   1291   } else {
   1292     while (pos < length &&
   1293            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
   1294   }
   1295   if (length == pos) return true;
   1296   if (number[pos] != '.') return false;
   1297   pos++;
   1298   bool invalid_last_digit = true;
   1299   while (pos < length) {
   1300     uint8_t digit = number[pos] - '0';
   1301     if (digit > '9' - '0') return false;
   1302     invalid_last_digit = (digit == 0);
   1303     pos++;
   1304   }
   1305   return !invalid_last_digit;
   1306 }
   1307 
   1308 
   1309 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
   1310   // Primitive hash function, almost identical to the one used
   1311   // for strings (except that it's seeded by the length and representation).
   1312   int length = key.length();
   1313   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0) ;
   1314   for (int i = 0; i < length; i++) {
   1315     uint32_t c = key[i];
   1316     hash = (hash + c) * 1025;
   1317     hash ^= (hash >> 6);
   1318   }
   1319   return hash;
   1320 }
   1321 
   1322 
   1323 bool DuplicateFinder::Match(void* first, void* second) {
   1324   // Decode lengths.
   1325   // Length + representation is encoded as base 128, most significant heptet
   1326   // first, with a 8th bit being non-zero while there are more heptets.
   1327   // The value encodes the number of bytes following, and whether the original
   1328   // was Latin1.
   1329   byte* s1 = reinterpret_cast<byte*>(first);
   1330   byte* s2 = reinterpret_cast<byte*>(second);
   1331   uint32_t length_one_byte_field = 0;
   1332   byte c1;
   1333   do {
   1334     c1 = *s1;
   1335     if (c1 != *s2) return false;
   1336     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
   1337     s1++;
   1338     s2++;
   1339   } while ((c1 & 0x80) != 0);
   1340   int length = static_cast<int>(length_one_byte_field >> 1);
   1341   return memcmp(s1, s2, length) == 0;
   1342 }
   1343 
   1344 
   1345 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
   1346                                  bool is_one_byte) {
   1347   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
   1348   backing_store_.StartSequence();
   1349   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
   1350   // on the byte of every heptet except the last, least significant, one.
   1351   if (one_byte_length >= (1 << 7)) {
   1352     if (one_byte_length >= (1 << 14)) {
   1353       if (one_byte_length >= (1 << 21)) {
   1354         if (one_byte_length >= (1 << 28)) {
   1355           backing_store_.Add(
   1356               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
   1357         }
   1358         backing_store_.Add(
   1359             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
   1360       }
   1361       backing_store_.Add(
   1362           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
   1363     }
   1364     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
   1365   }
   1366   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
   1367 
   1368   backing_store_.AddBlock(bytes);
   1369   return backing_store_.EndSequence().start();
   1370 }
   1371 
   1372 } }  // namespace v8::internal
   1373