Home | History | Annotate | Download | only in regexp
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/regexp/regexp-parser.h"
      6 
      7 #include "src/char-predicates-inl.h"
      8 #include "src/factory.h"
      9 #include "src/isolate.h"
     10 #include "src/objects-inl.h"
     11 #include "src/ostreams.h"
     12 #include "src/regexp/jsregexp.h"
     13 #include "src/utils.h"
     14 
     15 #ifdef V8_I18N_SUPPORT
     16 #include "unicode/uniset.h"
     17 #endif  // V8_I18N_SUPPORT
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
     23                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
     24     : isolate_(isolate),
     25       zone_(zone),
     26       error_(error),
     27       captures_(NULL),
     28       named_captures_(NULL),
     29       named_back_references_(NULL),
     30       in_(in),
     31       current_(kEndMarker),
     32       ignore_case_(flags & JSRegExp::kIgnoreCase),
     33       multiline_(flags & JSRegExp::kMultiline),
     34       unicode_(flags & JSRegExp::kUnicode),
     35       next_pos_(0),
     36       captures_started_(0),
     37       capture_count_(0),
     38       has_more_(true),
     39       simple_(false),
     40       contains_anchor_(false),
     41       is_scanned_for_captures_(false),
     42       failed_(false) {
     43   Advance();
     44 }
     45 
     46 template <bool update_position>
     47 inline uc32 RegExpParser::ReadNext() {
     48   int position = next_pos_;
     49   uc32 c0 = in()->Get(position);
     50   position++;
     51   // Read the whole surrogate pair in case of unicode flag, if possible.
     52   if (unicode() && position < in()->length() &&
     53       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
     54     uc16 c1 = in()->Get(position);
     55     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
     56       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
     57       position++;
     58     }
     59   }
     60   if (update_position) next_pos_ = position;
     61   return c0;
     62 }
     63 
     64 
     65 uc32 RegExpParser::Next() {
     66   if (has_next()) {
     67     return ReadNext<false>();
     68   } else {
     69     return kEndMarker;
     70   }
     71 }
     72 
     73 
     74 void RegExpParser::Advance() {
     75   if (has_next()) {
     76     StackLimitCheck check(isolate());
     77     if (check.HasOverflowed()) {
     78       if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
     79       ReportError(CStrVector(
     80           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
     81     } else if (zone()->excess_allocation()) {
     82       ReportError(CStrVector("Regular expression too large"));
     83     } else {
     84       current_ = ReadNext<true>();
     85     }
     86   } else {
     87     current_ = kEndMarker;
     88     // Advance so that position() points to 1-after-the-last-character. This is
     89     // important so that Reset() to this position works correctly.
     90     next_pos_ = in()->length() + 1;
     91     has_more_ = false;
     92   }
     93 }
     94 
     95 
     96 void RegExpParser::Reset(int pos) {
     97   next_pos_ = pos;
     98   has_more_ = (pos < in()->length());
     99   Advance();
    100 }
    101 
    102 
    103 void RegExpParser::Advance(int dist) {
    104   next_pos_ += dist - 1;
    105   Advance();
    106 }
    107 
    108 
    109 bool RegExpParser::simple() { return simple_; }
    110 
    111 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
    112   switch (c) {
    113     case '^':
    114     case '$':
    115     case '\\':
    116     case '.':
    117     case '*':
    118     case '+':
    119     case '?':
    120     case '(':
    121     case ')':
    122     case '[':
    123     case ']':
    124     case '{':
    125     case '}':
    126     case '|':
    127     case '/':
    128       return true;
    129     default:
    130       break;
    131   }
    132   return false;
    133 }
    134 
    135 
    136 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
    137   if (failed_) return NULL;  // Do not overwrite any existing error.
    138   failed_ = true;
    139   *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
    140   // Zip to the end to make sure the no more input is read.
    141   current_ = kEndMarker;
    142   next_pos_ = in()->length();
    143   return NULL;
    144 }
    145 
    146 
    147 #define CHECK_FAILED /**/); \
    148   if (failed_) return NULL; \
    149   ((void)0
    150 
    151 
    152 // Pattern ::
    153 //   Disjunction
    154 RegExpTree* RegExpParser::ParsePattern() {
    155   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
    156   PatchNamedBackReferences(CHECK_FAILED);
    157   DCHECK(!has_more());
    158   // If the result of parsing is a literal string atom, and it has the
    159   // same length as the input, then the atom is identical to the input.
    160   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
    161     simple_ = true;
    162   }
    163   return result;
    164 }
    165 
    166 
    167 // Disjunction ::
    168 //   Alternative
    169 //   Alternative | Disjunction
    170 // Alternative ::
    171 //   [empty]
    172 //   Term Alternative
    173 // Term ::
    174 //   Assertion
    175 //   Atom
    176 //   Atom Quantifier
    177 RegExpTree* RegExpParser::ParseDisjunction() {
    178   // Used to store current state while parsing subexpressions.
    179   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
    180                                   nullptr, ignore_case(), unicode(), zone());
    181   RegExpParserState* state = &initial_state;
    182   // Cache the builder in a local variable for quick access.
    183   RegExpBuilder* builder = initial_state.builder();
    184   while (true) {
    185     switch (current()) {
    186       case kEndMarker:
    187         if (state->IsSubexpression()) {
    188           // Inside a parenthesized group when hitting end of input.
    189           return ReportError(CStrVector("Unterminated group"));
    190         }
    191         DCHECK_EQ(INITIAL, state->group_type());
    192         // Parsing completed successfully.
    193         return builder->ToRegExp();
    194       case ')': {
    195         if (!state->IsSubexpression()) {
    196           return ReportError(CStrVector("Unmatched ')'"));
    197         }
    198         DCHECK_NE(INITIAL, state->group_type());
    199 
    200         Advance();
    201         // End disjunction parsing and convert builder content to new single
    202         // regexp atom.
    203         RegExpTree* body = builder->ToRegExp();
    204 
    205         int end_capture_index = captures_started();
    206 
    207         int capture_index = state->capture_index();
    208         SubexpressionType group_type = state->group_type();
    209 
    210         // Build result of subexpression.
    211         if (group_type == CAPTURE) {
    212           if (state->IsNamedCapture()) {
    213             CreateNamedCaptureAtIndex(state->capture_name(),
    214                                       capture_index CHECK_FAILED);
    215           }
    216           RegExpCapture* capture = GetCapture(capture_index);
    217           capture->set_body(body);
    218           body = capture;
    219         } else if (group_type == GROUPING) {
    220           body = new (zone()) RegExpGroup(body);
    221         } else {
    222           DCHECK(group_type == POSITIVE_LOOKAROUND ||
    223                  group_type == NEGATIVE_LOOKAROUND);
    224           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
    225           body = new (zone()) RegExpLookaround(
    226               body, is_positive, end_capture_index - capture_index,
    227               capture_index, state->lookaround_type());
    228         }
    229 
    230         // Restore previous state.
    231         state = state->previous_state();
    232         builder = state->builder();
    233 
    234         builder->AddAtom(body);
    235         // For compatability with JSC and ES3, we allow quantifiers after
    236         // lookaheads, and break in all cases.
    237         break;
    238       }
    239       case '|': {
    240         Advance();
    241         builder->NewAlternative();
    242         continue;
    243       }
    244       case '*':
    245       case '+':
    246       case '?':
    247         return ReportError(CStrVector("Nothing to repeat"));
    248       case '^': {
    249         Advance();
    250         if (multiline()) {
    251           builder->AddAssertion(
    252               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
    253         } else {
    254           builder->AddAssertion(
    255               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
    256           set_contains_anchor();
    257         }
    258         continue;
    259       }
    260       case '$': {
    261         Advance();
    262         RegExpAssertion::AssertionType assertion_type =
    263             multiline() ? RegExpAssertion::END_OF_LINE
    264                         : RegExpAssertion::END_OF_INPUT;
    265         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
    266         continue;
    267       }
    268       case '.': {
    269         Advance();
    270         // everything except \x0a, \x0d, \u2028 and \u2029
    271         ZoneList<CharacterRange>* ranges =
    272             new (zone()) ZoneList<CharacterRange>(2, zone());
    273         CharacterRange::AddClassEscape('.', ranges, zone());
    274         RegExpCharacterClass* cc =
    275             new (zone()) RegExpCharacterClass(ranges, false);
    276         builder->AddCharacterClass(cc);
    277         break;
    278       }
    279       case '(': {
    280         SubexpressionType subexpr_type = CAPTURE;
    281         RegExpLookaround::Type lookaround_type = state->lookaround_type();
    282         bool is_named_capture = false;
    283         Advance();
    284         if (current() == '?') {
    285           switch (Next()) {
    286             case ':':
    287               subexpr_type = GROUPING;
    288               Advance(2);
    289               break;
    290             case '=':
    291               lookaround_type = RegExpLookaround::LOOKAHEAD;
    292               subexpr_type = POSITIVE_LOOKAROUND;
    293               Advance(2);
    294               break;
    295             case '!':
    296               lookaround_type = RegExpLookaround::LOOKAHEAD;
    297               subexpr_type = NEGATIVE_LOOKAROUND;
    298               Advance(2);
    299               break;
    300             case '<':
    301               Advance();
    302               if (FLAG_harmony_regexp_lookbehind) {
    303                 if (Next() == '=') {
    304                   subexpr_type = POSITIVE_LOOKAROUND;
    305                   lookaround_type = RegExpLookaround::LOOKBEHIND;
    306                   Advance(2);
    307                   break;
    308                 } else if (Next() == '!') {
    309                   subexpr_type = NEGATIVE_LOOKAROUND;
    310                   lookaround_type = RegExpLookaround::LOOKBEHIND;
    311                   Advance(2);
    312                   break;
    313                 }
    314               }
    315               if (FLAG_harmony_regexp_named_captures && unicode()) {
    316                 is_named_capture = true;
    317                 Advance();
    318                 break;
    319               }
    320             // Fall through.
    321             default:
    322               return ReportError(CStrVector("Invalid group"));
    323           }
    324         }
    325 
    326         const ZoneVector<uc16>* capture_name = nullptr;
    327         if (subexpr_type == CAPTURE) {
    328           if (captures_started_ >= kMaxCaptures) {
    329             return ReportError(CStrVector("Too many captures"));
    330           }
    331           captures_started_++;
    332 
    333           if (is_named_capture) {
    334             capture_name = ParseCaptureGroupName(CHECK_FAILED);
    335           }
    336         }
    337         // Store current state and begin new disjunction parsing.
    338         state = new (zone()) RegExpParserState(
    339             state, subexpr_type, lookaround_type, captures_started_,
    340             capture_name, ignore_case(), unicode(), zone());
    341         builder = state->builder();
    342         continue;
    343       }
    344       case '[': {
    345         RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
    346         builder->AddCharacterClass(cc->AsCharacterClass());
    347         break;
    348       }
    349       // Atom ::
    350       //   \ AtomEscape
    351       case '\\':
    352         switch (Next()) {
    353           case kEndMarker:
    354             return ReportError(CStrVector("\\ at end of pattern"));
    355           case 'b':
    356             Advance(2);
    357             builder->AddAssertion(
    358                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
    359             continue;
    360           case 'B':
    361             Advance(2);
    362             builder->AddAssertion(
    363                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
    364             continue;
    365           // AtomEscape ::
    366           //   CharacterClassEscape
    367           //
    368           // CharacterClassEscape :: one of
    369           //   d D s S w W
    370           case 'd':
    371           case 'D':
    372           case 's':
    373           case 'S':
    374           case 'w':
    375           case 'W': {
    376             uc32 c = Next();
    377             Advance(2);
    378             ZoneList<CharacterRange>* ranges =
    379                 new (zone()) ZoneList<CharacterRange>(2, zone());
    380             CharacterRange::AddClassEscape(c, ranges, zone());
    381             RegExpCharacterClass* cc =
    382                 new (zone()) RegExpCharacterClass(ranges, false);
    383             builder->AddCharacterClass(cc);
    384             break;
    385           }
    386           case 'p':
    387           case 'P': {
    388             uc32 p = Next();
    389             Advance(2);
    390             if (unicode()) {
    391               if (FLAG_harmony_regexp_property) {
    392                 ZoneList<CharacterRange>* ranges =
    393                     new (zone()) ZoneList<CharacterRange>(2, zone());
    394                 if (!ParsePropertyClass(ranges, p == 'P')) {
    395                   return ReportError(CStrVector("Invalid property name"));
    396                 }
    397                 RegExpCharacterClass* cc =
    398                     new (zone()) RegExpCharacterClass(ranges, false);
    399                 builder->AddCharacterClass(cc);
    400               } else {
    401                 // With /u, no identity escapes except for syntax characters
    402                 // are allowed. Otherwise, all identity escapes are allowed.
    403                 return ReportError(CStrVector("Invalid escape"));
    404               }
    405             } else {
    406               builder->AddCharacter(p);
    407             }
    408             break;
    409           }
    410           case '1':
    411           case '2':
    412           case '3':
    413           case '4':
    414           case '5':
    415           case '6':
    416           case '7':
    417           case '8':
    418           case '9': {
    419             int index = 0;
    420             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
    421             if (is_backref) {
    422               if (state->IsInsideCaptureGroup(index)) {
    423                 // The back reference is inside the capture group it refers to.
    424                 // Nothing can possibly have been captured yet, so we use empty
    425                 // instead. This ensures that, when checking a back reference,
    426                 // the capture registers of the referenced capture are either
    427                 // both set or both cleared.
    428                 builder->AddEmpty();
    429               } else {
    430                 RegExpCapture* capture = GetCapture(index);
    431                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
    432                 builder->AddAtom(atom);
    433               }
    434               break;
    435             }
    436             // With /u, no identity escapes except for syntax characters
    437             // are allowed. Otherwise, all identity escapes are allowed.
    438             if (unicode()) {
    439               return ReportError(CStrVector("Invalid escape"));
    440             }
    441             uc32 first_digit = Next();
    442             if (first_digit == '8' || first_digit == '9') {
    443               builder->AddCharacter(first_digit);
    444               Advance(2);
    445               break;
    446             }
    447           }
    448           // Fall through.
    449           case '0': {
    450             Advance();
    451             if (unicode() && Next() >= '0' && Next() <= '9') {
    452               // With /u, decimal escape with leading 0 are not parsed as octal.
    453               return ReportError(CStrVector("Invalid decimal escape"));
    454             }
    455             uc32 octal = ParseOctalLiteral();
    456             builder->AddCharacter(octal);
    457             break;
    458           }
    459           // ControlEscape :: one of
    460           //   f n r t v
    461           case 'f':
    462             Advance(2);
    463             builder->AddCharacter('\f');
    464             break;
    465           case 'n':
    466             Advance(2);
    467             builder->AddCharacter('\n');
    468             break;
    469           case 'r':
    470             Advance(2);
    471             builder->AddCharacter('\r');
    472             break;
    473           case 't':
    474             Advance(2);
    475             builder->AddCharacter('\t');
    476             break;
    477           case 'v':
    478             Advance(2);
    479             builder->AddCharacter('\v');
    480             break;
    481           case 'c': {
    482             Advance();
    483             uc32 controlLetter = Next();
    484             // Special case if it is an ASCII letter.
    485             // Convert lower case letters to uppercase.
    486             uc32 letter = controlLetter & ~('a' ^ 'A');
    487             if (letter < 'A' || 'Z' < letter) {
    488               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
    489               // This is outside the specification. We match JSC in
    490               // reading the backslash as a literal character instead
    491               // of as starting an escape.
    492               if (unicode()) {
    493                 // With /u, invalid escapes are not treated as identity escapes.
    494                 return ReportError(CStrVector("Invalid unicode escape"));
    495               }
    496               builder->AddCharacter('\\');
    497             } else {
    498               Advance(2);
    499               builder->AddCharacter(controlLetter & 0x1f);
    500             }
    501             break;
    502           }
    503           case 'x': {
    504             Advance(2);
    505             uc32 value;
    506             if (ParseHexEscape(2, &value)) {
    507               builder->AddCharacter(value);
    508             } else if (!unicode()) {
    509               builder->AddCharacter('x');
    510             } else {
    511               // With /u, invalid escapes are not treated as identity escapes.
    512               return ReportError(CStrVector("Invalid escape"));
    513             }
    514             break;
    515           }
    516           case 'u': {
    517             Advance(2);
    518             uc32 value;
    519             if (ParseUnicodeEscape(&value)) {
    520               builder->AddEscapedUnicodeCharacter(value);
    521             } else if (!unicode()) {
    522               builder->AddCharacter('u');
    523             } else {
    524               // With /u, invalid escapes are not treated as identity escapes.
    525               return ReportError(CStrVector("Invalid unicode escape"));
    526             }
    527             break;
    528           }
    529           case 'k':
    530             if (FLAG_harmony_regexp_named_captures && unicode()) {
    531               Advance(2);
    532               ParseNamedBackReference(builder, state CHECK_FAILED);
    533               break;
    534             }
    535           // Fall through.
    536           default:
    537             Advance();
    538             // With /u, no identity escapes except for syntax characters
    539             // are allowed. Otherwise, all identity escapes are allowed.
    540             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
    541               builder->AddCharacter(current());
    542               Advance();
    543             } else {
    544               return ReportError(CStrVector("Invalid escape"));
    545             }
    546             break;
    547         }
    548         break;
    549       case '{': {
    550         int dummy;
    551         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
    552         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
    553         // Fall through.
    554       }
    555       case '}':
    556       case ']':
    557         if (unicode()) {
    558           return ReportError(CStrVector("Lone quantifier brackets"));
    559         }
    560       // Fall through.
    561       default:
    562         builder->AddUnicodeCharacter(current());
    563         Advance();
    564         break;
    565     }  // end switch(current())
    566 
    567     int min;
    568     int max;
    569     switch (current()) {
    570       // QuantifierPrefix ::
    571       //   *
    572       //   +
    573       //   ?
    574       //   {
    575       case '*':
    576         min = 0;
    577         max = RegExpTree::kInfinity;
    578         Advance();
    579         break;
    580       case '+':
    581         min = 1;
    582         max = RegExpTree::kInfinity;
    583         Advance();
    584         break;
    585       case '?':
    586         min = 0;
    587         max = 1;
    588         Advance();
    589         break;
    590       case '{':
    591         if (ParseIntervalQuantifier(&min, &max)) {
    592           if (max < min) {
    593             return ReportError(
    594                 CStrVector("numbers out of order in {} quantifier"));
    595           }
    596           break;
    597         } else if (unicode()) {
    598           // With /u, incomplete quantifiers are not allowed.
    599           return ReportError(CStrVector("Incomplete quantifier"));
    600         }
    601         continue;
    602       default:
    603         continue;
    604     }
    605     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
    606     if (current() == '?') {
    607       quantifier_type = RegExpQuantifier::NON_GREEDY;
    608       Advance();
    609     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
    610       // FLAG_regexp_possessive_quantifier is a debug-only flag.
    611       quantifier_type = RegExpQuantifier::POSSESSIVE;
    612       Advance();
    613     }
    614     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
    615       return ReportError(CStrVector("Invalid quantifier"));
    616     }
    617   }
    618 }
    619 
    620 
    621 #ifdef DEBUG
    622 // Currently only used in an DCHECK.
    623 static bool IsSpecialClassEscape(uc32 c) {
    624   switch (c) {
    625     case 'd':
    626     case 'D':
    627     case 's':
    628     case 'S':
    629     case 'w':
    630     case 'W':
    631       return true;
    632     default:
    633       return false;
    634   }
    635 }
    636 #endif
    637 
    638 
    639 // In order to know whether an escape is a backreference or not we have to scan
    640 // the entire regexp and find the number of capturing parentheses.  However we
    641 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
    642 // is called when needed.  It can see the difference between capturing and
    643 // noncapturing parentheses and can skip character classes and backslash-escaped
    644 // characters.
    645 void RegExpParser::ScanForCaptures() {
    646   // Start with captures started previous to current position
    647   int capture_count = captures_started();
    648   // Add count of captures after this position.
    649   int n;
    650   while ((n = current()) != kEndMarker) {
    651     Advance();
    652     switch (n) {
    653       case '\\':
    654         Advance();
    655         break;
    656       case '[': {
    657         int c;
    658         while ((c = current()) != kEndMarker) {
    659           Advance();
    660           if (c == '\\') {
    661             Advance();
    662           } else {
    663             if (c == ']') break;
    664           }
    665         }
    666         break;
    667       }
    668       case '(':
    669         if (current() != '?') capture_count++;
    670         break;
    671     }
    672   }
    673   capture_count_ = capture_count;
    674   is_scanned_for_captures_ = true;
    675 }
    676 
    677 
    678 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
    679   DCHECK_EQ('\\', current());
    680   DCHECK('1' <= Next() && Next() <= '9');
    681   // Try to parse a decimal literal that is no greater than the total number
    682   // of left capturing parentheses in the input.
    683   int start = position();
    684   int value = Next() - '0';
    685   Advance(2);
    686   while (true) {
    687     uc32 c = current();
    688     if (IsDecimalDigit(c)) {
    689       value = 10 * value + (c - '0');
    690       if (value > kMaxCaptures) {
    691         Reset(start);
    692         return false;
    693       }
    694       Advance();
    695     } else {
    696       break;
    697     }
    698   }
    699   if (value > captures_started()) {
    700     if (!is_scanned_for_captures_) {
    701       int saved_position = position();
    702       ScanForCaptures();
    703       Reset(saved_position);
    704     }
    705     if (value > capture_count_) {
    706       Reset(start);
    707       return false;
    708     }
    709   }
    710   *index_out = value;
    711   return true;
    712 }
    713 
    714 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
    715   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
    716     v->push_back(code_unit);
    717   } else {
    718     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
    719     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
    720   }
    721 }
    722 
    723 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
    724   DCHECK(FLAG_harmony_regexp_named_captures);
    725   DCHECK(unicode());
    726 
    727   ZoneVector<uc16>* name =
    728       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
    729 
    730   bool at_start = true;
    731   while (true) {
    732     uc32 c = current();
    733     Advance();
    734 
    735     // Convert unicode escapes.
    736     if (c == '\\' && current() == 'u') {
    737       Advance();
    738       if (!ParseUnicodeEscape(&c)) {
    739         ReportError(CStrVector("Invalid Unicode escape sequence"));
    740         return nullptr;
    741       }
    742     }
    743 
    744     if (at_start) {
    745       if (!IdentifierStart::Is(c)) {
    746         ReportError(CStrVector("Invalid capture group name"));
    747         return nullptr;
    748       }
    749       push_code_unit(name, c);
    750       at_start = false;
    751     } else {
    752       if (c == '>') {
    753         break;
    754       } else if (IdentifierPart::Is(c)) {
    755         push_code_unit(name, c);
    756       } else {
    757         ReportError(CStrVector("Invalid capture group name"));
    758         return nullptr;
    759       }
    760     }
    761   }
    762 
    763   return name;
    764 }
    765 
    766 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
    767                                              int index) {
    768   DCHECK(FLAG_harmony_regexp_named_captures);
    769   DCHECK(unicode());
    770   DCHECK(0 < index && index <= captures_started_);
    771   DCHECK_NOT_NULL(name);
    772 
    773   if (named_captures_ == nullptr) {
    774     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
    775   } else {
    776     // Check for duplicates and bail if we find any.
    777     for (const auto& named_capture : *named_captures_) {
    778       if (*named_capture->name() == *name) {
    779         ReportError(CStrVector("Duplicate capture group name"));
    780         return false;
    781       }
    782     }
    783   }
    784 
    785   RegExpCapture* capture = GetCapture(index);
    786   DCHECK(capture->name() == nullptr);
    787 
    788   capture->set_name(name);
    789   named_captures_->Add(capture, zone());
    790 
    791   return true;
    792 }
    793 
    794 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
    795                                            RegExpParserState* state) {
    796   // The parser is assumed to be on the '<' in \k<name>.
    797   if (current() != '<') {
    798     ReportError(CStrVector("Invalid named reference"));
    799     return false;
    800   }
    801 
    802   Advance();
    803   const ZoneVector<uc16>* name = ParseCaptureGroupName();
    804   if (name == nullptr) {
    805     return false;
    806   }
    807 
    808   if (state->IsInsideCaptureGroup(name)) {
    809     builder->AddEmpty();
    810   } else {
    811     RegExpBackReference* atom = new (zone()) RegExpBackReference();
    812     atom->set_name(name);
    813 
    814     builder->AddAtom(atom);
    815 
    816     if (named_back_references_ == nullptr) {
    817       named_back_references_ =
    818           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
    819     }
    820     named_back_references_->Add(atom, zone());
    821   }
    822 
    823   return true;
    824 }
    825 
    826 void RegExpParser::PatchNamedBackReferences() {
    827   if (named_back_references_ == nullptr) return;
    828 
    829   if (named_captures_ == nullptr) {
    830     ReportError(CStrVector("Invalid named capture referenced"));
    831     return;
    832   }
    833 
    834   // Look up and patch the actual capture for each named back reference.
    835   // TODO(jgruber): O(n^2), optimize if necessary.
    836 
    837   for (int i = 0; i < named_back_references_->length(); i++) {
    838     RegExpBackReference* ref = named_back_references_->at(i);
    839 
    840     int index = -1;
    841     for (const auto& capture : *named_captures_) {
    842       if (*capture->name() == *ref->name()) {
    843         index = capture->index();
    844         break;
    845       }
    846     }
    847 
    848     if (index == -1) {
    849       ReportError(CStrVector("Invalid named capture referenced"));
    850       return;
    851     }
    852 
    853     ref->set_capture(GetCapture(index));
    854   }
    855 }
    856 
    857 RegExpCapture* RegExpParser::GetCapture(int index) {
    858   // The index for the capture groups are one-based. Its index in the list is
    859   // zero-based.
    860   int know_captures =
    861       is_scanned_for_captures_ ? capture_count_ : captures_started_;
    862   DCHECK(index <= know_captures);
    863   if (captures_ == NULL) {
    864     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
    865   }
    866   while (captures_->length() < know_captures) {
    867     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
    868   }
    869   return captures_->at(index - 1);
    870 }
    871 
    872 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
    873   if (named_captures_ == nullptr || named_captures_->is_empty())
    874     return Handle<FixedArray>();
    875 
    876   Factory* factory = isolate()->factory();
    877 
    878   int len = named_captures_->length() * 2;
    879   Handle<FixedArray> array = factory->NewFixedArray(len);
    880 
    881   for (int i = 0; i < named_captures_->length(); i++) {
    882     RegExpCapture* capture = named_captures_->at(i);
    883     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
    884     array->set(i * 2, *name.ToHandleChecked());
    885     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
    886   }
    887 
    888   return array;
    889 }
    890 
    891 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
    892   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
    893     if (s->group_type() != CAPTURE) continue;
    894     // Return true if we found the matching capture index.
    895     if (index == s->capture_index()) return true;
    896     // Abort if index is larger than what has been parsed up till this state.
    897     if (index > s->capture_index()) return false;
    898   }
    899   return false;
    900 }
    901 
    902 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
    903     const ZoneVector<uc16>* name) {
    904   DCHECK_NOT_NULL(name);
    905   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
    906     if (s->capture_name() == nullptr) continue;
    907     if (*s->capture_name() == *name) return true;
    908   }
    909   return false;
    910 }
    911 
    912 // QuantifierPrefix ::
    913 //   { DecimalDigits }
    914 //   { DecimalDigits , }
    915 //   { DecimalDigits , DecimalDigits }
    916 //
    917 // Returns true if parsing succeeds, and set the min_out and max_out
    918 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
    919 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
    920   DCHECK_EQ(current(), '{');
    921   int start = position();
    922   Advance();
    923   int min = 0;
    924   if (!IsDecimalDigit(current())) {
    925     Reset(start);
    926     return false;
    927   }
    928   while (IsDecimalDigit(current())) {
    929     int next = current() - '0';
    930     if (min > (RegExpTree::kInfinity - next) / 10) {
    931       // Overflow. Skip past remaining decimal digits and return -1.
    932       do {
    933         Advance();
    934       } while (IsDecimalDigit(current()));
    935       min = RegExpTree::kInfinity;
    936       break;
    937     }
    938     min = 10 * min + next;
    939     Advance();
    940   }
    941   int max = 0;
    942   if (current() == '}') {
    943     max = min;
    944     Advance();
    945   } else if (current() == ',') {
    946     Advance();
    947     if (current() == '}') {
    948       max = RegExpTree::kInfinity;
    949       Advance();
    950     } else {
    951       while (IsDecimalDigit(current())) {
    952         int next = current() - '0';
    953         if (max > (RegExpTree::kInfinity - next) / 10) {
    954           do {
    955             Advance();
    956           } while (IsDecimalDigit(current()));
    957           max = RegExpTree::kInfinity;
    958           break;
    959         }
    960         max = 10 * max + next;
    961         Advance();
    962       }
    963       if (current() != '}') {
    964         Reset(start);
    965         return false;
    966       }
    967       Advance();
    968     }
    969   } else {
    970     Reset(start);
    971     return false;
    972   }
    973   *min_out = min;
    974   *max_out = max;
    975   return true;
    976 }
    977 
    978 
    979 uc32 RegExpParser::ParseOctalLiteral() {
    980   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
    981   // For compatibility with some other browsers (not all), we parse
    982   // up to three octal digits with a value below 256.
    983   uc32 value = current() - '0';
    984   Advance();
    985   if ('0' <= current() && current() <= '7') {
    986     value = value * 8 + current() - '0';
    987     Advance();
    988     if (value < 32 && '0' <= current() && current() <= '7') {
    989       value = value * 8 + current() - '0';
    990       Advance();
    991     }
    992   }
    993   return value;
    994 }
    995 
    996 
    997 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
    998   int start = position();
    999   uc32 val = 0;
   1000   for (int i = 0; i < length; ++i) {
   1001     uc32 c = current();
   1002     int d = HexValue(c);
   1003     if (d < 0) {
   1004       Reset(start);
   1005       return false;
   1006     }
   1007     val = val * 16 + d;
   1008     Advance();
   1009   }
   1010   *value = val;
   1011   return true;
   1012 }
   1013 
   1014 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
   1015 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
   1016   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
   1017   // allowed). In the latter case, the number of hex digits between { } is
   1018   // arbitrary. \ and u have already been read.
   1019   if (current() == '{' && unicode()) {
   1020     int start = position();
   1021     Advance();
   1022     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
   1023       if (current() == '}') {
   1024         Advance();
   1025         return true;
   1026       }
   1027     }
   1028     Reset(start);
   1029     return false;
   1030   }
   1031   // \u but no {, or \u{...} escapes not allowed.
   1032   bool result = ParseHexEscape(4, value);
   1033   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
   1034       current() == '\\') {
   1035     // Attempt to read trail surrogate.
   1036     int start = position();
   1037     if (Next() == 'u') {
   1038       Advance(2);
   1039       uc32 trail;
   1040       if (ParseHexEscape(4, &trail) &&
   1041           unibrow::Utf16::IsTrailSurrogate(trail)) {
   1042         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
   1043                                                       static_cast<uc16>(trail));
   1044         return true;
   1045       }
   1046     }
   1047     Reset(start);
   1048   }
   1049   return result;
   1050 }
   1051 
   1052 #ifdef V8_I18N_SUPPORT
   1053 
   1054 namespace {
   1055 
   1056 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
   1057   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
   1058   if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
   1059   for (int i = 0;; i++) {
   1060     const char* long_name = u_getPropertyName(
   1061         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
   1062     if (long_name == NULL) break;
   1063     if (strcmp(property_name, long_name) == 0) return true;
   1064   }
   1065   return false;
   1066 }
   1067 
   1068 bool IsExactPropertyValueAlias(const char* property_value_name,
   1069                                UProperty property, int32_t property_value) {
   1070   const char* short_name =
   1071       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
   1072   if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
   1073     return true;
   1074   }
   1075   for (int i = 0;; i++) {
   1076     const char* long_name = u_getPropertyValueName(
   1077         property, property_value,
   1078         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
   1079     if (long_name == NULL) break;
   1080     if (strcmp(property_value_name, long_name) == 0) return true;
   1081   }
   1082   return false;
   1083 }
   1084 
   1085 bool LookupPropertyValueName(UProperty property,
   1086                              const char* property_value_name, bool negate,
   1087                              ZoneList<CharacterRange>* result, Zone* zone) {
   1088   UProperty property_for_lookup = property;
   1089   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
   1090     // For the property Script_Extensions, we have to do the property value
   1091     // name lookup as if the property is Script.
   1092     property_for_lookup = UCHAR_SCRIPT;
   1093   }
   1094   int32_t property_value =
   1095       u_getPropertyValueEnum(property_for_lookup, property_value_name);
   1096   if (property_value == UCHAR_INVALID_CODE) return false;
   1097 
   1098   // We require the property name to match exactly to one of the property value
   1099   // aliases. However, u_getPropertyValueEnum uses loose matching.
   1100   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
   1101                                  property_value)) {
   1102     return false;
   1103   }
   1104 
   1105   UErrorCode ec = U_ZERO_ERROR;
   1106   icu::UnicodeSet set;
   1107   set.applyIntPropertyValue(property, property_value, ec);
   1108   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
   1109 
   1110   if (success) {
   1111     set.removeAllStrings();
   1112     if (negate) set.complement();
   1113     for (int i = 0; i < set.getRangeCount(); i++) {
   1114       result->Add(
   1115           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
   1116           zone);
   1117     }
   1118   }
   1119   return success;
   1120 }
   1121 
   1122 template <size_t N>
   1123 inline bool NameEquals(const char* name, const char (&literal)[N]) {
   1124   return strncmp(name, literal, N + 1) == 0;
   1125 }
   1126 
   1127 bool LookupSpecialPropertyValueName(const char* name,
   1128                                     ZoneList<CharacterRange>* result,
   1129                                     bool negate, Zone* zone) {
   1130   if (NameEquals(name, "Any")) {
   1131     if (!negate) result->Add(CharacterRange::Everything(), zone);
   1132   } else if (NameEquals(name, "ASCII")) {
   1133     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
   1134                        : CharacterRange::Range(0x0, 0x7f),
   1135                 zone);
   1136   } else if (NameEquals(name, "Assigned")) {
   1137     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
   1138                                    !negate, result, zone);
   1139   } else {
   1140     return false;
   1141   }
   1142   return true;
   1143 }
   1144 
   1145 }  // anonymous namespace
   1146 
   1147 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
   1148                                       bool negate) {
   1149   // Parse the property class as follows:
   1150   // - In \p{name}, 'name' is interpreted
   1151   //   - either as a general category property value name.
   1152   //   - or as a binary property name.
   1153   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
   1154   //   and 'value' is interpreted as one of the available property value names.
   1155   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
   1156   // - Loose matching is not applied.
   1157   List<char> first_part;
   1158   List<char> second_part;
   1159   if (current() == '{') {
   1160     // Parse \p{[PropertyName=]PropertyNameValue}
   1161     for (Advance(); current() != '}' && current() != '='; Advance()) {
   1162       if (!has_next()) return false;
   1163       first_part.Add(static_cast<char>(current()));
   1164     }
   1165     if (current() == '=') {
   1166       for (Advance(); current() != '}'; Advance()) {
   1167         if (!has_next()) return false;
   1168         second_part.Add(static_cast<char>(current()));
   1169       }
   1170       second_part.Add(0);  // null-terminate string.
   1171     }
   1172   } else {
   1173     return false;
   1174   }
   1175   Advance();
   1176   first_part.Add(0);  // null-terminate string.
   1177 
   1178   if (second_part.is_empty()) {
   1179     // First attempt to interpret as general category property value name.
   1180     const char* name = first_part.ToConstVector().start();
   1181     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
   1182                                 result, zone())) {
   1183       return true;
   1184     }
   1185     // Interpret "Any", "ASCII", and "Assigned".
   1186     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
   1187       return true;
   1188     }
   1189     // Then attempt to interpret as binary property name with value name 'Y'.
   1190     UProperty property = u_getPropertyEnum(name);
   1191     if (property < UCHAR_BINARY_START) return false;
   1192     if (property >= UCHAR_BINARY_LIMIT) return false;
   1193     if (!IsExactPropertyAlias(name, property)) return false;
   1194     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
   1195                                    zone());
   1196   } else {
   1197     // Both property name and value name are specified. Attempt to interpret
   1198     // the property name as enumerated property.
   1199     const char* property_name = first_part.ToConstVector().start();
   1200     const char* value_name = second_part.ToConstVector().start();
   1201     UProperty property = u_getPropertyEnum(property_name);
   1202     if (!IsExactPropertyAlias(property_name, property)) return false;
   1203     if (property == UCHAR_GENERAL_CATEGORY) {
   1204       // We want to allow aggregate value names such as "Letter".
   1205       property = UCHAR_GENERAL_CATEGORY_MASK;
   1206     } else if (property != UCHAR_SCRIPT &&
   1207                property != UCHAR_SCRIPT_EXTENSIONS) {
   1208       return false;
   1209     }
   1210     return LookupPropertyValueName(property, value_name, negate, result,
   1211                                    zone());
   1212   }
   1213 }
   1214 
   1215 #else  // V8_I18N_SUPPORT
   1216 
   1217 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
   1218                                       bool negate) {
   1219   return false;
   1220 }
   1221 
   1222 #endif  // V8_I18N_SUPPORT
   1223 
   1224 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
   1225   uc32 x = 0;
   1226   int d = HexValue(current());
   1227   if (d < 0) {
   1228     return false;
   1229   }
   1230   while (d >= 0) {
   1231     x = x * 16 + d;
   1232     if (x > max_value) {
   1233       return false;
   1234     }
   1235     Advance();
   1236     d = HexValue(current());
   1237   }
   1238   *value = x;
   1239   return true;
   1240 }
   1241 
   1242 
   1243 uc32 RegExpParser::ParseClassCharacterEscape() {
   1244   DCHECK(current() == '\\');
   1245   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
   1246   Advance();
   1247   switch (current()) {
   1248     case 'b':
   1249       Advance();
   1250       return '\b';
   1251     // ControlEscape :: one of
   1252     //   f n r t v
   1253     case 'f':
   1254       Advance();
   1255       return '\f';
   1256     case 'n':
   1257       Advance();
   1258       return '\n';
   1259     case 'r':
   1260       Advance();
   1261       return '\r';
   1262     case 't':
   1263       Advance();
   1264       return '\t';
   1265     case 'v':
   1266       Advance();
   1267       return '\v';
   1268     case 'c': {
   1269       uc32 controlLetter = Next();
   1270       uc32 letter = controlLetter & ~('A' ^ 'a');
   1271       // For compatibility with JSC, inside a character class. We also accept
   1272       // digits and underscore as control characters, unless with /u.
   1273       if (letter >= 'A' && letter <= 'Z') {
   1274         Advance(2);
   1275         // Control letters mapped to ASCII control characters in the range
   1276         // 0x00-0x1f.
   1277         return controlLetter & 0x1f;
   1278       }
   1279       if (unicode()) {
   1280         // With /u, invalid escapes are not treated as identity escapes.
   1281         ReportError(CStrVector("Invalid class escape"));
   1282         return 0;
   1283       }
   1284       if ((controlLetter >= '0' && controlLetter <= '9') ||
   1285           controlLetter == '_') {
   1286         Advance(2);
   1287         return controlLetter & 0x1f;
   1288       }
   1289       // We match JSC in reading the backslash as a literal
   1290       // character instead of as starting an escape.
   1291       return '\\';
   1292     }
   1293     case '0':
   1294       // With /u, \0 is interpreted as NUL if not followed by another digit.
   1295       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
   1296         Advance();
   1297         return 0;
   1298       }
   1299     // Fall through.
   1300     case '1':
   1301     case '2':
   1302     case '3':
   1303     case '4':
   1304     case '5':
   1305     case '6':
   1306     case '7':
   1307       // For compatibility, we interpret a decimal escape that isn't
   1308       // a back reference (and therefore either \0 or not valid according
   1309       // to the specification) as a 1..3 digit octal character code.
   1310       if (unicode()) {
   1311         // With /u, decimal escape is not interpreted as octal character code.
   1312         ReportError(CStrVector("Invalid class escape"));
   1313         return 0;
   1314       }
   1315       return ParseOctalLiteral();
   1316     case 'x': {
   1317       Advance();
   1318       uc32 value;
   1319       if (ParseHexEscape(2, &value)) return value;
   1320       if (unicode()) {
   1321         // With /u, invalid escapes are not treated as identity escapes.
   1322         ReportError(CStrVector("Invalid escape"));
   1323         return 0;
   1324       }
   1325       // If \x is not followed by a two-digit hexadecimal, treat it
   1326       // as an identity escape.
   1327       return 'x';
   1328     }
   1329     case 'u': {
   1330       Advance();
   1331       uc32 value;
   1332       if (ParseUnicodeEscape(&value)) return value;
   1333       if (unicode()) {
   1334         // With /u, invalid escapes are not treated as identity escapes.
   1335         ReportError(CStrVector("Invalid unicode escape"));
   1336         return 0;
   1337       }
   1338       // If \u is not followed by a two-digit hexadecimal, treat it
   1339       // as an identity escape.
   1340       return 'u';
   1341     }
   1342     default: {
   1343       uc32 result = current();
   1344       // With /u, no identity escapes except for syntax characters and '-' are
   1345       // allowed. Otherwise, all identity escapes are allowed.
   1346       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
   1347         Advance();
   1348         return result;
   1349       }
   1350       ReportError(CStrVector("Invalid escape"));
   1351       return 0;
   1352     }
   1353   }
   1354   return 0;
   1355 }
   1356 
   1357 
   1358 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
   1359   DCHECK_EQ(0, *char_class);
   1360   uc32 first = current();
   1361   if (first == '\\') {
   1362     switch (Next()) {
   1363       case 'w':
   1364       case 'W':
   1365       case 'd':
   1366       case 'D':
   1367       case 's':
   1368       case 'S': {
   1369         *char_class = Next();
   1370         Advance(2);
   1371         return CharacterRange::Singleton(0);  // Return dummy value.
   1372       }
   1373       case kEndMarker:
   1374         return ReportError(CStrVector("\\ at end of pattern"));
   1375       default:
   1376         first = ParseClassCharacterEscape(CHECK_FAILED);
   1377     }
   1378   } else {
   1379     Advance();
   1380   }
   1381 
   1382   return CharacterRange::Singleton(first);
   1383 }
   1384 
   1385 static const uc16 kNoCharClass = 0;
   1386 
   1387 // Adds range or pre-defined character class to character ranges.
   1388 // If char_class is not kInvalidClass, it's interpreted as a class
   1389 // escape (i.e., 's' means whitespace, from '\s').
   1390 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
   1391                                     uc16 char_class, CharacterRange range,
   1392                                     Zone* zone) {
   1393   if (char_class != kNoCharClass) {
   1394     CharacterRange::AddClassEscape(char_class, ranges, zone);
   1395   } else {
   1396     ranges->Add(range, zone);
   1397   }
   1398 }
   1399 
   1400 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
   1401   if (!FLAG_harmony_regexp_property) return false;
   1402   if (!unicode()) return false;
   1403   if (current() != '\\') return false;
   1404   uc32 next = Next();
   1405   bool parse_success = false;
   1406   if (next == 'p') {
   1407     Advance(2);
   1408     parse_success = ParsePropertyClass(ranges, false);
   1409   } else if (next == 'P') {
   1410     Advance(2);
   1411     parse_success = ParsePropertyClass(ranges, true);
   1412   } else {
   1413     return false;
   1414   }
   1415   if (!parse_success)
   1416     ReportError(CStrVector("Invalid property name in character class"));
   1417   return parse_success;
   1418 }
   1419 
   1420 RegExpTree* RegExpParser::ParseCharacterClass() {
   1421   static const char* kUnterminated = "Unterminated character class";
   1422   static const char* kRangeInvalid = "Invalid character class";
   1423   static const char* kRangeOutOfOrder = "Range out of order in character class";
   1424 
   1425   DCHECK_EQ(current(), '[');
   1426   Advance();
   1427   bool is_negated = false;
   1428   if (current() == '^') {
   1429     is_negated = true;
   1430     Advance();
   1431   }
   1432   ZoneList<CharacterRange>* ranges =
   1433       new (zone()) ZoneList<CharacterRange>(2, zone());
   1434   while (has_more() && current() != ']') {
   1435     bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
   1436     if (parsed_property) continue;
   1437     uc16 char_class = kNoCharClass;
   1438     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
   1439     if (current() == '-') {
   1440       Advance();
   1441       if (current() == kEndMarker) {
   1442         // If we reach the end we break out of the loop and let the
   1443         // following code report an error.
   1444         break;
   1445       } else if (current() == ']') {
   1446         AddRangeOrEscape(ranges, char_class, first, zone());
   1447         ranges->Add(CharacterRange::Singleton('-'), zone());
   1448         break;
   1449       }
   1450       uc16 char_class_2 = kNoCharClass;
   1451       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
   1452       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
   1453         // Either end is an escaped character class. Treat the '-' verbatim.
   1454         if (unicode()) {
   1455           // ES2015 21.2.2.15.1 step 1.
   1456           return ReportError(CStrVector(kRangeInvalid));
   1457         }
   1458         AddRangeOrEscape(ranges, char_class, first, zone());
   1459         ranges->Add(CharacterRange::Singleton('-'), zone());
   1460         AddRangeOrEscape(ranges, char_class_2, next, zone());
   1461         continue;
   1462       }
   1463       // ES2015 21.2.2.15.1 step 6.
   1464       if (first.from() > next.to()) {
   1465         return ReportError(CStrVector(kRangeOutOfOrder));
   1466       }
   1467       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
   1468     } else {
   1469       AddRangeOrEscape(ranges, char_class, first, zone());
   1470     }
   1471   }
   1472   if (!has_more()) {
   1473     return ReportError(CStrVector(kUnterminated));
   1474   }
   1475   Advance();
   1476   if (ranges->length() == 0) {
   1477     ranges->Add(CharacterRange::Everything(), zone());
   1478     is_negated = !is_negated;
   1479   }
   1480   return new (zone()) RegExpCharacterClass(ranges, is_negated);
   1481 }
   1482 
   1483 
   1484 #undef CHECK_FAILED
   1485 
   1486 
   1487 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
   1488                                FlatStringReader* input, JSRegExp::Flags flags,
   1489                                RegExpCompileData* result) {
   1490   DCHECK(result != NULL);
   1491   RegExpParser parser(input, &result->error, flags, isolate, zone);
   1492   RegExpTree* tree = parser.ParsePattern();
   1493   if (parser.failed()) {
   1494     DCHECK(tree == NULL);
   1495     DCHECK(!result->error.is_null());
   1496   } else {
   1497     DCHECK(tree != NULL);
   1498     DCHECK(result->error.is_null());
   1499     if (FLAG_trace_regexp_parser) {
   1500       OFStream os(stdout);
   1501       tree->Print(os, zone);
   1502       os << "\n";
   1503     }
   1504     result->tree = tree;
   1505     int capture_count = parser.captures_started();
   1506     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
   1507     result->contains_anchor = parser.contains_anchor();
   1508     result->capture_name_map = parser.CreateCaptureNameMap();
   1509     result->capture_count = capture_count;
   1510   }
   1511   return !parser.failed();
   1512 }
   1513 
   1514 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
   1515     : zone_(zone),
   1516       pending_empty_(false),
   1517       ignore_case_(ignore_case),
   1518       unicode_(unicode),
   1519       characters_(NULL),
   1520       pending_surrogate_(kNoPendingSurrogate),
   1521       terms_(),
   1522       alternatives_()
   1523 #ifdef DEBUG
   1524       ,
   1525       last_added_(ADD_NONE)
   1526 #endif
   1527 {
   1528 }
   1529 
   1530 
   1531 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
   1532   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
   1533   FlushPendingSurrogate();
   1534   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
   1535   pending_surrogate_ = lead_surrogate;
   1536 }
   1537 
   1538 
   1539 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
   1540   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
   1541   if (pending_surrogate_ != kNoPendingSurrogate) {
   1542     uc16 lead_surrogate = pending_surrogate_;
   1543     pending_surrogate_ = kNoPendingSurrogate;
   1544     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
   1545     uc32 combined =
   1546         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
   1547     if (NeedsDesugaringForIgnoreCase(combined)) {
   1548       AddCharacterClassForDesugaring(combined);
   1549     } else {
   1550       ZoneList<uc16> surrogate_pair(2, zone());
   1551       surrogate_pair.Add(lead_surrogate, zone());
   1552       surrogate_pair.Add(trail_surrogate, zone());
   1553       RegExpAtom* atom =
   1554           new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
   1555       AddAtom(atom);
   1556     }
   1557   } else {
   1558     pending_surrogate_ = trail_surrogate;
   1559     FlushPendingSurrogate();
   1560   }
   1561 }
   1562 
   1563 
   1564 void RegExpBuilder::FlushPendingSurrogate() {
   1565   if (pending_surrogate_ != kNoPendingSurrogate) {
   1566     DCHECK(unicode());
   1567     uc32 c = pending_surrogate_;
   1568     pending_surrogate_ = kNoPendingSurrogate;
   1569     AddCharacterClassForDesugaring(c);
   1570   }
   1571 }
   1572 
   1573 
   1574 void RegExpBuilder::FlushCharacters() {
   1575   FlushPendingSurrogate();
   1576   pending_empty_ = false;
   1577   if (characters_ != NULL) {
   1578     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
   1579     characters_ = NULL;
   1580     text_.Add(atom, zone());
   1581     LAST(ADD_ATOM);
   1582   }
   1583 }
   1584 
   1585 
   1586 void RegExpBuilder::FlushText() {
   1587   FlushCharacters();
   1588   int num_text = text_.length();
   1589   if (num_text == 0) {
   1590     return;
   1591   } else if (num_text == 1) {
   1592     terms_.Add(text_.last(), zone());
   1593   } else {
   1594     RegExpText* text = new (zone()) RegExpText(zone());
   1595     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
   1596     terms_.Add(text, zone());
   1597   }
   1598   text_.Clear();
   1599 }
   1600 
   1601 
   1602 void RegExpBuilder::AddCharacter(uc16 c) {
   1603   FlushPendingSurrogate();
   1604   pending_empty_ = false;
   1605   if (NeedsDesugaringForIgnoreCase(c)) {
   1606     AddCharacterClassForDesugaring(c);
   1607   } else {
   1608     if (characters_ == NULL) {
   1609       characters_ = new (zone()) ZoneList<uc16>(4, zone());
   1610     }
   1611     characters_->Add(c, zone());
   1612     LAST(ADD_CHAR);
   1613   }
   1614 }
   1615 
   1616 
   1617 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
   1618   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
   1619     DCHECK(unicode());
   1620     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
   1621     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
   1622   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
   1623     AddLeadSurrogate(c);
   1624   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
   1625     AddTrailSurrogate(c);
   1626   } else {
   1627     AddCharacter(static_cast<uc16>(c));
   1628   }
   1629 }
   1630 
   1631 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
   1632   // A lead or trail surrogate parsed via escape sequence will not
   1633   // pair up with any preceding lead or following trail surrogate.
   1634   FlushPendingSurrogate();
   1635   AddUnicodeCharacter(character);
   1636   FlushPendingSurrogate();
   1637 }
   1638 
   1639 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
   1640 
   1641 
   1642 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
   1643   if (NeedsDesugaringForUnicode(cc)) {
   1644     // With /u, character class needs to be desugared, so it
   1645     // must be a standalone term instead of being part of a RegExpText.
   1646     AddTerm(cc);
   1647   } else {
   1648     AddAtom(cc);
   1649   }
   1650 }
   1651 
   1652 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
   1653   AddTerm(new (zone()) RegExpCharacterClass(
   1654       CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
   1655 }
   1656 
   1657 
   1658 void RegExpBuilder::AddAtom(RegExpTree* term) {
   1659   if (term->IsEmpty()) {
   1660     AddEmpty();
   1661     return;
   1662   }
   1663   if (term->IsTextElement()) {
   1664     FlushCharacters();
   1665     text_.Add(term, zone());
   1666   } else {
   1667     FlushText();
   1668     terms_.Add(term, zone());
   1669   }
   1670   LAST(ADD_ATOM);
   1671 }
   1672 
   1673 
   1674 void RegExpBuilder::AddTerm(RegExpTree* term) {
   1675   FlushText();
   1676   terms_.Add(term, zone());
   1677   LAST(ADD_ATOM);
   1678 }
   1679 
   1680 
   1681 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
   1682   FlushText();
   1683   terms_.Add(assert, zone());
   1684   LAST(ADD_ASSERT);
   1685 }
   1686 
   1687 
   1688 void RegExpBuilder::NewAlternative() { FlushTerms(); }
   1689 
   1690 
   1691 void RegExpBuilder::FlushTerms() {
   1692   FlushText();
   1693   int num_terms = terms_.length();
   1694   RegExpTree* alternative;
   1695   if (num_terms == 0) {
   1696     alternative = new (zone()) RegExpEmpty();
   1697   } else if (num_terms == 1) {
   1698     alternative = terms_.last();
   1699   } else {
   1700     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
   1701   }
   1702   alternatives_.Add(alternative, zone());
   1703   terms_.Clear();
   1704   LAST(ADD_NONE);
   1705 }
   1706 
   1707 
   1708 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
   1709   if (!unicode()) return false;
   1710   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
   1711   // necessarily mean that we need to desugar. It's probably nicer to have a
   1712   // separate pass to figure out unicode desugarings.
   1713   if (ignore_case()) return true;
   1714   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
   1715   CharacterRange::Canonicalize(ranges);
   1716   for (int i = ranges->length() - 1; i >= 0; i--) {
   1717     uc32 from = ranges->at(i).from();
   1718     uc32 to = ranges->at(i).to();
   1719     // Check for non-BMP characters.
   1720     if (to >= kNonBmpStart) return true;
   1721     // Check for lone surrogates.
   1722     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
   1723   }
   1724   return false;
   1725 }
   1726 
   1727 
   1728 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
   1729 #ifdef V8_I18N_SUPPORT
   1730   if (unicode() && ignore_case()) {
   1731     icu::UnicodeSet set(c, c);
   1732     set.closeOver(USET_CASE_INSENSITIVE);
   1733     set.removeAllStrings();
   1734     return set.size() > 1;
   1735   }
   1736   // In the case where ICU is not included, we act as if the unicode flag is
   1737   // not set, and do not desugar.
   1738 #endif  // V8_I18N_SUPPORT
   1739   return false;
   1740 }
   1741 
   1742 
   1743 RegExpTree* RegExpBuilder::ToRegExp() {
   1744   FlushTerms();
   1745   int num_alternatives = alternatives_.length();
   1746   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
   1747   if (num_alternatives == 1) return alternatives_.last();
   1748   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
   1749 }
   1750 
   1751 bool RegExpBuilder::AddQuantifierToAtom(
   1752     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
   1753   FlushPendingSurrogate();
   1754   if (pending_empty_) {
   1755     pending_empty_ = false;
   1756     return true;
   1757   }
   1758   RegExpTree* atom;
   1759   if (characters_ != NULL) {
   1760     DCHECK(last_added_ == ADD_CHAR);
   1761     // Last atom was character.
   1762     Vector<const uc16> char_vector = characters_->ToConstVector();
   1763     int num_chars = char_vector.length();
   1764     if (num_chars > 1) {
   1765       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
   1766       text_.Add(new (zone()) RegExpAtom(prefix), zone());
   1767       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
   1768     }
   1769     characters_ = NULL;
   1770     atom = new (zone()) RegExpAtom(char_vector);
   1771     FlushText();
   1772   } else if (text_.length() > 0) {
   1773     DCHECK(last_added_ == ADD_ATOM);
   1774     atom = text_.RemoveLast();
   1775     FlushText();
   1776   } else if (terms_.length() > 0) {
   1777     DCHECK(last_added_ == ADD_ATOM);
   1778     atom = terms_.RemoveLast();
   1779     // With /u, lookarounds are not quantifiable.
   1780     if (unicode() && atom->IsLookaround()) return false;
   1781     if (atom->max_match() == 0) {
   1782       // Guaranteed to only match an empty string.
   1783       LAST(ADD_TERM);
   1784       if (min == 0) {
   1785         return true;
   1786       }
   1787       terms_.Add(atom, zone());
   1788       return true;
   1789     }
   1790   } else {
   1791     // Only call immediately after adding an atom or character!
   1792     UNREACHABLE();
   1793     return false;
   1794   }
   1795   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
   1796              zone());
   1797   LAST(ADD_TERM);
   1798   return true;
   1799 }
   1800 
   1801 }  // namespace internal
   1802 }  // namespace v8
   1803