Home | History | Annotate | Download | only in regexp
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/regexp/regexp-parser.h"
      6 
      7 #include "src/char-predicates-inl.h"
      8 #include "src/factory.h"
      9 #include "src/isolate.h"
     10 #include "src/objects-inl.h"
     11 #include "src/ostreams.h"
     12 #include "src/regexp/jsregexp.h"
     13 #include "src/utils.h"
     14 
     15 #ifdef V8_I18N_SUPPORT
     16 #include "unicode/uset.h"
     17 #endif  // V8_I18N_SUPPORT
     18 
     19 namespace v8 {
     20 namespace internal {
     21 
     22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
     23                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
     24     : isolate_(isolate),
     25       zone_(zone),
     26       error_(error),
     27       captures_(NULL),
     28       named_captures_(NULL),
     29       named_back_references_(NULL),
     30       in_(in),
     31       current_(kEndMarker),
     32       ignore_case_(flags & JSRegExp::kIgnoreCase),
     33       multiline_(flags & JSRegExp::kMultiline),
     34       unicode_(flags & JSRegExp::kUnicode),
     35       next_pos_(0),
     36       captures_started_(0),
     37       capture_count_(0),
     38       has_more_(true),
     39       simple_(false),
     40       contains_anchor_(false),
     41       is_scanned_for_captures_(false),
     42       failed_(false) {
     43   Advance();
     44 }
     45 
     46 template <bool update_position>
     47 inline uc32 RegExpParser::ReadNext() {
     48   int position = next_pos_;
     49   uc32 c0 = in()->Get(position);
     50   position++;
     51   // Read the whole surrogate pair in case of unicode flag, if possible.
     52   if (unicode() && position < in()->length() &&
     53       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
     54     uc16 c1 = in()->Get(position);
     55     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
     56       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
     57       position++;
     58     }
     59   }
     60   if (update_position) next_pos_ = position;
     61   return c0;
     62 }
     63 
     64 
     65 uc32 RegExpParser::Next() {
     66   if (has_next()) {
     67     return ReadNext<false>();
     68   } else {
     69     return kEndMarker;
     70   }
     71 }
     72 
     73 
     74 void RegExpParser::Advance() {
     75   if (has_next()) {
     76     StackLimitCheck check(isolate());
     77     if (check.HasOverflowed()) {
     78       ReportError(CStrVector(
     79           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
     80     } else if (zone()->excess_allocation()) {
     81       ReportError(CStrVector("Regular expression too large"));
     82     } else {
     83       current_ = ReadNext<true>();
     84     }
     85   } else {
     86     current_ = kEndMarker;
     87     // Advance so that position() points to 1-after-the-last-character. This is
     88     // important so that Reset() to this position works correctly.
     89     next_pos_ = in()->length() + 1;
     90     has_more_ = false;
     91   }
     92 }
     93 
     94 
     95 void RegExpParser::Reset(int pos) {
     96   next_pos_ = pos;
     97   has_more_ = (pos < in()->length());
     98   Advance();
     99 }
    100 
    101 
    102 void RegExpParser::Advance(int dist) {
    103   next_pos_ += dist - 1;
    104   Advance();
    105 }
    106 
    107 
    108 bool RegExpParser::simple() { return simple_; }
    109 
    110 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
    111   switch (c) {
    112     case '^':
    113     case '$':
    114     case '\\':
    115     case '.':
    116     case '*':
    117     case '+':
    118     case '?':
    119     case '(':
    120     case ')':
    121     case '[':
    122     case ']':
    123     case '{':
    124     case '}':
    125     case '|':
    126     case '/':
    127       return true;
    128     default:
    129       break;
    130   }
    131   return false;
    132 }
    133 
    134 
    135 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
    136   if (failed_) return NULL;  // Do not overwrite any existing error.
    137   failed_ = true;
    138   *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
    139   // Zip to the end to make sure the no more input is read.
    140   current_ = kEndMarker;
    141   next_pos_ = in()->length();
    142   return NULL;
    143 }
    144 
    145 
    146 #define CHECK_FAILED /**/); \
    147   if (failed_) return NULL; \
    148   ((void)0
    149 
    150 
    151 // Pattern ::
    152 //   Disjunction
    153 RegExpTree* RegExpParser::ParsePattern() {
    154   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
    155   PatchNamedBackReferences(CHECK_FAILED);
    156   DCHECK(!has_more());
    157   // If the result of parsing is a literal string atom, and it has the
    158   // same length as the input, then the atom is identical to the input.
    159   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
    160     simple_ = true;
    161   }
    162   return result;
    163 }
    164 
    165 
    166 // Disjunction ::
    167 //   Alternative
    168 //   Alternative | Disjunction
    169 // Alternative ::
    170 //   [empty]
    171 //   Term Alternative
    172 // Term ::
    173 //   Assertion
    174 //   Atom
    175 //   Atom Quantifier
    176 RegExpTree* RegExpParser::ParseDisjunction() {
    177   // Used to store current state while parsing subexpressions.
    178   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
    179                                   nullptr, ignore_case(), unicode(), zone());
    180   RegExpParserState* state = &initial_state;
    181   // Cache the builder in a local variable for quick access.
    182   RegExpBuilder* builder = initial_state.builder();
    183   while (true) {
    184     switch (current()) {
    185       case kEndMarker:
    186         if (state->IsSubexpression()) {
    187           // Inside a parenthesized group when hitting end of input.
    188           return ReportError(CStrVector("Unterminated group"));
    189         }
    190         DCHECK_EQ(INITIAL, state->group_type());
    191         // Parsing completed successfully.
    192         return builder->ToRegExp();
    193       case ')': {
    194         if (!state->IsSubexpression()) {
    195           return ReportError(CStrVector("Unmatched ')'"));
    196         }
    197         DCHECK_NE(INITIAL, state->group_type());
    198 
    199         Advance();
    200         // End disjunction parsing and convert builder content to new single
    201         // regexp atom.
    202         RegExpTree* body = builder->ToRegExp();
    203 
    204         int end_capture_index = captures_started();
    205 
    206         int capture_index = state->capture_index();
    207         SubexpressionType group_type = state->group_type();
    208 
    209         // Build result of subexpression.
    210         if (group_type == CAPTURE) {
    211           if (state->IsNamedCapture()) {
    212             CreateNamedCaptureAtIndex(state->capture_name(),
    213                                       capture_index CHECK_FAILED);
    214           }
    215           RegExpCapture* capture = GetCapture(capture_index);
    216           capture->set_body(body);
    217           body = capture;
    218         } else if (group_type != GROUPING) {
    219           DCHECK(group_type == POSITIVE_LOOKAROUND ||
    220                  group_type == NEGATIVE_LOOKAROUND);
    221           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
    222           body = new (zone()) RegExpLookaround(
    223               body, is_positive, end_capture_index - capture_index,
    224               capture_index, state->lookaround_type());
    225         }
    226 
    227         // Restore previous state.
    228         state = state->previous_state();
    229         builder = state->builder();
    230 
    231         builder->AddAtom(body);
    232         // For compatability with JSC and ES3, we allow quantifiers after
    233         // lookaheads, and break in all cases.
    234         break;
    235       }
    236       case '|': {
    237         Advance();
    238         builder->NewAlternative();
    239         continue;
    240       }
    241       case '*':
    242       case '+':
    243       case '?':
    244         return ReportError(CStrVector("Nothing to repeat"));
    245       case '^': {
    246         Advance();
    247         if (multiline()) {
    248           builder->AddAssertion(
    249               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
    250         } else {
    251           builder->AddAssertion(
    252               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
    253           set_contains_anchor();
    254         }
    255         continue;
    256       }
    257       case '$': {
    258         Advance();
    259         RegExpAssertion::AssertionType assertion_type =
    260             multiline() ? RegExpAssertion::END_OF_LINE
    261                         : RegExpAssertion::END_OF_INPUT;
    262         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
    263         continue;
    264       }
    265       case '.': {
    266         Advance();
    267         // everything except \x0a, \x0d, \u2028 and \u2029
    268         ZoneList<CharacterRange>* ranges =
    269             new (zone()) ZoneList<CharacterRange>(2, zone());
    270         CharacterRange::AddClassEscape('.', ranges, zone());
    271         RegExpCharacterClass* cc =
    272             new (zone()) RegExpCharacterClass(ranges, false);
    273         builder->AddCharacterClass(cc);
    274         break;
    275       }
    276       case '(': {
    277         SubexpressionType subexpr_type = CAPTURE;
    278         RegExpLookaround::Type lookaround_type = state->lookaround_type();
    279         bool is_named_capture = false;
    280         Advance();
    281         if (current() == '?') {
    282           switch (Next()) {
    283             case ':':
    284               subexpr_type = GROUPING;
    285               Advance(2);
    286               break;
    287             case '=':
    288               lookaround_type = RegExpLookaround::LOOKAHEAD;
    289               subexpr_type = POSITIVE_LOOKAROUND;
    290               Advance(2);
    291               break;
    292             case '!':
    293               lookaround_type = RegExpLookaround::LOOKAHEAD;
    294               subexpr_type = NEGATIVE_LOOKAROUND;
    295               Advance(2);
    296               break;
    297             case '<':
    298               Advance();
    299               if (FLAG_harmony_regexp_lookbehind) {
    300                 if (Next() == '=') {
    301                   subexpr_type = POSITIVE_LOOKAROUND;
    302                   lookaround_type = RegExpLookaround::LOOKBEHIND;
    303                   Advance(2);
    304                   break;
    305                 } else if (Next() == '!') {
    306                   subexpr_type = NEGATIVE_LOOKAROUND;
    307                   lookaround_type = RegExpLookaround::LOOKBEHIND;
    308                   Advance(2);
    309                   break;
    310                 }
    311               }
    312               if (FLAG_harmony_regexp_named_captures && unicode()) {
    313                 is_named_capture = true;
    314                 Advance();
    315                 break;
    316               }
    317             // Fall through.
    318             default:
    319               return ReportError(CStrVector("Invalid group"));
    320           }
    321         }
    322 
    323         const ZoneVector<uc16>* capture_name = nullptr;
    324         if (subexpr_type == CAPTURE) {
    325           if (captures_started_ >= kMaxCaptures) {
    326             return ReportError(CStrVector("Too many captures"));
    327           }
    328           captures_started_++;
    329 
    330           if (is_named_capture) {
    331             capture_name = ParseCaptureGroupName(CHECK_FAILED);
    332           }
    333         }
    334         // Store current state and begin new disjunction parsing.
    335         state = new (zone()) RegExpParserState(
    336             state, subexpr_type, lookaround_type, captures_started_,
    337             capture_name, ignore_case(), unicode(), zone());
    338         builder = state->builder();
    339         continue;
    340       }
    341       case '[': {
    342         RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
    343         builder->AddCharacterClass(cc->AsCharacterClass());
    344         break;
    345       }
    346       // Atom ::
    347       //   \ AtomEscape
    348       case '\\':
    349         switch (Next()) {
    350           case kEndMarker:
    351             return ReportError(CStrVector("\\ at end of pattern"));
    352           case 'b':
    353             Advance(2);
    354             builder->AddAssertion(
    355                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
    356             continue;
    357           case 'B':
    358             Advance(2);
    359             builder->AddAssertion(
    360                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
    361             continue;
    362           // AtomEscape ::
    363           //   CharacterClassEscape
    364           //
    365           // CharacterClassEscape :: one of
    366           //   d D s S w W
    367           case 'd':
    368           case 'D':
    369           case 's':
    370           case 'S':
    371           case 'w':
    372           case 'W': {
    373             uc32 c = Next();
    374             Advance(2);
    375             ZoneList<CharacterRange>* ranges =
    376                 new (zone()) ZoneList<CharacterRange>(2, zone());
    377             CharacterRange::AddClassEscape(c, ranges, zone());
    378             RegExpCharacterClass* cc =
    379                 new (zone()) RegExpCharacterClass(ranges, false);
    380             builder->AddCharacterClass(cc);
    381             break;
    382           }
    383           case 'p':
    384           case 'P': {
    385             uc32 p = Next();
    386             Advance(2);
    387             if (unicode()) {
    388               if (FLAG_harmony_regexp_property) {
    389                 ZoneList<CharacterRange>* ranges =
    390                     new (zone()) ZoneList<CharacterRange>(2, zone());
    391                 if (!ParsePropertyClass(ranges, p == 'P')) {
    392                   return ReportError(CStrVector("Invalid property name"));
    393                 }
    394                 RegExpCharacterClass* cc =
    395                     new (zone()) RegExpCharacterClass(ranges, false);
    396                 builder->AddCharacterClass(cc);
    397               } else {
    398                 // With /u, no identity escapes except for syntax characters
    399                 // are allowed. Otherwise, all identity escapes are allowed.
    400                 return ReportError(CStrVector("Invalid escape"));
    401               }
    402             } else {
    403               builder->AddCharacter(p);
    404             }
    405             break;
    406           }
    407           case '1':
    408           case '2':
    409           case '3':
    410           case '4':
    411           case '5':
    412           case '6':
    413           case '7':
    414           case '8':
    415           case '9': {
    416             int index = 0;
    417             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
    418             if (is_backref) {
    419               if (state->IsInsideCaptureGroup(index)) {
    420                 // The back reference is inside the capture group it refers to.
    421                 // Nothing can possibly have been captured yet, so we use empty
    422                 // instead. This ensures that, when checking a back reference,
    423                 // the capture registers of the referenced capture are either
    424                 // both set or both cleared.
    425                 builder->AddEmpty();
    426               } else {
    427                 RegExpCapture* capture = GetCapture(index);
    428                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
    429                 builder->AddAtom(atom);
    430               }
    431               break;
    432             }
    433             // With /u, no identity escapes except for syntax characters
    434             // are allowed. Otherwise, all identity escapes are allowed.
    435             if (unicode()) {
    436               return ReportError(CStrVector("Invalid escape"));
    437             }
    438             uc32 first_digit = Next();
    439             if (first_digit == '8' || first_digit == '9') {
    440               builder->AddCharacter(first_digit);
    441               Advance(2);
    442               break;
    443             }
    444           }
    445           // Fall through.
    446           case '0': {
    447             Advance();
    448             if (unicode() && Next() >= '0' && Next() <= '9') {
    449               // With /u, decimal escape with leading 0 are not parsed as octal.
    450               return ReportError(CStrVector("Invalid decimal escape"));
    451             }
    452             uc32 octal = ParseOctalLiteral();
    453             builder->AddCharacter(octal);
    454             break;
    455           }
    456           // ControlEscape :: one of
    457           //   f n r t v
    458           case 'f':
    459             Advance(2);
    460             builder->AddCharacter('\f');
    461             break;
    462           case 'n':
    463             Advance(2);
    464             builder->AddCharacter('\n');
    465             break;
    466           case 'r':
    467             Advance(2);
    468             builder->AddCharacter('\r');
    469             break;
    470           case 't':
    471             Advance(2);
    472             builder->AddCharacter('\t');
    473             break;
    474           case 'v':
    475             Advance(2);
    476             builder->AddCharacter('\v');
    477             break;
    478           case 'c': {
    479             Advance();
    480             uc32 controlLetter = Next();
    481             // Special case if it is an ASCII letter.
    482             // Convert lower case letters to uppercase.
    483             uc32 letter = controlLetter & ~('a' ^ 'A');
    484             if (letter < 'A' || 'Z' < letter) {
    485               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
    486               // This is outside the specification. We match JSC in
    487               // reading the backslash as a literal character instead
    488               // of as starting an escape.
    489               if (unicode()) {
    490                 // With /u, invalid escapes are not treated as identity escapes.
    491                 return ReportError(CStrVector("Invalid unicode escape"));
    492               }
    493               builder->AddCharacter('\\');
    494             } else {
    495               Advance(2);
    496               builder->AddCharacter(controlLetter & 0x1f);
    497             }
    498             break;
    499           }
    500           case 'x': {
    501             Advance(2);
    502             uc32 value;
    503             if (ParseHexEscape(2, &value)) {
    504               builder->AddCharacter(value);
    505             } else if (!unicode()) {
    506               builder->AddCharacter('x');
    507             } else {
    508               // With /u, invalid escapes are not treated as identity escapes.
    509               return ReportError(CStrVector("Invalid escape"));
    510             }
    511             break;
    512           }
    513           case 'u': {
    514             Advance(2);
    515             uc32 value;
    516             if (ParseUnicodeEscape(&value)) {
    517               builder->AddEscapedUnicodeCharacter(value);
    518             } else if (!unicode()) {
    519               builder->AddCharacter('u');
    520             } else {
    521               // With /u, invalid escapes are not treated as identity escapes.
    522               return ReportError(CStrVector("Invalid unicode escape"));
    523             }
    524             break;
    525           }
    526           case 'k':
    527             if (FLAG_harmony_regexp_named_captures && unicode()) {
    528               Advance(2);
    529               ParseNamedBackReference(builder, state CHECK_FAILED);
    530               break;
    531             }
    532           // Fall through.
    533           default:
    534             Advance();
    535             // With /u, no identity escapes except for syntax characters
    536             // are allowed. Otherwise, all identity escapes are allowed.
    537             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
    538               builder->AddCharacter(current());
    539               Advance();
    540             } else {
    541               return ReportError(CStrVector("Invalid escape"));
    542             }
    543             break;
    544         }
    545         break;
    546       case '{': {
    547         int dummy;
    548         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
    549         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
    550         // Fall through.
    551       }
    552       case '}':
    553       case ']':
    554         if (unicode()) {
    555           return ReportError(CStrVector("Lone quantifier brackets"));
    556         }
    557       // Fall through.
    558       default:
    559         builder->AddUnicodeCharacter(current());
    560         Advance();
    561         break;
    562     }  // end switch(current())
    563 
    564     int min;
    565     int max;
    566     switch (current()) {
    567       // QuantifierPrefix ::
    568       //   *
    569       //   +
    570       //   ?
    571       //   {
    572       case '*':
    573         min = 0;
    574         max = RegExpTree::kInfinity;
    575         Advance();
    576         break;
    577       case '+':
    578         min = 1;
    579         max = RegExpTree::kInfinity;
    580         Advance();
    581         break;
    582       case '?':
    583         min = 0;
    584         max = 1;
    585         Advance();
    586         break;
    587       case '{':
    588         if (ParseIntervalQuantifier(&min, &max)) {
    589           if (max < min) {
    590             return ReportError(
    591                 CStrVector("numbers out of order in {} quantifier"));
    592           }
    593           break;
    594         } else if (unicode()) {
    595           // With /u, incomplete quantifiers are not allowed.
    596           return ReportError(CStrVector("Incomplete quantifier"));
    597         }
    598         continue;
    599       default:
    600         continue;
    601     }
    602     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
    603     if (current() == '?') {
    604       quantifier_type = RegExpQuantifier::NON_GREEDY;
    605       Advance();
    606     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
    607       // FLAG_regexp_possessive_quantifier is a debug-only flag.
    608       quantifier_type = RegExpQuantifier::POSSESSIVE;
    609       Advance();
    610     }
    611     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
    612       return ReportError(CStrVector("Invalid quantifier"));
    613     }
    614   }
    615 }
    616 
    617 
    618 #ifdef DEBUG
    619 // Currently only used in an DCHECK.
    620 static bool IsSpecialClassEscape(uc32 c) {
    621   switch (c) {
    622     case 'd':
    623     case 'D':
    624     case 's':
    625     case 'S':
    626     case 'w':
    627     case 'W':
    628       return true;
    629     default:
    630       return false;
    631   }
    632 }
    633 #endif
    634 
    635 
    636 // In order to know whether an escape is a backreference or not we have to scan
    637 // the entire regexp and find the number of capturing parentheses.  However we
    638 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
    639 // is called when needed.  It can see the difference between capturing and
    640 // noncapturing parentheses and can skip character classes and backslash-escaped
    641 // characters.
    642 void RegExpParser::ScanForCaptures() {
    643   // Start with captures started previous to current position
    644   int capture_count = captures_started();
    645   // Add count of captures after this position.
    646   int n;
    647   while ((n = current()) != kEndMarker) {
    648     Advance();
    649     switch (n) {
    650       case '\\':
    651         Advance();
    652         break;
    653       case '[': {
    654         int c;
    655         while ((c = current()) != kEndMarker) {
    656           Advance();
    657           if (c == '\\') {
    658             Advance();
    659           } else {
    660             if (c == ']') break;
    661           }
    662         }
    663         break;
    664       }
    665       case '(':
    666         if (current() != '?') capture_count++;
    667         break;
    668     }
    669   }
    670   capture_count_ = capture_count;
    671   is_scanned_for_captures_ = true;
    672 }
    673 
    674 
    675 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
    676   DCHECK_EQ('\\', current());
    677   DCHECK('1' <= Next() && Next() <= '9');
    678   // Try to parse a decimal literal that is no greater than the total number
    679   // of left capturing parentheses in the input.
    680   int start = position();
    681   int value = Next() - '0';
    682   Advance(2);
    683   while (true) {
    684     uc32 c = current();
    685     if (IsDecimalDigit(c)) {
    686       value = 10 * value + (c - '0');
    687       if (value > kMaxCaptures) {
    688         Reset(start);
    689         return false;
    690       }
    691       Advance();
    692     } else {
    693       break;
    694     }
    695   }
    696   if (value > captures_started()) {
    697     if (!is_scanned_for_captures_) {
    698       int saved_position = position();
    699       ScanForCaptures();
    700       Reset(saved_position);
    701     }
    702     if (value > capture_count_) {
    703       Reset(start);
    704       return false;
    705     }
    706   }
    707   *index_out = value;
    708   return true;
    709 }
    710 
    711 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
    712   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
    713     v->push_back(code_unit);
    714   } else {
    715     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
    716     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
    717   }
    718 }
    719 
    720 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
    721   DCHECK(FLAG_harmony_regexp_named_captures);
    722   DCHECK(unicode());
    723 
    724   ZoneVector<uc16>* name =
    725       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
    726 
    727   bool at_start = true;
    728   while (true) {
    729     uc32 c = current();
    730     Advance();
    731 
    732     // Convert unicode escapes.
    733     if (c == '\\' && current() == 'u') {
    734       Advance();
    735       if (!ParseUnicodeEscape(&c)) {
    736         ReportError(CStrVector("Invalid Unicode escape sequence"));
    737         return nullptr;
    738       }
    739     }
    740 
    741     if (at_start) {
    742       if (!IdentifierStart::Is(c)) {
    743         ReportError(CStrVector("Invalid capture group name"));
    744         return nullptr;
    745       }
    746       push_code_unit(name, c);
    747       at_start = false;
    748     } else {
    749       if (c == '>') {
    750         break;
    751       } else if (IdentifierPart::Is(c)) {
    752         push_code_unit(name, c);
    753       } else {
    754         ReportError(CStrVector("Invalid capture group name"));
    755         return nullptr;
    756       }
    757     }
    758   }
    759 
    760   return name;
    761 }
    762 
    763 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
    764                                              int index) {
    765   DCHECK(FLAG_harmony_regexp_named_captures);
    766   DCHECK(unicode());
    767   DCHECK(0 < index && index <= captures_started_);
    768   DCHECK_NOT_NULL(name);
    769 
    770   if (named_captures_ == nullptr) {
    771     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
    772   } else {
    773     // Check for duplicates and bail if we find any.
    774     for (const auto& named_capture : *named_captures_) {
    775       if (*named_capture->name() == *name) {
    776         ReportError(CStrVector("Duplicate capture group name"));
    777         return false;
    778       }
    779     }
    780   }
    781 
    782   RegExpCapture* capture = GetCapture(index);
    783   DCHECK(capture->name() == nullptr);
    784 
    785   capture->set_name(name);
    786   named_captures_->Add(capture, zone());
    787 
    788   return true;
    789 }
    790 
    791 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
    792                                            RegExpParserState* state) {
    793   // The parser is assumed to be on the '<' in \k<name>.
    794   if (current() != '<') {
    795     ReportError(CStrVector("Invalid named reference"));
    796     return false;
    797   }
    798 
    799   Advance();
    800   const ZoneVector<uc16>* name = ParseCaptureGroupName();
    801   if (name == nullptr) {
    802     return false;
    803   }
    804 
    805   if (state->IsInsideCaptureGroup(name)) {
    806     builder->AddEmpty();
    807   } else {
    808     RegExpBackReference* atom = new (zone()) RegExpBackReference();
    809     atom->set_name(name);
    810 
    811     builder->AddAtom(atom);
    812 
    813     if (named_back_references_ == nullptr) {
    814       named_back_references_ =
    815           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
    816     }
    817     named_back_references_->Add(atom, zone());
    818   }
    819 
    820   return true;
    821 }
    822 
    823 void RegExpParser::PatchNamedBackReferences() {
    824   if (named_back_references_ == nullptr) return;
    825 
    826   if (named_captures_ == nullptr) {
    827     ReportError(CStrVector("Invalid named capture referenced"));
    828     return;
    829   }
    830 
    831   // Look up and patch the actual capture for each named back reference.
    832   // TODO(jgruber): O(n^2), optimize if necessary.
    833 
    834   for (int i = 0; i < named_back_references_->length(); i++) {
    835     RegExpBackReference* ref = named_back_references_->at(i);
    836 
    837     int index = -1;
    838     for (const auto& capture : *named_captures_) {
    839       if (*capture->name() == *ref->name()) {
    840         index = capture->index();
    841         break;
    842       }
    843     }
    844 
    845     if (index == -1) {
    846       ReportError(CStrVector("Invalid named capture referenced"));
    847       return;
    848     }
    849 
    850     ref->set_capture(GetCapture(index));
    851   }
    852 }
    853 
    854 RegExpCapture* RegExpParser::GetCapture(int index) {
    855   // The index for the capture groups are one-based. Its index in the list is
    856   // zero-based.
    857   int know_captures =
    858       is_scanned_for_captures_ ? capture_count_ : captures_started_;
    859   DCHECK(index <= know_captures);
    860   if (captures_ == NULL) {
    861     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
    862   }
    863   while (captures_->length() < know_captures) {
    864     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
    865   }
    866   return captures_->at(index - 1);
    867 }
    868 
    869 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
    870   if (named_captures_ == nullptr || named_captures_->is_empty())
    871     return Handle<FixedArray>();
    872 
    873   Factory* factory = isolate()->factory();
    874 
    875   int len = named_captures_->length() * 2;
    876   Handle<FixedArray> array = factory->NewFixedArray(len);
    877 
    878   for (int i = 0; i < named_captures_->length(); i++) {
    879     RegExpCapture* capture = named_captures_->at(i);
    880     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
    881     array->set(i * 2, *name.ToHandleChecked());
    882     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
    883   }
    884 
    885   return array;
    886 }
    887 
    888 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
    889   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
    890     if (s->group_type() != CAPTURE) continue;
    891     // Return true if we found the matching capture index.
    892     if (index == s->capture_index()) return true;
    893     // Abort if index is larger than what has been parsed up till this state.
    894     if (index > s->capture_index()) return false;
    895   }
    896   return false;
    897 }
    898 
    899 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
    900     const ZoneVector<uc16>* name) {
    901   DCHECK_NOT_NULL(name);
    902   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
    903     if (s->capture_name() == nullptr) continue;
    904     if (*s->capture_name() == *name) return true;
    905   }
    906   return false;
    907 }
    908 
    909 // QuantifierPrefix ::
    910 //   { DecimalDigits }
    911 //   { DecimalDigits , }
    912 //   { DecimalDigits , DecimalDigits }
    913 //
    914 // Returns true if parsing succeeds, and set the min_out and max_out
    915 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
    916 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
    917   DCHECK_EQ(current(), '{');
    918   int start = position();
    919   Advance();
    920   int min = 0;
    921   if (!IsDecimalDigit(current())) {
    922     Reset(start);
    923     return false;
    924   }
    925   while (IsDecimalDigit(current())) {
    926     int next = current() - '0';
    927     if (min > (RegExpTree::kInfinity - next) / 10) {
    928       // Overflow. Skip past remaining decimal digits and return -1.
    929       do {
    930         Advance();
    931       } while (IsDecimalDigit(current()));
    932       min = RegExpTree::kInfinity;
    933       break;
    934     }
    935     min = 10 * min + next;
    936     Advance();
    937   }
    938   int max = 0;
    939   if (current() == '}') {
    940     max = min;
    941     Advance();
    942   } else if (current() == ',') {
    943     Advance();
    944     if (current() == '}') {
    945       max = RegExpTree::kInfinity;
    946       Advance();
    947     } else {
    948       while (IsDecimalDigit(current())) {
    949         int next = current() - '0';
    950         if (max > (RegExpTree::kInfinity - next) / 10) {
    951           do {
    952             Advance();
    953           } while (IsDecimalDigit(current()));
    954           max = RegExpTree::kInfinity;
    955           break;
    956         }
    957         max = 10 * max + next;
    958         Advance();
    959       }
    960       if (current() != '}') {
    961         Reset(start);
    962         return false;
    963       }
    964       Advance();
    965     }
    966   } else {
    967     Reset(start);
    968     return false;
    969   }
    970   *min_out = min;
    971   *max_out = max;
    972   return true;
    973 }
    974 
    975 
    976 uc32 RegExpParser::ParseOctalLiteral() {
    977   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
    978   // For compatibility with some other browsers (not all), we parse
    979   // up to three octal digits with a value below 256.
    980   uc32 value = current() - '0';
    981   Advance();
    982   if ('0' <= current() && current() <= '7') {
    983     value = value * 8 + current() - '0';
    984     Advance();
    985     if (value < 32 && '0' <= current() && current() <= '7') {
    986       value = value * 8 + current() - '0';
    987       Advance();
    988     }
    989   }
    990   return value;
    991 }
    992 
    993 
    994 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
    995   int start = position();
    996   uc32 val = 0;
    997   for (int i = 0; i < length; ++i) {
    998     uc32 c = current();
    999     int d = HexValue(c);
   1000     if (d < 0) {
   1001       Reset(start);
   1002       return false;
   1003     }
   1004     val = val * 16 + d;
   1005     Advance();
   1006   }
   1007   *value = val;
   1008   return true;
   1009 }
   1010 
   1011 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
   1012 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
   1013   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
   1014   // allowed). In the latter case, the number of hex digits between { } is
   1015   // arbitrary. \ and u have already been read.
   1016   if (current() == '{' && unicode()) {
   1017     int start = position();
   1018     Advance();
   1019     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
   1020       if (current() == '}') {
   1021         Advance();
   1022         return true;
   1023       }
   1024     }
   1025     Reset(start);
   1026     return false;
   1027   }
   1028   // \u but no {, or \u{...} escapes not allowed.
   1029   bool result = ParseHexEscape(4, value);
   1030   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
   1031       current() == '\\') {
   1032     // Attempt to read trail surrogate.
   1033     int start = position();
   1034     if (Next() == 'u') {
   1035       Advance(2);
   1036       uc32 trail;
   1037       if (ParseHexEscape(4, &trail) &&
   1038           unibrow::Utf16::IsTrailSurrogate(trail)) {
   1039         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
   1040                                                       static_cast<uc16>(trail));
   1041         return true;
   1042       }
   1043     }
   1044     Reset(start);
   1045   }
   1046   return result;
   1047 }
   1048 
   1049 #ifdef V8_I18N_SUPPORT
   1050 
   1051 namespace {
   1052 
   1053 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
   1054   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
   1055   if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
   1056   for (int i = 0;; i++) {
   1057     const char* long_name = u_getPropertyName(
   1058         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
   1059     if (long_name == NULL) break;
   1060     if (strcmp(property_name, long_name) == 0) return true;
   1061   }
   1062   return false;
   1063 }
   1064 
   1065 bool IsExactPropertyValueAlias(const char* property_value_name,
   1066                                UProperty property, int32_t property_value) {
   1067   const char* short_name =
   1068       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
   1069   if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
   1070     return true;
   1071   }
   1072   for (int i = 0;; i++) {
   1073     const char* long_name = u_getPropertyValueName(
   1074         property, property_value,
   1075         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
   1076     if (long_name == NULL) break;
   1077     if (strcmp(property_value_name, long_name) == 0) return true;
   1078   }
   1079   return false;
   1080 }
   1081 
   1082 bool LookupPropertyValueName(UProperty property,
   1083                              const char* property_value_name, bool negate,
   1084                              ZoneList<CharacterRange>* result, Zone* zone) {
   1085   int32_t property_value =
   1086       u_getPropertyValueEnum(property, property_value_name);
   1087   if (property_value == UCHAR_INVALID_CODE) return false;
   1088 
   1089   // We require the property name to match exactly to one of the property value
   1090   // aliases. However, u_getPropertyValueEnum uses loose matching.
   1091   if (!IsExactPropertyValueAlias(property_value_name, property,
   1092                                  property_value)) {
   1093     return false;
   1094   }
   1095 
   1096   USet* set = uset_openEmpty();
   1097   UErrorCode ec = U_ZERO_ERROR;
   1098   uset_applyIntPropertyValue(set, property, property_value, &ec);
   1099   bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
   1100 
   1101   if (success) {
   1102     uset_removeAllStrings(set);
   1103     if (negate) uset_complement(set);
   1104     int item_count = uset_getItemCount(set);
   1105     int item_result = 0;
   1106     for (int i = 0; i < item_count; i++) {
   1107       uc32 start = 0;
   1108       uc32 end = 0;
   1109       item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
   1110       result->Add(CharacterRange::Range(start, end), zone);
   1111     }
   1112     DCHECK_EQ(U_ZERO_ERROR, ec);
   1113     DCHECK_EQ(0, item_result);
   1114   }
   1115   uset_close(set);
   1116   return success;
   1117 }
   1118 
   1119 template <size_t N>
   1120 inline bool NameEquals(const char* name, const char (&literal)[N]) {
   1121   return strncmp(name, literal, N + 1) == 0;
   1122 }
   1123 
   1124 bool LookupSpecialPropertyValueName(const char* name,
   1125                                     ZoneList<CharacterRange>* result,
   1126                                     bool negate, Zone* zone) {
   1127   if (NameEquals(name, "Any")) {
   1128     if (!negate) result->Add(CharacterRange::Everything(), zone);
   1129   } else if (NameEquals(name, "ASCII")) {
   1130     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
   1131                        : CharacterRange::Range(0x0, 0x7f),
   1132                 zone);
   1133   } else if (NameEquals(name, "Assigned")) {
   1134     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
   1135                                    !negate, result, zone);
   1136   } else {
   1137     return false;
   1138   }
   1139   return true;
   1140 }
   1141 
   1142 }  // anonymous namespace
   1143 
   1144 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
   1145                                       bool negate) {
   1146   // Parse the property class as follows:
   1147   // - In \p{name}, 'name' is interpreted
   1148   //   - either as a general category property value name.
   1149   //   - or as a binary property name.
   1150   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
   1151   //   and 'value' is interpreted as one of the available property value names.
   1152   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
   1153   // - Loose matching is not applied.
   1154   List<char> first_part;
   1155   List<char> second_part;
   1156   if (current() == '{') {
   1157     // Parse \p{[PropertyName=]PropertyNameValue}
   1158     for (Advance(); current() != '}' && current() != '='; Advance()) {
   1159       if (!has_next()) return false;
   1160       first_part.Add(static_cast<char>(current()));
   1161     }
   1162     if (current() == '=') {
   1163       for (Advance(); current() != '}'; Advance()) {
   1164         if (!has_next()) return false;
   1165         second_part.Add(static_cast<char>(current()));
   1166       }
   1167       second_part.Add(0);  // null-terminate string.
   1168     }
   1169   } else {
   1170     return false;
   1171   }
   1172   Advance();
   1173   first_part.Add(0);  // null-terminate string.
   1174 
   1175   if (second_part.is_empty()) {
   1176     // First attempt to interpret as general category property value name.
   1177     const char* name = first_part.ToConstVector().start();
   1178     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
   1179                                 result, zone())) {
   1180       return true;
   1181     }
   1182     // Interpret "Any", "ASCII", and "Assigned".
   1183     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
   1184       return true;
   1185     }
   1186     // Then attempt to interpret as binary property name with value name 'Y'.
   1187     UProperty property = u_getPropertyEnum(name);
   1188     if (property < UCHAR_BINARY_START) return false;
   1189     if (property >= UCHAR_BINARY_LIMIT) return false;
   1190     if (!IsExactPropertyAlias(name, property)) return false;
   1191     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
   1192                                    zone());
   1193   } else {
   1194     // Both property name and value name are specified. Attempt to interpret
   1195     // the property name as enumerated property.
   1196     const char* property_name = first_part.ToConstVector().start();
   1197     const char* value_name = second_part.ToConstVector().start();
   1198     UProperty property = u_getPropertyEnum(property_name);
   1199     if (property < UCHAR_INT_START) return false;
   1200     if (property >= UCHAR_INT_LIMIT) return false;
   1201     if (!IsExactPropertyAlias(property_name, property)) return false;
   1202     return LookupPropertyValueName(property, value_name, negate, result,
   1203                                    zone());
   1204   }
   1205 }
   1206 
   1207 #else  // V8_I18N_SUPPORT
   1208 
   1209 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
   1210                                       bool negate) {
   1211   return false;
   1212 }
   1213 
   1214 #endif  // V8_I18N_SUPPORT
   1215 
   1216 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
   1217   uc32 x = 0;
   1218   int d = HexValue(current());
   1219   if (d < 0) {
   1220     return false;
   1221   }
   1222   while (d >= 0) {
   1223     x = x * 16 + d;
   1224     if (x > max_value) {
   1225       return false;
   1226     }
   1227     Advance();
   1228     d = HexValue(current());
   1229   }
   1230   *value = x;
   1231   return true;
   1232 }
   1233 
   1234 
   1235 uc32 RegExpParser::ParseClassCharacterEscape() {
   1236   DCHECK(current() == '\\');
   1237   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
   1238   Advance();
   1239   switch (current()) {
   1240     case 'b':
   1241       Advance();
   1242       return '\b';
   1243     // ControlEscape :: one of
   1244     //   f n r t v
   1245     case 'f':
   1246       Advance();
   1247       return '\f';
   1248     case 'n':
   1249       Advance();
   1250       return '\n';
   1251     case 'r':
   1252       Advance();
   1253       return '\r';
   1254     case 't':
   1255       Advance();
   1256       return '\t';
   1257     case 'v':
   1258       Advance();
   1259       return '\v';
   1260     case 'c': {
   1261       uc32 controlLetter = Next();
   1262       uc32 letter = controlLetter & ~('A' ^ 'a');
   1263       // For compatibility with JSC, inside a character class. We also accept
   1264       // digits and underscore as control characters, unless with /u.
   1265       if (letter >= 'A' && letter <= 'Z') {
   1266         Advance(2);
   1267         // Control letters mapped to ASCII control characters in the range
   1268         // 0x00-0x1f.
   1269         return controlLetter & 0x1f;
   1270       }
   1271       if (unicode()) {
   1272         // With /u, invalid escapes are not treated as identity escapes.
   1273         ReportError(CStrVector("Invalid class escape"));
   1274         return 0;
   1275       }
   1276       if ((controlLetter >= '0' && controlLetter <= '9') ||
   1277           controlLetter == '_') {
   1278         Advance(2);
   1279         return controlLetter & 0x1f;
   1280       }
   1281       // We match JSC in reading the backslash as a literal
   1282       // character instead of as starting an escape.
   1283       return '\\';
   1284     }
   1285     case '0':
   1286       // With /u, \0 is interpreted as NUL if not followed by another digit.
   1287       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
   1288         Advance();
   1289         return 0;
   1290       }
   1291     // Fall through.
   1292     case '1':
   1293     case '2':
   1294     case '3':
   1295     case '4':
   1296     case '5':
   1297     case '6':
   1298     case '7':
   1299       // For compatibility, we interpret a decimal escape that isn't
   1300       // a back reference (and therefore either \0 or not valid according
   1301       // to the specification) as a 1..3 digit octal character code.
   1302       if (unicode()) {
   1303         // With /u, decimal escape is not interpreted as octal character code.
   1304         ReportError(CStrVector("Invalid class escape"));
   1305         return 0;
   1306       }
   1307       return ParseOctalLiteral();
   1308     case 'x': {
   1309       Advance();
   1310       uc32 value;
   1311       if (ParseHexEscape(2, &value)) return value;
   1312       if (unicode()) {
   1313         // With /u, invalid escapes are not treated as identity escapes.
   1314         ReportError(CStrVector("Invalid escape"));
   1315         return 0;
   1316       }
   1317       // If \x is not followed by a two-digit hexadecimal, treat it
   1318       // as an identity escape.
   1319       return 'x';
   1320     }
   1321     case 'u': {
   1322       Advance();
   1323       uc32 value;
   1324       if (ParseUnicodeEscape(&value)) return value;
   1325       if (unicode()) {
   1326         // With /u, invalid escapes are not treated as identity escapes.
   1327         ReportError(CStrVector("Invalid unicode escape"));
   1328         return 0;
   1329       }
   1330       // If \u is not followed by a two-digit hexadecimal, treat it
   1331       // as an identity escape.
   1332       return 'u';
   1333     }
   1334     default: {
   1335       uc32 result = current();
   1336       // With /u, no identity escapes except for syntax characters and '-' are
   1337       // allowed. Otherwise, all identity escapes are allowed.
   1338       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
   1339         Advance();
   1340         return result;
   1341       }
   1342       ReportError(CStrVector("Invalid escape"));
   1343       return 0;
   1344     }
   1345   }
   1346   return 0;
   1347 }
   1348 
   1349 
   1350 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
   1351   DCHECK_EQ(0, *char_class);
   1352   uc32 first = current();
   1353   if (first == '\\') {
   1354     switch (Next()) {
   1355       case 'w':
   1356       case 'W':
   1357       case 'd':
   1358       case 'D':
   1359       case 's':
   1360       case 'S': {
   1361         *char_class = Next();
   1362         Advance(2);
   1363         return CharacterRange::Singleton(0);  // Return dummy value.
   1364       }
   1365       case kEndMarker:
   1366         return ReportError(CStrVector("\\ at end of pattern"));
   1367       default:
   1368         first = ParseClassCharacterEscape(CHECK_FAILED);
   1369     }
   1370   } else {
   1371     Advance();
   1372   }
   1373 
   1374   return CharacterRange::Singleton(first);
   1375 }
   1376 
   1377 static const uc16 kNoCharClass = 0;
   1378 
   1379 // Adds range or pre-defined character class to character ranges.
   1380 // If char_class is not kInvalidClass, it's interpreted as a class
   1381 // escape (i.e., 's' means whitespace, from '\s').
   1382 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
   1383                                     uc16 char_class, CharacterRange range,
   1384                                     Zone* zone) {
   1385   if (char_class != kNoCharClass) {
   1386     CharacterRange::AddClassEscape(char_class, ranges, zone);
   1387   } else {
   1388     ranges->Add(range, zone);
   1389   }
   1390 }
   1391 
   1392 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
   1393   if (!FLAG_harmony_regexp_property) return false;
   1394   if (!unicode()) return false;
   1395   if (current() != '\\') return false;
   1396   uc32 next = Next();
   1397   bool parse_success = false;
   1398   if (next == 'p') {
   1399     Advance(2);
   1400     parse_success = ParsePropertyClass(ranges, false);
   1401   } else if (next == 'P') {
   1402     Advance(2);
   1403     parse_success = ParsePropertyClass(ranges, true);
   1404   } else {
   1405     return false;
   1406   }
   1407   if (!parse_success)
   1408     ReportError(CStrVector("Invalid property name in character class"));
   1409   return parse_success;
   1410 }
   1411 
   1412 RegExpTree* RegExpParser::ParseCharacterClass() {
   1413   static const char* kUnterminated = "Unterminated character class";
   1414   static const char* kRangeInvalid = "Invalid character class";
   1415   static const char* kRangeOutOfOrder = "Range out of order in character class";
   1416 
   1417   DCHECK_EQ(current(), '[');
   1418   Advance();
   1419   bool is_negated = false;
   1420   if (current() == '^') {
   1421     is_negated = true;
   1422     Advance();
   1423   }
   1424   ZoneList<CharacterRange>* ranges =
   1425       new (zone()) ZoneList<CharacterRange>(2, zone());
   1426   while (has_more() && current() != ']') {
   1427     bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
   1428     if (parsed_property) continue;
   1429     uc16 char_class = kNoCharClass;
   1430     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
   1431     if (current() == '-') {
   1432       Advance();
   1433       if (current() == kEndMarker) {
   1434         // If we reach the end we break out of the loop and let the
   1435         // following code report an error.
   1436         break;
   1437       } else if (current() == ']') {
   1438         AddRangeOrEscape(ranges, char_class, first, zone());
   1439         ranges->Add(CharacterRange::Singleton('-'), zone());
   1440         break;
   1441       }
   1442       uc16 char_class_2 = kNoCharClass;
   1443       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
   1444       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
   1445         // Either end is an escaped character class. Treat the '-' verbatim.
   1446         if (unicode()) {
   1447           // ES2015 21.2.2.15.1 step 1.
   1448           return ReportError(CStrVector(kRangeInvalid));
   1449         }
   1450         AddRangeOrEscape(ranges, char_class, first, zone());
   1451         ranges->Add(CharacterRange::Singleton('-'), zone());
   1452         AddRangeOrEscape(ranges, char_class_2, next, zone());
   1453         continue;
   1454       }
   1455       // ES2015 21.2.2.15.1 step 6.
   1456       if (first.from() > next.to()) {
   1457         return ReportError(CStrVector(kRangeOutOfOrder));
   1458       }
   1459       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
   1460     } else {
   1461       AddRangeOrEscape(ranges, char_class, first, zone());
   1462     }
   1463   }
   1464   if (!has_more()) {
   1465     return ReportError(CStrVector(kUnterminated));
   1466   }
   1467   Advance();
   1468   if (ranges->length() == 0) {
   1469     ranges->Add(CharacterRange::Everything(), zone());
   1470     is_negated = !is_negated;
   1471   }
   1472   return new (zone()) RegExpCharacterClass(ranges, is_negated);
   1473 }
   1474 
   1475 
   1476 #undef CHECK_FAILED
   1477 
   1478 
   1479 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
   1480                                FlatStringReader* input, JSRegExp::Flags flags,
   1481                                RegExpCompileData* result) {
   1482   DCHECK(result != NULL);
   1483   RegExpParser parser(input, &result->error, flags, isolate, zone);
   1484   RegExpTree* tree = parser.ParsePattern();
   1485   if (parser.failed()) {
   1486     DCHECK(tree == NULL);
   1487     DCHECK(!result->error.is_null());
   1488   } else {
   1489     DCHECK(tree != NULL);
   1490     DCHECK(result->error.is_null());
   1491     if (FLAG_trace_regexp_parser) {
   1492       OFStream os(stdout);
   1493       tree->Print(os, zone);
   1494       os << "\n";
   1495     }
   1496     result->tree = tree;
   1497     int capture_count = parser.captures_started();
   1498     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
   1499     result->contains_anchor = parser.contains_anchor();
   1500     result->capture_name_map = parser.CreateCaptureNameMap();
   1501     result->capture_count = capture_count;
   1502   }
   1503   return !parser.failed();
   1504 }
   1505 
   1506 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
   1507     : zone_(zone),
   1508       pending_empty_(false),
   1509       ignore_case_(ignore_case),
   1510       unicode_(unicode),
   1511       characters_(NULL),
   1512       pending_surrogate_(kNoPendingSurrogate),
   1513       terms_(),
   1514       alternatives_()
   1515 #ifdef DEBUG
   1516       ,
   1517       last_added_(ADD_NONE)
   1518 #endif
   1519 {
   1520 }
   1521 
   1522 
   1523 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
   1524   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
   1525   FlushPendingSurrogate();
   1526   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
   1527   pending_surrogate_ = lead_surrogate;
   1528 }
   1529 
   1530 
   1531 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
   1532   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
   1533   if (pending_surrogate_ != kNoPendingSurrogate) {
   1534     uc16 lead_surrogate = pending_surrogate_;
   1535     pending_surrogate_ = kNoPendingSurrogate;
   1536     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
   1537     uc32 combined =
   1538         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
   1539     if (NeedsDesugaringForIgnoreCase(combined)) {
   1540       AddCharacterClassForDesugaring(combined);
   1541     } else {
   1542       ZoneList<uc16> surrogate_pair(2, zone());
   1543       surrogate_pair.Add(lead_surrogate, zone());
   1544       surrogate_pair.Add(trail_surrogate, zone());
   1545       RegExpAtom* atom =
   1546           new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
   1547       AddAtom(atom);
   1548     }
   1549   } else {
   1550     pending_surrogate_ = trail_surrogate;
   1551     FlushPendingSurrogate();
   1552   }
   1553 }
   1554 
   1555 
   1556 void RegExpBuilder::FlushPendingSurrogate() {
   1557   if (pending_surrogate_ != kNoPendingSurrogate) {
   1558     DCHECK(unicode());
   1559     uc32 c = pending_surrogate_;
   1560     pending_surrogate_ = kNoPendingSurrogate;
   1561     AddCharacterClassForDesugaring(c);
   1562   }
   1563 }
   1564 
   1565 
   1566 void RegExpBuilder::FlushCharacters() {
   1567   FlushPendingSurrogate();
   1568   pending_empty_ = false;
   1569   if (characters_ != NULL) {
   1570     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
   1571     characters_ = NULL;
   1572     text_.Add(atom, zone());
   1573     LAST(ADD_ATOM);
   1574   }
   1575 }
   1576 
   1577 
   1578 void RegExpBuilder::FlushText() {
   1579   FlushCharacters();
   1580   int num_text = text_.length();
   1581   if (num_text == 0) {
   1582     return;
   1583   } else if (num_text == 1) {
   1584     terms_.Add(text_.last(), zone());
   1585   } else {
   1586     RegExpText* text = new (zone()) RegExpText(zone());
   1587     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
   1588     terms_.Add(text, zone());
   1589   }
   1590   text_.Clear();
   1591 }
   1592 
   1593 
   1594 void RegExpBuilder::AddCharacter(uc16 c) {
   1595   FlushPendingSurrogate();
   1596   pending_empty_ = false;
   1597   if (NeedsDesugaringForIgnoreCase(c)) {
   1598     AddCharacterClassForDesugaring(c);
   1599   } else {
   1600     if (characters_ == NULL) {
   1601       characters_ = new (zone()) ZoneList<uc16>(4, zone());
   1602     }
   1603     characters_->Add(c, zone());
   1604     LAST(ADD_CHAR);
   1605   }
   1606 }
   1607 
   1608 
   1609 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
   1610   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
   1611     DCHECK(unicode());
   1612     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
   1613     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
   1614   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
   1615     AddLeadSurrogate(c);
   1616   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
   1617     AddTrailSurrogate(c);
   1618   } else {
   1619     AddCharacter(static_cast<uc16>(c));
   1620   }
   1621 }
   1622 
   1623 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
   1624   // A lead or trail surrogate parsed via escape sequence will not
   1625   // pair up with any preceding lead or following trail surrogate.
   1626   FlushPendingSurrogate();
   1627   AddUnicodeCharacter(character);
   1628   FlushPendingSurrogate();
   1629 }
   1630 
   1631 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
   1632 
   1633 
   1634 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
   1635   if (NeedsDesugaringForUnicode(cc)) {
   1636     // With /u, character class needs to be desugared, so it
   1637     // must be a standalone term instead of being part of a RegExpText.
   1638     AddTerm(cc);
   1639   } else {
   1640     AddAtom(cc);
   1641   }
   1642 }
   1643 
   1644 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
   1645   AddTerm(new (zone()) RegExpCharacterClass(
   1646       CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
   1647 }
   1648 
   1649 
   1650 void RegExpBuilder::AddAtom(RegExpTree* term) {
   1651   if (term->IsEmpty()) {
   1652     AddEmpty();
   1653     return;
   1654   }
   1655   if (term->IsTextElement()) {
   1656     FlushCharacters();
   1657     text_.Add(term, zone());
   1658   } else {
   1659     FlushText();
   1660     terms_.Add(term, zone());
   1661   }
   1662   LAST(ADD_ATOM);
   1663 }
   1664 
   1665 
   1666 void RegExpBuilder::AddTerm(RegExpTree* term) {
   1667   FlushText();
   1668   terms_.Add(term, zone());
   1669   LAST(ADD_ATOM);
   1670 }
   1671 
   1672 
   1673 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
   1674   FlushText();
   1675   terms_.Add(assert, zone());
   1676   LAST(ADD_ASSERT);
   1677 }
   1678 
   1679 
   1680 void RegExpBuilder::NewAlternative() { FlushTerms(); }
   1681 
   1682 
   1683 void RegExpBuilder::FlushTerms() {
   1684   FlushText();
   1685   int num_terms = terms_.length();
   1686   RegExpTree* alternative;
   1687   if (num_terms == 0) {
   1688     alternative = new (zone()) RegExpEmpty();
   1689   } else if (num_terms == 1) {
   1690     alternative = terms_.last();
   1691   } else {
   1692     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
   1693   }
   1694   alternatives_.Add(alternative, zone());
   1695   terms_.Clear();
   1696   LAST(ADD_NONE);
   1697 }
   1698 
   1699 
   1700 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
   1701   if (!unicode()) return false;
   1702   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
   1703   // necessarily mean that we need to desugar. It's probably nicer to have a
   1704   // separate pass to figure out unicode desugarings.
   1705   if (ignore_case()) return true;
   1706   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
   1707   CharacterRange::Canonicalize(ranges);
   1708   for (int i = ranges->length() - 1; i >= 0; i--) {
   1709     uc32 from = ranges->at(i).from();
   1710     uc32 to = ranges->at(i).to();
   1711     // Check for non-BMP characters.
   1712     if (to >= kNonBmpStart) return true;
   1713     // Check for lone surrogates.
   1714     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
   1715   }
   1716   return false;
   1717 }
   1718 
   1719 
   1720 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
   1721 #ifdef V8_I18N_SUPPORT
   1722   if (unicode() && ignore_case()) {
   1723     USet* set = uset_open(c, c);
   1724     uset_closeOver(set, USET_CASE_INSENSITIVE);
   1725     uset_removeAllStrings(set);
   1726     bool result = uset_size(set) > 1;
   1727     uset_close(set);
   1728     return result;
   1729   }
   1730   // In the case where ICU is not included, we act as if the unicode flag is
   1731   // not set, and do not desugar.
   1732 #endif  // V8_I18N_SUPPORT
   1733   return false;
   1734 }
   1735 
   1736 
   1737 RegExpTree* RegExpBuilder::ToRegExp() {
   1738   FlushTerms();
   1739   int num_alternatives = alternatives_.length();
   1740   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
   1741   if (num_alternatives == 1) return alternatives_.last();
   1742   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
   1743 }
   1744 
   1745 bool RegExpBuilder::AddQuantifierToAtom(
   1746     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
   1747   FlushPendingSurrogate();
   1748   if (pending_empty_) {
   1749     pending_empty_ = false;
   1750     return true;
   1751   }
   1752   RegExpTree* atom;
   1753   if (characters_ != NULL) {
   1754     DCHECK(last_added_ == ADD_CHAR);
   1755     // Last atom was character.
   1756     Vector<const uc16> char_vector = characters_->ToConstVector();
   1757     int num_chars = char_vector.length();
   1758     if (num_chars > 1) {
   1759       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
   1760       text_.Add(new (zone()) RegExpAtom(prefix), zone());
   1761       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
   1762     }
   1763     characters_ = NULL;
   1764     atom = new (zone()) RegExpAtom(char_vector);
   1765     FlushText();
   1766   } else if (text_.length() > 0) {
   1767     DCHECK(last_added_ == ADD_ATOM);
   1768     atom = text_.RemoveLast();
   1769     FlushText();
   1770   } else if (terms_.length() > 0) {
   1771     DCHECK(last_added_ == ADD_ATOM);
   1772     atom = terms_.RemoveLast();
   1773     // With /u, lookarounds are not quantifiable.
   1774     if (unicode() && atom->IsLookaround()) return false;
   1775     if (atom->max_match() == 0) {
   1776       // Guaranteed to only match an empty string.
   1777       LAST(ADD_TERM);
   1778       if (min == 0) {
   1779         return true;
   1780       }
   1781       terms_.Add(atom, zone());
   1782       return true;
   1783     }
   1784   } else {
   1785     // Only call immediately after adding an atom or character!
   1786     UNREACHABLE();
   1787     return false;
   1788   }
   1789   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
   1790              zone());
   1791   LAST(ADD_TERM);
   1792   return true;
   1793 }
   1794 
   1795 }  // namespace internal
   1796 }  // namespace v8
   1797