1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/regexp-parser.h" 6 7 #include "src/char-predicates-inl.h" 8 #include "src/factory.h" 9 #include "src/isolate.h" 10 #include "src/objects-inl.h" 11 #include "src/ostreams.h" 12 #include "src/regexp/jsregexp.h" 13 #include "src/utils.h" 14 15 #ifdef V8_I18N_SUPPORT 16 #include "unicode/uset.h" 17 #endif // V8_I18N_SUPPORT 18 19 namespace v8 { 20 namespace internal { 21 22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 23 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) 24 : isolate_(isolate), 25 zone_(zone), 26 error_(error), 27 captures_(NULL), 28 named_captures_(NULL), 29 named_back_references_(NULL), 30 in_(in), 31 current_(kEndMarker), 32 ignore_case_(flags & JSRegExp::kIgnoreCase), 33 multiline_(flags & JSRegExp::kMultiline), 34 unicode_(flags & JSRegExp::kUnicode), 35 next_pos_(0), 36 captures_started_(0), 37 capture_count_(0), 38 has_more_(true), 39 simple_(false), 40 contains_anchor_(false), 41 is_scanned_for_captures_(false), 42 failed_(false) { 43 Advance(); 44 } 45 46 template <bool update_position> 47 inline uc32 RegExpParser::ReadNext() { 48 int position = next_pos_; 49 uc32 c0 = in()->Get(position); 50 position++; 51 // Read the whole surrogate pair in case of unicode flag, if possible. 52 if (unicode() && position < in()->length() && 53 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { 54 uc16 c1 = in()->Get(position); 55 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 56 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); 57 position++; 58 } 59 } 60 if (update_position) next_pos_ = position; 61 return c0; 62 } 63 64 65 uc32 RegExpParser::Next() { 66 if (has_next()) { 67 return ReadNext<false>(); 68 } else { 69 return kEndMarker; 70 } 71 } 72 73 74 void RegExpParser::Advance() { 75 if (has_next()) { 76 StackLimitCheck check(isolate()); 77 if (check.HasOverflowed()) { 78 ReportError(CStrVector( 79 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); 80 } else if (zone()->excess_allocation()) { 81 ReportError(CStrVector("Regular expression too large")); 82 } else { 83 current_ = ReadNext<true>(); 84 } 85 } else { 86 current_ = kEndMarker; 87 // Advance so that position() points to 1-after-the-last-character. This is 88 // important so that Reset() to this position works correctly. 89 next_pos_ = in()->length() + 1; 90 has_more_ = false; 91 } 92 } 93 94 95 void RegExpParser::Reset(int pos) { 96 next_pos_ = pos; 97 has_more_ = (pos < in()->length()); 98 Advance(); 99 } 100 101 102 void RegExpParser::Advance(int dist) { 103 next_pos_ += dist - 1; 104 Advance(); 105 } 106 107 108 bool RegExpParser::simple() { return simple_; } 109 110 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { 111 switch (c) { 112 case '^': 113 case '$': 114 case '\\': 115 case '.': 116 case '*': 117 case '+': 118 case '?': 119 case '(': 120 case ')': 121 case '[': 122 case ']': 123 case '{': 124 case '}': 125 case '|': 126 case '/': 127 return true; 128 default: 129 break; 130 } 131 return false; 132 } 133 134 135 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 136 if (failed_) return NULL; // Do not overwrite any existing error. 137 failed_ = true; 138 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 139 // Zip to the end to make sure the no more input is read. 140 current_ = kEndMarker; 141 next_pos_ = in()->length(); 142 return NULL; 143 } 144 145 146 #define CHECK_FAILED /**/); \ 147 if (failed_) return NULL; \ 148 ((void)0 149 150 151 // Pattern :: 152 // Disjunction 153 RegExpTree* RegExpParser::ParsePattern() { 154 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 155 PatchNamedBackReferences(CHECK_FAILED); 156 DCHECK(!has_more()); 157 // If the result of parsing is a literal string atom, and it has the 158 // same length as the input, then the atom is identical to the input. 159 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { 160 simple_ = true; 161 } 162 return result; 163 } 164 165 166 // Disjunction :: 167 // Alternative 168 // Alternative | Disjunction 169 // Alternative :: 170 // [empty] 171 // Term Alternative 172 // Term :: 173 // Assertion 174 // Atom 175 // Atom Quantifier 176 RegExpTree* RegExpParser::ParseDisjunction() { 177 // Used to store current state while parsing subexpressions. 178 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 179 nullptr, ignore_case(), unicode(), zone()); 180 RegExpParserState* state = &initial_state; 181 // Cache the builder in a local variable for quick access. 182 RegExpBuilder* builder = initial_state.builder(); 183 while (true) { 184 switch (current()) { 185 case kEndMarker: 186 if (state->IsSubexpression()) { 187 // Inside a parenthesized group when hitting end of input. 188 return ReportError(CStrVector("Unterminated group")); 189 } 190 DCHECK_EQ(INITIAL, state->group_type()); 191 // Parsing completed successfully. 192 return builder->ToRegExp(); 193 case ')': { 194 if (!state->IsSubexpression()) { 195 return ReportError(CStrVector("Unmatched ')'")); 196 } 197 DCHECK_NE(INITIAL, state->group_type()); 198 199 Advance(); 200 // End disjunction parsing and convert builder content to new single 201 // regexp atom. 202 RegExpTree* body = builder->ToRegExp(); 203 204 int end_capture_index = captures_started(); 205 206 int capture_index = state->capture_index(); 207 SubexpressionType group_type = state->group_type(); 208 209 // Build result of subexpression. 210 if (group_type == CAPTURE) { 211 if (state->IsNamedCapture()) { 212 CreateNamedCaptureAtIndex(state->capture_name(), 213 capture_index CHECK_FAILED); 214 } 215 RegExpCapture* capture = GetCapture(capture_index); 216 capture->set_body(body); 217 body = capture; 218 } else if (group_type != GROUPING) { 219 DCHECK(group_type == POSITIVE_LOOKAROUND || 220 group_type == NEGATIVE_LOOKAROUND); 221 bool is_positive = (group_type == POSITIVE_LOOKAROUND); 222 body = new (zone()) RegExpLookaround( 223 body, is_positive, end_capture_index - capture_index, 224 capture_index, state->lookaround_type()); 225 } 226 227 // Restore previous state. 228 state = state->previous_state(); 229 builder = state->builder(); 230 231 builder->AddAtom(body); 232 // For compatability with JSC and ES3, we allow quantifiers after 233 // lookaheads, and break in all cases. 234 break; 235 } 236 case '|': { 237 Advance(); 238 builder->NewAlternative(); 239 continue; 240 } 241 case '*': 242 case '+': 243 case '?': 244 return ReportError(CStrVector("Nothing to repeat")); 245 case '^': { 246 Advance(); 247 if (multiline()) { 248 builder->AddAssertion( 249 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); 250 } else { 251 builder->AddAssertion( 252 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); 253 set_contains_anchor(); 254 } 255 continue; 256 } 257 case '$': { 258 Advance(); 259 RegExpAssertion::AssertionType assertion_type = 260 multiline() ? RegExpAssertion::END_OF_LINE 261 : RegExpAssertion::END_OF_INPUT; 262 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); 263 continue; 264 } 265 case '.': { 266 Advance(); 267 // everything except \x0a, \x0d, \u2028 and \u2029 268 ZoneList<CharacterRange>* ranges = 269 new (zone()) ZoneList<CharacterRange>(2, zone()); 270 CharacterRange::AddClassEscape('.', ranges, zone()); 271 RegExpCharacterClass* cc = 272 new (zone()) RegExpCharacterClass(ranges, false); 273 builder->AddCharacterClass(cc); 274 break; 275 } 276 case '(': { 277 SubexpressionType subexpr_type = CAPTURE; 278 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 279 bool is_named_capture = false; 280 Advance(); 281 if (current() == '?') { 282 switch (Next()) { 283 case ':': 284 subexpr_type = GROUPING; 285 Advance(2); 286 break; 287 case '=': 288 lookaround_type = RegExpLookaround::LOOKAHEAD; 289 subexpr_type = POSITIVE_LOOKAROUND; 290 Advance(2); 291 break; 292 case '!': 293 lookaround_type = RegExpLookaround::LOOKAHEAD; 294 subexpr_type = NEGATIVE_LOOKAROUND; 295 Advance(2); 296 break; 297 case '<': 298 Advance(); 299 if (FLAG_harmony_regexp_lookbehind) { 300 if (Next() == '=') { 301 subexpr_type = POSITIVE_LOOKAROUND; 302 lookaround_type = RegExpLookaround::LOOKBEHIND; 303 Advance(2); 304 break; 305 } else if (Next() == '!') { 306 subexpr_type = NEGATIVE_LOOKAROUND; 307 lookaround_type = RegExpLookaround::LOOKBEHIND; 308 Advance(2); 309 break; 310 } 311 } 312 if (FLAG_harmony_regexp_named_captures && unicode()) { 313 is_named_capture = true; 314 Advance(); 315 break; 316 } 317 // Fall through. 318 default: 319 return ReportError(CStrVector("Invalid group")); 320 } 321 } 322 323 const ZoneVector<uc16>* capture_name = nullptr; 324 if (subexpr_type == CAPTURE) { 325 if (captures_started_ >= kMaxCaptures) { 326 return ReportError(CStrVector("Too many captures")); 327 } 328 captures_started_++; 329 330 if (is_named_capture) { 331 capture_name = ParseCaptureGroupName(CHECK_FAILED); 332 } 333 } 334 // Store current state and begin new disjunction parsing. 335 state = new (zone()) RegExpParserState( 336 state, subexpr_type, lookaround_type, captures_started_, 337 capture_name, ignore_case(), unicode(), zone()); 338 builder = state->builder(); 339 continue; 340 } 341 case '[': { 342 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); 343 builder->AddCharacterClass(cc->AsCharacterClass()); 344 break; 345 } 346 // Atom :: 347 // \ AtomEscape 348 case '\\': 349 switch (Next()) { 350 case kEndMarker: 351 return ReportError(CStrVector("\\ at end of pattern")); 352 case 'b': 353 Advance(2); 354 builder->AddAssertion( 355 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); 356 continue; 357 case 'B': 358 Advance(2); 359 builder->AddAssertion( 360 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); 361 continue; 362 // AtomEscape :: 363 // CharacterClassEscape 364 // 365 // CharacterClassEscape :: one of 366 // d D s S w W 367 case 'd': 368 case 'D': 369 case 's': 370 case 'S': 371 case 'w': 372 case 'W': { 373 uc32 c = Next(); 374 Advance(2); 375 ZoneList<CharacterRange>* ranges = 376 new (zone()) ZoneList<CharacterRange>(2, zone()); 377 CharacterRange::AddClassEscape(c, ranges, zone()); 378 RegExpCharacterClass* cc = 379 new (zone()) RegExpCharacterClass(ranges, false); 380 builder->AddCharacterClass(cc); 381 break; 382 } 383 case 'p': 384 case 'P': { 385 uc32 p = Next(); 386 Advance(2); 387 if (unicode()) { 388 if (FLAG_harmony_regexp_property) { 389 ZoneList<CharacterRange>* ranges = 390 new (zone()) ZoneList<CharacterRange>(2, zone()); 391 if (!ParsePropertyClass(ranges, p == 'P')) { 392 return ReportError(CStrVector("Invalid property name")); 393 } 394 RegExpCharacterClass* cc = 395 new (zone()) RegExpCharacterClass(ranges, false); 396 builder->AddCharacterClass(cc); 397 } else { 398 // With /u, no identity escapes except for syntax characters 399 // are allowed. Otherwise, all identity escapes are allowed. 400 return ReportError(CStrVector("Invalid escape")); 401 } 402 } else { 403 builder->AddCharacter(p); 404 } 405 break; 406 } 407 case '1': 408 case '2': 409 case '3': 410 case '4': 411 case '5': 412 case '6': 413 case '7': 414 case '8': 415 case '9': { 416 int index = 0; 417 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); 418 if (is_backref) { 419 if (state->IsInsideCaptureGroup(index)) { 420 // The back reference is inside the capture group it refers to. 421 // Nothing can possibly have been captured yet, so we use empty 422 // instead. This ensures that, when checking a back reference, 423 // the capture registers of the referenced capture are either 424 // both set or both cleared. 425 builder->AddEmpty(); 426 } else { 427 RegExpCapture* capture = GetCapture(index); 428 RegExpTree* atom = new (zone()) RegExpBackReference(capture); 429 builder->AddAtom(atom); 430 } 431 break; 432 } 433 // With /u, no identity escapes except for syntax characters 434 // are allowed. Otherwise, all identity escapes are allowed. 435 if (unicode()) { 436 return ReportError(CStrVector("Invalid escape")); 437 } 438 uc32 first_digit = Next(); 439 if (first_digit == '8' || first_digit == '9') { 440 builder->AddCharacter(first_digit); 441 Advance(2); 442 break; 443 } 444 } 445 // Fall through. 446 case '0': { 447 Advance(); 448 if (unicode() && Next() >= '0' && Next() <= '9') { 449 // With /u, decimal escape with leading 0 are not parsed as octal. 450 return ReportError(CStrVector("Invalid decimal escape")); 451 } 452 uc32 octal = ParseOctalLiteral(); 453 builder->AddCharacter(octal); 454 break; 455 } 456 // ControlEscape :: one of 457 // f n r t v 458 case 'f': 459 Advance(2); 460 builder->AddCharacter('\f'); 461 break; 462 case 'n': 463 Advance(2); 464 builder->AddCharacter('\n'); 465 break; 466 case 'r': 467 Advance(2); 468 builder->AddCharacter('\r'); 469 break; 470 case 't': 471 Advance(2); 472 builder->AddCharacter('\t'); 473 break; 474 case 'v': 475 Advance(2); 476 builder->AddCharacter('\v'); 477 break; 478 case 'c': { 479 Advance(); 480 uc32 controlLetter = Next(); 481 // Special case if it is an ASCII letter. 482 // Convert lower case letters to uppercase. 483 uc32 letter = controlLetter & ~('a' ^ 'A'); 484 if (letter < 'A' || 'Z' < letter) { 485 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. 486 // This is outside the specification. We match JSC in 487 // reading the backslash as a literal character instead 488 // of as starting an escape. 489 if (unicode()) { 490 // With /u, invalid escapes are not treated as identity escapes. 491 return ReportError(CStrVector("Invalid unicode escape")); 492 } 493 builder->AddCharacter('\\'); 494 } else { 495 Advance(2); 496 builder->AddCharacter(controlLetter & 0x1f); 497 } 498 break; 499 } 500 case 'x': { 501 Advance(2); 502 uc32 value; 503 if (ParseHexEscape(2, &value)) { 504 builder->AddCharacter(value); 505 } else if (!unicode()) { 506 builder->AddCharacter('x'); 507 } else { 508 // With /u, invalid escapes are not treated as identity escapes. 509 return ReportError(CStrVector("Invalid escape")); 510 } 511 break; 512 } 513 case 'u': { 514 Advance(2); 515 uc32 value; 516 if (ParseUnicodeEscape(&value)) { 517 builder->AddEscapedUnicodeCharacter(value); 518 } else if (!unicode()) { 519 builder->AddCharacter('u'); 520 } else { 521 // With /u, invalid escapes are not treated as identity escapes. 522 return ReportError(CStrVector("Invalid unicode escape")); 523 } 524 break; 525 } 526 case 'k': 527 if (FLAG_harmony_regexp_named_captures && unicode()) { 528 Advance(2); 529 ParseNamedBackReference(builder, state CHECK_FAILED); 530 break; 531 } 532 // Fall through. 533 default: 534 Advance(); 535 // With /u, no identity escapes except for syntax characters 536 // are allowed. Otherwise, all identity escapes are allowed. 537 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { 538 builder->AddCharacter(current()); 539 Advance(); 540 } else { 541 return ReportError(CStrVector("Invalid escape")); 542 } 543 break; 544 } 545 break; 546 case '{': { 547 int dummy; 548 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); 549 if (parsed) return ReportError(CStrVector("Nothing to repeat")); 550 // Fall through. 551 } 552 case '}': 553 case ']': 554 if (unicode()) { 555 return ReportError(CStrVector("Lone quantifier brackets")); 556 } 557 // Fall through. 558 default: 559 builder->AddUnicodeCharacter(current()); 560 Advance(); 561 break; 562 } // end switch(current()) 563 564 int min; 565 int max; 566 switch (current()) { 567 // QuantifierPrefix :: 568 // * 569 // + 570 // ? 571 // { 572 case '*': 573 min = 0; 574 max = RegExpTree::kInfinity; 575 Advance(); 576 break; 577 case '+': 578 min = 1; 579 max = RegExpTree::kInfinity; 580 Advance(); 581 break; 582 case '?': 583 min = 0; 584 max = 1; 585 Advance(); 586 break; 587 case '{': 588 if (ParseIntervalQuantifier(&min, &max)) { 589 if (max < min) { 590 return ReportError( 591 CStrVector("numbers out of order in {} quantifier")); 592 } 593 break; 594 } else if (unicode()) { 595 // With /u, incomplete quantifiers are not allowed. 596 return ReportError(CStrVector("Incomplete quantifier")); 597 } 598 continue; 599 default: 600 continue; 601 } 602 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 603 if (current() == '?') { 604 quantifier_type = RegExpQuantifier::NON_GREEDY; 605 Advance(); 606 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 607 // FLAG_regexp_possessive_quantifier is a debug-only flag. 608 quantifier_type = RegExpQuantifier::POSSESSIVE; 609 Advance(); 610 } 611 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { 612 return ReportError(CStrVector("Invalid quantifier")); 613 } 614 } 615 } 616 617 618 #ifdef DEBUG 619 // Currently only used in an DCHECK. 620 static bool IsSpecialClassEscape(uc32 c) { 621 switch (c) { 622 case 'd': 623 case 'D': 624 case 's': 625 case 'S': 626 case 'w': 627 case 'W': 628 return true; 629 default: 630 return false; 631 } 632 } 633 #endif 634 635 636 // In order to know whether an escape is a backreference or not we have to scan 637 // the entire regexp and find the number of capturing parentheses. However we 638 // don't want to scan the regexp twice unless it is necessary. This mini-parser 639 // is called when needed. It can see the difference between capturing and 640 // noncapturing parentheses and can skip character classes and backslash-escaped 641 // characters. 642 void RegExpParser::ScanForCaptures() { 643 // Start with captures started previous to current position 644 int capture_count = captures_started(); 645 // Add count of captures after this position. 646 int n; 647 while ((n = current()) != kEndMarker) { 648 Advance(); 649 switch (n) { 650 case '\\': 651 Advance(); 652 break; 653 case '[': { 654 int c; 655 while ((c = current()) != kEndMarker) { 656 Advance(); 657 if (c == '\\') { 658 Advance(); 659 } else { 660 if (c == ']') break; 661 } 662 } 663 break; 664 } 665 case '(': 666 if (current() != '?') capture_count++; 667 break; 668 } 669 } 670 capture_count_ = capture_count; 671 is_scanned_for_captures_ = true; 672 } 673 674 675 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { 676 DCHECK_EQ('\\', current()); 677 DCHECK('1' <= Next() && Next() <= '9'); 678 // Try to parse a decimal literal that is no greater than the total number 679 // of left capturing parentheses in the input. 680 int start = position(); 681 int value = Next() - '0'; 682 Advance(2); 683 while (true) { 684 uc32 c = current(); 685 if (IsDecimalDigit(c)) { 686 value = 10 * value + (c - '0'); 687 if (value > kMaxCaptures) { 688 Reset(start); 689 return false; 690 } 691 Advance(); 692 } else { 693 break; 694 } 695 } 696 if (value > captures_started()) { 697 if (!is_scanned_for_captures_) { 698 int saved_position = position(); 699 ScanForCaptures(); 700 Reset(saved_position); 701 } 702 if (value > capture_count_) { 703 Reset(start); 704 return false; 705 } 706 } 707 *index_out = value; 708 return true; 709 } 710 711 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { 712 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 713 v->push_back(code_unit); 714 } else { 715 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 716 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 717 } 718 } 719 720 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { 721 DCHECK(FLAG_harmony_regexp_named_captures); 722 DCHECK(unicode()); 723 724 ZoneVector<uc16>* name = 725 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); 726 727 bool at_start = true; 728 while (true) { 729 uc32 c = current(); 730 Advance(); 731 732 // Convert unicode escapes. 733 if (c == '\\' && current() == 'u') { 734 Advance(); 735 if (!ParseUnicodeEscape(&c)) { 736 ReportError(CStrVector("Invalid Unicode escape sequence")); 737 return nullptr; 738 } 739 } 740 741 if (at_start) { 742 if (!IdentifierStart::Is(c)) { 743 ReportError(CStrVector("Invalid capture group name")); 744 return nullptr; 745 } 746 push_code_unit(name, c); 747 at_start = false; 748 } else { 749 if (c == '>') { 750 break; 751 } else if (IdentifierPart::Is(c)) { 752 push_code_unit(name, c); 753 } else { 754 ReportError(CStrVector("Invalid capture group name")); 755 return nullptr; 756 } 757 } 758 } 759 760 return name; 761 } 762 763 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, 764 int index) { 765 DCHECK(FLAG_harmony_regexp_named_captures); 766 DCHECK(unicode()); 767 DCHECK(0 < index && index <= captures_started_); 768 DCHECK_NOT_NULL(name); 769 770 if (named_captures_ == nullptr) { 771 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); 772 } else { 773 // Check for duplicates and bail if we find any. 774 for (const auto& named_capture : *named_captures_) { 775 if (*named_capture->name() == *name) { 776 ReportError(CStrVector("Duplicate capture group name")); 777 return false; 778 } 779 } 780 } 781 782 RegExpCapture* capture = GetCapture(index); 783 DCHECK(capture->name() == nullptr); 784 785 capture->set_name(name); 786 named_captures_->Add(capture, zone()); 787 788 return true; 789 } 790 791 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, 792 RegExpParserState* state) { 793 // The parser is assumed to be on the '<' in \k<name>. 794 if (current() != '<') { 795 ReportError(CStrVector("Invalid named reference")); 796 return false; 797 } 798 799 Advance(); 800 const ZoneVector<uc16>* name = ParseCaptureGroupName(); 801 if (name == nullptr) { 802 return false; 803 } 804 805 if (state->IsInsideCaptureGroup(name)) { 806 builder->AddEmpty(); 807 } else { 808 RegExpBackReference* atom = new (zone()) RegExpBackReference(); 809 atom->set_name(name); 810 811 builder->AddAtom(atom); 812 813 if (named_back_references_ == nullptr) { 814 named_back_references_ = 815 new (zone()) ZoneList<RegExpBackReference*>(1, zone()); 816 } 817 named_back_references_->Add(atom, zone()); 818 } 819 820 return true; 821 } 822 823 void RegExpParser::PatchNamedBackReferences() { 824 if (named_back_references_ == nullptr) return; 825 826 if (named_captures_ == nullptr) { 827 ReportError(CStrVector("Invalid named capture referenced")); 828 return; 829 } 830 831 // Look up and patch the actual capture for each named back reference. 832 // TODO(jgruber): O(n^2), optimize if necessary. 833 834 for (int i = 0; i < named_back_references_->length(); i++) { 835 RegExpBackReference* ref = named_back_references_->at(i); 836 837 int index = -1; 838 for (const auto& capture : *named_captures_) { 839 if (*capture->name() == *ref->name()) { 840 index = capture->index(); 841 break; 842 } 843 } 844 845 if (index == -1) { 846 ReportError(CStrVector("Invalid named capture referenced")); 847 return; 848 } 849 850 ref->set_capture(GetCapture(index)); 851 } 852 } 853 854 RegExpCapture* RegExpParser::GetCapture(int index) { 855 // The index for the capture groups are one-based. Its index in the list is 856 // zero-based. 857 int know_captures = 858 is_scanned_for_captures_ ? capture_count_ : captures_started_; 859 DCHECK(index <= know_captures); 860 if (captures_ == NULL) { 861 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); 862 } 863 while (captures_->length() < know_captures) { 864 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); 865 } 866 return captures_->at(index - 1); 867 } 868 869 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { 870 if (named_captures_ == nullptr || named_captures_->is_empty()) 871 return Handle<FixedArray>(); 872 873 Factory* factory = isolate()->factory(); 874 875 int len = named_captures_->length() * 2; 876 Handle<FixedArray> array = factory->NewFixedArray(len); 877 878 for (int i = 0; i < named_captures_->length(); i++) { 879 RegExpCapture* capture = named_captures_->at(i); 880 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); 881 array->set(i * 2, *name.ToHandleChecked()); 882 array->set(i * 2 + 1, Smi::FromInt(capture->index())); 883 } 884 885 return array; 886 } 887 888 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { 889 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 890 if (s->group_type() != CAPTURE) continue; 891 // Return true if we found the matching capture index. 892 if (index == s->capture_index()) return true; 893 // Abort if index is larger than what has been parsed up till this state. 894 if (index > s->capture_index()) return false; 895 } 896 return false; 897 } 898 899 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( 900 const ZoneVector<uc16>* name) { 901 DCHECK_NOT_NULL(name); 902 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 903 if (s->capture_name() == nullptr) continue; 904 if (*s->capture_name() == *name) return true; 905 } 906 return false; 907 } 908 909 // QuantifierPrefix :: 910 // { DecimalDigits } 911 // { DecimalDigits , } 912 // { DecimalDigits , DecimalDigits } 913 // 914 // Returns true if parsing succeeds, and set the min_out and max_out 915 // values. Values are truncated to RegExpTree::kInfinity if they overflow. 916 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { 917 DCHECK_EQ(current(), '{'); 918 int start = position(); 919 Advance(); 920 int min = 0; 921 if (!IsDecimalDigit(current())) { 922 Reset(start); 923 return false; 924 } 925 while (IsDecimalDigit(current())) { 926 int next = current() - '0'; 927 if (min > (RegExpTree::kInfinity - next) / 10) { 928 // Overflow. Skip past remaining decimal digits and return -1. 929 do { 930 Advance(); 931 } while (IsDecimalDigit(current())); 932 min = RegExpTree::kInfinity; 933 break; 934 } 935 min = 10 * min + next; 936 Advance(); 937 } 938 int max = 0; 939 if (current() == '}') { 940 max = min; 941 Advance(); 942 } else if (current() == ',') { 943 Advance(); 944 if (current() == '}') { 945 max = RegExpTree::kInfinity; 946 Advance(); 947 } else { 948 while (IsDecimalDigit(current())) { 949 int next = current() - '0'; 950 if (max > (RegExpTree::kInfinity - next) / 10) { 951 do { 952 Advance(); 953 } while (IsDecimalDigit(current())); 954 max = RegExpTree::kInfinity; 955 break; 956 } 957 max = 10 * max + next; 958 Advance(); 959 } 960 if (current() != '}') { 961 Reset(start); 962 return false; 963 } 964 Advance(); 965 } 966 } else { 967 Reset(start); 968 return false; 969 } 970 *min_out = min; 971 *max_out = max; 972 return true; 973 } 974 975 976 uc32 RegExpParser::ParseOctalLiteral() { 977 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); 978 // For compatibility with some other browsers (not all), we parse 979 // up to three octal digits with a value below 256. 980 uc32 value = current() - '0'; 981 Advance(); 982 if ('0' <= current() && current() <= '7') { 983 value = value * 8 + current() - '0'; 984 Advance(); 985 if (value < 32 && '0' <= current() && current() <= '7') { 986 value = value * 8 + current() - '0'; 987 Advance(); 988 } 989 } 990 return value; 991 } 992 993 994 bool RegExpParser::ParseHexEscape(int length, uc32* value) { 995 int start = position(); 996 uc32 val = 0; 997 for (int i = 0; i < length; ++i) { 998 uc32 c = current(); 999 int d = HexValue(c); 1000 if (d < 0) { 1001 Reset(start); 1002 return false; 1003 } 1004 val = val * 16 + d; 1005 Advance(); 1006 } 1007 *value = val; 1008 return true; 1009 } 1010 1011 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. 1012 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 1013 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 1014 // allowed). In the latter case, the number of hex digits between { } is 1015 // arbitrary. \ and u have already been read. 1016 if (current() == '{' && unicode()) { 1017 int start = position(); 1018 Advance(); 1019 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 1020 if (current() == '}') { 1021 Advance(); 1022 return true; 1023 } 1024 } 1025 Reset(start); 1026 return false; 1027 } 1028 // \u but no {, or \u{...} escapes not allowed. 1029 bool result = ParseHexEscape(4, value); 1030 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && 1031 current() == '\\') { 1032 // Attempt to read trail surrogate. 1033 int start = position(); 1034 if (Next() == 'u') { 1035 Advance(2); 1036 uc32 trail; 1037 if (ParseHexEscape(4, &trail) && 1038 unibrow::Utf16::IsTrailSurrogate(trail)) { 1039 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), 1040 static_cast<uc16>(trail)); 1041 return true; 1042 } 1043 } 1044 Reset(start); 1045 } 1046 return result; 1047 } 1048 1049 #ifdef V8_I18N_SUPPORT 1050 1051 namespace { 1052 1053 bool IsExactPropertyAlias(const char* property_name, UProperty property) { 1054 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); 1055 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true; 1056 for (int i = 0;; i++) { 1057 const char* long_name = u_getPropertyName( 1058 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1059 if (long_name == NULL) break; 1060 if (strcmp(property_name, long_name) == 0) return true; 1061 } 1062 return false; 1063 } 1064 1065 bool IsExactPropertyValueAlias(const char* property_value_name, 1066 UProperty property, int32_t property_value) { 1067 const char* short_name = 1068 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); 1069 if (short_name != NULL && strcmp(property_value_name, short_name) == 0) { 1070 return true; 1071 } 1072 for (int i = 0;; i++) { 1073 const char* long_name = u_getPropertyValueName( 1074 property, property_value, 1075 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1076 if (long_name == NULL) break; 1077 if (strcmp(property_value_name, long_name) == 0) return true; 1078 } 1079 return false; 1080 } 1081 1082 bool LookupPropertyValueName(UProperty property, 1083 const char* property_value_name, bool negate, 1084 ZoneList<CharacterRange>* result, Zone* zone) { 1085 int32_t property_value = 1086 u_getPropertyValueEnum(property, property_value_name); 1087 if (property_value == UCHAR_INVALID_CODE) return false; 1088 1089 // We require the property name to match exactly to one of the property value 1090 // aliases. However, u_getPropertyValueEnum uses loose matching. 1091 if (!IsExactPropertyValueAlias(property_value_name, property, 1092 property_value)) { 1093 return false; 1094 } 1095 1096 USet* set = uset_openEmpty(); 1097 UErrorCode ec = U_ZERO_ERROR; 1098 uset_applyIntPropertyValue(set, property, property_value, &ec); 1099 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set); 1100 1101 if (success) { 1102 uset_removeAllStrings(set); 1103 if (negate) uset_complement(set); 1104 int item_count = uset_getItemCount(set); 1105 int item_result = 0; 1106 for (int i = 0; i < item_count; i++) { 1107 uc32 start = 0; 1108 uc32 end = 0; 1109 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); 1110 result->Add(CharacterRange::Range(start, end), zone); 1111 } 1112 DCHECK_EQ(U_ZERO_ERROR, ec); 1113 DCHECK_EQ(0, item_result); 1114 } 1115 uset_close(set); 1116 return success; 1117 } 1118 1119 template <size_t N> 1120 inline bool NameEquals(const char* name, const char (&literal)[N]) { 1121 return strncmp(name, literal, N + 1) == 0; 1122 } 1123 1124 bool LookupSpecialPropertyValueName(const char* name, 1125 ZoneList<CharacterRange>* result, 1126 bool negate, Zone* zone) { 1127 if (NameEquals(name, "Any")) { 1128 if (!negate) result->Add(CharacterRange::Everything(), zone); 1129 } else if (NameEquals(name, "ASCII")) { 1130 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) 1131 : CharacterRange::Range(0x0, 0x7f), 1132 zone); 1133 } else if (NameEquals(name, "Assigned")) { 1134 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", 1135 !negate, result, zone); 1136 } else { 1137 return false; 1138 } 1139 return true; 1140 } 1141 1142 } // anonymous namespace 1143 1144 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, 1145 bool negate) { 1146 // Parse the property class as follows: 1147 // - In \p{name}, 'name' is interpreted 1148 // - either as a general category property value name. 1149 // - or as a binary property name. 1150 // - In \p{name=value}, 'name' is interpreted as an enumerated property name, 1151 // and 'value' is interpreted as one of the available property value names. 1152 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. 1153 // - Loose matching is not applied. 1154 List<char> first_part; 1155 List<char> second_part; 1156 if (current() == '{') { 1157 // Parse \p{[PropertyName=]PropertyNameValue} 1158 for (Advance(); current() != '}' && current() != '='; Advance()) { 1159 if (!has_next()) return false; 1160 first_part.Add(static_cast<char>(current())); 1161 } 1162 if (current() == '=') { 1163 for (Advance(); current() != '}'; Advance()) { 1164 if (!has_next()) return false; 1165 second_part.Add(static_cast<char>(current())); 1166 } 1167 second_part.Add(0); // null-terminate string. 1168 } 1169 } else { 1170 return false; 1171 } 1172 Advance(); 1173 first_part.Add(0); // null-terminate string. 1174 1175 if (second_part.is_empty()) { 1176 // First attempt to interpret as general category property value name. 1177 const char* name = first_part.ToConstVector().start(); 1178 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, 1179 result, zone())) { 1180 return true; 1181 } 1182 // Interpret "Any", "ASCII", and "Assigned". 1183 if (LookupSpecialPropertyValueName(name, result, negate, zone())) { 1184 return true; 1185 } 1186 // Then attempt to interpret as binary property name with value name 'Y'. 1187 UProperty property = u_getPropertyEnum(name); 1188 if (property < UCHAR_BINARY_START) return false; 1189 if (property >= UCHAR_BINARY_LIMIT) return false; 1190 if (!IsExactPropertyAlias(name, property)) return false; 1191 return LookupPropertyValueName(property, negate ? "N" : "Y", false, result, 1192 zone()); 1193 } else { 1194 // Both property name and value name are specified. Attempt to interpret 1195 // the property name as enumerated property. 1196 const char* property_name = first_part.ToConstVector().start(); 1197 const char* value_name = second_part.ToConstVector().start(); 1198 UProperty property = u_getPropertyEnum(property_name); 1199 if (property < UCHAR_INT_START) return false; 1200 if (property >= UCHAR_INT_LIMIT) return false; 1201 if (!IsExactPropertyAlias(property_name, property)) return false; 1202 return LookupPropertyValueName(property, value_name, negate, result, 1203 zone()); 1204 } 1205 } 1206 1207 #else // V8_I18N_SUPPORT 1208 1209 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, 1210 bool negate) { 1211 return false; 1212 } 1213 1214 #endif // V8_I18N_SUPPORT 1215 1216 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 1217 uc32 x = 0; 1218 int d = HexValue(current()); 1219 if (d < 0) { 1220 return false; 1221 } 1222 while (d >= 0) { 1223 x = x * 16 + d; 1224 if (x > max_value) { 1225 return false; 1226 } 1227 Advance(); 1228 d = HexValue(current()); 1229 } 1230 *value = x; 1231 return true; 1232 } 1233 1234 1235 uc32 RegExpParser::ParseClassCharacterEscape() { 1236 DCHECK(current() == '\\'); 1237 DCHECK(has_next() && !IsSpecialClassEscape(Next())); 1238 Advance(); 1239 switch (current()) { 1240 case 'b': 1241 Advance(); 1242 return '\b'; 1243 // ControlEscape :: one of 1244 // f n r t v 1245 case 'f': 1246 Advance(); 1247 return '\f'; 1248 case 'n': 1249 Advance(); 1250 return '\n'; 1251 case 'r': 1252 Advance(); 1253 return '\r'; 1254 case 't': 1255 Advance(); 1256 return '\t'; 1257 case 'v': 1258 Advance(); 1259 return '\v'; 1260 case 'c': { 1261 uc32 controlLetter = Next(); 1262 uc32 letter = controlLetter & ~('A' ^ 'a'); 1263 // For compatibility with JSC, inside a character class. We also accept 1264 // digits and underscore as control characters, unless with /u. 1265 if (letter >= 'A' && letter <= 'Z') { 1266 Advance(2); 1267 // Control letters mapped to ASCII control characters in the range 1268 // 0x00-0x1f. 1269 return controlLetter & 0x1f; 1270 } 1271 if (unicode()) { 1272 // With /u, invalid escapes are not treated as identity escapes. 1273 ReportError(CStrVector("Invalid class escape")); 1274 return 0; 1275 } 1276 if ((controlLetter >= '0' && controlLetter <= '9') || 1277 controlLetter == '_') { 1278 Advance(2); 1279 return controlLetter & 0x1f; 1280 } 1281 // We match JSC in reading the backslash as a literal 1282 // character instead of as starting an escape. 1283 return '\\'; 1284 } 1285 case '0': 1286 // With /u, \0 is interpreted as NUL if not followed by another digit. 1287 if (unicode() && !(Next() >= '0' && Next() <= '9')) { 1288 Advance(); 1289 return 0; 1290 } 1291 // Fall through. 1292 case '1': 1293 case '2': 1294 case '3': 1295 case '4': 1296 case '5': 1297 case '6': 1298 case '7': 1299 // For compatibility, we interpret a decimal escape that isn't 1300 // a back reference (and therefore either \0 or not valid according 1301 // to the specification) as a 1..3 digit octal character code. 1302 if (unicode()) { 1303 // With /u, decimal escape is not interpreted as octal character code. 1304 ReportError(CStrVector("Invalid class escape")); 1305 return 0; 1306 } 1307 return ParseOctalLiteral(); 1308 case 'x': { 1309 Advance(); 1310 uc32 value; 1311 if (ParseHexEscape(2, &value)) return value; 1312 if (unicode()) { 1313 // With /u, invalid escapes are not treated as identity escapes. 1314 ReportError(CStrVector("Invalid escape")); 1315 return 0; 1316 } 1317 // If \x is not followed by a two-digit hexadecimal, treat it 1318 // as an identity escape. 1319 return 'x'; 1320 } 1321 case 'u': { 1322 Advance(); 1323 uc32 value; 1324 if (ParseUnicodeEscape(&value)) return value; 1325 if (unicode()) { 1326 // With /u, invalid escapes are not treated as identity escapes. 1327 ReportError(CStrVector("Invalid unicode escape")); 1328 return 0; 1329 } 1330 // If \u is not followed by a two-digit hexadecimal, treat it 1331 // as an identity escape. 1332 return 'u'; 1333 } 1334 default: { 1335 uc32 result = current(); 1336 // With /u, no identity escapes except for syntax characters and '-' are 1337 // allowed. Otherwise, all identity escapes are allowed. 1338 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { 1339 Advance(); 1340 return result; 1341 } 1342 ReportError(CStrVector("Invalid escape")); 1343 return 0; 1344 } 1345 } 1346 return 0; 1347 } 1348 1349 1350 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 1351 DCHECK_EQ(0, *char_class); 1352 uc32 first = current(); 1353 if (first == '\\') { 1354 switch (Next()) { 1355 case 'w': 1356 case 'W': 1357 case 'd': 1358 case 'D': 1359 case 's': 1360 case 'S': { 1361 *char_class = Next(); 1362 Advance(2); 1363 return CharacterRange::Singleton(0); // Return dummy value. 1364 } 1365 case kEndMarker: 1366 return ReportError(CStrVector("\\ at end of pattern")); 1367 default: 1368 first = ParseClassCharacterEscape(CHECK_FAILED); 1369 } 1370 } else { 1371 Advance(); 1372 } 1373 1374 return CharacterRange::Singleton(first); 1375 } 1376 1377 static const uc16 kNoCharClass = 0; 1378 1379 // Adds range or pre-defined character class to character ranges. 1380 // If char_class is not kInvalidClass, it's interpreted as a class 1381 // escape (i.e., 's' means whitespace, from '\s'). 1382 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 1383 uc16 char_class, CharacterRange range, 1384 Zone* zone) { 1385 if (char_class != kNoCharClass) { 1386 CharacterRange::AddClassEscape(char_class, ranges, zone); 1387 } else { 1388 ranges->Add(range, zone); 1389 } 1390 } 1391 1392 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) { 1393 if (!FLAG_harmony_regexp_property) return false; 1394 if (!unicode()) return false; 1395 if (current() != '\\') return false; 1396 uc32 next = Next(); 1397 bool parse_success = false; 1398 if (next == 'p') { 1399 Advance(2); 1400 parse_success = ParsePropertyClass(ranges, false); 1401 } else if (next == 'P') { 1402 Advance(2); 1403 parse_success = ParsePropertyClass(ranges, true); 1404 } else { 1405 return false; 1406 } 1407 if (!parse_success) 1408 ReportError(CStrVector("Invalid property name in character class")); 1409 return parse_success; 1410 } 1411 1412 RegExpTree* RegExpParser::ParseCharacterClass() { 1413 static const char* kUnterminated = "Unterminated character class"; 1414 static const char* kRangeInvalid = "Invalid character class"; 1415 static const char* kRangeOutOfOrder = "Range out of order in character class"; 1416 1417 DCHECK_EQ(current(), '['); 1418 Advance(); 1419 bool is_negated = false; 1420 if (current() == '^') { 1421 is_negated = true; 1422 Advance(); 1423 } 1424 ZoneList<CharacterRange>* ranges = 1425 new (zone()) ZoneList<CharacterRange>(2, zone()); 1426 while (has_more() && current() != ']') { 1427 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED); 1428 if (parsed_property) continue; 1429 uc16 char_class = kNoCharClass; 1430 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 1431 if (current() == '-') { 1432 Advance(); 1433 if (current() == kEndMarker) { 1434 // If we reach the end we break out of the loop and let the 1435 // following code report an error. 1436 break; 1437 } else if (current() == ']') { 1438 AddRangeOrEscape(ranges, char_class, first, zone()); 1439 ranges->Add(CharacterRange::Singleton('-'), zone()); 1440 break; 1441 } 1442 uc16 char_class_2 = kNoCharClass; 1443 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); 1444 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { 1445 // Either end is an escaped character class. Treat the '-' verbatim. 1446 if (unicode()) { 1447 // ES2015 21.2.2.15.1 step 1. 1448 return ReportError(CStrVector(kRangeInvalid)); 1449 } 1450 AddRangeOrEscape(ranges, char_class, first, zone()); 1451 ranges->Add(CharacterRange::Singleton('-'), zone()); 1452 AddRangeOrEscape(ranges, char_class_2, next, zone()); 1453 continue; 1454 } 1455 // ES2015 21.2.2.15.1 step 6. 1456 if (first.from() > next.to()) { 1457 return ReportError(CStrVector(kRangeOutOfOrder)); 1458 } 1459 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); 1460 } else { 1461 AddRangeOrEscape(ranges, char_class, first, zone()); 1462 } 1463 } 1464 if (!has_more()) { 1465 return ReportError(CStrVector(kUnterminated)); 1466 } 1467 Advance(); 1468 if (ranges->length() == 0) { 1469 ranges->Add(CharacterRange::Everything(), zone()); 1470 is_negated = !is_negated; 1471 } 1472 return new (zone()) RegExpCharacterClass(ranges, is_negated); 1473 } 1474 1475 1476 #undef CHECK_FAILED 1477 1478 1479 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, 1480 FlatStringReader* input, JSRegExp::Flags flags, 1481 RegExpCompileData* result) { 1482 DCHECK(result != NULL); 1483 RegExpParser parser(input, &result->error, flags, isolate, zone); 1484 RegExpTree* tree = parser.ParsePattern(); 1485 if (parser.failed()) { 1486 DCHECK(tree == NULL); 1487 DCHECK(!result->error.is_null()); 1488 } else { 1489 DCHECK(tree != NULL); 1490 DCHECK(result->error.is_null()); 1491 if (FLAG_trace_regexp_parser) { 1492 OFStream os(stdout); 1493 tree->Print(os, zone); 1494 os << "\n"; 1495 } 1496 result->tree = tree; 1497 int capture_count = parser.captures_started(); 1498 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1499 result->contains_anchor = parser.contains_anchor(); 1500 result->capture_name_map = parser.CreateCaptureNameMap(); 1501 result->capture_count = capture_count; 1502 } 1503 return !parser.failed(); 1504 } 1505 1506 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) 1507 : zone_(zone), 1508 pending_empty_(false), 1509 ignore_case_(ignore_case), 1510 unicode_(unicode), 1511 characters_(NULL), 1512 pending_surrogate_(kNoPendingSurrogate), 1513 terms_(), 1514 alternatives_() 1515 #ifdef DEBUG 1516 , 1517 last_added_(ADD_NONE) 1518 #endif 1519 { 1520 } 1521 1522 1523 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { 1524 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 1525 FlushPendingSurrogate(); 1526 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. 1527 pending_surrogate_ = lead_surrogate; 1528 } 1529 1530 1531 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { 1532 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); 1533 if (pending_surrogate_ != kNoPendingSurrogate) { 1534 uc16 lead_surrogate = pending_surrogate_; 1535 pending_surrogate_ = kNoPendingSurrogate; 1536 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 1537 uc32 combined = 1538 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); 1539 if (NeedsDesugaringForIgnoreCase(combined)) { 1540 AddCharacterClassForDesugaring(combined); 1541 } else { 1542 ZoneList<uc16> surrogate_pair(2, zone()); 1543 surrogate_pair.Add(lead_surrogate, zone()); 1544 surrogate_pair.Add(trail_surrogate, zone()); 1545 RegExpAtom* atom = 1546 new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); 1547 AddAtom(atom); 1548 } 1549 } else { 1550 pending_surrogate_ = trail_surrogate; 1551 FlushPendingSurrogate(); 1552 } 1553 } 1554 1555 1556 void RegExpBuilder::FlushPendingSurrogate() { 1557 if (pending_surrogate_ != kNoPendingSurrogate) { 1558 DCHECK(unicode()); 1559 uc32 c = pending_surrogate_; 1560 pending_surrogate_ = kNoPendingSurrogate; 1561 AddCharacterClassForDesugaring(c); 1562 } 1563 } 1564 1565 1566 void RegExpBuilder::FlushCharacters() { 1567 FlushPendingSurrogate(); 1568 pending_empty_ = false; 1569 if (characters_ != NULL) { 1570 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); 1571 characters_ = NULL; 1572 text_.Add(atom, zone()); 1573 LAST(ADD_ATOM); 1574 } 1575 } 1576 1577 1578 void RegExpBuilder::FlushText() { 1579 FlushCharacters(); 1580 int num_text = text_.length(); 1581 if (num_text == 0) { 1582 return; 1583 } else if (num_text == 1) { 1584 terms_.Add(text_.last(), zone()); 1585 } else { 1586 RegExpText* text = new (zone()) RegExpText(zone()); 1587 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); 1588 terms_.Add(text, zone()); 1589 } 1590 text_.Clear(); 1591 } 1592 1593 1594 void RegExpBuilder::AddCharacter(uc16 c) { 1595 FlushPendingSurrogate(); 1596 pending_empty_ = false; 1597 if (NeedsDesugaringForIgnoreCase(c)) { 1598 AddCharacterClassForDesugaring(c); 1599 } else { 1600 if (characters_ == NULL) { 1601 characters_ = new (zone()) ZoneList<uc16>(4, zone()); 1602 } 1603 characters_->Add(c, zone()); 1604 LAST(ADD_CHAR); 1605 } 1606 } 1607 1608 1609 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { 1610 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 1611 DCHECK(unicode()); 1612 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); 1613 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1614 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1615 AddLeadSurrogate(c); 1616 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 1617 AddTrailSurrogate(c); 1618 } else { 1619 AddCharacter(static_cast<uc16>(c)); 1620 } 1621 } 1622 1623 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { 1624 // A lead or trail surrogate parsed via escape sequence will not 1625 // pair up with any preceding lead or following trail surrogate. 1626 FlushPendingSurrogate(); 1627 AddUnicodeCharacter(character); 1628 FlushPendingSurrogate(); 1629 } 1630 1631 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1632 1633 1634 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1635 if (NeedsDesugaringForUnicode(cc)) { 1636 // With /u, character class needs to be desugared, so it 1637 // must be a standalone term instead of being part of a RegExpText. 1638 AddTerm(cc); 1639 } else { 1640 AddAtom(cc); 1641 } 1642 } 1643 1644 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { 1645 AddTerm(new (zone()) RegExpCharacterClass( 1646 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); 1647 } 1648 1649 1650 void RegExpBuilder::AddAtom(RegExpTree* term) { 1651 if (term->IsEmpty()) { 1652 AddEmpty(); 1653 return; 1654 } 1655 if (term->IsTextElement()) { 1656 FlushCharacters(); 1657 text_.Add(term, zone()); 1658 } else { 1659 FlushText(); 1660 terms_.Add(term, zone()); 1661 } 1662 LAST(ADD_ATOM); 1663 } 1664 1665 1666 void RegExpBuilder::AddTerm(RegExpTree* term) { 1667 FlushText(); 1668 terms_.Add(term, zone()); 1669 LAST(ADD_ATOM); 1670 } 1671 1672 1673 void RegExpBuilder::AddAssertion(RegExpTree* assert) { 1674 FlushText(); 1675 terms_.Add(assert, zone()); 1676 LAST(ADD_ASSERT); 1677 } 1678 1679 1680 void RegExpBuilder::NewAlternative() { FlushTerms(); } 1681 1682 1683 void RegExpBuilder::FlushTerms() { 1684 FlushText(); 1685 int num_terms = terms_.length(); 1686 RegExpTree* alternative; 1687 if (num_terms == 0) { 1688 alternative = new (zone()) RegExpEmpty(); 1689 } else if (num_terms == 1) { 1690 alternative = terms_.last(); 1691 } else { 1692 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); 1693 } 1694 alternatives_.Add(alternative, zone()); 1695 terms_.Clear(); 1696 LAST(ADD_NONE); 1697 } 1698 1699 1700 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { 1701 if (!unicode()) return false; 1702 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not 1703 // necessarily mean that we need to desugar. It's probably nicer to have a 1704 // separate pass to figure out unicode desugarings. 1705 if (ignore_case()) return true; 1706 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 1707 CharacterRange::Canonicalize(ranges); 1708 for (int i = ranges->length() - 1; i >= 0; i--) { 1709 uc32 from = ranges->at(i).from(); 1710 uc32 to = ranges->at(i).to(); 1711 // Check for non-BMP characters. 1712 if (to >= kNonBmpStart) return true; 1713 // Check for lone surrogates. 1714 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; 1715 } 1716 return false; 1717 } 1718 1719 1720 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { 1721 #ifdef V8_I18N_SUPPORT 1722 if (unicode() && ignore_case()) { 1723 USet* set = uset_open(c, c); 1724 uset_closeOver(set, USET_CASE_INSENSITIVE); 1725 uset_removeAllStrings(set); 1726 bool result = uset_size(set) > 1; 1727 uset_close(set); 1728 return result; 1729 } 1730 // In the case where ICU is not included, we act as if the unicode flag is 1731 // not set, and do not desugar. 1732 #endif // V8_I18N_SUPPORT 1733 return false; 1734 } 1735 1736 1737 RegExpTree* RegExpBuilder::ToRegExp() { 1738 FlushTerms(); 1739 int num_alternatives = alternatives_.length(); 1740 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1741 if (num_alternatives == 1) return alternatives_.last(); 1742 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1743 } 1744 1745 bool RegExpBuilder::AddQuantifierToAtom( 1746 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1747 FlushPendingSurrogate(); 1748 if (pending_empty_) { 1749 pending_empty_ = false; 1750 return true; 1751 } 1752 RegExpTree* atom; 1753 if (characters_ != NULL) { 1754 DCHECK(last_added_ == ADD_CHAR); 1755 // Last atom was character. 1756 Vector<const uc16> char_vector = characters_->ToConstVector(); 1757 int num_chars = char_vector.length(); 1758 if (num_chars > 1) { 1759 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); 1760 text_.Add(new (zone()) RegExpAtom(prefix), zone()); 1761 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 1762 } 1763 characters_ = NULL; 1764 atom = new (zone()) RegExpAtom(char_vector); 1765 FlushText(); 1766 } else if (text_.length() > 0) { 1767 DCHECK(last_added_ == ADD_ATOM); 1768 atom = text_.RemoveLast(); 1769 FlushText(); 1770 } else if (terms_.length() > 0) { 1771 DCHECK(last_added_ == ADD_ATOM); 1772 atom = terms_.RemoveLast(); 1773 // With /u, lookarounds are not quantifiable. 1774 if (unicode() && atom->IsLookaround()) return false; 1775 if (atom->max_match() == 0) { 1776 // Guaranteed to only match an empty string. 1777 LAST(ADD_TERM); 1778 if (min == 0) { 1779 return true; 1780 } 1781 terms_.Add(atom, zone()); 1782 return true; 1783 } 1784 } else { 1785 // Only call immediately after adding an atom or character! 1786 UNREACHABLE(); 1787 return false; 1788 } 1789 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1790 zone()); 1791 LAST(ADD_TERM); 1792 return true; 1793 } 1794 1795 } // namespace internal 1796 } // namespace v8 1797