1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/regexp-parser.h" 6 7 #include "src/char-predicates-inl.h" 8 #include "src/factory.h" 9 #include "src/isolate.h" 10 #include "src/objects-inl.h" 11 #include "src/ostreams.h" 12 #include "src/regexp/jsregexp.h" 13 #include "src/utils.h" 14 15 #ifdef V8_I18N_SUPPORT 16 #include "unicode/uniset.h" 17 #endif // V8_I18N_SUPPORT 18 19 namespace v8 { 20 namespace internal { 21 22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 23 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) 24 : isolate_(isolate), 25 zone_(zone), 26 error_(error), 27 captures_(NULL), 28 named_captures_(NULL), 29 named_back_references_(NULL), 30 in_(in), 31 current_(kEndMarker), 32 ignore_case_(flags & JSRegExp::kIgnoreCase), 33 multiline_(flags & JSRegExp::kMultiline), 34 unicode_(flags & JSRegExp::kUnicode), 35 next_pos_(0), 36 captures_started_(0), 37 capture_count_(0), 38 has_more_(true), 39 simple_(false), 40 contains_anchor_(false), 41 is_scanned_for_captures_(false), 42 failed_(false) { 43 Advance(); 44 } 45 46 template <bool update_position> 47 inline uc32 RegExpParser::ReadNext() { 48 int position = next_pos_; 49 uc32 c0 = in()->Get(position); 50 position++; 51 // Read the whole surrogate pair in case of unicode flag, if possible. 52 if (unicode() && position < in()->length() && 53 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { 54 uc16 c1 = in()->Get(position); 55 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 56 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); 57 position++; 58 } 59 } 60 if (update_position) next_pos_ = position; 61 return c0; 62 } 63 64 65 uc32 RegExpParser::Next() { 66 if (has_next()) { 67 return ReadNext<false>(); 68 } else { 69 return kEndMarker; 70 } 71 } 72 73 74 void RegExpParser::Advance() { 75 if (has_next()) { 76 StackLimitCheck check(isolate()); 77 if (check.HasOverflowed()) { 78 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); 79 ReportError(CStrVector( 80 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); 81 } else if (zone()->excess_allocation()) { 82 ReportError(CStrVector("Regular expression too large")); 83 } else { 84 current_ = ReadNext<true>(); 85 } 86 } else { 87 current_ = kEndMarker; 88 // Advance so that position() points to 1-after-the-last-character. This is 89 // important so that Reset() to this position works correctly. 90 next_pos_ = in()->length() + 1; 91 has_more_ = false; 92 } 93 } 94 95 96 void RegExpParser::Reset(int pos) { 97 next_pos_ = pos; 98 has_more_ = (pos < in()->length()); 99 Advance(); 100 } 101 102 103 void RegExpParser::Advance(int dist) { 104 next_pos_ += dist - 1; 105 Advance(); 106 } 107 108 109 bool RegExpParser::simple() { return simple_; } 110 111 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { 112 switch (c) { 113 case '^': 114 case '$': 115 case '\\': 116 case '.': 117 case '*': 118 case '+': 119 case '?': 120 case '(': 121 case ')': 122 case '[': 123 case ']': 124 case '{': 125 case '}': 126 case '|': 127 case '/': 128 return true; 129 default: 130 break; 131 } 132 return false; 133 } 134 135 136 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 137 if (failed_) return NULL; // Do not overwrite any existing error. 138 failed_ = true; 139 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 140 // Zip to the end to make sure the no more input is read. 141 current_ = kEndMarker; 142 next_pos_ = in()->length(); 143 return NULL; 144 } 145 146 147 #define CHECK_FAILED /**/); \ 148 if (failed_) return NULL; \ 149 ((void)0 150 151 152 // Pattern :: 153 // Disjunction 154 RegExpTree* RegExpParser::ParsePattern() { 155 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 156 PatchNamedBackReferences(CHECK_FAILED); 157 DCHECK(!has_more()); 158 // If the result of parsing is a literal string atom, and it has the 159 // same length as the input, then the atom is identical to the input. 160 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { 161 simple_ = true; 162 } 163 return result; 164 } 165 166 167 // Disjunction :: 168 // Alternative 169 // Alternative | Disjunction 170 // Alternative :: 171 // [empty] 172 // Term Alternative 173 // Term :: 174 // Assertion 175 // Atom 176 // Atom Quantifier 177 RegExpTree* RegExpParser::ParseDisjunction() { 178 // Used to store current state while parsing subexpressions. 179 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 180 nullptr, ignore_case(), unicode(), zone()); 181 RegExpParserState* state = &initial_state; 182 // Cache the builder in a local variable for quick access. 183 RegExpBuilder* builder = initial_state.builder(); 184 while (true) { 185 switch (current()) { 186 case kEndMarker: 187 if (state->IsSubexpression()) { 188 // Inside a parenthesized group when hitting end of input. 189 return ReportError(CStrVector("Unterminated group")); 190 } 191 DCHECK_EQ(INITIAL, state->group_type()); 192 // Parsing completed successfully. 193 return builder->ToRegExp(); 194 case ')': { 195 if (!state->IsSubexpression()) { 196 return ReportError(CStrVector("Unmatched ')'")); 197 } 198 DCHECK_NE(INITIAL, state->group_type()); 199 200 Advance(); 201 // End disjunction parsing and convert builder content to new single 202 // regexp atom. 203 RegExpTree* body = builder->ToRegExp(); 204 205 int end_capture_index = captures_started(); 206 207 int capture_index = state->capture_index(); 208 SubexpressionType group_type = state->group_type(); 209 210 // Build result of subexpression. 211 if (group_type == CAPTURE) { 212 if (state->IsNamedCapture()) { 213 CreateNamedCaptureAtIndex(state->capture_name(), 214 capture_index CHECK_FAILED); 215 } 216 RegExpCapture* capture = GetCapture(capture_index); 217 capture->set_body(body); 218 body = capture; 219 } else if (group_type == GROUPING) { 220 body = new (zone()) RegExpGroup(body); 221 } else { 222 DCHECK(group_type == POSITIVE_LOOKAROUND || 223 group_type == NEGATIVE_LOOKAROUND); 224 bool is_positive = (group_type == POSITIVE_LOOKAROUND); 225 body = new (zone()) RegExpLookaround( 226 body, is_positive, end_capture_index - capture_index, 227 capture_index, state->lookaround_type()); 228 } 229 230 // Restore previous state. 231 state = state->previous_state(); 232 builder = state->builder(); 233 234 builder->AddAtom(body); 235 // For compatability with JSC and ES3, we allow quantifiers after 236 // lookaheads, and break in all cases. 237 break; 238 } 239 case '|': { 240 Advance(); 241 builder->NewAlternative(); 242 continue; 243 } 244 case '*': 245 case '+': 246 case '?': 247 return ReportError(CStrVector("Nothing to repeat")); 248 case '^': { 249 Advance(); 250 if (multiline()) { 251 builder->AddAssertion( 252 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); 253 } else { 254 builder->AddAssertion( 255 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); 256 set_contains_anchor(); 257 } 258 continue; 259 } 260 case '$': { 261 Advance(); 262 RegExpAssertion::AssertionType assertion_type = 263 multiline() ? RegExpAssertion::END_OF_LINE 264 : RegExpAssertion::END_OF_INPUT; 265 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); 266 continue; 267 } 268 case '.': { 269 Advance(); 270 // everything except \x0a, \x0d, \u2028 and \u2029 271 ZoneList<CharacterRange>* ranges = 272 new (zone()) ZoneList<CharacterRange>(2, zone()); 273 CharacterRange::AddClassEscape('.', ranges, zone()); 274 RegExpCharacterClass* cc = 275 new (zone()) RegExpCharacterClass(ranges, false); 276 builder->AddCharacterClass(cc); 277 break; 278 } 279 case '(': { 280 SubexpressionType subexpr_type = CAPTURE; 281 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 282 bool is_named_capture = false; 283 Advance(); 284 if (current() == '?') { 285 switch (Next()) { 286 case ':': 287 subexpr_type = GROUPING; 288 Advance(2); 289 break; 290 case '=': 291 lookaround_type = RegExpLookaround::LOOKAHEAD; 292 subexpr_type = POSITIVE_LOOKAROUND; 293 Advance(2); 294 break; 295 case '!': 296 lookaround_type = RegExpLookaround::LOOKAHEAD; 297 subexpr_type = NEGATIVE_LOOKAROUND; 298 Advance(2); 299 break; 300 case '<': 301 Advance(); 302 if (FLAG_harmony_regexp_lookbehind) { 303 if (Next() == '=') { 304 subexpr_type = POSITIVE_LOOKAROUND; 305 lookaround_type = RegExpLookaround::LOOKBEHIND; 306 Advance(2); 307 break; 308 } else if (Next() == '!') { 309 subexpr_type = NEGATIVE_LOOKAROUND; 310 lookaround_type = RegExpLookaround::LOOKBEHIND; 311 Advance(2); 312 break; 313 } 314 } 315 if (FLAG_harmony_regexp_named_captures && unicode()) { 316 is_named_capture = true; 317 Advance(); 318 break; 319 } 320 // Fall through. 321 default: 322 return ReportError(CStrVector("Invalid group")); 323 } 324 } 325 326 const ZoneVector<uc16>* capture_name = nullptr; 327 if (subexpr_type == CAPTURE) { 328 if (captures_started_ >= kMaxCaptures) { 329 return ReportError(CStrVector("Too many captures")); 330 } 331 captures_started_++; 332 333 if (is_named_capture) { 334 capture_name = ParseCaptureGroupName(CHECK_FAILED); 335 } 336 } 337 // Store current state and begin new disjunction parsing. 338 state = new (zone()) RegExpParserState( 339 state, subexpr_type, lookaround_type, captures_started_, 340 capture_name, ignore_case(), unicode(), zone()); 341 builder = state->builder(); 342 continue; 343 } 344 case '[': { 345 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); 346 builder->AddCharacterClass(cc->AsCharacterClass()); 347 break; 348 } 349 // Atom :: 350 // \ AtomEscape 351 case '\\': 352 switch (Next()) { 353 case kEndMarker: 354 return ReportError(CStrVector("\\ at end of pattern")); 355 case 'b': 356 Advance(2); 357 builder->AddAssertion( 358 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); 359 continue; 360 case 'B': 361 Advance(2); 362 builder->AddAssertion( 363 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); 364 continue; 365 // AtomEscape :: 366 // CharacterClassEscape 367 // 368 // CharacterClassEscape :: one of 369 // d D s S w W 370 case 'd': 371 case 'D': 372 case 's': 373 case 'S': 374 case 'w': 375 case 'W': { 376 uc32 c = Next(); 377 Advance(2); 378 ZoneList<CharacterRange>* ranges = 379 new (zone()) ZoneList<CharacterRange>(2, zone()); 380 CharacterRange::AddClassEscape(c, ranges, zone()); 381 RegExpCharacterClass* cc = 382 new (zone()) RegExpCharacterClass(ranges, false); 383 builder->AddCharacterClass(cc); 384 break; 385 } 386 case 'p': 387 case 'P': { 388 uc32 p = Next(); 389 Advance(2); 390 if (unicode()) { 391 if (FLAG_harmony_regexp_property) { 392 ZoneList<CharacterRange>* ranges = 393 new (zone()) ZoneList<CharacterRange>(2, zone()); 394 if (!ParsePropertyClass(ranges, p == 'P')) { 395 return ReportError(CStrVector("Invalid property name")); 396 } 397 RegExpCharacterClass* cc = 398 new (zone()) RegExpCharacterClass(ranges, false); 399 builder->AddCharacterClass(cc); 400 } else { 401 // With /u, no identity escapes except for syntax characters 402 // are allowed. Otherwise, all identity escapes are allowed. 403 return ReportError(CStrVector("Invalid escape")); 404 } 405 } else { 406 builder->AddCharacter(p); 407 } 408 break; 409 } 410 case '1': 411 case '2': 412 case '3': 413 case '4': 414 case '5': 415 case '6': 416 case '7': 417 case '8': 418 case '9': { 419 int index = 0; 420 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); 421 if (is_backref) { 422 if (state->IsInsideCaptureGroup(index)) { 423 // The back reference is inside the capture group it refers to. 424 // Nothing can possibly have been captured yet, so we use empty 425 // instead. This ensures that, when checking a back reference, 426 // the capture registers of the referenced capture are either 427 // both set or both cleared. 428 builder->AddEmpty(); 429 } else { 430 RegExpCapture* capture = GetCapture(index); 431 RegExpTree* atom = new (zone()) RegExpBackReference(capture); 432 builder->AddAtom(atom); 433 } 434 break; 435 } 436 // With /u, no identity escapes except for syntax characters 437 // are allowed. Otherwise, all identity escapes are allowed. 438 if (unicode()) { 439 return ReportError(CStrVector("Invalid escape")); 440 } 441 uc32 first_digit = Next(); 442 if (first_digit == '8' || first_digit == '9') { 443 builder->AddCharacter(first_digit); 444 Advance(2); 445 break; 446 } 447 } 448 // Fall through. 449 case '0': { 450 Advance(); 451 if (unicode() && Next() >= '0' && Next() <= '9') { 452 // With /u, decimal escape with leading 0 are not parsed as octal. 453 return ReportError(CStrVector("Invalid decimal escape")); 454 } 455 uc32 octal = ParseOctalLiteral(); 456 builder->AddCharacter(octal); 457 break; 458 } 459 // ControlEscape :: one of 460 // f n r t v 461 case 'f': 462 Advance(2); 463 builder->AddCharacter('\f'); 464 break; 465 case 'n': 466 Advance(2); 467 builder->AddCharacter('\n'); 468 break; 469 case 'r': 470 Advance(2); 471 builder->AddCharacter('\r'); 472 break; 473 case 't': 474 Advance(2); 475 builder->AddCharacter('\t'); 476 break; 477 case 'v': 478 Advance(2); 479 builder->AddCharacter('\v'); 480 break; 481 case 'c': { 482 Advance(); 483 uc32 controlLetter = Next(); 484 // Special case if it is an ASCII letter. 485 // Convert lower case letters to uppercase. 486 uc32 letter = controlLetter & ~('a' ^ 'A'); 487 if (letter < 'A' || 'Z' < letter) { 488 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. 489 // This is outside the specification. We match JSC in 490 // reading the backslash as a literal character instead 491 // of as starting an escape. 492 if (unicode()) { 493 // With /u, invalid escapes are not treated as identity escapes. 494 return ReportError(CStrVector("Invalid unicode escape")); 495 } 496 builder->AddCharacter('\\'); 497 } else { 498 Advance(2); 499 builder->AddCharacter(controlLetter & 0x1f); 500 } 501 break; 502 } 503 case 'x': { 504 Advance(2); 505 uc32 value; 506 if (ParseHexEscape(2, &value)) { 507 builder->AddCharacter(value); 508 } else if (!unicode()) { 509 builder->AddCharacter('x'); 510 } else { 511 // With /u, invalid escapes are not treated as identity escapes. 512 return ReportError(CStrVector("Invalid escape")); 513 } 514 break; 515 } 516 case 'u': { 517 Advance(2); 518 uc32 value; 519 if (ParseUnicodeEscape(&value)) { 520 builder->AddEscapedUnicodeCharacter(value); 521 } else if (!unicode()) { 522 builder->AddCharacter('u'); 523 } else { 524 // With /u, invalid escapes are not treated as identity escapes. 525 return ReportError(CStrVector("Invalid unicode escape")); 526 } 527 break; 528 } 529 case 'k': 530 if (FLAG_harmony_regexp_named_captures && unicode()) { 531 Advance(2); 532 ParseNamedBackReference(builder, state CHECK_FAILED); 533 break; 534 } 535 // Fall through. 536 default: 537 Advance(); 538 // With /u, no identity escapes except for syntax characters 539 // are allowed. Otherwise, all identity escapes are allowed. 540 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { 541 builder->AddCharacter(current()); 542 Advance(); 543 } else { 544 return ReportError(CStrVector("Invalid escape")); 545 } 546 break; 547 } 548 break; 549 case '{': { 550 int dummy; 551 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); 552 if (parsed) return ReportError(CStrVector("Nothing to repeat")); 553 // Fall through. 554 } 555 case '}': 556 case ']': 557 if (unicode()) { 558 return ReportError(CStrVector("Lone quantifier brackets")); 559 } 560 // Fall through. 561 default: 562 builder->AddUnicodeCharacter(current()); 563 Advance(); 564 break; 565 } // end switch(current()) 566 567 int min; 568 int max; 569 switch (current()) { 570 // QuantifierPrefix :: 571 // * 572 // + 573 // ? 574 // { 575 case '*': 576 min = 0; 577 max = RegExpTree::kInfinity; 578 Advance(); 579 break; 580 case '+': 581 min = 1; 582 max = RegExpTree::kInfinity; 583 Advance(); 584 break; 585 case '?': 586 min = 0; 587 max = 1; 588 Advance(); 589 break; 590 case '{': 591 if (ParseIntervalQuantifier(&min, &max)) { 592 if (max < min) { 593 return ReportError( 594 CStrVector("numbers out of order in {} quantifier")); 595 } 596 break; 597 } else if (unicode()) { 598 // With /u, incomplete quantifiers are not allowed. 599 return ReportError(CStrVector("Incomplete quantifier")); 600 } 601 continue; 602 default: 603 continue; 604 } 605 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 606 if (current() == '?') { 607 quantifier_type = RegExpQuantifier::NON_GREEDY; 608 Advance(); 609 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 610 // FLAG_regexp_possessive_quantifier is a debug-only flag. 611 quantifier_type = RegExpQuantifier::POSSESSIVE; 612 Advance(); 613 } 614 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { 615 return ReportError(CStrVector("Invalid quantifier")); 616 } 617 } 618 } 619 620 621 #ifdef DEBUG 622 // Currently only used in an DCHECK. 623 static bool IsSpecialClassEscape(uc32 c) { 624 switch (c) { 625 case 'd': 626 case 'D': 627 case 's': 628 case 'S': 629 case 'w': 630 case 'W': 631 return true; 632 default: 633 return false; 634 } 635 } 636 #endif 637 638 639 // In order to know whether an escape is a backreference or not we have to scan 640 // the entire regexp and find the number of capturing parentheses. However we 641 // don't want to scan the regexp twice unless it is necessary. This mini-parser 642 // is called when needed. It can see the difference between capturing and 643 // noncapturing parentheses and can skip character classes and backslash-escaped 644 // characters. 645 void RegExpParser::ScanForCaptures() { 646 // Start with captures started previous to current position 647 int capture_count = captures_started(); 648 // Add count of captures after this position. 649 int n; 650 while ((n = current()) != kEndMarker) { 651 Advance(); 652 switch (n) { 653 case '\\': 654 Advance(); 655 break; 656 case '[': { 657 int c; 658 while ((c = current()) != kEndMarker) { 659 Advance(); 660 if (c == '\\') { 661 Advance(); 662 } else { 663 if (c == ']') break; 664 } 665 } 666 break; 667 } 668 case '(': 669 if (current() != '?') capture_count++; 670 break; 671 } 672 } 673 capture_count_ = capture_count; 674 is_scanned_for_captures_ = true; 675 } 676 677 678 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { 679 DCHECK_EQ('\\', current()); 680 DCHECK('1' <= Next() && Next() <= '9'); 681 // Try to parse a decimal literal that is no greater than the total number 682 // of left capturing parentheses in the input. 683 int start = position(); 684 int value = Next() - '0'; 685 Advance(2); 686 while (true) { 687 uc32 c = current(); 688 if (IsDecimalDigit(c)) { 689 value = 10 * value + (c - '0'); 690 if (value > kMaxCaptures) { 691 Reset(start); 692 return false; 693 } 694 Advance(); 695 } else { 696 break; 697 } 698 } 699 if (value > captures_started()) { 700 if (!is_scanned_for_captures_) { 701 int saved_position = position(); 702 ScanForCaptures(); 703 Reset(saved_position); 704 } 705 if (value > capture_count_) { 706 Reset(start); 707 return false; 708 } 709 } 710 *index_out = value; 711 return true; 712 } 713 714 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { 715 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 716 v->push_back(code_unit); 717 } else { 718 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 719 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 720 } 721 } 722 723 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { 724 DCHECK(FLAG_harmony_regexp_named_captures); 725 DCHECK(unicode()); 726 727 ZoneVector<uc16>* name = 728 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); 729 730 bool at_start = true; 731 while (true) { 732 uc32 c = current(); 733 Advance(); 734 735 // Convert unicode escapes. 736 if (c == '\\' && current() == 'u') { 737 Advance(); 738 if (!ParseUnicodeEscape(&c)) { 739 ReportError(CStrVector("Invalid Unicode escape sequence")); 740 return nullptr; 741 } 742 } 743 744 if (at_start) { 745 if (!IdentifierStart::Is(c)) { 746 ReportError(CStrVector("Invalid capture group name")); 747 return nullptr; 748 } 749 push_code_unit(name, c); 750 at_start = false; 751 } else { 752 if (c == '>') { 753 break; 754 } else if (IdentifierPart::Is(c)) { 755 push_code_unit(name, c); 756 } else { 757 ReportError(CStrVector("Invalid capture group name")); 758 return nullptr; 759 } 760 } 761 } 762 763 return name; 764 } 765 766 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, 767 int index) { 768 DCHECK(FLAG_harmony_regexp_named_captures); 769 DCHECK(unicode()); 770 DCHECK(0 < index && index <= captures_started_); 771 DCHECK_NOT_NULL(name); 772 773 if (named_captures_ == nullptr) { 774 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); 775 } else { 776 // Check for duplicates and bail if we find any. 777 for (const auto& named_capture : *named_captures_) { 778 if (*named_capture->name() == *name) { 779 ReportError(CStrVector("Duplicate capture group name")); 780 return false; 781 } 782 } 783 } 784 785 RegExpCapture* capture = GetCapture(index); 786 DCHECK(capture->name() == nullptr); 787 788 capture->set_name(name); 789 named_captures_->Add(capture, zone()); 790 791 return true; 792 } 793 794 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, 795 RegExpParserState* state) { 796 // The parser is assumed to be on the '<' in \k<name>. 797 if (current() != '<') { 798 ReportError(CStrVector("Invalid named reference")); 799 return false; 800 } 801 802 Advance(); 803 const ZoneVector<uc16>* name = ParseCaptureGroupName(); 804 if (name == nullptr) { 805 return false; 806 } 807 808 if (state->IsInsideCaptureGroup(name)) { 809 builder->AddEmpty(); 810 } else { 811 RegExpBackReference* atom = new (zone()) RegExpBackReference(); 812 atom->set_name(name); 813 814 builder->AddAtom(atom); 815 816 if (named_back_references_ == nullptr) { 817 named_back_references_ = 818 new (zone()) ZoneList<RegExpBackReference*>(1, zone()); 819 } 820 named_back_references_->Add(atom, zone()); 821 } 822 823 return true; 824 } 825 826 void RegExpParser::PatchNamedBackReferences() { 827 if (named_back_references_ == nullptr) return; 828 829 if (named_captures_ == nullptr) { 830 ReportError(CStrVector("Invalid named capture referenced")); 831 return; 832 } 833 834 // Look up and patch the actual capture for each named back reference. 835 // TODO(jgruber): O(n^2), optimize if necessary. 836 837 for (int i = 0; i < named_back_references_->length(); i++) { 838 RegExpBackReference* ref = named_back_references_->at(i); 839 840 int index = -1; 841 for (const auto& capture : *named_captures_) { 842 if (*capture->name() == *ref->name()) { 843 index = capture->index(); 844 break; 845 } 846 } 847 848 if (index == -1) { 849 ReportError(CStrVector("Invalid named capture referenced")); 850 return; 851 } 852 853 ref->set_capture(GetCapture(index)); 854 } 855 } 856 857 RegExpCapture* RegExpParser::GetCapture(int index) { 858 // The index for the capture groups are one-based. Its index in the list is 859 // zero-based. 860 int know_captures = 861 is_scanned_for_captures_ ? capture_count_ : captures_started_; 862 DCHECK(index <= know_captures); 863 if (captures_ == NULL) { 864 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); 865 } 866 while (captures_->length() < know_captures) { 867 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); 868 } 869 return captures_->at(index - 1); 870 } 871 872 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { 873 if (named_captures_ == nullptr || named_captures_->is_empty()) 874 return Handle<FixedArray>(); 875 876 Factory* factory = isolate()->factory(); 877 878 int len = named_captures_->length() * 2; 879 Handle<FixedArray> array = factory->NewFixedArray(len); 880 881 for (int i = 0; i < named_captures_->length(); i++) { 882 RegExpCapture* capture = named_captures_->at(i); 883 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); 884 array->set(i * 2, *name.ToHandleChecked()); 885 array->set(i * 2 + 1, Smi::FromInt(capture->index())); 886 } 887 888 return array; 889 } 890 891 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { 892 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 893 if (s->group_type() != CAPTURE) continue; 894 // Return true if we found the matching capture index. 895 if (index == s->capture_index()) return true; 896 // Abort if index is larger than what has been parsed up till this state. 897 if (index > s->capture_index()) return false; 898 } 899 return false; 900 } 901 902 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( 903 const ZoneVector<uc16>* name) { 904 DCHECK_NOT_NULL(name); 905 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 906 if (s->capture_name() == nullptr) continue; 907 if (*s->capture_name() == *name) return true; 908 } 909 return false; 910 } 911 912 // QuantifierPrefix :: 913 // { DecimalDigits } 914 // { DecimalDigits , } 915 // { DecimalDigits , DecimalDigits } 916 // 917 // Returns true if parsing succeeds, and set the min_out and max_out 918 // values. Values are truncated to RegExpTree::kInfinity if they overflow. 919 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { 920 DCHECK_EQ(current(), '{'); 921 int start = position(); 922 Advance(); 923 int min = 0; 924 if (!IsDecimalDigit(current())) { 925 Reset(start); 926 return false; 927 } 928 while (IsDecimalDigit(current())) { 929 int next = current() - '0'; 930 if (min > (RegExpTree::kInfinity - next) / 10) { 931 // Overflow. Skip past remaining decimal digits and return -1. 932 do { 933 Advance(); 934 } while (IsDecimalDigit(current())); 935 min = RegExpTree::kInfinity; 936 break; 937 } 938 min = 10 * min + next; 939 Advance(); 940 } 941 int max = 0; 942 if (current() == '}') { 943 max = min; 944 Advance(); 945 } else if (current() == ',') { 946 Advance(); 947 if (current() == '}') { 948 max = RegExpTree::kInfinity; 949 Advance(); 950 } else { 951 while (IsDecimalDigit(current())) { 952 int next = current() - '0'; 953 if (max > (RegExpTree::kInfinity - next) / 10) { 954 do { 955 Advance(); 956 } while (IsDecimalDigit(current())); 957 max = RegExpTree::kInfinity; 958 break; 959 } 960 max = 10 * max + next; 961 Advance(); 962 } 963 if (current() != '}') { 964 Reset(start); 965 return false; 966 } 967 Advance(); 968 } 969 } else { 970 Reset(start); 971 return false; 972 } 973 *min_out = min; 974 *max_out = max; 975 return true; 976 } 977 978 979 uc32 RegExpParser::ParseOctalLiteral() { 980 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); 981 // For compatibility with some other browsers (not all), we parse 982 // up to three octal digits with a value below 256. 983 uc32 value = current() - '0'; 984 Advance(); 985 if ('0' <= current() && current() <= '7') { 986 value = value * 8 + current() - '0'; 987 Advance(); 988 if (value < 32 && '0' <= current() && current() <= '7') { 989 value = value * 8 + current() - '0'; 990 Advance(); 991 } 992 } 993 return value; 994 } 995 996 997 bool RegExpParser::ParseHexEscape(int length, uc32* value) { 998 int start = position(); 999 uc32 val = 0; 1000 for (int i = 0; i < length; ++i) { 1001 uc32 c = current(); 1002 int d = HexValue(c); 1003 if (d < 0) { 1004 Reset(start); 1005 return false; 1006 } 1007 val = val * 16 + d; 1008 Advance(); 1009 } 1010 *value = val; 1011 return true; 1012 } 1013 1014 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. 1015 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 1016 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 1017 // allowed). In the latter case, the number of hex digits between { } is 1018 // arbitrary. \ and u have already been read. 1019 if (current() == '{' && unicode()) { 1020 int start = position(); 1021 Advance(); 1022 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 1023 if (current() == '}') { 1024 Advance(); 1025 return true; 1026 } 1027 } 1028 Reset(start); 1029 return false; 1030 } 1031 // \u but no {, or \u{...} escapes not allowed. 1032 bool result = ParseHexEscape(4, value); 1033 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && 1034 current() == '\\') { 1035 // Attempt to read trail surrogate. 1036 int start = position(); 1037 if (Next() == 'u') { 1038 Advance(2); 1039 uc32 trail; 1040 if (ParseHexEscape(4, &trail) && 1041 unibrow::Utf16::IsTrailSurrogate(trail)) { 1042 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), 1043 static_cast<uc16>(trail)); 1044 return true; 1045 } 1046 } 1047 Reset(start); 1048 } 1049 return result; 1050 } 1051 1052 #ifdef V8_I18N_SUPPORT 1053 1054 namespace { 1055 1056 bool IsExactPropertyAlias(const char* property_name, UProperty property) { 1057 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); 1058 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true; 1059 for (int i = 0;; i++) { 1060 const char* long_name = u_getPropertyName( 1061 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1062 if (long_name == NULL) break; 1063 if (strcmp(property_name, long_name) == 0) return true; 1064 } 1065 return false; 1066 } 1067 1068 bool IsExactPropertyValueAlias(const char* property_value_name, 1069 UProperty property, int32_t property_value) { 1070 const char* short_name = 1071 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); 1072 if (short_name != NULL && strcmp(property_value_name, short_name) == 0) { 1073 return true; 1074 } 1075 for (int i = 0;; i++) { 1076 const char* long_name = u_getPropertyValueName( 1077 property, property_value, 1078 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1079 if (long_name == NULL) break; 1080 if (strcmp(property_value_name, long_name) == 0) return true; 1081 } 1082 return false; 1083 } 1084 1085 bool LookupPropertyValueName(UProperty property, 1086 const char* property_value_name, bool negate, 1087 ZoneList<CharacterRange>* result, Zone* zone) { 1088 UProperty property_for_lookup = property; 1089 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { 1090 // For the property Script_Extensions, we have to do the property value 1091 // name lookup as if the property is Script. 1092 property_for_lookup = UCHAR_SCRIPT; 1093 } 1094 int32_t property_value = 1095 u_getPropertyValueEnum(property_for_lookup, property_value_name); 1096 if (property_value == UCHAR_INVALID_CODE) return false; 1097 1098 // We require the property name to match exactly to one of the property value 1099 // aliases. However, u_getPropertyValueEnum uses loose matching. 1100 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, 1101 property_value)) { 1102 return false; 1103 } 1104 1105 UErrorCode ec = U_ZERO_ERROR; 1106 icu::UnicodeSet set; 1107 set.applyIntPropertyValue(property, property_value, ec); 1108 bool success = ec == U_ZERO_ERROR && !set.isEmpty(); 1109 1110 if (success) { 1111 set.removeAllStrings(); 1112 if (negate) set.complement(); 1113 for (int i = 0; i < set.getRangeCount(); i++) { 1114 result->Add( 1115 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 1116 zone); 1117 } 1118 } 1119 return success; 1120 } 1121 1122 template <size_t N> 1123 inline bool NameEquals(const char* name, const char (&literal)[N]) { 1124 return strncmp(name, literal, N + 1) == 0; 1125 } 1126 1127 bool LookupSpecialPropertyValueName(const char* name, 1128 ZoneList<CharacterRange>* result, 1129 bool negate, Zone* zone) { 1130 if (NameEquals(name, "Any")) { 1131 if (!negate) result->Add(CharacterRange::Everything(), zone); 1132 } else if (NameEquals(name, "ASCII")) { 1133 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) 1134 : CharacterRange::Range(0x0, 0x7f), 1135 zone); 1136 } else if (NameEquals(name, "Assigned")) { 1137 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", 1138 !negate, result, zone); 1139 } else { 1140 return false; 1141 } 1142 return true; 1143 } 1144 1145 } // anonymous namespace 1146 1147 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, 1148 bool negate) { 1149 // Parse the property class as follows: 1150 // - In \p{name}, 'name' is interpreted 1151 // - either as a general category property value name. 1152 // - or as a binary property name. 1153 // - In \p{name=value}, 'name' is interpreted as an enumerated property name, 1154 // and 'value' is interpreted as one of the available property value names. 1155 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. 1156 // - Loose matching is not applied. 1157 List<char> first_part; 1158 List<char> second_part; 1159 if (current() == '{') { 1160 // Parse \p{[PropertyName=]PropertyNameValue} 1161 for (Advance(); current() != '}' && current() != '='; Advance()) { 1162 if (!has_next()) return false; 1163 first_part.Add(static_cast<char>(current())); 1164 } 1165 if (current() == '=') { 1166 for (Advance(); current() != '}'; Advance()) { 1167 if (!has_next()) return false; 1168 second_part.Add(static_cast<char>(current())); 1169 } 1170 second_part.Add(0); // null-terminate string. 1171 } 1172 } else { 1173 return false; 1174 } 1175 Advance(); 1176 first_part.Add(0); // null-terminate string. 1177 1178 if (second_part.is_empty()) { 1179 // First attempt to interpret as general category property value name. 1180 const char* name = first_part.ToConstVector().start(); 1181 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, 1182 result, zone())) { 1183 return true; 1184 } 1185 // Interpret "Any", "ASCII", and "Assigned". 1186 if (LookupSpecialPropertyValueName(name, result, negate, zone())) { 1187 return true; 1188 } 1189 // Then attempt to interpret as binary property name with value name 'Y'. 1190 UProperty property = u_getPropertyEnum(name); 1191 if (property < UCHAR_BINARY_START) return false; 1192 if (property >= UCHAR_BINARY_LIMIT) return false; 1193 if (!IsExactPropertyAlias(name, property)) return false; 1194 return LookupPropertyValueName(property, negate ? "N" : "Y", false, result, 1195 zone()); 1196 } else { 1197 // Both property name and value name are specified. Attempt to interpret 1198 // the property name as enumerated property. 1199 const char* property_name = first_part.ToConstVector().start(); 1200 const char* value_name = second_part.ToConstVector().start(); 1201 UProperty property = u_getPropertyEnum(property_name); 1202 if (!IsExactPropertyAlias(property_name, property)) return false; 1203 if (property == UCHAR_GENERAL_CATEGORY) { 1204 // We want to allow aggregate value names such as "Letter". 1205 property = UCHAR_GENERAL_CATEGORY_MASK; 1206 } else if (property != UCHAR_SCRIPT && 1207 property != UCHAR_SCRIPT_EXTENSIONS) { 1208 return false; 1209 } 1210 return LookupPropertyValueName(property, value_name, negate, result, 1211 zone()); 1212 } 1213 } 1214 1215 #else // V8_I18N_SUPPORT 1216 1217 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, 1218 bool negate) { 1219 return false; 1220 } 1221 1222 #endif // V8_I18N_SUPPORT 1223 1224 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 1225 uc32 x = 0; 1226 int d = HexValue(current()); 1227 if (d < 0) { 1228 return false; 1229 } 1230 while (d >= 0) { 1231 x = x * 16 + d; 1232 if (x > max_value) { 1233 return false; 1234 } 1235 Advance(); 1236 d = HexValue(current()); 1237 } 1238 *value = x; 1239 return true; 1240 } 1241 1242 1243 uc32 RegExpParser::ParseClassCharacterEscape() { 1244 DCHECK(current() == '\\'); 1245 DCHECK(has_next() && !IsSpecialClassEscape(Next())); 1246 Advance(); 1247 switch (current()) { 1248 case 'b': 1249 Advance(); 1250 return '\b'; 1251 // ControlEscape :: one of 1252 // f n r t v 1253 case 'f': 1254 Advance(); 1255 return '\f'; 1256 case 'n': 1257 Advance(); 1258 return '\n'; 1259 case 'r': 1260 Advance(); 1261 return '\r'; 1262 case 't': 1263 Advance(); 1264 return '\t'; 1265 case 'v': 1266 Advance(); 1267 return '\v'; 1268 case 'c': { 1269 uc32 controlLetter = Next(); 1270 uc32 letter = controlLetter & ~('A' ^ 'a'); 1271 // For compatibility with JSC, inside a character class. We also accept 1272 // digits and underscore as control characters, unless with /u. 1273 if (letter >= 'A' && letter <= 'Z') { 1274 Advance(2); 1275 // Control letters mapped to ASCII control characters in the range 1276 // 0x00-0x1f. 1277 return controlLetter & 0x1f; 1278 } 1279 if (unicode()) { 1280 // With /u, invalid escapes are not treated as identity escapes. 1281 ReportError(CStrVector("Invalid class escape")); 1282 return 0; 1283 } 1284 if ((controlLetter >= '0' && controlLetter <= '9') || 1285 controlLetter == '_') { 1286 Advance(2); 1287 return controlLetter & 0x1f; 1288 } 1289 // We match JSC in reading the backslash as a literal 1290 // character instead of as starting an escape. 1291 return '\\'; 1292 } 1293 case '0': 1294 // With /u, \0 is interpreted as NUL if not followed by another digit. 1295 if (unicode() && !(Next() >= '0' && Next() <= '9')) { 1296 Advance(); 1297 return 0; 1298 } 1299 // Fall through. 1300 case '1': 1301 case '2': 1302 case '3': 1303 case '4': 1304 case '5': 1305 case '6': 1306 case '7': 1307 // For compatibility, we interpret a decimal escape that isn't 1308 // a back reference (and therefore either \0 or not valid according 1309 // to the specification) as a 1..3 digit octal character code. 1310 if (unicode()) { 1311 // With /u, decimal escape is not interpreted as octal character code. 1312 ReportError(CStrVector("Invalid class escape")); 1313 return 0; 1314 } 1315 return ParseOctalLiteral(); 1316 case 'x': { 1317 Advance(); 1318 uc32 value; 1319 if (ParseHexEscape(2, &value)) return value; 1320 if (unicode()) { 1321 // With /u, invalid escapes are not treated as identity escapes. 1322 ReportError(CStrVector("Invalid escape")); 1323 return 0; 1324 } 1325 // If \x is not followed by a two-digit hexadecimal, treat it 1326 // as an identity escape. 1327 return 'x'; 1328 } 1329 case 'u': { 1330 Advance(); 1331 uc32 value; 1332 if (ParseUnicodeEscape(&value)) return value; 1333 if (unicode()) { 1334 // With /u, invalid escapes are not treated as identity escapes. 1335 ReportError(CStrVector("Invalid unicode escape")); 1336 return 0; 1337 } 1338 // If \u is not followed by a two-digit hexadecimal, treat it 1339 // as an identity escape. 1340 return 'u'; 1341 } 1342 default: { 1343 uc32 result = current(); 1344 // With /u, no identity escapes except for syntax characters and '-' are 1345 // allowed. Otherwise, all identity escapes are allowed. 1346 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { 1347 Advance(); 1348 return result; 1349 } 1350 ReportError(CStrVector("Invalid escape")); 1351 return 0; 1352 } 1353 } 1354 return 0; 1355 } 1356 1357 1358 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 1359 DCHECK_EQ(0, *char_class); 1360 uc32 first = current(); 1361 if (first == '\\') { 1362 switch (Next()) { 1363 case 'w': 1364 case 'W': 1365 case 'd': 1366 case 'D': 1367 case 's': 1368 case 'S': { 1369 *char_class = Next(); 1370 Advance(2); 1371 return CharacterRange::Singleton(0); // Return dummy value. 1372 } 1373 case kEndMarker: 1374 return ReportError(CStrVector("\\ at end of pattern")); 1375 default: 1376 first = ParseClassCharacterEscape(CHECK_FAILED); 1377 } 1378 } else { 1379 Advance(); 1380 } 1381 1382 return CharacterRange::Singleton(first); 1383 } 1384 1385 static const uc16 kNoCharClass = 0; 1386 1387 // Adds range or pre-defined character class to character ranges. 1388 // If char_class is not kInvalidClass, it's interpreted as a class 1389 // escape (i.e., 's' means whitespace, from '\s'). 1390 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 1391 uc16 char_class, CharacterRange range, 1392 Zone* zone) { 1393 if (char_class != kNoCharClass) { 1394 CharacterRange::AddClassEscape(char_class, ranges, zone); 1395 } else { 1396 ranges->Add(range, zone); 1397 } 1398 } 1399 1400 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) { 1401 if (!FLAG_harmony_regexp_property) return false; 1402 if (!unicode()) return false; 1403 if (current() != '\\') return false; 1404 uc32 next = Next(); 1405 bool parse_success = false; 1406 if (next == 'p') { 1407 Advance(2); 1408 parse_success = ParsePropertyClass(ranges, false); 1409 } else if (next == 'P') { 1410 Advance(2); 1411 parse_success = ParsePropertyClass(ranges, true); 1412 } else { 1413 return false; 1414 } 1415 if (!parse_success) 1416 ReportError(CStrVector("Invalid property name in character class")); 1417 return parse_success; 1418 } 1419 1420 RegExpTree* RegExpParser::ParseCharacterClass() { 1421 static const char* kUnterminated = "Unterminated character class"; 1422 static const char* kRangeInvalid = "Invalid character class"; 1423 static const char* kRangeOutOfOrder = "Range out of order in character class"; 1424 1425 DCHECK_EQ(current(), '['); 1426 Advance(); 1427 bool is_negated = false; 1428 if (current() == '^') { 1429 is_negated = true; 1430 Advance(); 1431 } 1432 ZoneList<CharacterRange>* ranges = 1433 new (zone()) ZoneList<CharacterRange>(2, zone()); 1434 while (has_more() && current() != ']') { 1435 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED); 1436 if (parsed_property) continue; 1437 uc16 char_class = kNoCharClass; 1438 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 1439 if (current() == '-') { 1440 Advance(); 1441 if (current() == kEndMarker) { 1442 // If we reach the end we break out of the loop and let the 1443 // following code report an error. 1444 break; 1445 } else if (current() == ']') { 1446 AddRangeOrEscape(ranges, char_class, first, zone()); 1447 ranges->Add(CharacterRange::Singleton('-'), zone()); 1448 break; 1449 } 1450 uc16 char_class_2 = kNoCharClass; 1451 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); 1452 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { 1453 // Either end is an escaped character class. Treat the '-' verbatim. 1454 if (unicode()) { 1455 // ES2015 21.2.2.15.1 step 1. 1456 return ReportError(CStrVector(kRangeInvalid)); 1457 } 1458 AddRangeOrEscape(ranges, char_class, first, zone()); 1459 ranges->Add(CharacterRange::Singleton('-'), zone()); 1460 AddRangeOrEscape(ranges, char_class_2, next, zone()); 1461 continue; 1462 } 1463 // ES2015 21.2.2.15.1 step 6. 1464 if (first.from() > next.to()) { 1465 return ReportError(CStrVector(kRangeOutOfOrder)); 1466 } 1467 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); 1468 } else { 1469 AddRangeOrEscape(ranges, char_class, first, zone()); 1470 } 1471 } 1472 if (!has_more()) { 1473 return ReportError(CStrVector(kUnterminated)); 1474 } 1475 Advance(); 1476 if (ranges->length() == 0) { 1477 ranges->Add(CharacterRange::Everything(), zone()); 1478 is_negated = !is_negated; 1479 } 1480 return new (zone()) RegExpCharacterClass(ranges, is_negated); 1481 } 1482 1483 1484 #undef CHECK_FAILED 1485 1486 1487 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, 1488 FlatStringReader* input, JSRegExp::Flags flags, 1489 RegExpCompileData* result) { 1490 DCHECK(result != NULL); 1491 RegExpParser parser(input, &result->error, flags, isolate, zone); 1492 RegExpTree* tree = parser.ParsePattern(); 1493 if (parser.failed()) { 1494 DCHECK(tree == NULL); 1495 DCHECK(!result->error.is_null()); 1496 } else { 1497 DCHECK(tree != NULL); 1498 DCHECK(result->error.is_null()); 1499 if (FLAG_trace_regexp_parser) { 1500 OFStream os(stdout); 1501 tree->Print(os, zone); 1502 os << "\n"; 1503 } 1504 result->tree = tree; 1505 int capture_count = parser.captures_started(); 1506 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1507 result->contains_anchor = parser.contains_anchor(); 1508 result->capture_name_map = parser.CreateCaptureNameMap(); 1509 result->capture_count = capture_count; 1510 } 1511 return !parser.failed(); 1512 } 1513 1514 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) 1515 : zone_(zone), 1516 pending_empty_(false), 1517 ignore_case_(ignore_case), 1518 unicode_(unicode), 1519 characters_(NULL), 1520 pending_surrogate_(kNoPendingSurrogate), 1521 terms_(), 1522 alternatives_() 1523 #ifdef DEBUG 1524 , 1525 last_added_(ADD_NONE) 1526 #endif 1527 { 1528 } 1529 1530 1531 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { 1532 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 1533 FlushPendingSurrogate(); 1534 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. 1535 pending_surrogate_ = lead_surrogate; 1536 } 1537 1538 1539 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { 1540 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); 1541 if (pending_surrogate_ != kNoPendingSurrogate) { 1542 uc16 lead_surrogate = pending_surrogate_; 1543 pending_surrogate_ = kNoPendingSurrogate; 1544 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 1545 uc32 combined = 1546 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); 1547 if (NeedsDesugaringForIgnoreCase(combined)) { 1548 AddCharacterClassForDesugaring(combined); 1549 } else { 1550 ZoneList<uc16> surrogate_pair(2, zone()); 1551 surrogate_pair.Add(lead_surrogate, zone()); 1552 surrogate_pair.Add(trail_surrogate, zone()); 1553 RegExpAtom* atom = 1554 new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); 1555 AddAtom(atom); 1556 } 1557 } else { 1558 pending_surrogate_ = trail_surrogate; 1559 FlushPendingSurrogate(); 1560 } 1561 } 1562 1563 1564 void RegExpBuilder::FlushPendingSurrogate() { 1565 if (pending_surrogate_ != kNoPendingSurrogate) { 1566 DCHECK(unicode()); 1567 uc32 c = pending_surrogate_; 1568 pending_surrogate_ = kNoPendingSurrogate; 1569 AddCharacterClassForDesugaring(c); 1570 } 1571 } 1572 1573 1574 void RegExpBuilder::FlushCharacters() { 1575 FlushPendingSurrogate(); 1576 pending_empty_ = false; 1577 if (characters_ != NULL) { 1578 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); 1579 characters_ = NULL; 1580 text_.Add(atom, zone()); 1581 LAST(ADD_ATOM); 1582 } 1583 } 1584 1585 1586 void RegExpBuilder::FlushText() { 1587 FlushCharacters(); 1588 int num_text = text_.length(); 1589 if (num_text == 0) { 1590 return; 1591 } else if (num_text == 1) { 1592 terms_.Add(text_.last(), zone()); 1593 } else { 1594 RegExpText* text = new (zone()) RegExpText(zone()); 1595 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); 1596 terms_.Add(text, zone()); 1597 } 1598 text_.Clear(); 1599 } 1600 1601 1602 void RegExpBuilder::AddCharacter(uc16 c) { 1603 FlushPendingSurrogate(); 1604 pending_empty_ = false; 1605 if (NeedsDesugaringForIgnoreCase(c)) { 1606 AddCharacterClassForDesugaring(c); 1607 } else { 1608 if (characters_ == NULL) { 1609 characters_ = new (zone()) ZoneList<uc16>(4, zone()); 1610 } 1611 characters_->Add(c, zone()); 1612 LAST(ADD_CHAR); 1613 } 1614 } 1615 1616 1617 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { 1618 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 1619 DCHECK(unicode()); 1620 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); 1621 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1622 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1623 AddLeadSurrogate(c); 1624 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 1625 AddTrailSurrogate(c); 1626 } else { 1627 AddCharacter(static_cast<uc16>(c)); 1628 } 1629 } 1630 1631 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { 1632 // A lead or trail surrogate parsed via escape sequence will not 1633 // pair up with any preceding lead or following trail surrogate. 1634 FlushPendingSurrogate(); 1635 AddUnicodeCharacter(character); 1636 FlushPendingSurrogate(); 1637 } 1638 1639 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1640 1641 1642 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1643 if (NeedsDesugaringForUnicode(cc)) { 1644 // With /u, character class needs to be desugared, so it 1645 // must be a standalone term instead of being part of a RegExpText. 1646 AddTerm(cc); 1647 } else { 1648 AddAtom(cc); 1649 } 1650 } 1651 1652 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { 1653 AddTerm(new (zone()) RegExpCharacterClass( 1654 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); 1655 } 1656 1657 1658 void RegExpBuilder::AddAtom(RegExpTree* term) { 1659 if (term->IsEmpty()) { 1660 AddEmpty(); 1661 return; 1662 } 1663 if (term->IsTextElement()) { 1664 FlushCharacters(); 1665 text_.Add(term, zone()); 1666 } else { 1667 FlushText(); 1668 terms_.Add(term, zone()); 1669 } 1670 LAST(ADD_ATOM); 1671 } 1672 1673 1674 void RegExpBuilder::AddTerm(RegExpTree* term) { 1675 FlushText(); 1676 terms_.Add(term, zone()); 1677 LAST(ADD_ATOM); 1678 } 1679 1680 1681 void RegExpBuilder::AddAssertion(RegExpTree* assert) { 1682 FlushText(); 1683 terms_.Add(assert, zone()); 1684 LAST(ADD_ASSERT); 1685 } 1686 1687 1688 void RegExpBuilder::NewAlternative() { FlushTerms(); } 1689 1690 1691 void RegExpBuilder::FlushTerms() { 1692 FlushText(); 1693 int num_terms = terms_.length(); 1694 RegExpTree* alternative; 1695 if (num_terms == 0) { 1696 alternative = new (zone()) RegExpEmpty(); 1697 } else if (num_terms == 1) { 1698 alternative = terms_.last(); 1699 } else { 1700 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); 1701 } 1702 alternatives_.Add(alternative, zone()); 1703 terms_.Clear(); 1704 LAST(ADD_NONE); 1705 } 1706 1707 1708 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { 1709 if (!unicode()) return false; 1710 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not 1711 // necessarily mean that we need to desugar. It's probably nicer to have a 1712 // separate pass to figure out unicode desugarings. 1713 if (ignore_case()) return true; 1714 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 1715 CharacterRange::Canonicalize(ranges); 1716 for (int i = ranges->length() - 1; i >= 0; i--) { 1717 uc32 from = ranges->at(i).from(); 1718 uc32 to = ranges->at(i).to(); 1719 // Check for non-BMP characters. 1720 if (to >= kNonBmpStart) return true; 1721 // Check for lone surrogates. 1722 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; 1723 } 1724 return false; 1725 } 1726 1727 1728 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { 1729 #ifdef V8_I18N_SUPPORT 1730 if (unicode() && ignore_case()) { 1731 icu::UnicodeSet set(c, c); 1732 set.closeOver(USET_CASE_INSENSITIVE); 1733 set.removeAllStrings(); 1734 return set.size() > 1; 1735 } 1736 // In the case where ICU is not included, we act as if the unicode flag is 1737 // not set, and do not desugar. 1738 #endif // V8_I18N_SUPPORT 1739 return false; 1740 } 1741 1742 1743 RegExpTree* RegExpBuilder::ToRegExp() { 1744 FlushTerms(); 1745 int num_alternatives = alternatives_.length(); 1746 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1747 if (num_alternatives == 1) return alternatives_.last(); 1748 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1749 } 1750 1751 bool RegExpBuilder::AddQuantifierToAtom( 1752 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1753 FlushPendingSurrogate(); 1754 if (pending_empty_) { 1755 pending_empty_ = false; 1756 return true; 1757 } 1758 RegExpTree* atom; 1759 if (characters_ != NULL) { 1760 DCHECK(last_added_ == ADD_CHAR); 1761 // Last atom was character. 1762 Vector<const uc16> char_vector = characters_->ToConstVector(); 1763 int num_chars = char_vector.length(); 1764 if (num_chars > 1) { 1765 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); 1766 text_.Add(new (zone()) RegExpAtom(prefix), zone()); 1767 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 1768 } 1769 characters_ = NULL; 1770 atom = new (zone()) RegExpAtom(char_vector); 1771 FlushText(); 1772 } else if (text_.length() > 0) { 1773 DCHECK(last_added_ == ADD_ATOM); 1774 atom = text_.RemoveLast(); 1775 FlushText(); 1776 } else if (terms_.length() > 0) { 1777 DCHECK(last_added_ == ADD_ATOM); 1778 atom = terms_.RemoveLast(); 1779 // With /u, lookarounds are not quantifiable. 1780 if (unicode() && atom->IsLookaround()) return false; 1781 if (atom->max_match() == 0) { 1782 // Guaranteed to only match an empty string. 1783 LAST(ADD_TERM); 1784 if (min == 0) { 1785 return true; 1786 } 1787 terms_.Add(atom, zone()); 1788 return true; 1789 } 1790 } else { 1791 // Only call immediately after adding an atom or character! 1792 UNREACHABLE(); 1793 return false; 1794 } 1795 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1796 zone()); 1797 LAST(ADD_TERM); 1798 return true; 1799 } 1800 1801 } // namespace internal 1802 } // namespace v8 1803