1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_JSREGEXP_H_ 6 #define V8_REGEXP_JSREGEXP_H_ 7 8 #include "src/allocation.h" 9 #include "src/assembler.h" 10 #include "src/isolate.h" 11 #include "src/objects/js-regexp.h" 12 #include "src/regexp/regexp-ast.h" 13 #include "src/regexp/regexp-macro-assembler.h" 14 15 namespace v8 { 16 namespace internal { 17 18 class NodeVisitor; 19 class RegExpCompiler; 20 class RegExpMacroAssembler; 21 class RegExpNode; 22 class RegExpTree; 23 class BoyerMooreLookahead; 24 25 inline bool IgnoreCase(JSRegExp::Flags flags) { 26 return (flags & JSRegExp::kIgnoreCase) != 0; 27 } 28 29 inline bool IsUnicode(JSRegExp::Flags flags) { 30 return (flags & JSRegExp::kUnicode) != 0; 31 } 32 33 inline bool IsSticky(JSRegExp::Flags flags) { 34 return (flags & JSRegExp::kSticky) != 0; 35 } 36 37 inline bool IsGlobal(JSRegExp::Flags flags) { 38 return (flags & JSRegExp::kGlobal) != 0; 39 } 40 41 inline bool DotAll(JSRegExp::Flags flags) { 42 return (flags & JSRegExp::kDotAll) != 0; 43 } 44 45 inline bool Multiline(JSRegExp::Flags flags) { 46 return (flags & JSRegExp::kMultiline) != 0; 47 } 48 49 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { 50 // Both unicode and ignore_case flags are set. We need to use ICU to find 51 // the closure over case equivalents. 52 return IsUnicode(flags) && IgnoreCase(flags); 53 } 54 55 class RegExpImpl { 56 public: 57 // Whether V8 is compiled with native regexp support or not. 58 static bool UsesNativeRegExp() { 59 #ifdef V8_INTERPRETED_REGEXP 60 return false; 61 #else 62 return true; 63 #endif 64 } 65 66 // Returns a string representation of a regular expression. 67 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4. 68 // This function calls the garbage collector if necessary. 69 static Handle<String> ToString(Handle<Object> value); 70 71 // Parses the RegExp pattern and prepares the JSRegExp object with 72 // generic data and choice of implementation - as well as what 73 // the implementation wants to store in the data field. 74 // Returns false if compilation fails. 75 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile( 76 Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, 77 JSRegExp::Flags flags); 78 79 // See ECMA-262 section 15.10.6.2. 80 // This function calls the garbage collector if necessary. 81 V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec( 82 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, 83 int index, Handle<RegExpMatchInfo> last_match_info); 84 85 // Prepares a JSRegExp object with Irregexp-specific data. 86 static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, 87 Handle<String> pattern, JSRegExp::Flags flags, 88 int capture_register_count); 89 90 static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re, 91 Handle<String> pattern, JSRegExp::Flags flags, 92 Handle<String> match_pattern); 93 94 static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 95 Handle<String> subject, int index, int32_t* output, 96 int output_size); 97 98 static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp, 99 Handle<String> subject, int index, 100 Handle<RegExpMatchInfo> last_match_info); 101 102 enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 }; 103 104 // Prepare a RegExp for being executed one or more times (using 105 // IrregexpExecOnce) on the subject. 106 // This ensures that the regexp is compiled for the subject, and that 107 // the subject is flat. 108 // Returns the number of integer spaces required by IrregexpExecOnce 109 // as its "registers" argument. If the regexp cannot be compiled, 110 // an exception is set as pending, and this function returns negative. 111 static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, 112 Handle<String> subject); 113 114 // Execute a regular expression on the subject, starting from index. 115 // If matching succeeds, return the number of matches. This can be larger 116 // than one in the case of global regular expressions. 117 // The captures and subcaptures are stored into the registers vector. 118 // If matching fails, returns RE_FAILURE. 119 // If execution fails, sets a pending exception and returns RE_EXCEPTION. 120 static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 121 Handle<String> subject, int index, int32_t* output, 122 int output_size); 123 124 // Execute an Irregexp bytecode pattern. 125 // On a successful match, the result is a JSArray containing 126 // captured positions. On a failure, the result is the null value. 127 // Returns an empty handle in case of an exception. 128 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec( 129 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, 130 int index, Handle<RegExpMatchInfo> last_match_info); 131 132 // Set last match info. If match is nullptr, then setting captures is 133 // omitted. 134 static Handle<RegExpMatchInfo> SetLastMatchInfo( 135 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, 136 Handle<String> subject, int capture_count, int32_t* match); 137 138 class GlobalCache { 139 public: 140 GlobalCache(Handle<JSRegExp> regexp, 141 Handle<String> subject, 142 Isolate* isolate); 143 144 V8_INLINE ~GlobalCache(); 145 146 // Fetch the next entry in the cache for global regexp match results. 147 // This does not set the last match info. Upon failure, nullptr is 148 // returned. The cause can be checked with Result(). The previous result is 149 // still in available in memory when a failure happens. 150 V8_INLINE int32_t* FetchNext(); 151 152 V8_INLINE int32_t* LastSuccessfulMatch(); 153 154 V8_INLINE bool HasException() { return num_matches_ < 0; } 155 156 private: 157 int AdvanceZeroLength(int last_index); 158 159 int num_matches_; 160 int max_matches_; 161 int current_match_index_; 162 int registers_per_match_; 163 // Pointer to the last set of captures. 164 int32_t* register_array_; 165 int register_array_size_; 166 Handle<JSRegExp> regexp_; 167 Handle<String> subject_; 168 Isolate* isolate_; 169 }; 170 171 // For acting on the JSRegExp data FixedArray. 172 static int IrregexpMaxRegisterCount(FixedArray* re); 173 static void SetIrregexpMaxRegisterCount(FixedArray* re, int value); 174 static void SetIrregexpCaptureNameMap(FixedArray* re, 175 Handle<FixedArray> value); 176 static int IrregexpNumberOfCaptures(FixedArray* re); 177 static int IrregexpNumberOfRegisters(FixedArray* re); 178 static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte); 179 static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte); 180 181 // Limit the space regexps take up on the heap. In order to limit this we 182 // would like to keep track of the amount of regexp code on the heap. This 183 // is not tracked, however. As a conservative approximation we track the 184 // total regexp code compiled including code that has subsequently been freed 185 // and the total executable memory at any point. 186 static const size_t kRegExpExecutableMemoryLimit = 16 * MB; 187 static const size_t kRegExpCompiledLimit = 1 * MB; 188 static const int kRegExpTooLargeToOptimize = 20 * KB; 189 190 private: 191 static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, 192 Handle<String> sample_subject, bool is_one_byte); 193 static inline bool EnsureCompiledIrregexp(Isolate* isolate, 194 Handle<JSRegExp> re, 195 Handle<String> sample_subject, 196 bool is_one_byte); 197 }; 198 199 200 // Represents the location of one element relative to the intersection of 201 // two sets. Corresponds to the four areas of a Venn diagram. 202 enum ElementInSetsRelation { 203 kInsideNone = 0, 204 kInsideFirst = 1, 205 kInsideSecond = 2, 206 kInsideBoth = 3 207 }; 208 209 210 // A set of unsigned integers that behaves especially well on small 211 // integers (< 32). May do zone-allocation. 212 class OutSet: public ZoneObject { 213 public: 214 OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {} 215 OutSet* Extend(unsigned value, Zone* zone); 216 bool Get(unsigned value) const; 217 static const unsigned kFirstLimit = 32; 218 219 private: 220 // Destructively set a value in this set. In most cases you want 221 // to use Extend instead to ensure that only one instance exists 222 // that contains the same values. 223 void Set(unsigned value, Zone* zone); 224 225 // The successors are a list of sets that contain the same values 226 // as this set and the one more value that is not present in this 227 // set. 228 ZoneList<OutSet*>* successors(Zone* zone) { return successors_; } 229 230 OutSet(uint32_t first, ZoneList<unsigned>* remaining) 231 : first_(first), remaining_(remaining), successors_(nullptr) {} 232 uint32_t first_; 233 ZoneList<unsigned>* remaining_; 234 ZoneList<OutSet*>* successors_; 235 friend class Trace; 236 }; 237 238 239 // A mapping from integers, specified as ranges, to a set of integers. 240 // Used for mapping character ranges to choices. 241 class DispatchTable : public ZoneObject { 242 public: 243 explicit DispatchTable(Zone* zone) : tree_(zone) { } 244 245 class Entry { 246 public: 247 Entry() : from_(0), to_(0), out_set_(nullptr) {} 248 Entry(uc32 from, uc32 to, OutSet* out_set) 249 : from_(from), to_(to), out_set_(out_set) { 250 DCHECK(from <= to); 251 } 252 uc32 from() { return from_; } 253 uc32 to() { return to_; } 254 void set_to(uc32 value) { to_ = value; } 255 void AddValue(int value, Zone* zone) { 256 out_set_ = out_set_->Extend(value, zone); 257 } 258 OutSet* out_set() { return out_set_; } 259 private: 260 uc32 from_; 261 uc32 to_; 262 OutSet* out_set_; 263 }; 264 265 class Config { 266 public: 267 typedef uc32 Key; 268 typedef Entry Value; 269 static const uc32 kNoKey; 270 static const Entry NoValue() { return Value(); } 271 static inline int Compare(uc32 a, uc32 b) { 272 if (a == b) 273 return 0; 274 else if (a < b) 275 return -1; 276 else 277 return 1; 278 } 279 }; 280 281 void AddRange(CharacterRange range, int value, Zone* zone); 282 OutSet* Get(uc32 value); 283 void Dump(); 284 285 template <typename Callback> 286 void ForEach(Callback* callback) { 287 return tree()->ForEach(callback); 288 } 289 290 private: 291 // There can't be a static empty set since it allocates its 292 // successors in a zone and caches them. 293 OutSet* empty() { return &empty_; } 294 OutSet empty_; 295 ZoneSplayTree<Config>* tree() { return &tree_; } 296 ZoneSplayTree<Config> tree_; 297 }; 298 299 300 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. 301 class UnicodeRangeSplitter { 302 public: 303 UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base); 304 void Call(uc32 from, DispatchTable::Entry entry); 305 306 ZoneList<CharacterRange>* bmp() { return bmp_; } 307 ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; } 308 ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; } 309 ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; } 310 311 private: 312 static const int kBase = 0; 313 // Separate ranges into 314 static const int kBmpCodePoints = 1; 315 static const int kLeadSurrogates = 2; 316 static const int kTrailSurrogates = 3; 317 static const int kNonBmpCodePoints = 4; 318 319 Zone* zone_; 320 DispatchTable table_; 321 ZoneList<CharacterRange>* bmp_; 322 ZoneList<CharacterRange>* lead_surrogates_; 323 ZoneList<CharacterRange>* trail_surrogates_; 324 ZoneList<CharacterRange>* non_bmp_; 325 }; 326 327 328 #define FOR_EACH_NODE_TYPE(VISIT) \ 329 VISIT(End) \ 330 VISIT(Action) \ 331 VISIT(Choice) \ 332 VISIT(BackReference) \ 333 VISIT(Assertion) \ 334 VISIT(Text) 335 336 337 class Trace; 338 struct PreloadState; 339 class GreedyLoopState; 340 class AlternativeGenerationList; 341 342 struct NodeInfo { 343 NodeInfo() 344 : being_analyzed(false), 345 been_analyzed(false), 346 follows_word_interest(false), 347 follows_newline_interest(false), 348 follows_start_interest(false), 349 at_end(false), 350 visited(false), 351 replacement_calculated(false) { } 352 353 // Returns true if the interests and assumptions of this node 354 // matches the given one. 355 bool Matches(NodeInfo* that) { 356 return (at_end == that->at_end) && 357 (follows_word_interest == that->follows_word_interest) && 358 (follows_newline_interest == that->follows_newline_interest) && 359 (follows_start_interest == that->follows_start_interest); 360 } 361 362 // Updates the interests of this node given the interests of the 363 // node preceding it. 364 void AddFromPreceding(NodeInfo* that) { 365 at_end |= that->at_end; 366 follows_word_interest |= that->follows_word_interest; 367 follows_newline_interest |= that->follows_newline_interest; 368 follows_start_interest |= that->follows_start_interest; 369 } 370 371 bool HasLookbehind() { 372 return follows_word_interest || 373 follows_newline_interest || 374 follows_start_interest; 375 } 376 377 // Sets the interests of this node to include the interests of the 378 // following node. 379 void AddFromFollowing(NodeInfo* that) { 380 follows_word_interest |= that->follows_word_interest; 381 follows_newline_interest |= that->follows_newline_interest; 382 follows_start_interest |= that->follows_start_interest; 383 } 384 385 void ResetCompilationState() { 386 being_analyzed = false; 387 been_analyzed = false; 388 } 389 390 bool being_analyzed: 1; 391 bool been_analyzed: 1; 392 393 // These bits are set of this node has to know what the preceding 394 // character was. 395 bool follows_word_interest: 1; 396 bool follows_newline_interest: 1; 397 bool follows_start_interest: 1; 398 399 bool at_end: 1; 400 bool visited: 1; 401 bool replacement_calculated: 1; 402 }; 403 404 405 // Details of a quick mask-compare check that can look ahead in the 406 // input stream. 407 class QuickCheckDetails { 408 public: 409 QuickCheckDetails() 410 : characters_(0), 411 mask_(0), 412 value_(0), 413 cannot_match_(false) { } 414 explicit QuickCheckDetails(int characters) 415 : characters_(characters), 416 mask_(0), 417 value_(0), 418 cannot_match_(false) { } 419 bool Rationalize(bool one_byte); 420 // Merge in the information from another branch of an alternation. 421 void Merge(QuickCheckDetails* other, int from_index); 422 // Advance the current position by some amount. 423 void Advance(int by, bool one_byte); 424 void Clear(); 425 bool cannot_match() { return cannot_match_; } 426 void set_cannot_match() { cannot_match_ = true; } 427 struct Position { 428 Position() : mask(0), value(0), determines_perfectly(false) { } 429 uc16 mask; 430 uc16 value; 431 bool determines_perfectly; 432 }; 433 int characters() { return characters_; } 434 void set_characters(int characters) { characters_ = characters; } 435 Position* positions(int index) { 436 DCHECK_LE(0, index); 437 DCHECK_GT(characters_, index); 438 return positions_ + index; 439 } 440 uint32_t mask() { return mask_; } 441 uint32_t value() { return value_; } 442 443 private: 444 // How many characters do we have quick check information from. This is 445 // the same for all branches of a choice node. 446 int characters_; 447 Position positions_[4]; 448 // These values are the condensate of the above array after Rationalize(). 449 uint32_t mask_; 450 uint32_t value_; 451 // If set to true, there is no way this quick check can match at all. 452 // E.g., if it requires to be at the start of the input, and isn't. 453 bool cannot_match_; 454 }; 455 456 457 extern int kUninitializedRegExpNodePlaceHolder; 458 459 460 class RegExpNode: public ZoneObject { 461 public: 462 explicit RegExpNode(Zone* zone) 463 : replacement_(nullptr), 464 on_work_list_(false), 465 trace_count_(0), 466 zone_(zone) { 467 bm_info_[0] = bm_info_[1] = nullptr; 468 } 469 virtual ~RegExpNode(); 470 virtual void Accept(NodeVisitor* visitor) = 0; 471 // Generates a goto to this node or actually generates the code at this point. 472 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 473 // How many characters must this node consume at a minimum in order to 474 // succeed. If we have found at least 'still_to_find' characters that 475 // must be consumed there is no need to ask any following nodes whether 476 // they are sure to eat any more characters. The not_at_start argument is 477 // used to indicate that we know we are not at the start of the input. In 478 // this case anchored branches will always fail and can be ignored when 479 // determining how many characters are consumed on success. 480 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0; 481 // Emits some quick code that checks whether the preloaded characters match. 482 // Falls through on certain failure, jumps to the label on possible success. 483 // If the node cannot make a quick check it does nothing and returns false. 484 bool EmitQuickCheck(RegExpCompiler* compiler, 485 Trace* bounds_check_trace, 486 Trace* trace, 487 bool preload_has_checked_bounds, 488 Label* on_possible_success, 489 QuickCheckDetails* details_return, 490 bool fall_through_on_failure); 491 // For a given number of characters this returns a mask and a value. The 492 // next n characters are anded with the mask and compared with the value. 493 // A comparison failure indicates the node cannot match the next n characters. 494 // A comparison success indicates the node may match. 495 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 496 RegExpCompiler* compiler, 497 int characters_filled_in, 498 bool not_at_start) = 0; 499 static const int kNodeIsTooComplexForGreedyLoops = kMinInt; 500 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 501 // Only returns the successor for a text node of length 1 that matches any 502 // character and that has no guards on it. 503 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 504 RegExpCompiler* compiler) { 505 return nullptr; 506 } 507 508 // Collects information on the possible code units (mod 128) that can match if 509 // we look forward. This is used for a Boyer-Moore-like string searching 510 // implementation. TODO(erikcorry): This should share more code with 511 // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit 512 // the number of nodes we are willing to look at in order to create this data. 513 static const int kRecursionBudget = 200; 514 bool KeepRecursing(RegExpCompiler* compiler); 515 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 516 BoyerMooreLookahead* bm, bool not_at_start) { 517 UNREACHABLE(); 518 } 519 520 // If we know that the input is one-byte then there are some nodes that can 521 // never match. This method returns a node that can be substituted for 522 // itself, or nullptr if the node can never match. 523 virtual RegExpNode* FilterOneByte(int depth) { return this; } 524 // Helper for FilterOneByte. 525 RegExpNode* replacement() { 526 DCHECK(info()->replacement_calculated); 527 return replacement_; 528 } 529 RegExpNode* set_replacement(RegExpNode* replacement) { 530 info()->replacement_calculated = true; 531 replacement_ = replacement; 532 return replacement; // For convenience. 533 } 534 535 // We want to avoid recalculating the lookahead info, so we store it on the 536 // node. Only info that is for this node is stored. We can tell that the 537 // info is for this node when offset == 0, so the information is calculated 538 // relative to this node. 539 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { 540 if (offset == 0) set_bm_info(not_at_start, bm); 541 } 542 543 Label* label() { return &label_; } 544 // If non-generic code is generated for a node (i.e. the node is not at the 545 // start of the trace) then it cannot be reused. This variable sets a limit 546 // on how often we allow that to happen before we insist on starting a new 547 // trace and generating generic code for a node that can be reused by flushing 548 // the deferred actions in the current trace and generating a goto. 549 static const int kMaxCopiesCodeGenerated = 10; 550 551 bool on_work_list() { return on_work_list_; } 552 void set_on_work_list(bool value) { on_work_list_ = value; } 553 554 NodeInfo* info() { return &info_; } 555 556 BoyerMooreLookahead* bm_info(bool not_at_start) { 557 return bm_info_[not_at_start ? 1 : 0]; 558 } 559 560 Zone* zone() const { return zone_; } 561 562 protected: 563 enum LimitResult { DONE, CONTINUE }; 564 RegExpNode* replacement_; 565 566 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 567 568 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { 569 bm_info_[not_at_start ? 1 : 0] = bm; 570 } 571 572 private: 573 static const int kFirstCharBudget = 10; 574 Label label_; 575 bool on_work_list_; 576 NodeInfo info_; 577 // This variable keeps track of how many times code has been generated for 578 // this node (in different traces). We don't keep track of where the 579 // generated code is located unless the code is generated at the start of 580 // a trace, in which case it is generic and can be reused by flushing the 581 // deferred operations in the current trace and generating a goto. 582 int trace_count_; 583 BoyerMooreLookahead* bm_info_[2]; 584 585 Zone* zone_; 586 }; 587 588 589 class SeqRegExpNode: public RegExpNode { 590 public: 591 explicit SeqRegExpNode(RegExpNode* on_success) 592 : RegExpNode(on_success->zone()), on_success_(on_success) { } 593 RegExpNode* on_success() { return on_success_; } 594 void set_on_success(RegExpNode* node) { on_success_ = node; } 595 virtual RegExpNode* FilterOneByte(int depth); 596 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 597 BoyerMooreLookahead* bm, bool not_at_start) { 598 on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 599 if (offset == 0) set_bm_info(not_at_start, bm); 600 } 601 602 protected: 603 RegExpNode* FilterSuccessor(int depth); 604 605 private: 606 RegExpNode* on_success_; 607 }; 608 609 610 class ActionNode: public SeqRegExpNode { 611 public: 612 enum ActionType { 613 SET_REGISTER, 614 INCREMENT_REGISTER, 615 STORE_POSITION, 616 BEGIN_SUBMATCH, 617 POSITIVE_SUBMATCH_SUCCESS, 618 EMPTY_MATCH_CHECK, 619 CLEAR_CAPTURES 620 }; 621 static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success); 622 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); 623 static ActionNode* StorePosition(int reg, 624 bool is_capture, 625 RegExpNode* on_success); 626 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); 627 static ActionNode* BeginSubmatch(int stack_pointer_reg, 628 int position_reg, 629 RegExpNode* on_success); 630 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, 631 int restore_reg, 632 int clear_capture_count, 633 int clear_capture_from, 634 RegExpNode* on_success); 635 static ActionNode* EmptyMatchCheck(int start_register, 636 int repetition_register, 637 int repetition_limit, 638 RegExpNode* on_success); 639 virtual void Accept(NodeVisitor* visitor); 640 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 641 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); 642 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 643 RegExpCompiler* compiler, 644 int filled_in, 645 bool not_at_start) { 646 return on_success()->GetQuickCheckDetails( 647 details, compiler, filled_in, not_at_start); 648 } 649 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 650 BoyerMooreLookahead* bm, bool not_at_start); 651 ActionType action_type() { return action_type_; } 652 // TODO(erikcorry): We should allow some action nodes in greedy loops. 653 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 654 655 private: 656 union { 657 struct { 658 int reg; 659 int value; 660 } u_store_register; 661 struct { 662 int reg; 663 } u_increment_register; 664 struct { 665 int reg; 666 bool is_capture; 667 } u_position_register; 668 struct { 669 int stack_pointer_register; 670 int current_position_register; 671 int clear_register_count; 672 int clear_register_from; 673 } u_submatch; 674 struct { 675 int start_register; 676 int repetition_register; 677 int repetition_limit; 678 } u_empty_match_check; 679 struct { 680 int range_from; 681 int range_to; 682 } u_clear_captures; 683 } data_; 684 ActionNode(ActionType action_type, RegExpNode* on_success) 685 : SeqRegExpNode(on_success), 686 action_type_(action_type) { } 687 ActionType action_type_; 688 friend class DotPrinter; 689 }; 690 691 692 class TextNode: public SeqRegExpNode { 693 public: 694 TextNode(ZoneList<TextElement>* elms, bool read_backward, 695 RegExpNode* on_success) 696 : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {} 697 TextNode(RegExpCharacterClass* that, bool read_backward, 698 RegExpNode* on_success) 699 : SeqRegExpNode(on_success), 700 elms_(new (zone()) ZoneList<TextElement>(1, zone())), 701 read_backward_(read_backward) { 702 elms_->Add(TextElement::CharClass(that), zone()); 703 } 704 // Create TextNode for a single character class for the given ranges. 705 static TextNode* CreateForCharacterRanges(Zone* zone, 706 ZoneList<CharacterRange>* ranges, 707 bool read_backward, 708 RegExpNode* on_success, 709 JSRegExp::Flags flags); 710 // Create TextNode for a surrogate pair with a range given for the 711 // lead and the trail surrogate each. 712 static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead, 713 CharacterRange trail, 714 bool read_backward, 715 RegExpNode* on_success, 716 JSRegExp::Flags flags); 717 virtual void Accept(NodeVisitor* visitor); 718 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 719 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); 720 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 721 RegExpCompiler* compiler, 722 int characters_filled_in, 723 bool not_at_start); 724 ZoneList<TextElement>* elements() { return elms_; } 725 bool read_backward() { return read_backward_; } 726 void MakeCaseIndependent(Isolate* isolate, bool is_one_byte); 727 virtual int GreedyLoopTextLength(); 728 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 729 RegExpCompiler* compiler); 730 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 731 BoyerMooreLookahead* bm, bool not_at_start); 732 void CalculateOffsets(); 733 virtual RegExpNode* FilterOneByte(int depth); 734 735 private: 736 enum TextEmitPassType { 737 NON_LATIN1_MATCH, // Check for characters that can't match. 738 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 739 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 740 CASE_CHARACTER_MATCH, // Case-independent single character check. 741 CHARACTER_CLASS_MATCH // Character class. 742 }; 743 static bool SkipPass(TextEmitPassType pass, bool ignore_case); 744 static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; 745 static const int kLastPass = CHARACTER_CLASS_MATCH; 746 void TextEmitPass(RegExpCompiler* compiler, 747 TextEmitPassType pass, 748 bool preloaded, 749 Trace* trace, 750 bool first_element_checked, 751 int* checked_up_to); 752 int Length(); 753 ZoneList<TextElement>* elms_; 754 bool read_backward_; 755 }; 756 757 758 class AssertionNode: public SeqRegExpNode { 759 public: 760 enum AssertionType { 761 AT_END, 762 AT_START, 763 AT_BOUNDARY, 764 AT_NON_BOUNDARY, 765 AFTER_NEWLINE 766 }; 767 static AssertionNode* AtEnd(RegExpNode* on_success) { 768 return new(on_success->zone()) AssertionNode(AT_END, on_success); 769 } 770 static AssertionNode* AtStart(RegExpNode* on_success) { 771 return new(on_success->zone()) AssertionNode(AT_START, on_success); 772 } 773 static AssertionNode* AtBoundary(RegExpNode* on_success) { 774 return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success); 775 } 776 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 777 return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success); 778 } 779 static AssertionNode* AfterNewline(RegExpNode* on_success) { 780 return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success); 781 } 782 virtual void Accept(NodeVisitor* visitor); 783 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 784 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); 785 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 786 RegExpCompiler* compiler, 787 int filled_in, 788 bool not_at_start); 789 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 790 BoyerMooreLookahead* bm, bool not_at_start); 791 AssertionType assertion_type() { return assertion_type_; } 792 793 private: 794 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); 795 enum IfPrevious { kIsNonWord, kIsWord }; 796 void BacktrackIfPrevious(RegExpCompiler* compiler, 797 Trace* trace, 798 IfPrevious backtrack_if_previous); 799 AssertionNode(AssertionType t, RegExpNode* on_success) 800 : SeqRegExpNode(on_success), assertion_type_(t) { } 801 AssertionType assertion_type_; 802 }; 803 804 805 class BackReferenceNode: public SeqRegExpNode { 806 public: 807 BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags, 808 bool read_backward, RegExpNode* on_success) 809 : SeqRegExpNode(on_success), 810 start_reg_(start_reg), 811 end_reg_(end_reg), 812 flags_(flags), 813 read_backward_(read_backward) {} 814 virtual void Accept(NodeVisitor* visitor); 815 int start_register() { return start_reg_; } 816 int end_register() { return end_reg_; } 817 bool read_backward() { return read_backward_; } 818 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 819 virtual int EatsAtLeast(int still_to_find, 820 int recursion_depth, 821 bool not_at_start); 822 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 823 RegExpCompiler* compiler, 824 int characters_filled_in, 825 bool not_at_start) { 826 return; 827 } 828 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 829 BoyerMooreLookahead* bm, bool not_at_start); 830 831 private: 832 int start_reg_; 833 int end_reg_; 834 JSRegExp::Flags flags_; 835 bool read_backward_; 836 }; 837 838 839 class EndNode: public RegExpNode { 840 public: 841 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; 842 EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {} 843 virtual void Accept(NodeVisitor* visitor); 844 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 845 virtual int EatsAtLeast(int still_to_find, 846 int recursion_depth, 847 bool not_at_start) { return 0; } 848 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 849 RegExpCompiler* compiler, 850 int characters_filled_in, 851 bool not_at_start) { 852 // Returning 0 from EatsAtLeast should ensure we never get here. 853 UNREACHABLE(); 854 } 855 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 856 BoyerMooreLookahead* bm, bool not_at_start) { 857 // Returning 0 from EatsAtLeast should ensure we never get here. 858 UNREACHABLE(); 859 } 860 861 private: 862 Action action_; 863 }; 864 865 866 class NegativeSubmatchSuccess: public EndNode { 867 public: 868 NegativeSubmatchSuccess(int stack_pointer_reg, 869 int position_reg, 870 int clear_capture_count, 871 int clear_capture_start, 872 Zone* zone) 873 : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), 874 stack_pointer_register_(stack_pointer_reg), 875 current_position_register_(position_reg), 876 clear_capture_count_(clear_capture_count), 877 clear_capture_start_(clear_capture_start) { } 878 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 879 880 private: 881 int stack_pointer_register_; 882 int current_position_register_; 883 int clear_capture_count_; 884 int clear_capture_start_; 885 }; 886 887 888 class Guard: public ZoneObject { 889 public: 890 enum Relation { LT, GEQ }; 891 Guard(int reg, Relation op, int value) 892 : reg_(reg), 893 op_(op), 894 value_(value) { } 895 int reg() { return reg_; } 896 Relation op() { return op_; } 897 int value() { return value_; } 898 899 private: 900 int reg_; 901 Relation op_; 902 int value_; 903 }; 904 905 906 class GuardedAlternative { 907 public: 908 explicit GuardedAlternative(RegExpNode* node) 909 : node_(node), guards_(nullptr) {} 910 void AddGuard(Guard* guard, Zone* zone); 911 RegExpNode* node() { return node_; } 912 void set_node(RegExpNode* node) { node_ = node; } 913 ZoneList<Guard*>* guards() { return guards_; } 914 915 private: 916 RegExpNode* node_; 917 ZoneList<Guard*>* guards_; 918 }; 919 920 921 class AlternativeGeneration; 922 923 924 class ChoiceNode: public RegExpNode { 925 public: 926 explicit ChoiceNode(int expected_size, Zone* zone) 927 : RegExpNode(zone), 928 alternatives_(new (zone) 929 ZoneList<GuardedAlternative>(expected_size, zone)), 930 table_(nullptr), 931 not_at_start_(false), 932 being_calculated_(false) {} 933 virtual void Accept(NodeVisitor* visitor); 934 void AddAlternative(GuardedAlternative node) { 935 alternatives()->Add(node, zone()); 936 } 937 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } 938 DispatchTable* GetTable(bool ignore_case); 939 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 940 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); 941 int EatsAtLeastHelper(int still_to_find, 942 int budget, 943 RegExpNode* ignore_this_node, 944 bool not_at_start); 945 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 946 RegExpCompiler* compiler, 947 int characters_filled_in, 948 bool not_at_start); 949 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 950 BoyerMooreLookahead* bm, bool not_at_start); 951 952 bool being_calculated() { return being_calculated_; } 953 bool not_at_start() { return not_at_start_; } 954 void set_not_at_start() { not_at_start_ = true; } 955 void set_being_calculated(bool b) { being_calculated_ = b; } 956 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { 957 return true; 958 } 959 virtual RegExpNode* FilterOneByte(int depth); 960 virtual bool read_backward() { return false; } 961 962 protected: 963 int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); 964 ZoneList<GuardedAlternative>* alternatives_; 965 966 private: 967 friend class DispatchTableConstructor; 968 friend class Analysis; 969 void GenerateGuard(RegExpMacroAssembler* macro_assembler, 970 Guard* guard, 971 Trace* trace); 972 int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); 973 void EmitOutOfLineContinuation(RegExpCompiler* compiler, 974 Trace* trace, 975 GuardedAlternative alternative, 976 AlternativeGeneration* alt_gen, 977 int preload_characters, 978 bool next_expects_preload); 979 void SetUpPreLoad(RegExpCompiler* compiler, 980 Trace* current_trace, 981 PreloadState* preloads); 982 void AssertGuardsMentionRegisters(Trace* trace); 983 int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace); 984 Trace* EmitGreedyLoop(RegExpCompiler* compiler, 985 Trace* trace, 986 AlternativeGenerationList* alt_gens, 987 PreloadState* preloads, 988 GreedyLoopState* greedy_loop_state, 989 int text_length); 990 void EmitChoices(RegExpCompiler* compiler, 991 AlternativeGenerationList* alt_gens, 992 int first_choice, 993 Trace* trace, 994 PreloadState* preloads); 995 DispatchTable* table_; 996 // If true, this node is never checked at the start of the input. 997 // Allows a new trace to start with at_start() set to false. 998 bool not_at_start_; 999 bool being_calculated_; 1000 }; 1001 1002 1003 class NegativeLookaroundChoiceNode : public ChoiceNode { 1004 public: 1005 explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail, 1006 GuardedAlternative then_do_this, 1007 Zone* zone) 1008 : ChoiceNode(2, zone) { 1009 AddAlternative(this_must_fail); 1010 AddAlternative(then_do_this); 1011 } 1012 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); 1013 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1014 RegExpCompiler* compiler, 1015 int characters_filled_in, 1016 bool not_at_start); 1017 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 1018 BoyerMooreLookahead* bm, bool not_at_start) { 1019 alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm, 1020 not_at_start); 1021 if (offset == 0) set_bm_info(not_at_start, bm); 1022 } 1023 // For a negative lookahead we don't emit the quick check for the 1024 // alternative that is expected to fail. This is because quick check code 1025 // starts by loading enough characters for the alternative that takes fewest 1026 // characters, but on a negative lookahead the negative branch did not take 1027 // part in that calculation (EatsAtLeast) so the assumptions don't hold. 1028 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { 1029 return !is_first; 1030 } 1031 virtual RegExpNode* FilterOneByte(int depth); 1032 }; 1033 1034 1035 class LoopChoiceNode: public ChoiceNode { 1036 public: 1037 LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone) 1038 : ChoiceNode(2, zone), 1039 loop_node_(nullptr), 1040 continue_node_(nullptr), 1041 body_can_be_zero_length_(body_can_be_zero_length), 1042 read_backward_(read_backward) {} 1043 void AddLoopAlternative(GuardedAlternative alt); 1044 void AddContinueAlternative(GuardedAlternative alt); 1045 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 1046 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); 1047 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1048 RegExpCompiler* compiler, 1049 int characters_filled_in, 1050 bool not_at_start); 1051 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 1052 BoyerMooreLookahead* bm, bool not_at_start); 1053 RegExpNode* loop_node() { return loop_node_; } 1054 RegExpNode* continue_node() { return continue_node_; } 1055 bool body_can_be_zero_length() { return body_can_be_zero_length_; } 1056 virtual bool read_backward() { return read_backward_; } 1057 virtual void Accept(NodeVisitor* visitor); 1058 virtual RegExpNode* FilterOneByte(int depth); 1059 1060 private: 1061 // AddAlternative is made private for loop nodes because alternatives 1062 // should not be added freely, we need to keep track of which node 1063 // goes back to the node itself. 1064 void AddAlternative(GuardedAlternative node) { 1065 ChoiceNode::AddAlternative(node); 1066 } 1067 1068 RegExpNode* loop_node_; 1069 RegExpNode* continue_node_; 1070 bool body_can_be_zero_length_; 1071 bool read_backward_; 1072 }; 1073 1074 1075 // Improve the speed that we scan for an initial point where a non-anchored 1076 // regexp can match by using a Boyer-Moore-like table. This is done by 1077 // identifying non-greedy non-capturing loops in the nodes that eat any 1078 // character one at a time. For example in the middle of the regexp 1079 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly 1080 // inserted at the start of any non-anchored regexp. 1081 // 1082 // When we have found such a loop we look ahead in the nodes to find the set of 1083 // characters that can come at given distances. For example for the regexp 1084 // /.?foo/ we know that there are at least 3 characters ahead of us, and the 1085 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in 1086 // the lookahead info where the set of characters is reasonably constrained. In 1087 // our example this is from index 1 to 2 (0 is not constrained). We can now 1088 // look 3 characters ahead and if we don't find one of [f, o] (the union of 1089 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2). 1090 // 1091 // For Unicode input strings we do the same, but modulo 128. 1092 // 1093 // We also look at the first string fed to the regexp and use that to get a hint 1094 // of the character frequencies in the inputs. This affects the assessment of 1095 // whether the set of characters is 'reasonably constrained'. 1096 // 1097 // We also have another lookahead mechanism (called quick check in the code), 1098 // which uses a wide load of multiple characters followed by a mask and compare 1099 // to determine whether a match is possible at this point. 1100 enum ContainedInLattice { 1101 kNotYet = 0, 1102 kLatticeIn = 1, 1103 kLatticeOut = 2, 1104 kLatticeUnknown = 3 // Can also mean both in and out. 1105 }; 1106 1107 1108 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { 1109 return static_cast<ContainedInLattice>(a | b); 1110 } 1111 1112 1113 ContainedInLattice AddRange(ContainedInLattice a, 1114 const int* ranges, 1115 int ranges_size, 1116 Interval new_range); 1117 1118 1119 class BoyerMoorePositionInfo : public ZoneObject { 1120 public: 1121 explicit BoyerMoorePositionInfo(Zone* zone) 1122 : map_(new(zone) ZoneList<bool>(kMapSize, zone)), 1123 map_count_(0), 1124 w_(kNotYet), 1125 s_(kNotYet), 1126 d_(kNotYet), 1127 surrogate_(kNotYet) { 1128 for (int i = 0; i < kMapSize; i++) { 1129 map_->Add(false, zone); 1130 } 1131 } 1132 1133 bool& at(int i) { return map_->at(i); } 1134 1135 static const int kMapSize = 128; 1136 static const int kMask = kMapSize - 1; 1137 1138 int map_count() const { return map_count_; } 1139 1140 void Set(int character); 1141 void SetInterval(const Interval& interval); 1142 void SetAll(); 1143 bool is_non_word() { return w_ == kLatticeOut; } 1144 bool is_word() { return w_ == kLatticeIn; } 1145 1146 private: 1147 ZoneList<bool>* map_; 1148 int map_count_; // Number of set bits in the map. 1149 ContainedInLattice w_; // The \w character class. 1150 ContainedInLattice s_; // The \s character class. 1151 ContainedInLattice d_; // The \d character class. 1152 ContainedInLattice surrogate_; // Surrogate UTF-16 code units. 1153 }; 1154 1155 1156 class BoyerMooreLookahead : public ZoneObject { 1157 public: 1158 BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); 1159 1160 int length() { return length_; } 1161 int max_char() { return max_char_; } 1162 RegExpCompiler* compiler() { return compiler_; } 1163 1164 int Count(int map_number) { 1165 return bitmaps_->at(map_number)->map_count(); 1166 } 1167 1168 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } 1169 1170 void Set(int map_number, int character) { 1171 if (character > max_char_) return; 1172 BoyerMoorePositionInfo* info = bitmaps_->at(map_number); 1173 info->Set(character); 1174 } 1175 1176 void SetInterval(int map_number, const Interval& interval) { 1177 if (interval.from() > max_char_) return; 1178 BoyerMoorePositionInfo* info = bitmaps_->at(map_number); 1179 if (interval.to() > max_char_) { 1180 info->SetInterval(Interval(interval.from(), max_char_)); 1181 } else { 1182 info->SetInterval(interval); 1183 } 1184 } 1185 1186 void SetAll(int map_number) { 1187 bitmaps_->at(map_number)->SetAll(); 1188 } 1189 1190 void SetRest(int from_map) { 1191 for (int i = from_map; i < length_; i++) SetAll(i); 1192 } 1193 void EmitSkipInstructions(RegExpMacroAssembler* masm); 1194 1195 private: 1196 // This is the value obtained by EatsAtLeast. If we do not have at least this 1197 // many characters left in the sample string then the match is bound to fail. 1198 // Therefore it is OK to read a character this far ahead of the current match 1199 // point. 1200 int length_; 1201 RegExpCompiler* compiler_; 1202 // 0xff for Latin1, 0xffff for UTF-16. 1203 int max_char_; 1204 ZoneList<BoyerMoorePositionInfo*>* bitmaps_; 1205 1206 int GetSkipTable(int min_lookahead, 1207 int max_lookahead, 1208 Handle<ByteArray> boolean_skip_table); 1209 bool FindWorthwhileInterval(int* from, int* to); 1210 int FindBestInterval( 1211 int max_number_of_chars, int old_biggest_points, int* from, int* to); 1212 }; 1213 1214 1215 // There are many ways to generate code for a node. This class encapsulates 1216 // the current way we should be generating. In other words it encapsulates 1217 // the current state of the code generator. The effect of this is that we 1218 // generate code for paths that the matcher can take through the regular 1219 // expression. A given node in the regexp can be code-generated several times 1220 // as it can be part of several traces. For example for the regexp: 1221 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part 1222 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code 1223 // to match foo is generated only once (the traces have a common prefix). The 1224 // code to store the capture is deferred and generated (twice) after the places 1225 // where baz has been matched. 1226 class Trace { 1227 public: 1228 // A value for a property that is either known to be true, know to be false, 1229 // or not known. 1230 enum TriBool { 1231 UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 1232 }; 1233 1234 class DeferredAction { 1235 public: 1236 DeferredAction(ActionNode::ActionType action_type, int reg) 1237 : action_type_(action_type), reg_(reg), next_(nullptr) {} 1238 DeferredAction* next() { return next_; } 1239 bool Mentions(int reg); 1240 int reg() { return reg_; } 1241 ActionNode::ActionType action_type() { return action_type_; } 1242 private: 1243 ActionNode::ActionType action_type_; 1244 int reg_; 1245 DeferredAction* next_; 1246 friend class Trace; 1247 }; 1248 1249 class DeferredCapture : public DeferredAction { 1250 public: 1251 DeferredCapture(int reg, bool is_capture, Trace* trace) 1252 : DeferredAction(ActionNode::STORE_POSITION, reg), 1253 cp_offset_(trace->cp_offset()), 1254 is_capture_(is_capture) { } 1255 int cp_offset() { return cp_offset_; } 1256 bool is_capture() { return is_capture_; } 1257 private: 1258 int cp_offset_; 1259 bool is_capture_; 1260 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 1261 }; 1262 1263 class DeferredSetRegister : public DeferredAction { 1264 public: 1265 DeferredSetRegister(int reg, int value) 1266 : DeferredAction(ActionNode::SET_REGISTER, reg), 1267 value_(value) { } 1268 int value() { return value_; } 1269 private: 1270 int value_; 1271 }; 1272 1273 class DeferredClearCaptures : public DeferredAction { 1274 public: 1275 explicit DeferredClearCaptures(Interval range) 1276 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), 1277 range_(range) { } 1278 Interval range() { return range_; } 1279 private: 1280 Interval range_; 1281 }; 1282 1283 class DeferredIncrementRegister : public DeferredAction { 1284 public: 1285 explicit DeferredIncrementRegister(int reg) 1286 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { } 1287 }; 1288 1289 Trace() 1290 : cp_offset_(0), 1291 actions_(nullptr), 1292 backtrack_(nullptr), 1293 stop_node_(nullptr), 1294 loop_label_(nullptr), 1295 characters_preloaded_(0), 1296 bound_checked_up_to_(0), 1297 flush_budget_(100), 1298 at_start_(UNKNOWN) {} 1299 1300 // End the trace. This involves flushing the deferred actions in the trace 1301 // and pushing a backtrack location onto the backtrack stack. Once this is 1302 // done we can start a new trace or go to one that has already been 1303 // generated. 1304 void Flush(RegExpCompiler* compiler, RegExpNode* successor); 1305 int cp_offset() { return cp_offset_; } 1306 DeferredAction* actions() { return actions_; } 1307 // A trivial trace is one that has no deferred actions or other state that 1308 // affects the assumptions used when generating code. There is no recorded 1309 // backtrack location in a trivial trace, so with a trivial trace we will 1310 // generate code that, on a failure to match, gets the backtrack location 1311 // from the backtrack stack rather than using a direct jump instruction. We 1312 // always start code generation with a trivial trace and non-trivial traces 1313 // are created as we emit code for nodes or add to the list of deferred 1314 // actions in the trace. The location of the code generated for a node using 1315 // a trivial trace is recorded in a label in the node so that gotos can be 1316 // generated to that code. 1317 bool is_trivial() { 1318 return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && 1319 characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && 1320 quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; 1321 } 1322 TriBool at_start() { return at_start_; } 1323 void set_at_start(TriBool at_start) { at_start_ = at_start; } 1324 Label* backtrack() { return backtrack_; } 1325 Label* loop_label() { return loop_label_; } 1326 RegExpNode* stop_node() { return stop_node_; } 1327 int characters_preloaded() { return characters_preloaded_; } 1328 int bound_checked_up_to() { return bound_checked_up_to_; } 1329 int flush_budget() { return flush_budget_; } 1330 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } 1331 bool mentions_reg(int reg); 1332 // Returns true if a deferred position store exists to the specified 1333 // register and stores the offset in the out-parameter. Otherwise 1334 // returns false. 1335 bool GetStoredPosition(int reg, int* cp_offset); 1336 // These set methods and AdvanceCurrentPositionInTrace should be used only on 1337 // new traces - the intention is that traces are immutable after creation. 1338 void add_action(DeferredAction* new_action) { 1339 DCHECK(new_action->next_ == nullptr); 1340 new_action->next_ = actions_; 1341 actions_ = new_action; 1342 } 1343 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } 1344 void set_stop_node(RegExpNode* node) { stop_node_ = node; } 1345 void set_loop_label(Label* label) { loop_label_ = label; } 1346 void set_characters_preloaded(int count) { characters_preloaded_ = count; } 1347 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } 1348 void set_flush_budget(int to) { flush_budget_ = to; } 1349 void set_quick_check_performed(QuickCheckDetails* d) { 1350 quick_check_performed_ = *d; 1351 } 1352 void InvalidateCurrentCharacter(); 1353 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); 1354 1355 private: 1356 int FindAffectedRegisters(OutSet* affected_registers, Zone* zone); 1357 void PerformDeferredActions(RegExpMacroAssembler* macro, 1358 int max_register, 1359 const OutSet& affected_registers, 1360 OutSet* registers_to_pop, 1361 OutSet* registers_to_clear, 1362 Zone* zone); 1363 void RestoreAffectedRegisters(RegExpMacroAssembler* macro, 1364 int max_register, 1365 const OutSet& registers_to_pop, 1366 const OutSet& registers_to_clear); 1367 int cp_offset_; 1368 DeferredAction* actions_; 1369 Label* backtrack_; 1370 RegExpNode* stop_node_; 1371 Label* loop_label_; 1372 int characters_preloaded_; 1373 int bound_checked_up_to_; 1374 QuickCheckDetails quick_check_performed_; 1375 int flush_budget_; 1376 TriBool at_start_; 1377 }; 1378 1379 1380 class GreedyLoopState { 1381 public: 1382 explicit GreedyLoopState(bool not_at_start); 1383 1384 Label* label() { return &label_; } 1385 Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } 1386 1387 private: 1388 Label label_; 1389 Trace counter_backtrack_trace_; 1390 }; 1391 1392 1393 struct PreloadState { 1394 static const int kEatsAtLeastNotYetInitialized = -1; 1395 bool preload_is_current_; 1396 bool preload_has_checked_bounds_; 1397 int preload_characters_; 1398 int eats_at_least_; 1399 void init() { 1400 eats_at_least_ = kEatsAtLeastNotYetInitialized; 1401 } 1402 }; 1403 1404 1405 class NodeVisitor { 1406 public: 1407 virtual ~NodeVisitor() { } 1408 #define DECLARE_VISIT(Type) \ 1409 virtual void Visit##Type(Type##Node* that) = 0; 1410 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 1411 #undef DECLARE_VISIT 1412 virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); } 1413 }; 1414 1415 1416 // Node visitor used to add the start set of the alternatives to the 1417 // dispatch table of a choice node. 1418 class DispatchTableConstructor: public NodeVisitor { 1419 public: 1420 DispatchTableConstructor(DispatchTable* table, bool ignore_case, 1421 Zone* zone) 1422 : table_(table), 1423 choice_index_(-1), 1424 ignore_case_(ignore_case), 1425 zone_(zone) { } 1426 1427 void BuildTable(ChoiceNode* node); 1428 1429 void AddRange(CharacterRange range) { 1430 table()->AddRange(range, choice_index_, zone_); 1431 } 1432 1433 void AddInverse(ZoneList<CharacterRange>* ranges); 1434 1435 #define DECLARE_VISIT(Type) \ 1436 virtual void Visit##Type(Type##Node* that); 1437 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 1438 #undef DECLARE_VISIT 1439 1440 DispatchTable* table() { return table_; } 1441 void set_choice_index(int value) { choice_index_ = value; } 1442 1443 protected: 1444 DispatchTable* table_; 1445 int choice_index_; 1446 bool ignore_case_; 1447 Zone* zone_; 1448 }; 1449 1450 1451 // Assertion propagation moves information about assertions such as 1452 // \b to the affected nodes. For instance, in /.\b./ information must 1453 // be propagated to the first '.' that whatever follows needs to know 1454 // if it matched a word or a non-word, and to the second '.' that it 1455 // has to check if it succeeds a word or non-word. In this case the 1456 // result will be something like: 1457 // 1458 // +-------+ +------------+ 1459 // | . | | . | 1460 // +-------+ ---> +------------+ 1461 // | word? | | check word | 1462 // +-------+ +------------+ 1463 class Analysis: public NodeVisitor { 1464 public: 1465 Analysis(Isolate* isolate, bool is_one_byte) 1466 : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {} 1467 void EnsureAnalyzed(RegExpNode* node); 1468 1469 #define DECLARE_VISIT(Type) \ 1470 virtual void Visit##Type(Type##Node* that); 1471 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 1472 #undef DECLARE_VISIT 1473 virtual void VisitLoopChoice(LoopChoiceNode* that); 1474 1475 bool has_failed() { return error_message_ != nullptr; } 1476 const char* error_message() { 1477 DCHECK(error_message_ != nullptr); 1478 return error_message_; 1479 } 1480 void fail(const char* error_message) { 1481 error_message_ = error_message; 1482 } 1483 1484 Isolate* isolate() const { return isolate_; } 1485 1486 private: 1487 Isolate* isolate_; 1488 bool is_one_byte_; 1489 const char* error_message_; 1490 1491 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); 1492 }; 1493 1494 1495 struct RegExpCompileData { 1496 RegExpCompileData() 1497 : tree(nullptr), 1498 node(nullptr), 1499 simple(true), 1500 contains_anchor(false), 1501 capture_count(0) {} 1502 RegExpTree* tree; 1503 RegExpNode* node; 1504 bool simple; 1505 bool contains_anchor; 1506 Handle<FixedArray> capture_name_map; 1507 Handle<String> error; 1508 int capture_count; 1509 }; 1510 1511 1512 class RegExpEngine: public AllStatic { 1513 public: 1514 struct CompilationResult { 1515 CompilationResult(Isolate* isolate, const char* error_message) 1516 : error_message(error_message), 1517 code(ReadOnlyRoots(isolate).the_hole_value()), 1518 num_registers(0) {} 1519 CompilationResult(Object* code, int registers) 1520 : error_message(nullptr), code(code), num_registers(registers) {} 1521 const char* error_message; 1522 Object* code; 1523 int num_registers; 1524 }; 1525 1526 static CompilationResult Compile(Isolate* isolate, Zone* zone, 1527 RegExpCompileData* input, 1528 JSRegExp::Flags flags, 1529 Handle<String> pattern, 1530 Handle<String> sample_subject, 1531 bool is_one_byte); 1532 1533 static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern); 1534 1535 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); 1536 }; 1537 1538 1539 class RegExpResultsCache : public AllStatic { 1540 public: 1541 enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS }; 1542 1543 // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi. 1544 // On success, the returned result is guaranteed to be a COW-array. 1545 static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern, 1546 FixedArray** last_match_out, ResultsCacheType type); 1547 // Attempt to add value_array to the cache specified by type. On success, 1548 // value_array is turned into a COW-array. 1549 static void Enter(Isolate* isolate, Handle<String> key_string, 1550 Handle<Object> key_pattern, Handle<FixedArray> value_array, 1551 Handle<FixedArray> last_match_cache, ResultsCacheType type); 1552 static void Clear(FixedArray* cache); 1553 static const int kRegExpResultsCacheSize = 0x100; 1554 1555 private: 1556 static const int kArrayEntriesPerCacheEntry = 4; 1557 static const int kStringOffset = 0; 1558 static const int kPatternOffset = 1; 1559 static const int kArrayOffset = 2; 1560 static const int kLastMatchOffset = 3; 1561 }; 1562 1563 } // namespace internal 1564 } // namespace v8 1565 1566 #endif // V8_REGEXP_JSREGEXP_H_ 1567