1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #ifndef V8_JSREGEXP_H_ 29 #define V8_JSREGEXP_H_ 30 31 #include "macro-assembler.h" 32 33 namespace v8 { 34 namespace internal { 35 36 37 class RegExpMacroAssembler; 38 39 40 class RegExpImpl { 41 public: 42 // Whether V8 is compiled with native regexp support or not. 43 static bool UsesNativeRegExp() { 44 #ifdef V8_NATIVE_REGEXP 45 return true; 46 #else 47 return false; 48 #endif 49 } 50 51 // Creates a regular expression literal in the old space. 52 // This function calls the garbage collector if necessary. 53 static Handle<Object> CreateRegExpLiteral(Handle<JSFunction> constructor, 54 Handle<String> pattern, 55 Handle<String> flags, 56 bool* has_pending_exception); 57 58 // Returns a string representation of a regular expression. 59 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4. 60 // This function calls the garbage collector if necessary. 61 static Handle<String> ToString(Handle<Object> value); 62 63 // Parses the RegExp pattern and prepares the JSRegExp object with 64 // generic data and choice of implementation - as well as what 65 // the implementation wants to store in the data field. 66 // Returns false if compilation fails. 67 static Handle<Object> Compile(Handle<JSRegExp> re, 68 Handle<String> pattern, 69 Handle<String> flags); 70 71 // See ECMA-262 section 15.10.6.2. 72 // This function calls the garbage collector if necessary. 73 static Handle<Object> Exec(Handle<JSRegExp> regexp, 74 Handle<String> subject, 75 int index, 76 Handle<JSArray> lastMatchInfo); 77 78 // Prepares a JSRegExp object with Irregexp-specific data. 79 static void IrregexpPrepare(Handle<JSRegExp> re, 80 Handle<String> pattern, 81 JSRegExp::Flags flags, 82 int capture_register_count); 83 84 85 static void AtomCompile(Handle<JSRegExp> re, 86 Handle<String> pattern, 87 JSRegExp::Flags flags, 88 Handle<String> match_pattern); 89 90 static Handle<Object> AtomExec(Handle<JSRegExp> regexp, 91 Handle<String> subject, 92 int index, 93 Handle<JSArray> lastMatchInfo); 94 95 // Execute an Irregexp bytecode pattern. 96 // On a successful match, the result is a JSArray containing 97 // captured positions. On a failure, the result is the null value. 98 // Returns an empty handle in case of an exception. 99 static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp, 100 Handle<String> subject, 101 int index, 102 Handle<JSArray> lastMatchInfo); 103 104 // Array index in the lastMatchInfo array. 105 static const int kLastCaptureCount = 0; 106 static const int kLastSubject = 1; 107 static const int kLastInput = 2; 108 static const int kFirstCapture = 3; 109 static const int kLastMatchOverhead = 3; 110 111 // Direct offset into the lastMatchInfo array. 112 static const int kLastCaptureCountOffset = 113 FixedArray::kHeaderSize + kLastCaptureCount * kPointerSize; 114 static const int kLastSubjectOffset = 115 FixedArray::kHeaderSize + kLastSubject * kPointerSize; 116 static const int kLastInputOffset = 117 FixedArray::kHeaderSize + kLastInput * kPointerSize; 118 static const int kFirstCaptureOffset = 119 FixedArray::kHeaderSize + kFirstCapture * kPointerSize; 120 121 // Used to access the lastMatchInfo array. 122 static int GetCapture(FixedArray* array, int index) { 123 return Smi::cast(array->get(index + kFirstCapture))->value(); 124 } 125 126 static void SetLastCaptureCount(FixedArray* array, int to) { 127 array->set(kLastCaptureCount, Smi::FromInt(to)); 128 } 129 130 static void SetLastSubject(FixedArray* array, String* to) { 131 array->set(kLastSubject, to); 132 } 133 134 static void SetLastInput(FixedArray* array, String* to) { 135 array->set(kLastInput, to); 136 } 137 138 static void SetCapture(FixedArray* array, int index, int to) { 139 array->set(index + kFirstCapture, Smi::FromInt(to)); 140 } 141 142 static int GetLastCaptureCount(FixedArray* array) { 143 return Smi::cast(array->get(kLastCaptureCount))->value(); 144 } 145 146 // For acting on the JSRegExp data FixedArray. 147 static int IrregexpMaxRegisterCount(FixedArray* re); 148 static void SetIrregexpMaxRegisterCount(FixedArray* re, int value); 149 static int IrregexpNumberOfCaptures(FixedArray* re); 150 static int IrregexpNumberOfRegisters(FixedArray* re); 151 static ByteArray* IrregexpByteCode(FixedArray* re, bool is_ascii); 152 static Code* IrregexpNativeCode(FixedArray* re, bool is_ascii); 153 154 private: 155 static String* last_ascii_string_; 156 static String* two_byte_cached_string_; 157 158 static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii); 159 static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii); 160 161 162 // Set the subject cache. The previous string buffer is not deleted, so the 163 // caller should ensure that it doesn't leak. 164 static void SetSubjectCache(String* subject, 165 char* utf8_subject, 166 int uft8_length, 167 int character_position, 168 int utf8_position); 169 170 // A one element cache of the last utf8_subject string and its length. The 171 // subject JS String object is cached in the heap. We also cache a 172 // translation between position and utf8 position. 173 static char* utf8_subject_cache_; 174 static int utf8_length_cache_; 175 static int utf8_position_; 176 static int character_position_; 177 }; 178 179 180 // Represents the location of one element relative to the intersection of 181 // two sets. Corresponds to the four areas of a Venn diagram. 182 enum ElementInSetsRelation { 183 kInsideNone = 0, 184 kInsideFirst = 1, 185 kInsideSecond = 2, 186 kInsideBoth = 3 187 }; 188 189 190 // Represents the relation of two sets. 191 // Sets can be either disjoint, partially or fully overlapping, or equal. 192 class SetRelation BASE_EMBEDDED { 193 public: 194 // Relation is represented by a bit saying whether there are elements in 195 // one set that is not in the other, and a bit saying that there are elements 196 // that are in both sets. 197 198 // Location of an element. Corresponds to the internal areas of 199 // a Venn diagram. 200 enum { 201 kInFirst = 1 << kInsideFirst, 202 kInSecond = 1 << kInsideSecond, 203 kInBoth = 1 << kInsideBoth 204 }; 205 SetRelation() : bits_(0) {} 206 ~SetRelation() {} 207 // Add the existence of objects in a particular 208 void SetElementsInFirstSet() { bits_ |= kInFirst; } 209 void SetElementsInSecondSet() { bits_ |= kInSecond; } 210 void SetElementsInBothSets() { bits_ |= kInBoth; } 211 // Check the currently known relation of the sets (common functions only, 212 // for other combinations, use value() to get the bits and check them 213 // manually). 214 // Sets are completely disjoint. 215 bool Disjoint() { return (bits_ & kInBoth) == 0; } 216 // Sets are equal. 217 bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; } 218 // First set contains second. 219 bool Contains() { return (bits_ & kInSecond) == 0; } 220 // Second set contains first. 221 bool ContainedIn() { return (bits_ & kInFirst) == 0; } 222 bool NonTrivialIntersection() { 223 return (bits_ == (kInFirst | kInSecond | kInBoth)); 224 } 225 int value() { return bits_; } 226 private: 227 int bits_; 228 }; 229 230 231 class CharacterRange { 232 public: 233 CharacterRange() : from_(0), to_(0) { } 234 // For compatibility with the CHECK_OK macro 235 CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT 236 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } 237 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges); 238 static Vector<const uc16> GetWordBounds(); 239 static inline CharacterRange Singleton(uc16 value) { 240 return CharacterRange(value, value); 241 } 242 static inline CharacterRange Range(uc16 from, uc16 to) { 243 ASSERT(from <= to); 244 return CharacterRange(from, to); 245 } 246 static inline CharacterRange Everything() { 247 return CharacterRange(0, 0xFFFF); 248 } 249 bool Contains(uc16 i) { return from_ <= i && i <= to_; } 250 uc16 from() const { return from_; } 251 void set_from(uc16 value) { from_ = value; } 252 uc16 to() const { return to_; } 253 void set_to(uc16 value) { to_ = value; } 254 bool is_valid() { return from_ <= to_; } 255 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } 256 bool IsSingleton() { return (from_ == to_); } 257 void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii); 258 static void Split(ZoneList<CharacterRange>* base, 259 Vector<const uc16> overlay, 260 ZoneList<CharacterRange>** included, 261 ZoneList<CharacterRange>** excluded); 262 // Whether a range list is in canonical form: Ranges ordered by from value, 263 // and ranges non-overlapping and non-adjacent. 264 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 265 // Convert range list to canonical form. The characters covered by the ranges 266 // will still be the same, but no character is in more than one range, and 267 // adjacent ranges are merged. The resulting list may be shorter than the 268 // original, but cannot be longer. 269 static void Canonicalize(ZoneList<CharacterRange>* ranges); 270 // Check how the set of characters defined by a CharacterRange list relates 271 // to the set of word characters. List must be in canonical form. 272 static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges); 273 // Takes two character range lists (representing character sets) in canonical 274 // form and merges them. 275 // The characters that are only covered by the first set are added to 276 // first_set_only_out. the characters that are only in the second set are 277 // added to second_set_only_out, and the characters that are in both are 278 // added to both_sets_out. 279 // The pointers to first_set_only_out, second_set_only_out and both_sets_out 280 // should be to empty lists, but they need not be distinct, and may be NULL. 281 // If NULL, the characters are dropped, and if two arguments are the same 282 // pointer, the result is the union of the two sets that would be created 283 // if the pointers had been distinct. 284 // This way, the Merge function can compute all the usual set operations: 285 // union (all three out-sets are equal), intersection (only both_sets_out is 286 // non-NULL), and set difference (only first_set is non-NULL). 287 static void Merge(ZoneList<CharacterRange>* first_set, 288 ZoneList<CharacterRange>* second_set, 289 ZoneList<CharacterRange>* first_set_only_out, 290 ZoneList<CharacterRange>* second_set_only_out, 291 ZoneList<CharacterRange>* both_sets_out); 292 // Negate the contents of a character range in canonical form. 293 static void Negate(ZoneList<CharacterRange>* src, 294 ZoneList<CharacterRange>* dst); 295 static const int kRangeCanonicalizeMax = 0x346; 296 static const int kStartMarker = (1 << 24); 297 static const int kPayloadMask = (1 << 24) - 1; 298 299 private: 300 uc16 from_; 301 uc16 to_; 302 }; 303 304 305 // A set of unsigned integers that behaves especially well on small 306 // integers (< 32). May do zone-allocation. 307 class OutSet: public ZoneObject { 308 public: 309 OutSet() : first_(0), remaining_(NULL), successors_(NULL) { } 310 OutSet* Extend(unsigned value); 311 bool Get(unsigned value); 312 static const unsigned kFirstLimit = 32; 313 314 private: 315 // Destructively set a value in this set. In most cases you want 316 // to use Extend instead to ensure that only one instance exists 317 // that contains the same values. 318 void Set(unsigned value); 319 320 // The successors are a list of sets that contain the same values 321 // as this set and the one more value that is not present in this 322 // set. 323 ZoneList<OutSet*>* successors() { return successors_; } 324 325 OutSet(uint32_t first, ZoneList<unsigned>* remaining) 326 : first_(first), remaining_(remaining), successors_(NULL) { } 327 uint32_t first_; 328 ZoneList<unsigned>* remaining_; 329 ZoneList<OutSet*>* successors_; 330 friend class Trace; 331 }; 332 333 334 // A mapping from integers, specified as ranges, to a set of integers. 335 // Used for mapping character ranges to choices. 336 class DispatchTable : public ZoneObject { 337 public: 338 class Entry { 339 public: 340 Entry() : from_(0), to_(0), out_set_(NULL) { } 341 Entry(uc16 from, uc16 to, OutSet* out_set) 342 : from_(from), to_(to), out_set_(out_set) { } 343 uc16 from() { return from_; } 344 uc16 to() { return to_; } 345 void set_to(uc16 value) { to_ = value; } 346 void AddValue(int value) { out_set_ = out_set_->Extend(value); } 347 OutSet* out_set() { return out_set_; } 348 private: 349 uc16 from_; 350 uc16 to_; 351 OutSet* out_set_; 352 }; 353 354 class Config { 355 public: 356 typedef uc16 Key; 357 typedef Entry Value; 358 static const uc16 kNoKey; 359 static const Entry kNoValue; 360 static inline int Compare(uc16 a, uc16 b) { 361 if (a == b) 362 return 0; 363 else if (a < b) 364 return -1; 365 else 366 return 1; 367 } 368 }; 369 370 void AddRange(CharacterRange range, int value); 371 OutSet* Get(uc16 value); 372 void Dump(); 373 374 template <typename Callback> 375 void ForEach(Callback* callback) { return tree()->ForEach(callback); } 376 private: 377 // There can't be a static empty set since it allocates its 378 // successors in a zone and caches them. 379 OutSet* empty() { return &empty_; } 380 OutSet empty_; 381 ZoneSplayTree<Config>* tree() { return &tree_; } 382 ZoneSplayTree<Config> tree_; 383 }; 384 385 386 #define FOR_EACH_NODE_TYPE(VISIT) \ 387 VISIT(End) \ 388 VISIT(Action) \ 389 VISIT(Choice) \ 390 VISIT(BackReference) \ 391 VISIT(Assertion) \ 392 VISIT(Text) 393 394 395 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 396 VISIT(Disjunction) \ 397 VISIT(Alternative) \ 398 VISIT(Assertion) \ 399 VISIT(CharacterClass) \ 400 VISIT(Atom) \ 401 VISIT(Quantifier) \ 402 VISIT(Capture) \ 403 VISIT(Lookahead) \ 404 VISIT(BackReference) \ 405 VISIT(Empty) \ 406 VISIT(Text) 407 408 409 #define FORWARD_DECLARE(Name) class RegExp##Name; 410 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 411 #undef FORWARD_DECLARE 412 413 414 class TextElement { 415 public: 416 enum Type {UNINITIALIZED, ATOM, CHAR_CLASS}; 417 TextElement() : type(UNINITIALIZED) { } 418 explicit TextElement(Type t) : type(t), cp_offset(-1) { } 419 static TextElement Atom(RegExpAtom* atom); 420 static TextElement CharClass(RegExpCharacterClass* char_class); 421 int length(); 422 Type type; 423 union { 424 RegExpAtom* u_atom; 425 RegExpCharacterClass* u_char_class; 426 } data; 427 int cp_offset; 428 }; 429 430 431 class Trace; 432 433 434 struct NodeInfo { 435 NodeInfo() 436 : being_analyzed(false), 437 been_analyzed(false), 438 follows_word_interest(false), 439 follows_newline_interest(false), 440 follows_start_interest(false), 441 at_end(false), 442 visited(false) { } 443 444 // Returns true if the interests and assumptions of this node 445 // matches the given one. 446 bool Matches(NodeInfo* that) { 447 return (at_end == that->at_end) && 448 (follows_word_interest == that->follows_word_interest) && 449 (follows_newline_interest == that->follows_newline_interest) && 450 (follows_start_interest == that->follows_start_interest); 451 } 452 453 // Updates the interests of this node given the interests of the 454 // node preceding it. 455 void AddFromPreceding(NodeInfo* that) { 456 at_end |= that->at_end; 457 follows_word_interest |= that->follows_word_interest; 458 follows_newline_interest |= that->follows_newline_interest; 459 follows_start_interest |= that->follows_start_interest; 460 } 461 462 bool HasLookbehind() { 463 return follows_word_interest || 464 follows_newline_interest || 465 follows_start_interest; 466 } 467 468 // Sets the interests of this node to include the interests of the 469 // following node. 470 void AddFromFollowing(NodeInfo* that) { 471 follows_word_interest |= that->follows_word_interest; 472 follows_newline_interest |= that->follows_newline_interest; 473 follows_start_interest |= that->follows_start_interest; 474 } 475 476 void ResetCompilationState() { 477 being_analyzed = false; 478 been_analyzed = false; 479 } 480 481 bool being_analyzed: 1; 482 bool been_analyzed: 1; 483 484 // These bits are set of this node has to know what the preceding 485 // character was. 486 bool follows_word_interest: 1; 487 bool follows_newline_interest: 1; 488 bool follows_start_interest: 1; 489 490 bool at_end: 1; 491 bool visited: 1; 492 }; 493 494 495 class SiblingList { 496 public: 497 SiblingList() : list_(NULL) { } 498 int length() { 499 return list_ == NULL ? 0 : list_->length(); 500 } 501 void Ensure(RegExpNode* parent) { 502 if (list_ == NULL) { 503 list_ = new ZoneList<RegExpNode*>(2); 504 list_->Add(parent); 505 } 506 } 507 void Add(RegExpNode* node) { list_->Add(node); } 508 RegExpNode* Get(int index) { return list_->at(index); } 509 private: 510 ZoneList<RegExpNode*>* list_; 511 }; 512 513 514 // Details of a quick mask-compare check that can look ahead in the 515 // input stream. 516 class QuickCheckDetails { 517 public: 518 QuickCheckDetails() 519 : characters_(0), 520 mask_(0), 521 value_(0), 522 cannot_match_(false) { } 523 explicit QuickCheckDetails(int characters) 524 : characters_(characters), 525 mask_(0), 526 value_(0), 527 cannot_match_(false) { } 528 bool Rationalize(bool ascii); 529 // Merge in the information from another branch of an alternation. 530 void Merge(QuickCheckDetails* other, int from_index); 531 // Advance the current position by some amount. 532 void Advance(int by, bool ascii); 533 void Clear(); 534 bool cannot_match() { return cannot_match_; } 535 void set_cannot_match() { cannot_match_ = true; } 536 struct Position { 537 Position() : mask(0), value(0), determines_perfectly(false) { } 538 uc16 mask; 539 uc16 value; 540 bool determines_perfectly; 541 }; 542 int characters() { return characters_; } 543 void set_characters(int characters) { characters_ = characters; } 544 Position* positions(int index) { 545 ASSERT(index >= 0); 546 ASSERT(index < characters_); 547 return positions_ + index; 548 } 549 uint32_t mask() { return mask_; } 550 uint32_t value() { return value_; } 551 552 private: 553 // How many characters do we have quick check information from. This is 554 // the same for all branches of a choice node. 555 int characters_; 556 Position positions_[4]; 557 // These values are the condensate of the above array after Rationalize(). 558 uint32_t mask_; 559 uint32_t value_; 560 // If set to true, there is no way this quick check can match at all. 561 // E.g., if it requires to be at the start of the input, and isn't. 562 bool cannot_match_; 563 }; 564 565 566 class RegExpNode: public ZoneObject { 567 public: 568 RegExpNode() : first_character_set_(NULL), trace_count_(0) { } 569 virtual ~RegExpNode(); 570 virtual void Accept(NodeVisitor* visitor) = 0; 571 // Generates a goto to this node or actually generates the code at this point. 572 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 573 // How many characters must this node consume at a minimum in order to 574 // succeed. If we have found at least 'still_to_find' characters that 575 // must be consumed there is no need to ask any following nodes whether 576 // they are sure to eat any more characters. 577 virtual int EatsAtLeast(int still_to_find, int recursion_depth) = 0; 578 // Emits some quick code that checks whether the preloaded characters match. 579 // Falls through on certain failure, jumps to the label on possible success. 580 // If the node cannot make a quick check it does nothing and returns false. 581 bool EmitQuickCheck(RegExpCompiler* compiler, 582 Trace* trace, 583 bool preload_has_checked_bounds, 584 Label* on_possible_success, 585 QuickCheckDetails* details_return, 586 bool fall_through_on_failure); 587 // For a given number of characters this returns a mask and a value. The 588 // next n characters are anded with the mask and compared with the value. 589 // A comparison failure indicates the node cannot match the next n characters. 590 // A comparison success indicates the node may match. 591 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 592 RegExpCompiler* compiler, 593 int characters_filled_in, 594 bool not_at_start) = 0; 595 static const int kNodeIsTooComplexForGreedyLoops = -1; 596 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 597 Label* label() { return &label_; } 598 // If non-generic code is generated for a node (ie the node is not at the 599 // start of the trace) then it cannot be reused. This variable sets a limit 600 // on how often we allow that to happen before we insist on starting a new 601 // trace and generating generic code for a node that can be reused by flushing 602 // the deferred actions in the current trace and generating a goto. 603 static const int kMaxCopiesCodeGenerated = 10; 604 605 NodeInfo* info() { return &info_; } 606 607 void AddSibling(RegExpNode* node) { siblings_.Add(node); } 608 609 // Static version of EnsureSibling that expresses the fact that the 610 // result has the same type as the input. 611 template <class C> 612 static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) { 613 return static_cast<C*>(node->EnsureSibling(info, cloned)); 614 } 615 616 SiblingList* siblings() { return &siblings_; } 617 void set_siblings(SiblingList* other) { siblings_ = *other; } 618 619 // Return the set of possible next characters recognized by the regexp 620 // (or a safe subset, potentially the set of all characters). 621 ZoneList<CharacterRange>* FirstCharacterSet(); 622 623 // Compute (if possible within the budget of traversed nodes) the 624 // possible first characters of the input matched by this node and 625 // its continuation. Returns the remaining budget after the computation. 626 // If the budget is spent, the result is negative, and the cached 627 // first_character_set_ value isn't set. 628 virtual int ComputeFirstCharacterSet(int budget); 629 630 // Get and set the cached first character set value. 631 ZoneList<CharacterRange>* first_character_set() { 632 return first_character_set_; 633 } 634 void set_first_character_set(ZoneList<CharacterRange>* character_set) { 635 first_character_set_ = character_set; 636 } 637 638 protected: 639 enum LimitResult { DONE, CONTINUE }; 640 static const int kComputeFirstCharacterSetFail = -1; 641 642 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 643 644 // Returns a sibling of this node whose interests and assumptions 645 // match the ones in the given node info. If no sibling exists NULL 646 // is returned. 647 RegExpNode* TryGetSibling(NodeInfo* info); 648 649 // Returns a sibling of this node whose interests match the ones in 650 // the given node info. The info must not contain any assertions. 651 // If no node exists a new one will be created by cloning the current 652 // node. The result will always be an instance of the same concrete 653 // class as this node. 654 RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned); 655 656 // Returns a clone of this node initialized using the copy constructor 657 // of its concrete class. Note that the node may have to be pre- 658 // processed before it is on a usable state. 659 virtual RegExpNode* Clone() = 0; 660 661 private: 662 static const int kFirstCharBudget = 10; 663 Label label_; 664 NodeInfo info_; 665 SiblingList siblings_; 666 ZoneList<CharacterRange>* first_character_set_; 667 // This variable keeps track of how many times code has been generated for 668 // this node (in different traces). We don't keep track of where the 669 // generated code is located unless the code is generated at the start of 670 // a trace, in which case it is generic and can be reused by flushing the 671 // deferred operations in the current trace and generating a goto. 672 int trace_count_; 673 }; 674 675 676 // A simple closed interval. 677 class Interval { 678 public: 679 Interval() : from_(kNone), to_(kNone) { } 680 Interval(int from, int to) : from_(from), to_(to) { } 681 Interval Union(Interval that) { 682 if (that.from_ == kNone) 683 return *this; 684 else if (from_ == kNone) 685 return that; 686 else 687 return Interval(Min(from_, that.from_), Max(to_, that.to_)); 688 } 689 bool Contains(int value) { 690 return (from_ <= value) && (value <= to_); 691 } 692 bool is_empty() { return from_ == kNone; } 693 int from() { return from_; } 694 int to() { return to_; } 695 static Interval Empty() { return Interval(); } 696 static const int kNone = -1; 697 private: 698 int from_; 699 int to_; 700 }; 701 702 703 class SeqRegExpNode: public RegExpNode { 704 public: 705 explicit SeqRegExpNode(RegExpNode* on_success) 706 : on_success_(on_success) { } 707 RegExpNode* on_success() { return on_success_; } 708 void set_on_success(RegExpNode* node) { on_success_ = node; } 709 private: 710 RegExpNode* on_success_; 711 }; 712 713 714 class ActionNode: public SeqRegExpNode { 715 public: 716 enum Type { 717 SET_REGISTER, 718 INCREMENT_REGISTER, 719 STORE_POSITION, 720 BEGIN_SUBMATCH, 721 POSITIVE_SUBMATCH_SUCCESS, 722 EMPTY_MATCH_CHECK, 723 CLEAR_CAPTURES 724 }; 725 static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success); 726 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); 727 static ActionNode* StorePosition(int reg, 728 bool is_capture, 729 RegExpNode* on_success); 730 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); 731 static ActionNode* BeginSubmatch(int stack_pointer_reg, 732 int position_reg, 733 RegExpNode* on_success); 734 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, 735 int restore_reg, 736 int clear_capture_count, 737 int clear_capture_from, 738 RegExpNode* on_success); 739 static ActionNode* EmptyMatchCheck(int start_register, 740 int repetition_register, 741 int repetition_limit, 742 RegExpNode* on_success); 743 virtual void Accept(NodeVisitor* visitor); 744 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 745 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 746 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 747 RegExpCompiler* compiler, 748 int filled_in, 749 bool not_at_start) { 750 return on_success()->GetQuickCheckDetails( 751 details, compiler, filled_in, not_at_start); 752 } 753 Type type() { return type_; } 754 // TODO(erikcorry): We should allow some action nodes in greedy loops. 755 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 756 virtual ActionNode* Clone() { return new ActionNode(*this); } 757 virtual int ComputeFirstCharacterSet(int budget); 758 private: 759 union { 760 struct { 761 int reg; 762 int value; 763 } u_store_register; 764 struct { 765 int reg; 766 } u_increment_register; 767 struct { 768 int reg; 769 bool is_capture; 770 } u_position_register; 771 struct { 772 int stack_pointer_register; 773 int current_position_register; 774 int clear_register_count; 775 int clear_register_from; 776 } u_submatch; 777 struct { 778 int start_register; 779 int repetition_register; 780 int repetition_limit; 781 } u_empty_match_check; 782 struct { 783 int range_from; 784 int range_to; 785 } u_clear_captures; 786 } data_; 787 ActionNode(Type type, RegExpNode* on_success) 788 : SeqRegExpNode(on_success), 789 type_(type) { } 790 Type type_; 791 friend class DotPrinter; 792 }; 793 794 795 class TextNode: public SeqRegExpNode { 796 public: 797 TextNode(ZoneList<TextElement>* elms, 798 RegExpNode* on_success) 799 : SeqRegExpNode(on_success), 800 elms_(elms) { } 801 TextNode(RegExpCharacterClass* that, 802 RegExpNode* on_success) 803 : SeqRegExpNode(on_success), 804 elms_(new ZoneList<TextElement>(1)) { 805 elms_->Add(TextElement::CharClass(that)); 806 } 807 virtual void Accept(NodeVisitor* visitor); 808 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 809 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 810 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 811 RegExpCompiler* compiler, 812 int characters_filled_in, 813 bool not_at_start); 814 ZoneList<TextElement>* elements() { return elms_; } 815 void MakeCaseIndependent(bool is_ascii); 816 virtual int GreedyLoopTextLength(); 817 virtual TextNode* Clone() { 818 TextNode* result = new TextNode(*this); 819 result->CalculateOffsets(); 820 return result; 821 } 822 void CalculateOffsets(); 823 virtual int ComputeFirstCharacterSet(int budget); 824 private: 825 enum TextEmitPassType { 826 NON_ASCII_MATCH, // Check for characters that can't match. 827 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 828 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 829 CASE_CHARACTER_MATCH, // Case-independent single character check. 830 CHARACTER_CLASS_MATCH // Character class. 831 }; 832 static bool SkipPass(int pass, bool ignore_case); 833 static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; 834 static const int kLastPass = CHARACTER_CLASS_MATCH; 835 void TextEmitPass(RegExpCompiler* compiler, 836 TextEmitPassType pass, 837 bool preloaded, 838 Trace* trace, 839 bool first_element_checked, 840 int* checked_up_to); 841 int Length(); 842 ZoneList<TextElement>* elms_; 843 }; 844 845 846 class AssertionNode: public SeqRegExpNode { 847 public: 848 enum AssertionNodeType { 849 AT_END, 850 AT_START, 851 AT_BOUNDARY, 852 AT_NON_BOUNDARY, 853 AFTER_NEWLINE, 854 // Types not directly expressible in regexp syntax. 855 // Used for modifying a boundary node if its following character is 856 // known to be word and/or non-word. 857 AFTER_NONWORD_CHARACTER, 858 AFTER_WORD_CHARACTER 859 }; 860 static AssertionNode* AtEnd(RegExpNode* on_success) { 861 return new AssertionNode(AT_END, on_success); 862 } 863 static AssertionNode* AtStart(RegExpNode* on_success) { 864 return new AssertionNode(AT_START, on_success); 865 } 866 static AssertionNode* AtBoundary(RegExpNode* on_success) { 867 return new AssertionNode(AT_BOUNDARY, on_success); 868 } 869 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 870 return new AssertionNode(AT_NON_BOUNDARY, on_success); 871 } 872 static AssertionNode* AfterNewline(RegExpNode* on_success) { 873 return new AssertionNode(AFTER_NEWLINE, on_success); 874 } 875 virtual void Accept(NodeVisitor* visitor); 876 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 877 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 878 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 879 RegExpCompiler* compiler, 880 int filled_in, 881 bool not_at_start); 882 virtual int ComputeFirstCharacterSet(int budget); 883 virtual AssertionNode* Clone() { return new AssertionNode(*this); } 884 AssertionNodeType type() { return type_; } 885 void set_type(AssertionNodeType type) { type_ = type; } 886 private: 887 AssertionNode(AssertionNodeType t, RegExpNode* on_success) 888 : SeqRegExpNode(on_success), type_(t) { } 889 AssertionNodeType type_; 890 }; 891 892 893 class BackReferenceNode: public SeqRegExpNode { 894 public: 895 BackReferenceNode(int start_reg, 896 int end_reg, 897 RegExpNode* on_success) 898 : SeqRegExpNode(on_success), 899 start_reg_(start_reg), 900 end_reg_(end_reg) { } 901 virtual void Accept(NodeVisitor* visitor); 902 int start_register() { return start_reg_; } 903 int end_register() { return end_reg_; } 904 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 905 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 906 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 907 RegExpCompiler* compiler, 908 int characters_filled_in, 909 bool not_at_start) { 910 return; 911 } 912 virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } 913 virtual int ComputeFirstCharacterSet(int budget); 914 private: 915 int start_reg_; 916 int end_reg_; 917 }; 918 919 920 class EndNode: public RegExpNode { 921 public: 922 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; 923 explicit EndNode(Action action) : action_(action) { } 924 virtual void Accept(NodeVisitor* visitor); 925 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 926 virtual int EatsAtLeast(int still_to_find, int recursion_depth) { return 0; } 927 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 928 RegExpCompiler* compiler, 929 int characters_filled_in, 930 bool not_at_start) { 931 // Returning 0 from EatsAtLeast should ensure we never get here. 932 UNREACHABLE(); 933 } 934 virtual EndNode* Clone() { return new EndNode(*this); } 935 private: 936 Action action_; 937 }; 938 939 940 class NegativeSubmatchSuccess: public EndNode { 941 public: 942 NegativeSubmatchSuccess(int stack_pointer_reg, 943 int position_reg, 944 int clear_capture_count, 945 int clear_capture_start) 946 : EndNode(NEGATIVE_SUBMATCH_SUCCESS), 947 stack_pointer_register_(stack_pointer_reg), 948 current_position_register_(position_reg), 949 clear_capture_count_(clear_capture_count), 950 clear_capture_start_(clear_capture_start) { } 951 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 952 953 private: 954 int stack_pointer_register_; 955 int current_position_register_; 956 int clear_capture_count_; 957 int clear_capture_start_; 958 }; 959 960 961 class Guard: public ZoneObject { 962 public: 963 enum Relation { LT, GEQ }; 964 Guard(int reg, Relation op, int value) 965 : reg_(reg), 966 op_(op), 967 value_(value) { } 968 int reg() { return reg_; } 969 Relation op() { return op_; } 970 int value() { return value_; } 971 972 private: 973 int reg_; 974 Relation op_; 975 int value_; 976 }; 977 978 979 class GuardedAlternative { 980 public: 981 explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { } 982 void AddGuard(Guard* guard); 983 RegExpNode* node() { return node_; } 984 void set_node(RegExpNode* node) { node_ = node; } 985 ZoneList<Guard*>* guards() { return guards_; } 986 987 private: 988 RegExpNode* node_; 989 ZoneList<Guard*>* guards_; 990 }; 991 992 993 class AlternativeGeneration; 994 995 996 class ChoiceNode: public RegExpNode { 997 public: 998 explicit ChoiceNode(int expected_size) 999 : alternatives_(new ZoneList<GuardedAlternative>(expected_size)), 1000 table_(NULL), 1001 not_at_start_(false), 1002 being_calculated_(false) { } 1003 virtual void Accept(NodeVisitor* visitor); 1004 void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); } 1005 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } 1006 DispatchTable* GetTable(bool ignore_case); 1007 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 1008 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 1009 int EatsAtLeastHelper(int still_to_find, 1010 int recursion_depth, 1011 RegExpNode* ignore_this_node); 1012 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1013 RegExpCompiler* compiler, 1014 int characters_filled_in, 1015 bool not_at_start); 1016 virtual ChoiceNode* Clone() { return new ChoiceNode(*this); } 1017 1018 bool being_calculated() { return being_calculated_; } 1019 bool not_at_start() { return not_at_start_; } 1020 void set_not_at_start() { not_at_start_ = true; } 1021 void set_being_calculated(bool b) { being_calculated_ = b; } 1022 virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; } 1023 1024 protected: 1025 int GreedyLoopTextLength(GuardedAlternative* alternative); 1026 ZoneList<GuardedAlternative>* alternatives_; 1027 1028 private: 1029 friend class DispatchTableConstructor; 1030 friend class Analysis; 1031 void GenerateGuard(RegExpMacroAssembler* macro_assembler, 1032 Guard* guard, 1033 Trace* trace); 1034 int CalculatePreloadCharacters(RegExpCompiler* compiler); 1035 void EmitOutOfLineContinuation(RegExpCompiler* compiler, 1036 Trace* trace, 1037 GuardedAlternative alternative, 1038 AlternativeGeneration* alt_gen, 1039 int preload_characters, 1040 bool next_expects_preload); 1041 DispatchTable* table_; 1042 // If true, this node is never checked at the start of the input. 1043 // Allows a new trace to start with at_start() set to false. 1044 bool not_at_start_; 1045 bool being_calculated_; 1046 }; 1047 1048 1049 class NegativeLookaheadChoiceNode: public ChoiceNode { 1050 public: 1051 explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail, 1052 GuardedAlternative then_do_this) 1053 : ChoiceNode(2) { 1054 AddAlternative(this_must_fail); 1055 AddAlternative(then_do_this); 1056 } 1057 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 1058 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1059 RegExpCompiler* compiler, 1060 int characters_filled_in, 1061 bool not_at_start); 1062 // For a negative lookahead we don't emit the quick check for the 1063 // alternative that is expected to fail. This is because quick check code 1064 // starts by loading enough characters for the alternative that takes fewest 1065 // characters, but on a negative lookahead the negative branch did not take 1066 // part in that calculation (EatsAtLeast) so the assumptions don't hold. 1067 virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } 1068 virtual int ComputeFirstCharacterSet(int budget); 1069 }; 1070 1071 1072 class LoopChoiceNode: public ChoiceNode { 1073 public: 1074 explicit LoopChoiceNode(bool body_can_be_zero_length) 1075 : ChoiceNode(2), 1076 loop_node_(NULL), 1077 continue_node_(NULL), 1078 body_can_be_zero_length_(body_can_be_zero_length) { } 1079 void AddLoopAlternative(GuardedAlternative alt); 1080 void AddContinueAlternative(GuardedAlternative alt); 1081 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 1082 virtual int EatsAtLeast(int still_to_find, int recursion_depth); 1083 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1084 RegExpCompiler* compiler, 1085 int characters_filled_in, 1086 bool not_at_start); 1087 virtual int ComputeFirstCharacterSet(int budget); 1088 virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } 1089 RegExpNode* loop_node() { return loop_node_; } 1090 RegExpNode* continue_node() { return continue_node_; } 1091 bool body_can_be_zero_length() { return body_can_be_zero_length_; } 1092 virtual void Accept(NodeVisitor* visitor); 1093 1094 private: 1095 // AddAlternative is made private for loop nodes because alternatives 1096 // should not be added freely, we need to keep track of which node 1097 // goes back to the node itself. 1098 void AddAlternative(GuardedAlternative node) { 1099 ChoiceNode::AddAlternative(node); 1100 } 1101 1102 RegExpNode* loop_node_; 1103 RegExpNode* continue_node_; 1104 bool body_can_be_zero_length_; 1105 }; 1106 1107 1108 // There are many ways to generate code for a node. This class encapsulates 1109 // the current way we should be generating. In other words it encapsulates 1110 // the current state of the code generator. The effect of this is that we 1111 // generate code for paths that the matcher can take through the regular 1112 // expression. A given node in the regexp can be code-generated several times 1113 // as it can be part of several traces. For example for the regexp: 1114 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part 1115 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code 1116 // to match foo is generated only once (the traces have a common prefix). The 1117 // code to store the capture is deferred and generated (twice) after the places 1118 // where baz has been matched. 1119 class Trace { 1120 public: 1121 // A value for a property that is either known to be true, know to be false, 1122 // or not known. 1123 enum TriBool { 1124 UNKNOWN = -1, FALSE = 0, TRUE = 1 1125 }; 1126 1127 class DeferredAction { 1128 public: 1129 DeferredAction(ActionNode::Type type, int reg) 1130 : type_(type), reg_(reg), next_(NULL) { } 1131 DeferredAction* next() { return next_; } 1132 bool Mentions(int reg); 1133 int reg() { return reg_; } 1134 ActionNode::Type type() { return type_; } 1135 private: 1136 ActionNode::Type type_; 1137 int reg_; 1138 DeferredAction* next_; 1139 friend class Trace; 1140 }; 1141 1142 class DeferredCapture : public DeferredAction { 1143 public: 1144 DeferredCapture(int reg, bool is_capture, Trace* trace) 1145 : DeferredAction(ActionNode::STORE_POSITION, reg), 1146 cp_offset_(trace->cp_offset()), 1147 is_capture_(is_capture) { } 1148 int cp_offset() { return cp_offset_; } 1149 bool is_capture() { return is_capture_; } 1150 private: 1151 int cp_offset_; 1152 bool is_capture_; 1153 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 1154 }; 1155 1156 class DeferredSetRegister : public DeferredAction { 1157 public: 1158 DeferredSetRegister(int reg, int value) 1159 : DeferredAction(ActionNode::SET_REGISTER, reg), 1160 value_(value) { } 1161 int value() { return value_; } 1162 private: 1163 int value_; 1164 }; 1165 1166 class DeferredClearCaptures : public DeferredAction { 1167 public: 1168 explicit DeferredClearCaptures(Interval range) 1169 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), 1170 range_(range) { } 1171 Interval range() { return range_; } 1172 private: 1173 Interval range_; 1174 }; 1175 1176 class DeferredIncrementRegister : public DeferredAction { 1177 public: 1178 explicit DeferredIncrementRegister(int reg) 1179 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { } 1180 }; 1181 1182 Trace() 1183 : cp_offset_(0), 1184 actions_(NULL), 1185 backtrack_(NULL), 1186 stop_node_(NULL), 1187 loop_label_(NULL), 1188 characters_preloaded_(0), 1189 bound_checked_up_to_(0), 1190 flush_budget_(100), 1191 at_start_(UNKNOWN) { } 1192 1193 // End the trace. This involves flushing the deferred actions in the trace 1194 // and pushing a backtrack location onto the backtrack stack. Once this is 1195 // done we can start a new trace or go to one that has already been 1196 // generated. 1197 void Flush(RegExpCompiler* compiler, RegExpNode* successor); 1198 int cp_offset() { return cp_offset_; } 1199 DeferredAction* actions() { return actions_; } 1200 // A trivial trace is one that has no deferred actions or other state that 1201 // affects the assumptions used when generating code. There is no recorded 1202 // backtrack location in a trivial trace, so with a trivial trace we will 1203 // generate code that, on a failure to match, gets the backtrack location 1204 // from the backtrack stack rather than using a direct jump instruction. We 1205 // always start code generation with a trivial trace and non-trivial traces 1206 // are created as we emit code for nodes or add to the list of deferred 1207 // actions in the trace. The location of the code generated for a node using 1208 // a trivial trace is recorded in a label in the node so that gotos can be 1209 // generated to that code. 1210 bool is_trivial() { 1211 return backtrack_ == NULL && 1212 actions_ == NULL && 1213 cp_offset_ == 0 && 1214 characters_preloaded_ == 0 && 1215 bound_checked_up_to_ == 0 && 1216 quick_check_performed_.characters() == 0 && 1217 at_start_ == UNKNOWN; 1218 } 1219 TriBool at_start() { return at_start_; } 1220 void set_at_start(bool at_start) { at_start_ = at_start ? TRUE : FALSE; } 1221 Label* backtrack() { return backtrack_; } 1222 Label* loop_label() { return loop_label_; } 1223 RegExpNode* stop_node() { return stop_node_; } 1224 int characters_preloaded() { return characters_preloaded_; } 1225 int bound_checked_up_to() { return bound_checked_up_to_; } 1226 int flush_budget() { return flush_budget_; } 1227 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } 1228 bool mentions_reg(int reg); 1229 // Returns true if a deferred position store exists to the specified 1230 // register and stores the offset in the out-parameter. Otherwise 1231 // returns false. 1232 bool GetStoredPosition(int reg, int* cp_offset); 1233 // These set methods and AdvanceCurrentPositionInTrace should be used only on 1234 // new traces - the intention is that traces are immutable after creation. 1235 void add_action(DeferredAction* new_action) { 1236 ASSERT(new_action->next_ == NULL); 1237 new_action->next_ = actions_; 1238 actions_ = new_action; 1239 } 1240 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } 1241 void set_stop_node(RegExpNode* node) { stop_node_ = node; } 1242 void set_loop_label(Label* label) { loop_label_ = label; } 1243 void set_characters_preloaded(int count) { characters_preloaded_ = count; } 1244 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } 1245 void set_flush_budget(int to) { flush_budget_ = to; } 1246 void set_quick_check_performed(QuickCheckDetails* d) { 1247 quick_check_performed_ = *d; 1248 } 1249 void InvalidateCurrentCharacter(); 1250 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); 1251 private: 1252 int FindAffectedRegisters(OutSet* affected_registers); 1253 void PerformDeferredActions(RegExpMacroAssembler* macro, 1254 int max_register, 1255 OutSet& affected_registers, 1256 OutSet* registers_to_pop, 1257 OutSet* registers_to_clear); 1258 void RestoreAffectedRegisters(RegExpMacroAssembler* macro, 1259 int max_register, 1260 OutSet& registers_to_pop, 1261 OutSet& registers_to_clear); 1262 int cp_offset_; 1263 DeferredAction* actions_; 1264 Label* backtrack_; 1265 RegExpNode* stop_node_; 1266 Label* loop_label_; 1267 int characters_preloaded_; 1268 int bound_checked_up_to_; 1269 QuickCheckDetails quick_check_performed_; 1270 int flush_budget_; 1271 TriBool at_start_; 1272 }; 1273 1274 1275 class NodeVisitor { 1276 public: 1277 virtual ~NodeVisitor() { } 1278 #define DECLARE_VISIT(Type) \ 1279 virtual void Visit##Type(Type##Node* that) = 0; 1280 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 1281 #undef DECLARE_VISIT 1282 virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); } 1283 }; 1284 1285 1286 // Node visitor used to add the start set of the alternatives to the 1287 // dispatch table of a choice node. 1288 class DispatchTableConstructor: public NodeVisitor { 1289 public: 1290 DispatchTableConstructor(DispatchTable* table, bool ignore_case) 1291 : table_(table), 1292 choice_index_(-1), 1293 ignore_case_(ignore_case) { } 1294 1295 void BuildTable(ChoiceNode* node); 1296 1297 void AddRange(CharacterRange range) { 1298 table()->AddRange(range, choice_index_); 1299 } 1300 1301 void AddInverse(ZoneList<CharacterRange>* ranges); 1302 1303 #define DECLARE_VISIT(Type) \ 1304 virtual void Visit##Type(Type##Node* that); 1305 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 1306 #undef DECLARE_VISIT 1307 1308 DispatchTable* table() { return table_; } 1309 void set_choice_index(int value) { choice_index_ = value; } 1310 1311 protected: 1312 DispatchTable* table_; 1313 int choice_index_; 1314 bool ignore_case_; 1315 }; 1316 1317 1318 // Assertion propagation moves information about assertions such as 1319 // \b to the affected nodes. For instance, in /.\b./ information must 1320 // be propagated to the first '.' that whatever follows needs to know 1321 // if it matched a word or a non-word, and to the second '.' that it 1322 // has to check if it succeeds a word or non-word. In this case the 1323 // result will be something like: 1324 // 1325 // +-------+ +------------+ 1326 // | . | | . | 1327 // +-------+ ---> +------------+ 1328 // | word? | | check word | 1329 // +-------+ +------------+ 1330 class Analysis: public NodeVisitor { 1331 public: 1332 Analysis(bool ignore_case, bool is_ascii) 1333 : ignore_case_(ignore_case), 1334 is_ascii_(is_ascii), 1335 error_message_(NULL) { } 1336 void EnsureAnalyzed(RegExpNode* node); 1337 1338 #define DECLARE_VISIT(Type) \ 1339 virtual void Visit##Type(Type##Node* that); 1340 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 1341 #undef DECLARE_VISIT 1342 virtual void VisitLoopChoice(LoopChoiceNode* that); 1343 1344 bool has_failed() { return error_message_ != NULL; } 1345 const char* error_message() { 1346 ASSERT(error_message_ != NULL); 1347 return error_message_; 1348 } 1349 void fail(const char* error_message) { 1350 error_message_ = error_message; 1351 } 1352 private: 1353 bool ignore_case_; 1354 bool is_ascii_; 1355 const char* error_message_; 1356 1357 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); 1358 }; 1359 1360 1361 struct RegExpCompileData { 1362 RegExpCompileData() 1363 : tree(NULL), 1364 node(NULL), 1365 simple(true), 1366 contains_anchor(false), 1367 capture_count(0) { } 1368 RegExpTree* tree; 1369 RegExpNode* node; 1370 bool simple; 1371 bool contains_anchor; 1372 Handle<String> error; 1373 int capture_count; 1374 }; 1375 1376 1377 class RegExpEngine: public AllStatic { 1378 public: 1379 struct CompilationResult { 1380 explicit CompilationResult(const char* error_message) 1381 : error_message(error_message), 1382 code(Heap::the_hole_value()), 1383 num_registers(0) {} 1384 CompilationResult(Object* code, int registers) 1385 : error_message(NULL), 1386 code(code), 1387 num_registers(registers) {} 1388 const char* error_message; 1389 Object* code; 1390 int num_registers; 1391 }; 1392 1393 static CompilationResult Compile(RegExpCompileData* input, 1394 bool ignore_case, 1395 bool multiline, 1396 Handle<String> pattern, 1397 bool is_ascii); 1398 1399 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); 1400 }; 1401 1402 1403 class OffsetsVector { 1404 public: 1405 inline OffsetsVector(int num_registers) 1406 : offsets_vector_length_(num_registers) { 1407 if (offsets_vector_length_ > kStaticOffsetsVectorSize) { 1408 vector_ = NewArray<int>(offsets_vector_length_); 1409 } else { 1410 vector_ = static_offsets_vector_; 1411 } 1412 } 1413 inline ~OffsetsVector() { 1414 if (offsets_vector_length_ > kStaticOffsetsVectorSize) { 1415 DeleteArray(vector_); 1416 vector_ = NULL; 1417 } 1418 } 1419 inline int* vector() { return vector_; } 1420 inline int length() { return offsets_vector_length_; } 1421 1422 static const int kStaticOffsetsVectorSize = 50; 1423 1424 private: 1425 static Address static_offsets_vector_address() { 1426 return reinterpret_cast<Address>(&static_offsets_vector_); 1427 } 1428 1429 int* vector_; 1430 int offsets_vector_length_; 1431 static int static_offsets_vector_[kStaticOffsetsVectorSize]; 1432 1433 friend class ExternalReference; 1434 }; 1435 1436 1437 } } // namespace v8::internal 1438 1439 #endif // V8_JSREGEXP_H_ 1440