Home | History | Annotate | Download | only in regexp
      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_JSREGEXP_H_
      6 #define V8_REGEXP_JSREGEXP_H_
      7 
      8 #include "src/allocation.h"
      9 #include "src/assembler.h"
     10 #include "src/isolate.h"
     11 #include "src/objects/js-regexp.h"
     12 #include "src/regexp/regexp-ast.h"
     13 #include "src/regexp/regexp-macro-assembler.h"
     14 
     15 namespace v8 {
     16 namespace internal {
     17 
     18 class NodeVisitor;
     19 class RegExpCompiler;
     20 class RegExpMacroAssembler;
     21 class RegExpNode;
     22 class RegExpTree;
     23 class BoyerMooreLookahead;
     24 
     25 inline bool IgnoreCase(JSRegExp::Flags flags) {
     26   return (flags & JSRegExp::kIgnoreCase) != 0;
     27 }
     28 
     29 inline bool IsUnicode(JSRegExp::Flags flags) {
     30   return (flags & JSRegExp::kUnicode) != 0;
     31 }
     32 
     33 inline bool IsSticky(JSRegExp::Flags flags) {
     34   return (flags & JSRegExp::kSticky) != 0;
     35 }
     36 
     37 inline bool IsGlobal(JSRegExp::Flags flags) {
     38   return (flags & JSRegExp::kGlobal) != 0;
     39 }
     40 
     41 inline bool DotAll(JSRegExp::Flags flags) {
     42   return (flags & JSRegExp::kDotAll) != 0;
     43 }
     44 
     45 inline bool Multiline(JSRegExp::Flags flags) {
     46   return (flags & JSRegExp::kMultiline) != 0;
     47 }
     48 
     49 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
     50   // Both unicode and ignore_case flags are set. We need to use ICU to find
     51   // the closure over case equivalents.
     52   return IsUnicode(flags) && IgnoreCase(flags);
     53 }
     54 
     55 class RegExpImpl {
     56  public:
     57   // Whether V8 is compiled with native regexp support or not.
     58   static bool UsesNativeRegExp() {
     59 #ifdef V8_INTERPRETED_REGEXP
     60     return false;
     61 #else
     62     return true;
     63 #endif
     64   }
     65 
     66   // Returns a string representation of a regular expression.
     67   // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
     68   // This function calls the garbage collector if necessary.
     69   static Handle<String> ToString(Handle<Object> value);
     70 
     71   // Parses the RegExp pattern and prepares the JSRegExp object with
     72   // generic data and choice of implementation - as well as what
     73   // the implementation wants to store in the data field.
     74   // Returns false if compilation fails.
     75   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
     76       Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
     77       JSRegExp::Flags flags);
     78 
     79   // See ECMA-262 section 15.10.6.2.
     80   // This function calls the garbage collector if necessary.
     81   V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
     82       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
     83       int index, Handle<RegExpMatchInfo> last_match_info);
     84 
     85   // Prepares a JSRegExp object with Irregexp-specific data.
     86   static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
     87                                  Handle<String> pattern, JSRegExp::Flags flags,
     88                                  int capture_register_count);
     89 
     90   static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
     91                           Handle<String> pattern, JSRegExp::Flags flags,
     92                           Handle<String> match_pattern);
     93 
     94   static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
     95                          Handle<String> subject, int index, int32_t* output,
     96                          int output_size);
     97 
     98   static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
     99                                  Handle<String> subject, int index,
    100                                  Handle<RegExpMatchInfo> last_match_info);
    101 
    102   enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
    103 
    104   // Prepare a RegExp for being executed one or more times (using
    105   // IrregexpExecOnce) on the subject.
    106   // This ensures that the regexp is compiled for the subject, and that
    107   // the subject is flat.
    108   // Returns the number of integer spaces required by IrregexpExecOnce
    109   // as its "registers" argument.  If the regexp cannot be compiled,
    110   // an exception is set as pending, and this function returns negative.
    111   static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
    112                              Handle<String> subject);
    113 
    114   // Execute a regular expression on the subject, starting from index.
    115   // If matching succeeds, return the number of matches.  This can be larger
    116   // than one in the case of global regular expressions.
    117   // The captures and subcaptures are stored into the registers vector.
    118   // If matching fails, returns RE_FAILURE.
    119   // If execution fails, sets a pending exception and returns RE_EXCEPTION.
    120   static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
    121                              Handle<String> subject, int index, int32_t* output,
    122                              int output_size);
    123 
    124   // Execute an Irregexp bytecode pattern.
    125   // On a successful match, the result is a JSArray containing
    126   // captured positions.  On a failure, the result is the null value.
    127   // Returns an empty handle in case of an exception.
    128   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
    129       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
    130       int index, Handle<RegExpMatchInfo> last_match_info);
    131 
    132   // Set last match info.  If match is nullptr, then setting captures is
    133   // omitted.
    134   static Handle<RegExpMatchInfo> SetLastMatchInfo(
    135       Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
    136       Handle<String> subject, int capture_count, int32_t* match);
    137 
    138   class GlobalCache {
    139    public:
    140     GlobalCache(Handle<JSRegExp> regexp,
    141                 Handle<String> subject,
    142                 Isolate* isolate);
    143 
    144     V8_INLINE ~GlobalCache();
    145 
    146     // Fetch the next entry in the cache for global regexp match results.
    147     // This does not set the last match info.  Upon failure, nullptr is
    148     // returned. The cause can be checked with Result().  The previous result is
    149     // still in available in memory when a failure happens.
    150     V8_INLINE int32_t* FetchNext();
    151 
    152     V8_INLINE int32_t* LastSuccessfulMatch();
    153 
    154     V8_INLINE bool HasException() { return num_matches_ < 0; }
    155 
    156    private:
    157     int AdvanceZeroLength(int last_index);
    158 
    159     int num_matches_;
    160     int max_matches_;
    161     int current_match_index_;
    162     int registers_per_match_;
    163     // Pointer to the last set of captures.
    164     int32_t* register_array_;
    165     int register_array_size_;
    166     Handle<JSRegExp> regexp_;
    167     Handle<String> subject_;
    168     Isolate* isolate_;
    169   };
    170 
    171   // For acting on the JSRegExp data FixedArray.
    172   static int IrregexpMaxRegisterCount(FixedArray* re);
    173   static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
    174   static void SetIrregexpCaptureNameMap(FixedArray* re,
    175                                         Handle<FixedArray> value);
    176   static int IrregexpNumberOfCaptures(FixedArray* re);
    177   static int IrregexpNumberOfRegisters(FixedArray* re);
    178   static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
    179   static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
    180 
    181   // Limit the space regexps take up on the heap.  In order to limit this we
    182   // would like to keep track of the amount of regexp code on the heap.  This
    183   // is not tracked, however.  As a conservative approximation we track the
    184   // total regexp code compiled including code that has subsequently been freed
    185   // and the total executable memory at any point.
    186   static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
    187   static const size_t kRegExpCompiledLimit = 1 * MB;
    188   static const int kRegExpTooLargeToOptimize = 20 * KB;
    189 
    190  private:
    191   static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
    192                               Handle<String> sample_subject, bool is_one_byte);
    193   static inline bool EnsureCompiledIrregexp(Isolate* isolate,
    194                                             Handle<JSRegExp> re,
    195                                             Handle<String> sample_subject,
    196                                             bool is_one_byte);
    197 };
    198 
    199 
    200 // Represents the location of one element relative to the intersection of
    201 // two sets. Corresponds to the four areas of a Venn diagram.
    202 enum ElementInSetsRelation {
    203   kInsideNone = 0,
    204   kInsideFirst = 1,
    205   kInsideSecond = 2,
    206   kInsideBoth = 3
    207 };
    208 
    209 
    210 // A set of unsigned integers that behaves especially well on small
    211 // integers (< 32).  May do zone-allocation.
    212 class OutSet: public ZoneObject {
    213  public:
    214   OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
    215   OutSet* Extend(unsigned value, Zone* zone);
    216   bool Get(unsigned value) const;
    217   static const unsigned kFirstLimit = 32;
    218 
    219  private:
    220   // Destructively set a value in this set.  In most cases you want
    221   // to use Extend instead to ensure that only one instance exists
    222   // that contains the same values.
    223   void Set(unsigned value, Zone* zone);
    224 
    225   // The successors are a list of sets that contain the same values
    226   // as this set and the one more value that is not present in this
    227   // set.
    228   ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
    229 
    230   OutSet(uint32_t first, ZoneList<unsigned>* remaining)
    231       : first_(first), remaining_(remaining), successors_(nullptr) {}
    232   uint32_t first_;
    233   ZoneList<unsigned>* remaining_;
    234   ZoneList<OutSet*>* successors_;
    235   friend class Trace;
    236 };
    237 
    238 
    239 // A mapping from integers, specified as ranges, to a set of integers.
    240 // Used for mapping character ranges to choices.
    241 class DispatchTable : public ZoneObject {
    242  public:
    243   explicit DispatchTable(Zone* zone) : tree_(zone) { }
    244 
    245   class Entry {
    246    public:
    247     Entry() : from_(0), to_(0), out_set_(nullptr) {}
    248     Entry(uc32 from, uc32 to, OutSet* out_set)
    249         : from_(from), to_(to), out_set_(out_set) {
    250       DCHECK(from <= to);
    251     }
    252     uc32 from() { return from_; }
    253     uc32 to() { return to_; }
    254     void set_to(uc32 value) { to_ = value; }
    255     void AddValue(int value, Zone* zone) {
    256       out_set_ = out_set_->Extend(value, zone);
    257     }
    258     OutSet* out_set() { return out_set_; }
    259    private:
    260     uc32 from_;
    261     uc32 to_;
    262     OutSet* out_set_;
    263   };
    264 
    265   class Config {
    266    public:
    267     typedef uc32 Key;
    268     typedef Entry Value;
    269     static const uc32 kNoKey;
    270     static const Entry NoValue() { return Value(); }
    271     static inline int Compare(uc32 a, uc32 b) {
    272       if (a == b)
    273         return 0;
    274       else if (a < b)
    275         return -1;
    276       else
    277         return 1;
    278     }
    279   };
    280 
    281   void AddRange(CharacterRange range, int value, Zone* zone);
    282   OutSet* Get(uc32 value);
    283   void Dump();
    284 
    285   template <typename Callback>
    286   void ForEach(Callback* callback) {
    287     return tree()->ForEach(callback);
    288   }
    289 
    290  private:
    291   // There can't be a static empty set since it allocates its
    292   // successors in a zone and caches them.
    293   OutSet* empty() { return &empty_; }
    294   OutSet empty_;
    295   ZoneSplayTree<Config>* tree() { return &tree_; }
    296   ZoneSplayTree<Config> tree_;
    297 };
    298 
    299 
    300 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
    301 class UnicodeRangeSplitter {
    302  public:
    303   UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
    304   void Call(uc32 from, DispatchTable::Entry entry);
    305 
    306   ZoneList<CharacterRange>* bmp() { return bmp_; }
    307   ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
    308   ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
    309   ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
    310 
    311  private:
    312   static const int kBase = 0;
    313   // Separate ranges into
    314   static const int kBmpCodePoints = 1;
    315   static const int kLeadSurrogates = 2;
    316   static const int kTrailSurrogates = 3;
    317   static const int kNonBmpCodePoints = 4;
    318 
    319   Zone* zone_;
    320   DispatchTable table_;
    321   ZoneList<CharacterRange>* bmp_;
    322   ZoneList<CharacterRange>* lead_surrogates_;
    323   ZoneList<CharacterRange>* trail_surrogates_;
    324   ZoneList<CharacterRange>* non_bmp_;
    325 };
    326 
    327 
    328 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
    329   VISIT(End)                                                         \
    330   VISIT(Action)                                                      \
    331   VISIT(Choice)                                                      \
    332   VISIT(BackReference)                                               \
    333   VISIT(Assertion)                                                   \
    334   VISIT(Text)
    335 
    336 
    337 class Trace;
    338 struct PreloadState;
    339 class GreedyLoopState;
    340 class AlternativeGenerationList;
    341 
    342 struct NodeInfo {
    343   NodeInfo()
    344       : being_analyzed(false),
    345         been_analyzed(false),
    346         follows_word_interest(false),
    347         follows_newline_interest(false),
    348         follows_start_interest(false),
    349         at_end(false),
    350         visited(false),
    351         replacement_calculated(false) { }
    352 
    353   // Returns true if the interests and assumptions of this node
    354   // matches the given one.
    355   bool Matches(NodeInfo* that) {
    356     return (at_end == that->at_end) &&
    357            (follows_word_interest == that->follows_word_interest) &&
    358            (follows_newline_interest == that->follows_newline_interest) &&
    359            (follows_start_interest == that->follows_start_interest);
    360   }
    361 
    362   // Updates the interests of this node given the interests of the
    363   // node preceding it.
    364   void AddFromPreceding(NodeInfo* that) {
    365     at_end |= that->at_end;
    366     follows_word_interest |= that->follows_word_interest;
    367     follows_newline_interest |= that->follows_newline_interest;
    368     follows_start_interest |= that->follows_start_interest;
    369   }
    370 
    371   bool HasLookbehind() {
    372     return follows_word_interest ||
    373            follows_newline_interest ||
    374            follows_start_interest;
    375   }
    376 
    377   // Sets the interests of this node to include the interests of the
    378   // following node.
    379   void AddFromFollowing(NodeInfo* that) {
    380     follows_word_interest |= that->follows_word_interest;
    381     follows_newline_interest |= that->follows_newline_interest;
    382     follows_start_interest |= that->follows_start_interest;
    383   }
    384 
    385   void ResetCompilationState() {
    386     being_analyzed = false;
    387     been_analyzed = false;
    388   }
    389 
    390   bool being_analyzed: 1;
    391   bool been_analyzed: 1;
    392 
    393   // These bits are set of this node has to know what the preceding
    394   // character was.
    395   bool follows_word_interest: 1;
    396   bool follows_newline_interest: 1;
    397   bool follows_start_interest: 1;
    398 
    399   bool at_end: 1;
    400   bool visited: 1;
    401   bool replacement_calculated: 1;
    402 };
    403 
    404 
    405 // Details of a quick mask-compare check that can look ahead in the
    406 // input stream.
    407 class QuickCheckDetails {
    408  public:
    409   QuickCheckDetails()
    410       : characters_(0),
    411         mask_(0),
    412         value_(0),
    413         cannot_match_(false) { }
    414   explicit QuickCheckDetails(int characters)
    415       : characters_(characters),
    416         mask_(0),
    417         value_(0),
    418         cannot_match_(false) { }
    419   bool Rationalize(bool one_byte);
    420   // Merge in the information from another branch of an alternation.
    421   void Merge(QuickCheckDetails* other, int from_index);
    422   // Advance the current position by some amount.
    423   void Advance(int by, bool one_byte);
    424   void Clear();
    425   bool cannot_match() { return cannot_match_; }
    426   void set_cannot_match() { cannot_match_ = true; }
    427   struct Position {
    428     Position() : mask(0), value(0), determines_perfectly(false) { }
    429     uc16 mask;
    430     uc16 value;
    431     bool determines_perfectly;
    432   };
    433   int characters() { return characters_; }
    434   void set_characters(int characters) { characters_ = characters; }
    435   Position* positions(int index) {
    436     DCHECK_LE(0, index);
    437     DCHECK_GT(characters_, index);
    438     return positions_ + index;
    439   }
    440   uint32_t mask() { return mask_; }
    441   uint32_t value() { return value_; }
    442 
    443  private:
    444   // How many characters do we have quick check information from.  This is
    445   // the same for all branches of a choice node.
    446   int characters_;
    447   Position positions_[4];
    448   // These values are the condensate of the above array after Rationalize().
    449   uint32_t mask_;
    450   uint32_t value_;
    451   // If set to true, there is no way this quick check can match at all.
    452   // E.g., if it requires to be at the start of the input, and isn't.
    453   bool cannot_match_;
    454 };
    455 
    456 
    457 extern int kUninitializedRegExpNodePlaceHolder;
    458 
    459 
    460 class RegExpNode: public ZoneObject {
    461  public:
    462   explicit RegExpNode(Zone* zone)
    463       : replacement_(nullptr),
    464         on_work_list_(false),
    465         trace_count_(0),
    466         zone_(zone) {
    467     bm_info_[0] = bm_info_[1] = nullptr;
    468   }
    469   virtual ~RegExpNode();
    470   virtual void Accept(NodeVisitor* visitor) = 0;
    471   // Generates a goto to this node or actually generates the code at this point.
    472   virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
    473   // How many characters must this node consume at a minimum in order to
    474   // succeed.  If we have found at least 'still_to_find' characters that
    475   // must be consumed there is no need to ask any following nodes whether
    476   // they are sure to eat any more characters.  The not_at_start argument is
    477   // used to indicate that we know we are not at the start of the input.  In
    478   // this case anchored branches will always fail and can be ignored when
    479   // determining how many characters are consumed on success.
    480   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
    481   // Emits some quick code that checks whether the preloaded characters match.
    482   // Falls through on certain failure, jumps to the label on possible success.
    483   // If the node cannot make a quick check it does nothing and returns false.
    484   bool EmitQuickCheck(RegExpCompiler* compiler,
    485                       Trace* bounds_check_trace,
    486                       Trace* trace,
    487                       bool preload_has_checked_bounds,
    488                       Label* on_possible_success,
    489                       QuickCheckDetails* details_return,
    490                       bool fall_through_on_failure);
    491   // For a given number of characters this returns a mask and a value.  The
    492   // next n characters are anded with the mask and compared with the value.
    493   // A comparison failure indicates the node cannot match the next n characters.
    494   // A comparison success indicates the node may match.
    495   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    496                                     RegExpCompiler* compiler,
    497                                     int characters_filled_in,
    498                                     bool not_at_start) = 0;
    499   static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
    500   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
    501   // Only returns the successor for a text node of length 1 that matches any
    502   // character and that has no guards on it.
    503   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
    504       RegExpCompiler* compiler) {
    505     return nullptr;
    506   }
    507 
    508   // Collects information on the possible code units (mod 128) that can match if
    509   // we look forward.  This is used for a Boyer-Moore-like string searching
    510   // implementation.  TODO(erikcorry):  This should share more code with
    511   // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
    512   // the number of nodes we are willing to look at in order to create this data.
    513   static const int kRecursionBudget = 200;
    514   bool KeepRecursing(RegExpCompiler* compiler);
    515   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    516                             BoyerMooreLookahead* bm, bool not_at_start) {
    517     UNREACHABLE();
    518   }
    519 
    520   // If we know that the input is one-byte then there are some nodes that can
    521   // never match.  This method returns a node that can be substituted for
    522   // itself, or nullptr if the node can never match.
    523   virtual RegExpNode* FilterOneByte(int depth) { return this; }
    524   // Helper for FilterOneByte.
    525   RegExpNode* replacement() {
    526     DCHECK(info()->replacement_calculated);
    527     return replacement_;
    528   }
    529   RegExpNode* set_replacement(RegExpNode* replacement) {
    530     info()->replacement_calculated = true;
    531     replacement_ =  replacement;
    532     return replacement;  // For convenience.
    533   }
    534 
    535   // We want to avoid recalculating the lookahead info, so we store it on the
    536   // node.  Only info that is for this node is stored.  We can tell that the
    537   // info is for this node when offset == 0, so the information is calculated
    538   // relative to this node.
    539   void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
    540     if (offset == 0) set_bm_info(not_at_start, bm);
    541   }
    542 
    543   Label* label() { return &label_; }
    544   // If non-generic code is generated for a node (i.e. the node is not at the
    545   // start of the trace) then it cannot be reused.  This variable sets a limit
    546   // on how often we allow that to happen before we insist on starting a new
    547   // trace and generating generic code for a node that can be reused by flushing
    548   // the deferred actions in the current trace and generating a goto.
    549   static const int kMaxCopiesCodeGenerated = 10;
    550 
    551   bool on_work_list() { return on_work_list_; }
    552   void set_on_work_list(bool value) { on_work_list_ = value; }
    553 
    554   NodeInfo* info() { return &info_; }
    555 
    556   BoyerMooreLookahead* bm_info(bool not_at_start) {
    557     return bm_info_[not_at_start ? 1 : 0];
    558   }
    559 
    560   Zone* zone() const { return zone_; }
    561 
    562  protected:
    563   enum LimitResult { DONE, CONTINUE };
    564   RegExpNode* replacement_;
    565 
    566   LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
    567 
    568   void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
    569     bm_info_[not_at_start ? 1 : 0] = bm;
    570   }
    571 
    572  private:
    573   static const int kFirstCharBudget = 10;
    574   Label label_;
    575   bool on_work_list_;
    576   NodeInfo info_;
    577   // This variable keeps track of how many times code has been generated for
    578   // this node (in different traces).  We don't keep track of where the
    579   // generated code is located unless the code is generated at the start of
    580   // a trace, in which case it is generic and can be reused by flushing the
    581   // deferred operations in the current trace and generating a goto.
    582   int trace_count_;
    583   BoyerMooreLookahead* bm_info_[2];
    584 
    585   Zone* zone_;
    586 };
    587 
    588 
    589 class SeqRegExpNode: public RegExpNode {
    590  public:
    591   explicit SeqRegExpNode(RegExpNode* on_success)
    592       : RegExpNode(on_success->zone()), on_success_(on_success) { }
    593   RegExpNode* on_success() { return on_success_; }
    594   void set_on_success(RegExpNode* node) { on_success_ = node; }
    595   virtual RegExpNode* FilterOneByte(int depth);
    596   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    597                             BoyerMooreLookahead* bm, bool not_at_start) {
    598     on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
    599     if (offset == 0) set_bm_info(not_at_start, bm);
    600   }
    601 
    602  protected:
    603   RegExpNode* FilterSuccessor(int depth);
    604 
    605  private:
    606   RegExpNode* on_success_;
    607 };
    608 
    609 
    610 class ActionNode: public SeqRegExpNode {
    611  public:
    612   enum ActionType {
    613     SET_REGISTER,
    614     INCREMENT_REGISTER,
    615     STORE_POSITION,
    616     BEGIN_SUBMATCH,
    617     POSITIVE_SUBMATCH_SUCCESS,
    618     EMPTY_MATCH_CHECK,
    619     CLEAR_CAPTURES
    620   };
    621   static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
    622   static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
    623   static ActionNode* StorePosition(int reg,
    624                                    bool is_capture,
    625                                    RegExpNode* on_success);
    626   static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
    627   static ActionNode* BeginSubmatch(int stack_pointer_reg,
    628                                    int position_reg,
    629                                    RegExpNode* on_success);
    630   static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
    631                                              int restore_reg,
    632                                              int clear_capture_count,
    633                                              int clear_capture_from,
    634                                              RegExpNode* on_success);
    635   static ActionNode* EmptyMatchCheck(int start_register,
    636                                      int repetition_register,
    637                                      int repetition_limit,
    638                                      RegExpNode* on_success);
    639   virtual void Accept(NodeVisitor* visitor);
    640   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    641   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
    642   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    643                                     RegExpCompiler* compiler,
    644                                     int filled_in,
    645                                     bool not_at_start) {
    646     return on_success()->GetQuickCheckDetails(
    647         details, compiler, filled_in, not_at_start);
    648   }
    649   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    650                             BoyerMooreLookahead* bm, bool not_at_start);
    651   ActionType action_type() { return action_type_; }
    652   // TODO(erikcorry): We should allow some action nodes in greedy loops.
    653   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
    654 
    655  private:
    656   union {
    657     struct {
    658       int reg;
    659       int value;
    660     } u_store_register;
    661     struct {
    662       int reg;
    663     } u_increment_register;
    664     struct {
    665       int reg;
    666       bool is_capture;
    667     } u_position_register;
    668     struct {
    669       int stack_pointer_register;
    670       int current_position_register;
    671       int clear_register_count;
    672       int clear_register_from;
    673     } u_submatch;
    674     struct {
    675       int start_register;
    676       int repetition_register;
    677       int repetition_limit;
    678     } u_empty_match_check;
    679     struct {
    680       int range_from;
    681       int range_to;
    682     } u_clear_captures;
    683   } data_;
    684   ActionNode(ActionType action_type, RegExpNode* on_success)
    685       : SeqRegExpNode(on_success),
    686         action_type_(action_type) { }
    687   ActionType action_type_;
    688   friend class DotPrinter;
    689 };
    690 
    691 
    692 class TextNode: public SeqRegExpNode {
    693  public:
    694   TextNode(ZoneList<TextElement>* elms, bool read_backward,
    695            RegExpNode* on_success)
    696       : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
    697   TextNode(RegExpCharacterClass* that, bool read_backward,
    698            RegExpNode* on_success)
    699       : SeqRegExpNode(on_success),
    700         elms_(new (zone()) ZoneList<TextElement>(1, zone())),
    701         read_backward_(read_backward) {
    702     elms_->Add(TextElement::CharClass(that), zone());
    703   }
    704   // Create TextNode for a single character class for the given ranges.
    705   static TextNode* CreateForCharacterRanges(Zone* zone,
    706                                             ZoneList<CharacterRange>* ranges,
    707                                             bool read_backward,
    708                                             RegExpNode* on_success,
    709                                             JSRegExp::Flags flags);
    710   // Create TextNode for a surrogate pair with a range given for the
    711   // lead and the trail surrogate each.
    712   static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
    713                                           CharacterRange trail,
    714                                           bool read_backward,
    715                                           RegExpNode* on_success,
    716                                           JSRegExp::Flags flags);
    717   virtual void Accept(NodeVisitor* visitor);
    718   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    719   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
    720   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    721                                     RegExpCompiler* compiler,
    722                                     int characters_filled_in,
    723                                     bool not_at_start);
    724   ZoneList<TextElement>* elements() { return elms_; }
    725   bool read_backward() { return read_backward_; }
    726   void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
    727   virtual int GreedyLoopTextLength();
    728   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
    729       RegExpCompiler* compiler);
    730   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    731                             BoyerMooreLookahead* bm, bool not_at_start);
    732   void CalculateOffsets();
    733   virtual RegExpNode* FilterOneByte(int depth);
    734 
    735  private:
    736   enum TextEmitPassType {
    737     NON_LATIN1_MATCH,            // Check for characters that can't match.
    738     SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
    739     NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
    740     CASE_CHARACTER_MATCH,        // Case-independent single character check.
    741     CHARACTER_CLASS_MATCH        // Character class.
    742   };
    743   static bool SkipPass(TextEmitPassType pass, bool ignore_case);
    744   static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
    745   static const int kLastPass = CHARACTER_CLASS_MATCH;
    746   void TextEmitPass(RegExpCompiler* compiler,
    747                     TextEmitPassType pass,
    748                     bool preloaded,
    749                     Trace* trace,
    750                     bool first_element_checked,
    751                     int* checked_up_to);
    752   int Length();
    753   ZoneList<TextElement>* elms_;
    754   bool read_backward_;
    755 };
    756 
    757 
    758 class AssertionNode: public SeqRegExpNode {
    759  public:
    760   enum AssertionType {
    761     AT_END,
    762     AT_START,
    763     AT_BOUNDARY,
    764     AT_NON_BOUNDARY,
    765     AFTER_NEWLINE
    766   };
    767   static AssertionNode* AtEnd(RegExpNode* on_success) {
    768     return new(on_success->zone()) AssertionNode(AT_END, on_success);
    769   }
    770   static AssertionNode* AtStart(RegExpNode* on_success) {
    771     return new(on_success->zone()) AssertionNode(AT_START, on_success);
    772   }
    773   static AssertionNode* AtBoundary(RegExpNode* on_success) {
    774     return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
    775   }
    776   static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
    777     return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
    778   }
    779   static AssertionNode* AfterNewline(RegExpNode* on_success) {
    780     return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
    781   }
    782   virtual void Accept(NodeVisitor* visitor);
    783   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    784   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
    785   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    786                                     RegExpCompiler* compiler,
    787                                     int filled_in,
    788                                     bool not_at_start);
    789   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    790                             BoyerMooreLookahead* bm, bool not_at_start);
    791   AssertionType assertion_type() { return assertion_type_; }
    792 
    793  private:
    794   void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
    795   enum IfPrevious { kIsNonWord, kIsWord };
    796   void BacktrackIfPrevious(RegExpCompiler* compiler,
    797                            Trace* trace,
    798                            IfPrevious backtrack_if_previous);
    799   AssertionNode(AssertionType t, RegExpNode* on_success)
    800       : SeqRegExpNode(on_success), assertion_type_(t) { }
    801   AssertionType assertion_type_;
    802 };
    803 
    804 
    805 class BackReferenceNode: public SeqRegExpNode {
    806  public:
    807   BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
    808                     bool read_backward, RegExpNode* on_success)
    809       : SeqRegExpNode(on_success),
    810         start_reg_(start_reg),
    811         end_reg_(end_reg),
    812         flags_(flags),
    813         read_backward_(read_backward) {}
    814   virtual void Accept(NodeVisitor* visitor);
    815   int start_register() { return start_reg_; }
    816   int end_register() { return end_reg_; }
    817   bool read_backward() { return read_backward_; }
    818   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    819   virtual int EatsAtLeast(int still_to_find,
    820                           int recursion_depth,
    821                           bool not_at_start);
    822   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    823                                     RegExpCompiler* compiler,
    824                                     int characters_filled_in,
    825                                     bool not_at_start) {
    826     return;
    827   }
    828   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    829                             BoyerMooreLookahead* bm, bool not_at_start);
    830 
    831  private:
    832   int start_reg_;
    833   int end_reg_;
    834   JSRegExp::Flags flags_;
    835   bool read_backward_;
    836 };
    837 
    838 
    839 class EndNode: public RegExpNode {
    840  public:
    841   enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
    842   EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
    843   virtual void Accept(NodeVisitor* visitor);
    844   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    845   virtual int EatsAtLeast(int still_to_find,
    846                           int recursion_depth,
    847                           bool not_at_start) { return 0; }
    848   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    849                                     RegExpCompiler* compiler,
    850                                     int characters_filled_in,
    851                                     bool not_at_start) {
    852     // Returning 0 from EatsAtLeast should ensure we never get here.
    853     UNREACHABLE();
    854   }
    855   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    856                             BoyerMooreLookahead* bm, bool not_at_start) {
    857     // Returning 0 from EatsAtLeast should ensure we never get here.
    858     UNREACHABLE();
    859   }
    860 
    861  private:
    862   Action action_;
    863 };
    864 
    865 
    866 class NegativeSubmatchSuccess: public EndNode {
    867  public:
    868   NegativeSubmatchSuccess(int stack_pointer_reg,
    869                           int position_reg,
    870                           int clear_capture_count,
    871                           int clear_capture_start,
    872                           Zone* zone)
    873       : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
    874         stack_pointer_register_(stack_pointer_reg),
    875         current_position_register_(position_reg),
    876         clear_capture_count_(clear_capture_count),
    877         clear_capture_start_(clear_capture_start) { }
    878   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    879 
    880  private:
    881   int stack_pointer_register_;
    882   int current_position_register_;
    883   int clear_capture_count_;
    884   int clear_capture_start_;
    885 };
    886 
    887 
    888 class Guard: public ZoneObject {
    889  public:
    890   enum Relation { LT, GEQ };
    891   Guard(int reg, Relation op, int value)
    892       : reg_(reg),
    893         op_(op),
    894         value_(value) { }
    895   int reg() { return reg_; }
    896   Relation op() { return op_; }
    897   int value() { return value_; }
    898 
    899  private:
    900   int reg_;
    901   Relation op_;
    902   int value_;
    903 };
    904 
    905 
    906 class GuardedAlternative {
    907  public:
    908   explicit GuardedAlternative(RegExpNode* node)
    909       : node_(node), guards_(nullptr) {}
    910   void AddGuard(Guard* guard, Zone* zone);
    911   RegExpNode* node() { return node_; }
    912   void set_node(RegExpNode* node) { node_ = node; }
    913   ZoneList<Guard*>* guards() { return guards_; }
    914 
    915  private:
    916   RegExpNode* node_;
    917   ZoneList<Guard*>* guards_;
    918 };
    919 
    920 
    921 class AlternativeGeneration;
    922 
    923 
    924 class ChoiceNode: public RegExpNode {
    925  public:
    926   explicit ChoiceNode(int expected_size, Zone* zone)
    927       : RegExpNode(zone),
    928         alternatives_(new (zone)
    929                           ZoneList<GuardedAlternative>(expected_size, zone)),
    930         table_(nullptr),
    931         not_at_start_(false),
    932         being_calculated_(false) {}
    933   virtual void Accept(NodeVisitor* visitor);
    934   void AddAlternative(GuardedAlternative node) {
    935     alternatives()->Add(node, zone());
    936   }
    937   ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
    938   DispatchTable* GetTable(bool ignore_case);
    939   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    940   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
    941   int EatsAtLeastHelper(int still_to_find,
    942                         int budget,
    943                         RegExpNode* ignore_this_node,
    944                         bool not_at_start);
    945   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    946                                     RegExpCompiler* compiler,
    947                                     int characters_filled_in,
    948                                     bool not_at_start);
    949   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    950                             BoyerMooreLookahead* bm, bool not_at_start);
    951 
    952   bool being_calculated() { return being_calculated_; }
    953   bool not_at_start() { return not_at_start_; }
    954   void set_not_at_start() { not_at_start_ = true; }
    955   void set_being_calculated(bool b) { being_calculated_ = b; }
    956   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    957     return true;
    958   }
    959   virtual RegExpNode* FilterOneByte(int depth);
    960   virtual bool read_backward() { return false; }
    961 
    962  protected:
    963   int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
    964   ZoneList<GuardedAlternative>* alternatives_;
    965 
    966  private:
    967   friend class DispatchTableConstructor;
    968   friend class Analysis;
    969   void GenerateGuard(RegExpMacroAssembler* macro_assembler,
    970                      Guard* guard,
    971                      Trace* trace);
    972   int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
    973   void EmitOutOfLineContinuation(RegExpCompiler* compiler,
    974                                  Trace* trace,
    975                                  GuardedAlternative alternative,
    976                                  AlternativeGeneration* alt_gen,
    977                                  int preload_characters,
    978                                  bool next_expects_preload);
    979   void SetUpPreLoad(RegExpCompiler* compiler,
    980                     Trace* current_trace,
    981                     PreloadState* preloads);
    982   void AssertGuardsMentionRegisters(Trace* trace);
    983   int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
    984   Trace* EmitGreedyLoop(RegExpCompiler* compiler,
    985                         Trace* trace,
    986                         AlternativeGenerationList* alt_gens,
    987                         PreloadState* preloads,
    988                         GreedyLoopState* greedy_loop_state,
    989                         int text_length);
    990   void EmitChoices(RegExpCompiler* compiler,
    991                    AlternativeGenerationList* alt_gens,
    992                    int first_choice,
    993                    Trace* trace,
    994                    PreloadState* preloads);
    995   DispatchTable* table_;
    996   // If true, this node is never checked at the start of the input.
    997   // Allows a new trace to start with at_start() set to false.
    998   bool not_at_start_;
    999   bool being_calculated_;
   1000 };
   1001 
   1002 
   1003 class NegativeLookaroundChoiceNode : public ChoiceNode {
   1004  public:
   1005   explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
   1006                                         GuardedAlternative then_do_this,
   1007                                         Zone* zone)
   1008       : ChoiceNode(2, zone) {
   1009     AddAlternative(this_must_fail);
   1010     AddAlternative(then_do_this);
   1011   }
   1012   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
   1013   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
   1014                                     RegExpCompiler* compiler,
   1015                                     int characters_filled_in,
   1016                                     bool not_at_start);
   1017   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
   1018                             BoyerMooreLookahead* bm, bool not_at_start) {
   1019     alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
   1020                                               not_at_start);
   1021     if (offset == 0) set_bm_info(not_at_start, bm);
   1022   }
   1023   // For a negative lookahead we don't emit the quick check for the
   1024   // alternative that is expected to fail.  This is because quick check code
   1025   // starts by loading enough characters for the alternative that takes fewest
   1026   // characters, but on a negative lookahead the negative branch did not take
   1027   // part in that calculation (EatsAtLeast) so the assumptions don't hold.
   1028   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
   1029     return !is_first;
   1030   }
   1031   virtual RegExpNode* FilterOneByte(int depth);
   1032 };
   1033 
   1034 
   1035 class LoopChoiceNode: public ChoiceNode {
   1036  public:
   1037   LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
   1038       : ChoiceNode(2, zone),
   1039         loop_node_(nullptr),
   1040         continue_node_(nullptr),
   1041         body_can_be_zero_length_(body_can_be_zero_length),
   1042         read_backward_(read_backward) {}
   1043   void AddLoopAlternative(GuardedAlternative alt);
   1044   void AddContinueAlternative(GuardedAlternative alt);
   1045   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
   1046   virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
   1047   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
   1048                                     RegExpCompiler* compiler,
   1049                                     int characters_filled_in,
   1050                                     bool not_at_start);
   1051   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
   1052                             BoyerMooreLookahead* bm, bool not_at_start);
   1053   RegExpNode* loop_node() { return loop_node_; }
   1054   RegExpNode* continue_node() { return continue_node_; }
   1055   bool body_can_be_zero_length() { return body_can_be_zero_length_; }
   1056   virtual bool read_backward() { return read_backward_; }
   1057   virtual void Accept(NodeVisitor* visitor);
   1058   virtual RegExpNode* FilterOneByte(int depth);
   1059 
   1060  private:
   1061   // AddAlternative is made private for loop nodes because alternatives
   1062   // should not be added freely, we need to keep track of which node
   1063   // goes back to the node itself.
   1064   void AddAlternative(GuardedAlternative node) {
   1065     ChoiceNode::AddAlternative(node);
   1066   }
   1067 
   1068   RegExpNode* loop_node_;
   1069   RegExpNode* continue_node_;
   1070   bool body_can_be_zero_length_;
   1071   bool read_backward_;
   1072 };
   1073 
   1074 
   1075 // Improve the speed that we scan for an initial point where a non-anchored
   1076 // regexp can match by using a Boyer-Moore-like table. This is done by
   1077 // identifying non-greedy non-capturing loops in the nodes that eat any
   1078 // character one at a time.  For example in the middle of the regexp
   1079 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
   1080 // inserted at the start of any non-anchored regexp.
   1081 //
   1082 // When we have found such a loop we look ahead in the nodes to find the set of
   1083 // characters that can come at given distances. For example for the regexp
   1084 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
   1085 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
   1086 // the lookahead info where the set of characters is reasonably constrained. In
   1087 // our example this is from index 1 to 2 (0 is not constrained). We can now
   1088 // look 3 characters ahead and if we don't find one of [f, o] (the union of
   1089 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
   1090 //
   1091 // For Unicode input strings we do the same, but modulo 128.
   1092 //
   1093 // We also look at the first string fed to the regexp and use that to get a hint
   1094 // of the character frequencies in the inputs. This affects the assessment of
   1095 // whether the set of characters is 'reasonably constrained'.
   1096 //
   1097 // We also have another lookahead mechanism (called quick check in the code),
   1098 // which uses a wide load of multiple characters followed by a mask and compare
   1099 // to determine whether a match is possible at this point.
   1100 enum ContainedInLattice {
   1101   kNotYet = 0,
   1102   kLatticeIn = 1,
   1103   kLatticeOut = 2,
   1104   kLatticeUnknown = 3  // Can also mean both in and out.
   1105 };
   1106 
   1107 
   1108 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
   1109   return static_cast<ContainedInLattice>(a | b);
   1110 }
   1111 
   1112 
   1113 ContainedInLattice AddRange(ContainedInLattice a,
   1114                             const int* ranges,
   1115                             int ranges_size,
   1116                             Interval new_range);
   1117 
   1118 
   1119 class BoyerMoorePositionInfo : public ZoneObject {
   1120  public:
   1121   explicit BoyerMoorePositionInfo(Zone* zone)
   1122       : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
   1123         map_count_(0),
   1124         w_(kNotYet),
   1125         s_(kNotYet),
   1126         d_(kNotYet),
   1127         surrogate_(kNotYet) {
   1128      for (int i = 0; i < kMapSize; i++) {
   1129        map_->Add(false, zone);
   1130      }
   1131   }
   1132 
   1133   bool& at(int i) { return map_->at(i); }
   1134 
   1135   static const int kMapSize = 128;
   1136   static const int kMask = kMapSize - 1;
   1137 
   1138   int map_count() const { return map_count_; }
   1139 
   1140   void Set(int character);
   1141   void SetInterval(const Interval& interval);
   1142   void SetAll();
   1143   bool is_non_word() { return w_ == kLatticeOut; }
   1144   bool is_word() { return w_ == kLatticeIn; }
   1145 
   1146  private:
   1147   ZoneList<bool>* map_;
   1148   int map_count_;  // Number of set bits in the map.
   1149   ContainedInLattice w_;  // The \w character class.
   1150   ContainedInLattice s_;  // The \s character class.
   1151   ContainedInLattice d_;  // The \d character class.
   1152   ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
   1153 };
   1154 
   1155 
   1156 class BoyerMooreLookahead : public ZoneObject {
   1157  public:
   1158   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
   1159 
   1160   int length() { return length_; }
   1161   int max_char() { return max_char_; }
   1162   RegExpCompiler* compiler() { return compiler_; }
   1163 
   1164   int Count(int map_number) {
   1165     return bitmaps_->at(map_number)->map_count();
   1166   }
   1167 
   1168   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
   1169 
   1170   void Set(int map_number, int character) {
   1171     if (character > max_char_) return;
   1172     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
   1173     info->Set(character);
   1174   }
   1175 
   1176   void SetInterval(int map_number, const Interval& interval) {
   1177     if (interval.from() > max_char_) return;
   1178     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
   1179     if (interval.to() > max_char_) {
   1180       info->SetInterval(Interval(interval.from(), max_char_));
   1181     } else {
   1182       info->SetInterval(interval);
   1183     }
   1184   }
   1185 
   1186   void SetAll(int map_number) {
   1187     bitmaps_->at(map_number)->SetAll();
   1188   }
   1189 
   1190   void SetRest(int from_map) {
   1191     for (int i = from_map; i < length_; i++) SetAll(i);
   1192   }
   1193   void EmitSkipInstructions(RegExpMacroAssembler* masm);
   1194 
   1195  private:
   1196   // This is the value obtained by EatsAtLeast.  If we do not have at least this
   1197   // many characters left in the sample string then the match is bound to fail.
   1198   // Therefore it is OK to read a character this far ahead of the current match
   1199   // point.
   1200   int length_;
   1201   RegExpCompiler* compiler_;
   1202   // 0xff for Latin1, 0xffff for UTF-16.
   1203   int max_char_;
   1204   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
   1205 
   1206   int GetSkipTable(int min_lookahead,
   1207                    int max_lookahead,
   1208                    Handle<ByteArray> boolean_skip_table);
   1209   bool FindWorthwhileInterval(int* from, int* to);
   1210   int FindBestInterval(
   1211     int max_number_of_chars, int old_biggest_points, int* from, int* to);
   1212 };
   1213 
   1214 
   1215 // There are many ways to generate code for a node.  This class encapsulates
   1216 // the current way we should be generating.  In other words it encapsulates
   1217 // the current state of the code generator.  The effect of this is that we
   1218 // generate code for paths that the matcher can take through the regular
   1219 // expression.  A given node in the regexp can be code-generated several times
   1220 // as it can be part of several traces.  For example for the regexp:
   1221 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
   1222 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
   1223 // to match foo is generated only once (the traces have a common prefix).  The
   1224 // code to store the capture is deferred and generated (twice) after the places
   1225 // where baz has been matched.
   1226 class Trace {
   1227  public:
   1228   // A value for a property that is either known to be true, know to be false,
   1229   // or not known.
   1230   enum TriBool {
   1231     UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
   1232   };
   1233 
   1234   class DeferredAction {
   1235    public:
   1236     DeferredAction(ActionNode::ActionType action_type, int reg)
   1237         : action_type_(action_type), reg_(reg), next_(nullptr) {}
   1238     DeferredAction* next() { return next_; }
   1239     bool Mentions(int reg);
   1240     int reg() { return reg_; }
   1241     ActionNode::ActionType action_type() { return action_type_; }
   1242    private:
   1243     ActionNode::ActionType action_type_;
   1244     int reg_;
   1245     DeferredAction* next_;
   1246     friend class Trace;
   1247   };
   1248 
   1249   class DeferredCapture : public DeferredAction {
   1250    public:
   1251     DeferredCapture(int reg, bool is_capture, Trace* trace)
   1252         : DeferredAction(ActionNode::STORE_POSITION, reg),
   1253           cp_offset_(trace->cp_offset()),
   1254           is_capture_(is_capture) { }
   1255     int cp_offset() { return cp_offset_; }
   1256     bool is_capture() { return is_capture_; }
   1257    private:
   1258     int cp_offset_;
   1259     bool is_capture_;
   1260     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
   1261   };
   1262 
   1263   class DeferredSetRegister : public DeferredAction {
   1264    public:
   1265     DeferredSetRegister(int reg, int value)
   1266         : DeferredAction(ActionNode::SET_REGISTER, reg),
   1267           value_(value) { }
   1268     int value() { return value_; }
   1269    private:
   1270     int value_;
   1271   };
   1272 
   1273   class DeferredClearCaptures : public DeferredAction {
   1274    public:
   1275     explicit DeferredClearCaptures(Interval range)
   1276         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
   1277           range_(range) { }
   1278     Interval range() { return range_; }
   1279    private:
   1280     Interval range_;
   1281   };
   1282 
   1283   class DeferredIncrementRegister : public DeferredAction {
   1284    public:
   1285     explicit DeferredIncrementRegister(int reg)
   1286         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
   1287   };
   1288 
   1289   Trace()
   1290       : cp_offset_(0),
   1291         actions_(nullptr),
   1292         backtrack_(nullptr),
   1293         stop_node_(nullptr),
   1294         loop_label_(nullptr),
   1295         characters_preloaded_(0),
   1296         bound_checked_up_to_(0),
   1297         flush_budget_(100),
   1298         at_start_(UNKNOWN) {}
   1299 
   1300   // End the trace.  This involves flushing the deferred actions in the trace
   1301   // and pushing a backtrack location onto the backtrack stack.  Once this is
   1302   // done we can start a new trace or go to one that has already been
   1303   // generated.
   1304   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
   1305   int cp_offset() { return cp_offset_; }
   1306   DeferredAction* actions() { return actions_; }
   1307   // A trivial trace is one that has no deferred actions or other state that
   1308   // affects the assumptions used when generating code.  There is no recorded
   1309   // backtrack location in a trivial trace, so with a trivial trace we will
   1310   // generate code that, on a failure to match, gets the backtrack location
   1311   // from the backtrack stack rather than using a direct jump instruction.  We
   1312   // always start code generation with a trivial trace and non-trivial traces
   1313   // are created as we emit code for nodes or add to the list of deferred
   1314   // actions in the trace.  The location of the code generated for a node using
   1315   // a trivial trace is recorded in a label in the node so that gotos can be
   1316   // generated to that code.
   1317   bool is_trivial() {
   1318     return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
   1319            characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
   1320            quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
   1321   }
   1322   TriBool at_start() { return at_start_; }
   1323   void set_at_start(TriBool at_start) { at_start_ = at_start; }
   1324   Label* backtrack() { return backtrack_; }
   1325   Label* loop_label() { return loop_label_; }
   1326   RegExpNode* stop_node() { return stop_node_; }
   1327   int characters_preloaded() { return characters_preloaded_; }
   1328   int bound_checked_up_to() { return bound_checked_up_to_; }
   1329   int flush_budget() { return flush_budget_; }
   1330   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
   1331   bool mentions_reg(int reg);
   1332   // Returns true if a deferred position store exists to the specified
   1333   // register and stores the offset in the out-parameter.  Otherwise
   1334   // returns false.
   1335   bool GetStoredPosition(int reg, int* cp_offset);
   1336   // These set methods and AdvanceCurrentPositionInTrace should be used only on
   1337   // new traces - the intention is that traces are immutable after creation.
   1338   void add_action(DeferredAction* new_action) {
   1339     DCHECK(new_action->next_ == nullptr);
   1340     new_action->next_ = actions_;
   1341     actions_ = new_action;
   1342   }
   1343   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
   1344   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
   1345   void set_loop_label(Label* label) { loop_label_ = label; }
   1346   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
   1347   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
   1348   void set_flush_budget(int to) { flush_budget_ = to; }
   1349   void set_quick_check_performed(QuickCheckDetails* d) {
   1350     quick_check_performed_ = *d;
   1351   }
   1352   void InvalidateCurrentCharacter();
   1353   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
   1354 
   1355  private:
   1356   int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
   1357   void PerformDeferredActions(RegExpMacroAssembler* macro,
   1358                               int max_register,
   1359                               const OutSet& affected_registers,
   1360                               OutSet* registers_to_pop,
   1361                               OutSet* registers_to_clear,
   1362                               Zone* zone);
   1363   void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
   1364                                 int max_register,
   1365                                 const OutSet& registers_to_pop,
   1366                                 const OutSet& registers_to_clear);
   1367   int cp_offset_;
   1368   DeferredAction* actions_;
   1369   Label* backtrack_;
   1370   RegExpNode* stop_node_;
   1371   Label* loop_label_;
   1372   int characters_preloaded_;
   1373   int bound_checked_up_to_;
   1374   QuickCheckDetails quick_check_performed_;
   1375   int flush_budget_;
   1376   TriBool at_start_;
   1377 };
   1378 
   1379 
   1380 class GreedyLoopState {
   1381  public:
   1382   explicit GreedyLoopState(bool not_at_start);
   1383 
   1384   Label* label() { return &label_; }
   1385   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
   1386 
   1387  private:
   1388   Label label_;
   1389   Trace counter_backtrack_trace_;
   1390 };
   1391 
   1392 
   1393 struct PreloadState {
   1394   static const int kEatsAtLeastNotYetInitialized = -1;
   1395   bool preload_is_current_;
   1396   bool preload_has_checked_bounds_;
   1397   int preload_characters_;
   1398   int eats_at_least_;
   1399   void init() {
   1400     eats_at_least_ = kEatsAtLeastNotYetInitialized;
   1401   }
   1402 };
   1403 
   1404 
   1405 class NodeVisitor {
   1406  public:
   1407   virtual ~NodeVisitor() { }
   1408 #define DECLARE_VISIT(Type)                                          \
   1409   virtual void Visit##Type(Type##Node* that) = 0;
   1410 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   1411 #undef DECLARE_VISIT
   1412   virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
   1413 };
   1414 
   1415 
   1416 // Node visitor used to add the start set of the alternatives to the
   1417 // dispatch table of a choice node.
   1418 class DispatchTableConstructor: public NodeVisitor {
   1419  public:
   1420   DispatchTableConstructor(DispatchTable* table, bool ignore_case,
   1421                            Zone* zone)
   1422       : table_(table),
   1423         choice_index_(-1),
   1424         ignore_case_(ignore_case),
   1425         zone_(zone) { }
   1426 
   1427   void BuildTable(ChoiceNode* node);
   1428 
   1429   void AddRange(CharacterRange range) {
   1430     table()->AddRange(range, choice_index_, zone_);
   1431   }
   1432 
   1433   void AddInverse(ZoneList<CharacterRange>* ranges);
   1434 
   1435 #define DECLARE_VISIT(Type)                                          \
   1436   virtual void Visit##Type(Type##Node* that);
   1437 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   1438 #undef DECLARE_VISIT
   1439 
   1440   DispatchTable* table() { return table_; }
   1441   void set_choice_index(int value) { choice_index_ = value; }
   1442 
   1443  protected:
   1444   DispatchTable* table_;
   1445   int choice_index_;
   1446   bool ignore_case_;
   1447   Zone* zone_;
   1448 };
   1449 
   1450 
   1451 // Assertion propagation moves information about assertions such as
   1452 // \b to the affected nodes.  For instance, in /.\b./ information must
   1453 // be propagated to the first '.' that whatever follows needs to know
   1454 // if it matched a word or a non-word, and to the second '.' that it
   1455 // has to check if it succeeds a word or non-word.  In this case the
   1456 // result will be something like:
   1457 //
   1458 //   +-------+        +------------+
   1459 //   |   .   |        |      .     |
   1460 //   +-------+  --->  +------------+
   1461 //   | word? |        | check word |
   1462 //   +-------+        +------------+
   1463 class Analysis: public NodeVisitor {
   1464  public:
   1465   Analysis(Isolate* isolate, bool is_one_byte)
   1466       : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
   1467   void EnsureAnalyzed(RegExpNode* node);
   1468 
   1469 #define DECLARE_VISIT(Type)                                          \
   1470   virtual void Visit##Type(Type##Node* that);
   1471 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   1472 #undef DECLARE_VISIT
   1473   virtual void VisitLoopChoice(LoopChoiceNode* that);
   1474 
   1475   bool has_failed() { return error_message_ != nullptr; }
   1476   const char* error_message() {
   1477     DCHECK(error_message_ != nullptr);
   1478     return error_message_;
   1479   }
   1480   void fail(const char* error_message) {
   1481     error_message_ = error_message;
   1482   }
   1483 
   1484   Isolate* isolate() const { return isolate_; }
   1485 
   1486  private:
   1487   Isolate* isolate_;
   1488   bool is_one_byte_;
   1489   const char* error_message_;
   1490 
   1491   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
   1492 };
   1493 
   1494 
   1495 struct RegExpCompileData {
   1496   RegExpCompileData()
   1497       : tree(nullptr),
   1498         node(nullptr),
   1499         simple(true),
   1500         contains_anchor(false),
   1501         capture_count(0) {}
   1502   RegExpTree* tree;
   1503   RegExpNode* node;
   1504   bool simple;
   1505   bool contains_anchor;
   1506   Handle<FixedArray> capture_name_map;
   1507   Handle<String> error;
   1508   int capture_count;
   1509 };
   1510 
   1511 
   1512 class RegExpEngine: public AllStatic {
   1513  public:
   1514   struct CompilationResult {
   1515     CompilationResult(Isolate* isolate, const char* error_message)
   1516         : error_message(error_message),
   1517           code(ReadOnlyRoots(isolate).the_hole_value()),
   1518           num_registers(0) {}
   1519     CompilationResult(Object* code, int registers)
   1520         : error_message(nullptr), code(code), num_registers(registers) {}
   1521     const char* error_message;
   1522     Object* code;
   1523     int num_registers;
   1524   };
   1525 
   1526   static CompilationResult Compile(Isolate* isolate, Zone* zone,
   1527                                    RegExpCompileData* input,
   1528                                    JSRegExp::Flags flags,
   1529                                    Handle<String> pattern,
   1530                                    Handle<String> sample_subject,
   1531                                    bool is_one_byte);
   1532 
   1533   static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
   1534 
   1535   static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
   1536 };
   1537 
   1538 
   1539 class RegExpResultsCache : public AllStatic {
   1540  public:
   1541   enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
   1542 
   1543   // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
   1544   // On success, the returned result is guaranteed to be a COW-array.
   1545   static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
   1546                         FixedArray** last_match_out, ResultsCacheType type);
   1547   // Attempt to add value_array to the cache specified by type.  On success,
   1548   // value_array is turned into a COW-array.
   1549   static void Enter(Isolate* isolate, Handle<String> key_string,
   1550                     Handle<Object> key_pattern, Handle<FixedArray> value_array,
   1551                     Handle<FixedArray> last_match_cache, ResultsCacheType type);
   1552   static void Clear(FixedArray* cache);
   1553   static const int kRegExpResultsCacheSize = 0x100;
   1554 
   1555  private:
   1556   static const int kArrayEntriesPerCacheEntry = 4;
   1557   static const int kStringOffset = 0;
   1558   static const int kPatternOffset = 1;
   1559   static const int kArrayOffset = 2;
   1560   static const int kLastMatchOffset = 3;
   1561 };
   1562 
   1563 }  // namespace internal
   1564 }  // namespace v8
   1565 
   1566 #endif  // V8_REGEXP_JSREGEXP_H_
   1567