Home | History | Annotate | Download | only in regexp
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_REGEXP_AST_H_
      6 #define V8_REGEXP_REGEXP_AST_H_
      7 
      8 #include "src/utils.h"
      9 #include "src/zone.h"
     10 
     11 namespace v8 {
     12 namespace internal {
     13 
     14 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
     15   VISIT(Disjunction)                      \
     16   VISIT(Alternative)                      \
     17   VISIT(Assertion)                        \
     18   VISIT(CharacterClass)                   \
     19   VISIT(Atom)                             \
     20   VISIT(Quantifier)                       \
     21   VISIT(Capture)                          \
     22   VISIT(Lookaround)                       \
     23   VISIT(BackReference)                    \
     24   VISIT(Empty)                            \
     25   VISIT(Text)
     26 
     27 
     28 #define FORWARD_DECLARE(Name) class RegExp##Name;
     29 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
     30 #undef FORWARD_DECLARE
     31 
     32 class RegExpCompiler;
     33 class RegExpNode;
     34 class RegExpTree;
     35 
     36 
     37 class RegExpVisitor BASE_EMBEDDED {
     38  public:
     39   virtual ~RegExpVisitor() {}
     40 #define MAKE_CASE(Name) \
     41   virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
     42   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
     43 #undef MAKE_CASE
     44 };
     45 
     46 
     47 // A simple closed interval.
     48 class Interval {
     49  public:
     50   Interval() : from_(kNone), to_(kNone) {}
     51   Interval(int from, int to) : from_(from), to_(to) {}
     52   Interval Union(Interval that) {
     53     if (that.from_ == kNone)
     54       return *this;
     55     else if (from_ == kNone)
     56       return that;
     57     else
     58       return Interval(Min(from_, that.from_), Max(to_, that.to_));
     59   }
     60   bool Contains(int value) { return (from_ <= value) && (value <= to_); }
     61   bool is_empty() { return from_ == kNone; }
     62   int from() const { return from_; }
     63   int to() const { return to_; }
     64   static Interval Empty() { return Interval(); }
     65   static const int kNone = -1;
     66 
     67  private:
     68   int from_;
     69   int to_;
     70 };
     71 
     72 
     73 // Represents code units in the range from from_ to to_, both ends are
     74 // inclusive.
     75 class CharacterRange {
     76  public:
     77   CharacterRange() : from_(0), to_(0) {}
     78   // For compatibility with the CHECK_OK macro
     79   CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
     80   CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
     81   static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
     82                              Zone* zone);
     83   static Vector<const int> GetWordBounds();
     84   static inline CharacterRange Singleton(uc16 value) {
     85     return CharacterRange(value, value);
     86   }
     87   static inline CharacterRange Range(uc16 from, uc16 to) {
     88     DCHECK(from <= to);
     89     return CharacterRange(from, to);
     90   }
     91   static inline CharacterRange Everything() {
     92     return CharacterRange(0, 0xFFFF);
     93   }
     94   bool Contains(uc16 i) { return from_ <= i && i <= to_; }
     95   uc16 from() const { return from_; }
     96   void set_from(uc16 value) { from_ = value; }
     97   uc16 to() const { return to_; }
     98   void set_to(uc16 value) { to_ = value; }
     99   bool is_valid() { return from_ <= to_; }
    100   bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
    101   bool IsSingleton() { return (from_ == to_); }
    102   void AddCaseEquivalents(Isolate* isolate, Zone* zone,
    103                           ZoneList<CharacterRange>* ranges, bool is_one_byte);
    104   static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
    105                     ZoneList<CharacterRange>** included,
    106                     ZoneList<CharacterRange>** excluded, Zone* zone);
    107   // Whether a range list is in canonical form: Ranges ordered by from value,
    108   // and ranges non-overlapping and non-adjacent.
    109   static bool IsCanonical(ZoneList<CharacterRange>* ranges);
    110   // Convert range list to canonical form. The characters covered by the ranges
    111   // will still be the same, but no character is in more than one range, and
    112   // adjacent ranges are merged. The resulting list may be shorter than the
    113   // original, but cannot be longer.
    114   static void Canonicalize(ZoneList<CharacterRange>* ranges);
    115   // Negate the contents of a character range in canonical form.
    116   static void Negate(ZoneList<CharacterRange>* src,
    117                      ZoneList<CharacterRange>* dst, Zone* zone);
    118   static const int kStartMarker = (1 << 24);
    119   static const int kPayloadMask = (1 << 24) - 1;
    120 
    121  private:
    122   uc16 from_;
    123   uc16 to_;
    124 };
    125 
    126 
    127 class CharacterSet final BASE_EMBEDDED {
    128  public:
    129   explicit CharacterSet(uc16 standard_set_type)
    130       : ranges_(NULL), standard_set_type_(standard_set_type) {}
    131   explicit CharacterSet(ZoneList<CharacterRange>* ranges)
    132       : ranges_(ranges), standard_set_type_(0) {}
    133   ZoneList<CharacterRange>* ranges(Zone* zone);
    134   uc16 standard_set_type() { return standard_set_type_; }
    135   void set_standard_set_type(uc16 special_set_type) {
    136     standard_set_type_ = special_set_type;
    137   }
    138   bool is_standard() { return standard_set_type_ != 0; }
    139   void Canonicalize();
    140 
    141  private:
    142   ZoneList<CharacterRange>* ranges_;
    143   // If non-zero, the value represents a standard set (e.g., all whitespace
    144   // characters) without having to expand the ranges.
    145   uc16 standard_set_type_;
    146 };
    147 
    148 
    149 class TextElement final BASE_EMBEDDED {
    150  public:
    151   enum TextType { ATOM, CHAR_CLASS };
    152 
    153   static TextElement Atom(RegExpAtom* atom);
    154   static TextElement CharClass(RegExpCharacterClass* char_class);
    155 
    156   int cp_offset() const { return cp_offset_; }
    157   void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
    158   int length() const;
    159 
    160   TextType text_type() const { return text_type_; }
    161 
    162   RegExpTree* tree() const { return tree_; }
    163 
    164   RegExpAtom* atom() const {
    165     DCHECK(text_type() == ATOM);
    166     return reinterpret_cast<RegExpAtom*>(tree());
    167   }
    168 
    169   RegExpCharacterClass* char_class() const {
    170     DCHECK(text_type() == CHAR_CLASS);
    171     return reinterpret_cast<RegExpCharacterClass*>(tree());
    172   }
    173 
    174  private:
    175   TextElement(TextType text_type, RegExpTree* tree)
    176       : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
    177 
    178   int cp_offset_;
    179   TextType text_type_;
    180   RegExpTree* tree_;
    181 };
    182 
    183 
    184 class RegExpTree : public ZoneObject {
    185  public:
    186   static const int kInfinity = kMaxInt;
    187   virtual ~RegExpTree() {}
    188   virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
    189   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
    190                              RegExpNode* on_success) = 0;
    191   virtual bool IsTextElement() { return false; }
    192   virtual bool IsAnchoredAtStart() { return false; }
    193   virtual bool IsAnchoredAtEnd() { return false; }
    194   virtual int min_match() = 0;
    195   virtual int max_match() = 0;
    196   // Returns the interval of registers used for captures within this
    197   // expression.
    198   virtual Interval CaptureRegisters() { return Interval::Empty(); }
    199   virtual void AppendToText(RegExpText* text, Zone* zone);
    200   std::ostream& Print(std::ostream& os, Zone* zone);  // NOLINT
    201 #define MAKE_ASTYPE(Name)           \
    202   virtual RegExp##Name* As##Name(); \
    203   virtual bool Is##Name();
    204   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
    205 #undef MAKE_ASTYPE
    206 };
    207 
    208 
    209 class RegExpDisjunction final : public RegExpTree {
    210  public:
    211   explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
    212   void* Accept(RegExpVisitor* visitor, void* data) override;
    213   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    214   RegExpDisjunction* AsDisjunction() override;
    215   Interval CaptureRegisters() override;
    216   bool IsDisjunction() override;
    217   bool IsAnchoredAtStart() override;
    218   bool IsAnchoredAtEnd() override;
    219   int min_match() override { return min_match_; }
    220   int max_match() override { return max_match_; }
    221   ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
    222 
    223  private:
    224   bool SortConsecutiveAtoms(RegExpCompiler* compiler);
    225   void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
    226   void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
    227   ZoneList<RegExpTree*>* alternatives_;
    228   int min_match_;
    229   int max_match_;
    230 };
    231 
    232 
    233 class RegExpAlternative final : public RegExpTree {
    234  public:
    235   explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
    236   void* Accept(RegExpVisitor* visitor, void* data) override;
    237   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    238   RegExpAlternative* AsAlternative() override;
    239   Interval CaptureRegisters() override;
    240   bool IsAlternative() override;
    241   bool IsAnchoredAtStart() override;
    242   bool IsAnchoredAtEnd() override;
    243   int min_match() override { return min_match_; }
    244   int max_match() override { return max_match_; }
    245   ZoneList<RegExpTree*>* nodes() { return nodes_; }
    246 
    247  private:
    248   ZoneList<RegExpTree*>* nodes_;
    249   int min_match_;
    250   int max_match_;
    251 };
    252 
    253 
    254 class RegExpAssertion final : public RegExpTree {
    255  public:
    256   enum AssertionType {
    257     START_OF_LINE,
    258     START_OF_INPUT,
    259     END_OF_LINE,
    260     END_OF_INPUT,
    261     BOUNDARY,
    262     NON_BOUNDARY
    263   };
    264   explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
    265   void* Accept(RegExpVisitor* visitor, void* data) override;
    266   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    267   RegExpAssertion* AsAssertion() override;
    268   bool IsAssertion() override;
    269   bool IsAnchoredAtStart() override;
    270   bool IsAnchoredAtEnd() override;
    271   int min_match() override { return 0; }
    272   int max_match() override { return 0; }
    273   AssertionType assertion_type() { return assertion_type_; }
    274 
    275  private:
    276   AssertionType assertion_type_;
    277 };
    278 
    279 
    280 class RegExpCharacterClass final : public RegExpTree {
    281  public:
    282   RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
    283       : set_(ranges), is_negated_(is_negated) {}
    284   explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
    285   void* Accept(RegExpVisitor* visitor, void* data) override;
    286   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    287   RegExpCharacterClass* AsCharacterClass() override;
    288   bool IsCharacterClass() override;
    289   bool IsTextElement() override { return true; }
    290   int min_match() override { return 1; }
    291   int max_match() override { return 1; }
    292   void AppendToText(RegExpText* text, Zone* zone) override;
    293   CharacterSet character_set() { return set_; }
    294   // TODO(lrn): Remove need for complex version if is_standard that
    295   // recognizes a mangled standard set and just do { return set_.is_special(); }
    296   bool is_standard(Zone* zone);
    297   // Returns a value representing the standard character set if is_standard()
    298   // returns true.
    299   // Currently used values are:
    300   // s : unicode whitespace
    301   // S : unicode non-whitespace
    302   // w : ASCII word character (digit, letter, underscore)
    303   // W : non-ASCII word character
    304   // d : ASCII digit
    305   // D : non-ASCII digit
    306   // . : non-unicode non-newline
    307   // * : All characters
    308   uc16 standard_type() { return set_.standard_set_type(); }
    309   ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
    310   bool is_negated() { return is_negated_; }
    311 
    312  private:
    313   CharacterSet set_;
    314   bool is_negated_;
    315 };
    316 
    317 
    318 class RegExpAtom final : public RegExpTree {
    319  public:
    320   explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
    321   void* Accept(RegExpVisitor* visitor, void* data) override;
    322   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    323   RegExpAtom* AsAtom() override;
    324   bool IsAtom() override;
    325   bool IsTextElement() override { return true; }
    326   int min_match() override { return data_.length(); }
    327   int max_match() override { return data_.length(); }
    328   void AppendToText(RegExpText* text, Zone* zone) override;
    329   Vector<const uc16> data() { return data_; }
    330   int length() { return data_.length(); }
    331 
    332  private:
    333   Vector<const uc16> data_;
    334 };
    335 
    336 
    337 class RegExpText final : public RegExpTree {
    338  public:
    339   explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
    340   void* Accept(RegExpVisitor* visitor, void* data) override;
    341   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    342   RegExpText* AsText() override;
    343   bool IsText() override;
    344   bool IsTextElement() override { return true; }
    345   int min_match() override { return length_; }
    346   int max_match() override { return length_; }
    347   void AppendToText(RegExpText* text, Zone* zone) override;
    348   void AddElement(TextElement elm, Zone* zone) {
    349     elements_.Add(elm, zone);
    350     length_ += elm.length();
    351   }
    352   ZoneList<TextElement>* elements() { return &elements_; }
    353 
    354  private:
    355   ZoneList<TextElement> elements_;
    356   int length_;
    357 };
    358 
    359 
    360 class RegExpQuantifier final : public RegExpTree {
    361  public:
    362   enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
    363   RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
    364       : body_(body),
    365         min_(min),
    366         max_(max),
    367         min_match_(min * body->min_match()),
    368         quantifier_type_(type) {
    369     if (max > 0 && body->max_match() > kInfinity / max) {
    370       max_match_ = kInfinity;
    371     } else {
    372       max_match_ = max * body->max_match();
    373     }
    374   }
    375   void* Accept(RegExpVisitor* visitor, void* data) override;
    376   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    377   static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
    378                             RegExpCompiler* compiler, RegExpNode* on_success,
    379                             bool not_at_start = false);
    380   RegExpQuantifier* AsQuantifier() override;
    381   Interval CaptureRegisters() override;
    382   bool IsQuantifier() override;
    383   int min_match() override { return min_match_; }
    384   int max_match() override { return max_match_; }
    385   int min() { return min_; }
    386   int max() { return max_; }
    387   bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
    388   bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
    389   bool is_greedy() { return quantifier_type_ == GREEDY; }
    390   RegExpTree* body() { return body_; }
    391 
    392  private:
    393   RegExpTree* body_;
    394   int min_;
    395   int max_;
    396   int min_match_;
    397   int max_match_;
    398   QuantifierType quantifier_type_;
    399 };
    400 
    401 
    402 class RegExpCapture final : public RegExpTree {
    403  public:
    404   explicit RegExpCapture(int index) : body_(NULL), index_(index) {}
    405   void* Accept(RegExpVisitor* visitor, void* data) override;
    406   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    407   static RegExpNode* ToNode(RegExpTree* body, int index,
    408                             RegExpCompiler* compiler, RegExpNode* on_success);
    409   RegExpCapture* AsCapture() override;
    410   bool IsAnchoredAtStart() override;
    411   bool IsAnchoredAtEnd() override;
    412   Interval CaptureRegisters() override;
    413   bool IsCapture() override;
    414   int min_match() override { return body_->min_match(); }
    415   int max_match() override { return body_->max_match(); }
    416   RegExpTree* body() { return body_; }
    417   void set_body(RegExpTree* body) { body_ = body; }
    418   int index() { return index_; }
    419   static int StartRegister(int index) { return index * 2; }
    420   static int EndRegister(int index) { return index * 2 + 1; }
    421 
    422  private:
    423   RegExpTree* body_;
    424   int index_;
    425 };
    426 
    427 
    428 class RegExpLookaround final : public RegExpTree {
    429  public:
    430   enum Type { LOOKAHEAD, LOOKBEHIND };
    431 
    432   RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
    433                    int capture_from, Type type)
    434       : body_(body),
    435         is_positive_(is_positive),
    436         capture_count_(capture_count),
    437         capture_from_(capture_from),
    438         type_(type) {}
    439 
    440   void* Accept(RegExpVisitor* visitor, void* data) override;
    441   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    442   RegExpLookaround* AsLookaround() override;
    443   Interval CaptureRegisters() override;
    444   bool IsLookaround() override;
    445   bool IsAnchoredAtStart() override;
    446   int min_match() override { return 0; }
    447   int max_match() override { return 0; }
    448   RegExpTree* body() { return body_; }
    449   bool is_positive() { return is_positive_; }
    450   int capture_count() { return capture_count_; }
    451   int capture_from() { return capture_from_; }
    452   Type type() { return type_; }
    453 
    454  private:
    455   RegExpTree* body_;
    456   bool is_positive_;
    457   int capture_count_;
    458   int capture_from_;
    459   Type type_;
    460 };
    461 
    462 
    463 class RegExpBackReference final : public RegExpTree {
    464  public:
    465   explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
    466   void* Accept(RegExpVisitor* visitor, void* data) override;
    467   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    468   RegExpBackReference* AsBackReference() override;
    469   bool IsBackReference() override;
    470   int min_match() override { return 0; }
    471   // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
    472   // recursion, we give up. Ignorance is bliss.
    473   int max_match() override { return kInfinity; }
    474   int index() { return capture_->index(); }
    475   RegExpCapture* capture() { return capture_; }
    476 
    477  private:
    478   RegExpCapture* capture_;
    479 };
    480 
    481 
    482 class RegExpEmpty final : public RegExpTree {
    483  public:
    484   RegExpEmpty() {}
    485   void* Accept(RegExpVisitor* visitor, void* data) override;
    486   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    487   RegExpEmpty* AsEmpty() override;
    488   bool IsEmpty() override;
    489   int min_match() override { return 0; }
    490   int max_match() override { return 0; }
    491 };
    492 
    493 }  // namespace internal
    494 }  // namespace v8
    495 
    496 #endif  // V8_REGEXP_REGEXP_AST_H_
    497