Home | History | Annotate | Download | only in regexp
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_REGEXP_AST_H_
      6 #define V8_REGEXP_REGEXP_AST_H_
      7 
      8 #include "src/objects.h"
      9 #include "src/utils.h"
     10 #include "src/zone/zone-containers.h"
     11 #include "src/zone/zone.h"
     12 
     13 namespace v8 {
     14 namespace internal {
     15 
     16 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
     17   VISIT(Disjunction)                      \
     18   VISIT(Alternative)                      \
     19   VISIT(Assertion)                        \
     20   VISIT(CharacterClass)                   \
     21   VISIT(Atom)                             \
     22   VISIT(Quantifier)                       \
     23   VISIT(Capture)                          \
     24   VISIT(Group)                            \
     25   VISIT(Lookaround)                       \
     26   VISIT(BackReference)                    \
     27   VISIT(Empty)                            \
     28   VISIT(Text)
     29 
     30 #define FORWARD_DECLARE(Name) class RegExp##Name;
     31 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
     32 #undef FORWARD_DECLARE
     33 
     34 class RegExpCompiler;
     35 class RegExpNode;
     36 class RegExpTree;
     37 
     38 
     39 class RegExpVisitor BASE_EMBEDDED {
     40  public:
     41   virtual ~RegExpVisitor() {}
     42 #define MAKE_CASE(Name) \
     43   virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
     44   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
     45 #undef MAKE_CASE
     46 };
     47 
     48 
     49 // A simple closed interval.
     50 class Interval {
     51  public:
     52   Interval() : from_(kNone), to_(kNone) {}
     53   Interval(int from, int to) : from_(from), to_(to) {}
     54   Interval Union(Interval that) {
     55     if (that.from_ == kNone)
     56       return *this;
     57     else if (from_ == kNone)
     58       return that;
     59     else
     60       return Interval(Min(from_, that.from_), Max(to_, that.to_));
     61   }
     62   bool Contains(int value) { return (from_ <= value) && (value <= to_); }
     63   bool is_empty() { return from_ == kNone; }
     64   int from() const { return from_; }
     65   int to() const { return to_; }
     66   static Interval Empty() { return Interval(); }
     67   static const int kNone = -1;
     68 
     69  private:
     70   int from_;
     71   int to_;
     72 };
     73 
     74 
     75 // Represents code units in the range from from_ to to_, both ends are
     76 // inclusive.
     77 class CharacterRange {
     78  public:
     79   CharacterRange() : from_(0), to_(0) {}
     80   // For compatibility with the CHECK_OK macro
     81   CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
     82   static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
     83                              Zone* zone);
     84   static Vector<const int> GetWordBounds();
     85   static inline CharacterRange Singleton(uc32 value) {
     86     return CharacterRange(value, value);
     87   }
     88   static inline CharacterRange Range(uc32 from, uc32 to) {
     89     DCHECK(0 <= from && to <= String::kMaxCodePoint);
     90     DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
     91     return CharacterRange(from, to);
     92   }
     93   static inline CharacterRange Everything() {
     94     return CharacterRange(0, String::kMaxCodePoint);
     95   }
     96   static inline ZoneList<CharacterRange>* List(Zone* zone,
     97                                                CharacterRange range) {
     98     ZoneList<CharacterRange>* list =
     99         new (zone) ZoneList<CharacterRange>(1, zone);
    100     list->Add(range, zone);
    101     return list;
    102   }
    103   bool Contains(uc32 i) { return from_ <= i && i <= to_; }
    104   uc32 from() const { return from_; }
    105   void set_from(uc32 value) { from_ = value; }
    106   uc32 to() const { return to_; }
    107   void set_to(uc32 value) { to_ = value; }
    108   bool is_valid() { return from_ <= to_; }
    109   bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
    110   bool IsSingleton() { return (from_ == to_); }
    111   static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
    112                                  ZoneList<CharacterRange>* ranges,
    113                                  bool is_one_byte);
    114   // Whether a range list is in canonical form: Ranges ordered by from value,
    115   // and ranges non-overlapping and non-adjacent.
    116   static bool IsCanonical(ZoneList<CharacterRange>* ranges);
    117   // Convert range list to canonical form. The characters covered by the ranges
    118   // will still be the same, but no character is in more than one range, and
    119   // adjacent ranges are merged. The resulting list may be shorter than the
    120   // original, but cannot be longer.
    121   static void Canonicalize(ZoneList<CharacterRange>* ranges);
    122   // Negate the contents of a character range in canonical form.
    123   static void Negate(ZoneList<CharacterRange>* src,
    124                      ZoneList<CharacterRange>* dst, Zone* zone);
    125   static const int kStartMarker = (1 << 24);
    126   static const int kPayloadMask = (1 << 24) - 1;
    127 
    128  private:
    129   CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
    130 
    131   uc32 from_;
    132   uc32 to_;
    133 };
    134 
    135 
    136 class CharacterSet final BASE_EMBEDDED {
    137  public:
    138   explicit CharacterSet(uc16 standard_set_type)
    139       : ranges_(NULL), standard_set_type_(standard_set_type) {}
    140   explicit CharacterSet(ZoneList<CharacterRange>* ranges)
    141       : ranges_(ranges), standard_set_type_(0) {}
    142   ZoneList<CharacterRange>* ranges(Zone* zone);
    143   uc16 standard_set_type() { return standard_set_type_; }
    144   void set_standard_set_type(uc16 special_set_type) {
    145     standard_set_type_ = special_set_type;
    146   }
    147   bool is_standard() { return standard_set_type_ != 0; }
    148   void Canonicalize();
    149 
    150  private:
    151   ZoneList<CharacterRange>* ranges_;
    152   // If non-zero, the value represents a standard set (e.g., all whitespace
    153   // characters) without having to expand the ranges.
    154   uc16 standard_set_type_;
    155 };
    156 
    157 
    158 class TextElement final BASE_EMBEDDED {
    159  public:
    160   enum TextType { ATOM, CHAR_CLASS };
    161 
    162   static TextElement Atom(RegExpAtom* atom);
    163   static TextElement CharClass(RegExpCharacterClass* char_class);
    164 
    165   int cp_offset() const { return cp_offset_; }
    166   void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
    167   int length() const;
    168 
    169   TextType text_type() const { return text_type_; }
    170 
    171   RegExpTree* tree() const { return tree_; }
    172 
    173   RegExpAtom* atom() const {
    174     DCHECK(text_type() == ATOM);
    175     return reinterpret_cast<RegExpAtom*>(tree());
    176   }
    177 
    178   RegExpCharacterClass* char_class() const {
    179     DCHECK(text_type() == CHAR_CLASS);
    180     return reinterpret_cast<RegExpCharacterClass*>(tree());
    181   }
    182 
    183  private:
    184   TextElement(TextType text_type, RegExpTree* tree)
    185       : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
    186 
    187   int cp_offset_;
    188   TextType text_type_;
    189   RegExpTree* tree_;
    190 };
    191 
    192 
    193 class RegExpTree : public ZoneObject {
    194  public:
    195   static const int kInfinity = kMaxInt;
    196   virtual ~RegExpTree() {}
    197   virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
    198   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
    199                              RegExpNode* on_success) = 0;
    200   virtual bool IsTextElement() { return false; }
    201   virtual bool IsAnchoredAtStart() { return false; }
    202   virtual bool IsAnchoredAtEnd() { return false; }
    203   virtual int min_match() = 0;
    204   virtual int max_match() = 0;
    205   // Returns the interval of registers used for captures within this
    206   // expression.
    207   virtual Interval CaptureRegisters() { return Interval::Empty(); }
    208   virtual void AppendToText(RegExpText* text, Zone* zone);
    209   std::ostream& Print(std::ostream& os, Zone* zone);  // NOLINT
    210 #define MAKE_ASTYPE(Name)           \
    211   virtual RegExp##Name* As##Name(); \
    212   virtual bool Is##Name();
    213   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
    214 #undef MAKE_ASTYPE
    215 };
    216 
    217 
    218 class RegExpDisjunction final : public RegExpTree {
    219  public:
    220   explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
    221   void* Accept(RegExpVisitor* visitor, void* data) override;
    222   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    223   RegExpDisjunction* AsDisjunction() override;
    224   Interval CaptureRegisters() override;
    225   bool IsDisjunction() override;
    226   bool IsAnchoredAtStart() override;
    227   bool IsAnchoredAtEnd() override;
    228   int min_match() override { return min_match_; }
    229   int max_match() override { return max_match_; }
    230   ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
    231 
    232  private:
    233   bool SortConsecutiveAtoms(RegExpCompiler* compiler);
    234   void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
    235   void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
    236   ZoneList<RegExpTree*>* alternatives_;
    237   int min_match_;
    238   int max_match_;
    239 };
    240 
    241 
    242 class RegExpAlternative final : public RegExpTree {
    243  public:
    244   explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
    245   void* Accept(RegExpVisitor* visitor, void* data) override;
    246   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    247   RegExpAlternative* AsAlternative() override;
    248   Interval CaptureRegisters() override;
    249   bool IsAlternative() override;
    250   bool IsAnchoredAtStart() override;
    251   bool IsAnchoredAtEnd() override;
    252   int min_match() override { return min_match_; }
    253   int max_match() override { return max_match_; }
    254   ZoneList<RegExpTree*>* nodes() { return nodes_; }
    255 
    256  private:
    257   ZoneList<RegExpTree*>* nodes_;
    258   int min_match_;
    259   int max_match_;
    260 };
    261 
    262 
    263 class RegExpAssertion final : public RegExpTree {
    264  public:
    265   enum AssertionType {
    266     START_OF_LINE,
    267     START_OF_INPUT,
    268     END_OF_LINE,
    269     END_OF_INPUT,
    270     BOUNDARY,
    271     NON_BOUNDARY
    272   };
    273   explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
    274   void* Accept(RegExpVisitor* visitor, void* data) override;
    275   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    276   RegExpAssertion* AsAssertion() override;
    277   bool IsAssertion() override;
    278   bool IsAnchoredAtStart() override;
    279   bool IsAnchoredAtEnd() override;
    280   int min_match() override { return 0; }
    281   int max_match() override { return 0; }
    282   AssertionType assertion_type() { return assertion_type_; }
    283 
    284  private:
    285   AssertionType assertion_type_;
    286 };
    287 
    288 
    289 class RegExpCharacterClass final : public RegExpTree {
    290  public:
    291   RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
    292       : set_(ranges), is_negated_(is_negated) {}
    293   explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
    294   void* Accept(RegExpVisitor* visitor, void* data) override;
    295   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    296   RegExpCharacterClass* AsCharacterClass() override;
    297   bool IsCharacterClass() override;
    298   bool IsTextElement() override { return true; }
    299   int min_match() override { return 1; }
    300   // The character class may match two code units for unicode regexps.
    301   // TODO(yangguo): we should split this class for usage in TextElement, and
    302   //                make max_match() dependent on the character class content.
    303   int max_match() override { return 2; }
    304   void AppendToText(RegExpText* text, Zone* zone) override;
    305   CharacterSet character_set() { return set_; }
    306   // TODO(lrn): Remove need for complex version if is_standard that
    307   // recognizes a mangled standard set and just do { return set_.is_special(); }
    308   bool is_standard(Zone* zone);
    309   // Returns a value representing the standard character set if is_standard()
    310   // returns true.
    311   // Currently used values are:
    312   // s : unicode whitespace
    313   // S : unicode non-whitespace
    314   // w : ASCII word character (digit, letter, underscore)
    315   // W : non-ASCII word character
    316   // d : ASCII digit
    317   // D : non-ASCII digit
    318   // . : non-newline
    319   // * : All characters, for advancing unanchored regexp
    320   uc16 standard_type() { return set_.standard_set_type(); }
    321   ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
    322   bool is_negated() { return is_negated_; }
    323 
    324  private:
    325   CharacterSet set_;
    326   bool is_negated_;
    327 };
    328 
    329 
    330 class RegExpAtom final : public RegExpTree {
    331  public:
    332   explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
    333   void* Accept(RegExpVisitor* visitor, void* data) override;
    334   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    335   RegExpAtom* AsAtom() override;
    336   bool IsAtom() override;
    337   bool IsTextElement() override { return true; }
    338   int min_match() override { return data_.length(); }
    339   int max_match() override { return data_.length(); }
    340   void AppendToText(RegExpText* text, Zone* zone) override;
    341   Vector<const uc16> data() { return data_; }
    342   int length() { return data_.length(); }
    343 
    344  private:
    345   Vector<const uc16> data_;
    346 };
    347 
    348 
    349 class RegExpText final : public RegExpTree {
    350  public:
    351   explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
    352   void* Accept(RegExpVisitor* visitor, void* data) override;
    353   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    354   RegExpText* AsText() override;
    355   bool IsText() override;
    356   bool IsTextElement() override { return true; }
    357   int min_match() override { return length_; }
    358   int max_match() override { return length_; }
    359   void AppendToText(RegExpText* text, Zone* zone) override;
    360   void AddElement(TextElement elm, Zone* zone) {
    361     elements_.Add(elm, zone);
    362     length_ += elm.length();
    363   }
    364   ZoneList<TextElement>* elements() { return &elements_; }
    365 
    366  private:
    367   ZoneList<TextElement> elements_;
    368   int length_;
    369 };
    370 
    371 
    372 class RegExpQuantifier final : public RegExpTree {
    373  public:
    374   enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
    375   RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
    376       : body_(body),
    377         min_(min),
    378         max_(max),
    379         min_match_(min * body->min_match()),
    380         quantifier_type_(type) {
    381     if (max > 0 && body->max_match() > kInfinity / max) {
    382       max_match_ = kInfinity;
    383     } else {
    384       max_match_ = max * body->max_match();
    385     }
    386   }
    387   void* Accept(RegExpVisitor* visitor, void* data) override;
    388   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    389   static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
    390                             RegExpCompiler* compiler, RegExpNode* on_success,
    391                             bool not_at_start = false);
    392   RegExpQuantifier* AsQuantifier() override;
    393   Interval CaptureRegisters() override;
    394   bool IsQuantifier() override;
    395   int min_match() override { return min_match_; }
    396   int max_match() override { return max_match_; }
    397   int min() { return min_; }
    398   int max() { return max_; }
    399   bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
    400   bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
    401   bool is_greedy() { return quantifier_type_ == GREEDY; }
    402   RegExpTree* body() { return body_; }
    403 
    404  private:
    405   RegExpTree* body_;
    406   int min_;
    407   int max_;
    408   int min_match_;
    409   int max_match_;
    410   QuantifierType quantifier_type_;
    411 };
    412 
    413 
    414 class RegExpCapture final : public RegExpTree {
    415  public:
    416   explicit RegExpCapture(int index)
    417       : body_(NULL), index_(index), name_(nullptr) {}
    418   void* Accept(RegExpVisitor* visitor, void* data) override;
    419   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    420   static RegExpNode* ToNode(RegExpTree* body, int index,
    421                             RegExpCompiler* compiler, RegExpNode* on_success);
    422   RegExpCapture* AsCapture() override;
    423   bool IsAnchoredAtStart() override;
    424   bool IsAnchoredAtEnd() override;
    425   Interval CaptureRegisters() override;
    426   bool IsCapture() override;
    427   int min_match() override { return body_->min_match(); }
    428   int max_match() override { return body_->max_match(); }
    429   RegExpTree* body() { return body_; }
    430   void set_body(RegExpTree* body) { body_ = body; }
    431   int index() { return index_; }
    432   const ZoneVector<uc16>* name() const { return name_; }
    433   void set_name(const ZoneVector<uc16>* name) { name_ = name; }
    434   static int StartRegister(int index) { return index * 2; }
    435   static int EndRegister(int index) { return index * 2 + 1; }
    436 
    437  private:
    438   RegExpTree* body_;
    439   int index_;
    440   const ZoneVector<uc16>* name_;
    441 };
    442 
    443 class RegExpGroup final : public RegExpTree {
    444  public:
    445   explicit RegExpGroup(RegExpTree* body) : body_(body) {}
    446   void* Accept(RegExpVisitor* visitor, void* data) override;
    447   RegExpNode* ToNode(RegExpCompiler* compiler,
    448                      RegExpNode* on_success) override {
    449     return body_->ToNode(compiler, on_success);
    450   }
    451   RegExpGroup* AsGroup() override;
    452   bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
    453   bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
    454   bool IsGroup() override;
    455   int min_match() override { return body_->min_match(); }
    456   int max_match() override { return body_->max_match(); }
    457   Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
    458   RegExpTree* body() { return body_; }
    459 
    460  private:
    461   RegExpTree* body_;
    462 };
    463 
    464 class RegExpLookaround final : public RegExpTree {
    465  public:
    466   enum Type { LOOKAHEAD, LOOKBEHIND };
    467 
    468   RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
    469                    int capture_from, Type type)
    470       : body_(body),
    471         is_positive_(is_positive),
    472         capture_count_(capture_count),
    473         capture_from_(capture_from),
    474         type_(type) {}
    475 
    476   void* Accept(RegExpVisitor* visitor, void* data) override;
    477   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    478   RegExpLookaround* AsLookaround() override;
    479   Interval CaptureRegisters() override;
    480   bool IsLookaround() override;
    481   bool IsAnchoredAtStart() override;
    482   int min_match() override { return 0; }
    483   int max_match() override { return 0; }
    484   RegExpTree* body() { return body_; }
    485   bool is_positive() { return is_positive_; }
    486   int capture_count() { return capture_count_; }
    487   int capture_from() { return capture_from_; }
    488   Type type() { return type_; }
    489 
    490   class Builder {
    491    public:
    492     Builder(bool is_positive, RegExpNode* on_success,
    493             int stack_pointer_register, int position_register,
    494             int capture_register_count = 0, int capture_register_start = 0);
    495     RegExpNode* on_match_success() { return on_match_success_; }
    496     RegExpNode* ForMatch(RegExpNode* match);
    497 
    498    private:
    499     bool is_positive_;
    500     RegExpNode* on_match_success_;
    501     RegExpNode* on_success_;
    502     int stack_pointer_register_;
    503     int position_register_;
    504   };
    505 
    506  private:
    507   RegExpTree* body_;
    508   bool is_positive_;
    509   int capture_count_;
    510   int capture_from_;
    511   Type type_;
    512 };
    513 
    514 
    515 class RegExpBackReference final : public RegExpTree {
    516  public:
    517   RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
    518   explicit RegExpBackReference(RegExpCapture* capture)
    519       : capture_(capture), name_(nullptr) {}
    520   void* Accept(RegExpVisitor* visitor, void* data) override;
    521   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    522   RegExpBackReference* AsBackReference() override;
    523   bool IsBackReference() override;
    524   int min_match() override { return 0; }
    525   // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
    526   // recursion, we give up. Ignorance is bliss.
    527   int max_match() override { return kInfinity; }
    528   int index() { return capture_->index(); }
    529   RegExpCapture* capture() { return capture_; }
    530   void set_capture(RegExpCapture* capture) { capture_ = capture; }
    531   const ZoneVector<uc16>* name() const { return name_; }
    532   void set_name(const ZoneVector<uc16>* name) { name_ = name; }
    533 
    534  private:
    535   RegExpCapture* capture_;
    536   const ZoneVector<uc16>* name_;
    537 };
    538 
    539 
    540 class RegExpEmpty final : public RegExpTree {
    541  public:
    542   RegExpEmpty() {}
    543   void* Accept(RegExpVisitor* visitor, void* data) override;
    544   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
    545   RegExpEmpty* AsEmpty() override;
    546   bool IsEmpty() override;
    547   int min_match() override { return 0; }
    548   int max_match() override { return 0; }
    549 };
    550 
    551 }  // namespace internal
    552 }  // namespace v8
    553 
    554 #endif  // V8_REGEXP_REGEXP_AST_H_
    555