Home | History | Annotate | Download | only in regexp
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/ostreams.h"
      6 #include "src/regexp/regexp-ast.h"
      7 
      8 namespace v8 {
      9 namespace internal {
     10 
     11 #define MAKE_ACCEPT(Name)                                          \
     12   void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
     13     return visitor->Visit##Name(this, data);                       \
     14   }
     15 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
     16 #undef MAKE_ACCEPT
     17 
     18 #define MAKE_TYPE_CASE(Name)                            \
     19   RegExp##Name* RegExpTree::As##Name() { return NULL; } \
     20   bool RegExpTree::Is##Name() { return false; }
     21 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
     22 #undef MAKE_TYPE_CASE
     23 
     24 #define MAKE_TYPE_CASE(Name)                              \
     25   RegExp##Name* RegExp##Name::As##Name() { return this; } \
     26   bool RegExp##Name::Is##Name() { return true; }
     27 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
     28 #undef MAKE_TYPE_CASE
     29 
     30 
     31 static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
     32   Interval result = Interval::Empty();
     33   for (int i = 0; i < children->length(); i++)
     34     result = result.Union(children->at(i)->CaptureRegisters());
     35   return result;
     36 }
     37 
     38 
     39 Interval RegExpAlternative::CaptureRegisters() {
     40   return ListCaptureRegisters(nodes());
     41 }
     42 
     43 
     44 Interval RegExpDisjunction::CaptureRegisters() {
     45   return ListCaptureRegisters(alternatives());
     46 }
     47 
     48 
     49 Interval RegExpLookaround::CaptureRegisters() {
     50   return body()->CaptureRegisters();
     51 }
     52 
     53 
     54 Interval RegExpCapture::CaptureRegisters() {
     55   Interval self(StartRegister(index()), EndRegister(index()));
     56   return self.Union(body()->CaptureRegisters());
     57 }
     58 
     59 
     60 Interval RegExpQuantifier::CaptureRegisters() {
     61   return body()->CaptureRegisters();
     62 }
     63 
     64 
     65 bool RegExpAssertion::IsAnchoredAtStart() {
     66   return assertion_type() == RegExpAssertion::START_OF_INPUT;
     67 }
     68 
     69 
     70 bool RegExpAssertion::IsAnchoredAtEnd() {
     71   return assertion_type() == RegExpAssertion::END_OF_INPUT;
     72 }
     73 
     74 
     75 bool RegExpAlternative::IsAnchoredAtStart() {
     76   ZoneList<RegExpTree*>* nodes = this->nodes();
     77   for (int i = 0; i < nodes->length(); i++) {
     78     RegExpTree* node = nodes->at(i);
     79     if (node->IsAnchoredAtStart()) {
     80       return true;
     81     }
     82     if (node->max_match() > 0) {
     83       return false;
     84     }
     85   }
     86   return false;
     87 }
     88 
     89 
     90 bool RegExpAlternative::IsAnchoredAtEnd() {
     91   ZoneList<RegExpTree*>* nodes = this->nodes();
     92   for (int i = nodes->length() - 1; i >= 0; i--) {
     93     RegExpTree* node = nodes->at(i);
     94     if (node->IsAnchoredAtEnd()) {
     95       return true;
     96     }
     97     if (node->max_match() > 0) {
     98       return false;
     99     }
    100   }
    101   return false;
    102 }
    103 
    104 
    105 bool RegExpDisjunction::IsAnchoredAtStart() {
    106   ZoneList<RegExpTree*>* alternatives = this->alternatives();
    107   for (int i = 0; i < alternatives->length(); i++) {
    108     if (!alternatives->at(i)->IsAnchoredAtStart()) return false;
    109   }
    110   return true;
    111 }
    112 
    113 
    114 bool RegExpDisjunction::IsAnchoredAtEnd() {
    115   ZoneList<RegExpTree*>* alternatives = this->alternatives();
    116   for (int i = 0; i < alternatives->length(); i++) {
    117     if (!alternatives->at(i)->IsAnchoredAtEnd()) return false;
    118   }
    119   return true;
    120 }
    121 
    122 
    123 bool RegExpLookaround::IsAnchoredAtStart() {
    124   return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart();
    125 }
    126 
    127 
    128 bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); }
    129 
    130 
    131 bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); }
    132 
    133 
    134 // Convert regular expression trees to a simple sexp representation.
    135 // This representation should be different from the input grammar
    136 // in as many cases as possible, to make it more difficult for incorrect
    137 // parses to look as correct ones which is likely if the input and
    138 // output formats are alike.
    139 class RegExpUnparser final : public RegExpVisitor {
    140  public:
    141   RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
    142   void VisitCharacterRange(CharacterRange that);
    143 #define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
    144   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
    145 #undef MAKE_CASE
    146  private:
    147   std::ostream& os_;
    148   Zone* zone_;
    149 };
    150 
    151 
    152 void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
    153   os_ << "(|";
    154   for (int i = 0; i < that->alternatives()->length(); i++) {
    155     os_ << " ";
    156     that->alternatives()->at(i)->Accept(this, data);
    157   }
    158   os_ << ")";
    159   return NULL;
    160 }
    161 
    162 
    163 void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
    164   os_ << "(:";
    165   for (int i = 0; i < that->nodes()->length(); i++) {
    166     os_ << " ";
    167     that->nodes()->at(i)->Accept(this, data);
    168   }
    169   os_ << ")";
    170   return NULL;
    171 }
    172 
    173 
    174 void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
    175   os_ << AsUC32(that.from());
    176   if (!that.IsSingleton()) {
    177     os_ << "-" << AsUC32(that.to());
    178   }
    179 }
    180 
    181 
    182 void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
    183                                           void* data) {
    184   if (that->is_negated()) os_ << "^";
    185   os_ << "[";
    186   for (int i = 0; i < that->ranges(zone_)->length(); i++) {
    187     if (i > 0) os_ << " ";
    188     VisitCharacterRange(that->ranges(zone_)->at(i));
    189   }
    190   os_ << "]";
    191   return NULL;
    192 }
    193 
    194 
    195 void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
    196   switch (that->assertion_type()) {
    197     case RegExpAssertion::START_OF_INPUT:
    198       os_ << "@^i";
    199       break;
    200     case RegExpAssertion::END_OF_INPUT:
    201       os_ << "@$i";
    202       break;
    203     case RegExpAssertion::START_OF_LINE:
    204       os_ << "@^l";
    205       break;
    206     case RegExpAssertion::END_OF_LINE:
    207       os_ << "@$l";
    208       break;
    209     case RegExpAssertion::BOUNDARY:
    210       os_ << "@b";
    211       break;
    212     case RegExpAssertion::NON_BOUNDARY:
    213       os_ << "@B";
    214       break;
    215   }
    216   return NULL;
    217 }
    218 
    219 
    220 void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
    221   os_ << "'";
    222   Vector<const uc16> chardata = that->data();
    223   for (int i = 0; i < chardata.length(); i++) {
    224     os_ << AsUC16(chardata[i]);
    225   }
    226   os_ << "'";
    227   return NULL;
    228 }
    229 
    230 
    231 void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
    232   if (that->elements()->length() == 1) {
    233     that->elements()->at(0).tree()->Accept(this, data);
    234   } else {
    235     os_ << "(!";
    236     for (int i = 0; i < that->elements()->length(); i++) {
    237       os_ << " ";
    238       that->elements()->at(i).tree()->Accept(this, data);
    239     }
    240     os_ << ")";
    241   }
    242   return NULL;
    243 }
    244 
    245 
    246 void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
    247   os_ << "(# " << that->min() << " ";
    248   if (that->max() == RegExpTree::kInfinity) {
    249     os_ << "- ";
    250   } else {
    251     os_ << that->max() << " ";
    252   }
    253   os_ << (that->is_greedy() ? "g " : that->is_possessive() ? "p " : "n ");
    254   that->body()->Accept(this, data);
    255   os_ << ")";
    256   return NULL;
    257 }
    258 
    259 
    260 void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
    261   os_ << "(^ ";
    262   that->body()->Accept(this, data);
    263   os_ << ")";
    264   return NULL;
    265 }
    266 
    267 
    268 void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
    269   os_ << "(";
    270   os_ << (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-");
    271   os_ << (that->is_positive() ? " + " : " - ");
    272   that->body()->Accept(this, data);
    273   os_ << ")";
    274   return NULL;
    275 }
    276 
    277 
    278 void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
    279                                          void* data) {
    280   os_ << "(<- " << that->index() << ")";
    281   return NULL;
    282 }
    283 
    284 
    285 void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
    286   os_ << '%';
    287   return NULL;
    288 }
    289 
    290 
    291 std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) {  // NOLINT
    292   RegExpUnparser unparser(os, zone);
    293   Accept(&unparser, NULL);
    294   return os;
    295 }
    296 
    297 
    298 RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
    299     : alternatives_(alternatives) {
    300   DCHECK(alternatives->length() > 1);
    301   RegExpTree* first_alternative = alternatives->at(0);
    302   min_match_ = first_alternative->min_match();
    303   max_match_ = first_alternative->max_match();
    304   for (int i = 1; i < alternatives->length(); i++) {
    305     RegExpTree* alternative = alternatives->at(i);
    306     min_match_ = Min(min_match_, alternative->min_match());
    307     max_match_ = Max(max_match_, alternative->max_match());
    308   }
    309 }
    310 
    311 
    312 static int IncreaseBy(int previous, int increase) {
    313   if (RegExpTree::kInfinity - previous < increase) {
    314     return RegExpTree::kInfinity;
    315   } else {
    316     return previous + increase;
    317   }
    318 }
    319 
    320 
    321 RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
    322     : nodes_(nodes) {
    323   DCHECK(nodes->length() > 1);
    324   min_match_ = 0;
    325   max_match_ = 0;
    326   for (int i = 0; i < nodes->length(); i++) {
    327     RegExpTree* node = nodes->at(i);
    328     int node_min_match = node->min_match();
    329     min_match_ = IncreaseBy(min_match_, node_min_match);
    330     int node_max_match = node->max_match();
    331     max_match_ = IncreaseBy(max_match_, node_max_match);
    332   }
    333 }
    334 
    335 
    336 }  // namespace internal
    337 }  // namespace v8
    338