Home | History | Annotate | Download | only in re2
      1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Rewrite POSIX and other features in re
      6 // to use simple extended regular expression features.
      7 // Also sort and simplify character classes.
      8 
      9 #include "util/util.h"
     10 #include "re2/regexp.h"
     11 #include "re2/walker-inl.h"
     12 
     13 namespace re2 {
     14 
     15 // Parses the regexp src and then simplifies it and sets *dst to the
     16 // string representation of the simplified form.  Returns true on success.
     17 // Returns false and sets *error (if error != NULL) on error.
     18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
     19                             string* dst,
     20                             RegexpStatus* status) {
     21   Regexp* re = Parse(src, flags, status);
     22   if (re == NULL)
     23     return false;
     24   Regexp* sre = re->Simplify();
     25   re->Decref();
     26   if (sre == NULL) {
     27     // Should not happen, since Simplify never fails.
     28     LOG(ERROR) << "Simplify failed on " << src;
     29     if (status) {
     30       status->set_code(kRegexpInternalError);
     31       status->set_error_arg(src);
     32     }
     33     return false;
     34   }
     35   *dst = sre->ToString();
     36   sre->Decref();
     37   return true;
     38 }
     39 
     40 // Assuming the simple_ flags on the children are accurate,
     41 // is this Regexp* simple?
     42 bool Regexp::ComputeSimple() {
     43   Regexp** subs;
     44   switch (op_) {
     45     case kRegexpNoMatch:
     46     case kRegexpEmptyMatch:
     47     case kRegexpLiteral:
     48     case kRegexpLiteralString:
     49     case kRegexpBeginLine:
     50     case kRegexpEndLine:
     51     case kRegexpBeginText:
     52     case kRegexpWordBoundary:
     53     case kRegexpNoWordBoundary:
     54     case kRegexpEndText:
     55     case kRegexpAnyChar:
     56     case kRegexpAnyByte:
     57     case kRegexpHaveMatch:
     58       return true;
     59     case kRegexpConcat:
     60     case kRegexpAlternate:
     61       // These are simple as long as the subpieces are simple.
     62       subs = sub();
     63       for (int i = 0; i < nsub_; i++)
     64         if (!subs[i]->simple_)
     65           return false;
     66       return true;
     67     case kRegexpCharClass:
     68       // Simple as long as the char class is not empty, not full.
     69       if (ccb_ != NULL)
     70         return !ccb_->empty() && !ccb_->full();
     71       return !cc_->empty() && !cc_->full();
     72     case kRegexpCapture:
     73       subs = sub();
     74       return subs[0]->simple_;
     75     case kRegexpStar:
     76     case kRegexpPlus:
     77     case kRegexpQuest:
     78       subs = sub();
     79       if (!subs[0]->simple_)
     80         return false;
     81       switch (subs[0]->op_) {
     82         case kRegexpStar:
     83         case kRegexpPlus:
     84         case kRegexpQuest:
     85         case kRegexpEmptyMatch:
     86         case kRegexpNoMatch:
     87           return false;
     88         default:
     89           break;
     90       }
     91       return true;
     92     case kRegexpRepeat:
     93       return false;
     94   }
     95   LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
     96   return false;
     97 }
     98 
     99 // Walker subclass used by Simplify.
    100 // The simplify walk is purely post-recursive: given the simplified children,
    101 // PostVisit creates the simplified result.
    102 // The child_args are simplified Regexp*s.
    103 class SimplifyWalker : public Regexp::Walker<Regexp*> {
    104  public:
    105   SimplifyWalker() {}
    106   virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
    107   virtual Regexp* PostVisit(Regexp* re,
    108                             Regexp* parent_arg,
    109                             Regexp* pre_arg,
    110                             Regexp** child_args, int nchild_args);
    111   virtual Regexp* Copy(Regexp* re);
    112   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
    113 
    114  private:
    115   // These functions are declared inside SimplifyWalker so that
    116   // they can edit the private fields of the Regexps they construct.
    117 
    118   // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
    119   // Caller must Decref return value when done with it.
    120   static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
    121 
    122   // Simplifies the expression re{min,max} in terms of *, +, and ?.
    123   // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
    124   // Caller must Decref return value when done with it.
    125   static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
    126                                 Regexp::ParseFlags parse_flags);
    127 
    128   // Simplifies a character class by expanding any named classes
    129   // into rune ranges.  Does not edit re.  Does not consume ref to re.
    130   // Caller must Decref return value when done with it.
    131   static Regexp* SimplifyCharClass(Regexp* re);
    132 
    133   DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
    134 };
    135 
    136 // Simplifies a regular expression, returning a new regexp.
    137 // The new regexp uses traditional Unix egrep features only,
    138 // plus the Perl (?:) non-capturing parentheses.
    139 // Otherwise, no POSIX or Perl additions.  The new regexp
    140 // captures exactly the same subexpressions (with the same indices)
    141 // as the original.
    142 // Does not edit current object.
    143 // Caller must Decref() return value when done with it.
    144 
    145 Regexp* Regexp::Simplify() {
    146   if (simple_)
    147     return Incref();
    148   SimplifyWalker w;
    149   return w.Walk(this, NULL);
    150 }
    151 
    152 #define Simplify DontCallSimplify  // Avoid accidental recursion
    153 
    154 Regexp* SimplifyWalker::Copy(Regexp* re) {
    155   return re->Incref();
    156 }
    157 
    158 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
    159   // This should never be called, since we use Walk and not
    160   // WalkExponential.
    161   LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
    162   return re->Incref();
    163 }
    164 
    165 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
    166   if (re->simple_) {
    167     *stop = true;
    168     return re->Incref();
    169   }
    170   return NULL;
    171 }
    172 
    173 Regexp* SimplifyWalker::PostVisit(Regexp* re,
    174                                   Regexp* parent_arg,
    175                                   Regexp* pre_arg,
    176                                   Regexp** child_args,
    177                                   int nchild_args) {
    178   switch (re->op()) {
    179     case kRegexpNoMatch:
    180     case kRegexpEmptyMatch:
    181     case kRegexpLiteral:
    182     case kRegexpLiteralString:
    183     case kRegexpBeginLine:
    184     case kRegexpEndLine:
    185     case kRegexpBeginText:
    186     case kRegexpWordBoundary:
    187     case kRegexpNoWordBoundary:
    188     case kRegexpEndText:
    189     case kRegexpAnyChar:
    190     case kRegexpAnyByte:
    191     case kRegexpHaveMatch:
    192       // All these are always simple.
    193       re->simple_ = true;
    194       return re->Incref();
    195 
    196     case kRegexpConcat:
    197     case kRegexpAlternate: {
    198       // These are simple as long as the subpieces are simple.
    199       // Two passes to avoid allocation in the common case.
    200       bool changed = false;
    201       Regexp** subs = re->sub();
    202       for (int i = 0; i < re->nsub_; i++) {
    203         Regexp* sub = subs[i];
    204         Regexp* newsub = child_args[i];
    205         if (newsub != sub) {
    206           changed = true;
    207           break;
    208         }
    209       }
    210       if (!changed) {
    211         for (int i = 0; i < re->nsub_; i++) {
    212           Regexp* newsub = child_args[i];
    213           newsub->Decref();
    214         }
    215         re->simple_ = true;
    216         return re->Incref();
    217       }
    218       Regexp* nre = new Regexp(re->op(), re->parse_flags());
    219       nre->AllocSub(re->nsub_);
    220       Regexp** nre_subs = nre->sub();
    221       for (int i = 0; i <re->nsub_; i++)
    222         nre_subs[i] = child_args[i];
    223       nre->simple_ = true;
    224       return nre;
    225     }
    226 
    227     case kRegexpCapture: {
    228       Regexp* newsub = child_args[0];
    229       if (newsub == re->sub()[0]) {
    230         newsub->Decref();
    231         re->simple_ = true;
    232         return re->Incref();
    233       }
    234       Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
    235       nre->AllocSub(1);
    236       nre->sub()[0] = newsub;
    237       nre->cap_ = re->cap_;
    238       nre->simple_ = true;
    239       return nre;
    240     }
    241 
    242     case kRegexpStar:
    243     case kRegexpPlus:
    244     case kRegexpQuest: {
    245       Regexp* newsub = child_args[0];
    246       // Special case: repeat the empty string as much as
    247       // you want, but it's still the empty string.
    248       if (newsub->op() == kRegexpEmptyMatch)
    249         return newsub;
    250 
    251       // These are simple as long as the subpiece is simple.
    252       if (newsub == re->sub()[0]) {
    253         newsub->Decref();
    254         re->simple_ = true;
    255         return re->Incref();
    256       }
    257 
    258       // These are also idempotent if flags are constant.
    259       if (re->op() == newsub->op() &&
    260           re->parse_flags() == newsub->parse_flags())
    261         return newsub;
    262 
    263       Regexp* nre = new Regexp(re->op(), re->parse_flags());
    264       nre->AllocSub(1);
    265       nre->sub()[0] = newsub;
    266       nre->simple_ = true;
    267       return nre;
    268     }
    269 
    270     case kRegexpRepeat: {
    271       Regexp* newsub = child_args[0];
    272       // Special case: repeat the empty string as much as
    273       // you want, but it's still the empty string.
    274       if (newsub->op() == kRegexpEmptyMatch)
    275         return newsub;
    276 
    277       Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
    278                                    re->parse_flags());
    279       newsub->Decref();
    280       nre->simple_ = true;
    281       return nre;
    282     }
    283 
    284     case kRegexpCharClass: {
    285       Regexp* nre = SimplifyCharClass(re);
    286       nre->simple_ = true;
    287       return nre;
    288     }
    289   }
    290 
    291   LOG(ERROR) << "Simplify case not handled: " << re->op();
    292   return re->Incref();
    293 }
    294 
    295 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
    296 // Returns a new Regexp, handing the ref to the caller.
    297 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
    298                                 Regexp::ParseFlags parse_flags) {
    299   Regexp* re = new Regexp(kRegexpConcat, parse_flags);
    300   re->AllocSub(2);
    301   Regexp** subs = re->sub();
    302   subs[0] = re1;
    303   subs[1] = re2;
    304   return re;
    305 }
    306 
    307 // Simplifies the expression re{min,max} in terms of *, +, and ?.
    308 // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
    309 // Caller must Decref return value when done with it.
    310 // The result will *not* necessarily have the right capturing parens
    311 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
    312 // but in the Regexp* representation, both (x) are marked as $1.
    313 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
    314                                        Regexp::ParseFlags f) {
    315   // x{n,} means at least n matches of x.
    316   if (max == -1) {
    317     // Special case: x{0,} is x*
    318     if (min == 0)
    319       return Regexp::Star(re->Incref(), f);
    320 
    321     // Special case: x{1,} is x+
    322     if (min == 1)
    323       return Regexp::Plus(re->Incref(), f);
    324 
    325     // General case: x{4,} is xxxx+
    326     Regexp* nre = new Regexp(kRegexpConcat, f);
    327     nre->AllocSub(min);
    328     VLOG(1) << "Simplify " << min;
    329     Regexp** nre_subs = nre->sub();
    330     for (int i = 0; i < min-1; i++)
    331       nre_subs[i] = re->Incref();
    332     nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
    333     return nre;
    334   }
    335 
    336   // Special case: (x){0} matches only empty string.
    337   if (min == 0 && max == 0)
    338     return new Regexp(kRegexpEmptyMatch, f);
    339 
    340   // Special case: x{1} is just x.
    341   if (min == 1 && max == 1)
    342     return re->Incref();
    343 
    344   // General case: x{n,m} means n copies of x and m copies of x?.
    345   // The machine will do less work if we nest the final m copies,
    346   // so that x{2,5} = xx(x(x(x)?)?)?
    347 
    348   // Build leading prefix: xx.  Capturing only on the last one.
    349   Regexp* nre = NULL;
    350   if (min > 0) {
    351     nre = new Regexp(kRegexpConcat, f);
    352     nre->AllocSub(min);
    353     Regexp** nre_subs = nre->sub();
    354     for (int i = 0; i < min; i++)
    355       nre_subs[i] = re->Incref();
    356   }
    357 
    358   // Build and attach suffix: (x(x(x)?)?)?
    359   if (max > min) {
    360     Regexp* suf = Regexp::Quest(re->Incref(), f);
    361     for (int i = min+1; i < max; i++)
    362       suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
    363     if (nre == NULL)
    364       nre = suf;
    365     else
    366       nre = Concat2(nre, suf, f);
    367   }
    368 
    369   if (nre == NULL) {
    370     // Some degenerate case, like min > max, or min < max < 0.
    371     // This shouldn't happen, because the parser rejects such regexps.
    372     LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
    373     return new Regexp(kRegexpNoMatch, f);
    374   }
    375 
    376   return nre;
    377 }
    378 
    379 // Simplifies a character class.
    380 // Caller must Decref return value when done with it.
    381 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
    382   CharClass* cc = re->cc();
    383 
    384   // Special cases
    385   if (cc->empty())
    386     return new Regexp(kRegexpNoMatch, re->parse_flags());
    387   if (cc->full())
    388     return new Regexp(kRegexpAnyChar, re->parse_flags());
    389 
    390   return re->Incref();
    391 }
    392 
    393 }  // namespace re2
    394