Home | History | Annotate | Download | only in re2
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Determine whether this library should match PCRE exactly
      6 // for a particular Regexp.  (If so, the testing framework can
      7 // check that it does.)
      8 //
      9 // This library matches PCRE except in these cases:
     10 //   * the regexp contains a repetition of an empty string,
     11 //     like (a*)* or (a*)+.  In this case, PCRE will treat
     12 //     the repetition sequence as ending with an empty string,
     13 //     while this library does not.
     14 //   * Perl and PCRE differ on whether \v matches \n.
     15 //     For historical reasons, this library implements the Perl behavior.
     16 //   * Perl and PCRE allow $ in one-line mode to match either the very
     17 //     end of the text or just before a \n at the end of the text.
     18 //     This library requires it to match only the end of the text.
     19 //   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
     20 //     match the end of the text if the last character is a \n.
     21 //     This library does allow it.
     22 //
     23 // Regexp::MimicsPCRE checks for any of these conditions.
     24 
     25 #include "util/util.h"
     26 #include "re2/regexp.h"
     27 #include "re2/walker-inl.h"
     28 
     29 namespace re2 {
     30 
     31 // Returns whether re might match an empty string.
     32 static bool CanBeEmptyString(Regexp *re);
     33 
     34 // Walker class to compute whether library handles a regexp
     35 // exactly as PCRE would.  See comment at top for conditions.
     36 
     37 class PCREWalker : public Regexp::Walker<bool> {
     38  public:
     39   PCREWalker() {}
     40   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
     41                  int nchild_args);
     42 
     43   bool ShortVisit(Regexp* re, bool a) {
     44     // Should never be called: we use Walk not WalkExponential.
     45     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
     46     return a;
     47   }
     48 };
     49 
     50 // Called after visiting each of re's children and accumulating
     51 // the return values in child_args.  So child_args contains whether
     52 // this library mimics PCRE for those subexpressions.
     53 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
     54                            bool* child_args, int nchild_args) {
     55   // If children failed, so do we.
     56   for (int i = 0; i < nchild_args; i++)
     57     if (!child_args[i])
     58       return false;
     59 
     60   // Otherwise look for other reasons to fail.
     61   switch (re->op()) {
     62     // Look for repeated empty string.
     63     case kRegexpStar:
     64     case kRegexpPlus:
     65     case kRegexpQuest:
     66       if (CanBeEmptyString(re->sub()[0]))
     67         return false;
     68       break;
     69     case kRegexpRepeat:
     70       if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
     71         return false;
     72       break;
     73 
     74     // Look for \v
     75     case kRegexpLiteral:
     76       if (re->rune() == '\v')
     77         return false;
     78       break;
     79 
     80     // Look for $ in single-line mode.
     81     case kRegexpEndText:
     82     case kRegexpEmptyMatch:
     83       if (re->parse_flags() & Regexp::WasDollar)
     84         return false;
     85       break;
     86 
     87     // Look for ^ in multi-line mode.
     88     case kRegexpBeginLine:
     89       // No condition: in single-line mode ^ becomes kRegexpBeginText.
     90       return false;
     91 
     92     default:
     93       break;
     94   }
     95 
     96   // Not proven guilty.
     97   return true;
     98 }
     99 
    100 // Returns whether this regexp's behavior will mimic PCRE's exactly.
    101 bool Regexp::MimicsPCRE() {
    102   PCREWalker w;
    103   return w.Walk(this, true);
    104 }
    105 
    106 
    107 // Walker class to compute whether a Regexp can match an empty string.
    108 // It is okay to overestimate.  For example, \b\B cannot match an empty
    109 // string, because \b and \B are mutually exclusive, but this isn't
    110 // that smart and will say it can.  Spurious empty strings
    111 // will reduce the number of regexps we sanity check against PCRE,
    112 // but they won't break anything.
    113 
    114 class EmptyStringWalker : public Regexp::Walker<bool> {
    115  public:
    116   EmptyStringWalker() { }
    117   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
    118                  bool* child_args, int nchild_args);
    119 
    120   bool ShortVisit(Regexp* re, bool a) {
    121     // Should never be called: we use Walk not WalkExponential.
    122     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
    123     return a;
    124   }
    125 
    126  private:
    127   DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
    128 };
    129 
    130 // Called after visiting re's children.  child_args contains the return
    131 // value from each of the children's PostVisits (i.e., whether each child
    132 // can match an empty string).  Returns whether this clause can match an
    133 // empty string.
    134 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
    135                                   bool* child_args, int nchild_args) {
    136   switch (re->op()) {
    137     case kRegexpNoMatch:               // never empty
    138     case kRegexpLiteral:
    139     case kRegexpAnyChar:
    140     case kRegexpAnyByte:
    141     case kRegexpCharClass:
    142     case kRegexpLiteralString:
    143       return false;
    144 
    145     case kRegexpEmptyMatch:            // always empty
    146     case kRegexpBeginLine:             // always empty, when they match
    147     case kRegexpEndLine:
    148     case kRegexpNoWordBoundary:
    149     case kRegexpWordBoundary:
    150     case kRegexpBeginText:
    151     case kRegexpEndText:
    152     case kRegexpStar:                  // can always be empty
    153     case kRegexpQuest:
    154     case kRegexpHaveMatch:
    155       return true;
    156 
    157     case kRegexpConcat:                // can be empty if all children can
    158       for (int i = 0; i < nchild_args; i++)
    159         if (!child_args[i])
    160           return false;
    161       return true;
    162 
    163     case kRegexpAlternate:             // can be empty if any child can
    164       for (int i = 0; i < nchild_args; i++)
    165         if (child_args[i])
    166           return true;
    167       return false;
    168 
    169     case kRegexpPlus:                  // can be empty if the child can
    170     case kRegexpCapture:
    171       return child_args[0];
    172 
    173     case kRegexpRepeat:                // can be empty if child can or is x{0}
    174       return child_args[0] || re->min() == 0;
    175   }
    176   return false;
    177 }
    178 
    179 // Returns whether re can match an empty string.
    180 static bool CanBeEmptyString(Regexp* re) {
    181   EmptyStringWalker w;
    182   return w.Walk(re, true);
    183 }
    184 
    185 }  // namespace re2
    186