1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Determine whether this library should match PCRE exactly 6 // for a particular Regexp. (If so, the testing framework can 7 // check that it does.) 8 // 9 // This library matches PCRE except in these cases: 10 // * the regexp contains a repetition of an empty string, 11 // like (a*)* or (a*)+. In this case, PCRE will treat 12 // the repetition sequence as ending with an empty string, 13 // while this library does not. 14 // * Perl and PCRE differ on whether \v matches \n. 15 // For historical reasons, this library implements the Perl behavior. 16 // * Perl and PCRE allow $ in one-line mode to match either the very 17 // end of the text or just before a \n at the end of the text. 18 // This library requires it to match only the end of the text. 19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to 20 // match the end of the text if the last character is a \n. 21 // This library does allow it. 22 // 23 // Regexp::MimicsPCRE checks for any of these conditions. 24 25 #include "util/util.h" 26 #include "re2/regexp.h" 27 #include "re2/walker-inl.h" 28 29 namespace re2 { 30 31 // Returns whether re might match an empty string. 32 static bool CanBeEmptyString(Regexp *re); 33 34 // Walker class to compute whether library handles a regexp 35 // exactly as PCRE would. See comment at top for conditions. 36 37 class PCREWalker : public Regexp::Walker<bool> { 38 public: 39 PCREWalker() {} 40 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, 41 int nchild_args); 42 43 bool ShortVisit(Regexp* re, bool a) { 44 // Should never be called: we use Walk not WalkExponential. 45 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; 46 return a; 47 } 48 }; 49 50 // Called after visiting each of re's children and accumulating 51 // the return values in child_args. So child_args contains whether 52 // this library mimics PCRE for those subexpressions. 53 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, 54 bool* child_args, int nchild_args) { 55 // If children failed, so do we. 56 for (int i = 0; i < nchild_args; i++) 57 if (!child_args[i]) 58 return false; 59 60 // Otherwise look for other reasons to fail. 61 switch (re->op()) { 62 // Look for repeated empty string. 63 case kRegexpStar: 64 case kRegexpPlus: 65 case kRegexpQuest: 66 if (CanBeEmptyString(re->sub()[0])) 67 return false; 68 break; 69 case kRegexpRepeat: 70 if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) 71 return false; 72 break; 73 74 // Look for \v 75 case kRegexpLiteral: 76 if (re->rune() == '\v') 77 return false; 78 break; 79 80 // Look for $ in single-line mode. 81 case kRegexpEndText: 82 case kRegexpEmptyMatch: 83 if (re->parse_flags() & Regexp::WasDollar) 84 return false; 85 break; 86 87 // Look for ^ in multi-line mode. 88 case kRegexpBeginLine: 89 // No condition: in single-line mode ^ becomes kRegexpBeginText. 90 return false; 91 92 default: 93 break; 94 } 95 96 // Not proven guilty. 97 return true; 98 } 99 100 // Returns whether this regexp's behavior will mimic PCRE's exactly. 101 bool Regexp::MimicsPCRE() { 102 PCREWalker w; 103 return w.Walk(this, true); 104 } 105 106 107 // Walker class to compute whether a Regexp can match an empty string. 108 // It is okay to overestimate. For example, \b\B cannot match an empty 109 // string, because \b and \B are mutually exclusive, but this isn't 110 // that smart and will say it can. Spurious empty strings 111 // will reduce the number of regexps we sanity check against PCRE, 112 // but they won't break anything. 113 114 class EmptyStringWalker : public Regexp::Walker<bool> { 115 public: 116 EmptyStringWalker() { } 117 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, 118 bool* child_args, int nchild_args); 119 120 bool ShortVisit(Regexp* re, bool a) { 121 // Should never be called: we use Walk not WalkExponential. 122 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; 123 return a; 124 } 125 126 private: 127 DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker); 128 }; 129 130 // Called after visiting re's children. child_args contains the return 131 // value from each of the children's PostVisits (i.e., whether each child 132 // can match an empty string). Returns whether this clause can match an 133 // empty string. 134 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, 135 bool* child_args, int nchild_args) { 136 switch (re->op()) { 137 case kRegexpNoMatch: // never empty 138 case kRegexpLiteral: 139 case kRegexpAnyChar: 140 case kRegexpAnyByte: 141 case kRegexpCharClass: 142 case kRegexpLiteralString: 143 return false; 144 145 case kRegexpEmptyMatch: // always empty 146 case kRegexpBeginLine: // always empty, when they match 147 case kRegexpEndLine: 148 case kRegexpNoWordBoundary: 149 case kRegexpWordBoundary: 150 case kRegexpBeginText: 151 case kRegexpEndText: 152 case kRegexpStar: // can always be empty 153 case kRegexpQuest: 154 case kRegexpHaveMatch: 155 return true; 156 157 case kRegexpConcat: // can be empty if all children can 158 for (int i = 0; i < nchild_args; i++) 159 if (!child_args[i]) 160 return false; 161 return true; 162 163 case kRegexpAlternate: // can be empty if any child can 164 for (int i = 0; i < nchild_args; i++) 165 if (child_args[i]) 166 return true; 167 return false; 168 169 case kRegexpPlus: // can be empty if the child can 170 case kRegexpCapture: 171 return child_args[0]; 172 173 case kRegexpRepeat: // can be empty if child can or is x{0} 174 return child_args[0] || re->min() == 0; 175 } 176 return false; 177 } 178 179 // Returns whether re can match an empty string. 180 static bool CanBeEmptyString(Regexp* re) { 181 EmptyStringWalker w; 182 return w.Walk(re, true); 183 } 184 185 } // namespace re2 186