1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Rewrite POSIX and other features in re 6 // to use simple extended regular expression features. 7 // Also sort and simplify character classes. 8 9 #include "util/util.h" 10 #include "re2/regexp.h" 11 #include "re2/walker-inl.h" 12 13 namespace re2 { 14 15 // Parses the regexp src and then simplifies it and sets *dst to the 16 // string representation of the simplified form. Returns true on success. 17 // Returns false and sets *error (if error != NULL) on error. 18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, 19 string* dst, 20 RegexpStatus* status) { 21 Regexp* re = Parse(src, flags, status); 22 if (re == NULL) 23 return false; 24 Regexp* sre = re->Simplify(); 25 re->Decref(); 26 if (sre == NULL) { 27 // Should not happen, since Simplify never fails. 28 LOG(ERROR) << "Simplify failed on " << src; 29 if (status) { 30 status->set_code(kRegexpInternalError); 31 status->set_error_arg(src); 32 } 33 return false; 34 } 35 *dst = sre->ToString(); 36 sre->Decref(); 37 return true; 38 } 39 40 // Assuming the simple_ flags on the children are accurate, 41 // is this Regexp* simple? 42 bool Regexp::ComputeSimple() { 43 Regexp** subs; 44 switch (op_) { 45 case kRegexpNoMatch: 46 case kRegexpEmptyMatch: 47 case kRegexpLiteral: 48 case kRegexpLiteralString: 49 case kRegexpBeginLine: 50 case kRegexpEndLine: 51 case kRegexpBeginText: 52 case kRegexpWordBoundary: 53 case kRegexpNoWordBoundary: 54 case kRegexpEndText: 55 case kRegexpAnyChar: 56 case kRegexpAnyByte: 57 case kRegexpHaveMatch: 58 return true; 59 case kRegexpConcat: 60 case kRegexpAlternate: 61 // These are simple as long as the subpieces are simple. 62 subs = sub(); 63 for (int i = 0; i < nsub_; i++) 64 if (!subs[i]->simple_) 65 return false; 66 return true; 67 case kRegexpCharClass: 68 // Simple as long as the char class is not empty, not full. 69 if (ccb_ != NULL) 70 return !ccb_->empty() && !ccb_->full(); 71 return !cc_->empty() && !cc_->full(); 72 case kRegexpCapture: 73 subs = sub(); 74 return subs[0]->simple_; 75 case kRegexpStar: 76 case kRegexpPlus: 77 case kRegexpQuest: 78 subs = sub(); 79 if (!subs[0]->simple_) 80 return false; 81 switch (subs[0]->op_) { 82 case kRegexpStar: 83 case kRegexpPlus: 84 case kRegexpQuest: 85 case kRegexpEmptyMatch: 86 case kRegexpNoMatch: 87 return false; 88 default: 89 break; 90 } 91 return true; 92 case kRegexpRepeat: 93 return false; 94 } 95 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; 96 return false; 97 } 98 99 // Walker subclass used by Simplify. 100 // The simplify walk is purely post-recursive: given the simplified children, 101 // PostVisit creates the simplified result. 102 // The child_args are simplified Regexp*s. 103 class SimplifyWalker : public Regexp::Walker<Regexp*> { 104 public: 105 SimplifyWalker() {} 106 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); 107 virtual Regexp* PostVisit(Regexp* re, 108 Regexp* parent_arg, 109 Regexp* pre_arg, 110 Regexp** child_args, int nchild_args); 111 virtual Regexp* Copy(Regexp* re); 112 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); 113 114 private: 115 // These functions are declared inside SimplifyWalker so that 116 // they can edit the private fields of the Regexps they construct. 117 118 // Creates a concatenation of two Regexp, consuming refs to re1 and re2. 119 // Caller must Decref return value when done with it. 120 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); 121 122 // Simplifies the expression re{min,max} in terms of *, +, and ?. 123 // Returns a new regexp. Does not edit re. Does not consume reference to re. 124 // Caller must Decref return value when done with it. 125 static Regexp* SimplifyRepeat(Regexp* re, int min, int max, 126 Regexp::ParseFlags parse_flags); 127 128 // Simplifies a character class by expanding any named classes 129 // into rune ranges. Does not edit re. Does not consume ref to re. 130 // Caller must Decref return value when done with it. 131 static Regexp* SimplifyCharClass(Regexp* re); 132 133 DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); 134 }; 135 136 // Simplifies a regular expression, returning a new regexp. 137 // The new regexp uses traditional Unix egrep features only, 138 // plus the Perl (?:) non-capturing parentheses. 139 // Otherwise, no POSIX or Perl additions. The new regexp 140 // captures exactly the same subexpressions (with the same indices) 141 // as the original. 142 // Does not edit current object. 143 // Caller must Decref() return value when done with it. 144 145 Regexp* Regexp::Simplify() { 146 if (simple_) 147 return Incref(); 148 SimplifyWalker w; 149 return w.Walk(this, NULL); 150 } 151 152 #define Simplify DontCallSimplify // Avoid accidental recursion 153 154 Regexp* SimplifyWalker::Copy(Regexp* re) { 155 return re->Incref(); 156 } 157 158 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { 159 // This should never be called, since we use Walk and not 160 // WalkExponential. 161 LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; 162 return re->Incref(); 163 } 164 165 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { 166 if (re->simple_) { 167 *stop = true; 168 return re->Incref(); 169 } 170 return NULL; 171 } 172 173 Regexp* SimplifyWalker::PostVisit(Regexp* re, 174 Regexp* parent_arg, 175 Regexp* pre_arg, 176 Regexp** child_args, 177 int nchild_args) { 178 switch (re->op()) { 179 case kRegexpNoMatch: 180 case kRegexpEmptyMatch: 181 case kRegexpLiteral: 182 case kRegexpLiteralString: 183 case kRegexpBeginLine: 184 case kRegexpEndLine: 185 case kRegexpBeginText: 186 case kRegexpWordBoundary: 187 case kRegexpNoWordBoundary: 188 case kRegexpEndText: 189 case kRegexpAnyChar: 190 case kRegexpAnyByte: 191 case kRegexpHaveMatch: 192 // All these are always simple. 193 re->simple_ = true; 194 return re->Incref(); 195 196 case kRegexpConcat: 197 case kRegexpAlternate: { 198 // These are simple as long as the subpieces are simple. 199 // Two passes to avoid allocation in the common case. 200 bool changed = false; 201 Regexp** subs = re->sub(); 202 for (int i = 0; i < re->nsub_; i++) { 203 Regexp* sub = subs[i]; 204 Regexp* newsub = child_args[i]; 205 if (newsub != sub) { 206 changed = true; 207 break; 208 } 209 } 210 if (!changed) { 211 for (int i = 0; i < re->nsub_; i++) { 212 Regexp* newsub = child_args[i]; 213 newsub->Decref(); 214 } 215 re->simple_ = true; 216 return re->Incref(); 217 } 218 Regexp* nre = new Regexp(re->op(), re->parse_flags()); 219 nre->AllocSub(re->nsub_); 220 Regexp** nre_subs = nre->sub(); 221 for (int i = 0; i <re->nsub_; i++) 222 nre_subs[i] = child_args[i]; 223 nre->simple_ = true; 224 return nre; 225 } 226 227 case kRegexpCapture: { 228 Regexp* newsub = child_args[0]; 229 if (newsub == re->sub()[0]) { 230 newsub->Decref(); 231 re->simple_ = true; 232 return re->Incref(); 233 } 234 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); 235 nre->AllocSub(1); 236 nre->sub()[0] = newsub; 237 nre->cap_ = re->cap_; 238 nre->simple_ = true; 239 return nre; 240 } 241 242 case kRegexpStar: 243 case kRegexpPlus: 244 case kRegexpQuest: { 245 Regexp* newsub = child_args[0]; 246 // Special case: repeat the empty string as much as 247 // you want, but it's still the empty string. 248 if (newsub->op() == kRegexpEmptyMatch) 249 return newsub; 250 251 // These are simple as long as the subpiece is simple. 252 if (newsub == re->sub()[0]) { 253 newsub->Decref(); 254 re->simple_ = true; 255 return re->Incref(); 256 } 257 258 // These are also idempotent if flags are constant. 259 if (re->op() == newsub->op() && 260 re->parse_flags() == newsub->parse_flags()) 261 return newsub; 262 263 Regexp* nre = new Regexp(re->op(), re->parse_flags()); 264 nre->AllocSub(1); 265 nre->sub()[0] = newsub; 266 nre->simple_ = true; 267 return nre; 268 } 269 270 case kRegexpRepeat: { 271 Regexp* newsub = child_args[0]; 272 // Special case: repeat the empty string as much as 273 // you want, but it's still the empty string. 274 if (newsub->op() == kRegexpEmptyMatch) 275 return newsub; 276 277 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, 278 re->parse_flags()); 279 newsub->Decref(); 280 nre->simple_ = true; 281 return nre; 282 } 283 284 case kRegexpCharClass: { 285 Regexp* nre = SimplifyCharClass(re); 286 nre->simple_ = true; 287 return nre; 288 } 289 } 290 291 LOG(ERROR) << "Simplify case not handled: " << re->op(); 292 return re->Incref(); 293 } 294 295 // Creates a concatenation of two Regexp, consuming refs to re1 and re2. 296 // Returns a new Regexp, handing the ref to the caller. 297 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, 298 Regexp::ParseFlags parse_flags) { 299 Regexp* re = new Regexp(kRegexpConcat, parse_flags); 300 re->AllocSub(2); 301 Regexp** subs = re->sub(); 302 subs[0] = re1; 303 subs[1] = re2; 304 return re; 305 } 306 307 // Simplifies the expression re{min,max} in terms of *, +, and ?. 308 // Returns a new regexp. Does not edit re. Does not consume reference to re. 309 // Caller must Decref return value when done with it. 310 // The result will *not* necessarily have the right capturing parens 311 // if you call ToString() and re-parse it: (x){2} becomes (x)(x), 312 // but in the Regexp* representation, both (x) are marked as $1. 313 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, 314 Regexp::ParseFlags f) { 315 // x{n,} means at least n matches of x. 316 if (max == -1) { 317 // Special case: x{0,} is x* 318 if (min == 0) 319 return Regexp::Star(re->Incref(), f); 320 321 // Special case: x{1,} is x+ 322 if (min == 1) 323 return Regexp::Plus(re->Incref(), f); 324 325 // General case: x{4,} is xxxx+ 326 Regexp* nre = new Regexp(kRegexpConcat, f); 327 nre->AllocSub(min); 328 VLOG(1) << "Simplify " << min; 329 Regexp** nre_subs = nre->sub(); 330 for (int i = 0; i < min-1; i++) 331 nre_subs[i] = re->Incref(); 332 nre_subs[min-1] = Regexp::Plus(re->Incref(), f); 333 return nre; 334 } 335 336 // Special case: (x){0} matches only empty string. 337 if (min == 0 && max == 0) 338 return new Regexp(kRegexpEmptyMatch, f); 339 340 // Special case: x{1} is just x. 341 if (min == 1 && max == 1) 342 return re->Incref(); 343 344 // General case: x{n,m} means n copies of x and m copies of x?. 345 // The machine will do less work if we nest the final m copies, 346 // so that x{2,5} = xx(x(x(x)?)?)? 347 348 // Build leading prefix: xx. Capturing only on the last one. 349 Regexp* nre = NULL; 350 if (min > 0) { 351 nre = new Regexp(kRegexpConcat, f); 352 nre->AllocSub(min); 353 Regexp** nre_subs = nre->sub(); 354 for (int i = 0; i < min; i++) 355 nre_subs[i] = re->Incref(); 356 } 357 358 // Build and attach suffix: (x(x(x)?)?)? 359 if (max > min) { 360 Regexp* suf = Regexp::Quest(re->Incref(), f); 361 for (int i = min+1; i < max; i++) 362 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); 363 if (nre == NULL) 364 nre = suf; 365 else 366 nre = Concat2(nre, suf, f); 367 } 368 369 if (nre == NULL) { 370 // Some degenerate case, like min > max, or min < max < 0. 371 // This shouldn't happen, because the parser rejects such regexps. 372 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; 373 return new Regexp(kRegexpNoMatch, f); 374 } 375 376 return nre; 377 } 378 379 // Simplifies a character class. 380 // Caller must Decref return value when done with it. 381 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { 382 CharClass* cc = re->cc(); 383 384 // Special cases 385 if (cc->empty()) 386 return new Regexp(kRegexpNoMatch, re->parse_flags()); 387 if (cc->full()) 388 return new Regexp(kRegexpAnyChar, re->parse_flags()); 389 390 return re->Incref(); 391 } 392 393 } // namespace re2 394