1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "util/util.h" 6 #include "re2/prefilter.h" 7 #include "re2/re2.h" 8 #include "re2/unicode_casefold.h" 9 #include "re2/walker-inl.h" 10 11 namespace re2 { 12 13 static const int Trace = false; 14 15 typedef set<string>::iterator SSIter; 16 typedef set<string>::const_iterator ConstSSIter; 17 18 static int alloc_id = 100000; // Used for debugging. 19 // Initializes a Prefilter, allocating subs_ as necessary. 20 Prefilter::Prefilter(Op op) { 21 op_ = op; 22 subs_ = NULL; 23 if (op_ == AND || op_ == OR) 24 subs_ = new vector<Prefilter*>; 25 26 alloc_id_ = alloc_id++; 27 VLOG(10) << "alloc_id: " << alloc_id_; 28 } 29 30 // Destroys a Prefilter. 31 Prefilter::~Prefilter() { 32 VLOG(10) << "Deleted: " << alloc_id_; 33 if (subs_) { 34 for (int i = 0; i < subs_->size(); i++) 35 delete (*subs_)[i]; 36 delete subs_; 37 subs_ = NULL; 38 } 39 } 40 41 // Simplify if the node is an empty Or or And. 42 Prefilter* Prefilter::Simplify() { 43 if (op_ != AND && op_ != OR) { 44 return this; 45 } 46 47 // Nothing left in the AND/OR. 48 if (subs_->size() == 0) { 49 if (op_ == AND) 50 op_ = ALL; // AND of nothing is true 51 else 52 op_ = NONE; // OR of nothing is false 53 54 return this; 55 } 56 57 // Just one subnode: throw away wrapper. 58 if (subs_->size() == 1) { 59 Prefilter* a = (*subs_)[0]; 60 subs_->clear(); 61 delete this; 62 return a->Simplify(); 63 } 64 65 return this; 66 } 67 68 // Combines two Prefilters together to create an "op" (AND or OR). 69 // The passed Prefilters will be part of the returned Prefilter or deleted. 70 // Does lots of work to avoid creating unnecessarily complicated structures. 71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { 72 // If a, b can be rewritten as op, do so. 73 a = a->Simplify(); 74 b = b->Simplify(); 75 76 // Canonicalize: a->op <= b->op. 77 if (a->op() > b->op()) { 78 Prefilter* t = a; 79 a = b; 80 b = t; 81 } 82 83 // Trivial cases. 84 // ALL AND b = b 85 // NONE OR b = b 86 // ALL OR b = ALL 87 // NONE AND b = NONE 88 // Don't need to look at b, because of canonicalization above. 89 // ALL and NONE are smallest opcodes. 90 if (a->op() == ALL || a->op() == NONE) { 91 if ((a->op() == ALL && op == AND) || 92 (a->op() == NONE && op == OR)) { 93 delete a; 94 return b; 95 } else { 96 delete b; 97 return a; 98 } 99 } 100 101 // If a and b match op, merge their contents. 102 if (a->op() == op && b->op() == op) { 103 for (int i = 0; i < b->subs()->size(); i++) { 104 Prefilter* bb = (*b->subs())[i]; 105 a->subs()->push_back(bb); 106 } 107 b->subs()->clear(); 108 delete b; 109 return a; 110 } 111 112 // If a already has the same op as the op that is under construction 113 // add in b (similarly if b already has the same op, add in a). 114 if (b->op() == op) { 115 Prefilter* t = a; 116 a = b; 117 b = t; 118 } 119 if (a->op() == op) { 120 a->subs()->push_back(b); 121 return a; 122 } 123 124 // Otherwise just return the op. 125 Prefilter* c = new Prefilter(op); 126 c->subs()->push_back(a); 127 c->subs()->push_back(b); 128 return c; 129 } 130 131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { 132 return AndOr(AND, a, b); 133 } 134 135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { 136 return AndOr(OR, a, b); 137 } 138 139 static void SimplifyStringSet(set<string> *ss) { 140 // Now make sure that the strings aren't redundant. For example, if 141 // we know "ab" is a required string, then it doesn't help at all to 142 // know that "abc" is also a required string, so delete "abc". This 143 // is because, when we are performing a string search to filter 144 // regexps, matching ab will already allow this regexp to be a 145 // candidate for match, so further matching abc is redundant. 146 147 for (SSIter i = ss->begin(); i != ss->end(); ++i) { 148 SSIter j = i; 149 ++j; 150 while (j != ss->end()) { 151 // Increment j early so that we can erase the element it points to. 152 SSIter old_j = j; 153 ++j; 154 if (old_j->find(*i) != string::npos) 155 ss->erase(old_j); 156 } 157 } 158 } 159 160 Prefilter* Prefilter::OrStrings(set<string>* ss) { 161 SimplifyStringSet(ss); 162 Prefilter* or_prefilter = NULL; 163 if (!ss->empty()) { 164 or_prefilter = new Prefilter(NONE); 165 for (SSIter i = ss->begin(); i != ss->end(); ++i) 166 or_prefilter = Or(or_prefilter, FromString(*i)); 167 } 168 return or_prefilter; 169 } 170 171 static Rune ToLowerRune(Rune r) { 172 if (r < Runeself) { 173 if ('A' <= r && r <= 'Z') 174 r += 'a' - 'A'; 175 return r; 176 } 177 178 CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); 179 if (f == NULL || r < f->lo) 180 return r; 181 return ApplyFold(f, r); 182 } 183 184 Prefilter* Prefilter::FromString(const string& str) { 185 Prefilter* m = new Prefilter(Prefilter::ATOM); 186 m->atom_ = str; 187 return m; 188 } 189 190 // Information about a regexp used during computation of Prefilter. 191 // Can be thought of as information about the set of strings matching 192 // the given regular expression. 193 class Prefilter::Info { 194 public: 195 Info(); 196 ~Info(); 197 198 // More constructors. They delete their Info* arguments. 199 static Info* Alt(Info* a, Info* b); 200 static Info* Concat(Info* a, Info* b); 201 static Info* And(Info* a, Info* b); 202 static Info* Star(Info* a); 203 static Info* Plus(Info* a); 204 static Info* Quest(Info* a); 205 static Info* EmptyString(); 206 static Info* NoMatch(); 207 static Info* AnyChar(); 208 static Info* CClass(CharClass* cc); 209 static Info* Literal(Rune r); 210 static Info* AnyMatch(); 211 212 // Format Info as a string. 213 string ToString(); 214 215 // Caller takes ownership of the Prefilter. 216 Prefilter* TakeMatch(); 217 218 set<string>& exact() { return exact_; } 219 220 bool is_exact() const { return is_exact_; } 221 222 class Walker; 223 224 private: 225 set<string> exact_; 226 227 // When is_exact_ is true, the strings that match 228 // are placed in exact_. When it is no longer an exact 229 // set of strings that match this RE, then is_exact_ 230 // is false and the match_ contains the required match 231 // criteria. 232 bool is_exact_; 233 234 // Accumulated Prefilter query that any 235 // match for this regexp is guaranteed to match. 236 Prefilter* match_; 237 }; 238 239 240 Prefilter::Info::Info() 241 : is_exact_(false), 242 match_(NULL) { 243 } 244 245 Prefilter::Info::~Info() { 246 delete match_; 247 } 248 249 Prefilter* Prefilter::Info::TakeMatch() { 250 if (is_exact_) { 251 match_ = Prefilter::OrStrings(&exact_); 252 is_exact_ = false; 253 } 254 Prefilter* m = match_; 255 match_ = NULL; 256 return m; 257 } 258 259 // Format a Info in string form. 260 string Prefilter::Info::ToString() { 261 if (this == NULL) { 262 // Sometimes when iterating on children of a node, 263 // some children might have NULL Info. Adding 264 // the check here for NULL to take care of cases where 265 // the caller is not checking. 266 return ""; 267 } 268 269 if (is_exact_) { 270 int n = 0; 271 string s; 272 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) { 273 if (n++ > 0) 274 s += ","; 275 s += *i; 276 } 277 return s; 278 } 279 280 if (match_) 281 return match_->DebugString(); 282 283 return ""; 284 } 285 286 // Add the strings from src to dst. 287 static void CopyIn(const set<string>& src, set<string>* dst) { 288 for (ConstSSIter i = src.begin(); i != src.end(); ++i) 289 dst->insert(*i); 290 } 291 292 // Add the cross-product of a and b to dst. 293 // (For each string i in a and j in b, add i+j.) 294 static void CrossProduct(const set<string>& a, 295 const set<string>& b, 296 set<string>* dst) { 297 for (ConstSSIter i = a.begin(); i != a.end(); ++i) 298 for (ConstSSIter j = b.begin(); j != b.end(); ++j) 299 dst->insert(*i + *j); 300 } 301 302 // Concats a and b. Requires that both are exact sets. 303 // Forms an exact set that is a crossproduct of a and b. 304 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) { 305 if (a == NULL) 306 return b; 307 DCHECK(a->is_exact_); 308 DCHECK(b && b->is_exact_); 309 Info *ab = new Info(); 310 311 CrossProduct(a->exact_, b->exact_, &ab->exact_); 312 ab->is_exact_ = true; 313 314 delete a; 315 delete b; 316 return ab; 317 } 318 319 // Constructs an inexact Info for ab given a and b. 320 // Used only when a or b is not exact or when the 321 // exact cross product is likely to be too big. 322 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) { 323 if (a == NULL) 324 return b; 325 if (b == NULL) 326 return a; 327 328 Info *ab = new Info(); 329 330 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch()); 331 ab->is_exact_ = false; 332 delete a; 333 delete b; 334 return ab; 335 } 336 337 // Constructs Info for a|b given a and b. 338 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { 339 Info *ab = new Info(); 340 341 if (a->is_exact_ && b->is_exact_) { 342 CopyIn(a->exact_, &ab->exact_); 343 CopyIn(b->exact_, &ab->exact_); 344 ab->is_exact_ = true; 345 } else { 346 // Either a or b has is_exact_ = false. If the other 347 // one has is_exact_ = true, we move it to match_ and 348 // then create a OR of a,b. The resulting Info has 349 // is_exact_ = false. 350 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch()); 351 ab->is_exact_ = false; 352 } 353 354 delete a; 355 delete b; 356 return ab; 357 } 358 359 // Constructs Info for a? given a. 360 Prefilter::Info* Prefilter::Info::Quest(Info *a) { 361 Info *ab = new Info(); 362 363 ab->is_exact_ = false; 364 ab->match_ = new Prefilter(ALL); 365 delete a; 366 return ab; 367 } 368 369 // Constructs Info for a* given a. 370 // Same as a? -- not much to do. 371 Prefilter::Info* Prefilter::Info::Star(Info *a) { 372 return Quest(a); 373 } 374 375 // Constructs Info for a+ given a. If a was exact set, it isn't 376 // anymore. 377 Prefilter::Info* Prefilter::Info::Plus(Info *a) { 378 Info *ab = new Info(); 379 380 ab->match_ = a->TakeMatch(); 381 ab->is_exact_ = false; 382 383 delete a; 384 return ab; 385 } 386 387 static string RuneToString(Rune r) { 388 char buf[UTFmax]; 389 int n = runetochar(buf, &r); 390 return string(buf, n); 391 } 392 393 // Constructs Info for literal rune. 394 Prefilter::Info* Prefilter::Info::Literal(Rune r) { 395 Info* info = new Info(); 396 info->exact_.insert(RuneToString(ToLowerRune(r))); 397 info->is_exact_ = true; 398 return info; 399 } 400 401 // Constructs Info for dot (any character). 402 Prefilter::Info* Prefilter::Info::AnyChar() { 403 Prefilter::Info* info = new Prefilter::Info(); 404 info->match_ = new Prefilter(ALL); 405 return info; 406 } 407 408 // Constructs Prefilter::Info for no possible match. 409 Prefilter::Info* Prefilter::Info::NoMatch() { 410 Prefilter::Info* info = new Prefilter::Info(); 411 info->match_ = new Prefilter(NONE); 412 return info; 413 } 414 415 // Constructs Prefilter::Info for any possible match. 416 // This Prefilter::Info is valid for any regular expression, 417 // since it makes no assertions whatsoever about the 418 // strings being matched. 419 Prefilter::Info* Prefilter::Info::AnyMatch() { 420 Prefilter::Info *info = new Prefilter::Info(); 421 info->match_ = new Prefilter(ALL); 422 return info; 423 } 424 425 // Constructs Prefilter::Info for just the empty string. 426 Prefilter::Info* Prefilter::Info::EmptyString() { 427 Prefilter::Info* info = new Prefilter::Info(); 428 info->is_exact_ = true; 429 info->exact_.insert(""); 430 return info; 431 } 432 433 // Constructs Prefilter::Info for a character class. 434 typedef CharClass::iterator CCIter; 435 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) { 436 if (Trace) { 437 VLOG(0) << "CharClassInfo:"; 438 for (CCIter i = cc->begin(); i != cc->end(); ++i) 439 VLOG(0) << " " << i->lo << "-" << i->hi; 440 } 441 442 // If the class is too large, it's okay to overestimate. 443 if (cc->size() > 10) 444 return AnyChar(); 445 446 Prefilter::Info *a = new Prefilter::Info(); 447 for (CCIter i = cc->begin(); i != cc->end(); ++i) 448 for (Rune r = i->lo; r <= i->hi; r++) 449 a->exact_.insert(RuneToString(ToLowerRune(r))); 450 451 a->is_exact_ = true; 452 453 if (Trace) { 454 VLOG(0) << " = " << a->ToString(); 455 } 456 457 return a; 458 } 459 460 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { 461 public: 462 Walker() {} 463 464 virtual Info* PostVisit( 465 Regexp* re, Info* parent_arg, 466 Info* pre_arg, 467 Info** child_args, int nchild_args); 468 469 virtual Info* ShortVisit( 470 Regexp* re, 471 Info* parent_arg); 472 473 private: 474 DISALLOW_EVIL_CONSTRUCTORS(Walker); 475 }; 476 477 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { 478 if (Trace) { 479 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); 480 } 481 Prefilter::Info::Walker w; 482 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); 483 484 if (w.stopped_early()) { 485 delete info; 486 return NULL; 487 } 488 489 return info; 490 } 491 492 Prefilter::Info* Prefilter::Info::Walker::ShortVisit( 493 Regexp* re, Prefilter::Info* parent_arg) { 494 return AnyMatch(); 495 } 496 497 // Constructs the Prefilter::Info for the given regular expression. 498 // Assumes re is simplified. 499 Prefilter::Info* Prefilter::Info::Walker::PostVisit( 500 Regexp* re, Prefilter::Info* parent_arg, 501 Prefilter::Info* pre_arg, Prefilter::Info** child_args, 502 int nchild_args) { 503 Prefilter::Info *info; 504 switch (re->op()) { 505 default: 506 case kRegexpRepeat: 507 LOG(DFATAL) << "Bad regexp op " << re->op(); 508 info = EmptyString(); 509 break; 510 511 case kRegexpNoMatch: 512 info = NoMatch(); 513 break; 514 515 // These ops match the empty string: 516 case kRegexpEmptyMatch: // anywhere 517 case kRegexpBeginLine: // at beginning of line 518 case kRegexpEndLine: // at end of line 519 case kRegexpBeginText: // at beginning of text 520 case kRegexpEndText: // at end of text 521 case kRegexpWordBoundary: // at word boundary 522 case kRegexpNoWordBoundary: // not at word boundary 523 info = EmptyString(); 524 break; 525 526 case kRegexpLiteral: 527 info = Literal(re->rune()); 528 break; 529 530 case kRegexpLiteralString: 531 if (re->nrunes() == 0) { 532 info = NoMatch(); 533 break; 534 } 535 info = Literal(re->runes()[0]); 536 for (int i = 1; i < re->nrunes(); i++) 537 info = Concat(info, Literal(re->runes()[i])); 538 break; 539 540 case kRegexpConcat: { 541 // Accumulate in info. 542 // Exact is concat of recent contiguous exact nodes. 543 info = NULL; 544 Info* exact = NULL; 545 for (int i = 0; i < nchild_args; i++) { 546 Info* ci = child_args[i]; // child info 547 if (!ci->is_exact() || 548 (exact && ci->exact().size() * exact->exact().size() > 16)) { 549 // Exact run is over. 550 info = And(info, exact); 551 exact = NULL; 552 // Add this child's info. 553 info = And(info, ci); 554 } else { 555 // Append to exact run. 556 exact = Concat(exact, ci); 557 } 558 } 559 info = And(info, exact); 560 } 561 break; 562 563 case kRegexpAlternate: 564 info = child_args[0]; 565 for (int i = 1; i < nchild_args; i++) 566 info = Alt(info, child_args[i]); 567 VLOG(10) << "Alt: " << info->ToString(); 568 break; 569 570 case kRegexpStar: 571 info = Star(child_args[0]); 572 break; 573 574 case kRegexpQuest: 575 info = Quest(child_args[0]); 576 break; 577 578 case kRegexpPlus: 579 info = Plus(child_args[0]); 580 break; 581 582 case kRegexpAnyChar: 583 // Claim nothing, except that it's not empty. 584 info = AnyChar(); 585 break; 586 587 case kRegexpCharClass: 588 info = CClass(re->cc()); 589 break; 590 591 case kRegexpCapture: 592 // These don't affect the set of matching strings. 593 info = child_args[0]; 594 break; 595 } 596 597 if (Trace) { 598 VLOG(0) << "BuildInfo " << re->ToString() 599 << ": " << info->ToString(); 600 } 601 602 return info; 603 } 604 605 606 Prefilter* Prefilter::FromRegexp(Regexp* re) { 607 if (re == NULL) 608 return NULL; 609 610 Regexp* simple = re->Simplify(); 611 Prefilter::Info *info = BuildInfo(simple); 612 613 simple->Decref(); 614 if (info == NULL) 615 return NULL; 616 617 Prefilter* m = info->TakeMatch(); 618 619 delete info; 620 return m; 621 } 622 623 string Prefilter::DebugString() const { 624 if (this == NULL) 625 return "<nil>"; 626 627 switch (op_) { 628 default: 629 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; 630 return StringPrintf("op%d", op_); 631 case NONE: 632 return "*no-matches*"; 633 case ATOM: 634 return atom_; 635 case ALL: 636 return ""; 637 case AND: { 638 string s = ""; 639 for (int i = 0; i < subs_->size(); i++) { 640 if (i > 0) 641 s += " "; 642 s += (*subs_)[i]->DebugString(); 643 } 644 return s; 645 } 646 case OR: { 647 string s = "("; 648 for (int i = 0; i < subs_->size(); i++) { 649 if (i > 0) 650 s += "|"; 651 s += (*subs_)[i]->DebugString(); 652 } 653 s += ")"; 654 return s; 655 } 656 } 657 } 658 659 Prefilter* Prefilter::FromRE2(const RE2* re2) { 660 if (re2 == NULL) 661 return NULL; 662 663 Regexp* regexp = re2->Regexp(); 664 if (regexp == NULL) 665 return NULL; 666 667 return FromRegexp(regexp); 668 } 669 670 671 } // namespace re2 672