1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "util/util.h" 6 #include "re2/prefilter.h" 7 #include "re2/re2.h" 8 #include "re2/unicode_casefold.h" 9 #include "re2/walker-inl.h" 10 11 namespace re2 { 12 13 static const int Trace = false; 14 15 typedef set<string>::iterator SSIter; 16 typedef set<string>::const_iterator ConstSSIter; 17 18 static int alloc_id = 100000; // Used for debugging. 19 // Initializes a Prefilter, allocating subs_ as necessary. 20 Prefilter::Prefilter(Op op) { 21 op_ = op; 22 subs_ = NULL; 23 if (op_ == AND || op_ == OR) 24 subs_ = new vector<Prefilter*>; 25 26 alloc_id_ = alloc_id++; 27 VLOG(10) << "alloc_id: " << alloc_id_; 28 } 29 30 // Destroys a Prefilter. 31 Prefilter::~Prefilter() { 32 VLOG(10) << "Deleted: " << alloc_id_; 33 if (subs_) { 34 for (int i = 0; i < subs_->size(); i++) 35 delete (*subs_)[i]; 36 delete subs_; 37 subs_ = NULL; 38 } 39 } 40 41 // Simplify if the node is an empty Or or And. 42 Prefilter* Prefilter::Simplify() { 43 if (op_ != AND && op_ != OR) { 44 return this; 45 } 46 47 // Nothing left in the AND/OR. 48 if (subs_->size() == 0) { 49 if (op_ == AND) 50 op_ = ALL; // AND of nothing is true 51 else 52 op_ = NONE; // OR of nothing is false 53 54 return this; 55 } 56 57 // Just one subnode: throw away wrapper. 58 if (subs_->size() == 1) { 59 Prefilter* a = (*subs_)[0]; 60 subs_->clear(); 61 delete this; 62 return a->Simplify(); 63 } 64 65 return this; 66 } 67 68 // Combines two Prefilters together to create an "op" (AND or OR). 69 // The passed Prefilters will be part of the returned Prefilter or deleted. 70 // Does lots of work to avoid creating unnecessarily complicated structures. 71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { 72 // If a, b can be rewritten as op, do so. 73 a = a->Simplify(); 74 b = b->Simplify(); 75 76 // Canonicalize: a->op <= b->op. 77 if (a->op() > b->op()) { 78 Prefilter* t = a; 79 a = b; 80 b = t; 81 } 82 83 // Trivial cases. 84 // ALL AND b = b 85 // NONE OR b = b 86 // ALL OR b = ALL 87 // NONE AND b = NONE 88 // Don't need to look at b, because of canonicalization above. 89 // ALL and NONE are smallest opcodes. 90 if (a->op() == ALL || a->op() == NONE) { 91 if ((a->op() == ALL && op == AND) || 92 (a->op() == NONE && op == OR)) { 93 delete a; 94 return b; 95 } else { 96 delete b; 97 return a; 98 } 99 } 100 101 // If a and b match op, merge their contents. 102 if (a->op() == op && b->op() == op) { 103 for (int i = 0; i < b->subs()->size(); i++) { 104 Prefilter* bb = (*b->subs())[i]; 105 a->subs()->push_back(bb); 106 } 107 b->subs()->clear(); 108 delete b; 109 return a; 110 } 111 112 // If a already has the same op as the op that is under construction 113 // add in b (similarly if b already has the same op, add in a). 114 if (b->op() == op) { 115 Prefilter* t = a; 116 a = b; 117 b = t; 118 } 119 if (a->op() == op) { 120 a->subs()->push_back(b); 121 return a; 122 } 123 124 // Otherwise just return the op. 125 Prefilter* c = new Prefilter(op); 126 c->subs()->push_back(a); 127 c->subs()->push_back(b); 128 return c; 129 } 130 131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { 132 return AndOr(AND, a, b); 133 } 134 135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { 136 return AndOr(OR, a, b); 137 } 138 139 static void SimplifyStringSet(set<string> *ss) { 140 // Now make sure that the strings aren't redundant. For example, if 141 // we know "ab" is a required string, then it doesn't help at all to 142 // know that "abc" is also a required string, so delete "abc". This 143 // is because, when we are performing a string search to filter 144 // regexps, matching ab will already allow this regexp to be a 145 // candidate for match, so further matching abc is redundant. 146 147 for (SSIter i = ss->begin(); i != ss->end(); ++i) { 148 SSIter j = i; 149 ++j; 150 while (j != ss->end()) { 151 // Increment j early so that we can erase the element it points to. 152 SSIter old_j = j; 153 ++j; 154 if (old_j->find(*i) != string::npos) 155 ss->erase(old_j); 156 } 157 } 158 } 159 160 Prefilter* Prefilter::OrStrings(set<string>* ss) { 161 SimplifyStringSet(ss); 162 Prefilter* or_prefilter = NULL; 163 if (!ss->empty()) { 164 or_prefilter = new Prefilter(NONE); 165 for (SSIter i = ss->begin(); i != ss->end(); ++i) 166 or_prefilter = Or(or_prefilter, FromString(*i)); 167 } 168 return or_prefilter; 169 } 170 171 static Rune ToLowerRune(Rune r) { 172 if (r < Runeself) { 173 if ('A' <= r && r <= 'Z') 174 r += 'a' - 'A'; 175 return r; 176 } 177 178 CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); 179 if (f == NULL || r < f->lo) 180 return r; 181 return ApplyFold(f, r); 182 } 183 184 static Rune ToLowerRuneLatin1(Rune r) { 185 if ('A' <= r && r <= 'Z') 186 r += 'a' - 'A'; 187 return r; 188 } 189 190 Prefilter* Prefilter::FromString(const string& str) { 191 Prefilter* m = new Prefilter(Prefilter::ATOM); 192 m->atom_ = str; 193 return m; 194 } 195 196 // Information about a regexp used during computation of Prefilter. 197 // Can be thought of as information about the set of strings matching 198 // the given regular expression. 199 class Prefilter::Info { 200 public: 201 Info(); 202 ~Info(); 203 204 // More constructors. They delete their Info* arguments. 205 static Info* Alt(Info* a, Info* b); 206 static Info* Concat(Info* a, Info* b); 207 static Info* And(Info* a, Info* b); 208 static Info* Star(Info* a); 209 static Info* Plus(Info* a); 210 static Info* Quest(Info* a); 211 static Info* EmptyString(); 212 static Info* NoMatch(); 213 static Info* AnyChar(); 214 static Info* CClass(CharClass* cc, bool latin1); 215 static Info* Literal(Rune r); 216 static Info* LiteralLatin1(Rune r); 217 static Info* AnyMatch(); 218 219 // Format Info as a string. 220 string ToString(); 221 222 // Caller takes ownership of the Prefilter. 223 Prefilter* TakeMatch(); 224 225 set<string>& exact() { return exact_; } 226 227 bool is_exact() const { return is_exact_; } 228 229 class Walker; 230 231 private: 232 set<string> exact_; 233 234 // When is_exact_ is true, the strings that match 235 // are placed in exact_. When it is no longer an exact 236 // set of strings that match this RE, then is_exact_ 237 // is false and the match_ contains the required match 238 // criteria. 239 bool is_exact_; 240 241 // Accumulated Prefilter query that any 242 // match for this regexp is guaranteed to match. 243 Prefilter* match_; 244 }; 245 246 247 Prefilter::Info::Info() 248 : is_exact_(false), 249 match_(NULL) { 250 } 251 252 Prefilter::Info::~Info() { 253 delete match_; 254 } 255 256 Prefilter* Prefilter::Info::TakeMatch() { 257 if (is_exact_) { 258 match_ = Prefilter::OrStrings(&exact_); 259 is_exact_ = false; 260 } 261 Prefilter* m = match_; 262 match_ = NULL; 263 return m; 264 } 265 266 // Format a Info in string form. 267 string Prefilter::Info::ToString() { 268 if (is_exact_) { 269 int n = 0; 270 string s; 271 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) { 272 if (n++ > 0) 273 s += ","; 274 s += *i; 275 } 276 return s; 277 } 278 279 if (match_) 280 return match_->DebugString(); 281 282 return ""; 283 } 284 285 // Add the strings from src to dst. 286 static void CopyIn(const set<string>& src, set<string>* dst) { 287 for (ConstSSIter i = src.begin(); i != src.end(); ++i) 288 dst->insert(*i); 289 } 290 291 // Add the cross-product of a and b to dst. 292 // (For each string i in a and j in b, add i+j.) 293 static void CrossProduct(const set<string>& a, 294 const set<string>& b, 295 set<string>* dst) { 296 for (ConstSSIter i = a.begin(); i != a.end(); ++i) 297 for (ConstSSIter j = b.begin(); j != b.end(); ++j) 298 dst->insert(*i + *j); 299 } 300 301 // Concats a and b. Requires that both are exact sets. 302 // Forms an exact set that is a crossproduct of a and b. 303 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) { 304 if (a == NULL) 305 return b; 306 DCHECK(a->is_exact_); 307 DCHECK(b && b->is_exact_); 308 Info *ab = new Info(); 309 310 CrossProduct(a->exact_, b->exact_, &ab->exact_); 311 ab->is_exact_ = true; 312 313 delete a; 314 delete b; 315 return ab; 316 } 317 318 // Constructs an inexact Info for ab given a and b. 319 // Used only when a or b is not exact or when the 320 // exact cross product is likely to be too big. 321 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) { 322 if (a == NULL) 323 return b; 324 if (b == NULL) 325 return a; 326 327 Info *ab = new Info(); 328 329 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch()); 330 ab->is_exact_ = false; 331 delete a; 332 delete b; 333 return ab; 334 } 335 336 // Constructs Info for a|b given a and b. 337 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { 338 Info *ab = new Info(); 339 340 if (a->is_exact_ && b->is_exact_) { 341 CopyIn(a->exact_, &ab->exact_); 342 CopyIn(b->exact_, &ab->exact_); 343 ab->is_exact_ = true; 344 } else { 345 // Either a or b has is_exact_ = false. If the other 346 // one has is_exact_ = true, we move it to match_ and 347 // then create a OR of a,b. The resulting Info has 348 // is_exact_ = false. 349 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch()); 350 ab->is_exact_ = false; 351 } 352 353 delete a; 354 delete b; 355 return ab; 356 } 357 358 // Constructs Info for a? given a. 359 Prefilter::Info* Prefilter::Info::Quest(Info *a) { 360 Info *ab = new Info(); 361 362 ab->is_exact_ = false; 363 ab->match_ = new Prefilter(ALL); 364 delete a; 365 return ab; 366 } 367 368 // Constructs Info for a* given a. 369 // Same as a? -- not much to do. 370 Prefilter::Info* Prefilter::Info::Star(Info *a) { 371 return Quest(a); 372 } 373 374 // Constructs Info for a+ given a. If a was exact set, it isn't 375 // anymore. 376 Prefilter::Info* Prefilter::Info::Plus(Info *a) { 377 Info *ab = new Info(); 378 379 ab->match_ = a->TakeMatch(); 380 ab->is_exact_ = false; 381 382 delete a; 383 return ab; 384 } 385 386 static string RuneToString(Rune r) { 387 char buf[UTFmax]; 388 int n = runetochar(buf, &r); 389 return string(buf, n); 390 } 391 392 static string RuneToStringLatin1(Rune r) { 393 char c = r & 0xff; 394 return string(&c, 1); 395 } 396 397 // Constructs Info for literal rune. 398 Prefilter::Info* Prefilter::Info::Literal(Rune r) { 399 Info* info = new Info(); 400 info->exact_.insert(RuneToString(ToLowerRune(r))); 401 info->is_exact_ = true; 402 return info; 403 } 404 405 // Constructs Info for literal rune for Latin1 encoded string. 406 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { 407 Info* info = new Info(); 408 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); 409 info->is_exact_ = true; 410 return info; 411 } 412 413 // Constructs Info for dot (any character). 414 Prefilter::Info* Prefilter::Info::AnyChar() { 415 Prefilter::Info* info = new Prefilter::Info(); 416 info->match_ = new Prefilter(ALL); 417 return info; 418 } 419 420 // Constructs Prefilter::Info for no possible match. 421 Prefilter::Info* Prefilter::Info::NoMatch() { 422 Prefilter::Info* info = new Prefilter::Info(); 423 info->match_ = new Prefilter(NONE); 424 return info; 425 } 426 427 // Constructs Prefilter::Info for any possible match. 428 // This Prefilter::Info is valid for any regular expression, 429 // since it makes no assertions whatsoever about the 430 // strings being matched. 431 Prefilter::Info* Prefilter::Info::AnyMatch() { 432 Prefilter::Info *info = new Prefilter::Info(); 433 info->match_ = new Prefilter(ALL); 434 return info; 435 } 436 437 // Constructs Prefilter::Info for just the empty string. 438 Prefilter::Info* Prefilter::Info::EmptyString() { 439 Prefilter::Info* info = new Prefilter::Info(); 440 info->is_exact_ = true; 441 info->exact_.insert(""); 442 return info; 443 } 444 445 // Constructs Prefilter::Info for a character class. 446 typedef CharClass::iterator CCIter; 447 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, 448 bool latin1) { 449 if (Trace) { 450 VLOG(0) << "CharClassInfo:"; 451 for (CCIter i = cc->begin(); i != cc->end(); ++i) 452 VLOG(0) << " " << i->lo << "-" << i->hi; 453 } 454 455 // If the class is too large, it's okay to overestimate. 456 if (cc->size() > 10) 457 return AnyChar(); 458 459 Prefilter::Info *a = new Prefilter::Info(); 460 for (CCIter i = cc->begin(); i != cc->end(); ++i) 461 for (Rune r = i->lo; r <= i->hi; r++) { 462 if (latin1) { 463 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); 464 } else { 465 a->exact_.insert(RuneToString(ToLowerRune(r))); 466 } 467 } 468 469 470 a->is_exact_ = true; 471 472 if (Trace) { 473 VLOG(0) << " = " << a->ToString(); 474 } 475 476 return a; 477 } 478 479 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { 480 public: 481 Walker(bool latin1) : latin1_(latin1) {} 482 483 virtual Info* PostVisit( 484 Regexp* re, Info* parent_arg, 485 Info* pre_arg, 486 Info** child_args, int nchild_args); 487 488 virtual Info* ShortVisit( 489 Regexp* re, 490 Info* parent_arg); 491 492 bool latin1() { return latin1_; } 493 private: 494 bool latin1_; 495 DISALLOW_EVIL_CONSTRUCTORS(Walker); 496 }; 497 498 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { 499 if (Trace) { 500 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); 501 } 502 503 bool latin1 = re->parse_flags() & Regexp::Latin1; 504 Prefilter::Info::Walker w(latin1); 505 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); 506 507 if (w.stopped_early()) { 508 delete info; 509 return NULL; 510 } 511 512 return info; 513 } 514 515 Prefilter::Info* Prefilter::Info::Walker::ShortVisit( 516 Regexp* re, Prefilter::Info* parent_arg) { 517 return AnyMatch(); 518 } 519 520 // Constructs the Prefilter::Info for the given regular expression. 521 // Assumes re is simplified. 522 Prefilter::Info* Prefilter::Info::Walker::PostVisit( 523 Regexp* re, Prefilter::Info* parent_arg, 524 Prefilter::Info* pre_arg, Prefilter::Info** child_args, 525 int nchild_args) { 526 Prefilter::Info *info; 527 switch (re->op()) { 528 default: 529 case kRegexpRepeat: 530 LOG(DFATAL) << "Bad regexp op " << re->op(); 531 info = EmptyString(); 532 break; 533 534 case kRegexpNoMatch: 535 info = NoMatch(); 536 break; 537 538 // These ops match the empty string: 539 case kRegexpEmptyMatch: // anywhere 540 case kRegexpBeginLine: // at beginning of line 541 case kRegexpEndLine: // at end of line 542 case kRegexpBeginText: // at beginning of text 543 case kRegexpEndText: // at end of text 544 case kRegexpWordBoundary: // at word boundary 545 case kRegexpNoWordBoundary: // not at word boundary 546 info = EmptyString(); 547 break; 548 549 case kRegexpLiteral: 550 if (latin1()) { 551 info = LiteralLatin1(re->rune()); 552 } 553 else { 554 info = Literal(re->rune()); 555 } 556 break; 557 558 case kRegexpLiteralString: 559 if (re->nrunes() == 0) { 560 info = NoMatch(); 561 break; 562 } 563 if (latin1()) { 564 info = LiteralLatin1(re->runes()[0]); 565 for (int i = 1; i < re->nrunes(); i++) { 566 info = Concat(info, LiteralLatin1(re->runes()[i])); 567 } 568 } else { 569 info = Literal(re->runes()[0]); 570 for (int i = 1; i < re->nrunes(); i++) { 571 info = Concat(info, Literal(re->runes()[i])); 572 } 573 } 574 break; 575 576 case kRegexpConcat: { 577 // Accumulate in info. 578 // Exact is concat of recent contiguous exact nodes. 579 info = NULL; 580 Info* exact = NULL; 581 for (int i = 0; i < nchild_args; i++) { 582 Info* ci = child_args[i]; // child info 583 if (!ci->is_exact() || 584 (exact && ci->exact().size() * exact->exact().size() > 16)) { 585 // Exact run is over. 586 info = And(info, exact); 587 exact = NULL; 588 // Add this child's info. 589 info = And(info, ci); 590 } else { 591 // Append to exact run. 592 exact = Concat(exact, ci); 593 } 594 } 595 info = And(info, exact); 596 } 597 break; 598 599 case kRegexpAlternate: 600 info = child_args[0]; 601 for (int i = 1; i < nchild_args; i++) 602 info = Alt(info, child_args[i]); 603 VLOG(10) << "Alt: " << info->ToString(); 604 break; 605 606 case kRegexpStar: 607 info = Star(child_args[0]); 608 break; 609 610 case kRegexpQuest: 611 info = Quest(child_args[0]); 612 break; 613 614 case kRegexpPlus: 615 info = Plus(child_args[0]); 616 break; 617 618 case kRegexpAnyChar: 619 // Claim nothing, except that it's not empty. 620 info = AnyChar(); 621 break; 622 623 case kRegexpCharClass: 624 info = CClass(re->cc(), latin1()); 625 break; 626 627 case kRegexpCapture: 628 // These don't affect the set of matching strings. 629 info = child_args[0]; 630 break; 631 } 632 633 if (Trace) { 634 VLOG(0) << "BuildInfo " << re->ToString() 635 << ": " << (info ? info->ToString() : ""); 636 } 637 638 return info; 639 } 640 641 642 Prefilter* Prefilter::FromRegexp(Regexp* re) { 643 if (re == NULL) 644 return NULL; 645 646 Regexp* simple = re->Simplify(); 647 Prefilter::Info *info = BuildInfo(simple); 648 649 simple->Decref(); 650 if (info == NULL) 651 return NULL; 652 653 Prefilter* m = info->TakeMatch(); 654 655 delete info; 656 return m; 657 } 658 659 string Prefilter::DebugString() const { 660 switch (op_) { 661 default: 662 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; 663 return StringPrintf("op%d", op_); 664 case NONE: 665 return "*no-matches*"; 666 case ATOM: 667 return atom_; 668 case ALL: 669 return ""; 670 case AND: { 671 string s = ""; 672 for (int i = 0; i < subs_->size(); i++) { 673 if (i > 0) 674 s += " "; 675 Prefilter* sub = (*subs_)[i]; 676 s += sub ? sub->DebugString() : "<nil>"; 677 } 678 return s; 679 } 680 case OR: { 681 string s = "("; 682 for (int i = 0; i < subs_->size(); i++) { 683 if (i > 0) 684 s += "|"; 685 Prefilter* sub = (*subs_)[i]; 686 s += sub ? sub->DebugString() : "<nil>"; 687 } 688 s += ")"; 689 return s; 690 } 691 } 692 } 693 694 Prefilter* Prefilter::FromRE2(const RE2* re2) { 695 if (re2 == NULL) 696 return NULL; 697 698 Regexp* regexp = re2->Regexp(); 699 if (regexp == NULL) 700 return NULL; 701 702 return FromRegexp(regexp); 703 } 704 705 706 } // namespace re2 707