1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "util/util.h" 6 #include "re2/prefilter.h" 7 #include "re2/re2.h" 8 #include "re2/unicode_casefold.h" 9 #include "re2/walker-inl.h" 10 11 namespace re2 { 12 13 static const int Trace = false; 14 15 typedef set<string>::iterator SSIter; 16 typedef set<string>::const_iterator ConstSSIter; 17 18 static int alloc_id = 100000; // Used for debugging. 19 // Initializes a Prefilter, allocating subs_ as necessary. 20 Prefilter::Prefilter(Op op) { 21 op_ = op; 22 subs_ = NULL; 23 if (op_ == AND || op_ == OR) 24 subs_ = new vector<Prefilter*>; 25 26 alloc_id_ = alloc_id++; 27 VLOG(10) << "alloc_id: " << alloc_id_; 28 } 29 30 // Destroys a Prefilter. 31 Prefilter::~Prefilter() { 32 VLOG(10) << "Deleted: " << alloc_id_; 33 if (subs_) { 34 for (int i = 0; i < subs_->size(); i++) 35 delete (*subs_)[i]; 36 delete subs_; 37 subs_ = NULL; 38 } 39 } 40 41 // Simplify if the node is an empty Or or And. 42 Prefilter* Prefilter::Simplify() { 43 if (op_ != AND && op_ != OR) { 44 return this; 45 } 46 47 // Nothing left in the AND/OR. 48 if (subs_->size() == 0) { 49 if (op_ == AND) 50 op_ = ALL; // AND of nothing is true 51 else 52 op_ = NONE; // OR of nothing is false 53 54 return this; 55 } 56 57 // Just one subnode: throw away wrapper. 58 if (subs_->size() == 1) { 59 Prefilter* a = (*subs_)[0]; 60 subs_->clear(); 61 delete this; 62 return a->Simplify(); 63 } 64 65 return this; 66 } 67 68 // Combines two Prefilters together to create an "op" (AND or OR). 69 // The passed Prefilters will be part of the returned Prefilter or deleted. 70 // Does lots of work to avoid creating unnecessarily complicated structures. 71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { 72 // If a, b can be rewritten as op, do so. 73 a = a->Simplify(); 74 b = b->Simplify(); 75 76 // Canonicalize: a->op <= b->op. 77 if (a->op() > b->op()) { 78 Prefilter* t = a; 79 a = b; 80 b = t; 81 } 82 83 // Trivial cases. 84 // ALL AND b = b 85 // NONE OR b = b 86 // ALL OR b = ALL 87 // NONE AND b = NONE 88 // Don't need to look at b, because of canonicalization above. 89 // ALL and NONE are smallest opcodes. 90 if (a->op() == ALL || a->op() == NONE) { 91 if ((a->op() == ALL && op == AND) || 92 (a->op() == NONE && op == OR)) { 93 delete a; 94 return b; 95 } else { 96 delete b; 97 return a; 98 } 99 } 100 101 // If a and b match op, merge their contents. 102 if (a->op() == op && b->op() == op) { 103 for (int i = 0; i < b->subs()->size(); i++) { 104 Prefilter* bb = (*b->subs())[i]; 105 a->subs()->push_back(bb); 106 } 107 b->subs()->clear(); 108 delete b; 109 return a; 110 } 111 112 // If a already has the same op as the op that is under construction 113 // add in b (similarly if b already has the same op, add in a). 114 if (b->op() == op) { 115 Prefilter* t = a; 116 a = b; 117 b = t; 118 } 119 if (a->op() == op) { 120 a->subs()->push_back(b); 121 return a; 122 } 123 124 // Otherwise just return the op. 125 Prefilter* c = new Prefilter(op); 126 c->subs()->push_back(a); 127 c->subs()->push_back(b); 128 return c; 129 } 130 131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { 132 return AndOr(AND, a, b); 133 } 134 135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { 136 return AndOr(OR, a, b); 137 } 138 139 static void SimplifyStringSet(set<string> *ss) { 140 // Now make sure that the strings aren't redundant. For example, if 141 // we know "ab" is a required string, then it doesn't help at all to 142 // know that "abc" is also a required string, so delete "abc". This 143 // is because, when we are performing a string search to filter 144 // regexps, matching ab will already allow this regexp to be a 145 // candidate for match, so further matching abc is redundant. 146 147 for (SSIter i = ss->begin(); i != ss->end(); ++i) { 148 SSIter j = i; 149 ++j; 150 while (j != ss->end()) { 151 // Increment j early so that we can erase the element it points to. 152 SSIter old_j = j; 153 ++j; 154 if (old_j->find(*i) != string::npos) 155 ss->erase(old_j); 156 } 157 } 158 } 159 160 Prefilter* Prefilter::OrStrings(set<string>* ss) { 161 SimplifyStringSet(ss); 162 Prefilter* or_prefilter = NULL; 163 if (!ss->empty()) { 164 or_prefilter = new Prefilter(NONE); 165 for (SSIter i = ss->begin(); i != ss->end(); ++i) 166 or_prefilter = Or(or_prefilter, FromString(*i)); 167 } 168 return or_prefilter; 169 } 170 171 static Rune ToLowerRune(Rune r) { 172 if (r < Runeself) { 173 if ('A' <= r && r <= 'Z') 174 r += 'a' - 'A'; 175 return r; 176 } 177 178 CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); 179 if (f == NULL || r < f->lo) 180 return r; 181 return ApplyFold(f, r); 182 } 183 184 static Rune ToLowerRuneLatin1(Rune r) { 185 if ('A' <= r && r <= 'Z') 186 r += 'a' - 'A'; 187 return r; 188 } 189 190 Prefilter* Prefilter::FromString(const string& str) { 191 Prefilter* m = new Prefilter(Prefilter::ATOM); 192 m->atom_ = str; 193 return m; 194 } 195 196 // Information about a regexp used during computation of Prefilter. 197 // Can be thought of as information about the set of strings matching 198 // the given regular expression. 199 class Prefilter::Info { 200 public: 201 Info(); 202 ~Info(); 203 204 // More constructors. They delete their Info* arguments. 205 static Info* Alt(Info* a, Info* b); 206 static Info* Concat(Info* a, Info* b); 207 static Info* And(Info* a, Info* b); 208 static Info* Star(Info* a); 209 static Info* Plus(Info* a); 210 static Info* Quest(Info* a); 211 static Info* EmptyString(); 212 static Info* NoMatch(); 213 static Info* AnyChar(); 214 static Info* CClass(CharClass* cc, bool latin1); 215 static Info* Literal(Rune r); 216 static Info* LiteralLatin1(Rune r); 217 static Info* AnyMatch(); 218 219 // Format Info as a string. 220 string ToString(); 221 222 // Caller takes ownership of the Prefilter. 223 Prefilter* TakeMatch(); 224 225 set<string>& exact() { return exact_; } 226 227 bool is_exact() const { return is_exact_; } 228 229 class Walker; 230 231 private: 232 set<string> exact_; 233 234 // When is_exact_ is true, the strings that match 235 // are placed in exact_. When it is no longer an exact 236 // set of strings that match this RE, then is_exact_ 237 // is false and the match_ contains the required match 238 // criteria. 239 bool is_exact_; 240 241 // Accumulated Prefilter query that any 242 // match for this regexp is guaranteed to match. 243 Prefilter* match_; 244 }; 245 246 247 Prefilter::Info::Info() 248 : is_exact_(false), 249 match_(NULL) { 250 } 251 252 Prefilter::Info::~Info() { 253 delete match_; 254 } 255 256 Prefilter* Prefilter::Info::TakeMatch() { 257 if (is_exact_) { 258 match_ = Prefilter::OrStrings(&exact_); 259 is_exact_ = false; 260 } 261 Prefilter* m = match_; 262 match_ = NULL; 263 return m; 264 } 265 266 // Format a Info in string form. 267 string Prefilter::Info::ToString() { 268 if (this == NULL) { 269 // Sometimes when iterating on children of a node, 270 // some children might have NULL Info. Adding 271 // the check here for NULL to take care of cases where 272 // the caller is not checking. 273 return ""; 274 } 275 276 if (is_exact_) { 277 int n = 0; 278 string s; 279 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) { 280 if (n++ > 0) 281 s += ","; 282 s += *i; 283 } 284 return s; 285 } 286 287 if (match_) 288 return match_->DebugString(); 289 290 return ""; 291 } 292 293 // Add the strings from src to dst. 294 static void CopyIn(const set<string>& src, set<string>* dst) { 295 for (ConstSSIter i = src.begin(); i != src.end(); ++i) 296 dst->insert(*i); 297 } 298 299 // Add the cross-product of a and b to dst. 300 // (For each string i in a and j in b, add i+j.) 301 static void CrossProduct(const set<string>& a, 302 const set<string>& b, 303 set<string>* dst) { 304 for (ConstSSIter i = a.begin(); i != a.end(); ++i) 305 for (ConstSSIter j = b.begin(); j != b.end(); ++j) 306 dst->insert(*i + *j); 307 } 308 309 // Concats a and b. Requires that both are exact sets. 310 // Forms an exact set that is a crossproduct of a and b. 311 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) { 312 if (a == NULL) 313 return b; 314 DCHECK(a->is_exact_); 315 DCHECK(b && b->is_exact_); 316 Info *ab = new Info(); 317 318 CrossProduct(a->exact_, b->exact_, &ab->exact_); 319 ab->is_exact_ = true; 320 321 delete a; 322 delete b; 323 return ab; 324 } 325 326 // Constructs an inexact Info for ab given a and b. 327 // Used only when a or b is not exact or when the 328 // exact cross product is likely to be too big. 329 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) { 330 if (a == NULL) 331 return b; 332 if (b == NULL) 333 return a; 334 335 Info *ab = new Info(); 336 337 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch()); 338 ab->is_exact_ = false; 339 delete a; 340 delete b; 341 return ab; 342 } 343 344 // Constructs Info for a|b given a and b. 345 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { 346 Info *ab = new Info(); 347 348 if (a->is_exact_ && b->is_exact_) { 349 CopyIn(a->exact_, &ab->exact_); 350 CopyIn(b->exact_, &ab->exact_); 351 ab->is_exact_ = true; 352 } else { 353 // Either a or b has is_exact_ = false. If the other 354 // one has is_exact_ = true, we move it to match_ and 355 // then create a OR of a,b. The resulting Info has 356 // is_exact_ = false. 357 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch()); 358 ab->is_exact_ = false; 359 } 360 361 delete a; 362 delete b; 363 return ab; 364 } 365 366 // Constructs Info for a? given a. 367 Prefilter::Info* Prefilter::Info::Quest(Info *a) { 368 Info *ab = new Info(); 369 370 ab->is_exact_ = false; 371 ab->match_ = new Prefilter(ALL); 372 delete a; 373 return ab; 374 } 375 376 // Constructs Info for a* given a. 377 // Same as a? -- not much to do. 378 Prefilter::Info* Prefilter::Info::Star(Info *a) { 379 return Quest(a); 380 } 381 382 // Constructs Info for a+ given a. If a was exact set, it isn't 383 // anymore. 384 Prefilter::Info* Prefilter::Info::Plus(Info *a) { 385 Info *ab = new Info(); 386 387 ab->match_ = a->TakeMatch(); 388 ab->is_exact_ = false; 389 390 delete a; 391 return ab; 392 } 393 394 static string RuneToString(Rune r) { 395 char buf[UTFmax]; 396 int n = runetochar(buf, &r); 397 return string(buf, n); 398 } 399 400 static string RuneToStringLatin1(Rune r) { 401 char c = r & 0xff; 402 return string(&c, 1); 403 } 404 405 // Constructs Info for literal rune. 406 Prefilter::Info* Prefilter::Info::Literal(Rune r) { 407 Info* info = new Info(); 408 info->exact_.insert(RuneToString(ToLowerRune(r))); 409 info->is_exact_ = true; 410 return info; 411 } 412 413 // Constructs Info for literal rune for Latin1 encoded string. 414 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { 415 Info* info = new Info(); 416 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); 417 info->is_exact_ = true; 418 return info; 419 } 420 421 // Constructs Info for dot (any character). 422 Prefilter::Info* Prefilter::Info::AnyChar() { 423 Prefilter::Info* info = new Prefilter::Info(); 424 info->match_ = new Prefilter(ALL); 425 return info; 426 } 427 428 // Constructs Prefilter::Info for no possible match. 429 Prefilter::Info* Prefilter::Info::NoMatch() { 430 Prefilter::Info* info = new Prefilter::Info(); 431 info->match_ = new Prefilter(NONE); 432 return info; 433 } 434 435 // Constructs Prefilter::Info for any possible match. 436 // This Prefilter::Info is valid for any regular expression, 437 // since it makes no assertions whatsoever about the 438 // strings being matched. 439 Prefilter::Info* Prefilter::Info::AnyMatch() { 440 Prefilter::Info *info = new Prefilter::Info(); 441 info->match_ = new Prefilter(ALL); 442 return info; 443 } 444 445 // Constructs Prefilter::Info for just the empty string. 446 Prefilter::Info* Prefilter::Info::EmptyString() { 447 Prefilter::Info* info = new Prefilter::Info(); 448 info->is_exact_ = true; 449 info->exact_.insert(""); 450 return info; 451 } 452 453 // Constructs Prefilter::Info for a character class. 454 typedef CharClass::iterator CCIter; 455 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, 456 bool latin1) { 457 if (Trace) { 458 VLOG(0) << "CharClassInfo:"; 459 for (CCIter i = cc->begin(); i != cc->end(); ++i) 460 VLOG(0) << " " << i->lo << "-" << i->hi; 461 } 462 463 // If the class is too large, it's okay to overestimate. 464 if (cc->size() > 10) 465 return AnyChar(); 466 467 Prefilter::Info *a = new Prefilter::Info(); 468 for (CCIter i = cc->begin(); i != cc->end(); ++i) 469 for (Rune r = i->lo; r <= i->hi; r++) { 470 if (latin1) { 471 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); 472 } else { 473 a->exact_.insert(RuneToString(ToLowerRune(r))); 474 } 475 } 476 477 478 a->is_exact_ = true; 479 480 if (Trace) { 481 VLOG(0) << " = " << a->ToString(); 482 } 483 484 return a; 485 } 486 487 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { 488 public: 489 Walker(bool latin1) : latin1_(latin1) {} 490 491 virtual Info* PostVisit( 492 Regexp* re, Info* parent_arg, 493 Info* pre_arg, 494 Info** child_args, int nchild_args); 495 496 virtual Info* ShortVisit( 497 Regexp* re, 498 Info* parent_arg); 499 500 bool latin1() { return latin1_; } 501 private: 502 bool latin1_; 503 DISALLOW_EVIL_CONSTRUCTORS(Walker); 504 }; 505 506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { 507 if (Trace) { 508 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); 509 } 510 511 bool latin1 = re->parse_flags() & Regexp::Latin1; 512 Prefilter::Info::Walker w(latin1); 513 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); 514 515 if (w.stopped_early()) { 516 delete info; 517 return NULL; 518 } 519 520 return info; 521 } 522 523 Prefilter::Info* Prefilter::Info::Walker::ShortVisit( 524 Regexp* re, Prefilter::Info* parent_arg) { 525 return AnyMatch(); 526 } 527 528 // Constructs the Prefilter::Info for the given regular expression. 529 // Assumes re is simplified. 530 Prefilter::Info* Prefilter::Info::Walker::PostVisit( 531 Regexp* re, Prefilter::Info* parent_arg, 532 Prefilter::Info* pre_arg, Prefilter::Info** child_args, 533 int nchild_args) { 534 Prefilter::Info *info; 535 switch (re->op()) { 536 default: 537 case kRegexpRepeat: 538 LOG(DFATAL) << "Bad regexp op " << re->op(); 539 info = EmptyString(); 540 break; 541 542 case kRegexpNoMatch: 543 info = NoMatch(); 544 break; 545 546 // These ops match the empty string: 547 case kRegexpEmptyMatch: // anywhere 548 case kRegexpBeginLine: // at beginning of line 549 case kRegexpEndLine: // at end of line 550 case kRegexpBeginText: // at beginning of text 551 case kRegexpEndText: // at end of text 552 case kRegexpWordBoundary: // at word boundary 553 case kRegexpNoWordBoundary: // not at word boundary 554 info = EmptyString(); 555 break; 556 557 case kRegexpLiteral: 558 if (latin1()) { 559 info = LiteralLatin1(re->rune()); 560 } 561 else { 562 info = Literal(re->rune()); 563 } 564 break; 565 566 case kRegexpLiteralString: 567 if (re->nrunes() == 0) { 568 info = NoMatch(); 569 break; 570 } 571 if (latin1()) { 572 info = LiteralLatin1(re->runes()[0]); 573 for (int i = 1; i < re->nrunes(); i++) { 574 info = Concat(info, LiteralLatin1(re->runes()[i])); 575 } 576 } else { 577 info = Literal(re->runes()[0]); 578 for (int i = 1; i < re->nrunes(); i++) { 579 info = Concat(info, Literal(re->runes()[i])); 580 } 581 } 582 break; 583 584 case kRegexpConcat: { 585 // Accumulate in info. 586 // Exact is concat of recent contiguous exact nodes. 587 info = NULL; 588 Info* exact = NULL; 589 for (int i = 0; i < nchild_args; i++) { 590 Info* ci = child_args[i]; // child info 591 if (!ci->is_exact() || 592 (exact && ci->exact().size() * exact->exact().size() > 16)) { 593 // Exact run is over. 594 info = And(info, exact); 595 exact = NULL; 596 // Add this child's info. 597 info = And(info, ci); 598 } else { 599 // Append to exact run. 600 exact = Concat(exact, ci); 601 } 602 } 603 info = And(info, exact); 604 } 605 break; 606 607 case kRegexpAlternate: 608 info = child_args[0]; 609 for (int i = 1; i < nchild_args; i++) 610 info = Alt(info, child_args[i]); 611 VLOG(10) << "Alt: " << info->ToString(); 612 break; 613 614 case kRegexpStar: 615 info = Star(child_args[0]); 616 break; 617 618 case kRegexpQuest: 619 info = Quest(child_args[0]); 620 break; 621 622 case kRegexpPlus: 623 info = Plus(child_args[0]); 624 break; 625 626 case kRegexpAnyChar: 627 // Claim nothing, except that it's not empty. 628 info = AnyChar(); 629 break; 630 631 case kRegexpCharClass: 632 info = CClass(re->cc(), latin1()); 633 break; 634 635 case kRegexpCapture: 636 // These don't affect the set of matching strings. 637 info = child_args[0]; 638 break; 639 } 640 641 if (Trace) { 642 VLOG(0) << "BuildInfo " << re->ToString() 643 << ": " << info->ToString(); 644 } 645 646 return info; 647 } 648 649 650 Prefilter* Prefilter::FromRegexp(Regexp* re) { 651 if (re == NULL) 652 return NULL; 653 654 Regexp* simple = re->Simplify(); 655 Prefilter::Info *info = BuildInfo(simple); 656 657 simple->Decref(); 658 if (info == NULL) 659 return NULL; 660 661 Prefilter* m = info->TakeMatch(); 662 663 delete info; 664 return m; 665 } 666 667 string Prefilter::DebugString() const { 668 if (this == NULL) 669 return "<nil>"; 670 671 switch (op_) { 672 default: 673 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; 674 return StringPrintf("op%d", op_); 675 case NONE: 676 return "*no-matches*"; 677 case ATOM: 678 return atom_; 679 case ALL: 680 return ""; 681 case AND: { 682 string s = ""; 683 for (int i = 0; i < subs_->size(); i++) { 684 if (i > 0) 685 s += " "; 686 s += (*subs_)[i]->DebugString(); 687 } 688 return s; 689 } 690 case OR: { 691 string s = "("; 692 for (int i = 0; i < subs_->size(); i++) { 693 if (i > 0) 694 s += "|"; 695 s += (*subs_)[i]->DebugString(); 696 } 697 s += ")"; 698 return s; 699 } 700 } 701 } 702 703 Prefilter* Prefilter::FromRE2(const RE2* re2) { 704 if (re2 == NULL) 705 return NULL; 706 707 Regexp* regexp = re2->Regexp(); 708 if (regexp == NULL) 709 return NULL; 710 711 return FromRegexp(regexp); 712 } 713 714 715 } // namespace re2 716