Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/util.h"
      6 #include "re2/prefilter.h"
      7 #include "re2/re2.h"
      8 #include "re2/unicode_casefold.h"
      9 #include "re2/walker-inl.h"
     10 
     11 namespace re2 {
     12 
     13 static const int Trace = false;
     14 
     15 typedef set<string>::iterator SSIter;
     16 typedef set<string>::const_iterator ConstSSIter;
     17 
     18 static int alloc_id = 100000;  // Used for debugging.
     19 // Initializes a Prefilter, allocating subs_ as necessary.
     20 Prefilter::Prefilter(Op op) {
     21   op_ = op;
     22   subs_ = NULL;
     23   if (op_ == AND || op_ == OR)
     24     subs_ = new vector<Prefilter*>;
     25 
     26   alloc_id_ = alloc_id++;
     27   VLOG(10) << "alloc_id: " << alloc_id_;
     28 }
     29 
     30 // Destroys a Prefilter.
     31 Prefilter::~Prefilter() {
     32   VLOG(10) << "Deleted: " << alloc_id_;
     33   if (subs_) {
     34     for (int i = 0; i < subs_->size(); i++)
     35       delete (*subs_)[i];
     36     delete subs_;
     37     subs_ = NULL;
     38   }
     39 }
     40 
     41 // Simplify if the node is an empty Or or And.
     42 Prefilter* Prefilter::Simplify() {
     43   if (op_ != AND && op_ != OR) {
     44     return this;
     45   }
     46 
     47   // Nothing left in the AND/OR.
     48   if (subs_->size() == 0) {
     49     if (op_ == AND)
     50       op_ = ALL;  // AND of nothing is true
     51     else
     52       op_ = NONE;  // OR of nothing is false
     53 
     54     return this;
     55   }
     56 
     57   // Just one subnode: throw away wrapper.
     58   if (subs_->size() == 1) {
     59     Prefilter* a = (*subs_)[0];
     60     subs_->clear();
     61     delete this;
     62     return a->Simplify();
     63   }
     64 
     65   return this;
     66 }
     67 
     68 // Combines two Prefilters together to create an "op" (AND or OR).
     69 // The passed Prefilters will be part of the returned Prefilter or deleted.
     70 // Does lots of work to avoid creating unnecessarily complicated structures.
     71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
     72   // If a, b can be rewritten as op, do so.
     73   a = a->Simplify();
     74   b = b->Simplify();
     75 
     76   // Canonicalize: a->op <= b->op.
     77   if (a->op() > b->op()) {
     78     Prefilter* t = a;
     79     a = b;
     80     b = t;
     81   }
     82 
     83   // Trivial cases.
     84   //    ALL AND b = b
     85   //    NONE OR b = b
     86   //    ALL OR b   = ALL
     87   //    NONE AND b = NONE
     88   // Don't need to look at b, because of canonicalization above.
     89   // ALL and NONE are smallest opcodes.
     90   if (a->op() == ALL || a->op() == NONE) {
     91     if ((a->op() == ALL && op == AND) ||
     92         (a->op() == NONE && op == OR)) {
     93       delete a;
     94       return b;
     95     } else {
     96       delete b;
     97       return a;
     98     }
     99   }
    100 
    101   // If a and b match op, merge their contents.
    102   if (a->op() == op && b->op() == op) {
    103     for (int i = 0; i < b->subs()->size(); i++) {
    104       Prefilter* bb = (*b->subs())[i];
    105       a->subs()->push_back(bb);
    106     }
    107     b->subs()->clear();
    108     delete b;
    109     return a;
    110   }
    111 
    112   // If a already has the same op as the op that is under construction
    113   // add in b (similarly if b already has the same op, add in a).
    114   if (b->op() == op) {
    115     Prefilter* t = a;
    116     a = b;
    117     b = t;
    118   }
    119   if (a->op() == op) {
    120     a->subs()->push_back(b);
    121     return a;
    122   }
    123 
    124   // Otherwise just return the op.
    125   Prefilter* c = new Prefilter(op);
    126   c->subs()->push_back(a);
    127   c->subs()->push_back(b);
    128   return c;
    129 }
    130 
    131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
    132   return AndOr(AND, a, b);
    133 }
    134 
    135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
    136   return AndOr(OR, a, b);
    137 }
    138 
    139 static void SimplifyStringSet(set<string> *ss) {
    140   // Now make sure that the strings aren't redundant.  For example, if
    141   // we know "ab" is a required string, then it doesn't help at all to
    142   // know that "abc" is also a required string, so delete "abc". This
    143   // is because, when we are performing a string search to filter
    144   // regexps, matching ab will already allow this regexp to be a
    145   // candidate for match, so further matching abc is redundant.
    146 
    147   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
    148     SSIter j = i;
    149     ++j;
    150     while (j != ss->end()) {
    151       // Increment j early so that we can erase the element it points to.
    152       SSIter old_j = j;
    153       ++j;
    154       if (old_j->find(*i) != string::npos)
    155         ss->erase(old_j);
    156     }
    157   }
    158 }
    159 
    160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
    161   SimplifyStringSet(ss);
    162   Prefilter* or_prefilter = NULL;
    163   if (!ss->empty()) {
    164     or_prefilter = new Prefilter(NONE);
    165     for (SSIter i = ss->begin(); i != ss->end(); ++i)
    166       or_prefilter = Or(or_prefilter, FromString(*i));
    167   }
    168   return or_prefilter;
    169 }
    170 
    171 static Rune ToLowerRune(Rune r) {
    172   if (r < Runeself) {
    173     if ('A' <= r && r <= 'Z')
    174       r += 'a' - 'A';
    175     return r;
    176   }
    177 
    178   CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    179   if (f == NULL || r < f->lo)
    180     return r;
    181   return ApplyFold(f, r);
    182 }
    183 
    184 static Rune ToLowerRuneLatin1(Rune r) {
    185   if ('A' <= r && r <= 'Z')
    186     r += 'a' - 'A';
    187   return r;
    188 }
    189 
    190 Prefilter* Prefilter::FromString(const string& str) {
    191   Prefilter* m = new Prefilter(Prefilter::ATOM);
    192   m->atom_ = str;
    193   return m;
    194 }
    195 
    196 // Information about a regexp used during computation of Prefilter.
    197 // Can be thought of as information about the set of strings matching
    198 // the given regular expression.
    199 class Prefilter::Info {
    200  public:
    201   Info();
    202   ~Info();
    203 
    204   // More constructors.  They delete their Info* arguments.
    205   static Info* Alt(Info* a, Info* b);
    206   static Info* Concat(Info* a, Info* b);
    207   static Info* And(Info* a, Info* b);
    208   static Info* Star(Info* a);
    209   static Info* Plus(Info* a);
    210   static Info* Quest(Info* a);
    211   static Info* EmptyString();
    212   static Info* NoMatch();
    213   static Info* AnyChar();
    214   static Info* CClass(CharClass* cc, bool latin1);
    215   static Info* Literal(Rune r);
    216   static Info* LiteralLatin1(Rune r);
    217   static Info* AnyMatch();
    218 
    219   // Format Info as a string.
    220   string ToString();
    221 
    222   // Caller takes ownership of the Prefilter.
    223   Prefilter* TakeMatch();
    224 
    225   set<string>& exact() { return exact_; }
    226 
    227   bool is_exact() const { return is_exact_; }
    228 
    229   class Walker;
    230 
    231  private:
    232   set<string> exact_;
    233 
    234   // When is_exact_ is true, the strings that match
    235   // are placed in exact_. When it is no longer an exact
    236   // set of strings that match this RE, then is_exact_
    237   // is false and the match_ contains the required match
    238   // criteria.
    239   bool is_exact_;
    240 
    241   // Accumulated Prefilter query that any
    242   // match for this regexp is guaranteed to match.
    243   Prefilter* match_;
    244 };
    245 
    246 
    247 Prefilter::Info::Info()
    248   : is_exact_(false),
    249     match_(NULL) {
    250 }
    251 
    252 Prefilter::Info::~Info() {
    253   delete match_;
    254 }
    255 
    256 Prefilter* Prefilter::Info::TakeMatch() {
    257   if (is_exact_) {
    258     match_ = Prefilter::OrStrings(&exact_);
    259     is_exact_ = false;
    260   }
    261   Prefilter* m = match_;
    262   match_ = NULL;
    263   return m;
    264 }
    265 
    266 // Format a Info in string form.
    267 string Prefilter::Info::ToString() {
    268   if (is_exact_) {
    269     int n = 0;
    270     string s;
    271     for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
    272       if (n++ > 0)
    273         s += ",";
    274       s += *i;
    275     }
    276     return s;
    277   }
    278 
    279   if (match_)
    280     return match_->DebugString();
    281 
    282   return "";
    283 }
    284 
    285 // Add the strings from src to dst.
    286 static void CopyIn(const set<string>& src, set<string>* dst) {
    287   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
    288     dst->insert(*i);
    289 }
    290 
    291 // Add the cross-product of a and b to dst.
    292 // (For each string i in a and j in b, add i+j.)
    293 static void CrossProduct(const set<string>& a,
    294                          const set<string>& b,
    295                          set<string>* dst) {
    296   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
    297     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
    298       dst->insert(*i + *j);
    299 }
    300 
    301 // Concats a and b. Requires that both are exact sets.
    302 // Forms an exact set that is a crossproduct of a and b.
    303 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
    304   if (a == NULL)
    305     return b;
    306   DCHECK(a->is_exact_);
    307   DCHECK(b && b->is_exact_);
    308   Info *ab = new Info();
    309 
    310   CrossProduct(a->exact_, b->exact_, &ab->exact_);
    311   ab->is_exact_ = true;
    312 
    313   delete a;
    314   delete b;
    315   return ab;
    316 }
    317 
    318 // Constructs an inexact Info for ab given a and b.
    319 // Used only when a or b is not exact or when the
    320 // exact cross product is likely to be too big.
    321 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
    322   if (a == NULL)
    323     return b;
    324   if (b == NULL)
    325     return a;
    326 
    327   Info *ab = new Info();
    328 
    329   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
    330   ab->is_exact_ = false;
    331   delete a;
    332   delete b;
    333   return ab;
    334 }
    335 
    336 // Constructs Info for a|b given a and b.
    337 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
    338   Info *ab = new Info();
    339 
    340   if (a->is_exact_ && b->is_exact_) {
    341     CopyIn(a->exact_, &ab->exact_);
    342     CopyIn(b->exact_, &ab->exact_);
    343     ab->is_exact_ = true;
    344   } else {
    345     // Either a or b has is_exact_ = false. If the other
    346     // one has is_exact_ = true, we move it to match_ and
    347     // then create a OR of a,b. The resulting Info has
    348     // is_exact_ = false.
    349     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    350     ab->is_exact_ = false;
    351   }
    352 
    353   delete a;
    354   delete b;
    355   return ab;
    356 }
    357 
    358 // Constructs Info for a? given a.
    359 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
    360   Info *ab = new Info();
    361 
    362   ab->is_exact_ = false;
    363   ab->match_ = new Prefilter(ALL);
    364   delete a;
    365   return ab;
    366 }
    367 
    368 // Constructs Info for a* given a.
    369 // Same as a? -- not much to do.
    370 Prefilter::Info* Prefilter::Info::Star(Info *a) {
    371   return Quest(a);
    372 }
    373 
    374 // Constructs Info for a+ given a. If a was exact set, it isn't
    375 // anymore.
    376 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
    377   Info *ab = new Info();
    378 
    379   ab->match_ = a->TakeMatch();
    380   ab->is_exact_ = false;
    381 
    382   delete a;
    383   return ab;
    384 }
    385 
    386 static string RuneToString(Rune r) {
    387   char buf[UTFmax];
    388   int n = runetochar(buf, &r);
    389   return string(buf, n);
    390 }
    391 
    392 static string RuneToStringLatin1(Rune r) {
    393   char c = r & 0xff;
    394   return string(&c, 1);
    395 }
    396 
    397 // Constructs Info for literal rune.
    398 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
    399   Info* info = new Info();
    400   info->exact_.insert(RuneToString(ToLowerRune(r)));
    401   info->is_exact_ = true;
    402   return info;
    403 }
    404 
    405 // Constructs Info for literal rune for Latin1 encoded string.
    406 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
    407   Info* info = new Info();
    408   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
    409   info->is_exact_ = true;
    410   return info;
    411 }
    412 
    413 // Constructs Info for dot (any character).
    414 Prefilter::Info* Prefilter::Info::AnyChar() {
    415   Prefilter::Info* info = new Prefilter::Info();
    416   info->match_ = new Prefilter(ALL);
    417   return info;
    418 }
    419 
    420 // Constructs Prefilter::Info for no possible match.
    421 Prefilter::Info* Prefilter::Info::NoMatch() {
    422   Prefilter::Info* info = new Prefilter::Info();
    423   info->match_ = new Prefilter(NONE);
    424   return info;
    425 }
    426 
    427 // Constructs Prefilter::Info for any possible match.
    428 // This Prefilter::Info is valid for any regular expression,
    429 // since it makes no assertions whatsoever about the
    430 // strings being matched.
    431 Prefilter::Info* Prefilter::Info::AnyMatch() {
    432   Prefilter::Info *info = new Prefilter::Info();
    433   info->match_ = new Prefilter(ALL);
    434   return info;
    435 }
    436 
    437 // Constructs Prefilter::Info for just the empty string.
    438 Prefilter::Info* Prefilter::Info::EmptyString() {
    439   Prefilter::Info* info = new Prefilter::Info();
    440   info->is_exact_ = true;
    441   info->exact_.insert("");
    442   return info;
    443 }
    444 
    445 // Constructs Prefilter::Info for a character class.
    446 typedef CharClass::iterator CCIter;
    447 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
    448                                          bool latin1) {
    449   if (Trace) {
    450     VLOG(0) << "CharClassInfo:";
    451     for (CCIter i = cc->begin(); i != cc->end(); ++i)
    452       VLOG(0) << "  " << i->lo << "-" << i->hi;
    453   }
    454 
    455   // If the class is too large, it's okay to overestimate.
    456   if (cc->size() > 10)
    457     return AnyChar();
    458 
    459   Prefilter::Info *a = new Prefilter::Info();
    460   for (CCIter i = cc->begin(); i != cc->end(); ++i)
    461     for (Rune r = i->lo; r <= i->hi; r++) {
    462       if (latin1) {
    463         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
    464       } else {
    465         a->exact_.insert(RuneToString(ToLowerRune(r)));
    466       }
    467     }
    468 
    469 
    470   a->is_exact_ = true;
    471 
    472   if (Trace) {
    473     VLOG(0) << " = " << a->ToString();
    474   }
    475 
    476   return a;
    477 }
    478 
    479 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
    480  public:
    481   Walker(bool latin1) : latin1_(latin1) {}
    482 
    483   virtual Info* PostVisit(
    484       Regexp* re, Info* parent_arg,
    485       Info* pre_arg,
    486       Info** child_args, int nchild_args);
    487 
    488   virtual Info* ShortVisit(
    489       Regexp* re,
    490       Info* parent_arg);
    491 
    492   bool latin1() { return latin1_; }
    493  private:
    494   bool latin1_;
    495   DISALLOW_EVIL_CONSTRUCTORS(Walker);
    496 };
    497 
    498 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
    499   if (Trace) {
    500     LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
    501   }
    502 
    503   bool latin1 = re->parse_flags() & Regexp::Latin1;
    504   Prefilter::Info::Walker w(latin1);
    505   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
    506 
    507   if (w.stopped_early()) {
    508     delete info;
    509     return NULL;
    510   }
    511 
    512   return info;
    513 }
    514 
    515 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    516     Regexp* re, Prefilter::Info* parent_arg) {
    517   return AnyMatch();
    518 }
    519 
    520 // Constructs the Prefilter::Info for the given regular expression.
    521 // Assumes re is simplified.
    522 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
    523     Regexp* re, Prefilter::Info* parent_arg,
    524     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
    525     int nchild_args) {
    526   Prefilter::Info *info;
    527   switch (re->op()) {
    528     default:
    529     case kRegexpRepeat:
    530       LOG(DFATAL) << "Bad regexp op " << re->op();
    531       info = EmptyString();
    532       break;
    533 
    534     case kRegexpNoMatch:
    535       info = NoMatch();
    536       break;
    537 
    538     // These ops match the empty string:
    539     case kRegexpEmptyMatch:      // anywhere
    540     case kRegexpBeginLine:       // at beginning of line
    541     case kRegexpEndLine:         // at end of line
    542     case kRegexpBeginText:       // at beginning of text
    543     case kRegexpEndText:         // at end of text
    544     case kRegexpWordBoundary:    // at word boundary
    545     case kRegexpNoWordBoundary:  // not at word boundary
    546       info = EmptyString();
    547       break;
    548 
    549     case kRegexpLiteral:
    550       if (latin1()) {
    551         info = LiteralLatin1(re->rune());
    552       }
    553       else {
    554         info = Literal(re->rune());
    555       }
    556       break;
    557 
    558     case kRegexpLiteralString:
    559       if (re->nrunes() == 0) {
    560         info = NoMatch();
    561         break;
    562       }
    563       if (latin1()) {
    564         info = LiteralLatin1(re->runes()[0]);
    565         for (int i = 1; i < re->nrunes(); i++) {
    566           info = Concat(info, LiteralLatin1(re->runes()[i]));
    567         }
    568       } else {
    569         info = Literal(re->runes()[0]);
    570         for (int i = 1; i < re->nrunes(); i++) {
    571           info = Concat(info, Literal(re->runes()[i]));
    572         }
    573       }
    574       break;
    575 
    576     case kRegexpConcat: {
    577       // Accumulate in info.
    578       // Exact is concat of recent contiguous exact nodes.
    579       info = NULL;
    580       Info* exact = NULL;
    581       for (int i = 0; i < nchild_args; i++) {
    582         Info* ci = child_args[i];  // child info
    583         if (!ci->is_exact() ||
    584             (exact && ci->exact().size() * exact->exact().size() > 16)) {
    585           // Exact run is over.
    586           info = And(info, exact);
    587           exact = NULL;
    588           // Add this child's info.
    589           info = And(info, ci);
    590         } else {
    591           // Append to exact run.
    592           exact = Concat(exact, ci);
    593         }
    594       }
    595       info = And(info, exact);
    596     }
    597       break;
    598 
    599     case kRegexpAlternate:
    600       info = child_args[0];
    601       for (int i = 1; i < nchild_args; i++)
    602         info = Alt(info, child_args[i]);
    603       VLOG(10) << "Alt: " << info->ToString();
    604       break;
    605 
    606     case kRegexpStar:
    607       info = Star(child_args[0]);
    608       break;
    609 
    610     case kRegexpQuest:
    611       info = Quest(child_args[0]);
    612       break;
    613 
    614     case kRegexpPlus:
    615       info = Plus(child_args[0]);
    616       break;
    617 
    618     case kRegexpAnyChar:
    619       // Claim nothing, except that it's not empty.
    620       info = AnyChar();
    621       break;
    622 
    623     case kRegexpCharClass:
    624       info = CClass(re->cc(), latin1());
    625       break;
    626 
    627     case kRegexpCapture:
    628       // These don't affect the set of matching strings.
    629       info = child_args[0];
    630       break;
    631   }
    632 
    633   if (Trace) {
    634     VLOG(0) << "BuildInfo " << re->ToString()
    635             << ": " << (info ? info->ToString() : "");
    636   }
    637 
    638   return info;
    639 }
    640 
    641 
    642 Prefilter* Prefilter::FromRegexp(Regexp* re) {
    643   if (re == NULL)
    644     return NULL;
    645 
    646   Regexp* simple = re->Simplify();
    647   Prefilter::Info *info = BuildInfo(simple);
    648 
    649   simple->Decref();
    650   if (info == NULL)
    651     return NULL;
    652 
    653   Prefilter* m = info->TakeMatch();
    654 
    655   delete info;
    656   return m;
    657 }
    658 
    659 string Prefilter::DebugString() const {
    660   switch (op_) {
    661     default:
    662       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
    663       return StringPrintf("op%d", op_);
    664     case NONE:
    665       return "*no-matches*";
    666     case ATOM:
    667       return atom_;
    668     case ALL:
    669       return "";
    670     case AND: {
    671       string s = "";
    672       for (int i = 0; i < subs_->size(); i++) {
    673         if (i > 0)
    674           s += " ";
    675         Prefilter* sub = (*subs_)[i];
    676         s += sub ? sub->DebugString() : "<nil>";
    677       }
    678       return s;
    679     }
    680     case OR: {
    681       string s = "(";
    682       for (int i = 0; i < subs_->size(); i++) {
    683         if (i > 0)
    684           s += "|";
    685         Prefilter* sub = (*subs_)[i];
    686         s += sub ? sub->DebugString() : "<nil>";
    687       }
    688       s += ")";
    689       return s;
    690     }
    691   }
    692 }
    693 
    694 Prefilter* Prefilter::FromRE2(const RE2* re2) {
    695   if (re2 == NULL)
    696     return NULL;
    697 
    698   Regexp* regexp = re2->Regexp();
    699   if (regexp == NULL)
    700     return NULL;
    701 
    702   return FromRegexp(regexp);
    703 }
    704 
    705 
    706 }  // namespace re2
    707