Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/util.h"
      6 #include "re2/prefilter.h"
      7 #include "re2/re2.h"
      8 #include "re2/unicode_casefold.h"
      9 #include "re2/walker-inl.h"
     10 
     11 namespace re2 {
     12 
     13 static const int Trace = false;
     14 
     15 typedef set<string>::iterator SSIter;
     16 typedef set<string>::const_iterator ConstSSIter;
     17 
     18 static int alloc_id = 100000;  // Used for debugging.
     19 // Initializes a Prefilter, allocating subs_ as necessary.
     20 Prefilter::Prefilter(Op op) {
     21   op_ = op;
     22   subs_ = NULL;
     23   if (op_ == AND || op_ == OR)
     24     subs_ = new vector<Prefilter*>;
     25 
     26   alloc_id_ = alloc_id++;
     27   VLOG(10) << "alloc_id: " << alloc_id_;
     28 }
     29 
     30 // Destroys a Prefilter.
     31 Prefilter::~Prefilter() {
     32   VLOG(10) << "Deleted: " << alloc_id_;
     33   if (subs_) {
     34     for (int i = 0; i < subs_->size(); i++)
     35       delete (*subs_)[i];
     36     delete subs_;
     37     subs_ = NULL;
     38   }
     39 }
     40 
     41 // Simplify if the node is an empty Or or And.
     42 Prefilter* Prefilter::Simplify() {
     43   if (op_ != AND && op_ != OR) {
     44     return this;
     45   }
     46 
     47   // Nothing left in the AND/OR.
     48   if (subs_->size() == 0) {
     49     if (op_ == AND)
     50       op_ = ALL;  // AND of nothing is true
     51     else
     52       op_ = NONE;  // OR of nothing is false
     53 
     54     return this;
     55   }
     56 
     57   // Just one subnode: throw away wrapper.
     58   if (subs_->size() == 1) {
     59     Prefilter* a = (*subs_)[0];
     60     subs_->clear();
     61     delete this;
     62     return a->Simplify();
     63   }
     64 
     65   return this;
     66 }
     67 
     68 // Combines two Prefilters together to create an "op" (AND or OR).
     69 // The passed Prefilters will be part of the returned Prefilter or deleted.
     70 // Does lots of work to avoid creating unnecessarily complicated structures.
     71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
     72   // If a, b can be rewritten as op, do so.
     73   a = a->Simplify();
     74   b = b->Simplify();
     75 
     76   // Canonicalize: a->op <= b->op.
     77   if (a->op() > b->op()) {
     78     Prefilter* t = a;
     79     a = b;
     80     b = t;
     81   }
     82 
     83   // Trivial cases.
     84   //    ALL AND b = b
     85   //    NONE OR b = b
     86   //    ALL OR b   = ALL
     87   //    NONE AND b = NONE
     88   // Don't need to look at b, because of canonicalization above.
     89   // ALL and NONE are smallest opcodes.
     90   if (a->op() == ALL || a->op() == NONE) {
     91     if ((a->op() == ALL && op == AND) ||
     92         (a->op() == NONE && op == OR)) {
     93       delete a;
     94       return b;
     95     } else {
     96       delete b;
     97       return a;
     98     }
     99   }
    100 
    101   // If a and b match op, merge their contents.
    102   if (a->op() == op && b->op() == op) {
    103     for (int i = 0; i < b->subs()->size(); i++) {
    104       Prefilter* bb = (*b->subs())[i];
    105       a->subs()->push_back(bb);
    106     }
    107     b->subs()->clear();
    108     delete b;
    109     return a;
    110   }
    111 
    112   // If a already has the same op as the op that is under construction
    113   // add in b (similarly if b already has the same op, add in a).
    114   if (b->op() == op) {
    115     Prefilter* t = a;
    116     a = b;
    117     b = t;
    118   }
    119   if (a->op() == op) {
    120     a->subs()->push_back(b);
    121     return a;
    122   }
    123 
    124   // Otherwise just return the op.
    125   Prefilter* c = new Prefilter(op);
    126   c->subs()->push_back(a);
    127   c->subs()->push_back(b);
    128   return c;
    129 }
    130 
    131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
    132   return AndOr(AND, a, b);
    133 }
    134 
    135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
    136   return AndOr(OR, a, b);
    137 }
    138 
    139 static void SimplifyStringSet(set<string> *ss) {
    140   // Now make sure that the strings aren't redundant.  For example, if
    141   // we know "ab" is a required string, then it doesn't help at all to
    142   // know that "abc" is also a required string, so delete "abc". This
    143   // is because, when we are performing a string search to filter
    144   // regexps, matching ab will already allow this regexp to be a
    145   // candidate for match, so further matching abc is redundant.
    146 
    147   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
    148     SSIter j = i;
    149     ++j;
    150     while (j != ss->end()) {
    151       // Increment j early so that we can erase the element it points to.
    152       SSIter old_j = j;
    153       ++j;
    154       if (old_j->find(*i) != string::npos)
    155         ss->erase(old_j);
    156     }
    157   }
    158 }
    159 
    160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
    161   SimplifyStringSet(ss);
    162   Prefilter* or_prefilter = NULL;
    163   if (!ss->empty()) {
    164     or_prefilter = new Prefilter(NONE);
    165     for (SSIter i = ss->begin(); i != ss->end(); ++i)
    166       or_prefilter = Or(or_prefilter, FromString(*i));
    167   }
    168   return or_prefilter;
    169 }
    170 
    171 static Rune ToLowerRune(Rune r) {
    172   if (r < Runeself) {
    173     if ('A' <= r && r <= 'Z')
    174       r += 'a' - 'A';
    175     return r;
    176   }
    177 
    178   CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    179   if (f == NULL || r < f->lo)
    180     return r;
    181   return ApplyFold(f, r);
    182 }
    183 
    184 static Rune ToLowerRuneLatin1(Rune r) {
    185   if ('A' <= r && r <= 'Z')
    186     r += 'a' - 'A';
    187   return r;
    188 }
    189 
    190 Prefilter* Prefilter::FromString(const string& str) {
    191   Prefilter* m = new Prefilter(Prefilter::ATOM);
    192   m->atom_ = str;
    193   return m;
    194 }
    195 
    196 // Information about a regexp used during computation of Prefilter.
    197 // Can be thought of as information about the set of strings matching
    198 // the given regular expression.
    199 class Prefilter::Info {
    200  public:
    201   Info();
    202   ~Info();
    203 
    204   // More constructors.  They delete their Info* arguments.
    205   static Info* Alt(Info* a, Info* b);
    206   static Info* Concat(Info* a, Info* b);
    207   static Info* And(Info* a, Info* b);
    208   static Info* Star(Info* a);
    209   static Info* Plus(Info* a);
    210   static Info* Quest(Info* a);
    211   static Info* EmptyString();
    212   static Info* NoMatch();
    213   static Info* AnyChar();
    214   static Info* CClass(CharClass* cc, bool latin1);
    215   static Info* Literal(Rune r);
    216   static Info* LiteralLatin1(Rune r);
    217   static Info* AnyMatch();
    218 
    219   // Format Info as a string.
    220   string ToString();
    221 
    222   // Caller takes ownership of the Prefilter.
    223   Prefilter* TakeMatch();
    224 
    225   set<string>& exact() { return exact_; }
    226 
    227   bool is_exact() const { return is_exact_; }
    228 
    229   class Walker;
    230 
    231  private:
    232   set<string> exact_;
    233 
    234   // When is_exact_ is true, the strings that match
    235   // are placed in exact_. When it is no longer an exact
    236   // set of strings that match this RE, then is_exact_
    237   // is false and the match_ contains the required match
    238   // criteria.
    239   bool is_exact_;
    240 
    241   // Accumulated Prefilter query that any
    242   // match for this regexp is guaranteed to match.
    243   Prefilter* match_;
    244 };
    245 
    246 
    247 Prefilter::Info::Info()
    248   : is_exact_(false),
    249     match_(NULL) {
    250 }
    251 
    252 Prefilter::Info::~Info() {
    253   delete match_;
    254 }
    255 
    256 Prefilter* Prefilter::Info::TakeMatch() {
    257   if (is_exact_) {
    258     match_ = Prefilter::OrStrings(&exact_);
    259     is_exact_ = false;
    260   }
    261   Prefilter* m = match_;
    262   match_ = NULL;
    263   return m;
    264 }
    265 
    266 // Format a Info in string form.
    267 string Prefilter::Info::ToString() {
    268   if (this == NULL) {
    269     // Sometimes when iterating on children of a node,
    270     // some children might have NULL Info. Adding
    271     // the check here for NULL to take care of cases where
    272     // the caller is not checking.
    273     return "";
    274   }
    275 
    276   if (is_exact_) {
    277     int n = 0;
    278     string s;
    279     for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
    280       if (n++ > 0)
    281         s += ",";
    282       s += *i;
    283     }
    284     return s;
    285   }
    286 
    287   if (match_)
    288     return match_->DebugString();
    289 
    290   return "";
    291 }
    292 
    293 // Add the strings from src to dst.
    294 static void CopyIn(const set<string>& src, set<string>* dst) {
    295   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
    296     dst->insert(*i);
    297 }
    298 
    299 // Add the cross-product of a and b to dst.
    300 // (For each string i in a and j in b, add i+j.)
    301 static void CrossProduct(const set<string>& a,
    302                          const set<string>& b,
    303                          set<string>* dst) {
    304   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
    305     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
    306       dst->insert(*i + *j);
    307 }
    308 
    309 // Concats a and b. Requires that both are exact sets.
    310 // Forms an exact set that is a crossproduct of a and b.
    311 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
    312   if (a == NULL)
    313     return b;
    314   DCHECK(a->is_exact_);
    315   DCHECK(b && b->is_exact_);
    316   Info *ab = new Info();
    317 
    318   CrossProduct(a->exact_, b->exact_, &ab->exact_);
    319   ab->is_exact_ = true;
    320 
    321   delete a;
    322   delete b;
    323   return ab;
    324 }
    325 
    326 // Constructs an inexact Info for ab given a and b.
    327 // Used only when a or b is not exact or when the
    328 // exact cross product is likely to be too big.
    329 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
    330   if (a == NULL)
    331     return b;
    332   if (b == NULL)
    333     return a;
    334 
    335   Info *ab = new Info();
    336 
    337   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
    338   ab->is_exact_ = false;
    339   delete a;
    340   delete b;
    341   return ab;
    342 }
    343 
    344 // Constructs Info for a|b given a and b.
    345 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
    346   Info *ab = new Info();
    347 
    348   if (a->is_exact_ && b->is_exact_) {
    349     CopyIn(a->exact_, &ab->exact_);
    350     CopyIn(b->exact_, &ab->exact_);
    351     ab->is_exact_ = true;
    352   } else {
    353     // Either a or b has is_exact_ = false. If the other
    354     // one has is_exact_ = true, we move it to match_ and
    355     // then create a OR of a,b. The resulting Info has
    356     // is_exact_ = false.
    357     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    358     ab->is_exact_ = false;
    359   }
    360 
    361   delete a;
    362   delete b;
    363   return ab;
    364 }
    365 
    366 // Constructs Info for a? given a.
    367 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
    368   Info *ab = new Info();
    369 
    370   ab->is_exact_ = false;
    371   ab->match_ = new Prefilter(ALL);
    372   delete a;
    373   return ab;
    374 }
    375 
    376 // Constructs Info for a* given a.
    377 // Same as a? -- not much to do.
    378 Prefilter::Info* Prefilter::Info::Star(Info *a) {
    379   return Quest(a);
    380 }
    381 
    382 // Constructs Info for a+ given a. If a was exact set, it isn't
    383 // anymore.
    384 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
    385   Info *ab = new Info();
    386 
    387   ab->match_ = a->TakeMatch();
    388   ab->is_exact_ = false;
    389 
    390   delete a;
    391   return ab;
    392 }
    393 
    394 static string RuneToString(Rune r) {
    395   char buf[UTFmax];
    396   int n = runetochar(buf, &r);
    397   return string(buf, n);
    398 }
    399 
    400 static string RuneToStringLatin1(Rune r) {
    401   char c = r & 0xff;
    402   return string(&c, 1);
    403 }
    404 
    405 // Constructs Info for literal rune.
    406 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
    407   Info* info = new Info();
    408   info->exact_.insert(RuneToString(ToLowerRune(r)));
    409   info->is_exact_ = true;
    410   return info;
    411 }
    412 
    413 // Constructs Info for literal rune for Latin1 encoded string.
    414 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
    415   Info* info = new Info();
    416   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
    417   info->is_exact_ = true;
    418   return info;
    419 }
    420 
    421 // Constructs Info for dot (any character).
    422 Prefilter::Info* Prefilter::Info::AnyChar() {
    423   Prefilter::Info* info = new Prefilter::Info();
    424   info->match_ = new Prefilter(ALL);
    425   return info;
    426 }
    427 
    428 // Constructs Prefilter::Info for no possible match.
    429 Prefilter::Info* Prefilter::Info::NoMatch() {
    430   Prefilter::Info* info = new Prefilter::Info();
    431   info->match_ = new Prefilter(NONE);
    432   return info;
    433 }
    434 
    435 // Constructs Prefilter::Info for any possible match.
    436 // This Prefilter::Info is valid for any regular expression,
    437 // since it makes no assertions whatsoever about the
    438 // strings being matched.
    439 Prefilter::Info* Prefilter::Info::AnyMatch() {
    440   Prefilter::Info *info = new Prefilter::Info();
    441   info->match_ = new Prefilter(ALL);
    442   return info;
    443 }
    444 
    445 // Constructs Prefilter::Info for just the empty string.
    446 Prefilter::Info* Prefilter::Info::EmptyString() {
    447   Prefilter::Info* info = new Prefilter::Info();
    448   info->is_exact_ = true;
    449   info->exact_.insert("");
    450   return info;
    451 }
    452 
    453 // Constructs Prefilter::Info for a character class.
    454 typedef CharClass::iterator CCIter;
    455 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
    456                                          bool latin1) {
    457   if (Trace) {
    458     VLOG(0) << "CharClassInfo:";
    459     for (CCIter i = cc->begin(); i != cc->end(); ++i)
    460       VLOG(0) << "  " << i->lo << "-" << i->hi;
    461   }
    462 
    463   // If the class is too large, it's okay to overestimate.
    464   if (cc->size() > 10)
    465     return AnyChar();
    466 
    467   Prefilter::Info *a = new Prefilter::Info();
    468   for (CCIter i = cc->begin(); i != cc->end(); ++i)
    469     for (Rune r = i->lo; r <= i->hi; r++) {
    470       if (latin1) {
    471         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
    472       } else {
    473         a->exact_.insert(RuneToString(ToLowerRune(r)));
    474       }
    475     }
    476 
    477 
    478   a->is_exact_ = true;
    479 
    480   if (Trace) {
    481     VLOG(0) << " = " << a->ToString();
    482   }
    483 
    484   return a;
    485 }
    486 
    487 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
    488  public:
    489   Walker(bool latin1) : latin1_(latin1) {}
    490 
    491   virtual Info* PostVisit(
    492       Regexp* re, Info* parent_arg,
    493       Info* pre_arg,
    494       Info** child_args, int nchild_args);
    495 
    496   virtual Info* ShortVisit(
    497       Regexp* re,
    498       Info* parent_arg);
    499 
    500   bool latin1() { return latin1_; }
    501  private:
    502   bool latin1_;
    503   DISALLOW_EVIL_CONSTRUCTORS(Walker);
    504 };
    505 
    506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
    507   if (Trace) {
    508     LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
    509   }
    510 
    511   bool latin1 = re->parse_flags() & Regexp::Latin1;
    512   Prefilter::Info::Walker w(latin1);
    513   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
    514 
    515   if (w.stopped_early()) {
    516     delete info;
    517     return NULL;
    518   }
    519 
    520   return info;
    521 }
    522 
    523 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    524     Regexp* re, Prefilter::Info* parent_arg) {
    525   return AnyMatch();
    526 }
    527 
    528 // Constructs the Prefilter::Info for the given regular expression.
    529 // Assumes re is simplified.
    530 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
    531     Regexp* re, Prefilter::Info* parent_arg,
    532     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
    533     int nchild_args) {
    534   Prefilter::Info *info;
    535   switch (re->op()) {
    536     default:
    537     case kRegexpRepeat:
    538       LOG(DFATAL) << "Bad regexp op " << re->op();
    539       info = EmptyString();
    540       break;
    541 
    542     case kRegexpNoMatch:
    543       info = NoMatch();
    544       break;
    545 
    546     // These ops match the empty string:
    547     case kRegexpEmptyMatch:      // anywhere
    548     case kRegexpBeginLine:       // at beginning of line
    549     case kRegexpEndLine:         // at end of line
    550     case kRegexpBeginText:       // at beginning of text
    551     case kRegexpEndText:         // at end of text
    552     case kRegexpWordBoundary:    // at word boundary
    553     case kRegexpNoWordBoundary:  // not at word boundary
    554       info = EmptyString();
    555       break;
    556 
    557     case kRegexpLiteral:
    558       if (latin1()) {
    559         info = LiteralLatin1(re->rune());
    560       }
    561       else {
    562         info = Literal(re->rune());
    563       }
    564       break;
    565 
    566     case kRegexpLiteralString:
    567       if (re->nrunes() == 0) {
    568         info = NoMatch();
    569         break;
    570       }
    571       if (latin1()) {
    572         info = LiteralLatin1(re->runes()[0]);
    573         for (int i = 1; i < re->nrunes(); i++) {
    574           info = Concat(info, LiteralLatin1(re->runes()[i]));
    575         }
    576       } else {
    577         info = Literal(re->runes()[0]);
    578         for (int i = 1; i < re->nrunes(); i++) {
    579           info = Concat(info, Literal(re->runes()[i]));
    580         }
    581       }
    582       break;
    583 
    584     case kRegexpConcat: {
    585       // Accumulate in info.
    586       // Exact is concat of recent contiguous exact nodes.
    587       info = NULL;
    588       Info* exact = NULL;
    589       for (int i = 0; i < nchild_args; i++) {
    590         Info* ci = child_args[i];  // child info
    591         if (!ci->is_exact() ||
    592             (exact && ci->exact().size() * exact->exact().size() > 16)) {
    593           // Exact run is over.
    594           info = And(info, exact);
    595           exact = NULL;
    596           // Add this child's info.
    597           info = And(info, ci);
    598         } else {
    599           // Append to exact run.
    600           exact = Concat(exact, ci);
    601         }
    602       }
    603       info = And(info, exact);
    604     }
    605       break;
    606 
    607     case kRegexpAlternate:
    608       info = child_args[0];
    609       for (int i = 1; i < nchild_args; i++)
    610         info = Alt(info, child_args[i]);
    611       VLOG(10) << "Alt: " << info->ToString();
    612       break;
    613 
    614     case kRegexpStar:
    615       info = Star(child_args[0]);
    616       break;
    617 
    618     case kRegexpQuest:
    619       info = Quest(child_args[0]);
    620       break;
    621 
    622     case kRegexpPlus:
    623       info = Plus(child_args[0]);
    624       break;
    625 
    626     case kRegexpAnyChar:
    627       // Claim nothing, except that it's not empty.
    628       info = AnyChar();
    629       break;
    630 
    631     case kRegexpCharClass:
    632       info = CClass(re->cc(), latin1());
    633       break;
    634 
    635     case kRegexpCapture:
    636       // These don't affect the set of matching strings.
    637       info = child_args[0];
    638       break;
    639   }
    640 
    641   if (Trace) {
    642     VLOG(0) << "BuildInfo " << re->ToString()
    643             << ": " << info->ToString();
    644   }
    645 
    646   return info;
    647 }
    648 
    649 
    650 Prefilter* Prefilter::FromRegexp(Regexp* re) {
    651   if (re == NULL)
    652     return NULL;
    653 
    654   Regexp* simple = re->Simplify();
    655   Prefilter::Info *info = BuildInfo(simple);
    656 
    657   simple->Decref();
    658   if (info == NULL)
    659     return NULL;
    660 
    661   Prefilter* m = info->TakeMatch();
    662 
    663   delete info;
    664   return m;
    665 }
    666 
    667 string Prefilter::DebugString() const {
    668   if (this == NULL)
    669     return "<nil>";
    670 
    671   switch (op_) {
    672     default:
    673       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
    674       return StringPrintf("op%d", op_);
    675     case NONE:
    676       return "*no-matches*";
    677     case ATOM:
    678       return atom_;
    679     case ALL:
    680       return "";
    681     case AND: {
    682       string s = "";
    683       for (int i = 0; i < subs_->size(); i++) {
    684         if (i > 0)
    685           s += " ";
    686         s += (*subs_)[i]->DebugString();
    687       }
    688       return s;
    689     }
    690     case OR: {
    691       string s = "(";
    692       for (int i = 0; i < subs_->size(); i++) {
    693         if (i > 0)
    694           s += "|";
    695         s += (*subs_)[i]->DebugString();
    696       }
    697       s += ")";
    698       return s;
    699     }
    700   }
    701 }
    702 
    703 Prefilter* Prefilter::FromRE2(const RE2* re2) {
    704   if (re2 == NULL)
    705     return NULL;
    706 
    707   Regexp* regexp = re2->Regexp();
    708   if (regexp == NULL)
    709     return NULL;
    710 
    711   return FromRegexp(regexp);
    712 }
    713 
    714 
    715 }  // namespace re2
    716