Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/util.h"
      6 #include "re2/prefilter.h"
      7 #include "re2/re2.h"
      8 #include "re2/unicode_casefold.h"
      9 #include "re2/walker-inl.h"
     10 
     11 namespace re2 {
     12 
     13 static const int Trace = false;
     14 
     15 typedef set<string>::iterator SSIter;
     16 typedef set<string>::const_iterator ConstSSIter;
     17 
     18 static int alloc_id = 100000;  // Used for debugging.
     19 // Initializes a Prefilter, allocating subs_ as necessary.
     20 Prefilter::Prefilter(Op op) {
     21   op_ = op;
     22   subs_ = NULL;
     23   if (op_ == AND || op_ == OR)
     24     subs_ = new vector<Prefilter*>;
     25 
     26   alloc_id_ = alloc_id++;
     27   VLOG(10) << "alloc_id: " << alloc_id_;
     28 }
     29 
     30 // Destroys a Prefilter.
     31 Prefilter::~Prefilter() {
     32   VLOG(10) << "Deleted: " << alloc_id_;
     33   if (subs_) {
     34     for (int i = 0; i < subs_->size(); i++)
     35       delete (*subs_)[i];
     36     delete subs_;
     37     subs_ = NULL;
     38   }
     39 }
     40 
     41 // Simplify if the node is an empty Or or And.
     42 Prefilter* Prefilter::Simplify() {
     43   if (op_ != AND && op_ != OR) {
     44     return this;
     45   }
     46 
     47   // Nothing left in the AND/OR.
     48   if (subs_->size() == 0) {
     49     if (op_ == AND)
     50       op_ = ALL;  // AND of nothing is true
     51     else
     52       op_ = NONE;  // OR of nothing is false
     53 
     54     return this;
     55   }
     56 
     57   // Just one subnode: throw away wrapper.
     58   if (subs_->size() == 1) {
     59     Prefilter* a = (*subs_)[0];
     60     subs_->clear();
     61     delete this;
     62     return a->Simplify();
     63   }
     64 
     65   return this;
     66 }
     67 
     68 // Combines two Prefilters together to create an "op" (AND or OR).
     69 // The passed Prefilters will be part of the returned Prefilter or deleted.
     70 // Does lots of work to avoid creating unnecessarily complicated structures.
     71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
     72   // If a, b can be rewritten as op, do so.
     73   a = a->Simplify();
     74   b = b->Simplify();
     75 
     76   // Canonicalize: a->op <= b->op.
     77   if (a->op() > b->op()) {
     78     Prefilter* t = a;
     79     a = b;
     80     b = t;
     81   }
     82 
     83   // Trivial cases.
     84   //    ALL AND b = b
     85   //    NONE OR b = b
     86   //    ALL OR b   = ALL
     87   //    NONE AND b = NONE
     88   // Don't need to look at b, because of canonicalization above.
     89   // ALL and NONE are smallest opcodes.
     90   if (a->op() == ALL || a->op() == NONE) {
     91     if ((a->op() == ALL && op == AND) ||
     92         (a->op() == NONE && op == OR)) {
     93       delete a;
     94       return b;
     95     } else {
     96       delete b;
     97       return a;
     98     }
     99   }
    100 
    101   // If a and b match op, merge their contents.
    102   if (a->op() == op && b->op() == op) {
    103     for (int i = 0; i < b->subs()->size(); i++) {
    104       Prefilter* bb = (*b->subs())[i];
    105       a->subs()->push_back(bb);
    106     }
    107     b->subs()->clear();
    108     delete b;
    109     return a;
    110   }
    111 
    112   // If a already has the same op as the op that is under construction
    113   // add in b (similarly if b already has the same op, add in a).
    114   if (b->op() == op) {
    115     Prefilter* t = a;
    116     a = b;
    117     b = t;
    118   }
    119   if (a->op() == op) {
    120     a->subs()->push_back(b);
    121     return a;
    122   }
    123 
    124   // Otherwise just return the op.
    125   Prefilter* c = new Prefilter(op);
    126   c->subs()->push_back(a);
    127   c->subs()->push_back(b);
    128   return c;
    129 }
    130 
    131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
    132   return AndOr(AND, a, b);
    133 }
    134 
    135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
    136   return AndOr(OR, a, b);
    137 }
    138 
    139 static void SimplifyStringSet(set<string> *ss) {
    140   // Now make sure that the strings aren't redundant.  For example, if
    141   // we know "ab" is a required string, then it doesn't help at all to
    142   // know that "abc" is also a required string, so delete "abc". This
    143   // is because, when we are performing a string search to filter
    144   // regexps, matching ab will already allow this regexp to be a
    145   // candidate for match, so further matching abc is redundant.
    146 
    147   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
    148     SSIter j = i;
    149     ++j;
    150     while (j != ss->end()) {
    151       // Increment j early so that we can erase the element it points to.
    152       SSIter old_j = j;
    153       ++j;
    154       if (old_j->find(*i) != string::npos)
    155         ss->erase(old_j);
    156     }
    157   }
    158 }
    159 
    160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
    161   SimplifyStringSet(ss);
    162   Prefilter* or_prefilter = NULL;
    163   if (!ss->empty()) {
    164     or_prefilter = new Prefilter(NONE);
    165     for (SSIter i = ss->begin(); i != ss->end(); ++i)
    166       or_prefilter = Or(or_prefilter, FromString(*i));
    167   }
    168   return or_prefilter;
    169 }
    170 
    171 static Rune ToLowerRune(Rune r) {
    172   if (r < Runeself) {
    173     if ('A' <= r && r <= 'Z')
    174       r += 'a' - 'A';
    175     return r;
    176   }
    177 
    178   CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    179   if (f == NULL || r < f->lo)
    180     return r;
    181   return ApplyFold(f, r);
    182 }
    183 
    184 Prefilter* Prefilter::FromString(const string& str) {
    185   Prefilter* m = new Prefilter(Prefilter::ATOM);
    186   m->atom_ = str;
    187   return m;
    188 }
    189 
    190 // Information about a regexp used during computation of Prefilter.
    191 // Can be thought of as information about the set of strings matching
    192 // the given regular expression.
    193 class Prefilter::Info {
    194  public:
    195   Info();
    196   ~Info();
    197 
    198   // More constructors.  They delete their Info* arguments.
    199   static Info* Alt(Info* a, Info* b);
    200   static Info* Concat(Info* a, Info* b);
    201   static Info* And(Info* a, Info* b);
    202   static Info* Star(Info* a);
    203   static Info* Plus(Info* a);
    204   static Info* Quest(Info* a);
    205   static Info* EmptyString();
    206   static Info* NoMatch();
    207   static Info* AnyChar();
    208   static Info* CClass(CharClass* cc);
    209   static Info* Literal(Rune r);
    210   static Info* AnyMatch();
    211 
    212   // Format Info as a string.
    213   string ToString();
    214 
    215   // Caller takes ownership of the Prefilter.
    216   Prefilter* TakeMatch();
    217 
    218   set<string>& exact() { return exact_; }
    219 
    220   bool is_exact() const { return is_exact_; }
    221 
    222   class Walker;
    223 
    224  private:
    225   set<string> exact_;
    226 
    227   // When is_exact_ is true, the strings that match
    228   // are placed in exact_. When it is no longer an exact
    229   // set of strings that match this RE, then is_exact_
    230   // is false and the match_ contains the required match
    231   // criteria.
    232   bool is_exact_;
    233 
    234   // Accumulated Prefilter query that any
    235   // match for this regexp is guaranteed to match.
    236   Prefilter* match_;
    237 };
    238 
    239 
    240 Prefilter::Info::Info()
    241   : is_exact_(false),
    242     match_(NULL) {
    243 }
    244 
    245 Prefilter::Info::~Info() {
    246   delete match_;
    247 }
    248 
    249 Prefilter* Prefilter::Info::TakeMatch() {
    250   if (is_exact_) {
    251     match_ = Prefilter::OrStrings(&exact_);
    252     is_exact_ = false;
    253   }
    254   Prefilter* m = match_;
    255   match_ = NULL;
    256   return m;
    257 }
    258 
    259 // Format a Info in string form.
    260 string Prefilter::Info::ToString() {
    261   if (this == NULL) {
    262     // Sometimes when iterating on children of a node,
    263     // some children might have NULL Info. Adding
    264     // the check here for NULL to take care of cases where
    265     // the caller is not checking.
    266     return "";
    267   }
    268 
    269   if (is_exact_) {
    270     int n = 0;
    271     string s;
    272     for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
    273       if (n++ > 0)
    274         s += ",";
    275       s += *i;
    276     }
    277     return s;
    278   }
    279 
    280   if (match_)
    281     return match_->DebugString();
    282 
    283   return "";
    284 }
    285 
    286 // Add the strings from src to dst.
    287 static void CopyIn(const set<string>& src, set<string>* dst) {
    288   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
    289     dst->insert(*i);
    290 }
    291 
    292 // Add the cross-product of a and b to dst.
    293 // (For each string i in a and j in b, add i+j.)
    294 static void CrossProduct(const set<string>& a,
    295                          const set<string>& b,
    296                          set<string>* dst) {
    297   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
    298     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
    299       dst->insert(*i + *j);
    300 }
    301 
    302 // Concats a and b. Requires that both are exact sets.
    303 // Forms an exact set that is a crossproduct of a and b.
    304 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
    305   if (a == NULL)
    306     return b;
    307   DCHECK(a->is_exact_);
    308   DCHECK(b && b->is_exact_);
    309   Info *ab = new Info();
    310 
    311   CrossProduct(a->exact_, b->exact_, &ab->exact_);
    312   ab->is_exact_ = true;
    313 
    314   delete a;
    315   delete b;
    316   return ab;
    317 }
    318 
    319 // Constructs an inexact Info for ab given a and b.
    320 // Used only when a or b is not exact or when the
    321 // exact cross product is likely to be too big.
    322 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
    323   if (a == NULL)
    324     return b;
    325   if (b == NULL)
    326     return a;
    327 
    328   Info *ab = new Info();
    329 
    330   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
    331   ab->is_exact_ = false;
    332   delete a;
    333   delete b;
    334   return ab;
    335 }
    336 
    337 // Constructs Info for a|b given a and b.
    338 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
    339   Info *ab = new Info();
    340 
    341   if (a->is_exact_ && b->is_exact_) {
    342     CopyIn(a->exact_, &ab->exact_);
    343     CopyIn(b->exact_, &ab->exact_);
    344     ab->is_exact_ = true;
    345   } else {
    346     // Either a or b has is_exact_ = false. If the other
    347     // one has is_exact_ = true, we move it to match_ and
    348     // then create a OR of a,b. The resulting Info has
    349     // is_exact_ = false.
    350     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    351     ab->is_exact_ = false;
    352   }
    353 
    354   delete a;
    355   delete b;
    356   return ab;
    357 }
    358 
    359 // Constructs Info for a? given a.
    360 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
    361   Info *ab = new Info();
    362 
    363   ab->is_exact_ = false;
    364   ab->match_ = new Prefilter(ALL);
    365   delete a;
    366   return ab;
    367 }
    368 
    369 // Constructs Info for a* given a.
    370 // Same as a? -- not much to do.
    371 Prefilter::Info* Prefilter::Info::Star(Info *a) {
    372   return Quest(a);
    373 }
    374 
    375 // Constructs Info for a+ given a. If a was exact set, it isn't
    376 // anymore.
    377 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
    378   Info *ab = new Info();
    379 
    380   ab->match_ = a->TakeMatch();
    381   ab->is_exact_ = false;
    382 
    383   delete a;
    384   return ab;
    385 }
    386 
    387 static string RuneToString(Rune r) {
    388   char buf[UTFmax];
    389   int n = runetochar(buf, &r);
    390   return string(buf, n);
    391 }
    392 
    393 // Constructs Info for literal rune.
    394 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
    395   Info* info = new Info();
    396   info->exact_.insert(RuneToString(ToLowerRune(r)));
    397   info->is_exact_ = true;
    398   return info;
    399 }
    400 
    401 // Constructs Info for dot (any character).
    402 Prefilter::Info* Prefilter::Info::AnyChar() {
    403   Prefilter::Info* info = new Prefilter::Info();
    404   info->match_ = new Prefilter(ALL);
    405   return info;
    406 }
    407 
    408 // Constructs Prefilter::Info for no possible match.
    409 Prefilter::Info* Prefilter::Info::NoMatch() {
    410   Prefilter::Info* info = new Prefilter::Info();
    411   info->match_ = new Prefilter(NONE);
    412   return info;
    413 }
    414 
    415 // Constructs Prefilter::Info for any possible match.
    416 // This Prefilter::Info is valid for any regular expression,
    417 // since it makes no assertions whatsoever about the
    418 // strings being matched.
    419 Prefilter::Info* Prefilter::Info::AnyMatch() {
    420   Prefilter::Info *info = new Prefilter::Info();
    421   info->match_ = new Prefilter(ALL);
    422   return info;
    423 }
    424 
    425 // Constructs Prefilter::Info for just the empty string.
    426 Prefilter::Info* Prefilter::Info::EmptyString() {
    427   Prefilter::Info* info = new Prefilter::Info();
    428   info->is_exact_ = true;
    429   info->exact_.insert("");
    430   return info;
    431 }
    432 
    433 // Constructs Prefilter::Info for a character class.
    434 typedef CharClass::iterator CCIter;
    435 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
    436   if (Trace) {
    437     VLOG(0) << "CharClassInfo:";
    438     for (CCIter i = cc->begin(); i != cc->end(); ++i)
    439       VLOG(0) << "  " << i->lo << "-" << i->hi;
    440   }
    441 
    442   // If the class is too large, it's okay to overestimate.
    443   if (cc->size() > 10)
    444     return AnyChar();
    445 
    446   Prefilter::Info *a = new Prefilter::Info();
    447   for (CCIter i = cc->begin(); i != cc->end(); ++i)
    448     for (Rune r = i->lo; r <= i->hi; r++)
    449       a->exact_.insert(RuneToString(ToLowerRune(r)));
    450 
    451   a->is_exact_ = true;
    452 
    453   if (Trace) {
    454     VLOG(0) << " = " << a->ToString();
    455   }
    456 
    457   return a;
    458 }
    459 
    460 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
    461  public:
    462   Walker() {}
    463 
    464   virtual Info* PostVisit(
    465       Regexp* re, Info* parent_arg,
    466       Info* pre_arg,
    467       Info** child_args, int nchild_args);
    468 
    469   virtual Info* ShortVisit(
    470       Regexp* re,
    471       Info* parent_arg);
    472 
    473  private:
    474   DISALLOW_EVIL_CONSTRUCTORS(Walker);
    475 };
    476 
    477 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
    478   if (Trace) {
    479     LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
    480   }
    481   Prefilter::Info::Walker w;
    482   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
    483 
    484   if (w.stopped_early()) {
    485     delete info;
    486     return NULL;
    487   }
    488 
    489   return info;
    490 }
    491 
    492 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    493     Regexp* re, Prefilter::Info* parent_arg) {
    494   return AnyMatch();
    495 }
    496 
    497 // Constructs the Prefilter::Info for the given regular expression.
    498 // Assumes re is simplified.
    499 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
    500     Regexp* re, Prefilter::Info* parent_arg,
    501     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
    502     int nchild_args) {
    503   Prefilter::Info *info;
    504   switch (re->op()) {
    505     default:
    506     case kRegexpRepeat:
    507       LOG(DFATAL) << "Bad regexp op " << re->op();
    508       info = EmptyString();
    509       break;
    510 
    511     case kRegexpNoMatch:
    512       info = NoMatch();
    513       break;
    514 
    515     // These ops match the empty string:
    516     case kRegexpEmptyMatch:      // anywhere
    517     case kRegexpBeginLine:       // at beginning of line
    518     case kRegexpEndLine:         // at end of line
    519     case kRegexpBeginText:       // at beginning of text
    520     case kRegexpEndText:         // at end of text
    521     case kRegexpWordBoundary:    // at word boundary
    522     case kRegexpNoWordBoundary:  // not at word boundary
    523       info = EmptyString();
    524       break;
    525 
    526     case kRegexpLiteral:
    527       info = Literal(re->rune());
    528       break;
    529 
    530     case kRegexpLiteralString:
    531       if (re->nrunes() == 0) {
    532         info = NoMatch();
    533         break;
    534       }
    535       info = Literal(re->runes()[0]);
    536       for (int i = 1; i < re->nrunes(); i++)
    537         info = Concat(info, Literal(re->runes()[i]));
    538       break;
    539 
    540     case kRegexpConcat: {
    541       // Accumulate in info.
    542       // Exact is concat of recent contiguous exact nodes.
    543       info = NULL;
    544       Info* exact = NULL;
    545       for (int i = 0; i < nchild_args; i++) {
    546         Info* ci = child_args[i];  // child info
    547         if (!ci->is_exact() ||
    548             (exact && ci->exact().size() * exact->exact().size() > 16)) {
    549           // Exact run is over.
    550           info = And(info, exact);
    551           exact = NULL;
    552           // Add this child's info.
    553           info = And(info, ci);
    554         } else {
    555           // Append to exact run.
    556           exact = Concat(exact, ci);
    557         }
    558       }
    559       info = And(info, exact);
    560     }
    561       break;
    562 
    563     case kRegexpAlternate:
    564       info = child_args[0];
    565       for (int i = 1; i < nchild_args; i++)
    566         info = Alt(info, child_args[i]);
    567       VLOG(10) << "Alt: " << info->ToString();
    568       break;
    569 
    570     case kRegexpStar:
    571       info = Star(child_args[0]);
    572       break;
    573 
    574     case kRegexpQuest:
    575       info = Quest(child_args[0]);
    576       break;
    577 
    578     case kRegexpPlus:
    579       info = Plus(child_args[0]);
    580       break;
    581 
    582     case kRegexpAnyChar:
    583       // Claim nothing, except that it's not empty.
    584       info = AnyChar();
    585       break;
    586 
    587     case kRegexpCharClass:
    588       info = CClass(re->cc());
    589       break;
    590 
    591     case kRegexpCapture:
    592       // These don't affect the set of matching strings.
    593       info = child_args[0];
    594       break;
    595   }
    596 
    597   if (Trace) {
    598     VLOG(0) << "BuildInfo " << re->ToString()
    599             << ": " << info->ToString();
    600   }
    601 
    602   return info;
    603 }
    604 
    605 
    606 Prefilter* Prefilter::FromRegexp(Regexp* re) {
    607   if (re == NULL)
    608     return NULL;
    609 
    610   Regexp* simple = re->Simplify();
    611   Prefilter::Info *info = BuildInfo(simple);
    612 
    613   simple->Decref();
    614   if (info == NULL)
    615     return NULL;
    616 
    617   Prefilter* m = info->TakeMatch();
    618 
    619   delete info;
    620   return m;
    621 }
    622 
    623 string Prefilter::DebugString() const {
    624   if (this == NULL)
    625     return "<nil>";
    626 
    627   switch (op_) {
    628     default:
    629       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
    630       return StringPrintf("op%d", op_);
    631     case NONE:
    632       return "*no-matches*";
    633     case ATOM:
    634       return atom_;
    635     case ALL:
    636       return "";
    637     case AND: {
    638       string s = "";
    639       for (int i = 0; i < subs_->size(); i++) {
    640         if (i > 0)
    641           s += " ";
    642         s += (*subs_)[i]->DebugString();
    643       }
    644       return s;
    645     }
    646     case OR: {
    647       string s = "(";
    648       for (int i = 0; i < subs_->size(); i++) {
    649         if (i > 0)
    650           s += "|";
    651         s += (*subs_)[i]->DebugString();
    652       }
    653       s += ")";
    654       return s;
    655     }
    656   }
    657 }
    658 
    659 Prefilter* Prefilter::FromRE2(const RE2* re2) {
    660   if (re2 == NULL)
    661     return NULL;
    662 
    663   Regexp* regexp = re2->Regexp();
    664   if (regexp == NULL)
    665     return NULL;
    666 
    667   return FromRegexp(regexp);
    668 }
    669 
    670 
    671 }  // namespace re2
    672