Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/util.h"
      6 #include "util/flags.h"
      7 #include "re2/prefilter.h"
      8 #include "re2/prefilter_tree.h"
      9 #include "re2/re2.h"
     10 
     11 DEFINE_int32(filtered_re2_min_atom_len,
     12              3,
     13              "Strings less than this length are not stored as atoms");
     14 
     15 namespace re2 {
     16 
     17 PrefilterTree::PrefilterTree()
     18     : compiled_(false) {
     19 }
     20 
     21 PrefilterTree::~PrefilterTree() {
     22   for (int i = 0; i < prefilter_vec_.size(); i++)
     23     delete prefilter_vec_[i];
     24 
     25   for (int i = 0; i < entries_.size(); i++)
     26     delete entries_[i].parents;
     27 }
     28 
     29 // Functions used for adding and Compiling prefilters to the
     30 // PrefilterTree.
     31 static bool KeepPart(Prefilter* prefilter, int level) {
     32   if (prefilter == NULL)
     33     return false;
     34 
     35   switch (prefilter->op()) {
     36     default:
     37       LOG(DFATAL) << "Unexpected op in KeepPart: "
     38                   << prefilter->op();
     39       return false;
     40 
     41     case Prefilter::ALL:
     42       return false;
     43 
     44     case Prefilter::ATOM:
     45       return prefilter->atom().size() >=
     46           FLAGS_filtered_re2_min_atom_len;
     47 
     48     case Prefilter::AND: {
     49       int j = 0;
     50       vector<Prefilter*>* subs = prefilter->subs();
     51       for (int i = 0; i < subs->size(); i++)
     52         if (KeepPart((*subs)[i], level + 1))
     53           (*subs)[j++] = (*subs)[i];
     54         else
     55           delete (*subs)[i];
     56 
     57       subs->resize(j);
     58       return j > 0;
     59     }
     60 
     61     case Prefilter::OR:
     62       for (int i = 0; i < prefilter->subs()->size(); i++)
     63         if (!KeepPart((*prefilter->subs())[i], level + 1))
     64           return false;
     65       return true;
     66   }
     67 }
     68 
     69 void PrefilterTree::Add(Prefilter *f) {
     70   if (compiled_) {
     71     LOG(DFATAL) << "Add after Compile.";
     72     return;
     73   }
     74   if (f != NULL && !KeepPart(f, 0)) {
     75     delete f;
     76     f = NULL;
     77   }
     78 
     79   prefilter_vec_.push_back(f);
     80 }
     81 
     82 void PrefilterTree::Compile(vector<string>* atom_vec) {
     83   if (compiled_) {
     84     LOG(DFATAL) << "Compile after Compile.";
     85     return;
     86   }
     87 
     88   // We do this check to support some legacy uses of
     89   // PrefilterTree that call Compile before adding any regexps,
     90   // and expect Compile not to have effect.
     91   if (prefilter_vec_.empty())
     92     return;
     93 
     94   compiled_ = true;
     95 
     96   AssignUniqueIds(atom_vec);
     97 
     98   // Identify nodes that are too common among prefilters and are
     99   // triggering too many parents. Then get rid of them if possible.
    100   // Note that getting rid of a prefilter node simply means they are
    101   // no longer necessary for their parent to trigger; that is, we do
    102   // not miss out on any regexps triggering by getting rid of a
    103   // prefilter node.
    104   for (int i = 0; i < entries_.size(); i++) {
    105     IntMap* parents = entries_[i].parents;
    106     if (parents->size() > 8) {
    107       // This one triggers too many things. If all the parents are AND
    108       // nodes and have other things guarding them, then get rid of
    109       // this trigger. TODO(vsri): Adjust the threshold appropriately,
    110       // make it a function of total number of nodes?
    111       bool have_other_guard = true;
    112       for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
    113         have_other_guard = have_other_guard &&
    114             (entries_[it->index()].propagate_up_at_count > 1);
    115 
    116       if (have_other_guard) {
    117         for (IntMap::iterator it = parents->begin();
    118              it != parents->end(); ++it)
    119           entries_[it->index()].propagate_up_at_count -= 1;
    120 
    121         parents->clear();  // Forget the parents
    122       }
    123     }
    124   }
    125 
    126   PrintDebugInfo();
    127 }
    128 
    129 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
    130   string node_string = NodeString(node);
    131   map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
    132   if (iter == node_map_.end())
    133     return NULL;
    134   return (*iter).second;
    135 }
    136 
    137 static string Itoa(int n) {
    138   char buf[100];
    139   snprintf(buf, sizeof buf, "%d", n);
    140   return string(buf);
    141 }
    142 
    143 string PrefilterTree::NodeString(Prefilter* node) const {
    144   // Adding the operation disambiguates AND/OR/atom nodes.
    145   string s = Itoa(node->op()) + ":";
    146   if (node->op() == Prefilter::ATOM) {
    147     s += node->atom();
    148   } else {
    149     for (int i = 0; i < node->subs()->size() ; i++) {
    150       if (i > 0)
    151         s += ',';
    152       s += Itoa((*node->subs())[i]->unique_id());
    153     }
    154   }
    155   return s;
    156 }
    157 
    158 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
    159   atom_vec->clear();
    160 
    161   // Build vector of all filter nodes, sorted topologically
    162   // from top to bottom in v.
    163   vector<Prefilter*> v;
    164 
    165   // Add the top level nodes of each regexp prefilter.
    166   for (int i = 0; i < prefilter_vec_.size(); i++) {
    167     Prefilter* f = prefilter_vec_[i];
    168     if (f == NULL)
    169       unfiltered_.push_back(i);
    170 
    171     // We push NULL also on to v, so that we maintain the
    172     // mapping of index==regexpid for level=0 prefilter nodes.
    173     v.push_back(f);
    174   }
    175 
    176   // Now add all the descendant nodes.
    177   for (int i = 0; i < v.size(); i++) {
    178     Prefilter* f = v[i];
    179     if (f == NULL)
    180       continue;
    181     if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
    182       const vector<Prefilter*>& subs = *f->subs();
    183       for (int j = 0; j < subs.size(); j++)
    184         v.push_back(subs[j]);
    185     }
    186   }
    187 
    188   // Identify unique nodes.
    189   int unique_id = 0;
    190   for (int i = v.size() - 1; i >= 0; i--) {
    191     Prefilter *node = v[i];
    192     if (node == NULL)
    193       continue;
    194     node->set_unique_id(-1);
    195     Prefilter* canonical = CanonicalNode(node);
    196     if (canonical == NULL) {
    197       // Any further nodes that have the same node string
    198       // will find this node as the canonical node.
    199       node_map_[NodeString(node)] = node;
    200       if (node->op() == Prefilter::ATOM) {
    201         atom_vec->push_back(node->atom());
    202         atom_index_to_id_.push_back(unique_id);
    203       }
    204       node->set_unique_id(unique_id++);
    205     } else {
    206       node->set_unique_id(canonical->unique_id());
    207     }
    208   }
    209   entries_.resize(node_map_.size());
    210 
    211   // Create parent IntMap for the entries.
    212   for (int i = v.size()  - 1; i >= 0; i--) {
    213     Prefilter* prefilter = v[i];
    214     if (prefilter == NULL)
    215       continue;
    216 
    217     if (CanonicalNode(prefilter) != prefilter)
    218       continue;
    219 
    220     Entry* entry = &entries_[prefilter->unique_id()];
    221     entry->parents = new IntMap(node_map_.size());
    222   }
    223 
    224   // Fill the entries.
    225   for (int i = v.size()  - 1; i >= 0; i--) {
    226     Prefilter* prefilter = v[i];
    227     if (prefilter == NULL)
    228       continue;
    229 
    230     if (CanonicalNode(prefilter) != prefilter)
    231       continue;
    232 
    233     Entry* entry = &entries_[prefilter->unique_id()];
    234 
    235     switch (prefilter->op()) {
    236       default:
    237       case Prefilter::ALL:
    238         LOG(DFATAL) << "Unexpected op: " << prefilter->op();
    239         return;
    240 
    241       case Prefilter::ATOM:
    242         entry->propagate_up_at_count = 1;
    243         break;
    244 
    245       case Prefilter::OR:
    246       case Prefilter::AND: {
    247         IntMap uniq_child(node_map_.size());
    248         for (int j = 0; j < prefilter->subs()->size() ; j++) {
    249           Prefilter* child = (*prefilter->subs())[j];
    250           Prefilter* canonical = CanonicalNode(child);
    251           if (canonical == NULL) {
    252             LOG(DFATAL) << "Null canonical node";
    253             return;
    254           }
    255           int child_id = canonical->unique_id();
    256           if (!uniq_child.has_index(child_id))
    257             uniq_child.set_new(child_id, 1);
    258           // To the child, we want to add to parent indices.
    259           Entry* child_entry = &entries_[child_id];
    260           if (!child_entry->parents->has_index(prefilter->unique_id()))
    261             child_entry->parents->set_new(prefilter->unique_id(), 1);
    262         }
    263         entry->propagate_up_at_count =
    264             prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
    265 
    266         break;
    267       }
    268     }
    269   }
    270 
    271   // For top level nodes, populate regexp id.
    272   for (int i = 0; i < prefilter_vec_.size(); i++) {
    273     if (prefilter_vec_[i] == NULL)
    274       continue;
    275     int id = CanonicalNode(prefilter_vec_[i])->unique_id();
    276     DCHECK_LE(0, id);
    277     Entry* entry = &entries_[id];
    278     entry->regexps.push_back(i);
    279   }
    280 }
    281 
    282 // Functions for triggering during search.
    283 void PrefilterTree::RegexpsGivenStrings(
    284     const vector<int>& matched_atoms,
    285     vector<int>* regexps) const {
    286   regexps->clear();
    287   if (!compiled_) {
    288     LOG(WARNING) << "Compile() not called";
    289     for (int i = 0; i < prefilter_vec_.size(); ++i)
    290       regexps->push_back(i);
    291   } else {
    292     if (!prefilter_vec_.empty()) {
    293       IntMap regexps_map(prefilter_vec_.size());
    294       vector<int> matched_atom_ids;
    295       for (int j = 0; j < matched_atoms.size(); j++) {
    296         matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
    297         VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
    298       }
    299       PropagateMatch(matched_atom_ids, &regexps_map);
    300       for (IntMap::iterator it = regexps_map.begin();
    301            it != regexps_map.end();
    302            ++it)
    303         regexps->push_back(it->index());
    304 
    305       regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
    306     }
    307   }
    308   sort(regexps->begin(), regexps->end());
    309 }
    310 
    311 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
    312                                    IntMap* regexps) const {
    313   IntMap count(entries_.size());
    314   IntMap work(entries_.size());
    315   for (int i = 0; i < atom_ids.size(); i++)
    316     work.set(atom_ids[i], 1);
    317   for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
    318     const Entry& entry = entries_[it->index()];
    319     VLOG(10) << "Processing: " << it->index();
    320     // Record regexps triggered.
    321     for (int i = 0; i < entry.regexps.size(); i++) {
    322       VLOG(10) << "Regexp triggered: " << entry.regexps[i];
    323       regexps->set(entry.regexps[i], 1);
    324     }
    325     int c;
    326     // Pass trigger up to parents.
    327     for (IntMap::iterator it = entry.parents->begin();
    328          it != entry.parents->end();
    329          ++it) {
    330       int j = it->index();
    331       const Entry& parent = entries_[j];
    332       VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
    333       // Delay until all the children have succeeded.
    334       if (parent.propagate_up_at_count > 1) {
    335         if (count.has_index(j)) {
    336           c = count.get_existing(j) + 1;
    337           count.set_existing(j, c);
    338         } else {
    339           c = 1;
    340           count.set_new(j, c);
    341         }
    342         if (c < parent.propagate_up_at_count)
    343           continue;
    344       }
    345       VLOG(10) << "Triggering: " << j;
    346       // Trigger the parent.
    347       work.set(j, 1);
    348     }
    349   }
    350 }
    351 
    352 // Debugging help.
    353 void PrefilterTree::PrintPrefilter(int regexpid) {
    354   LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
    355 }
    356 
    357 void PrefilterTree::PrintDebugInfo() {
    358   VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
    359   VLOG(10) << "#Unique Nodes: " << entries_.size();
    360 
    361   for (int i = 0; i < entries_.size(); ++i) {
    362     IntMap* parents = entries_[i].parents;
    363     const vector<int>& regexps = entries_[i].regexps;
    364     VLOG(10) << "EntryId: " << i
    365             << " N: " << parents->size() << " R: " << regexps.size();
    366     for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
    367       VLOG(10) << it->index();
    368   }
    369   VLOG(10) << "Map:";
    370   for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
    371        iter != node_map_.end(); ++iter)
    372     VLOG(10) << "NodeId: " << (*iter).second->unique_id()
    373             << " Str: " << (*iter).first;
    374 }
    375 
    376 string PrefilterTree::DebugNodeString(Prefilter* node) const {
    377   string node_string = "";
    378 
    379   if (node->op() == Prefilter::ATOM) {
    380     DCHECK(!node->atom().empty());
    381     node_string += node->atom();
    382   } else {
    383     // Adding the operation disambiguates AND and OR nodes.
    384     node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
    385     node_string += "(";
    386     for (int i = 0; i < node->subs()->size() ; i++) {
    387       if (i > 0)
    388         node_string += ',';
    389       node_string += Itoa((*node->subs())[i]->unique_id());
    390       node_string += ":";
    391       node_string += DebugNodeString((*node->subs())[i]);
    392     }
    393     node_string += ")";
    394   }
    395   return node_string;
    396 }
    397 
    398 }  // namespace re2
    399