Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/util.h"
      6 #include "util/flags.h"
      7 #include "re2/prefilter.h"
      8 #include "re2/prefilter_tree.h"
      9 #include "re2/re2.h"
     10 
     11 #ifdef WIN32
     12 #include <stdio.h>
     13 #define snprintf _snprintf
     14 #endif
     15 
     16 DEFINE_int32(filtered_re2_min_atom_len,
     17              3,
     18              "Strings less than this length are not stored as atoms");
     19 
     20 namespace re2 {
     21 
     22 PrefilterTree::PrefilterTree()
     23     : compiled_(false) {
     24 }
     25 
     26 PrefilterTree::~PrefilterTree() {
     27   for (int i = 0; i < prefilter_vec_.size(); i++)
     28     delete prefilter_vec_[i];
     29 
     30   for (int i = 0; i < entries_.size(); i++)
     31     delete entries_[i].parents;
     32 }
     33 
     34 // Functions used for adding and Compiling prefilters to the
     35 // PrefilterTree.
     36 static bool KeepPart(Prefilter* prefilter, int level) {
     37   if (prefilter == NULL)
     38     return false;
     39 
     40   switch (prefilter->op()) {
     41     default:
     42       LOG(DFATAL) << "Unexpected op in KeepPart: "
     43                   << prefilter->op();
     44       return false;
     45 
     46     case Prefilter::ALL:
     47       return false;
     48 
     49     case Prefilter::ATOM:
     50       return prefilter->atom().size() >=
     51           FLAGS_filtered_re2_min_atom_len;
     52 
     53     case Prefilter::AND: {
     54       int j = 0;
     55       vector<Prefilter*>* subs = prefilter->subs();
     56       for (int i = 0; i < subs->size(); i++)
     57         if (KeepPart((*subs)[i], level + 1))
     58           (*subs)[j++] = (*subs)[i];
     59         else
     60           delete (*subs)[i];
     61 
     62       subs->resize(j);
     63       return j > 0;
     64     }
     65 
     66     case Prefilter::OR:
     67       for (int i = 0; i < prefilter->subs()->size(); i++)
     68         if (!KeepPart((*prefilter->subs())[i], level + 1))
     69           return false;
     70       return true;
     71   }
     72 }
     73 
     74 void PrefilterTree::Add(Prefilter *f) {
     75   if (compiled_) {
     76     LOG(DFATAL) << "Add after Compile.";
     77     return;
     78   }
     79   if (f != NULL && !KeepPart(f, 0)) {
     80     delete f;
     81     f = NULL;
     82   }
     83 
     84   prefilter_vec_.push_back(f);
     85 }
     86 
     87 void PrefilterTree::Compile(vector<string>* atom_vec) {
     88   if (compiled_) {
     89     LOG(DFATAL) << "Compile after Compile.";
     90     return;
     91   }
     92 
     93   // We do this check to support some legacy uses of
     94   // PrefilterTree that call Compile before adding any regexps,
     95   // and expect Compile not to have effect.
     96   if (prefilter_vec_.empty())
     97     return;
     98 
     99   compiled_ = true;
    100 
    101   AssignUniqueIds(atom_vec);
    102 
    103   // Identify nodes that are too common among prefilters and are
    104   // triggering too many parents. Then get rid of them if possible.
    105   // Note that getting rid of a prefilter node simply means they are
    106   // no longer necessary for their parent to trigger; that is, we do
    107   // not miss out on any regexps triggering by getting rid of a
    108   // prefilter node.
    109   for (int i = 0; i < entries_.size(); i++) {
    110     StdIntMap* parents = entries_[i].parents;
    111     if (parents->size() > 8) {
    112       // This one triggers too many things. If all the parents are AND
    113       // nodes and have other things guarding them, then get rid of
    114       // this trigger. TODO(vsri): Adjust the threshold appropriately,
    115       // make it a function of total number of nodes?
    116       bool have_other_guard = true;
    117       for (StdIntMap::iterator it = parents->begin();
    118            it != parents->end(); ++it) {
    119         have_other_guard = have_other_guard &&
    120             (entries_[it->first].propagate_up_at_count > 1);
    121       }
    122 
    123       if (have_other_guard) {
    124         for (StdIntMap::iterator it = parents->begin();
    125              it != parents->end(); ++it)
    126           entries_[it->first].propagate_up_at_count -= 1;
    127 
    128         parents->clear();  // Forget the parents
    129       }
    130     }
    131   }
    132 
    133   PrintDebugInfo();
    134 }
    135 
    136 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
    137   string node_string = NodeString(node);
    138   map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
    139   if (iter == node_map_.end())
    140     return NULL;
    141   return (*iter).second;
    142 }
    143 
    144 static string Itoa(int n) {
    145   char buf[100];
    146   snprintf(buf, sizeof buf, "%d", n);
    147   return string(buf);
    148 }
    149 
    150 string PrefilterTree::NodeString(Prefilter* node) const {
    151   // Adding the operation disambiguates AND/OR/atom nodes.
    152   string s = Itoa(node->op()) + ":";
    153   if (node->op() == Prefilter::ATOM) {
    154     s += node->atom();
    155   } else {
    156     for (int i = 0; i < node->subs()->size() ; i++) {
    157       if (i > 0)
    158         s += ',';
    159       s += Itoa((*node->subs())[i]->unique_id());
    160     }
    161   }
    162   return s;
    163 }
    164 
    165 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
    166   atom_vec->clear();
    167 
    168   // Build vector of all filter nodes, sorted topologically
    169   // from top to bottom in v.
    170   vector<Prefilter*> v;
    171 
    172   // Add the top level nodes of each regexp prefilter.
    173   for (int i = 0; i < prefilter_vec_.size(); i++) {
    174     Prefilter* f = prefilter_vec_[i];
    175     if (f == NULL)
    176       unfiltered_.push_back(i);
    177 
    178     // We push NULL also on to v, so that we maintain the
    179     // mapping of index==regexpid for level=0 prefilter nodes.
    180     v.push_back(f);
    181   }
    182 
    183   // Now add all the descendant nodes.
    184   for (int i = 0; i < v.size(); i++) {
    185     Prefilter* f = v[i];
    186     if (f == NULL)
    187       continue;
    188     if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
    189       const vector<Prefilter*>& subs = *f->subs();
    190       for (int j = 0; j < subs.size(); j++)
    191         v.push_back(subs[j]);
    192     }
    193   }
    194 
    195   // Identify unique nodes.
    196   int unique_id = 0;
    197   for (int i = v.size() - 1; i >= 0; i--) {
    198     Prefilter *node = v[i];
    199     if (node == NULL)
    200       continue;
    201     node->set_unique_id(-1);
    202     Prefilter* canonical = CanonicalNode(node);
    203     if (canonical == NULL) {
    204       // Any further nodes that have the same node string
    205       // will find this node as the canonical node.
    206       node_map_[NodeString(node)] = node;
    207       if (node->op() == Prefilter::ATOM) {
    208         atom_vec->push_back(node->atom());
    209         atom_index_to_id_.push_back(unique_id);
    210       }
    211       node->set_unique_id(unique_id++);
    212     } else {
    213       node->set_unique_id(canonical->unique_id());
    214     }
    215   }
    216   entries_.resize(node_map_.size());
    217 
    218   // Create parent StdIntMap for the entries.
    219   for (int i = v.size()  - 1; i >= 0; i--) {
    220     Prefilter* prefilter = v[i];
    221     if (prefilter == NULL)
    222       continue;
    223 
    224     if (CanonicalNode(prefilter) != prefilter)
    225       continue;
    226 
    227     Entry* entry = &entries_[prefilter->unique_id()];
    228     entry->parents = new StdIntMap();
    229   }
    230 
    231   // Fill the entries.
    232   for (int i = v.size()  - 1; i >= 0; i--) {
    233     Prefilter* prefilter = v[i];
    234     if (prefilter == NULL)
    235       continue;
    236 
    237     if (CanonicalNode(prefilter) != prefilter)
    238       continue;
    239 
    240     Entry* entry = &entries_[prefilter->unique_id()];
    241 
    242     switch (prefilter->op()) {
    243       default:
    244       case Prefilter::ALL:
    245         LOG(DFATAL) << "Unexpected op: " << prefilter->op();
    246         return;
    247 
    248       case Prefilter::ATOM:
    249         entry->propagate_up_at_count = 1;
    250         break;
    251 
    252       case Prefilter::OR:
    253       case Prefilter::AND: {
    254         std::set<int> uniq_child;
    255         for (int j = 0; j < prefilter->subs()->size() ; j++) {
    256           Prefilter* child = (*prefilter->subs())[j];
    257           Prefilter* canonical = CanonicalNode(child);
    258           if (canonical == NULL) {
    259             LOG(DFATAL) << "Null canonical node";
    260             return;
    261           }
    262           int child_id = canonical->unique_id();
    263           uniq_child.insert(child_id);
    264           // To the child, we want to add to parent indices.
    265           Entry* child_entry = &entries_[child_id];
    266           if (child_entry->parents->find(prefilter->unique_id()) ==
    267               child_entry->parents->end())
    268             (*child_entry->parents)[prefilter->unique_id()] = 1;
    269         }
    270         entry->propagate_up_at_count =
    271             prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
    272 
    273         break;
    274       }
    275     }
    276   }
    277 
    278   // For top level nodes, populate regexp id.
    279   for (int i = 0; i < prefilter_vec_.size(); i++) {
    280     if (prefilter_vec_[i] == NULL)
    281       continue;
    282     int id = CanonicalNode(prefilter_vec_[i])->unique_id();
    283     DCHECK_LE(0, id);
    284     Entry* entry = &entries_[id];
    285     entry->regexps.push_back(i);
    286   }
    287 }
    288 
    289 // Functions for triggering during search.
    290 void PrefilterTree::RegexpsGivenStrings(
    291     const vector<int>& matched_atoms,
    292     vector<int>* regexps) const {
    293   regexps->clear();
    294   if (!compiled_) {
    295     LOG(WARNING) << "Compile() not called";
    296     for (int i = 0; i < prefilter_vec_.size(); ++i)
    297       regexps->push_back(i);
    298   } else {
    299     if (!prefilter_vec_.empty()) {
    300       IntMap regexps_map(prefilter_vec_.size());
    301       vector<int> matched_atom_ids;
    302       for (int j = 0; j < matched_atoms.size(); j++) {
    303         matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
    304         VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
    305       }
    306       PropagateMatch(matched_atom_ids, &regexps_map);
    307       for (IntMap::iterator it = regexps_map.begin();
    308            it != regexps_map.end();
    309            ++it)
    310         regexps->push_back(it->index());
    311 
    312       regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
    313     }
    314   }
    315   sort(regexps->begin(), regexps->end());
    316 }
    317 
    318 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
    319                                    IntMap* regexps) const {
    320   IntMap count(entries_.size());
    321   IntMap work(entries_.size());
    322   for (int i = 0; i < atom_ids.size(); i++)
    323     work.set(atom_ids[i], 1);
    324   for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
    325     const Entry& entry = entries_[it->index()];
    326     VLOG(10) << "Processing: " << it->index();
    327     // Record regexps triggered.
    328     for (int i = 0; i < entry.regexps.size(); i++) {
    329       VLOG(10) << "Regexp triggered: " << entry.regexps[i];
    330       regexps->set(entry.regexps[i], 1);
    331     }
    332     int c;
    333     // Pass trigger up to parents.
    334     for (StdIntMap::iterator it = entry.parents->begin();
    335          it != entry.parents->end();
    336          ++it) {
    337       int j = it->first;
    338       const Entry& parent = entries_[j];
    339       VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
    340       // Delay until all the children have succeeded.
    341       if (parent.propagate_up_at_count > 1) {
    342         if (count.has_index(j)) {
    343           c = count.get_existing(j) + 1;
    344           count.set_existing(j, c);
    345         } else {
    346           c = 1;
    347           count.set_new(j, c);
    348         }
    349         if (c < parent.propagate_up_at_count)
    350           continue;
    351       }
    352       VLOG(10) << "Triggering: " << j;
    353       // Trigger the parent.
    354       work.set(j, 1);
    355     }
    356   }
    357 }
    358 
    359 // Debugging help.
    360 void PrefilterTree::PrintPrefilter(int regexpid) {
    361   LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
    362 }
    363 
    364 void PrefilterTree::PrintDebugInfo() {
    365   VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
    366   VLOG(10) << "#Unique Nodes: " << entries_.size();
    367 
    368   for (int i = 0; i < entries_.size(); ++i) {
    369     StdIntMap* parents = entries_[i].parents;
    370     const vector<int>& regexps = entries_[i].regexps;
    371     VLOG(10) << "EntryId: " << i
    372             << " N: " << parents->size() << " R: " << regexps.size();
    373     for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
    374       VLOG(10) << it->first;
    375   }
    376   VLOG(10) << "Map:";
    377   for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
    378        iter != node_map_.end(); ++iter)
    379     VLOG(10) << "NodeId: " << (*iter).second->unique_id()
    380             << " Str: " << (*iter).first;
    381 }
    382 
    383 string PrefilterTree::DebugNodeString(Prefilter* node) const {
    384   string node_string = "";
    385 
    386   if (node->op() == Prefilter::ATOM) {
    387     DCHECK(!node->atom().empty());
    388     node_string += node->atom();
    389   } else {
    390     // Adding the operation disambiguates AND and OR nodes.
    391     node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
    392     node_string += "(";
    393     for (int i = 0; i < node->subs()->size() ; i++) {
    394       if (i > 0)
    395         node_string += ',';
    396       node_string += Itoa((*node->subs())[i]->unique_id());
    397       node_string += ":";
    398       node_string += DebugNodeString((*node->subs())[i]);
    399     }
    400     node_string += ")";
    401   }
    402   return node_string;
    403 }
    404 
    405 }  // namespace re2
    406