1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "util/util.h" 6 #include "util/flags.h" 7 #include "re2/prefilter.h" 8 #include "re2/prefilter_tree.h" 9 #include "re2/re2.h" 10 11 #ifdef WIN32 12 #include <stdio.h> 13 #define snprintf _snprintf 14 #endif 15 16 DEFINE_int32(filtered_re2_min_atom_len, 17 3, 18 "Strings less than this length are not stored as atoms"); 19 20 namespace re2 { 21 22 PrefilterTree::PrefilterTree() 23 : compiled_(false) { 24 } 25 26 PrefilterTree::~PrefilterTree() { 27 for (int i = 0; i < prefilter_vec_.size(); i++) 28 delete prefilter_vec_[i]; 29 30 for (int i = 0; i < entries_.size(); i++) 31 delete entries_[i].parents; 32 } 33 34 // Functions used for adding and Compiling prefilters to the 35 // PrefilterTree. 36 static bool KeepPart(Prefilter* prefilter, int level) { 37 if (prefilter == NULL) 38 return false; 39 40 switch (prefilter->op()) { 41 default: 42 LOG(DFATAL) << "Unexpected op in KeepPart: " 43 << prefilter->op(); 44 return false; 45 46 case Prefilter::ALL: 47 return false; 48 49 case Prefilter::ATOM: 50 return prefilter->atom().size() >= 51 FLAGS_filtered_re2_min_atom_len; 52 53 case Prefilter::AND: { 54 int j = 0; 55 vector<Prefilter*>* subs = prefilter->subs(); 56 for (int i = 0; i < subs->size(); i++) 57 if (KeepPart((*subs)[i], level + 1)) 58 (*subs)[j++] = (*subs)[i]; 59 else 60 delete (*subs)[i]; 61 62 subs->resize(j); 63 return j > 0; 64 } 65 66 case Prefilter::OR: 67 for (int i = 0; i < prefilter->subs()->size(); i++) 68 if (!KeepPart((*prefilter->subs())[i], level + 1)) 69 return false; 70 return true; 71 } 72 } 73 74 void PrefilterTree::Add(Prefilter *f) { 75 if (compiled_) { 76 LOG(DFATAL) << "Add after Compile."; 77 return; 78 } 79 if (f != NULL && !KeepPart(f, 0)) { 80 delete f; 81 f = NULL; 82 } 83 84 prefilter_vec_.push_back(f); 85 } 86 87 void PrefilterTree::Compile(vector<string>* atom_vec) { 88 if (compiled_) { 89 LOG(DFATAL) << "Compile after Compile."; 90 return; 91 } 92 93 // We do this check to support some legacy uses of 94 // PrefilterTree that call Compile before adding any regexps, 95 // and expect Compile not to have effect. 96 if (prefilter_vec_.empty()) 97 return; 98 99 compiled_ = true; 100 101 AssignUniqueIds(atom_vec); 102 103 // Identify nodes that are too common among prefilters and are 104 // triggering too many parents. Then get rid of them if possible. 105 // Note that getting rid of a prefilter node simply means they are 106 // no longer necessary for their parent to trigger; that is, we do 107 // not miss out on any regexps triggering by getting rid of a 108 // prefilter node. 109 for (int i = 0; i < entries_.size(); i++) { 110 StdIntMap* parents = entries_[i].parents; 111 if (parents->size() > 8) { 112 // This one triggers too many things. If all the parents are AND 113 // nodes and have other things guarding them, then get rid of 114 // this trigger. TODO(vsri): Adjust the threshold appropriately, 115 // make it a function of total number of nodes? 116 bool have_other_guard = true; 117 for (StdIntMap::iterator it = parents->begin(); 118 it != parents->end(); ++it) { 119 have_other_guard = have_other_guard && 120 (entries_[it->first].propagate_up_at_count > 1); 121 } 122 123 if (have_other_guard) { 124 for (StdIntMap::iterator it = parents->begin(); 125 it != parents->end(); ++it) 126 entries_[it->first].propagate_up_at_count -= 1; 127 128 parents->clear(); // Forget the parents 129 } 130 } 131 } 132 133 PrintDebugInfo(); 134 } 135 136 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { 137 string node_string = NodeString(node); 138 map<string, Prefilter*>::iterator iter = node_map_.find(node_string); 139 if (iter == node_map_.end()) 140 return NULL; 141 return (*iter).second; 142 } 143 144 static string Itoa(int n) { 145 char buf[100]; 146 snprintf(buf, sizeof buf, "%d", n); 147 return string(buf); 148 } 149 150 string PrefilterTree::NodeString(Prefilter* node) const { 151 // Adding the operation disambiguates AND/OR/atom nodes. 152 string s = Itoa(node->op()) + ":"; 153 if (node->op() == Prefilter::ATOM) { 154 s += node->atom(); 155 } else { 156 for (int i = 0; i < node->subs()->size() ; i++) { 157 if (i > 0) 158 s += ','; 159 s += Itoa((*node->subs())[i]->unique_id()); 160 } 161 } 162 return s; 163 } 164 165 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { 166 atom_vec->clear(); 167 168 // Build vector of all filter nodes, sorted topologically 169 // from top to bottom in v. 170 vector<Prefilter*> v; 171 172 // Add the top level nodes of each regexp prefilter. 173 for (int i = 0; i < prefilter_vec_.size(); i++) { 174 Prefilter* f = prefilter_vec_[i]; 175 if (f == NULL) 176 unfiltered_.push_back(i); 177 178 // We push NULL also on to v, so that we maintain the 179 // mapping of index==regexpid for level=0 prefilter nodes. 180 v.push_back(f); 181 } 182 183 // Now add all the descendant nodes. 184 for (int i = 0; i < v.size(); i++) { 185 Prefilter* f = v[i]; 186 if (f == NULL) 187 continue; 188 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { 189 const vector<Prefilter*>& subs = *f->subs(); 190 for (int j = 0; j < subs.size(); j++) 191 v.push_back(subs[j]); 192 } 193 } 194 195 // Identify unique nodes. 196 int unique_id = 0; 197 for (int i = v.size() - 1; i >= 0; i--) { 198 Prefilter *node = v[i]; 199 if (node == NULL) 200 continue; 201 node->set_unique_id(-1); 202 Prefilter* canonical = CanonicalNode(node); 203 if (canonical == NULL) { 204 // Any further nodes that have the same node string 205 // will find this node as the canonical node. 206 node_map_[NodeString(node)] = node; 207 if (node->op() == Prefilter::ATOM) { 208 atom_vec->push_back(node->atom()); 209 atom_index_to_id_.push_back(unique_id); 210 } 211 node->set_unique_id(unique_id++); 212 } else { 213 node->set_unique_id(canonical->unique_id()); 214 } 215 } 216 entries_.resize(node_map_.size()); 217 218 // Create parent StdIntMap for the entries. 219 for (int i = v.size() - 1; i >= 0; i--) { 220 Prefilter* prefilter = v[i]; 221 if (prefilter == NULL) 222 continue; 223 224 if (CanonicalNode(prefilter) != prefilter) 225 continue; 226 227 Entry* entry = &entries_[prefilter->unique_id()]; 228 entry->parents = new StdIntMap(); 229 } 230 231 // Fill the entries. 232 for (int i = v.size() - 1; i >= 0; i--) { 233 Prefilter* prefilter = v[i]; 234 if (prefilter == NULL) 235 continue; 236 237 if (CanonicalNode(prefilter) != prefilter) 238 continue; 239 240 Entry* entry = &entries_[prefilter->unique_id()]; 241 242 switch (prefilter->op()) { 243 default: 244 case Prefilter::ALL: 245 LOG(DFATAL) << "Unexpected op: " << prefilter->op(); 246 return; 247 248 case Prefilter::ATOM: 249 entry->propagate_up_at_count = 1; 250 break; 251 252 case Prefilter::OR: 253 case Prefilter::AND: { 254 std::set<int> uniq_child; 255 for (int j = 0; j < prefilter->subs()->size() ; j++) { 256 Prefilter* child = (*prefilter->subs())[j]; 257 Prefilter* canonical = CanonicalNode(child); 258 if (canonical == NULL) { 259 LOG(DFATAL) << "Null canonical node"; 260 return; 261 } 262 int child_id = canonical->unique_id(); 263 uniq_child.insert(child_id); 264 // To the child, we want to add to parent indices. 265 Entry* child_entry = &entries_[child_id]; 266 if (child_entry->parents->find(prefilter->unique_id()) == 267 child_entry->parents->end()) 268 (*child_entry->parents)[prefilter->unique_id()] = 1; 269 } 270 entry->propagate_up_at_count = 271 prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; 272 273 break; 274 } 275 } 276 } 277 278 // For top level nodes, populate regexp id. 279 for (int i = 0; i < prefilter_vec_.size(); i++) { 280 if (prefilter_vec_[i] == NULL) 281 continue; 282 int id = CanonicalNode(prefilter_vec_[i])->unique_id(); 283 DCHECK_LE(0, id); 284 Entry* entry = &entries_[id]; 285 entry->regexps.push_back(i); 286 } 287 } 288 289 // Functions for triggering during search. 290 void PrefilterTree::RegexpsGivenStrings( 291 const vector<int>& matched_atoms, 292 vector<int>* regexps) const { 293 regexps->clear(); 294 if (!compiled_) { 295 LOG(WARNING) << "Compile() not called"; 296 for (int i = 0; i < prefilter_vec_.size(); ++i) 297 regexps->push_back(i); 298 } else { 299 if (!prefilter_vec_.empty()) { 300 IntMap regexps_map(prefilter_vec_.size()); 301 vector<int> matched_atom_ids; 302 for (int j = 0; j < matched_atoms.size(); j++) { 303 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); 304 VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; 305 } 306 PropagateMatch(matched_atom_ids, ®exps_map); 307 for (IntMap::iterator it = regexps_map.begin(); 308 it != regexps_map.end(); 309 ++it) 310 regexps->push_back(it->index()); 311 312 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); 313 } 314 } 315 sort(regexps->begin(), regexps->end()); 316 } 317 318 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, 319 IntMap* regexps) const { 320 IntMap count(entries_.size()); 321 IntMap work(entries_.size()); 322 for (int i = 0; i < atom_ids.size(); i++) 323 work.set(atom_ids[i], 1); 324 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { 325 const Entry& entry = entries_[it->index()]; 326 VLOG(10) << "Processing: " << it->index(); 327 // Record regexps triggered. 328 for (int i = 0; i < entry.regexps.size(); i++) { 329 VLOG(10) << "Regexp triggered: " << entry.regexps[i]; 330 regexps->set(entry.regexps[i], 1); 331 } 332 int c; 333 // Pass trigger up to parents. 334 for (StdIntMap::iterator it = entry.parents->begin(); 335 it != entry.parents->end(); 336 ++it) { 337 int j = it->first; 338 const Entry& parent = entries_[j]; 339 VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; 340 // Delay until all the children have succeeded. 341 if (parent.propagate_up_at_count > 1) { 342 if (count.has_index(j)) { 343 c = count.get_existing(j) + 1; 344 count.set_existing(j, c); 345 } else { 346 c = 1; 347 count.set_new(j, c); 348 } 349 if (c < parent.propagate_up_at_count) 350 continue; 351 } 352 VLOG(10) << "Triggering: " << j; 353 // Trigger the parent. 354 work.set(j, 1); 355 } 356 } 357 } 358 359 // Debugging help. 360 void PrefilterTree::PrintPrefilter(int regexpid) { 361 LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); 362 } 363 364 void PrefilterTree::PrintDebugInfo() { 365 VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); 366 VLOG(10) << "#Unique Nodes: " << entries_.size(); 367 368 for (int i = 0; i < entries_.size(); ++i) { 369 StdIntMap* parents = entries_[i].parents; 370 const vector<int>& regexps = entries_[i].regexps; 371 VLOG(10) << "EntryId: " << i 372 << " N: " << parents->size() << " R: " << regexps.size(); 373 for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) 374 VLOG(10) << it->first; 375 } 376 VLOG(10) << "Map:"; 377 for (map<string, Prefilter*>::const_iterator iter = node_map_.begin(); 378 iter != node_map_.end(); ++iter) 379 VLOG(10) << "NodeId: " << (*iter).second->unique_id() 380 << " Str: " << (*iter).first; 381 } 382 383 string PrefilterTree::DebugNodeString(Prefilter* node) const { 384 string node_string = ""; 385 386 if (node->op() == Prefilter::ATOM) { 387 DCHECK(!node->atom().empty()); 388 node_string += node->atom(); 389 } else { 390 // Adding the operation disambiguates AND and OR nodes. 391 node_string += node->op() == Prefilter::AND ? "AND" : "OR"; 392 node_string += "("; 393 for (int i = 0; i < node->subs()->size() ; i++) { 394 if (i > 0) 395 node_string += ','; 396 node_string += Itoa((*node->subs())[i]->unique_id()); 397 node_string += ":"; 398 node_string += DebugNodeString((*node->subs())[i]); 399 } 400 node_string += ")"; 401 } 402 return node_string; 403 } 404 405 } // namespace re2 406