1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "util/util.h" 6 #include "util/flags.h" 7 #include "re2/prefilter.h" 8 #include "re2/prefilter_tree.h" 9 #include "re2/re2.h" 10 11 DEFINE_int32(filtered_re2_min_atom_len, 12 3, 13 "Strings less than this length are not stored as atoms"); 14 15 namespace re2 { 16 17 PrefilterTree::PrefilterTree() 18 : compiled_(false) { 19 } 20 21 PrefilterTree::~PrefilterTree() { 22 for (int i = 0; i < prefilter_vec_.size(); i++) 23 delete prefilter_vec_[i]; 24 25 for (int i = 0; i < entries_.size(); i++) 26 delete entries_[i].parents; 27 } 28 29 // Functions used for adding and Compiling prefilters to the 30 // PrefilterTree. 31 static bool KeepPart(Prefilter* prefilter, int level) { 32 if (prefilter == NULL) 33 return false; 34 35 switch (prefilter->op()) { 36 default: 37 LOG(DFATAL) << "Unexpected op in KeepPart: " 38 << prefilter->op(); 39 return false; 40 41 case Prefilter::ALL: 42 return false; 43 44 case Prefilter::ATOM: 45 return prefilter->atom().size() >= 46 FLAGS_filtered_re2_min_atom_len; 47 48 case Prefilter::AND: { 49 int j = 0; 50 vector<Prefilter*>* subs = prefilter->subs(); 51 for (int i = 0; i < subs->size(); i++) 52 if (KeepPart((*subs)[i], level + 1)) 53 (*subs)[j++] = (*subs)[i]; 54 else 55 delete (*subs)[i]; 56 57 subs->resize(j); 58 return j > 0; 59 } 60 61 case Prefilter::OR: 62 for (int i = 0; i < prefilter->subs()->size(); i++) 63 if (!KeepPart((*prefilter->subs())[i], level + 1)) 64 return false; 65 return true; 66 } 67 } 68 69 void PrefilterTree::Add(Prefilter *f) { 70 if (compiled_) { 71 LOG(DFATAL) << "Add after Compile."; 72 return; 73 } 74 if (f != NULL && !KeepPart(f, 0)) { 75 delete f; 76 f = NULL; 77 } 78 79 prefilter_vec_.push_back(f); 80 } 81 82 void PrefilterTree::Compile(vector<string>* atom_vec) { 83 if (compiled_) { 84 LOG(DFATAL) << "Compile after Compile."; 85 return; 86 } 87 88 // We do this check to support some legacy uses of 89 // PrefilterTree that call Compile before adding any regexps, 90 // and expect Compile not to have effect. 91 if (prefilter_vec_.empty()) 92 return; 93 94 compiled_ = true; 95 96 AssignUniqueIds(atom_vec); 97 98 // Identify nodes that are too common among prefilters and are 99 // triggering too many parents. Then get rid of them if possible. 100 // Note that getting rid of a prefilter node simply means they are 101 // no longer necessary for their parent to trigger; that is, we do 102 // not miss out on any regexps triggering by getting rid of a 103 // prefilter node. 104 for (int i = 0; i < entries_.size(); i++) { 105 IntMap* parents = entries_[i].parents; 106 if (parents->size() > 8) { 107 // This one triggers too many things. If all the parents are AND 108 // nodes and have other things guarding them, then get rid of 109 // this trigger. TODO(vsri): Adjust the threshold appropriately, 110 // make it a function of total number of nodes? 111 bool have_other_guard = true; 112 for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) 113 have_other_guard = have_other_guard && 114 (entries_[it->index()].propagate_up_at_count > 1); 115 116 if (have_other_guard) { 117 for (IntMap::iterator it = parents->begin(); 118 it != parents->end(); ++it) 119 entries_[it->index()].propagate_up_at_count -= 1; 120 121 parents->clear(); // Forget the parents 122 } 123 } 124 } 125 126 PrintDebugInfo(); 127 } 128 129 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { 130 string node_string = NodeString(node); 131 map<string, Prefilter*>::iterator iter = node_map_.find(node_string); 132 if (iter == node_map_.end()) 133 return NULL; 134 return (*iter).second; 135 } 136 137 static string Itoa(int n) { 138 char buf[100]; 139 snprintf(buf, sizeof buf, "%d", n); 140 return string(buf); 141 } 142 143 string PrefilterTree::NodeString(Prefilter* node) const { 144 // Adding the operation disambiguates AND/OR/atom nodes. 145 string s = Itoa(node->op()) + ":"; 146 if (node->op() == Prefilter::ATOM) { 147 s += node->atom(); 148 } else { 149 for (int i = 0; i < node->subs()->size() ; i++) { 150 if (i > 0) 151 s += ','; 152 s += Itoa((*node->subs())[i]->unique_id()); 153 } 154 } 155 return s; 156 } 157 158 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { 159 atom_vec->clear(); 160 161 // Build vector of all filter nodes, sorted topologically 162 // from top to bottom in v. 163 vector<Prefilter*> v; 164 165 // Add the top level nodes of each regexp prefilter. 166 for (int i = 0; i < prefilter_vec_.size(); i++) { 167 Prefilter* f = prefilter_vec_[i]; 168 if (f == NULL) 169 unfiltered_.push_back(i); 170 171 // We push NULL also on to v, so that we maintain the 172 // mapping of index==regexpid for level=0 prefilter nodes. 173 v.push_back(f); 174 } 175 176 // Now add all the descendant nodes. 177 for (int i = 0; i < v.size(); i++) { 178 Prefilter* f = v[i]; 179 if (f == NULL) 180 continue; 181 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { 182 const vector<Prefilter*>& subs = *f->subs(); 183 for (int j = 0; j < subs.size(); j++) 184 v.push_back(subs[j]); 185 } 186 } 187 188 // Identify unique nodes. 189 int unique_id = 0; 190 for (int i = v.size() - 1; i >= 0; i--) { 191 Prefilter *node = v[i]; 192 if (node == NULL) 193 continue; 194 node->set_unique_id(-1); 195 Prefilter* canonical = CanonicalNode(node); 196 if (canonical == NULL) { 197 // Any further nodes that have the same node string 198 // will find this node as the canonical node. 199 node_map_[NodeString(node)] = node; 200 if (node->op() == Prefilter::ATOM) { 201 atom_vec->push_back(node->atom()); 202 atom_index_to_id_.push_back(unique_id); 203 } 204 node->set_unique_id(unique_id++); 205 } else { 206 node->set_unique_id(canonical->unique_id()); 207 } 208 } 209 entries_.resize(node_map_.size()); 210 211 // Create parent IntMap for the entries. 212 for (int i = v.size() - 1; i >= 0; i--) { 213 Prefilter* prefilter = v[i]; 214 if (prefilter == NULL) 215 continue; 216 217 if (CanonicalNode(prefilter) != prefilter) 218 continue; 219 220 Entry* entry = &entries_[prefilter->unique_id()]; 221 entry->parents = new IntMap(node_map_.size()); 222 } 223 224 // Fill the entries. 225 for (int i = v.size() - 1; i >= 0; i--) { 226 Prefilter* prefilter = v[i]; 227 if (prefilter == NULL) 228 continue; 229 230 if (CanonicalNode(prefilter) != prefilter) 231 continue; 232 233 Entry* entry = &entries_[prefilter->unique_id()]; 234 235 switch (prefilter->op()) { 236 default: 237 case Prefilter::ALL: 238 LOG(DFATAL) << "Unexpected op: " << prefilter->op(); 239 return; 240 241 case Prefilter::ATOM: 242 entry->propagate_up_at_count = 1; 243 break; 244 245 case Prefilter::OR: 246 case Prefilter::AND: { 247 IntMap uniq_child(node_map_.size()); 248 for (int j = 0; j < prefilter->subs()->size() ; j++) { 249 Prefilter* child = (*prefilter->subs())[j]; 250 Prefilter* canonical = CanonicalNode(child); 251 if (canonical == NULL) { 252 LOG(DFATAL) << "Null canonical node"; 253 return; 254 } 255 int child_id = canonical->unique_id(); 256 if (!uniq_child.has_index(child_id)) 257 uniq_child.set_new(child_id, 1); 258 // To the child, we want to add to parent indices. 259 Entry* child_entry = &entries_[child_id]; 260 if (!child_entry->parents->has_index(prefilter->unique_id())) 261 child_entry->parents->set_new(prefilter->unique_id(), 1); 262 } 263 entry->propagate_up_at_count = 264 prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; 265 266 break; 267 } 268 } 269 } 270 271 // For top level nodes, populate regexp id. 272 for (int i = 0; i < prefilter_vec_.size(); i++) { 273 if (prefilter_vec_[i] == NULL) 274 continue; 275 int id = CanonicalNode(prefilter_vec_[i])->unique_id(); 276 DCHECK_LE(0, id); 277 Entry* entry = &entries_[id]; 278 entry->regexps.push_back(i); 279 } 280 } 281 282 // Functions for triggering during search. 283 void PrefilterTree::RegexpsGivenStrings( 284 const vector<int>& matched_atoms, 285 vector<int>* regexps) const { 286 regexps->clear(); 287 if (!compiled_) { 288 LOG(WARNING) << "Compile() not called"; 289 for (int i = 0; i < prefilter_vec_.size(); ++i) 290 regexps->push_back(i); 291 } else { 292 if (!prefilter_vec_.empty()) { 293 IntMap regexps_map(prefilter_vec_.size()); 294 vector<int> matched_atom_ids; 295 for (int j = 0; j < matched_atoms.size(); j++) { 296 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); 297 VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; 298 } 299 PropagateMatch(matched_atom_ids, ®exps_map); 300 for (IntMap::iterator it = regexps_map.begin(); 301 it != regexps_map.end(); 302 ++it) 303 regexps->push_back(it->index()); 304 305 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); 306 } 307 } 308 sort(regexps->begin(), regexps->end()); 309 } 310 311 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, 312 IntMap* regexps) const { 313 IntMap count(entries_.size()); 314 IntMap work(entries_.size()); 315 for (int i = 0; i < atom_ids.size(); i++) 316 work.set(atom_ids[i], 1); 317 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { 318 const Entry& entry = entries_[it->index()]; 319 VLOG(10) << "Processing: " << it->index(); 320 // Record regexps triggered. 321 for (int i = 0; i < entry.regexps.size(); i++) { 322 VLOG(10) << "Regexp triggered: " << entry.regexps[i]; 323 regexps->set(entry.regexps[i], 1); 324 } 325 int c; 326 // Pass trigger up to parents. 327 for (IntMap::iterator it = entry.parents->begin(); 328 it != entry.parents->end(); 329 ++it) { 330 int j = it->index(); 331 const Entry& parent = entries_[j]; 332 VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; 333 // Delay until all the children have succeeded. 334 if (parent.propagate_up_at_count > 1) { 335 if (count.has_index(j)) { 336 c = count.get_existing(j) + 1; 337 count.set_existing(j, c); 338 } else { 339 c = 1; 340 count.set_new(j, c); 341 } 342 if (c < parent.propagate_up_at_count) 343 continue; 344 } 345 VLOG(10) << "Triggering: " << j; 346 // Trigger the parent. 347 work.set(j, 1); 348 } 349 } 350 } 351 352 // Debugging help. 353 void PrefilterTree::PrintPrefilter(int regexpid) { 354 LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); 355 } 356 357 void PrefilterTree::PrintDebugInfo() { 358 VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); 359 VLOG(10) << "#Unique Nodes: " << entries_.size(); 360 361 for (int i = 0; i < entries_.size(); ++i) { 362 IntMap* parents = entries_[i].parents; 363 const vector<int>& regexps = entries_[i].regexps; 364 VLOG(10) << "EntryId: " << i 365 << " N: " << parents->size() << " R: " << regexps.size(); 366 for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) 367 VLOG(10) << it->index(); 368 } 369 VLOG(10) << "Map:"; 370 for (map<string, Prefilter*>::const_iterator iter = node_map_.begin(); 371 iter != node_map_.end(); ++iter) 372 VLOG(10) << "NodeId: " << (*iter).second->unique_id() 373 << " Str: " << (*iter).first; 374 } 375 376 string PrefilterTree::DebugNodeString(Prefilter* node) const { 377 string node_string = ""; 378 379 if (node->op() == Prefilter::ATOM) { 380 DCHECK(!node->atom().empty()); 381 node_string += node->atom(); 382 } else { 383 // Adding the operation disambiguates AND and OR nodes. 384 node_string += node->op() == Prefilter::AND ? "AND" : "OR"; 385 node_string += "("; 386 for (int i = 0; i < node->subs()->size() ; i++) { 387 if (i > 0) 388 node_string += ','; 389 node_string += Itoa((*node->subs())[i]->unique_id()); 390 node_string += ":"; 391 node_string += DebugNodeString((*node->subs())[i]); 392 } 393 node_string += ")"; 394 } 395 return node_string; 396 } 397 398 } // namespace re2 399