1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // The PrefilterTree class is used to form an AND-OR tree of strings 6 // that would trigger each regexp. The 'prefilter' of each regexp is 7 // added tp PrefilterTree, and then PrefilterTree is used to find all 8 // the unique strings across the prefilters. During search, by using 9 // matches from a string matching engine, PrefilterTree deduces the 10 // set of regexps that are to be triggered. The 'string matching 11 // engine' itself is outside of this class, and the caller can use any 12 // favorite engine. PrefilterTree provides a set of strings (called 13 // atoms) that the user of this class should use to do the string 14 // matching. 15 // 16 #ifndef RE2_PREFILTER_TREE_H_ 17 #define RE2_PREFILTER_TREE_H_ 18 19 #include "util/util.h" 20 #include "util/sparse_array.h" 21 22 namespace re2 { 23 24 typedef SparseArray<int> IntMap; 25 26 class Prefilter; 27 28 class PrefilterTree { 29 public: 30 PrefilterTree(); 31 ~PrefilterTree(); 32 33 // Adds the prefilter for the next regexp. Note that we assume that 34 // Add called sequentially for all regexps. All Add calls 35 // must precede Compile. 36 void Add(Prefilter* prefilter); 37 38 // The Compile returns a vector of string in atom_vec. 39 // Call this after all the prefilters are added through Add. 40 // No calls to Add after Compile are allowed. 41 // The caller should use the returned set of strings to do string matching. 42 // Each time a string matches, the corresponding index then has to be 43 // and passed to RegexpsGivenStrings below. 44 void Compile(vector<string>* atom_vec); 45 46 // Given the indices of the atoms that matched, returns the indexes 47 // of regexps that should be searched. The matched_atoms should 48 // contain all the ids of string atoms that were found to match the 49 // content. The caller can use any string match engine to perform 50 // this function. This function is thread safe. 51 void RegexpsGivenStrings(const vector<int>& matched_atoms, 52 vector<int>* regexps) const; 53 54 // Print debug prefilter. Also prints unique ids associated with 55 // nodes of the prefilter of the regexp. 56 void PrintPrefilter(int regexpid); 57 58 59 // Each unique node has a corresponding Entry that helps in 60 // passing the matching trigger information along the tree. 61 struct Entry { 62 public: 63 // How many children should match before this node triggers the 64 // parent. For an atom and an OR node, this is 1 and for an AND 65 // node, it is the number of unique children. 66 int propagate_up_at_count; 67 68 // When this node is ready to trigger the parent, what are the indices 69 // of the parent nodes to trigger. The reason there may be more than 70 // one is because of sharing. For example (abc | def) and (xyz | def) 71 // are two different nodes, but they share the atom 'def'. So when 72 // 'def' matches, it triggers two parents, corresponding to the two 73 // different OR nodes. 74 IntMap* parents; 75 76 // When this node is ready to trigger the parent, what are the 77 // regexps that are triggered. 78 vector<int> regexps; 79 }; 80 81 private: 82 // This function assigns unique ids to various parts of the 83 // prefilter, by looking at if these nodes are already in the 84 // PrefilterTree. 85 void AssignUniqueIds(vector<string>* atom_vec); 86 87 // Given the matching atoms, find the regexps to be triggered. 88 void PropagateMatch(const vector<int>& atom_ids, 89 IntMap* regexps) const; 90 91 // Returns the prefilter node that has the same NodeString as this 92 // node. For the canonical node, returns node. 93 Prefilter* CanonicalNode(Prefilter* node); 94 95 // A string that uniquely identifies the node. Assumes that the 96 // children of node has already been assigned unique ids. 97 string NodeString(Prefilter* node) const; 98 99 // Recursively constructs a readable prefilter string. 100 string DebugNodeString(Prefilter* node) const; 101 102 // Used for debugging. 103 void PrintDebugInfo(); 104 105 // These are all the nodes formed by Compile. Essentially, there is 106 // one node for each unique atom and each unique AND/OR node. 107 vector<Entry> entries_; 108 109 // Map node string to canonical Prefilter node. 110 map<string, Prefilter*> node_map_; 111 112 // indices of regexps that always pass through the filter (since we 113 // found no required literals in these regexps). 114 vector<int> unfiltered_; 115 116 // vector of Prefilter for all regexps. 117 vector<Prefilter*> prefilter_vec_; 118 119 // Atom index in returned strings to entry id mapping. 120 vector<int> atom_index_to_id_; 121 122 // Has the prefilter tree been compiled. 123 bool compiled_; 124 125 DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); 126 }; 127 128 } // namespace 129 130 #endif // RE2_PREFILTER_TREE_H_ 131