Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // The PrefilterTree class is used to form an AND-OR tree of strings
      6 // that would trigger each regexp. The 'prefilter' of each regexp is
      7 // added tp PrefilterTree, and then PrefilterTree is used to find all
      8 // the unique strings across the prefilters. During search, by using
      9 // matches from a string matching engine, PrefilterTree deduces the
     10 // set of regexps that are to be triggered. The 'string matching
     11 // engine' itself is outside of this class, and the caller can use any
     12 // favorite engine. PrefilterTree provides a set of strings (called
     13 // atoms) that the user of this class should use to do the string
     14 // matching.
     15 //
     16 #ifndef RE2_PREFILTER_TREE_H_
     17 #define RE2_PREFILTER_TREE_H_
     18 
     19 #include <map>
     20 
     21 #include "util/util.h"
     22 #include "util/sparse_array.h"
     23 
     24 namespace re2 {
     25 
     26 typedef SparseArray<int> IntMap;
     27 typedef std::map<int, int> StdIntMap;
     28 
     29 class Prefilter;
     30 
     31 class PrefilterTree {
     32  public:
     33   PrefilterTree();
     34   ~PrefilterTree();
     35 
     36   // Adds the prefilter for the next regexp. Note that we assume that
     37   // Add called sequentially for all regexps. All Add calls
     38   // must precede Compile.
     39   void Add(Prefilter* prefilter);
     40 
     41   // The Compile returns a vector of string in atom_vec.
     42   // Call this after all the prefilters are added through Add.
     43   // No calls to Add after Compile are allowed.
     44   // The caller should use the returned set of strings to do string matching.
     45   // Each time a string matches, the corresponding index then has to be
     46   // and passed to RegexpsGivenStrings below.
     47   void Compile(vector<string>* atom_vec);
     48 
     49   // Given the indices of the atoms that matched, returns the indexes
     50   // of regexps that should be searched.  The matched_atoms should
     51   // contain all the ids of string atoms that were found to match the
     52   // content. The caller can use any string match engine to perform
     53   // this function. This function is thread safe.
     54   void RegexpsGivenStrings(const vector<int>& matched_atoms,
     55                            vector<int>* regexps) const;
     56 
     57   // Print debug prefilter. Also prints unique ids associated with
     58   // nodes of the prefilter of the regexp.
     59   void PrintPrefilter(int regexpid);
     60 
     61 
     62   // Each unique node has a corresponding Entry that helps in
     63   // passing the matching trigger information along the tree.
     64   struct Entry {
     65    public:
     66     // How many children should match before this node triggers the
     67     // parent. For an atom and an OR node, this is 1 and for an AND
     68     // node, it is the number of unique children.
     69     int propagate_up_at_count;
     70 
     71     // When this node is ready to trigger the parent, what are the indices
     72     // of the parent nodes to trigger. The reason there may be more than
     73     // one is because of sharing. For example (abc | def) and (xyz | def)
     74     // are two different nodes, but they share the atom 'def'. So when
     75     // 'def' matches, it triggers two parents, corresponding to the two
     76     // different OR nodes.
     77     StdIntMap* parents;
     78 
     79     // When this node is ready to trigger the parent, what are the
     80     // regexps that are triggered.
     81     vector<int> regexps;
     82   };
     83 
     84  private:
     85   // This function assigns unique ids to various parts of the
     86   // prefilter, by looking at if these nodes are already in the
     87   // PrefilterTree.
     88   void AssignUniqueIds(vector<string>* atom_vec);
     89 
     90   // Given the matching atoms, find the regexps to be triggered.
     91   void PropagateMatch(const vector<int>& atom_ids,
     92                       IntMap* regexps) const;
     93 
     94   // Returns the prefilter node that has the same NodeString as this
     95   // node. For the canonical node, returns node.
     96   Prefilter* CanonicalNode(Prefilter* node);
     97 
     98   // A string that uniquely identifies the node. Assumes that the
     99   // children of node has already been assigned unique ids.
    100   string NodeString(Prefilter* node) const;
    101 
    102   // Recursively constructs a readable prefilter string.
    103   string DebugNodeString(Prefilter* node) const;
    104 
    105   // Used for debugging.
    106   void PrintDebugInfo();
    107 
    108   // These are all the nodes formed by Compile. Essentially, there is
    109   // one node for each unique atom and each unique AND/OR node.
    110   vector<Entry> entries_;
    111 
    112   // Map node string to canonical Prefilter node.
    113   map<string, Prefilter*> node_map_;
    114 
    115   // indices of regexps that always pass through the filter (since we
    116   // found no required literals in these regexps).
    117   vector<int> unfiltered_;
    118 
    119   // vector of Prefilter for all regexps.
    120   vector<Prefilter*> prefilter_vec_;
    121 
    122   // Atom index in returned strings to entry id mapping.
    123   vector<int> atom_index_to_id_;
    124 
    125   // Has the prefilter tree been compiled.
    126   bool compiled_;
    127 
    128   DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
    129 };
    130 
    131 }  // namespace
    132 
    133 #endif  // RE2_PREFILTER_TREE_H_
    134