Home | History | Annotate | Download | only in re2
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
      6 // It provides a prefilter mechanism that helps in cutting down the
      7 // number of regexps that need to be actually searched.
      8 //
      9 // By design, it does not include a string matching engine. This is to
     10 // allow the user of the class to use their favorite string match
     11 // engine. The overall flow is: Add all the regexps using Add, then
     12 // Compile the FilteredRE2. The compile returns strings that need to
     13 // be matched. Note that all returned strings are lowercase. For
     14 // applying regexps to a search text, the caller does the string
     15 // matching using the strings returned. When doing the string match,
     16 // note that the caller has to do that on a lowercased version of the
     17 // search text. Then call FirstMatch or AllMatches with a vector of
     18 // indices of strings that were found in the text to get the actual
     19 // regexp matches.
     20 
     21 #ifndef RE2_FILTERED_RE2_H_
     22 #define RE2_FILTERED_RE2_H_
     23 
     24 #include <vector>
     25 #include "re2/re2.h"
     26 
     27 namespace re2 {
     28 using std::vector;
     29 
     30 class PrefilterTree;
     31 
     32 class FilteredRE2 {
     33  public:
     34   FilteredRE2();
     35   ~FilteredRE2();
     36 
     37   // Builds an RE2 object (re) from pattern and options and returns
     38   // re->error_code(). If that code is not NoError, re is deleted and
     39   // not added to re2_vec_. NOTE(review): id is presumably an output; confirm.
     40   RE2::ErrorCode Add(const StringPiece& pattern,
     41                      const RE2::Options& options,
     42                      int *id);
     43 
     44   // Prepares the regexps added by Add for filtering.  Hands back, via
     45   // strings_to_match, the set of strings that the caller should check
     46   // for in candidate texts.  Those strings are lowercased, so the
     47   // search text must be lowercased as well before it is scanned for
     48   // the strings produced by Compile.  Call after all Add calls are
     49   // done.
     50   void Compile(vector<string>* strings_to_match);
     51 
     52   // Returns the index of the first matching regexp.
     53   // Returns -1 on no match. Can be called prior to Compile.
     54   // Does not do any filtering: simply tries to Match the
     55   // regexps in a loop.
     56   int SlowFirstMatch(const StringPiece& text) const;
     57 
     58   // Returns the index of the first matching regexp.
     59   // Returns -1 on no match. Compile has to be called before
     60   // calling this.
     61   int FirstMatch(const StringPiece& text,
     62                  const vector<int>& atoms) const;
     63 
     64   // Stores the indices of all matching regexps in matching_regexps,
     65   // after first clearing it. NOTE(review): bool result is undocumented.
     66   bool AllMatches(const StringPiece& text,
     67                   const vector<int>& atoms,
     68                   vector<int>* matching_regexps) const;
     69 
     70   // The number of regexps added (size_t count narrowed explicitly to int).
     71   int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
     72 
     73  private:
     74 
     75   // Returns the RE2 stored at index regexpid (unchecked). Useful for testing.
     76   RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
     77 
     78   // Print prefilter.
     79   void PrintPrefilter(int regexpid);
     80 
     81   // Useful for testing and debugging.
     82   void RegexpsGivenStrings(const vector<int>& matched_atoms,
     83                            vector<int>* passed_regexps);
     84 
     85   // All the regexps in the FilteredRE2.
     86   vector<RE2*> re2_vec_;
     87 
     88   // Whether this FilteredRE2 has been compiled using Compile().
     89   bool compiled_;
     90 
     91   // An AND-OR tree of string atoms used for filtering regexps.
     92   PrefilterTree* prefilter_tree_;
     93 
     94   // Private copy operations (in place of DISALLOW_EVIL_CONSTRUCTORS).
     95   FilteredRE2(const FilteredRE2&);
     96   void operator=(const FilteredRE2&);
     97 };
     98 
     99 }  // namespace re2
    100 
    101 #endif  // RE2_FILTERED_RE2_H_
    102