Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
      6 #define COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
      7 
      8 #include <map>
      9 #include <set>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/memory/scoped_ptr.h"
     14 #include "components/url_matcher/string_pattern.h"
     15 #include "components/url_matcher/substring_set_matcher.h"
     16 #include "components/url_matcher/url_matcher_export.h"
     17 
     18 namespace re2 {
     19 class FilteredRE2;
     20 }
     21 
     22 namespace url_matcher {
     23 
     24 // Efficiently matches URLs against a collection of regular expressions,
     25 // using FilteredRE2 to reduce the number of regexes that must be matched
     26 // by pre-filtering with substring matching. See:
     27 // http://swtch.com/~rsc/regexp/regexp3.html#analysis
     28 class URL_MATCHER_EXPORT RegexSetMatcher {
     29  public:
     30   RegexSetMatcher();
     31   virtual ~RegexSetMatcher();
     32 
     33   // Adds the regex patterns in |regex_list| to the matcher. Also rebuilds
     34   // the FilteredRE2 matcher; thus, for efficiency, prefer adding multiple
     35   // patterns at once.
     36   // Ownership of the patterns remains with the caller.
     37   void AddPatterns(const std::vector<const StringPattern*>& regex_list);
     38 
     39   // Removes all regex patterns.
     40   void ClearPatterns();
     41 
     42   // Appends the IDs of regular expressions in our set that match the |text|
     43   // to |matches|.
     44   bool Match(const std::string& text,
     45              std::set<StringPattern::ID>* matches) const;
     46 
     47   bool IsEmpty() const;
     48 
     49  private:
     50   typedef int RE2ID;
     51   typedef std::map<StringPattern::ID, const StringPattern*> RegexMap;
     52   typedef std::vector<StringPattern::ID> RE2IDMap;
     53 
     54   // Use Aho-Corasick SubstringSetMatcher to find which literal patterns
     55   // match the |text|.
     56   std::vector<RE2ID> FindSubstringMatches(const std::string& text) const;
     57 
     58   // Rebuild FilteredRE2 from scratch. Needs to be called whenever
     59   // our set of regexes changes.
     60   // TODO(yoz): investigate if it could be done incrementally;
     61   // apparently not supported by FilteredRE2.
     62   void RebuildMatcher();
     63 
     64   // Clean up StringPatterns in |substring_patterns_|.
     65   void DeleteSubstringPatterns();
     66 
     67   // Mapping of regex StringPattern::IDs to regexes.
     68   RegexMap regexes_;
     69   // Mapping of RE2IDs from FilteredRE2 (which are assigned in order)
     70   // to regex StringPattern::IDs.
     71   RE2IDMap re2_id_map_;
     72 
     73   scoped_ptr<re2::FilteredRE2> filtered_re2_;
     74   scoped_ptr<SubstringSetMatcher> substring_matcher_;
     75 
     76   // The substring patterns from FilteredRE2, which are used in
     77   // |substring_matcher_| but whose lifetime is managed here.
     78   std::vector<const StringPattern*> substring_patterns_;
     79 };
     80 
     81 }  // namespace url_matcher
     82 
     83 #endif  // COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
     84