Home | History | Annotate | Download | only in matcher
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef EXTENSIONS_COMMON_MATCHER_URL_MATCHER_H_
      6 #define EXTENSIONS_COMMON_MATCHER_URL_MATCHER_H_
      7 
      8 #include <set>
      9 #include <vector>
     10 
     11 #include "base/memory/ref_counted.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/memory/scoped_vector.h"
     14 #include "extensions/common/matcher/regex_set_matcher.h"
     15 #include "extensions/common/matcher/substring_set_matcher.h"
     16 
     17 class GURL;
     18 
     19 namespace base {
     20 class DictionaryValue;
     21 }
     22 
     23 namespace extensions {
     24 
     25 // This class represents a single URL matching condition, e.g. a match on the
     26 // host suffix or the containment of a string in the query component of a GURL.
     27 //
     28 // The difference from a simple StringPattern is that this also supports
     29 // checking whether the {Host, Path, Query} of a URL contains a string. The
     30 // reduction of URL matching conditions to StringPatterns conducted by
     31 // URLMatcherConditionFactory is not capable of expressing that alone.
     32 //
     33 // Also supported is matching regular expressions against the URL (URL_MATCHES).
     34 class URLMatcherCondition {
     35  public:
     36   enum Criterion {
     37     HOST_PREFIX,
     38     HOST_SUFFIX,
     39     HOST_CONTAINS,
     40     HOST_EQUALS,
     41     PATH_PREFIX,
     42     PATH_SUFFIX,
     43     PATH_CONTAINS,
     44     PATH_EQUALS,
     45     QUERY_PREFIX,
     46     QUERY_SUFFIX,
     47     QUERY_CONTAINS,
     48     QUERY_EQUALS,
     49     HOST_SUFFIX_PATH_PREFIX,
     50     HOST_EQUALS_PATH_PREFIX,
     51     URL_PREFIX,
     52     URL_SUFFIX,
     53     URL_CONTAINS,
     54     URL_EQUALS,
     55     URL_MATCHES,
     56     ORIGIN_AND_PATH_MATCHES,  // Matches the URL minus its query string.
     57   };
     58 
     59   URLMatcherCondition();
     60   ~URLMatcherCondition();
     61   URLMatcherCondition(Criterion criterion,
     62                       const StringPattern* substring_pattern);
     63   URLMatcherCondition(const URLMatcherCondition& rhs);
     64   URLMatcherCondition& operator=(const URLMatcherCondition& rhs);
     65   bool operator<(const URLMatcherCondition& rhs) const;
     66 
     67   Criterion criterion() const { return criterion_; }
     68   const StringPattern* string_pattern() const {
     69     return string_pattern_;
     70   }
     71 
     72   // Returns whether this URLMatcherCondition needs to be executed on a
     73   // full URL rather than the individual components (see
     74   // URLMatcherConditionFactory).
     75   bool IsFullURLCondition() const;
     76 
     77   // Returns whether this URLMatcherCondition is a regular expression to be
     78   // handled by a regex matcher instead of a substring matcher.
     79   bool IsRegexCondition() const;
     80 
     81   // Returns whether this URLMatcherCondition is a regular expression that shall
     82   // be evaluated on the URL without the query parameter.
     83   bool IsOriginAndPathRegexCondition() const;
     84 
     85   // Returns whether this condition is fulfilled according to
     86   // |matching_patterns| and |url|.
     87   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
     88                const GURL& url) const;
     89 
     90  private:
     91   // |criterion_| and |string_pattern_| describe together what property a URL
     92   // needs to fulfill to be considered a match.
     93   Criterion criterion_;
     94 
     95   // This is the StringPattern that is used in a SubstringSetMatcher.
     96   const StringPattern* string_pattern_;
     97 };
     98 
     99 // Class to map the problem of finding {host, path, query} {prefixes, suffixes,
    100 // containments, and equality} in GURLs to the substring matching problem.
    101 //
    102 // Say, you want to check whether the path of a URL starts with "/index.html".
    103 // This class preprocesses a URL like "www.google.com/index.html" into something
    104 // like "www.google.com|/index.html". After preprocessing, you can search for
    105 // "|/index.html" in the string and see that this candidate URL actually has
    106 // a path that starts with "/index.html". On the contrary,
    107 // "www.google.com/images/index.html" would be normalized to
    108 // "www.google.com|/images/index.html". It is easy to see that it contains
    109 // "/index.html" but the path of the URL does not start with "/index.html".
    110 //
    111 // This preprocessing is important if you want to match a URL against many
    112 // patterns because it reduces the matching to a "discover all substrings
    113 // of a dictionary in a text" problem, which can be solved very efficiently
    114 // by the Aho-Corasick algorithm.
    115 //
    116 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern
    117 // referenced by created URLMatcherConditions. Therefore, it must outlive
    118 // all created URLMatcherCondition and the SubstringSetMatcher.
    119 class URLMatcherConditionFactory {
    120  public:
    121   URLMatcherConditionFactory();
    122   ~URLMatcherConditionFactory();
    123 
    124   // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
    125   std::string CanonicalizeURLForComponentSearches(const GURL& url) const;
    126 
    127   // Factory methods for various condition types.
    128   //
    129   // Note that these methods fill the pattern_singletons_. If you create
    130   // conditions and don't register them to a URLMatcher, they will continue to
    131   // consume memory. You need to call ForgetUnusedPatterns() or
    132   // URLMatcher::ClearUnusedConditionSets() in this case.
    133   URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix);
    134   URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix);
    135   URLMatcherCondition CreateHostContainsCondition(const std::string& str);
    136   URLMatcherCondition CreateHostEqualsCondition(const std::string& str);
    137 
    138   URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix);
    139   URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix);
    140   URLMatcherCondition CreatePathContainsCondition(const std::string& str);
    141   URLMatcherCondition CreatePathEqualsCondition(const std::string& str);
    142 
    143   URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix);
    144   URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix);
    145   URLMatcherCondition CreateQueryContainsCondition(const std::string& str);
    146   URLMatcherCondition CreateQueryEqualsCondition(const std::string& str);
    147 
    148   // This covers the common case, where you don't care whether a domain
    149   // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
    150   // should be followed by a given |path_prefix|.
    151   URLMatcherCondition CreateHostSuffixPathPrefixCondition(
    152       const std::string& host_suffix,
    153       const std::string& path_prefix);
    154   URLMatcherCondition CreateHostEqualsPathPrefixCondition(
    155       const std::string& host,
    156       const std::string& path_prefix);
    157 
    158   // Canonicalizes a URL for "CreateURL*Condition" searches.
    159   std::string CanonicalizeURLForFullSearches(const GURL& url) const;
    160 
    161   // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
    162   std::string CanonicalizeURLForRegexSearches(const GURL& url) const;
    163   // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
    164   std::string CanonicalizeURLForOriginAndPathRegexSearches(
    165       const GURL& url) const;
    166 
    167   URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix);
    168   URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix);
    169   URLMatcherCondition CreateURLContainsCondition(const std::string& str);
    170   URLMatcherCondition CreateURLEqualsCondition(const std::string& str);
    171 
    172   URLMatcherCondition CreateURLMatchesCondition(const std::string& regex);
    173   URLMatcherCondition CreateOriginAndPathMatchesCondition(
    174       const std::string& regex);
    175 
    176   // Removes all patterns from |pattern_singletons_| that are not listed in
    177   // |used_patterns|. These patterns are not referenced any more and get
    178   // freed.
    179   void ForgetUnusedPatterns(
    180       const std::set<StringPattern::ID>& used_patterns);
    181 
    182   // Returns true if this object retains no allocated data. Only for debugging.
    183   bool IsEmpty() const;
    184 
    185  private:
    186   // Creates a URLMatcherCondition according to the parameters passed.
    187   // The URLMatcherCondition will refer to a StringPattern that is
    188   // owned by |pattern_singletons_|.
    189   URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion,
    190                                       const std::string& pattern);
    191 
    192   // Prepends a "." to the hostname if it does not start with one.
    193   std::string CanonicalizeHostname(const std::string& hostname) const;
    194 
    195   // Counter that ensures that all created StringPatterns have unique IDs.
    196   // Note that substring patterns and regex patterns will use different IDs.
    197   int id_counter_;
    198 
    199   // This comparison considers only the pattern() value of the
    200   // StringPatterns.
    201   struct StringPatternPointerCompare {
    202     bool operator()(StringPattern* lhs, StringPattern* rhs) const;
    203   };
    204   // Set to ensure that we generate only one StringPattern for each content
    205   // of StringPattern::pattern().
    206   typedef std::set<StringPattern*, StringPatternPointerCompare>
    207       PatternSingletons;
    208   PatternSingletons substring_pattern_singletons_;
    209   PatternSingletons regex_pattern_singletons_;
    210   PatternSingletons origin_and_path_regex_pattern_singletons_;
    211 
    212   DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory);
    213 };
    214 
    215 // This class represents a filter for the URL scheme to be hooked up into a
    216 // URLMatcherConditionSet.
    217 class URLMatcherSchemeFilter {
    218  public:
    219   explicit URLMatcherSchemeFilter(const std::string& filter);
    220   explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters);
    221   ~URLMatcherSchemeFilter();
    222   bool IsMatch(const GURL& url) const;
    223 
    224  private:
    225   std::vector<std::string> filters_;
    226 
    227   DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter);
    228 };
    229 
    230 // This class represents a filter for port numbers to be hooked up into a
    231 // URLMatcherConditionSet.
    232 class URLMatcherPortFilter {
    233  public:
    234   // Boundaries of a port range (both ends are included).
    235   typedef std::pair<int, int> Range;
    236   explicit URLMatcherPortFilter(const std::vector<Range>& ranges);
    237   ~URLMatcherPortFilter();
    238   bool IsMatch(const GURL& url) const;
    239 
    240   // Creates a port range [from, to]; both ends are included.
    241   static Range CreateRange(int from, int to);
    242   // Creates a port range containing a single port.
    243   static Range CreateRange(int port);
    244 
    245  private:
    246   std::vector<Range> ranges_;
    247 
    248   DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter);
    249 };
    250 
    251 // This class represents a set of conditions that all need to match on a
    252 // given URL in order to be considered a match.
    253 class URLMatcherConditionSet : public base::RefCounted<URLMatcherConditionSet> {
    254  public:
    255   typedef int ID;
    256   typedef std::set<URLMatcherCondition> Conditions;
    257   typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector;
    258 
    259   // Matches if all conditions in |conditions| are fulfilled.
    260   URLMatcherConditionSet(ID id, const Conditions& conditions);
    261 
    262   // Matches if all conditions in |conditions|, |scheme_filter| and
    263   // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
    264   // in which case, no restrictions are imposed on the scheme/port of a URL.
    265   URLMatcherConditionSet(ID id, const Conditions& conditions,
    266                          scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    267                          scoped_ptr<URLMatcherPortFilter> port_filter);
    268 
    269   ID id() const { return id_; }
    270   const Conditions& conditions() const { return conditions_; }
    271 
    272   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
    273                const GURL& url) const;
    274 
    275  private:
    276   friend class base::RefCounted<URLMatcherConditionSet>;
    277   ~URLMatcherConditionSet();
    278   ID id_;
    279   Conditions conditions_;
    280   scoped_ptr<URLMatcherSchemeFilter> scheme_filter_;
    281   scoped_ptr<URLMatcherPortFilter> port_filter_;
    282 
    283   DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet);
    284 };
    285 
    286 // This class allows matching one URL against a large set of
    287 // URLMatcherConditionSets at the same time.
    288 class URLMatcher {
    289  public:
    290   URLMatcher();
    291   ~URLMatcher();
    292 
    293   // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set
    294   // must have a unique ID.
    295   // This is an expensive operation as it triggers pre-calculations on the
    296   // currently registered condition sets. Do not call this operation many
    297   // times with a single condition set in each call.
    298   void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets);
    299 
    300   // Removes the listed condition sets. All |condition_set_ids| must be
    301   // currently registered. This function should be called with large batches
    302   // of |condition_set_ids| at a time to improve performance.
    303   void RemoveConditionSets(
    304       const std::vector<URLMatcherConditionSet::ID>& condition_set_ids);
    305 
    306   // Removes all unused condition sets from the ConditionFactory.
    307   void ClearUnusedConditionSets();
    308 
    309   // Returns the IDs of all URLMatcherConditionSet that match to this |url|.
    310   std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const;
    311 
    312   // Returns the URLMatcherConditionFactory that must be used to create
    313   // URLMatcherConditionSets for this URLMatcher.
    314   URLMatcherConditionFactory* condition_factory() {
    315     return &condition_factory_;
    316   }
    317 
    318   // Returns true if this object retains no allocated data. Only for debugging.
    319   bool IsEmpty() const;
    320 
    321  private:
    322   void UpdateSubstringSetMatcher(bool full_url_conditions);
    323   void UpdateRegexSetMatcher();
    324   void UpdateTriggers();
    325   void UpdateConditionFactory();
    326   void UpdateInternalDatastructures();
    327 
    328   URLMatcherConditionFactory condition_factory_;
    329 
    330   // Maps the ID of a URLMatcherConditionSet to the respective
    331   // URLMatcherConditionSet.
    332   typedef std::map<URLMatcherConditionSet::ID,
    333                    scoped_refptr<URLMatcherConditionSet> >
    334       URLMatcherConditionSets;
    335   URLMatcherConditionSets url_matcher_condition_sets_;
    336 
    337   // Maps a StringPattern ID to the URLMatcherConditions that need to
    338   // be triggered in case of a StringPattern match.
    339   typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> >
    340       StringPatternTriggers;
    341   StringPatternTriggers substring_match_triggers_;
    342 
    343   SubstringSetMatcher full_url_matcher_;
    344   SubstringSetMatcher url_component_matcher_;
    345   RegexSetMatcher regex_set_matcher_;
    346   RegexSetMatcher origin_and_path_regex_set_matcher_;
    347   std::set<const StringPattern*> registered_full_url_patterns_;
    348   std::set<const StringPattern*> registered_url_component_patterns_;
    349 
    350   DISALLOW_COPY_AND_ASSIGN(URLMatcher);
    351 };
    352 
    353 }  // namespace extensions
    354 
    355 #endif  // EXTENSIONS_COMMON_MATCHER_URL_MATCHER_H_
    356