Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_
      6 #define COMPONENTS_URL_MATCHER_URL_MATCHER_H_
      7 
      8 #include <set>
      9 #include <vector>
     10 
     11 #include "base/memory/ref_counted.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/memory/scoped_vector.h"
     14 #include "components/url_matcher/regex_set_matcher.h"
     15 #include "components/url_matcher/substring_set_matcher.h"
     16 #include "components/url_matcher/url_matcher_export.h"
     17 
     18 class GURL;
     19 
     20 namespace base {
     21 class DictionaryValue;
     22 }
     23 
     24 namespace url_matcher {
     25 
     26 // This class represents a single URL matching condition, e.g. a match on the
     27 // host suffix or the containment of a string in the query component of a GURL.
     28 //
     29 // The difference from a simple StringPattern is that this also supports
     30 // checking whether the {Host, Path, Query} of a URL contains a string. The
     31 // reduction of URL matching conditions to StringPatterns conducted by
     32 // URLMatcherConditionFactory is not capable of expressing that alone.
     33 //
     34 // Also supported is matching regular expressions against the URL (URL_MATCHES).
     35 class URL_MATCHER_EXPORT URLMatcherCondition {
     36  public:
     37   enum Criterion {
     38     HOST_PREFIX,
     39     HOST_SUFFIX,
     40     HOST_CONTAINS,
     41     HOST_EQUALS,
     42     PATH_PREFIX,
     43     PATH_SUFFIX,
     44     PATH_CONTAINS,
     45     PATH_EQUALS,
     46     QUERY_PREFIX,
     47     QUERY_SUFFIX,
     48     QUERY_CONTAINS,
     49     QUERY_EQUALS,
     50     HOST_SUFFIX_PATH_PREFIX,
     51     HOST_EQUALS_PATH_PREFIX,
     52     URL_PREFIX,
     53     URL_SUFFIX,
     54     URL_CONTAINS,
     55     URL_EQUALS,
     56     URL_MATCHES,
     57     ORIGIN_AND_PATH_MATCHES,  // Matches the URL minus its query string.
     58   };
     59 
     60   URLMatcherCondition();
     61   ~URLMatcherCondition();
     62   URLMatcherCondition(Criterion criterion,
     63                       const StringPattern* substring_pattern);
     64   URLMatcherCondition(const URLMatcherCondition& rhs);
     65   URLMatcherCondition& operator=(const URLMatcherCondition& rhs);
     66   bool operator<(const URLMatcherCondition& rhs) const;
     67 
     68   Criterion criterion() const { return criterion_; }
     69   const StringPattern* string_pattern() const {
     70     return string_pattern_;
     71   }
     72 
     73   // Returns whether this URLMatcherCondition needs to be executed on a
     74   // full URL rather than the individual components (see
     75   // URLMatcherConditionFactory).
     76   bool IsFullURLCondition() const;
     77 
     78   // Returns whether this URLMatcherCondition is a regular expression to be
     79   // handled by a regex matcher instead of a substring matcher.
     80   bool IsRegexCondition() const;
     81 
     82   // Returns whether this URLMatcherCondition is a regular expression that shall
     83   // be evaluated on the URL without the query parameter.
     84   bool IsOriginAndPathRegexCondition() const;
     85 
     86   // Returns whether this condition is fulfilled according to
     87   // |matching_patterns| and |url|.
     88   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
     89                const GURL& url) const;
     90 
     91  private:
     92   // |criterion_| and |string_pattern_| describe together what property a URL
     93   // needs to fulfill to be considered a match.
     94   Criterion criterion_;
     95 
     96   // This is the StringPattern that is used in a SubstringSetMatcher.
     97   const StringPattern* string_pattern_;
     98 };
     99 
    100 // Class to map the problem of finding {host, path, query} {prefixes, suffixes,
    101 // containments, and equality} in GURLs to the substring matching problem.
    102 //
    103 // Say, you want to check whether the path of a URL starts with "/index.html".
    104 // This class preprocesses a URL like "www.google.com/index.html" into something
    105 // like "www.google.com|/index.html". After preprocessing, you can search for
    106 // "|/index.html" in the string and see that this candidate URL actually has
    107 // a path that starts with "/index.html". On the contrary,
    108 // "www.google.com/images/index.html" would be normalized to
    109 // "www.google.com|/images/index.html". It is easy to see that it contains
    110 // "/index.html" but the path of the URL does not start with "/index.html".
    111 //
    112 // This preprocessing is important if you want to match a URL against many
    113 // patterns because it reduces the matching to a "discover all substrings
    114 // of a dictionary in a text" problem, which can be solved very efficiently
    115 // by the Aho-Corasick algorithm.
    116 //
    117 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern
    118 // referenced by created URLMatcherConditions. Therefore, it must outlive
    119 // all created URLMatcherCondition and the SubstringSetMatcher.
    120 class URL_MATCHER_EXPORT URLMatcherConditionFactory {
    121  public:
    122   URLMatcherConditionFactory();
    123   ~URLMatcherConditionFactory();
    124 
    125   // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
    126   std::string CanonicalizeURLForComponentSearches(const GURL& url) const;
    127 
    128   // Factory methods for various condition types.
    129   //
    130   // Note that these methods fill the pattern_singletons_. If you create
    131   // conditions and don't register them to a URLMatcher, they will continue to
    132   // consume memory. You need to call ForgetUnusedPatterns() or
    133   // URLMatcher::ClearUnusedConditionSets() in this case.
    134   URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix);
    135   URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix);
    136   URLMatcherCondition CreateHostContainsCondition(const std::string& str);
    137   URLMatcherCondition CreateHostEqualsCondition(const std::string& str);
    138 
    139   URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix);
    140   URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix);
    141   URLMatcherCondition CreatePathContainsCondition(const std::string& str);
    142   URLMatcherCondition CreatePathEqualsCondition(const std::string& str);
    143 
    144   URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix);
    145   URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix);
    146   URLMatcherCondition CreateQueryContainsCondition(const std::string& str);
    147   URLMatcherCondition CreateQueryEqualsCondition(const std::string& str);
    148 
    149   // This covers the common case, where you don't care whether a domain
    150   // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
    151   // should be followed by a given |path_prefix|.
    152   URLMatcherCondition CreateHostSuffixPathPrefixCondition(
    153       const std::string& host_suffix,
    154       const std::string& path_prefix);
    155   URLMatcherCondition CreateHostEqualsPathPrefixCondition(
    156       const std::string& host,
    157       const std::string& path_prefix);
    158 
    159   // Canonicalizes a URL for "CreateURL*Condition" searches.
    160   std::string CanonicalizeURLForFullSearches(const GURL& url) const;
    161 
    162   // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
    163   std::string CanonicalizeURLForRegexSearches(const GURL& url) const;
    164   // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
    165   std::string CanonicalizeURLForOriginAndPathRegexSearches(
    166       const GURL& url) const;
    167 
    168   URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix);
    169   URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix);
    170   URLMatcherCondition CreateURLContainsCondition(const std::string& str);
    171   URLMatcherCondition CreateURLEqualsCondition(const std::string& str);
    172 
    173   URLMatcherCondition CreateURLMatchesCondition(const std::string& regex);
    174   URLMatcherCondition CreateOriginAndPathMatchesCondition(
    175       const std::string& regex);
    176 
    177   // Removes all patterns from |pattern_singletons_| that are not listed in
    178   // |used_patterns|. These patterns are not referenced any more and get
    179   // freed.
    180   void ForgetUnusedPatterns(
    181       const std::set<StringPattern::ID>& used_patterns);
    182 
    183   // Returns true if this object retains no allocated data. Only for debugging.
    184   bool IsEmpty() const;
    185 
    186  private:
    187   // Creates a URLMatcherCondition according to the parameters passed.
    188   // The URLMatcherCondition will refer to a StringPattern that is
    189   // owned by |pattern_singletons_|.
    190   URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion,
    191                                       const std::string& pattern);
    192 
    193   // Prepends a "." to the hostname if it does not start with one.
    194   std::string CanonicalizeHostname(const std::string& hostname) const;
    195 
    196   // Convert the query string to canonical form suitable for key token search.
    197   std::string CanonicalizeQuery(std::string query,
    198                                 bool prepend_beginning_of_query_component,
    199                                 bool append_end_of_query_component) const;
    200 
    201   // Counter that ensures that all created StringPatterns have unique IDs.
    202   // Note that substring patterns and regex patterns will use different IDs.
    203   int id_counter_;
    204 
    205   // This comparison considers only the pattern() value of the
    206   // StringPatterns.
    207   struct StringPatternPointerCompare {
    208     bool operator()(StringPattern* lhs, StringPattern* rhs) const;
    209   };
    210   // Set to ensure that we generate only one StringPattern for each content
    211   // of StringPattern::pattern().
    212   typedef std::set<StringPattern*, StringPatternPointerCompare>
    213       PatternSingletons;
    214   PatternSingletons substring_pattern_singletons_;
    215   PatternSingletons regex_pattern_singletons_;
    216   PatternSingletons origin_and_path_regex_pattern_singletons_;
    217 
    218   DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory);
    219 };
    220 
    221 // This class represents a single URL query matching condition. The query
    222 // matching is done as a search for a key and optionally a value.
    223 // The matching makes use of CanonicalizeURLForComponentSearches to ensure that
    224 // the key starts and ends (optionally) with the right marker.
    225 class URL_MATCHER_EXPORT URLQueryElementMatcherCondition {
    226  public:
    227   // Multiple occurrences of the same key can happen in a URL query. The type
    228   // ensures that every (MATCH_ALL), any (MATCH_ANY), first (MATCH_FIRST) or
    229   // last (MATCH_LAST) instance of the key occurrence matches the value.
    230   enum Type { MATCH_ANY, MATCH_FIRST, MATCH_LAST, MATCH_ALL };
    231 
    232   // Allows the match to be exact (QUERY_VALUE_MATCH_EXACT, starts and ends with
    233   // a delimiter or a border) or simply a prefix (QUERY_VALUE_MATCH_PREFIX,
    234   // starts with a delimiter or a border).
    235   enum QueryValueMatchType {
    236     QUERY_VALUE_MATCH_EXACT,
    237     QUERY_VALUE_MATCH_PREFIX
    238   };
    239 
    240   // Used to indicate if the query parameter is of type &key=value&
    241   // (ELEMENT_TYPE_KEY_VALUE) or simply &key& (ELEMENT_TYPE_KEY).
    242   enum QueryElementType { ELEMENT_TYPE_KEY_VALUE, ELEMENT_TYPE_KEY };
    243 
    244   URLQueryElementMatcherCondition(const std::string& key,
    245                                   const std::string& value,
    246                                   QueryValueMatchType query_value_match_type,
    247                                   QueryElementType query_element_type,
    248                                   Type match_type,
    249                                   URLMatcherConditionFactory* factory);
    250   ~URLQueryElementMatcherCondition();
    251 
    252   bool operator<(const URLQueryElementMatcherCondition& rhs) const;
    253 
    254   // Returns whether the URL query satisfies the key value constraint.
    255   bool IsMatch(const std::string& canonical_url_query) const;
    256 
    257   const StringPattern* string_pattern() const { return string_pattern_; }
    258 
    259  private:
    260   Type match_type_;
    261   std::string key_;
    262   std::string value_;
    263   size_t key_length_;
    264   size_t value_length_;
    265   const StringPattern* string_pattern_;
    266 };
    267 
    268 // This class represents a filter for the URL scheme to be hooked up into a
    269 // URLMatcherConditionSet.
    270 class URL_MATCHER_EXPORT URLMatcherSchemeFilter {
    271  public:
    272   explicit URLMatcherSchemeFilter(const std::string& filter);
    273   explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters);
    274   ~URLMatcherSchemeFilter();
    275   bool IsMatch(const GURL& url) const;
    276 
    277  private:
    278   std::vector<std::string> filters_;
    279 
    280   DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter);
    281 };
    282 
    283 // This class represents a filter for port numbers to be hooked up into a
    284 // URLMatcherConditionSet.
    285 class URL_MATCHER_EXPORT URLMatcherPortFilter {
    286  public:
    287   // Boundaries of a port range (both ends are included).
    288   typedef std::pair<int, int> Range;
    289   explicit URLMatcherPortFilter(const std::vector<Range>& ranges);
    290   ~URLMatcherPortFilter();
    291   bool IsMatch(const GURL& url) const;
    292 
    293   // Creates a port range [from, to]; both ends are included.
    294   static Range CreateRange(int from, int to);
    295   // Creates a port range containing a single port.
    296   static Range CreateRange(int port);
    297 
    298  private:
    299   std::vector<Range> ranges_;
    300 
    301   DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter);
    302 };
    303 
    304 // This class represents a set of conditions that all need to match on a
    305 // given URL in order to be considered a match.
    306 class URL_MATCHER_EXPORT URLMatcherConditionSet
    307     : public base::RefCounted<URLMatcherConditionSet> {
    308  public:
    309   typedef int ID;
    310   typedef std::set<URLMatcherCondition> Conditions;
    311   typedef std::set<URLQueryElementMatcherCondition> QueryConditions;
    312   typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector;
    313 
    314   // Matches if all conditions in |conditions| are fulfilled.
    315   URLMatcherConditionSet(ID id, const Conditions& conditions);
    316 
    317   // Matches if all conditions in |conditions|, |scheme_filter| and
    318   // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
    319   // in which case, no restrictions are imposed on the scheme/port of a URL.
    320   URLMatcherConditionSet(ID id, const Conditions& conditions,
    321                          scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    322                          scoped_ptr<URLMatcherPortFilter> port_filter);
    323 
    324   // Matches if all conditions in |conditions|, |query_conditions|,
    325   // |scheme_filter| and |port_filter| are fulfilled. |scheme_filter| and
    326   // |port_filter| may be NULL, in which case, no restrictions are imposed on
    327   // the scheme/port of a URL.
    328   URLMatcherConditionSet(ID id,
    329                          const Conditions& conditions,
    330                          const QueryConditions& query_conditions,
    331                          scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    332                          scoped_ptr<URLMatcherPortFilter> port_filter);
    333 
    334   ID id() const { return id_; }
    335   const Conditions& conditions() const { return conditions_; }
    336   const QueryConditions& query_conditions() const { return query_conditions_; }
    337 
    338   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
    339                const GURL& url) const;
    340 
    341   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
    342                const GURL& url,
    343                const std::string& url_for_component_searches) const;
    344 
    345  private:
    346   friend class base::RefCounted<URLMatcherConditionSet>;
    347   ~URLMatcherConditionSet();
    348   ID id_;
    349   Conditions conditions_;
    350   QueryConditions query_conditions_;
    351   scoped_ptr<URLMatcherSchemeFilter> scheme_filter_;
    352   scoped_ptr<URLMatcherPortFilter> port_filter_;
    353 
    354   DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet);
    355 };
    356 
    357 // This class allows matching one URL against a large set of
    358 // URLMatcherConditionSets at the same time.
    359 class URL_MATCHER_EXPORT URLMatcher {
    360  public:
    361   URLMatcher();
    362   ~URLMatcher();
    363 
    364   // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set
    365   // must have a unique ID.
    366   // This is an expensive operation as it triggers pre-calculations on the
    367   // currently registered condition sets. Do not call this operation many
    368   // times with a single condition set in each call.
    369   void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets);
    370 
    371   // Removes the listed condition sets. All |condition_set_ids| must be
    372   // currently registered. This function should be called with large batches
    373   // of |condition_set_ids| at a time to improve performance.
    374   void RemoveConditionSets(
    375       const std::vector<URLMatcherConditionSet::ID>& condition_set_ids);
    376 
    377   // Removes all unused condition sets from the ConditionFactory.
    378   void ClearUnusedConditionSets();
    379 
    380   // Returns the IDs of all URLMatcherConditionSet that match to this |url|.
    381   std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const;
    382 
    383   // Returns the URLMatcherConditionFactory that must be used to create
    384   // URLMatcherConditionSets for this URLMatcher.
    385   URLMatcherConditionFactory* condition_factory() {
    386     return &condition_factory_;
    387   }
    388 
    389   // Returns true if this object retains no allocated data. Only for debugging.
    390   bool IsEmpty() const;
    391 
    392  private:
    393   void UpdateSubstringSetMatcher(bool full_url_conditions);
    394   void UpdateRegexSetMatcher();
    395   void UpdateTriggers();
    396   void UpdateConditionFactory();
    397   void UpdateInternalDatastructures();
    398 
    399   URLMatcherConditionFactory condition_factory_;
    400 
    401   // Maps the ID of a URLMatcherConditionSet to the respective
    402   // URLMatcherConditionSet.
    403   typedef std::map<URLMatcherConditionSet::ID,
    404                    scoped_refptr<URLMatcherConditionSet> >
    405       URLMatcherConditionSets;
    406   URLMatcherConditionSets url_matcher_condition_sets_;
    407 
    408   // Maps a StringPattern ID to the URLMatcherConditions that need to
    409   // be triggered in case of a StringPattern match.
    410   typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> >
    411       StringPatternTriggers;
    412   StringPatternTriggers substring_match_triggers_;
    413 
    414   SubstringSetMatcher full_url_matcher_;
    415   SubstringSetMatcher url_component_matcher_;
    416   RegexSetMatcher regex_set_matcher_;
    417   RegexSetMatcher origin_and_path_regex_set_matcher_;
    418   std::set<const StringPattern*> registered_full_url_patterns_;
    419   std::set<const StringPattern*> registered_url_component_patterns_;
    420 
    421   DISALLOW_COPY_AND_ASSIGN(URLMatcher);
    422 };
    423 
    424 }  // namespace url_matcher
    425 
    426 #endif  // COMPONENTS_URL_MATCHER_URL_MATCHER_H_
    427