1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_ 6 #define COMPONENTS_URL_MATCHER_URL_MATCHER_H_ 7 8 #include <set> 9 #include <vector> 10 11 #include "base/memory/ref_counted.h" 12 #include "base/memory/scoped_ptr.h" 13 #include "base/memory/scoped_vector.h" 14 #include "components/url_matcher/regex_set_matcher.h" 15 #include "components/url_matcher/substring_set_matcher.h" 16 #include "components/url_matcher/url_matcher_export.h" 17 18 class GURL; 19 20 namespace base { 21 class DictionaryValue; 22 } 23 24 namespace url_matcher { 25 26 // This class represents a single URL matching condition, e.g. a match on the 27 // host suffix or the containment of a string in the query component of a GURL. 28 // 29 // The difference from a simple StringPattern is that this also supports 30 // checking whether the {Host, Path, Query} of a URL contains a string. The 31 // reduction of URL matching conditions to StringPatterns conducted by 32 // URLMatcherConditionFactory is not capable of expressing that alone. 33 // 34 // Also supported is matching regular expressions against the URL (URL_MATCHES). 35 class URL_MATCHER_EXPORT URLMatcherCondition { 36 public: 37 enum Criterion { 38 HOST_PREFIX, 39 HOST_SUFFIX, 40 HOST_CONTAINS, 41 HOST_EQUALS, 42 PATH_PREFIX, 43 PATH_SUFFIX, 44 PATH_CONTAINS, 45 PATH_EQUALS, 46 QUERY_PREFIX, 47 QUERY_SUFFIX, 48 QUERY_CONTAINS, 49 QUERY_EQUALS, 50 HOST_SUFFIX_PATH_PREFIX, 51 HOST_EQUALS_PATH_PREFIX, 52 URL_PREFIX, 53 URL_SUFFIX, 54 URL_CONTAINS, 55 URL_EQUALS, 56 URL_MATCHES, 57 ORIGIN_AND_PATH_MATCHES, // Matches the URL minus its query string. 58 }; 59 60 URLMatcherCondition(); 61 ~URLMatcherCondition(); 62 URLMatcherCondition(Criterion criterion, 63 const StringPattern* substring_pattern); 64 URLMatcherCondition(const URLMatcherCondition& rhs); 65 URLMatcherCondition& operator=(const URLMatcherCondition& rhs); 66 bool operator<(const URLMatcherCondition& rhs) const; 67 68 Criterion criterion() const { return criterion_; } 69 const StringPattern* string_pattern() const { 70 return string_pattern_; 71 } 72 73 // Returns whether this URLMatcherCondition needs to be executed on a 74 // full URL rather than the individual components (see 75 // URLMatcherConditionFactory). 76 bool IsFullURLCondition() const; 77 78 // Returns whether this URLMatcherCondition is a regular expression to be 79 // handled by a regex matcher instead of a substring matcher. 80 bool IsRegexCondition() const; 81 82 // Returns whether this URLMatcherCondition is a regular expression that shall 83 // be evaluated on the URL without the query parameter. 84 bool IsOriginAndPathRegexCondition() const; 85 86 // Returns whether this condition is fulfilled according to 87 // |matching_patterns| and |url|. 88 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, 89 const GURL& url) const; 90 91 private: 92 // |criterion_| and |string_pattern_| describe together what property a URL 93 // needs to fulfill to be considered a match. 94 Criterion criterion_; 95 96 // This is the StringPattern that is used in a SubstringSetMatcher. 97 const StringPattern* string_pattern_; 98 }; 99 100 // Class to map the problem of finding {host, path, query} {prefixes, suffixes, 101 // containments, and equality} in GURLs to the substring matching problem. 102 // 103 // Say, you want to check whether the path of a URL starts with "/index.html". 104 // This class preprocesses a URL like "www.google.com/index.html" into something 105 // like "www.google.com|/index.html". After preprocessing, you can search for 106 // "|/index.html" in the string and see that this candidate URL actually has 107 // a path that starts with "/index.html". On the contrary, 108 // "www.google.com/images/index.html" would be normalized to 109 // "www.google.com|/images/index.html". It is easy to see that it contains 110 // "/index.html" but the path of the URL does not start with "/index.html". 111 // 112 // This preprocessing is important if you want to match a URL against many 113 // patterns because it reduces the matching to a "discover all substrings 114 // of a dictionary in a text" problem, which can be solved very efficiently 115 // by the Aho-Corasick algorithm. 116 // 117 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern 118 // referenced by created URLMatcherConditions. Therefore, it must outlive 119 // all created URLMatcherCondition and the SubstringSetMatcher. 120 class URL_MATCHER_EXPORT URLMatcherConditionFactory { 121 public: 122 URLMatcherConditionFactory(); 123 ~URLMatcherConditionFactory(); 124 125 // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches. 126 std::string CanonicalizeURLForComponentSearches(const GURL& url) const; 127 128 // Factory methods for various condition types. 129 // 130 // Note that these methods fill the pattern_singletons_. If you create 131 // conditions and don't register them to a URLMatcher, they will continue to 132 // consume memory. You need to call ForgetUnusedPatterns() or 133 // URLMatcher::ClearUnusedConditionSets() in this case. 134 URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix); 135 URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix); 136 URLMatcherCondition CreateHostContainsCondition(const std::string& str); 137 URLMatcherCondition CreateHostEqualsCondition(const std::string& str); 138 139 URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix); 140 URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix); 141 URLMatcherCondition CreatePathContainsCondition(const std::string& str); 142 URLMatcherCondition CreatePathEqualsCondition(const std::string& str); 143 144 URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix); 145 URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix); 146 URLMatcherCondition CreateQueryContainsCondition(const std::string& str); 147 URLMatcherCondition CreateQueryEqualsCondition(const std::string& str); 148 149 // This covers the common case, where you don't care whether a domain 150 // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it 151 // should be followed by a given |path_prefix|. 152 URLMatcherCondition CreateHostSuffixPathPrefixCondition( 153 const std::string& host_suffix, 154 const std::string& path_prefix); 155 URLMatcherCondition CreateHostEqualsPathPrefixCondition( 156 const std::string& host, 157 const std::string& path_prefix); 158 159 // Canonicalizes a URL for "CreateURL*Condition" searches. 160 std::string CanonicalizeURLForFullSearches(const GURL& url) const; 161 162 // Canonicalizes a URL for "CreateURLMatchesCondition" searches. 163 std::string CanonicalizeURLForRegexSearches(const GURL& url) const; 164 // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches. 165 std::string CanonicalizeURLForOriginAndPathRegexSearches( 166 const GURL& url) const; 167 168 URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix); 169 URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix); 170 URLMatcherCondition CreateURLContainsCondition(const std::string& str); 171 URLMatcherCondition CreateURLEqualsCondition(const std::string& str); 172 173 URLMatcherCondition CreateURLMatchesCondition(const std::string& regex); 174 URLMatcherCondition CreateOriginAndPathMatchesCondition( 175 const std::string& regex); 176 177 // Removes all patterns from |pattern_singletons_| that are not listed in 178 // |used_patterns|. These patterns are not referenced any more and get 179 // freed. 180 void ForgetUnusedPatterns( 181 const std::set<StringPattern::ID>& used_patterns); 182 183 // Returns true if this object retains no allocated data. Only for debugging. 184 bool IsEmpty() const; 185 186 private: 187 // Creates a URLMatcherCondition according to the parameters passed. 188 // The URLMatcherCondition will refer to a StringPattern that is 189 // owned by |pattern_singletons_|. 190 URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion, 191 const std::string& pattern); 192 193 // Prepends a "." to the hostname if it does not start with one. 194 std::string CanonicalizeHostname(const std::string& hostname) const; 195 196 // Convert the query string to canonical form suitable for key token search. 197 std::string CanonicalizeQuery(std::string query, 198 bool prepend_beginning_of_query_component, 199 bool append_end_of_query_component) const; 200 201 // Counter that ensures that all created StringPatterns have unique IDs. 202 // Note that substring patterns and regex patterns will use different IDs. 203 int id_counter_; 204 205 // This comparison considers only the pattern() value of the 206 // StringPatterns. 207 struct StringPatternPointerCompare { 208 bool operator()(StringPattern* lhs, StringPattern* rhs) const; 209 }; 210 // Set to ensure that we generate only one StringPattern for each content 211 // of StringPattern::pattern(). 212 typedef std::set<StringPattern*, StringPatternPointerCompare> 213 PatternSingletons; 214 PatternSingletons substring_pattern_singletons_; 215 PatternSingletons regex_pattern_singletons_; 216 PatternSingletons origin_and_path_regex_pattern_singletons_; 217 218 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory); 219 }; 220 221 // This class represents a single URL query matching condition. The query 222 // matching is done as a search for a key and optionally a value. 223 // The matching makes use of CanonicalizeURLForComponentSearches to ensure that 224 // the key starts and ends (optionally) with the right marker. 225 class URL_MATCHER_EXPORT URLQueryElementMatcherCondition { 226 public: 227 // Multiple occurrences of the same key can happen in a URL query. The type 228 // ensures that every (MATCH_ALL), any (MATCH_ANY), first (MATCH_FIRST) or 229 // last (MATCH_LAST) instance of the key occurrence matches the value. 230 enum Type { MATCH_ANY, MATCH_FIRST, MATCH_LAST, MATCH_ALL }; 231 232 // Allows the match to be exact (QUERY_VALUE_MATCH_EXACT, starts and ends with 233 // a delimiter or a border) or simply a prefix (QUERY_VALUE_MATCH_PREFIX, 234 // starts with a delimiter or a border). 235 enum QueryValueMatchType { 236 QUERY_VALUE_MATCH_EXACT, 237 QUERY_VALUE_MATCH_PREFIX 238 }; 239 240 // Used to indicate if the query parameter is of type &key=value& 241 // (ELEMENT_TYPE_KEY_VALUE) or simply &key& (ELEMENT_TYPE_KEY). 242 enum QueryElementType { ELEMENT_TYPE_KEY_VALUE, ELEMENT_TYPE_KEY }; 243 244 URLQueryElementMatcherCondition(const std::string& key, 245 const std::string& value, 246 QueryValueMatchType query_value_match_type, 247 QueryElementType query_element_type, 248 Type match_type, 249 URLMatcherConditionFactory* factory); 250 ~URLQueryElementMatcherCondition(); 251 252 bool operator<(const URLQueryElementMatcherCondition& rhs) const; 253 254 // Returns whether the URL query satisfies the key value constraint. 255 bool IsMatch(const std::string& canonical_url_query) const; 256 257 const StringPattern* string_pattern() const { return string_pattern_; } 258 259 private: 260 Type match_type_; 261 std::string key_; 262 std::string value_; 263 size_t key_length_; 264 size_t value_length_; 265 const StringPattern* string_pattern_; 266 }; 267 268 // This class represents a filter for the URL scheme to be hooked up into a 269 // URLMatcherConditionSet. 270 class URL_MATCHER_EXPORT URLMatcherSchemeFilter { 271 public: 272 explicit URLMatcherSchemeFilter(const std::string& filter); 273 explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters); 274 ~URLMatcherSchemeFilter(); 275 bool IsMatch(const GURL& url) const; 276 277 private: 278 std::vector<std::string> filters_; 279 280 DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter); 281 }; 282 283 // This class represents a filter for port numbers to be hooked up into a 284 // URLMatcherConditionSet. 285 class URL_MATCHER_EXPORT URLMatcherPortFilter { 286 public: 287 // Boundaries of a port range (both ends are included). 288 typedef std::pair<int, int> Range; 289 explicit URLMatcherPortFilter(const std::vector<Range>& ranges); 290 ~URLMatcherPortFilter(); 291 bool IsMatch(const GURL& url) const; 292 293 // Creates a port range [from, to]; both ends are included. 294 static Range CreateRange(int from, int to); 295 // Creates a port range containing a single port. 296 static Range CreateRange(int port); 297 298 private: 299 std::vector<Range> ranges_; 300 301 DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter); 302 }; 303 304 // This class represents a set of conditions that all need to match on a 305 // given URL in order to be considered a match. 306 class URL_MATCHER_EXPORT URLMatcherConditionSet 307 : public base::RefCounted<URLMatcherConditionSet> { 308 public: 309 typedef int ID; 310 typedef std::set<URLMatcherCondition> Conditions; 311 typedef std::set<URLQueryElementMatcherCondition> QueryConditions; 312 typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector; 313 314 // Matches if all conditions in |conditions| are fulfilled. 315 URLMatcherConditionSet(ID id, const Conditions& conditions); 316 317 // Matches if all conditions in |conditions|, |scheme_filter| and 318 // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL, 319 // in which case, no restrictions are imposed on the scheme/port of a URL. 320 URLMatcherConditionSet(ID id, const Conditions& conditions, 321 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 322 scoped_ptr<URLMatcherPortFilter> port_filter); 323 324 // Matches if all conditions in |conditions|, |query_conditions|, 325 // |scheme_filter| and |port_filter| are fulfilled. |scheme_filter| and 326 // |port_filter| may be NULL, in which case, no restrictions are imposed on 327 // the scheme/port of a URL. 328 URLMatcherConditionSet(ID id, 329 const Conditions& conditions, 330 const QueryConditions& query_conditions, 331 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 332 scoped_ptr<URLMatcherPortFilter> port_filter); 333 334 ID id() const { return id_; } 335 const Conditions& conditions() const { return conditions_; } 336 const QueryConditions& query_conditions() const { return query_conditions_; } 337 338 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, 339 const GURL& url) const; 340 341 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, 342 const GURL& url, 343 const std::string& url_for_component_searches) const; 344 345 private: 346 friend class base::RefCounted<URLMatcherConditionSet>; 347 ~URLMatcherConditionSet(); 348 ID id_; 349 Conditions conditions_; 350 QueryConditions query_conditions_; 351 scoped_ptr<URLMatcherSchemeFilter> scheme_filter_; 352 scoped_ptr<URLMatcherPortFilter> port_filter_; 353 354 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet); 355 }; 356 357 // This class allows matching one URL against a large set of 358 // URLMatcherConditionSets at the same time. 359 class URL_MATCHER_EXPORT URLMatcher { 360 public: 361 URLMatcher(); 362 ~URLMatcher(); 363 364 // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set 365 // must have a unique ID. 366 // This is an expensive operation as it triggers pre-calculations on the 367 // currently registered condition sets. Do not call this operation many 368 // times with a single condition set in each call. 369 void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets); 370 371 // Removes the listed condition sets. All |condition_set_ids| must be 372 // currently registered. This function should be called with large batches 373 // of |condition_set_ids| at a time to improve performance. 374 void RemoveConditionSets( 375 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids); 376 377 // Removes all unused condition sets from the ConditionFactory. 378 void ClearUnusedConditionSets(); 379 380 // Returns the IDs of all URLMatcherConditionSet that match to this |url|. 381 std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const; 382 383 // Returns the URLMatcherConditionFactory that must be used to create 384 // URLMatcherConditionSets for this URLMatcher. 385 URLMatcherConditionFactory* condition_factory() { 386 return &condition_factory_; 387 } 388 389 // Returns true if this object retains no allocated data. Only for debugging. 390 bool IsEmpty() const; 391 392 private: 393 void UpdateSubstringSetMatcher(bool full_url_conditions); 394 void UpdateRegexSetMatcher(); 395 void UpdateTriggers(); 396 void UpdateConditionFactory(); 397 void UpdateInternalDatastructures(); 398 399 URLMatcherConditionFactory condition_factory_; 400 401 // Maps the ID of a URLMatcherConditionSet to the respective 402 // URLMatcherConditionSet. 403 typedef std::map<URLMatcherConditionSet::ID, 404 scoped_refptr<URLMatcherConditionSet> > 405 URLMatcherConditionSets; 406 URLMatcherConditionSets url_matcher_condition_sets_; 407 408 // Maps a StringPattern ID to the URLMatcherConditions that need to 409 // be triggered in case of a StringPattern match. 410 typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> > 411 StringPatternTriggers; 412 StringPatternTriggers substring_match_triggers_; 413 414 SubstringSetMatcher full_url_matcher_; 415 SubstringSetMatcher url_component_matcher_; 416 RegexSetMatcher regex_set_matcher_; 417 RegexSetMatcher origin_and_path_regex_set_matcher_; 418 std::set<const StringPattern*> registered_full_url_patterns_; 419 std::set<const StringPattern*> registered_url_component_patterns_; 420 421 DISALLOW_COPY_AND_ASSIGN(URLMatcher); 422 }; 423 424 } // namespace url_matcher 425 426 #endif // COMPONENTS_URL_MATCHER_URL_MATCHER_H_ 427