1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef EXTENSIONS_COMMON_MATCHER_URL_MATCHER_H_ 6 #define EXTENSIONS_COMMON_MATCHER_URL_MATCHER_H_ 7 8 #include <set> 9 #include <vector> 10 11 #include "base/memory/ref_counted.h" 12 #include "base/memory/scoped_ptr.h" 13 #include "base/memory/scoped_vector.h" 14 #include "extensions/common/matcher/regex_set_matcher.h" 15 #include "extensions/common/matcher/substring_set_matcher.h" 16 17 class GURL; 18 19 namespace base { 20 class DictionaryValue; 21 } 22 23 namespace extensions { 24 25 // This class represents a single URL matching condition, e.g. a match on the 26 // host suffix or the containment of a string in the query component of a GURL. 27 // 28 // The difference from a simple StringPattern is that this also supports 29 // checking whether the {Host, Path, Query} of a URL contains a string. The 30 // reduction of URL matching conditions to StringPatterns conducted by 31 // URLMatcherConditionFactory is not capable of expressing that alone. 32 // 33 // Also supported is matching regular expressions against the URL (URL_MATCHES). 34 class URLMatcherCondition { 35 public: 36 enum Criterion { 37 HOST_PREFIX, 38 HOST_SUFFIX, 39 HOST_CONTAINS, 40 HOST_EQUALS, 41 PATH_PREFIX, 42 PATH_SUFFIX, 43 PATH_CONTAINS, 44 PATH_EQUALS, 45 QUERY_PREFIX, 46 QUERY_SUFFIX, 47 QUERY_CONTAINS, 48 QUERY_EQUALS, 49 HOST_SUFFIX_PATH_PREFIX, 50 HOST_EQUALS_PATH_PREFIX, 51 URL_PREFIX, 52 URL_SUFFIX, 53 URL_CONTAINS, 54 URL_EQUALS, 55 URL_MATCHES, 56 ORIGIN_AND_PATH_MATCHES, // Matches the URL minus its query string. 57 }; 58 59 URLMatcherCondition(); 60 ~URLMatcherCondition(); 61 URLMatcherCondition(Criterion criterion, 62 const StringPattern* substring_pattern); 63 URLMatcherCondition(const URLMatcherCondition& rhs); 64 URLMatcherCondition& operator=(const URLMatcherCondition& rhs); 65 bool operator<(const URLMatcherCondition& rhs) const; 66 67 Criterion criterion() const { return criterion_; } 68 const StringPattern* string_pattern() const { 69 return string_pattern_; 70 } 71 72 // Returns whether this URLMatcherCondition needs to be executed on a 73 // full URL rather than the individual components (see 74 // URLMatcherConditionFactory). 75 bool IsFullURLCondition() const; 76 77 // Returns whether this URLMatcherCondition is a regular expression to be 78 // handled by a regex matcher instead of a substring matcher. 79 bool IsRegexCondition() const; 80 81 // Returns whether this URLMatcherCondition is a regular expression that shall 82 // be evaluated on the URL without the query parameter. 83 bool IsOriginAndPathRegexCondition() const; 84 85 // Returns whether this condition is fulfilled according to 86 // |matching_patterns| and |url|. 87 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, 88 const GURL& url) const; 89 90 private: 91 // |criterion_| and |string_pattern_| describe together what property a URL 92 // needs to fulfill to be considered a match. 93 Criterion criterion_; 94 95 // This is the StringPattern that is used in a SubstringSetMatcher. 96 const StringPattern* string_pattern_; 97 }; 98 99 // Class to map the problem of finding {host, path, query} {prefixes, suffixes, 100 // containments, and equality} in GURLs to the substring matching problem. 101 // 102 // Say, you want to check whether the path of a URL starts with "/index.html". 103 // This class preprocesses a URL like "www.google.com/index.html" into something 104 // like "www.google.com|/index.html". After preprocessing, you can search for 105 // "|/index.html" in the string and see that this candidate URL actually has 106 // a path that starts with "/index.html". On the contrary, 107 // "www.google.com/images/index.html" would be normalized to 108 // "www.google.com|/images/index.html". It is easy to see that it contains 109 // "/index.html" but the path of the URL does not start with "/index.html". 110 // 111 // This preprocessing is important if you want to match a URL against many 112 // patterns because it reduces the matching to a "discover all substrings 113 // of a dictionary in a text" problem, which can be solved very efficiently 114 // by the Aho-Corasick algorithm. 115 // 116 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern 117 // referenced by created URLMatcherConditions. Therefore, it must outlive 118 // all created URLMatcherCondition and the SubstringSetMatcher. 119 class URLMatcherConditionFactory { 120 public: 121 URLMatcherConditionFactory(); 122 ~URLMatcherConditionFactory(); 123 124 // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches. 125 std::string CanonicalizeURLForComponentSearches(const GURL& url) const; 126 127 // Factory methods for various condition types. 128 // 129 // Note that these methods fill the pattern_singletons_. If you create 130 // conditions and don't register them to a URLMatcher, they will continue to 131 // consume memory. You need to call ForgetUnusedPatterns() or 132 // URLMatcher::ClearUnusedConditionSets() in this case. 133 URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix); 134 URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix); 135 URLMatcherCondition CreateHostContainsCondition(const std::string& str); 136 URLMatcherCondition CreateHostEqualsCondition(const std::string& str); 137 138 URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix); 139 URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix); 140 URLMatcherCondition CreatePathContainsCondition(const std::string& str); 141 URLMatcherCondition CreatePathEqualsCondition(const std::string& str); 142 143 URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix); 144 URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix); 145 URLMatcherCondition CreateQueryContainsCondition(const std::string& str); 146 URLMatcherCondition CreateQueryEqualsCondition(const std::string& str); 147 148 // This covers the common case, where you don't care whether a domain 149 // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it 150 // should be followed by a given |path_prefix|. 151 URLMatcherCondition CreateHostSuffixPathPrefixCondition( 152 const std::string& host_suffix, 153 const std::string& path_prefix); 154 URLMatcherCondition CreateHostEqualsPathPrefixCondition( 155 const std::string& host, 156 const std::string& path_prefix); 157 158 // Canonicalizes a URL for "CreateURL*Condition" searches. 159 std::string CanonicalizeURLForFullSearches(const GURL& url) const; 160 161 // Canonicalizes a URL for "CreateURLMatchesCondition" searches. 162 std::string CanonicalizeURLForRegexSearches(const GURL& url) const; 163 // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches. 164 std::string CanonicalizeURLForOriginAndPathRegexSearches( 165 const GURL& url) const; 166 167 URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix); 168 URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix); 169 URLMatcherCondition CreateURLContainsCondition(const std::string& str); 170 URLMatcherCondition CreateURLEqualsCondition(const std::string& str); 171 172 URLMatcherCondition CreateURLMatchesCondition(const std::string& regex); 173 URLMatcherCondition CreateOriginAndPathMatchesCondition( 174 const std::string& regex); 175 176 // Removes all patterns from |pattern_singletons_| that are not listed in 177 // |used_patterns|. These patterns are not referenced any more and get 178 // freed. 179 void ForgetUnusedPatterns( 180 const std::set<StringPattern::ID>& used_patterns); 181 182 // Returns true if this object retains no allocated data. Only for debugging. 183 bool IsEmpty() const; 184 185 private: 186 // Creates a URLMatcherCondition according to the parameters passed. 187 // The URLMatcherCondition will refer to a StringPattern that is 188 // owned by |pattern_singletons_|. 189 URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion, 190 const std::string& pattern); 191 192 // Prepends a "." to the hostname if it does not start with one. 193 std::string CanonicalizeHostname(const std::string& hostname) const; 194 195 // Counter that ensures that all created StringPatterns have unique IDs. 196 // Note that substring patterns and regex patterns will use different IDs. 197 int id_counter_; 198 199 // This comparison considers only the pattern() value of the 200 // StringPatterns. 201 struct StringPatternPointerCompare { 202 bool operator()(StringPattern* lhs, StringPattern* rhs) const; 203 }; 204 // Set to ensure that we generate only one StringPattern for each content 205 // of StringPattern::pattern(). 206 typedef std::set<StringPattern*, StringPatternPointerCompare> 207 PatternSingletons; 208 PatternSingletons substring_pattern_singletons_; 209 PatternSingletons regex_pattern_singletons_; 210 PatternSingletons origin_and_path_regex_pattern_singletons_; 211 212 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory); 213 }; 214 215 // This class represents a filter for the URL scheme to be hooked up into a 216 // URLMatcherConditionSet. 217 class URLMatcherSchemeFilter { 218 public: 219 explicit URLMatcherSchemeFilter(const std::string& filter); 220 explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters); 221 ~URLMatcherSchemeFilter(); 222 bool IsMatch(const GURL& url) const; 223 224 private: 225 std::vector<std::string> filters_; 226 227 DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter); 228 }; 229 230 // This class represents a filter for port numbers to be hooked up into a 231 // URLMatcherConditionSet. 232 class URLMatcherPortFilter { 233 public: 234 // Boundaries of a port range (both ends are included). 235 typedef std::pair<int, int> Range; 236 explicit URLMatcherPortFilter(const std::vector<Range>& ranges); 237 ~URLMatcherPortFilter(); 238 bool IsMatch(const GURL& url) const; 239 240 // Creates a port range [from, to]; both ends are included. 241 static Range CreateRange(int from, int to); 242 // Creates a port range containing a single port. 243 static Range CreateRange(int port); 244 245 private: 246 std::vector<Range> ranges_; 247 248 DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter); 249 }; 250 251 // This class represents a set of conditions that all need to match on a 252 // given URL in order to be considered a match. 253 class URLMatcherConditionSet : public base::RefCounted<URLMatcherConditionSet> { 254 public: 255 typedef int ID; 256 typedef std::set<URLMatcherCondition> Conditions; 257 typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector; 258 259 // Matches if all conditions in |conditions| are fulfilled. 260 URLMatcherConditionSet(ID id, const Conditions& conditions); 261 262 // Matches if all conditions in |conditions|, |scheme_filter| and 263 // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL, 264 // in which case, no restrictions are imposed on the scheme/port of a URL. 265 URLMatcherConditionSet(ID id, const Conditions& conditions, 266 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 267 scoped_ptr<URLMatcherPortFilter> port_filter); 268 269 ID id() const { return id_; } 270 const Conditions& conditions() const { return conditions_; } 271 272 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, 273 const GURL& url) const; 274 275 private: 276 friend class base::RefCounted<URLMatcherConditionSet>; 277 ~URLMatcherConditionSet(); 278 ID id_; 279 Conditions conditions_; 280 scoped_ptr<URLMatcherSchemeFilter> scheme_filter_; 281 scoped_ptr<URLMatcherPortFilter> port_filter_; 282 283 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet); 284 }; 285 286 // This class allows matching one URL against a large set of 287 // URLMatcherConditionSets at the same time. 288 class URLMatcher { 289 public: 290 URLMatcher(); 291 ~URLMatcher(); 292 293 // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set 294 // must have a unique ID. 295 // This is an expensive operation as it triggers pre-calculations on the 296 // currently registered condition sets. Do not call this operation many 297 // times with a single condition set in each call. 298 void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets); 299 300 // Removes the listed condition sets. All |condition_set_ids| must be 301 // currently registered. This function should be called with large batches 302 // of |condition_set_ids| at a time to improve performance. 303 void RemoveConditionSets( 304 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids); 305 306 // Removes all unused condition sets from the ConditionFactory. 307 void ClearUnusedConditionSets(); 308 309 // Returns the IDs of all URLMatcherConditionSet that match to this |url|. 310 std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const; 311 312 // Returns the URLMatcherConditionFactory that must be used to create 313 // URLMatcherConditionSets for this URLMatcher. 314 URLMatcherConditionFactory* condition_factory() { 315 return &condition_factory_; 316 } 317 318 // Returns true if this object retains no allocated data. Only for debugging. 319 bool IsEmpty() const; 320 321 private: 322 void UpdateSubstringSetMatcher(bool full_url_conditions); 323 void UpdateRegexSetMatcher(); 324 void UpdateTriggers(); 325 void UpdateConditionFactory(); 326 void UpdateInternalDatastructures(); 327 328 URLMatcherConditionFactory condition_factory_; 329 330 // Maps the ID of a URLMatcherConditionSet to the respective 331 // URLMatcherConditionSet. 332 typedef std::map<URLMatcherConditionSet::ID, 333 scoped_refptr<URLMatcherConditionSet> > 334 URLMatcherConditionSets; 335 URLMatcherConditionSets url_matcher_condition_sets_; 336 337 // Maps a StringPattern ID to the URLMatcherConditions that need to 338 // be triggered in case of a StringPattern match. 339 typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> > 340 StringPatternTriggers; 341 StringPatternTriggers substring_match_triggers_; 342 343 SubstringSetMatcher full_url_matcher_; 344 SubstringSetMatcher url_component_matcher_; 345 RegexSetMatcher regex_set_matcher_; 346 RegexSetMatcher origin_and_path_regex_set_matcher_; 347 std::set<const StringPattern*> registered_full_url_patterns_; 348 std::set<const StringPattern*> registered_url_component_patterns_; 349 350 DISALLOW_COPY_AND_ASSIGN(URLMatcher); 351 }; 352 353 } // namespace extensions 354 355 #endif // EXTENSIONS_COMMON_MATCHER_URL_MATCHER_H_ 356