1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/url_matcher/url_matcher.h" 6 7 #include <algorithm> 8 #include <iterator> 9 10 #include "base/logging.h" 11 #include "url/gurl.h" 12 #include "url/url_canon.h" 13 14 namespace url_matcher { 15 16 // This set of classes implement a mapping of URL Component Patterns, such as 17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns 18 // for use in substring comparisons. 19 // 20 // The idea of this mapping is to reduce the problem of comparing many 21 // URL Component Patterns against one URL to the problem of searching many 22 // substrings in one string: 23 // 24 // ---------------------- ----------------- 25 // | URL Query operator | ----translate----> | StringPattern | 26 // ---------------------- ----------------- 27 // ^ 28 // | 29 // compare 30 // | 31 // v 32 // ---------------------- ----------------- 33 // | URL to compare | | | 34 // | to all URL Query | ----translate----> | String | 35 // | operators | | | 36 // ---------------------- ----------------- 37 // 38 // The reason for this problem reduction is that there are efficient algorithms 39 // for searching many substrings in one string (see Aho-Corasick algorithm). 40 // 41 // Additionally, some of the same pieces are reused to implement regular 42 // expression comparisons. The FilteredRE2 implementation for matching many 43 // regular expressions against one string uses prefiltering, in which a set 44 // of substrings (derived from the regexes) are first searched for, to reduce 45 // the number of regular expressions to test; the prefiltering step also 46 // uses Aho-Corasick. 47 // 48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches. 49 // ========================================================== 50 // 51 // For searches in this class, we normalize URLs as follows: 52 // 53 // Step 1: 54 // Remove scheme, port and segment from URL: 55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 56 // www.example.com/index.html?search=foo 57 // 58 // We remove the scheme and port number because they can be checked later 59 // in a secondary filter step. We remove the segment (the #... part) because 60 // this is not guaranteed to be ASCII-7 encoded. 61 // 62 // Step 2: 63 // Translate URL to String and add the following position markers: 64 // - BU = Beginning of URL 65 // - ED = End of Domain 66 // - EP = End of Path 67 // - EU = End of URL 68 // Furthermore, the hostname is canonicalized to start with a ".". 69 // 70 // Position markers are represented as characters >127, which are therefore 71 // guaranteed not to be part of the ASCII-7 encoded URL character set. 72 // 73 // -> www.example.com/index.html?search=foo becomes 74 // BU .www.example.com ED /index.html EP ?search=foo EU 75 // 76 // -> www.example.com/index.html becomes 77 // BU .www.example.com ED /index.html EP EU 78 // 79 // Step 3: 80 // Translate URL Component Patterns as follows: 81 // 82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) 83 // -> host_prefix("www.example") = BU .www.example 84 // 85 // host_suffix(suffix) = suffix ED 86 // -> host_suffix("example.com") = example.com ED 87 // -> host_suffix(".example.com") = .example.com ED 88 // 89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED 90 // -> host_equals("www.example.com") = BU .www.example.com ED 91 // 92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). 93 // 94 // With this, we can search the StringPatterns in the normalized URL. 95 // 96 // 97 // Case 2: url_{prefix,suffix,equals,contains} searches. 98 // ===================================================== 99 // 100 // Step 1: as above, except that 101 // - the scheme is not removed 102 // - the port is not removed if it is specified and does not match the default 103 // port for the given scheme. 104 // 105 // Step 2: 106 // Translate URL to String and add the following position markers: 107 // - BU = Beginning of URL 108 // - EU = End of URL 109 // 110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 111 // BU http://www.example.com:8080/index.html?search=foo EU 112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes 113 // BU http://www.example.com/index.html?search=foo EU 114 // 115 // url_prefix(prefix) = BU prefix 116 // -> url_prefix("http://www.example") = BU http://www.example 117 // 118 // url_contains(substring) = substring 119 // -> url_contains("index") = index 120 // 121 // 122 // Case 3: {host,path,query}_contains searches. 123 // ============================================ 124 // 125 // These kinds of searches are not supported directly but can be derived 126 // by a combination of a url_contains() query followed by an explicit test: 127 // 128 // host_contains(str) = url_contains(str) followed by test whether str occurs 129 // in host component of original URL. 130 // -> host_contains("example.co") = example.co 131 // followed by gurl.host().find("example.co"); 132 // 133 // [similarly for path_contains and query_contains]. 134 // 135 // 136 // Regular expression matching (url_matches searches) 137 // ================================================== 138 // 139 // This class also supports matching regular expressions (RE2 syntax) 140 // against full URLs, which are transformed as in case 2. 141 142 namespace { 143 144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { 145 return criterion == URLMatcherCondition::URL_MATCHES; 146 } 147 148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { 149 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; 150 } 151 152 } // namespace 153 154 // 155 // URLMatcherCondition 156 // 157 158 URLMatcherCondition::URLMatcherCondition() 159 : criterion_(HOST_PREFIX), 160 string_pattern_(NULL) {} 161 162 URLMatcherCondition::~URLMatcherCondition() {} 163 164 URLMatcherCondition::URLMatcherCondition( 165 Criterion criterion, 166 const StringPattern* string_pattern) 167 : criterion_(criterion), 168 string_pattern_(string_pattern) {} 169 170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) 171 : criterion_(rhs.criterion_), 172 string_pattern_(rhs.string_pattern_) {} 173 174 URLMatcherCondition& URLMatcherCondition::operator=( 175 const URLMatcherCondition& rhs) { 176 criterion_ = rhs.criterion_; 177 string_pattern_ = rhs.string_pattern_; 178 return *this; 179 } 180 181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { 182 if (criterion_ < rhs.criterion_) return true; 183 if (criterion_ > rhs.criterion_) return false; 184 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 185 return *string_pattern_ < *rhs.string_pattern_; 186 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; 187 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 188 // or both are NULL. 189 return false; 190 } 191 192 bool URLMatcherCondition::IsFullURLCondition() const { 193 // For these criteria the SubstringMatcher needs to be executed on the 194 // GURL that is canonicalized with 195 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. 196 switch (criterion_) { 197 case HOST_CONTAINS: 198 case PATH_CONTAINS: 199 case QUERY_CONTAINS: 200 case URL_PREFIX: 201 case URL_SUFFIX: 202 case URL_CONTAINS: 203 case URL_EQUALS: 204 return true; 205 default: 206 break; 207 } 208 return false; 209 } 210 211 bool URLMatcherCondition::IsRegexCondition() const { 212 return IsRegexCriterion(criterion_); 213 } 214 215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { 216 return IsOriginAndPathRegexCriterion(criterion_); 217 } 218 219 bool URLMatcherCondition::IsMatch( 220 const std::set<StringPattern::ID>& matching_patterns, 221 const GURL& url) const { 222 DCHECK(string_pattern_); 223 if (!ContainsKey(matching_patterns, string_pattern_->id())) 224 return false; 225 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on 226 // a substring match on the raw URL. In case of a match, we need to verify 227 // that the match was found in the correct component of the URL. 228 switch (criterion_) { 229 case HOST_CONTAINS: 230 return url.host().find(string_pattern_->pattern()) != 231 std::string::npos; 232 case PATH_CONTAINS: 233 return url.path().find(string_pattern_->pattern()) != 234 std::string::npos; 235 case QUERY_CONTAINS: 236 return url.query().find(string_pattern_->pattern()) != 237 std::string::npos; 238 default: 239 break; 240 } 241 return true; 242 } 243 244 // 245 // URLMatcherConditionFactory 246 // 247 248 namespace { 249 // These are symbols that are not contained in 7-bit ASCII used in GURLs. 250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; 251 const char kEndOfDomain[] = {static_cast<char>(-2), 0}; 252 const char kEndOfPath[] = {static_cast<char>(-3), 0}; 253 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0}; 254 const char kEndOfURL[] = {static_cast<char>(-5), 0}; 255 256 // The delimiter for query parameters 257 const char kQuerySeparator = '&'; 258 } // namespace 259 260 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} 261 262 URLMatcherConditionFactory::~URLMatcherConditionFactory() { 263 STLDeleteElements(&substring_pattern_singletons_); 264 STLDeleteElements(®ex_pattern_singletons_); 265 STLDeleteElements(&origin_and_path_regex_pattern_singletons_); 266 } 267 268 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( 269 const GURL& url) const { 270 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + 271 url.path() + kEndOfPath + 272 (url.has_query() ? CanonicalizeQuery(url.query(), true, true) 273 : std::string()) + 274 kEndOfURL; 275 } 276 277 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( 278 const std::string& prefix) { 279 return CreateCondition(URLMatcherCondition::HOST_PREFIX, 280 kBeginningOfURL + CanonicalizeHostname(prefix)); 281 } 282 283 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( 284 const std::string& suffix) { 285 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, 286 suffix + kEndOfDomain); 287 } 288 289 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( 290 const std::string& str) { 291 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); 292 } 293 294 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( 295 const std::string& str) { 296 return CreateCondition(URLMatcherCondition::HOST_EQUALS, 297 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); 298 } 299 300 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( 301 const std::string& prefix) { 302 return CreateCondition(URLMatcherCondition::PATH_PREFIX, 303 kEndOfDomain + prefix); 304 } 305 306 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( 307 const std::string& suffix) { 308 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); 309 } 310 311 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( 312 const std::string& str) { 313 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); 314 } 315 316 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( 317 const std::string& str) { 318 return CreateCondition(URLMatcherCondition::PATH_EQUALS, 319 kEndOfDomain + str + kEndOfPath); 320 } 321 322 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( 323 const std::string& prefix) { 324 std::string pattern; 325 if (!prefix.empty() && prefix[0] == '?') 326 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false); 327 else 328 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false); 329 330 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); 331 } 332 333 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( 334 const std::string& suffix) { 335 if (!suffix.empty() && suffix[0] == '?') { 336 return CreateQueryEqualsCondition(suffix); 337 } else { 338 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, 339 CanonicalizeQuery(suffix, false, true) + kEndOfURL); 340 } 341 } 342 343 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( 344 const std::string& str) { 345 if (!str.empty() && str[0] == '?') 346 return CreateQueryPrefixCondition(str); 347 else 348 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); 349 } 350 351 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( 352 const std::string& str) { 353 std::string pattern; 354 if (!str.empty() && str[0] == '?') 355 pattern = 356 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL; 357 else 358 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL; 359 360 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); 361 } 362 363 URLMatcherCondition 364 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( 365 const std::string& host_suffix, 366 const std::string& path_prefix) { 367 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, 368 host_suffix + kEndOfDomain + path_prefix); 369 } 370 371 URLMatcherCondition 372 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( 373 const std::string& host, 374 const std::string& path_prefix) { 375 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, 376 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + 377 path_prefix); 378 } 379 380 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( 381 const GURL& url) const { 382 GURL::Replacements replacements; 383 replacements.ClearPassword(); 384 replacements.ClearUsername(); 385 replacements.ClearRef(); 386 // Clear port if it is implicit from scheme. 387 if (url.has_port()) { 388 const std::string& port = url.scheme(); 389 if (url::DefaultPortForScheme(port.c_str(), port.size()) == 390 url.EffectiveIntPort()) { 391 replacements.ClearPort(); 392 } 393 } 394 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + 395 kEndOfURL; 396 } 397 398 static std::string CanonicalizeURLForRegexSearchesHelper( 399 const GURL& url, 400 bool clear_query) { 401 GURL::Replacements replacements; 402 replacements.ClearPassword(); 403 replacements.ClearUsername(); 404 replacements.ClearRef(); 405 if (clear_query) 406 replacements.ClearQuery(); 407 // Clear port if it is implicit from scheme. 408 if (url.has_port()) { 409 const std::string& port = url.scheme(); 410 if (url::DefaultPortForScheme(port.c_str(), port.size()) == 411 url.EffectiveIntPort()) { 412 replacements.ClearPort(); 413 } 414 } 415 return url.ReplaceComponents(replacements).spec(); 416 } 417 418 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( 419 const GURL& url) const { 420 return CanonicalizeURLForRegexSearchesHelper(url, false); 421 } 422 423 std::string 424 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( 425 const GURL& url) const { 426 return CanonicalizeURLForRegexSearchesHelper(url, true); 427 } 428 429 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( 430 const std::string& prefix) { 431 return CreateCondition(URLMatcherCondition::URL_PREFIX, 432 kBeginningOfURL + prefix); 433 } 434 435 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( 436 const std::string& suffix) { 437 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); 438 } 439 440 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( 441 const std::string& str) { 442 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); 443 } 444 445 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( 446 const std::string& str) { 447 return CreateCondition(URLMatcherCondition::URL_EQUALS, 448 kBeginningOfURL + str + kEndOfURL); 449 } 450 451 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( 452 const std::string& regex) { 453 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); 454 } 455 456 URLMatcherCondition 457 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( 458 const std::string& regex) { 459 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); 460 } 461 462 void URLMatcherConditionFactory::ForgetUnusedPatterns( 463 const std::set<StringPattern::ID>& used_patterns) { 464 PatternSingletons::iterator i = substring_pattern_singletons_.begin(); 465 while (i != substring_pattern_singletons_.end()) { 466 if (ContainsKey(used_patterns, (*i)->id())) { 467 ++i; 468 } else { 469 delete *i; 470 substring_pattern_singletons_.erase(i++); 471 } 472 } 473 i = regex_pattern_singletons_.begin(); 474 while (i != regex_pattern_singletons_.end()) { 475 if (ContainsKey(used_patterns, (*i)->id())) { 476 ++i; 477 } else { 478 delete *i; 479 regex_pattern_singletons_.erase(i++); 480 } 481 } 482 i = origin_and_path_regex_pattern_singletons_.begin(); 483 while (i != origin_and_path_regex_pattern_singletons_.end()) { 484 if (ContainsKey(used_patterns, (*i)->id())) { 485 ++i; 486 } else { 487 delete *i; 488 origin_and_path_regex_pattern_singletons_.erase(i++); 489 } 490 } 491 } 492 493 bool URLMatcherConditionFactory::IsEmpty() const { 494 return substring_pattern_singletons_.empty() && 495 regex_pattern_singletons_.empty() && 496 origin_and_path_regex_pattern_singletons_.empty(); 497 } 498 499 URLMatcherCondition URLMatcherConditionFactory::CreateCondition( 500 URLMatcherCondition::Criterion criterion, 501 const std::string& pattern) { 502 StringPattern search_pattern(pattern, 0); 503 PatternSingletons* pattern_singletons = NULL; 504 if (IsRegexCriterion(criterion)) 505 pattern_singletons = ®ex_pattern_singletons_; 506 else if (IsOriginAndPathRegexCriterion(criterion)) 507 pattern_singletons = &origin_and_path_regex_pattern_singletons_; 508 else 509 pattern_singletons = &substring_pattern_singletons_; 510 511 PatternSingletons::const_iterator iter = 512 pattern_singletons->find(&search_pattern); 513 514 if (iter != pattern_singletons->end()) { 515 return URLMatcherCondition(criterion, *iter); 516 } else { 517 StringPattern* new_pattern = 518 new StringPattern(pattern, id_counter_++); 519 pattern_singletons->insert(new_pattern); 520 return URLMatcherCondition(criterion, new_pattern); 521 } 522 } 523 524 std::string URLMatcherConditionFactory::CanonicalizeHostname( 525 const std::string& hostname) const { 526 if (!hostname.empty() && hostname[0] == '.') 527 return hostname; 528 else 529 return "." + hostname; 530 } 531 532 // This function prepares the query string by replacing query separator with a 533 // magic value (|kQueryComponentDelimiter|). When the boolean 534 // |prepend_beginning_of_query_component| is true the function prepends the 535 // query with the same magic. This is done to locate the start of a key value 536 // pair in the query string. The parameter |query| is passed by value 537 // intentionally, since it is locally modified. 538 std::string URLMatcherConditionFactory::CanonicalizeQuery( 539 std::string query, 540 bool prepend_beginning_of_query_component, 541 bool append_end_of_query_component) const { 542 for (std::string::iterator it = query.begin(); it != query.end(); ++it) { 543 if (*it == kQuerySeparator) 544 *it = kQueryComponentDelimiter[0]; 545 } 546 if (prepend_beginning_of_query_component) 547 query = kQueryComponentDelimiter + query; 548 if (append_end_of_query_component) 549 query += kQueryComponentDelimiter; 550 return query; 551 } 552 553 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( 554 StringPattern* lhs, 555 StringPattern* rhs) const { 556 if (lhs == NULL && rhs != NULL) return true; 557 if (lhs != NULL && rhs != NULL) 558 return lhs->pattern() < rhs->pattern(); 559 // Either both are NULL or only rhs is NULL. 560 return false; 561 } 562 563 // 564 // URLQueryElementMatcherCondition 565 // 566 567 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition( 568 const std::string& key, 569 const std::string& value, 570 QueryValueMatchType query_value_match_type, 571 QueryElementType query_element_type, 572 Type match_type, 573 URLMatcherConditionFactory* factory) { 574 match_type_ = match_type; 575 576 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) { 577 key_ = kQueryComponentDelimiter + key + "="; 578 value_ = value; 579 } else { 580 key_ = kQueryComponentDelimiter + key; 581 value_ = std::string(); 582 } 583 584 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT) 585 value_ += kQueryComponentDelimiter; 586 587 // If |value_| is empty no need to find the |key_| and verify if the value 588 // matches. Simply checking the presence of key is sufficient, which is done 589 // by MATCH_ANY 590 if (value_.empty()) 591 match_type_ = MATCH_ANY; 592 593 URLMatcherCondition condition; 594 // If |match_type_| is MATCH_ANY, then we could simply look for the 595 // combination of |key_| + |value_|, which can be efficiently done by 596 // SubstringMatcher 597 if (match_type_ == MATCH_ANY) 598 condition = factory->CreateQueryContainsCondition(key_ + value_); 599 else 600 condition = factory->CreateQueryContainsCondition(key_); 601 string_pattern_ = condition.string_pattern(); 602 603 key_length_ = key_.length(); 604 value_length_ = value_.length(); 605 } 606 607 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {} 608 609 bool URLQueryElementMatcherCondition::operator<( 610 const URLQueryElementMatcherCondition& rhs) const { 611 if (match_type_ != rhs.match_type_) 612 return match_type_ < rhs.match_type_; 613 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 614 return *string_pattern_ < *rhs.string_pattern_; 615 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) 616 return true; 617 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 618 // or both are NULL. 619 return false; 620 } 621 622 bool URLQueryElementMatcherCondition::IsMatch( 623 const std::string& url_for_component_searches) const { 624 switch (match_type_) { 625 case MATCH_ANY: { 626 // For MATCH_ANY, no additional verification step is needed. We can trust 627 // the SubstringMatcher to do the verification. 628 return true; 629 } 630 case MATCH_ALL: { 631 size_t start = 0; 632 int found = 0; 633 size_t offset; 634 while ((offset = url_for_component_searches.find(key_, start)) != 635 std::string::npos) { 636 if (url_for_component_searches.compare( 637 offset + key_length_, value_length_, value_) != 0) { 638 return false; 639 } else { 640 ++found; 641 } 642 start = offset + key_length_ + value_length_ - 1; 643 } 644 return !!found; 645 } 646 case MATCH_FIRST: { 647 size_t offset = url_for_component_searches.find(key_); 648 return url_for_component_searches.compare( 649 offset + key_length_, value_length_, value_) == 0; 650 } 651 case MATCH_LAST: { 652 size_t offset = url_for_component_searches.rfind(key_); 653 return url_for_component_searches.compare( 654 offset + key_length_, value_length_, value_) == 0; 655 } 656 } 657 NOTREACHED(); 658 return false; 659 } 660 661 // 662 // URLMatcherSchemeFilter 663 // 664 665 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) 666 : filters_(1) { 667 filters_.push_back(filter); 668 } 669 670 URLMatcherSchemeFilter::URLMatcherSchemeFilter( 671 const std::vector<std::string>& filters) 672 : filters_(filters) {} 673 674 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} 675 676 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { 677 return std::find(filters_.begin(), filters_.end(), url.scheme()) != 678 filters_.end(); 679 } 680 681 // 682 // URLMatcherPortFilter 683 // 684 685 URLMatcherPortFilter::URLMatcherPortFilter( 686 const std::vector<URLMatcherPortFilter::Range>& ranges) 687 : ranges_(ranges) {} 688 689 URLMatcherPortFilter::~URLMatcherPortFilter() {} 690 691 bool URLMatcherPortFilter::IsMatch(const GURL& url) const { 692 int port = url.EffectiveIntPort(); 693 for (std::vector<Range>::const_iterator i = ranges_.begin(); 694 i != ranges_.end(); ++i) { 695 if (i->first <= port && port <= i->second) 696 return true; 697 } 698 return false; 699 } 700 701 // static 702 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, 703 int to) { 704 return Range(from, to); 705 } 706 707 // static 708 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { 709 return Range(port, port); 710 } 711 712 // 713 // URLMatcherConditionSet 714 // 715 716 URLMatcherConditionSet::~URLMatcherConditionSet() {} 717 718 URLMatcherConditionSet::URLMatcherConditionSet( 719 ID id, 720 const Conditions& conditions) 721 : id_(id), 722 conditions_(conditions) {} 723 724 URLMatcherConditionSet::URLMatcherConditionSet( 725 ID id, 726 const Conditions& conditions, 727 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 728 scoped_ptr<URLMatcherPortFilter> port_filter) 729 : id_(id), 730 conditions_(conditions), 731 scheme_filter_(scheme_filter.Pass()), 732 port_filter_(port_filter.Pass()) {} 733 734 URLMatcherConditionSet::URLMatcherConditionSet( 735 ID id, 736 const Conditions& conditions, 737 const QueryConditions& query_conditions, 738 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 739 scoped_ptr<URLMatcherPortFilter> port_filter) 740 : id_(id), 741 conditions_(conditions), 742 query_conditions_(query_conditions), 743 scheme_filter_(scheme_filter.Pass()), 744 port_filter_(port_filter.Pass()) {} 745 746 bool URLMatcherConditionSet::IsMatch( 747 const std::set<StringPattern::ID>& matching_patterns, 748 const GURL& url) const { 749 return IsMatch(matching_patterns, url, std::string()); 750 } 751 752 bool URLMatcherConditionSet::IsMatch( 753 const std::set<StringPattern::ID>& matching_patterns, 754 const GURL& url, 755 const std::string& url_for_component_searches) const { 756 for (Conditions::const_iterator i = conditions_.begin(); 757 i != conditions_.end(); ++i) { 758 if (!i->IsMatch(matching_patterns, url)) 759 return false; 760 } 761 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) 762 return false; 763 if (port_filter_.get() && !port_filter_->IsMatch(url)) 764 return false; 765 if (query_conditions_.empty()) 766 return true; 767 // The loop is duplicated below for performance reasons. If not all query 768 // elements are found, no need to verify match that is expected to take more 769 // cycles. 770 for (QueryConditions::const_iterator i = query_conditions_.begin(); 771 i != query_conditions_.end(); 772 ++i) { 773 if (!ContainsKey(matching_patterns, i->string_pattern()->id())) 774 return false; 775 } 776 for (QueryConditions::const_iterator i = query_conditions_.begin(); 777 i != query_conditions_.end(); 778 ++i) { 779 if (!i->IsMatch(url_for_component_searches)) 780 return false; 781 } 782 return true; 783 } 784 785 // 786 // URLMatcher 787 // 788 789 URLMatcher::URLMatcher() {} 790 791 URLMatcher::~URLMatcher() {} 792 793 void URLMatcher::AddConditionSets( 794 const URLMatcherConditionSet::Vector& condition_sets) { 795 for (URLMatcherConditionSet::Vector::const_iterator i = 796 condition_sets.begin(); i != condition_sets.end(); ++i) { 797 DCHECK(url_matcher_condition_sets_.find((*i)->id()) == 798 url_matcher_condition_sets_.end()); 799 url_matcher_condition_sets_[(*i)->id()] = *i; 800 } 801 UpdateInternalDatastructures(); 802 } 803 804 void URLMatcher::RemoveConditionSets( 805 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { 806 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = 807 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { 808 DCHECK(url_matcher_condition_sets_.find(*i) != 809 url_matcher_condition_sets_.end()); 810 url_matcher_condition_sets_.erase(*i); 811 } 812 UpdateInternalDatastructures(); 813 } 814 815 void URLMatcher::ClearUnusedConditionSets() { 816 UpdateConditionFactory(); 817 } 818 819 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( 820 const GURL& url) const { 821 // Find all IDs of StringPatterns that match |url|. 822 // See URLMatcherConditionFactory for the canonicalization of URLs and the 823 // distinction between full url searches and url component searches. 824 std::set<StringPattern::ID> matches; 825 std::string url_for_component_searches; 826 827 if (!full_url_matcher_.IsEmpty()) { 828 full_url_matcher_.Match( 829 condition_factory_.CanonicalizeURLForFullSearches(url), &matches); 830 } 831 if (!url_component_matcher_.IsEmpty()) { 832 url_for_component_searches = 833 condition_factory_.CanonicalizeURLForComponentSearches(url); 834 url_component_matcher_.Match(url_for_component_searches, &matches); 835 } 836 if (!regex_set_matcher_.IsEmpty()) { 837 regex_set_matcher_.Match( 838 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); 839 } 840 if (!origin_and_path_regex_set_matcher_.IsEmpty()) { 841 origin_and_path_regex_set_matcher_.Match( 842 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), 843 &matches); 844 } 845 846 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions 847 // were fulfilled. 848 std::set<URLMatcherConditionSet::ID> result; 849 for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); 850 i != matches.end(); ++i) { 851 // For each URLMatcherConditionSet there is exactly one condition 852 // registered in substring_match_triggers_. This means that the following 853 // logic tests each URLMatcherConditionSet exactly once if it can be 854 // completely fulfilled. 855 StringPatternTriggers::const_iterator triggered_condition_sets_iter = 856 substring_match_triggers_.find(*i); 857 if (triggered_condition_sets_iter == substring_match_triggers_.end()) 858 continue; // Not all substring matches are triggers for a condition set. 859 const std::set<URLMatcherConditionSet::ID>& condition_sets = 860 triggered_condition_sets_iter->second; 861 for (std::set<URLMatcherConditionSet::ID>::const_iterator j = 862 condition_sets.begin(); j != condition_sets.end(); ++j) { 863 URLMatcherConditionSets::const_iterator condition_set_iter = 864 url_matcher_condition_sets_.find(*j); 865 DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); 866 if (condition_set_iter->second->IsMatch( 867 matches, url, url_for_component_searches)) 868 result.insert(*j); 869 } 870 } 871 872 return result; 873 } 874 875 bool URLMatcher::IsEmpty() const { 876 return condition_factory_.IsEmpty() && 877 url_matcher_condition_sets_.empty() && 878 substring_match_triggers_.empty() && 879 full_url_matcher_.IsEmpty() && 880 url_component_matcher_.IsEmpty() && 881 regex_set_matcher_.IsEmpty() && 882 origin_and_path_regex_set_matcher_.IsEmpty() && 883 registered_full_url_patterns_.empty() && 884 registered_url_component_patterns_.empty(); 885 } 886 887 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { 888 // The purpose of |full_url_conditions| is just that we need to execute 889 // the same logic once for Full URL searches and once for URL Component 890 // searches (see URLMatcherConditionFactory). 891 892 // Determine which patterns need to be registered when this function 893 // terminates. 894 std::set<const StringPattern*> new_patterns; 895 for (URLMatcherConditionSets::const_iterator condition_set_iter = 896 url_matcher_condition_sets_.begin(); 897 condition_set_iter != url_matcher_condition_sets_.end(); 898 ++condition_set_iter) { 899 const URLMatcherConditionSet::Conditions& conditions = 900 condition_set_iter->second->conditions(); 901 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 902 conditions.begin(); condition_iter != conditions.end(); 903 ++condition_iter) { 904 // If we are called to process Full URL searches, ignore others, and 905 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) 906 if (!condition_iter->IsRegexCondition() && 907 !condition_iter->IsOriginAndPathRegexCondition() && 908 full_url_conditions == condition_iter->IsFullURLCondition()) 909 new_patterns.insert(condition_iter->string_pattern()); 910 } 911 912 if (full_url_conditions) 913 continue; 914 915 const URLMatcherConditionSet::QueryConditions& query_conditions = 916 condition_set_iter->second->query_conditions(); 917 for (URLMatcherConditionSet::QueryConditions::const_iterator 918 query_condition_iter = query_conditions.begin(); 919 query_condition_iter != query_conditions.end(); 920 ++query_condition_iter) { 921 new_patterns.insert(query_condition_iter->string_pattern()); 922 } 923 } 924 925 // This is the set of patterns that were registered before this function 926 // is called. 927 std::set<const StringPattern*>& registered_patterns = 928 full_url_conditions ? registered_full_url_patterns_ 929 : registered_url_component_patterns_; 930 931 // Add all patterns that are in new_patterns but not in registered_patterns. 932 std::vector<const StringPattern*> patterns_to_register = 933 base::STLSetDifference<std::vector<const StringPattern*> >( 934 new_patterns, registered_patterns); 935 936 // Remove all patterns that are in registered_patterns but not in 937 // new_patterns. 938 std::vector<const StringPattern*> patterns_to_unregister = 939 base::STLSetDifference<std::vector<const StringPattern*> >( 940 registered_patterns, new_patterns); 941 942 // Update the SubstringSetMatcher. 943 SubstringSetMatcher& url_matcher = 944 full_url_conditions ? full_url_matcher_ : url_component_matcher_; 945 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, 946 patterns_to_unregister); 947 948 // Update the set of registered_patterns for the next time this function 949 // is being called. 950 registered_patterns.swap(new_patterns); 951 } 952 953 void URLMatcher::UpdateRegexSetMatcher() { 954 std::vector<const StringPattern*> new_patterns; 955 std::vector<const StringPattern*> new_origin_and_path_patterns; 956 957 for (URLMatcherConditionSets::const_iterator condition_set_iter = 958 url_matcher_condition_sets_.begin(); 959 condition_set_iter != url_matcher_condition_sets_.end(); 960 ++condition_set_iter) { 961 const URLMatcherConditionSet::Conditions& conditions = 962 condition_set_iter->second->conditions(); 963 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 964 conditions.begin(); condition_iter != conditions.end(); 965 ++condition_iter) { 966 if (condition_iter->IsRegexCondition()) { 967 new_patterns.push_back(condition_iter->string_pattern()); 968 } else if (condition_iter->IsOriginAndPathRegexCondition()) { 969 new_origin_and_path_patterns.push_back( 970 condition_iter->string_pattern()); 971 } 972 } 973 } 974 975 // Start over from scratch. We can't really do better than this, since the 976 // FilteredRE2 backend doesn't support incremental updates. 977 regex_set_matcher_.ClearPatterns(); 978 regex_set_matcher_.AddPatterns(new_patterns); 979 origin_and_path_regex_set_matcher_.ClearPatterns(); 980 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); 981 } 982 983 void URLMatcher::UpdateTriggers() { 984 // Count substring pattern frequencies. 985 std::map<StringPattern::ID, size_t> substring_pattern_frequencies; 986 for (URLMatcherConditionSets::const_iterator condition_set_iter = 987 url_matcher_condition_sets_.begin(); 988 condition_set_iter != url_matcher_condition_sets_.end(); 989 ++condition_set_iter) { 990 const URLMatcherConditionSet::Conditions& conditions = 991 condition_set_iter->second->conditions(); 992 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 993 conditions.begin(); condition_iter != conditions.end(); 994 ++condition_iter) { 995 const StringPattern* pattern = condition_iter->string_pattern(); 996 substring_pattern_frequencies[pattern->id()]++; 997 } 998 999 const URLMatcherConditionSet::QueryConditions& query_conditions = 1000 condition_set_iter->second->query_conditions(); 1001 for (URLMatcherConditionSet::QueryConditions::const_iterator 1002 query_condition_iter = query_conditions.begin(); 1003 query_condition_iter != query_conditions.end(); 1004 ++query_condition_iter) { 1005 const StringPattern* pattern = query_condition_iter->string_pattern(); 1006 substring_pattern_frequencies[pattern->id()]++; 1007 } 1008 } 1009 1010 // Update trigger conditions: Determine for each URLMatcherConditionSet which 1011 // URLMatcherCondition contains a StringPattern that occurs least 1012 // frequently in this URLMatcher. We assume that this condition is very 1013 // specific and occurs rarely in URLs. If a match occurs for this 1014 // URLMatcherCondition, we want to test all other URLMatcherCondition in the 1015 // respective URLMatcherConditionSet as well to see whether the entire 1016 // URLMatcherConditionSet is considered matching. 1017 substring_match_triggers_.clear(); 1018 for (URLMatcherConditionSets::const_iterator condition_set_iter = 1019 url_matcher_condition_sets_.begin(); 1020 condition_set_iter != url_matcher_condition_sets_.end(); 1021 ++condition_set_iter) { 1022 const URLMatcherConditionSet::Conditions& conditions = 1023 condition_set_iter->second->conditions(); 1024 if (conditions.empty()) 1025 continue; 1026 URLMatcherConditionSet::Conditions::const_iterator condition_iter = 1027 conditions.begin(); 1028 StringPattern::ID trigger = condition_iter->string_pattern()->id(); 1029 // We skip the first element in the following loop. 1030 ++condition_iter; 1031 for (; condition_iter != conditions.end(); ++condition_iter) { 1032 StringPattern::ID current_id = 1033 condition_iter->string_pattern()->id(); 1034 if (substring_pattern_frequencies[trigger] > 1035 substring_pattern_frequencies[current_id]) { 1036 trigger = current_id; 1037 } 1038 } 1039 1040 const URLMatcherConditionSet::QueryConditions& query_conditions = 1041 condition_set_iter->second->query_conditions(); 1042 for (URLMatcherConditionSet::QueryConditions::const_iterator 1043 query_condition_iter = query_conditions.begin(); 1044 query_condition_iter != query_conditions.end(); 1045 ++query_condition_iter) { 1046 StringPattern::ID current_id = 1047 query_condition_iter->string_pattern()->id(); 1048 if (substring_pattern_frequencies[trigger] > 1049 substring_pattern_frequencies[current_id]) { 1050 trigger = current_id; 1051 } 1052 } 1053 1054 substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); 1055 } 1056 } 1057 1058 void URLMatcher::UpdateConditionFactory() { 1059 std::set<StringPattern::ID> used_patterns; 1060 for (URLMatcherConditionSets::const_iterator condition_set_iter = 1061 url_matcher_condition_sets_.begin(); 1062 condition_set_iter != url_matcher_condition_sets_.end(); 1063 ++condition_set_iter) { 1064 const URLMatcherConditionSet::Conditions& conditions = 1065 condition_set_iter->second->conditions(); 1066 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 1067 conditions.begin(); condition_iter != conditions.end(); 1068 ++condition_iter) { 1069 used_patterns.insert(condition_iter->string_pattern()->id()); 1070 } 1071 const URLMatcherConditionSet::QueryConditions& query_conditions = 1072 condition_set_iter->second->query_conditions(); 1073 for (URLMatcherConditionSet::QueryConditions::const_iterator 1074 query_condition_iter = query_conditions.begin(); 1075 query_condition_iter != query_conditions.end(); 1076 ++query_condition_iter) { 1077 used_patterns.insert(query_condition_iter->string_pattern()->id()); 1078 } 1079 } 1080 condition_factory_.ForgetUnusedPatterns(used_patterns); 1081 } 1082 1083 void URLMatcher::UpdateInternalDatastructures() { 1084 UpdateSubstringSetMatcher(false); 1085 UpdateSubstringSetMatcher(true); 1086 UpdateRegexSetMatcher(); 1087 UpdateTriggers(); 1088 UpdateConditionFactory(); 1089 } 1090 1091 } // namespace url_matcher 1092