1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/url_matcher/url_matcher.h" 6 7 #include <algorithm> 8 #include <iterator> 9 10 #include "base/logging.h" 11 #include "base/profiler/scoped_profile.h" 12 #include "url/gurl.h" 13 #include "url/url_canon.h" 14 15 namespace url_matcher { 16 17 // This set of classes implement a mapping of URL Component Patterns, such as 18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns 19 // for use in substring comparisons. 20 // 21 // The idea of this mapping is to reduce the problem of comparing many 22 // URL Component Patterns against one URL to the problem of searching many 23 // substrings in one string: 24 // 25 // ---------------------- ----------------- 26 // | URL Query operator | ----translate----> | StringPattern | 27 // ---------------------- ----------------- 28 // ^ 29 // | 30 // compare 31 // | 32 // v 33 // ---------------------- ----------------- 34 // | URL to compare | | | 35 // | to all URL Query | ----translate----> | String | 36 // | operators | | | 37 // ---------------------- ----------------- 38 // 39 // The reason for this problem reduction is that there are efficient algorithms 40 // for searching many substrings in one string (see Aho-Corasick algorithm). 41 // 42 // Additionally, some of the same pieces are reused to implement regular 43 // expression comparisons. The FilteredRE2 implementation for matching many 44 // regular expressions against one string uses prefiltering, in which a set 45 // of substrings (derived from the regexes) are first searched for, to reduce 46 // the number of regular expressions to test; the prefiltering step also 47 // uses Aho-Corasick. 48 // 49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches. 50 // ========================================================== 51 // 52 // For searches in this class, we normalize URLs as follows: 53 // 54 // Step 1: 55 // Remove scheme, port and segment from URL: 56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 57 // www.example.com/index.html?search=foo 58 // 59 // We remove the scheme and port number because they can be checked later 60 // in a secondary filter step. We remove the segment (the #... part) because 61 // this is not guaranteed to be ASCII-7 encoded. 62 // 63 // Step 2: 64 // Translate URL to String and add the following position markers: 65 // - BU = Beginning of URL 66 // - ED = End of Domain 67 // - EP = End of Path 68 // - EU = End of URL 69 // Furthermore, the hostname is canonicalized to start with a ".". 70 // 71 // Position markers are represented as characters >127, which are therefore 72 // guaranteed not to be part of the ASCII-7 encoded URL character set. 73 // 74 // -> www.example.com/index.html?search=foo becomes 75 // BU .www.example.com ED /index.html EP ?search=foo EU 76 // 77 // -> www.example.com/index.html becomes 78 // BU .www.example.com ED /index.html EP EU 79 // 80 // Step 3: 81 // Translate URL Component Patterns as follows: 82 // 83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) 84 // -> host_prefix("www.example") = BU .www.example 85 // 86 // host_suffix(suffix) = suffix ED 87 // -> host_suffix("example.com") = example.com ED 88 // -> host_suffix(".example.com") = .example.com ED 89 // 90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED 91 // -> host_equals("www.example.com") = BU .www.example.com ED 92 // 93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). 94 // 95 // With this, we can search the StringPatterns in the normalized URL. 96 // 97 // 98 // Case 2: url_{prefix,suffix,equals,contains} searches. 99 // ===================================================== 100 // 101 // Step 1: as above, except that 102 // - the scheme is not removed 103 // - the port is not removed if it is specified and does not match the default 104 // port for the given scheme. 105 // 106 // Step 2: 107 // Translate URL to String and add the following position markers: 108 // - BU = Beginning of URL 109 // - EU = End of URL 110 // 111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 112 // BU http://www.example.com:8080/index.html?search=foo EU 113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes 114 // BU http://www.example.com/index.html?search=foo EU 115 // 116 // url_prefix(prefix) = BU prefix 117 // -> url_prefix("http://www.example") = BU http://www.example 118 // 119 // url_contains(substring) = substring 120 // -> url_contains("index") = index 121 // 122 // 123 // Case 3: {host,path,query}_contains searches. 124 // ============================================ 125 // 126 // These kinds of searches are not supported directly but can be derived 127 // by a combination of a url_contains() query followed by an explicit test: 128 // 129 // host_contains(str) = url_contains(str) followed by test whether str occurs 130 // in host component of original URL. 131 // -> host_contains("example.co") = example.co 132 // followed by gurl.host().find("example.co"); 133 // 134 // [similarly for path_contains and query_contains]. 135 // 136 // 137 // Regular expression matching (url_matches searches) 138 // ================================================== 139 // 140 // This class also supports matching regular expressions (RE2 syntax) 141 // against full URLs, which are transformed as in case 2. 142 143 namespace { 144 145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { 146 return criterion == URLMatcherCondition::URL_MATCHES; 147 } 148 149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { 150 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; 151 } 152 153 } // namespace 154 155 // 156 // URLMatcherCondition 157 // 158 159 URLMatcherCondition::URLMatcherCondition() 160 : criterion_(HOST_PREFIX), 161 string_pattern_(NULL) {} 162 163 URLMatcherCondition::~URLMatcherCondition() {} 164 165 URLMatcherCondition::URLMatcherCondition( 166 Criterion criterion, 167 const StringPattern* string_pattern) 168 : criterion_(criterion), 169 string_pattern_(string_pattern) {} 170 171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) 172 : criterion_(rhs.criterion_), 173 string_pattern_(rhs.string_pattern_) {} 174 175 URLMatcherCondition& URLMatcherCondition::operator=( 176 const URLMatcherCondition& rhs) { 177 criterion_ = rhs.criterion_; 178 string_pattern_ = rhs.string_pattern_; 179 return *this; 180 } 181 182 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { 183 if (criterion_ < rhs.criterion_) return true; 184 if (criterion_ > rhs.criterion_) return false; 185 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 186 return *string_pattern_ < *rhs.string_pattern_; 187 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; 188 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 189 // or both are NULL. 190 return false; 191 } 192 193 bool URLMatcherCondition::IsFullURLCondition() const { 194 // For these criteria the SubstringMatcher needs to be executed on the 195 // GURL that is canonicalized with 196 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. 197 switch (criterion_) { 198 case HOST_CONTAINS: 199 case PATH_CONTAINS: 200 case QUERY_CONTAINS: 201 case URL_PREFIX: 202 case URL_SUFFIX: 203 case URL_CONTAINS: 204 case URL_EQUALS: 205 return true; 206 default: 207 break; 208 } 209 return false; 210 } 211 212 bool URLMatcherCondition::IsRegexCondition() const { 213 return IsRegexCriterion(criterion_); 214 } 215 216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { 217 return IsOriginAndPathRegexCriterion(criterion_); 218 } 219 220 bool URLMatcherCondition::IsMatch( 221 const std::set<StringPattern::ID>& matching_patterns, 222 const GURL& url) const { 223 DCHECK(string_pattern_); 224 if (!ContainsKey(matching_patterns, string_pattern_->id())) 225 return false; 226 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on 227 // a substring match on the raw URL. In case of a match, we need to verify 228 // that the match was found in the correct component of the URL. 229 switch (criterion_) { 230 case HOST_CONTAINS: 231 return url.host().find(string_pattern_->pattern()) != 232 std::string::npos; 233 case PATH_CONTAINS: 234 return url.path().find(string_pattern_->pattern()) != 235 std::string::npos; 236 case QUERY_CONTAINS: 237 return url.query().find(string_pattern_->pattern()) != 238 std::string::npos; 239 default: 240 break; 241 } 242 return true; 243 } 244 245 // 246 // URLMatcherConditionFactory 247 // 248 249 namespace { 250 // These are symbols that are not contained in 7-bit ASCII used in GURLs. 251 const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; 252 const char kEndOfDomain[] = {static_cast<char>(-2), 0}; 253 const char kEndOfPath[] = {static_cast<char>(-3), 0}; 254 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0}; 255 const char kEndOfURL[] = {static_cast<char>(-5), 0}; 256 257 // The delimiter for query parameters 258 const char kQuerySeparator = '&'; 259 } // namespace 260 261 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} 262 263 URLMatcherConditionFactory::~URLMatcherConditionFactory() { 264 STLDeleteElements(&substring_pattern_singletons_); 265 STLDeleteElements(®ex_pattern_singletons_); 266 STLDeleteElements(&origin_and_path_regex_pattern_singletons_); 267 } 268 269 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( 270 const GURL& url) const { 271 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + 272 url.path() + kEndOfPath + 273 (url.has_query() ? CanonicalizeQuery(url.query(), true, true) 274 : std::string()) + 275 kEndOfURL; 276 } 277 278 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( 279 const std::string& prefix) { 280 return CreateCondition(URLMatcherCondition::HOST_PREFIX, 281 kBeginningOfURL + CanonicalizeHostname(prefix)); 282 } 283 284 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( 285 const std::string& suffix) { 286 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, 287 suffix + kEndOfDomain); 288 } 289 290 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( 291 const std::string& str) { 292 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); 293 } 294 295 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( 296 const std::string& str) { 297 return CreateCondition(URLMatcherCondition::HOST_EQUALS, 298 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); 299 } 300 301 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( 302 const std::string& prefix) { 303 return CreateCondition(URLMatcherCondition::PATH_PREFIX, 304 kEndOfDomain + prefix); 305 } 306 307 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( 308 const std::string& suffix) { 309 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); 310 } 311 312 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( 313 const std::string& str) { 314 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); 315 } 316 317 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( 318 const std::string& str) { 319 return CreateCondition(URLMatcherCondition::PATH_EQUALS, 320 kEndOfDomain + str + kEndOfPath); 321 } 322 323 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( 324 const std::string& prefix) { 325 std::string pattern; 326 if (!prefix.empty() && prefix[0] == '?') 327 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false); 328 else 329 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false); 330 331 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); 332 } 333 334 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( 335 const std::string& suffix) { 336 if (!suffix.empty() && suffix[0] == '?') { 337 return CreateQueryEqualsCondition(suffix); 338 } else { 339 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, 340 CanonicalizeQuery(suffix, false, true) + kEndOfURL); 341 } 342 } 343 344 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( 345 const std::string& str) { 346 if (!str.empty() && str[0] == '?') 347 return CreateQueryPrefixCondition(str); 348 else 349 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); 350 } 351 352 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( 353 const std::string& str) { 354 std::string pattern; 355 if (!str.empty() && str[0] == '?') 356 pattern = 357 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL; 358 else 359 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL; 360 361 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); 362 } 363 364 URLMatcherCondition 365 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( 366 const std::string& host_suffix, 367 const std::string& path_prefix) { 368 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, 369 host_suffix + kEndOfDomain + path_prefix); 370 } 371 372 URLMatcherCondition 373 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( 374 const std::string& host, 375 const std::string& path_prefix) { 376 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, 377 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + 378 path_prefix); 379 } 380 381 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( 382 const GURL& url) const { 383 GURL::Replacements replacements; 384 replacements.ClearPassword(); 385 replacements.ClearUsername(); 386 replacements.ClearRef(); 387 // Clear port if it is implicit from scheme. 388 if (url.has_port()) { 389 const std::string& port = url.scheme(); 390 if (url::DefaultPortForScheme(port.c_str(), port.size()) == 391 url.EffectiveIntPort()) { 392 replacements.ClearPort(); 393 } 394 } 395 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + 396 kEndOfURL; 397 } 398 399 static std::string CanonicalizeURLForRegexSearchesHelper( 400 const GURL& url, 401 bool clear_query) { 402 GURL::Replacements replacements; 403 replacements.ClearPassword(); 404 replacements.ClearUsername(); 405 replacements.ClearRef(); 406 if (clear_query) 407 replacements.ClearQuery(); 408 // Clear port if it is implicit from scheme. 409 if (url.has_port()) { 410 const std::string& port = url.scheme(); 411 if (url::DefaultPortForScheme(port.c_str(), port.size()) == 412 url.EffectiveIntPort()) { 413 replacements.ClearPort(); 414 } 415 } 416 return url.ReplaceComponents(replacements).spec(); 417 } 418 419 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( 420 const GURL& url) const { 421 return CanonicalizeURLForRegexSearchesHelper(url, false); 422 } 423 424 std::string 425 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( 426 const GURL& url) const { 427 return CanonicalizeURLForRegexSearchesHelper(url, true); 428 } 429 430 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( 431 const std::string& prefix) { 432 return CreateCondition(URLMatcherCondition::URL_PREFIX, 433 kBeginningOfURL + prefix); 434 } 435 436 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( 437 const std::string& suffix) { 438 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); 439 } 440 441 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( 442 const std::string& str) { 443 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); 444 } 445 446 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( 447 const std::string& str) { 448 return CreateCondition(URLMatcherCondition::URL_EQUALS, 449 kBeginningOfURL + str + kEndOfURL); 450 } 451 452 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( 453 const std::string& regex) { 454 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); 455 } 456 457 URLMatcherCondition 458 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( 459 const std::string& regex) { 460 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); 461 } 462 463 void URLMatcherConditionFactory::ForgetUnusedPatterns( 464 const std::set<StringPattern::ID>& used_patterns) { 465 PatternSingletons::iterator i = substring_pattern_singletons_.begin(); 466 while (i != substring_pattern_singletons_.end()) { 467 if (ContainsKey(used_patterns, (*i)->id())) { 468 ++i; 469 } else { 470 delete *i; 471 substring_pattern_singletons_.erase(i++); 472 } 473 } 474 i = regex_pattern_singletons_.begin(); 475 while (i != regex_pattern_singletons_.end()) { 476 if (ContainsKey(used_patterns, (*i)->id())) { 477 ++i; 478 } else { 479 delete *i; 480 regex_pattern_singletons_.erase(i++); 481 } 482 } 483 i = origin_and_path_regex_pattern_singletons_.begin(); 484 while (i != origin_and_path_regex_pattern_singletons_.end()) { 485 if (ContainsKey(used_patterns, (*i)->id())) { 486 ++i; 487 } else { 488 delete *i; 489 origin_and_path_regex_pattern_singletons_.erase(i++); 490 } 491 } 492 } 493 494 bool URLMatcherConditionFactory::IsEmpty() const { 495 return substring_pattern_singletons_.empty() && 496 regex_pattern_singletons_.empty() && 497 origin_and_path_regex_pattern_singletons_.empty(); 498 } 499 500 URLMatcherCondition URLMatcherConditionFactory::CreateCondition( 501 URLMatcherCondition::Criterion criterion, 502 const std::string& pattern) { 503 StringPattern search_pattern(pattern, 0); 504 PatternSingletons* pattern_singletons = NULL; 505 if (IsRegexCriterion(criterion)) 506 pattern_singletons = ®ex_pattern_singletons_; 507 else if (IsOriginAndPathRegexCriterion(criterion)) 508 pattern_singletons = &origin_and_path_regex_pattern_singletons_; 509 else 510 pattern_singletons = &substring_pattern_singletons_; 511 512 PatternSingletons::const_iterator iter = 513 pattern_singletons->find(&search_pattern); 514 515 if (iter != pattern_singletons->end()) { 516 return URLMatcherCondition(criterion, *iter); 517 } else { 518 StringPattern* new_pattern = 519 new StringPattern(pattern, id_counter_++); 520 pattern_singletons->insert(new_pattern); 521 return URLMatcherCondition(criterion, new_pattern); 522 } 523 } 524 525 std::string URLMatcherConditionFactory::CanonicalizeHostname( 526 const std::string& hostname) const { 527 if (!hostname.empty() && hostname[0] == '.') 528 return hostname; 529 else 530 return "." + hostname; 531 } 532 533 // This function prepares the query string by replacing query separator with a 534 // magic value (|kQueryComponentDelimiter|). When the boolean 535 // |prepend_beginning_of_query_component| is true the function prepends the 536 // query with the same magic. This is done to locate the start of a key value 537 // pair in the query string. The parameter |query| is passed by value 538 // intentionally, since it is locally modified. 539 std::string URLMatcherConditionFactory::CanonicalizeQuery( 540 std::string query, 541 bool prepend_beginning_of_query_component, 542 bool append_end_of_query_component) const { 543 for (std::string::iterator it = query.begin(); it != query.end(); ++it) { 544 if (*it == kQuerySeparator) 545 *it = kQueryComponentDelimiter[0]; 546 } 547 if (prepend_beginning_of_query_component) 548 query = kQueryComponentDelimiter + query; 549 if (append_end_of_query_component) 550 query += kQueryComponentDelimiter; 551 return query; 552 } 553 554 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( 555 StringPattern* lhs, 556 StringPattern* rhs) const { 557 if (lhs == NULL && rhs != NULL) return true; 558 if (lhs != NULL && rhs != NULL) 559 return lhs->pattern() < rhs->pattern(); 560 // Either both are NULL or only rhs is NULL. 561 return false; 562 } 563 564 // 565 // URLQueryElementMatcherCondition 566 // 567 568 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition( 569 const std::string& key, 570 const std::string& value, 571 QueryValueMatchType query_value_match_type, 572 QueryElementType query_element_type, 573 Type match_type, 574 URLMatcherConditionFactory* factory) { 575 match_type_ = match_type; 576 577 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) { 578 key_ = kQueryComponentDelimiter + key + "="; 579 value_ = value; 580 } else { 581 key_ = kQueryComponentDelimiter + key; 582 value_ = std::string(); 583 } 584 585 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT) 586 value_ += kQueryComponentDelimiter; 587 588 // If |value_| is empty no need to find the |key_| and verify if the value 589 // matches. Simply checking the presence of key is sufficient, which is done 590 // by MATCH_ANY 591 if (value_.empty()) 592 match_type_ = MATCH_ANY; 593 594 URLMatcherCondition condition; 595 // If |match_type_| is MATCH_ANY, then we could simply look for the 596 // combination of |key_| + |value_|, which can be efficiently done by 597 // SubstringMatcher 598 if (match_type_ == MATCH_ANY) 599 condition = factory->CreateQueryContainsCondition(key_ + value_); 600 else 601 condition = factory->CreateQueryContainsCondition(key_); 602 string_pattern_ = condition.string_pattern(); 603 604 key_length_ = key_.length(); 605 value_length_ = value_.length(); 606 } 607 608 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {} 609 610 bool URLQueryElementMatcherCondition::operator<( 611 const URLQueryElementMatcherCondition& rhs) const { 612 if (match_type_ != rhs.match_type_) 613 return match_type_ < rhs.match_type_; 614 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 615 return *string_pattern_ < *rhs.string_pattern_; 616 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) 617 return true; 618 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 619 // or both are NULL. 620 return false; 621 } 622 623 bool URLQueryElementMatcherCondition::IsMatch( 624 const std::string& url_for_component_searches) const { 625 switch (match_type_) { 626 case MATCH_ANY: { 627 // For MATCH_ANY, no additional verification step is needed. We can trust 628 // the SubstringMatcher to do the verification. 629 return true; 630 } 631 case MATCH_ALL: { 632 size_t start = 0; 633 int found = 0; 634 size_t offset; 635 while ((offset = url_for_component_searches.find(key_, start)) != 636 std::string::npos) { 637 if (url_for_component_searches.compare( 638 offset + key_length_, value_length_, value_) != 0) { 639 return false; 640 } else { 641 ++found; 642 } 643 start = offset + key_length_ + value_length_ - 1; 644 } 645 return !!found; 646 } 647 case MATCH_FIRST: { 648 size_t offset = url_for_component_searches.find(key_); 649 return url_for_component_searches.compare( 650 offset + key_length_, value_length_, value_) == 0; 651 } 652 case MATCH_LAST: { 653 size_t offset = url_for_component_searches.rfind(key_); 654 return url_for_component_searches.compare( 655 offset + key_length_, value_length_, value_) == 0; 656 } 657 } 658 NOTREACHED(); 659 return false; 660 } 661 662 // 663 // URLMatcherSchemeFilter 664 // 665 666 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) 667 : filters_(1) { 668 filters_.push_back(filter); 669 } 670 671 URLMatcherSchemeFilter::URLMatcherSchemeFilter( 672 const std::vector<std::string>& filters) 673 : filters_(filters) {} 674 675 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} 676 677 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { 678 return std::find(filters_.begin(), filters_.end(), url.scheme()) != 679 filters_.end(); 680 } 681 682 // 683 // URLMatcherPortFilter 684 // 685 686 URLMatcherPortFilter::URLMatcherPortFilter( 687 const std::vector<URLMatcherPortFilter::Range>& ranges) 688 : ranges_(ranges) {} 689 690 URLMatcherPortFilter::~URLMatcherPortFilter() {} 691 692 bool URLMatcherPortFilter::IsMatch(const GURL& url) const { 693 int port = url.EffectiveIntPort(); 694 for (std::vector<Range>::const_iterator i = ranges_.begin(); 695 i != ranges_.end(); ++i) { 696 if (i->first <= port && port <= i->second) 697 return true; 698 } 699 return false; 700 } 701 702 // static 703 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, 704 int to) { 705 return Range(from, to); 706 } 707 708 // static 709 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { 710 return Range(port, port); 711 } 712 713 // 714 // URLMatcherConditionSet 715 // 716 717 URLMatcherConditionSet::~URLMatcherConditionSet() {} 718 719 URLMatcherConditionSet::URLMatcherConditionSet( 720 ID id, 721 const Conditions& conditions) 722 : id_(id), 723 conditions_(conditions) {} 724 725 URLMatcherConditionSet::URLMatcherConditionSet( 726 ID id, 727 const Conditions& conditions, 728 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 729 scoped_ptr<URLMatcherPortFilter> port_filter) 730 : id_(id), 731 conditions_(conditions), 732 scheme_filter_(scheme_filter.Pass()), 733 port_filter_(port_filter.Pass()) {} 734 735 URLMatcherConditionSet::URLMatcherConditionSet( 736 ID id, 737 const Conditions& conditions, 738 const QueryConditions& query_conditions, 739 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 740 scoped_ptr<URLMatcherPortFilter> port_filter) 741 : id_(id), 742 conditions_(conditions), 743 query_conditions_(query_conditions), 744 scheme_filter_(scheme_filter.Pass()), 745 port_filter_(port_filter.Pass()) {} 746 747 bool URLMatcherConditionSet::IsMatch( 748 const std::set<StringPattern::ID>& matching_patterns, 749 const GURL& url) const { 750 return IsMatch(matching_patterns, url, std::string()); 751 } 752 753 bool URLMatcherConditionSet::IsMatch( 754 const std::set<StringPattern::ID>& matching_patterns, 755 const GURL& url, 756 const std::string& url_for_component_searches) const { 757 for (Conditions::const_iterator i = conditions_.begin(); 758 i != conditions_.end(); ++i) { 759 if (!i->IsMatch(matching_patterns, url)) 760 return false; 761 } 762 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) 763 return false; 764 if (port_filter_.get() && !port_filter_->IsMatch(url)) 765 return false; 766 if (query_conditions_.empty()) 767 return true; 768 // The loop is duplicated below for performance reasons. If not all query 769 // elements are found, no need to verify match that is expected to take more 770 // cycles. 771 for (QueryConditions::const_iterator i = query_conditions_.begin(); 772 i != query_conditions_.end(); 773 ++i) { 774 if (!ContainsKey(matching_patterns, i->string_pattern()->id())) 775 return false; 776 } 777 for (QueryConditions::const_iterator i = query_conditions_.begin(); 778 i != query_conditions_.end(); 779 ++i) { 780 if (!i->IsMatch(url_for_component_searches)) 781 return false; 782 } 783 return true; 784 } 785 786 // 787 // URLMatcher 788 // 789 790 URLMatcher::URLMatcher() {} 791 792 URLMatcher::~URLMatcher() {} 793 794 void URLMatcher::AddConditionSets( 795 const URLMatcherConditionSet::Vector& condition_sets) { 796 for (URLMatcherConditionSet::Vector::const_iterator i = 797 condition_sets.begin(); i != condition_sets.end(); ++i) { 798 DCHECK(url_matcher_condition_sets_.find((*i)->id()) == 799 url_matcher_condition_sets_.end()); 800 url_matcher_condition_sets_[(*i)->id()] = *i; 801 } 802 UpdateInternalDatastructures(); 803 } 804 805 void URLMatcher::RemoveConditionSets( 806 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { 807 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = 808 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { 809 DCHECK(url_matcher_condition_sets_.find(*i) != 810 url_matcher_condition_sets_.end()); 811 url_matcher_condition_sets_.erase(*i); 812 } 813 UpdateInternalDatastructures(); 814 } 815 816 void URLMatcher::ClearUnusedConditionSets() { 817 UpdateConditionFactory(); 818 } 819 820 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( 821 const GURL& url) const { 822 // Find all IDs of StringPatterns that match |url|. 823 // See URLMatcherConditionFactory for the canonicalization of URLs and the 824 // distinction between full url searches and url component searches. 825 std::set<StringPattern::ID> matches; 826 std::string url_for_component_searches; 827 828 if (!full_url_matcher_.IsEmpty()) { 829 full_url_matcher_.Match( 830 condition_factory_.CanonicalizeURLForFullSearches(url), &matches); 831 } 832 if (!url_component_matcher_.IsEmpty()) { 833 url_for_component_searches = 834 condition_factory_.CanonicalizeURLForComponentSearches(url); 835 url_component_matcher_.Match(url_for_component_searches, &matches); 836 } 837 if (!regex_set_matcher_.IsEmpty()) { 838 regex_set_matcher_.Match( 839 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); 840 } 841 if (!origin_and_path_regex_set_matcher_.IsEmpty()) { 842 origin_and_path_regex_set_matcher_.Match( 843 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), 844 &matches); 845 } 846 847 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions 848 // were fulfilled. 849 std::set<URLMatcherConditionSet::ID> result; 850 for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); 851 i != matches.end(); ++i) { 852 // For each URLMatcherConditionSet there is exactly one condition 853 // registered in substring_match_triggers_. This means that the following 854 // logic tests each URLMatcherConditionSet exactly once if it can be 855 // completely fulfilled. 856 StringPatternTriggers::const_iterator triggered_condition_sets_iter = 857 substring_match_triggers_.find(*i); 858 if (triggered_condition_sets_iter == substring_match_triggers_.end()) 859 continue; // Not all substring matches are triggers for a condition set. 860 const std::set<URLMatcherConditionSet::ID>& condition_sets = 861 triggered_condition_sets_iter->second; 862 for (std::set<URLMatcherConditionSet::ID>::const_iterator j = 863 condition_sets.begin(); j != condition_sets.end(); ++j) { 864 URLMatcherConditionSets::const_iterator condition_set_iter = 865 url_matcher_condition_sets_.find(*j); 866 DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); 867 if (condition_set_iter->second->IsMatch( 868 matches, url, url_for_component_searches)) 869 result.insert(*j); 870 } 871 } 872 873 return result; 874 } 875 876 bool URLMatcher::IsEmpty() const { 877 return condition_factory_.IsEmpty() && 878 url_matcher_condition_sets_.empty() && 879 substring_match_triggers_.empty() && 880 full_url_matcher_.IsEmpty() && 881 url_component_matcher_.IsEmpty() && 882 regex_set_matcher_.IsEmpty() && 883 origin_and_path_regex_set_matcher_.IsEmpty() && 884 registered_full_url_patterns_.empty() && 885 registered_url_component_patterns_.empty(); 886 } 887 888 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { 889 // The purpose of |full_url_conditions| is just that we need to execute 890 // the same logic once for Full URL searches and once for URL Component 891 // searches (see URLMatcherConditionFactory). 892 893 // Determine which patterns need to be registered when this function 894 // terminates. 895 std::set<const StringPattern*> new_patterns; 896 for (URLMatcherConditionSets::const_iterator condition_set_iter = 897 url_matcher_condition_sets_.begin(); 898 condition_set_iter != url_matcher_condition_sets_.end(); 899 ++condition_set_iter) { 900 const URLMatcherConditionSet::Conditions& conditions = 901 condition_set_iter->second->conditions(); 902 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 903 conditions.begin(); condition_iter != conditions.end(); 904 ++condition_iter) { 905 // If we are called to process Full URL searches, ignore others, and 906 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) 907 if (!condition_iter->IsRegexCondition() && 908 !condition_iter->IsOriginAndPathRegexCondition() && 909 full_url_conditions == condition_iter->IsFullURLCondition()) 910 new_patterns.insert(condition_iter->string_pattern()); 911 } 912 913 if (full_url_conditions) 914 continue; 915 916 const URLMatcherConditionSet::QueryConditions& query_conditions = 917 condition_set_iter->second->query_conditions(); 918 for (URLMatcherConditionSet::QueryConditions::const_iterator 919 query_condition_iter = query_conditions.begin(); 920 query_condition_iter != query_conditions.end(); 921 ++query_condition_iter) { 922 new_patterns.insert(query_condition_iter->string_pattern()); 923 } 924 } 925 926 // This is the set of patterns that were registered before this function 927 // is called. 928 std::set<const StringPattern*>& registered_patterns = 929 full_url_conditions ? registered_full_url_patterns_ 930 : registered_url_component_patterns_; 931 932 // Add all patterns that are in new_patterns but not in registered_patterns. 933 std::vector<const StringPattern*> patterns_to_register = 934 base::STLSetDifference<std::vector<const StringPattern*> >( 935 new_patterns, registered_patterns); 936 937 // Remove all patterns that are in registered_patterns but not in 938 // new_patterns. 939 std::vector<const StringPattern*> patterns_to_unregister = 940 base::STLSetDifference<std::vector<const StringPattern*> >( 941 registered_patterns, new_patterns); 942 943 // Update the SubstringSetMatcher. 944 SubstringSetMatcher& url_matcher = 945 full_url_conditions ? full_url_matcher_ : url_component_matcher_; 946 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, 947 patterns_to_unregister); 948 949 // Update the set of registered_patterns for the next time this function 950 // is being called. 951 registered_patterns.swap(new_patterns); 952 } 953 954 void URLMatcher::UpdateRegexSetMatcher() { 955 std::vector<const StringPattern*> new_patterns; 956 std::vector<const StringPattern*> new_origin_and_path_patterns; 957 958 for (URLMatcherConditionSets::const_iterator condition_set_iter = 959 url_matcher_condition_sets_.begin(); 960 condition_set_iter != url_matcher_condition_sets_.end(); 961 ++condition_set_iter) { 962 const URLMatcherConditionSet::Conditions& conditions = 963 condition_set_iter->second->conditions(); 964 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 965 conditions.begin(); condition_iter != conditions.end(); 966 ++condition_iter) { 967 if (condition_iter->IsRegexCondition()) { 968 new_patterns.push_back(condition_iter->string_pattern()); 969 } else if (condition_iter->IsOriginAndPathRegexCondition()) { 970 new_origin_and_path_patterns.push_back( 971 condition_iter->string_pattern()); 972 } 973 } 974 } 975 976 // Start over from scratch. We can't really do better than this, since the 977 // FilteredRE2 backend doesn't support incremental updates. 978 regex_set_matcher_.ClearPatterns(); 979 regex_set_matcher_.AddPatterns(new_patterns); 980 origin_and_path_regex_set_matcher_.ClearPatterns(); 981 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); 982 } 983 984 void URLMatcher::UpdateTriggers() { 985 // Count substring pattern frequencies. 986 std::map<StringPattern::ID, size_t> substring_pattern_frequencies; 987 for (URLMatcherConditionSets::const_iterator condition_set_iter = 988 url_matcher_condition_sets_.begin(); 989 condition_set_iter != url_matcher_condition_sets_.end(); 990 ++condition_set_iter) { 991 const URLMatcherConditionSet::Conditions& conditions = 992 condition_set_iter->second->conditions(); 993 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 994 conditions.begin(); condition_iter != conditions.end(); 995 ++condition_iter) { 996 const StringPattern* pattern = condition_iter->string_pattern(); 997 substring_pattern_frequencies[pattern->id()]++; 998 } 999 1000 const URLMatcherConditionSet::QueryConditions& query_conditions = 1001 condition_set_iter->second->query_conditions(); 1002 for (URLMatcherConditionSet::QueryConditions::const_iterator 1003 query_condition_iter = query_conditions.begin(); 1004 query_condition_iter != query_conditions.end(); 1005 ++query_condition_iter) { 1006 const StringPattern* pattern = query_condition_iter->string_pattern(); 1007 substring_pattern_frequencies[pattern->id()]++; 1008 } 1009 } 1010 1011 // Update trigger conditions: Determine for each URLMatcherConditionSet which 1012 // URLMatcherCondition contains a StringPattern that occurs least 1013 // frequently in this URLMatcher. We assume that this condition is very 1014 // specific and occurs rarely in URLs. If a match occurs for this 1015 // URLMatcherCondition, we want to test all other URLMatcherCondition in the 1016 // respective URLMatcherConditionSet as well to see whether the entire 1017 // URLMatcherConditionSet is considered matching. 1018 substring_match_triggers_.clear(); 1019 for (URLMatcherConditionSets::const_iterator condition_set_iter = 1020 url_matcher_condition_sets_.begin(); 1021 condition_set_iter != url_matcher_condition_sets_.end(); 1022 ++condition_set_iter) { 1023 const URLMatcherConditionSet::Conditions& conditions = 1024 condition_set_iter->second->conditions(); 1025 if (conditions.empty()) 1026 continue; 1027 URLMatcherConditionSet::Conditions::const_iterator condition_iter = 1028 conditions.begin(); 1029 StringPattern::ID trigger = condition_iter->string_pattern()->id(); 1030 // We skip the first element in the following loop. 1031 ++condition_iter; 1032 for (; condition_iter != conditions.end(); ++condition_iter) { 1033 StringPattern::ID current_id = 1034 condition_iter->string_pattern()->id(); 1035 if (substring_pattern_frequencies[trigger] > 1036 substring_pattern_frequencies[current_id]) { 1037 trigger = current_id; 1038 } 1039 } 1040 1041 const URLMatcherConditionSet::QueryConditions& query_conditions = 1042 condition_set_iter->second->query_conditions(); 1043 for (URLMatcherConditionSet::QueryConditions::const_iterator 1044 query_condition_iter = query_conditions.begin(); 1045 query_condition_iter != query_conditions.end(); 1046 ++query_condition_iter) { 1047 StringPattern::ID current_id = 1048 query_condition_iter->string_pattern()->id(); 1049 if (substring_pattern_frequencies[trigger] > 1050 substring_pattern_frequencies[current_id]) { 1051 trigger = current_id; 1052 } 1053 } 1054 1055 substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); 1056 } 1057 } 1058 1059 void URLMatcher::UpdateConditionFactory() { 1060 std::set<StringPattern::ID> used_patterns; 1061 for (URLMatcherConditionSets::const_iterator condition_set_iter = 1062 url_matcher_condition_sets_.begin(); 1063 condition_set_iter != url_matcher_condition_sets_.end(); 1064 ++condition_set_iter) { 1065 const URLMatcherConditionSet::Conditions& conditions = 1066 condition_set_iter->second->conditions(); 1067 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 1068 conditions.begin(); condition_iter != conditions.end(); 1069 ++condition_iter) { 1070 used_patterns.insert(condition_iter->string_pattern()->id()); 1071 } 1072 const URLMatcherConditionSet::QueryConditions& query_conditions = 1073 condition_set_iter->second->query_conditions(); 1074 for (URLMatcherConditionSet::QueryConditions::const_iterator 1075 query_condition_iter = query_conditions.begin(); 1076 query_condition_iter != query_conditions.end(); 1077 ++query_condition_iter) { 1078 used_patterns.insert(query_condition_iter->string_pattern()->id()); 1079 } 1080 } 1081 condition_factory_.ForgetUnusedPatterns(used_patterns); 1082 } 1083 1084 void URLMatcher::UpdateInternalDatastructures() { 1085 // TODO(vadimt): Remove ScopedProfile below once crbug.com/417106 is fixed. 1086 tracked_objects::ScopedProfile tracking_profile( 1087 FROM_HERE_WITH_EXPLICIT_FUNCTION( 1088 "URLMatcher_UpdateInternalDatastructures")); 1089 UpdateSubstringSetMatcher(false); 1090 UpdateSubstringSetMatcher(true); 1091 UpdateRegexSetMatcher(); 1092 UpdateTriggers(); 1093 UpdateConditionFactory(); 1094 } 1095 1096 } // namespace url_matcher 1097