1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/url_matcher/url_matcher.h" 6 7 #include <algorithm> 8 #include <iterator> 9 10 #include "base/logging.h" 11 #include "url/gurl.h" 12 #include "url/url_canon.h" 13 14 namespace url_matcher { 15 16 // This set of classes implement a mapping of URL Component Patterns, such as 17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns 18 // for use in substring comparisons. 19 // 20 // The idea of this mapping is to reduce the problem of comparing many 21 // URL Component Patterns against one URL to the problem of searching many 22 // substrings in one string: 23 // 24 // ---------------------- ----------------- 25 // | URL Query operator | ----translate----> | StringPattern | 26 // ---------------------- ----------------- 27 // ^ 28 // | 29 // compare 30 // | 31 // v 32 // ---------------------- ----------------- 33 // | URL to compare | | | 34 // | to all URL Query | ----translate----> | String | 35 // | operators | | | 36 // ---------------------- ----------------- 37 // 38 // The reason for this problem reduction is that there are efficient algorithms 39 // for searching many substrings in one string (see Aho-Corasick algorithm). 40 // 41 // Additionally, some of the same pieces are reused to implement regular 42 // expression comparisons. The FilteredRE2 implementation for matching many 43 // regular expressions against one string uses prefiltering, in which a set 44 // of substrings (derived from the regexes) are first searched for, to reduce 45 // the number of regular expressions to test; the prefiltering step also 46 // uses Aho-Corasick. 47 // 48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches. 49 // ========================================================== 50 // 51 // For searches in this class, we normalize URLs as follows: 52 // 53 // Step 1: 54 // Remove scheme, port and segment from URL: 55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 56 // www.example.com/index.html?search=foo 57 // 58 // We remove the scheme and port number because they can be checked later 59 // in a secondary filter step. We remove the segment (the #... part) because 60 // this is not guaranteed to be ASCII-7 encoded. 61 // 62 // Step 2: 63 // Translate URL to String and add the following position markers: 64 // - BU = Beginning of URL 65 // - ED = End of Domain 66 // - EP = End of Path 67 // - EU = End of URL 68 // Furthermore, the hostname is canonicalized to start with a ".". 69 // 70 // Position markers are represented as characters >127, which are therefore 71 // guaranteed not to be part of the ASCII-7 encoded URL character set. 72 // 73 // -> www.example.com/index.html?search=foo becomes 74 // BU .www.example.com ED /index.html EP ?search=foo EU 75 // 76 // -> www.example.com/index.html becomes 77 // BU .www.example.com ED /index.html EP EU 78 // 79 // Step 3: 80 // Translate URL Component Patterns as follows: 81 // 82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) 83 // -> host_prefix("www.example") = BU .www.example 84 // 85 // host_suffix(suffix) = suffix ED 86 // -> host_suffix("example.com") = example.com ED 87 // -> host_suffix(".example.com") = .example.com ED 88 // 89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED 90 // -> host_equals("www.example.com") = BU .www.example.com ED 91 // 92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). 93 // 94 // With this, we can search the StringPatterns in the normalized URL. 95 // 96 // 97 // Case 2: url_{prefix,suffix,equals,contains} searches. 98 // ===================================================== 99 // 100 // Step 1: as above, except that 101 // - the scheme is not removed 102 // - the port is not removed if it is specified and does not match the default 103 // port for the given scheme. 104 // 105 // Step 2: 106 // Translate URL to String and add the following position markers: 107 // - BU = Beginning of URL 108 // - EU = End of URL 109 // 110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 111 // BU http://www.example.com:8080/index.html?search=foo EU 112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes 113 // BU http://www.example.com/index.html?search=foo EU 114 // 115 // url_prefix(prefix) = BU prefix 116 // -> url_prefix("http://www.example") = BU http://www.example 117 // 118 // url_contains(substring) = substring 119 // -> url_contains("index") = index 120 // 121 // 122 // Case 3: {host,path,query}_contains searches. 123 // ============================================ 124 // 125 // These kinds of searches are not supported directly but can be derived 126 // by a combination of a url_contains() query followed by an explicit test: 127 // 128 // host_contains(str) = url_contains(str) followed by test whether str occurs 129 // in host component of original URL. 130 // -> host_contains("example.co") = example.co 131 // followed by gurl.host().find("example.co"); 132 // 133 // [similarly for path_contains and query_contains]. 134 // 135 // 136 // Regular expression matching (url_matches searches) 137 // ================================================== 138 // 139 // This class also supports matching regular expressions (RE2 syntax) 140 // against full URLs, which are transformed as in case 2. 141 142 namespace { 143 144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { 145 return criterion == URLMatcherCondition::URL_MATCHES; 146 } 147 148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { 149 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; 150 } 151 152 } // namespace 153 154 // 155 // URLMatcherCondition 156 // 157 158 URLMatcherCondition::URLMatcherCondition() 159 : criterion_(HOST_PREFIX), 160 string_pattern_(NULL) {} 161 162 URLMatcherCondition::~URLMatcherCondition() {} 163 164 URLMatcherCondition::URLMatcherCondition( 165 Criterion criterion, 166 const StringPattern* string_pattern) 167 : criterion_(criterion), 168 string_pattern_(string_pattern) {} 169 170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) 171 : criterion_(rhs.criterion_), 172 string_pattern_(rhs.string_pattern_) {} 173 174 URLMatcherCondition& URLMatcherCondition::operator=( 175 const URLMatcherCondition& rhs) { 176 criterion_ = rhs.criterion_; 177 string_pattern_ = rhs.string_pattern_; 178 return *this; 179 } 180 181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { 182 if (criterion_ < rhs.criterion_) return true; 183 if (criterion_ > rhs.criterion_) return false; 184 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 185 return *string_pattern_ < *rhs.string_pattern_; 186 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; 187 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 188 // or both are NULL. 189 return false; 190 } 191 192 bool URLMatcherCondition::IsFullURLCondition() const { 193 // For these criteria the SubstringMatcher needs to be executed on the 194 // GURL that is canonicalized with 195 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. 196 switch (criterion_) { 197 case HOST_CONTAINS: 198 case PATH_CONTAINS: 199 case QUERY_CONTAINS: 200 case URL_PREFIX: 201 case URL_SUFFIX: 202 case URL_CONTAINS: 203 case URL_EQUALS: 204 return true; 205 default: 206 break; 207 } 208 return false; 209 } 210 211 bool URLMatcherCondition::IsRegexCondition() const { 212 return IsRegexCriterion(criterion_); 213 } 214 215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { 216 return IsOriginAndPathRegexCriterion(criterion_); 217 } 218 219 bool URLMatcherCondition::IsMatch( 220 const std::set<StringPattern::ID>& matching_patterns, 221 const GURL& url) const { 222 DCHECK(string_pattern_); 223 if (!ContainsKey(matching_patterns, string_pattern_->id())) 224 return false; 225 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on 226 // a substring match on the raw URL. In case of a match, we need to verify 227 // that the match was found in the correct component of the URL. 228 switch (criterion_) { 229 case HOST_CONTAINS: 230 return url.host().find(string_pattern_->pattern()) != 231 std::string::npos; 232 case PATH_CONTAINS: 233 return url.path().find(string_pattern_->pattern()) != 234 std::string::npos; 235 case QUERY_CONTAINS: 236 return url.query().find(string_pattern_->pattern()) != 237 std::string::npos; 238 default: 239 break; 240 } 241 return true; 242 } 243 244 // 245 // URLMatcherConditionFactory 246 // 247 248 namespace { 249 // These are symbols that are not contained in 7-bit ASCII used in GURLs. 250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; 251 const char kEndOfDomain[] = {static_cast<char>(-2), 0}; 252 const char kEndOfPath[] = {static_cast<char>(-3), 0}; 253 const char kEndOfURL[] = {static_cast<char>(-4), 0}; 254 } // namespace 255 256 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} 257 258 URLMatcherConditionFactory::~URLMatcherConditionFactory() { 259 STLDeleteElements(&substring_pattern_singletons_); 260 STLDeleteElements(®ex_pattern_singletons_); 261 STLDeleteElements(&origin_and_path_regex_pattern_singletons_); 262 } 263 264 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( 265 const GURL& url) const { 266 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + 267 url.path() + kEndOfPath + 268 (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL; 269 } 270 271 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( 272 const std::string& prefix) { 273 return CreateCondition(URLMatcherCondition::HOST_PREFIX, 274 kBeginningOfURL + CanonicalizeHostname(prefix)); 275 } 276 277 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( 278 const std::string& suffix) { 279 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, 280 suffix + kEndOfDomain); 281 } 282 283 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( 284 const std::string& str) { 285 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); 286 } 287 288 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( 289 const std::string& str) { 290 return CreateCondition(URLMatcherCondition::HOST_EQUALS, 291 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); 292 } 293 294 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( 295 const std::string& prefix) { 296 return CreateCondition(URLMatcherCondition::PATH_PREFIX, 297 kEndOfDomain + prefix); 298 } 299 300 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( 301 const std::string& suffix) { 302 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); 303 } 304 305 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( 306 const std::string& str) { 307 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); 308 } 309 310 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( 311 const std::string& str) { 312 return CreateCondition(URLMatcherCondition::PATH_EQUALS, 313 kEndOfDomain + str + kEndOfPath); 314 } 315 316 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( 317 const std::string& prefix) { 318 std::string pattern; 319 if (!prefix.empty() && prefix[0] == '?') 320 pattern = kEndOfPath + prefix; 321 else 322 pattern = kEndOfPath + ('?' + prefix); 323 324 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); 325 } 326 327 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( 328 const std::string& suffix) { 329 if (!suffix.empty() && suffix[0] == '?') { 330 return CreateQueryEqualsCondition(suffix); 331 } else { 332 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, 333 suffix + kEndOfURL); 334 } 335 } 336 337 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( 338 const std::string& str) { 339 if (!str.empty() && str[0] == '?') 340 return CreateQueryPrefixCondition(str); 341 else 342 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); 343 } 344 345 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( 346 const std::string& str) { 347 std::string pattern; 348 if (!str.empty() && str[0] == '?') 349 pattern = kEndOfPath + str + kEndOfURL; 350 else 351 pattern = kEndOfPath + ('?' + str) + kEndOfURL; 352 353 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); 354 } 355 356 URLMatcherCondition 357 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( 358 const std::string& host_suffix, 359 const std::string& path_prefix) { 360 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, 361 host_suffix + kEndOfDomain + path_prefix); 362 } 363 364 URLMatcherCondition 365 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( 366 const std::string& host, 367 const std::string& path_prefix) { 368 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, 369 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + 370 path_prefix); 371 } 372 373 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( 374 const GURL& url) const { 375 GURL::Replacements replacements; 376 replacements.ClearPassword(); 377 replacements.ClearUsername(); 378 replacements.ClearRef(); 379 // Clear port if it is implicit from scheme. 380 if (url.has_port()) { 381 const std::string& port = url.scheme(); 382 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == 383 url.EffectiveIntPort()) { 384 replacements.ClearPort(); 385 } 386 } 387 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + 388 kEndOfURL; 389 } 390 391 static std::string CanonicalizeURLForRegexSearchesHelper( 392 const GURL& url, 393 bool clear_query) { 394 GURL::Replacements replacements; 395 replacements.ClearPassword(); 396 replacements.ClearUsername(); 397 replacements.ClearRef(); 398 if (clear_query) 399 replacements.ClearQuery(); 400 // Clear port if it is implicit from scheme. 401 if (url.has_port()) { 402 const std::string& port = url.scheme(); 403 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == 404 url.EffectiveIntPort()) { 405 replacements.ClearPort(); 406 } 407 } 408 return url.ReplaceComponents(replacements).spec(); 409 } 410 411 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( 412 const GURL& url) const { 413 return CanonicalizeURLForRegexSearchesHelper(url, false); 414 } 415 416 std::string 417 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( 418 const GURL& url) const { 419 return CanonicalizeURLForRegexSearchesHelper(url, true); 420 } 421 422 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( 423 const std::string& prefix) { 424 return CreateCondition(URLMatcherCondition::URL_PREFIX, 425 kBeginningOfURL + prefix); 426 } 427 428 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( 429 const std::string& suffix) { 430 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); 431 } 432 433 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( 434 const std::string& str) { 435 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); 436 } 437 438 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( 439 const std::string& str) { 440 return CreateCondition(URLMatcherCondition::URL_EQUALS, 441 kBeginningOfURL + str + kEndOfURL); 442 } 443 444 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( 445 const std::string& regex) { 446 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); 447 } 448 449 URLMatcherCondition 450 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( 451 const std::string& regex) { 452 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); 453 } 454 455 void URLMatcherConditionFactory::ForgetUnusedPatterns( 456 const std::set<StringPattern::ID>& used_patterns) { 457 PatternSingletons::iterator i = substring_pattern_singletons_.begin(); 458 while (i != substring_pattern_singletons_.end()) { 459 if (ContainsKey(used_patterns, (*i)->id())) { 460 ++i; 461 } else { 462 delete *i; 463 substring_pattern_singletons_.erase(i++); 464 } 465 } 466 i = regex_pattern_singletons_.begin(); 467 while (i != regex_pattern_singletons_.end()) { 468 if (ContainsKey(used_patterns, (*i)->id())) { 469 ++i; 470 } else { 471 delete *i; 472 regex_pattern_singletons_.erase(i++); 473 } 474 } 475 i = origin_and_path_regex_pattern_singletons_.begin(); 476 while (i != origin_and_path_regex_pattern_singletons_.end()) { 477 if (ContainsKey(used_patterns, (*i)->id())) { 478 ++i; 479 } else { 480 delete *i; 481 origin_and_path_regex_pattern_singletons_.erase(i++); 482 } 483 } 484 } 485 486 bool URLMatcherConditionFactory::IsEmpty() const { 487 return substring_pattern_singletons_.empty() && 488 regex_pattern_singletons_.empty() && 489 origin_and_path_regex_pattern_singletons_.empty(); 490 } 491 492 URLMatcherCondition URLMatcherConditionFactory::CreateCondition( 493 URLMatcherCondition::Criterion criterion, 494 const std::string& pattern) { 495 StringPattern search_pattern(pattern, 0); 496 PatternSingletons* pattern_singletons = NULL; 497 if (IsRegexCriterion(criterion)) 498 pattern_singletons = ®ex_pattern_singletons_; 499 else if (IsOriginAndPathRegexCriterion(criterion)) 500 pattern_singletons = &origin_and_path_regex_pattern_singletons_; 501 else 502 pattern_singletons = &substring_pattern_singletons_; 503 504 PatternSingletons::const_iterator iter = 505 pattern_singletons->find(&search_pattern); 506 507 if (iter != pattern_singletons->end()) { 508 return URLMatcherCondition(criterion, *iter); 509 } else { 510 StringPattern* new_pattern = 511 new StringPattern(pattern, id_counter_++); 512 pattern_singletons->insert(new_pattern); 513 return URLMatcherCondition(criterion, new_pattern); 514 } 515 } 516 517 std::string URLMatcherConditionFactory::CanonicalizeHostname( 518 const std::string& hostname) const { 519 if (!hostname.empty() && hostname[0] == '.') 520 return hostname; 521 else 522 return "." + hostname; 523 } 524 525 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( 526 StringPattern* lhs, 527 StringPattern* rhs) const { 528 if (lhs == NULL && rhs != NULL) return true; 529 if (lhs != NULL && rhs != NULL) 530 return lhs->pattern() < rhs->pattern(); 531 // Either both are NULL or only rhs is NULL. 532 return false; 533 } 534 535 // 536 // URLMatcherSchemeFilter 537 // 538 539 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) 540 : filters_(1) { 541 filters_.push_back(filter); 542 } 543 544 URLMatcherSchemeFilter::URLMatcherSchemeFilter( 545 const std::vector<std::string>& filters) 546 : filters_(filters) {} 547 548 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} 549 550 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { 551 return std::find(filters_.begin(), filters_.end(), url.scheme()) != 552 filters_.end(); 553 } 554 555 // 556 // URLMatcherPortFilter 557 // 558 559 URLMatcherPortFilter::URLMatcherPortFilter( 560 const std::vector<URLMatcherPortFilter::Range>& ranges) 561 : ranges_(ranges) {} 562 563 URLMatcherPortFilter::~URLMatcherPortFilter() {} 564 565 bool URLMatcherPortFilter::IsMatch(const GURL& url) const { 566 int port = url.EffectiveIntPort(); 567 for (std::vector<Range>::const_iterator i = ranges_.begin(); 568 i != ranges_.end(); ++i) { 569 if (i->first <= port && port <= i->second) 570 return true; 571 } 572 return false; 573 } 574 575 // static 576 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, 577 int to) { 578 return Range(from, to); 579 } 580 581 // static 582 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { 583 return Range(port, port); 584 } 585 586 // 587 // URLMatcherConditionSet 588 // 589 590 URLMatcherConditionSet::~URLMatcherConditionSet() {} 591 592 URLMatcherConditionSet::URLMatcherConditionSet( 593 ID id, 594 const Conditions& conditions) 595 : id_(id), 596 conditions_(conditions) {} 597 598 URLMatcherConditionSet::URLMatcherConditionSet( 599 ID id, 600 const Conditions& conditions, 601 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 602 scoped_ptr<URLMatcherPortFilter> port_filter) 603 : id_(id), 604 conditions_(conditions), 605 scheme_filter_(scheme_filter.Pass()), 606 port_filter_(port_filter.Pass()) {} 607 608 bool URLMatcherConditionSet::IsMatch( 609 const std::set<StringPattern::ID>& matching_patterns, 610 const GURL& url) const { 611 for (Conditions::const_iterator i = conditions_.begin(); 612 i != conditions_.end(); ++i) { 613 if (!i->IsMatch(matching_patterns, url)) 614 return false; 615 } 616 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) 617 return false; 618 if (port_filter_.get() && !port_filter_->IsMatch(url)) 619 return false; 620 return true; 621 } 622 623 // 624 // URLMatcher 625 // 626 627 URLMatcher::URLMatcher() {} 628 629 URLMatcher::~URLMatcher() {} 630 631 void URLMatcher::AddConditionSets( 632 const URLMatcherConditionSet::Vector& condition_sets) { 633 for (URLMatcherConditionSet::Vector::const_iterator i = 634 condition_sets.begin(); i != condition_sets.end(); ++i) { 635 DCHECK(url_matcher_condition_sets_.find((*i)->id()) == 636 url_matcher_condition_sets_.end()); 637 url_matcher_condition_sets_[(*i)->id()] = *i; 638 } 639 UpdateInternalDatastructures(); 640 } 641 642 void URLMatcher::RemoveConditionSets( 643 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { 644 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = 645 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { 646 DCHECK(url_matcher_condition_sets_.find(*i) != 647 url_matcher_condition_sets_.end()); 648 url_matcher_condition_sets_.erase(*i); 649 } 650 UpdateInternalDatastructures(); 651 } 652 653 void URLMatcher::ClearUnusedConditionSets() { 654 UpdateConditionFactory(); 655 } 656 657 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( 658 const GURL& url) const { 659 // Find all IDs of StringPatterns that match |url|. 660 // See URLMatcherConditionFactory for the canonicalization of URLs and the 661 // distinction between full url searches and url component searches. 662 std::set<StringPattern::ID> matches; 663 if (!full_url_matcher_.IsEmpty()) { 664 full_url_matcher_.Match( 665 condition_factory_.CanonicalizeURLForFullSearches(url), &matches); 666 } 667 if (!url_component_matcher_.IsEmpty()) { 668 url_component_matcher_.Match( 669 condition_factory_.CanonicalizeURLForComponentSearches(url), &matches); 670 } 671 if (!regex_set_matcher_.IsEmpty()) { 672 regex_set_matcher_.Match( 673 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); 674 } 675 if (!origin_and_path_regex_set_matcher_.IsEmpty()) { 676 origin_and_path_regex_set_matcher_.Match( 677 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), 678 &matches); 679 } 680 681 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions 682 // were fulfilled. 683 std::set<URLMatcherConditionSet::ID> result; 684 for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); 685 i != matches.end(); ++i) { 686 // For each URLMatcherConditionSet there is exactly one condition 687 // registered in substring_match_triggers_. This means that the following 688 // logic tests each URLMatcherConditionSet exactly once if it can be 689 // completely fulfilled. 690 StringPatternTriggers::const_iterator triggered_condition_sets_iter = 691 substring_match_triggers_.find(*i); 692 if (triggered_condition_sets_iter == substring_match_triggers_.end()) 693 continue; // Not all substring matches are triggers for a condition set. 694 const std::set<URLMatcherConditionSet::ID>& condition_sets = 695 triggered_condition_sets_iter->second; 696 for (std::set<URLMatcherConditionSet::ID>::const_iterator j = 697 condition_sets.begin(); j != condition_sets.end(); ++j) { 698 URLMatcherConditionSets::const_iterator condition_set_iter = 699 url_matcher_condition_sets_.find(*j); 700 DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); 701 if (condition_set_iter->second->IsMatch(matches, url)) 702 result.insert(*j); 703 } 704 } 705 706 return result; 707 } 708 709 bool URLMatcher::IsEmpty() const { 710 return condition_factory_.IsEmpty() && 711 url_matcher_condition_sets_.empty() && 712 substring_match_triggers_.empty() && 713 full_url_matcher_.IsEmpty() && 714 url_component_matcher_.IsEmpty() && 715 regex_set_matcher_.IsEmpty() && 716 origin_and_path_regex_set_matcher_.IsEmpty() && 717 registered_full_url_patterns_.empty() && 718 registered_url_component_patterns_.empty(); 719 } 720 721 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { 722 // The purpose of |full_url_conditions| is just that we need to execute 723 // the same logic once for Full URL searches and once for URL Component 724 // searches (see URLMatcherConditionFactory). 725 726 // Determine which patterns need to be registered when this function 727 // terminates. 728 std::set<const StringPattern*> new_patterns; 729 for (URLMatcherConditionSets::const_iterator condition_set_iter = 730 url_matcher_condition_sets_.begin(); 731 condition_set_iter != url_matcher_condition_sets_.end(); 732 ++condition_set_iter) { 733 const URLMatcherConditionSet::Conditions& conditions = 734 condition_set_iter->second->conditions(); 735 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 736 conditions.begin(); condition_iter != conditions.end(); 737 ++condition_iter) { 738 // If we are called to process Full URL searches, ignore others, and 739 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) 740 if (!condition_iter->IsRegexCondition() && 741 !condition_iter->IsOriginAndPathRegexCondition() && 742 full_url_conditions == condition_iter->IsFullURLCondition()) 743 new_patterns.insert(condition_iter->string_pattern()); 744 } 745 } 746 747 // This is the set of patterns that were registered before this function 748 // is called. 749 std::set<const StringPattern*>& registered_patterns = 750 full_url_conditions ? registered_full_url_patterns_ 751 : registered_url_component_patterns_; 752 753 // Add all patterns that are in new_patterns but not in registered_patterns. 754 std::vector<const StringPattern*> patterns_to_register = 755 base::STLSetDifference<std::vector<const StringPattern*> >( 756 new_patterns, registered_patterns); 757 758 // Remove all patterns that are in registered_patterns but not in 759 // new_patterns. 760 std::vector<const StringPattern*> patterns_to_unregister = 761 base::STLSetDifference<std::vector<const StringPattern*> >( 762 registered_patterns, new_patterns); 763 764 // Update the SubstringSetMatcher. 765 SubstringSetMatcher& url_matcher = 766 full_url_conditions ? full_url_matcher_ : url_component_matcher_; 767 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, 768 patterns_to_unregister); 769 770 // Update the set of registered_patterns for the next time this function 771 // is being called. 772 registered_patterns.swap(new_patterns); 773 } 774 775 void URLMatcher::UpdateRegexSetMatcher() { 776 std::vector<const StringPattern*> new_patterns; 777 std::vector<const StringPattern*> new_origin_and_path_patterns; 778 779 for (URLMatcherConditionSets::const_iterator condition_set_iter = 780 url_matcher_condition_sets_.begin(); 781 condition_set_iter != url_matcher_condition_sets_.end(); 782 ++condition_set_iter) { 783 const URLMatcherConditionSet::Conditions& conditions = 784 condition_set_iter->second->conditions(); 785 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 786 conditions.begin(); condition_iter != conditions.end(); 787 ++condition_iter) { 788 if (condition_iter->IsRegexCondition()) { 789 new_patterns.push_back(condition_iter->string_pattern()); 790 } else if (condition_iter->IsOriginAndPathRegexCondition()) { 791 new_origin_and_path_patterns.push_back( 792 condition_iter->string_pattern()); 793 } 794 } 795 } 796 797 // Start over from scratch. We can't really do better than this, since the 798 // FilteredRE2 backend doesn't support incremental updates. 799 regex_set_matcher_.ClearPatterns(); 800 regex_set_matcher_.AddPatterns(new_patterns); 801 origin_and_path_regex_set_matcher_.ClearPatterns(); 802 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); 803 } 804 805 void URLMatcher::UpdateTriggers() { 806 // Count substring pattern frequencies. 807 std::map<StringPattern::ID, size_t> substring_pattern_frequencies; 808 for (URLMatcherConditionSets::const_iterator condition_set_iter = 809 url_matcher_condition_sets_.begin(); 810 condition_set_iter != url_matcher_condition_sets_.end(); 811 ++condition_set_iter) { 812 const URLMatcherConditionSet::Conditions& conditions = 813 condition_set_iter->second->conditions(); 814 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 815 conditions.begin(); condition_iter != conditions.end(); 816 ++condition_iter) { 817 const StringPattern* pattern = condition_iter->string_pattern(); 818 substring_pattern_frequencies[pattern->id()]++; 819 } 820 } 821 822 // Update trigger conditions: Determine for each URLMatcherConditionSet which 823 // URLMatcherCondition contains a StringPattern that occurs least 824 // frequently in this URLMatcher. We assume that this condition is very 825 // specific and occurs rarely in URLs. If a match occurs for this 826 // URLMatcherCondition, we want to test all other URLMatcherCondition in the 827 // respective URLMatcherConditionSet as well to see whether the entire 828 // URLMatcherConditionSet is considered matching. 829 substring_match_triggers_.clear(); 830 for (URLMatcherConditionSets::const_iterator condition_set_iter = 831 url_matcher_condition_sets_.begin(); 832 condition_set_iter != url_matcher_condition_sets_.end(); 833 ++condition_set_iter) { 834 const URLMatcherConditionSet::Conditions& conditions = 835 condition_set_iter->second->conditions(); 836 if (conditions.empty()) 837 continue; 838 URLMatcherConditionSet::Conditions::const_iterator condition_iter = 839 conditions.begin(); 840 StringPattern::ID trigger = condition_iter->string_pattern()->id(); 841 // We skip the first element in the following loop. 842 ++condition_iter; 843 for (; condition_iter != conditions.end(); ++condition_iter) { 844 StringPattern::ID current_id = 845 condition_iter->string_pattern()->id(); 846 if (substring_pattern_frequencies[trigger] > 847 substring_pattern_frequencies[current_id]) { 848 trigger = current_id; 849 } 850 } 851 substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); 852 } 853 } 854 855 void URLMatcher::UpdateConditionFactory() { 856 std::set<StringPattern::ID> used_patterns; 857 for (URLMatcherConditionSets::const_iterator condition_set_iter = 858 url_matcher_condition_sets_.begin(); 859 condition_set_iter != url_matcher_condition_sets_.end(); 860 ++condition_set_iter) { 861 const URLMatcherConditionSet::Conditions& conditions = 862 condition_set_iter->second->conditions(); 863 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 864 conditions.begin(); condition_iter != conditions.end(); 865 ++condition_iter) { 866 used_patterns.insert(condition_iter->string_pattern()->id()); 867 } 868 } 869 condition_factory_.ForgetUnusedPatterns(used_patterns); 870 } 871 872 void URLMatcher::UpdateInternalDatastructures() { 873 UpdateSubstringSetMatcher(false); 874 UpdateSubstringSetMatcher(true); 875 UpdateRegexSetMatcher(); 876 UpdateTriggers(); 877 UpdateConditionFactory(); 878 } 879 880 } // namespace url_matcher 881