1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "extensions/common/matcher/url_matcher.h" 6 7 #include <algorithm> 8 #include <iterator> 9 10 #include "base/logging.h" 11 #include "content/public/common/url_constants.h" 12 #include "url/gurl.h" 13 #include "url/url_canon.h" 14 15 namespace extensions { 16 17 // This set of classes implement a mapping of URL Component Patterns, such as 18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns 19 // for use in substring comparisons. 20 // 21 // The idea of this mapping is to reduce the problem of comparing many 22 // URL Component Patterns against one URL to the problem of searching many 23 // substrings in one string: 24 // 25 // ---------------------- ----------------- 26 // | URL Query operator | ----translate----> | StringPattern | 27 // ---------------------- ----------------- 28 // ^ 29 // | 30 // compare 31 // | 32 // v 33 // ---------------------- ----------------- 34 // | URL to compare | | | 35 // | to all URL Query | ----translate----> | String | 36 // | operators | | | 37 // ---------------------- ----------------- 38 // 39 // The reason for this problem reduction is that there are efficient algorithms 40 // for searching many substrings in one string (see Aho-Corasick algorithm). 41 // 42 // Additionally, some of the same pieces are reused to implement regular 43 // expression comparisons. The FilteredRE2 implementation for matching many 44 // regular expressions against one string uses prefiltering, in which a set 45 // of substrings (derived from the regexes) are first searched for, to reduce 46 // the number of regular expressions to test; the prefiltering step also 47 // uses Aho-Corasick. 48 // 49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches. 50 // ========================================================== 51 // 52 // For searches in this class, we normalize URLs as follows: 53 // 54 // Step 1: 55 // Remove scheme, port and segment from URL: 56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 57 // www.example.com/index.html?search=foo 58 // 59 // We remove the scheme and port number because they can be checked later 60 // in a secondary filter step. We remove the segment (the #... part) because 61 // this is not guaranteed to be ASCII-7 encoded. 62 // 63 // Step 2: 64 // Translate URL to String and add the following position markers: 65 // - BU = Beginning of URL 66 // - ED = End of Domain 67 // - EP = End of Path 68 // - EU = End of URL 69 // Furthermore, the hostname is canonicalized to start with a ".". 70 // 71 // Position markers are represented as characters >127, which are therefore 72 // guaranteed not to be part of the ASCII-7 encoded URL character set. 73 // 74 // -> www.example.com/index.html?search=foo becomes 75 // BU .www.example.com ED /index.html EP ?search=foo EU 76 // 77 // -> www.example.com/index.html becomes 78 // BU .www.example.com ED /index.html EP EU 79 // 80 // Step 3: 81 // Translate URL Component Patterns as follows: 82 // 83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) 84 // -> host_prefix("www.example") = BU .www.example 85 // 86 // host_suffix(suffix) = suffix ED 87 // -> host_suffix("example.com") = example.com ED 88 // -> host_suffix(".example.com") = .example.com ED 89 // 90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED 91 // -> host_equals("www.example.com") = BU .www.example.com ED 92 // 93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). 94 // 95 // With this, we can search the StringPatterns in the normalized URL. 96 // 97 // 98 // Case 2: url_{prefix,suffix,equals,contains} searches. 99 // ===================================================== 100 // 101 // Step 1: as above, except that 102 // - the scheme is not removed 103 // - the port is not removed if it is specified and does not match the default 104 // port for the given scheme. 105 // 106 // Step 2: 107 // Translate URL to String and add the following position markers: 108 // - BU = Beginning of URL 109 // - EU = End of URL 110 // 111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes 112 // BU http://www.example.com:8080/index.html?search=foo EU 113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes 114 // BU http://www.example.com/index.html?search=foo EU 115 // 116 // url_prefix(prefix) = BU prefix 117 // -> url_prefix("http://www.example") = BU http://www.example 118 // 119 // url_contains(substring) = substring 120 // -> url_contains("index") = index 121 // 122 // 123 // Case 3: {host,path,query}_contains searches. 124 // ============================================ 125 // 126 // These kinds of searches are not supported directly but can be derived 127 // by a combination of a url_contains() query followed by an explicit test: 128 // 129 // host_contains(str) = url_contains(str) followed by test whether str occurs 130 // in host component of original URL. 131 // -> host_contains("example.co") = example.co 132 // followed by gurl.host().find("example.co"); 133 // 134 // [similarly for path_contains and query_contains]. 135 // 136 // 137 // Regular expression matching (url_matches searches) 138 // ================================================== 139 // 140 // This class also supports matching regular expressions (RE2 syntax) 141 // against full URLs, which are transformed as in case 2. 142 143 namespace { 144 145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { 146 return criterion == URLMatcherCondition::URL_MATCHES; 147 } 148 149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { 150 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; 151 } 152 153 } // namespace 154 155 // 156 // URLMatcherCondition 157 // 158 159 URLMatcherCondition::URLMatcherCondition() 160 : criterion_(HOST_PREFIX), 161 string_pattern_(NULL) {} 162 163 URLMatcherCondition::~URLMatcherCondition() {} 164 165 URLMatcherCondition::URLMatcherCondition( 166 Criterion criterion, 167 const StringPattern* string_pattern) 168 : criterion_(criterion), 169 string_pattern_(string_pattern) {} 170 171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) 172 : criterion_(rhs.criterion_), 173 string_pattern_(rhs.string_pattern_) {} 174 175 URLMatcherCondition& URLMatcherCondition::operator=( 176 const URLMatcherCondition& rhs) { 177 criterion_ = rhs.criterion_; 178 string_pattern_ = rhs.string_pattern_; 179 return *this; 180 } 181 182 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { 183 if (criterion_ < rhs.criterion_) return true; 184 if (criterion_ > rhs.criterion_) return false; 185 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 186 return *string_pattern_ < *rhs.string_pattern_; 187 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; 188 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 189 // or both are NULL. 190 return false; 191 } 192 193 bool URLMatcherCondition::IsFullURLCondition() const { 194 // For these criteria the SubstringMatcher needs to be executed on the 195 // GURL that is canonicalized with 196 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. 197 switch (criterion_) { 198 case HOST_CONTAINS: 199 case PATH_CONTAINS: 200 case QUERY_CONTAINS: 201 case URL_PREFIX: 202 case URL_SUFFIX: 203 case URL_CONTAINS: 204 case URL_EQUALS: 205 return true; 206 default: 207 break; 208 } 209 return false; 210 } 211 212 bool URLMatcherCondition::IsRegexCondition() const { 213 return IsRegexCriterion(criterion_); 214 } 215 216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { 217 return IsOriginAndPathRegexCriterion(criterion_); 218 } 219 220 bool URLMatcherCondition::IsMatch( 221 const std::set<StringPattern::ID>& matching_patterns, 222 const GURL& url) const { 223 DCHECK(string_pattern_); 224 if (!ContainsKey(matching_patterns, string_pattern_->id())) 225 return false; 226 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on 227 // a substring match on the raw URL. In case of a match, we need to verify 228 // that the match was found in the correct component of the URL. 229 switch (criterion_) { 230 case HOST_CONTAINS: 231 return url.host().find(string_pattern_->pattern()) != 232 std::string::npos; 233 case PATH_CONTAINS: 234 return url.path().find(string_pattern_->pattern()) != 235 std::string::npos; 236 case QUERY_CONTAINS: 237 return url.query().find(string_pattern_->pattern()) != 238 std::string::npos; 239 default: 240 break; 241 } 242 return true; 243 } 244 245 // 246 // URLMatcherConditionFactory 247 // 248 249 namespace { 250 // These are symbols that are not contained in 7-bit ASCII used in GURLs. 251 const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; 252 const char kEndOfDomain[] = {static_cast<char>(-2), 0}; 253 const char kEndOfPath[] = {static_cast<char>(-3), 0}; 254 const char kEndOfURL[] = {static_cast<char>(-4), 0}; 255 } // namespace 256 257 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} 258 259 URLMatcherConditionFactory::~URLMatcherConditionFactory() { 260 STLDeleteElements(&substring_pattern_singletons_); 261 STLDeleteElements(®ex_pattern_singletons_); 262 STLDeleteElements(&origin_and_path_regex_pattern_singletons_); 263 } 264 265 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( 266 const GURL& url) const { 267 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + 268 url.path() + kEndOfPath + 269 (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL; 270 } 271 272 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( 273 const std::string& prefix) { 274 return CreateCondition(URLMatcherCondition::HOST_PREFIX, 275 kBeginningOfURL + CanonicalizeHostname(prefix)); 276 } 277 278 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( 279 const std::string& suffix) { 280 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, 281 suffix + kEndOfDomain); 282 } 283 284 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( 285 const std::string& str) { 286 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); 287 } 288 289 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( 290 const std::string& str) { 291 return CreateCondition(URLMatcherCondition::HOST_EQUALS, 292 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); 293 } 294 295 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( 296 const std::string& prefix) { 297 return CreateCondition(URLMatcherCondition::PATH_PREFIX, 298 kEndOfDomain + prefix); 299 } 300 301 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( 302 const std::string& suffix) { 303 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); 304 } 305 306 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( 307 const std::string& str) { 308 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); 309 } 310 311 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( 312 const std::string& str) { 313 return CreateCondition(URLMatcherCondition::PATH_EQUALS, 314 kEndOfDomain + str + kEndOfPath); 315 } 316 317 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( 318 const std::string& prefix) { 319 std::string pattern; 320 if (!prefix.empty() && prefix[0] == '?') 321 pattern = kEndOfPath + prefix; 322 else 323 pattern = kEndOfPath + ('?' + prefix); 324 325 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); 326 } 327 328 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( 329 const std::string& suffix) { 330 if (!suffix.empty() && suffix[0] == '?') { 331 return CreateQueryEqualsCondition(suffix); 332 } else { 333 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, 334 suffix + kEndOfURL); 335 } 336 } 337 338 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( 339 const std::string& str) { 340 if (!str.empty() && str[0] == '?') 341 return CreateQueryPrefixCondition(str); 342 else 343 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); 344 } 345 346 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( 347 const std::string& str) { 348 std::string pattern; 349 if (!str.empty() && str[0] == '?') 350 pattern = kEndOfPath + str + kEndOfURL; 351 else 352 pattern = kEndOfPath + ('?' + str) + kEndOfURL; 353 354 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); 355 } 356 357 URLMatcherCondition 358 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( 359 const std::string& host_suffix, 360 const std::string& path_prefix) { 361 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, 362 host_suffix + kEndOfDomain + path_prefix); 363 } 364 365 URLMatcherCondition 366 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( 367 const std::string& host, 368 const std::string& path_prefix) { 369 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, 370 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + 371 path_prefix); 372 } 373 374 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( 375 const GURL& url) const { 376 GURL::Replacements replacements; 377 replacements.ClearPassword(); 378 replacements.ClearUsername(); 379 replacements.ClearRef(); 380 // Clear port if it is implicit from scheme. 381 if (url.has_port()) { 382 const std::string& port = url.scheme(); 383 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == 384 url.EffectiveIntPort()) { 385 replacements.ClearPort(); 386 } 387 } 388 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + 389 kEndOfURL; 390 } 391 392 static std::string CanonicalizeURLForRegexSearchesHelper( 393 const GURL& url, 394 bool clear_query) { 395 GURL::Replacements replacements; 396 replacements.ClearPassword(); 397 replacements.ClearUsername(); 398 replacements.ClearRef(); 399 if (clear_query) 400 replacements.ClearQuery(); 401 // Clear port if it is implicit from scheme. 402 if (url.has_port()) { 403 const std::string& port = url.scheme(); 404 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == 405 url.EffectiveIntPort()) { 406 replacements.ClearPort(); 407 } 408 } 409 return url.ReplaceComponents(replacements).spec(); 410 } 411 412 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( 413 const GURL& url) const { 414 return CanonicalizeURLForRegexSearchesHelper(url, false); 415 } 416 417 std::string 418 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( 419 const GURL& url) const { 420 return CanonicalizeURLForRegexSearchesHelper(url, true); 421 } 422 423 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( 424 const std::string& prefix) { 425 return CreateCondition(URLMatcherCondition::URL_PREFIX, 426 kBeginningOfURL + prefix); 427 } 428 429 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( 430 const std::string& suffix) { 431 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); 432 } 433 434 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( 435 const std::string& str) { 436 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); 437 } 438 439 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( 440 const std::string& str) { 441 return CreateCondition(URLMatcherCondition::URL_EQUALS, 442 kBeginningOfURL + str + kEndOfURL); 443 } 444 445 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( 446 const std::string& regex) { 447 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); 448 } 449 450 URLMatcherCondition 451 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( 452 const std::string& regex) { 453 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); 454 } 455 456 void URLMatcherConditionFactory::ForgetUnusedPatterns( 457 const std::set<StringPattern::ID>& used_patterns) { 458 PatternSingletons::iterator i = substring_pattern_singletons_.begin(); 459 while (i != substring_pattern_singletons_.end()) { 460 if (ContainsKey(used_patterns, (*i)->id())) { 461 ++i; 462 } else { 463 delete *i; 464 substring_pattern_singletons_.erase(i++); 465 } 466 } 467 i = regex_pattern_singletons_.begin(); 468 while (i != regex_pattern_singletons_.end()) { 469 if (ContainsKey(used_patterns, (*i)->id())) { 470 ++i; 471 } else { 472 delete *i; 473 regex_pattern_singletons_.erase(i++); 474 } 475 } 476 i = origin_and_path_regex_pattern_singletons_.begin(); 477 while (i != origin_and_path_regex_pattern_singletons_.end()) { 478 if (ContainsKey(used_patterns, (*i)->id())) { 479 ++i; 480 } else { 481 delete *i; 482 origin_and_path_regex_pattern_singletons_.erase(i++); 483 } 484 } 485 } 486 487 bool URLMatcherConditionFactory::IsEmpty() const { 488 return substring_pattern_singletons_.empty() && 489 regex_pattern_singletons_.empty() && 490 origin_and_path_regex_pattern_singletons_.empty(); 491 } 492 493 URLMatcherCondition URLMatcherConditionFactory::CreateCondition( 494 URLMatcherCondition::Criterion criterion, 495 const std::string& pattern) { 496 StringPattern search_pattern(pattern, 0); 497 PatternSingletons* pattern_singletons = NULL; 498 if (IsRegexCriterion(criterion)) 499 pattern_singletons = ®ex_pattern_singletons_; 500 else if (IsOriginAndPathRegexCriterion(criterion)) 501 pattern_singletons = &origin_and_path_regex_pattern_singletons_; 502 else 503 pattern_singletons = &substring_pattern_singletons_; 504 505 PatternSingletons::const_iterator iter = 506 pattern_singletons->find(&search_pattern); 507 508 if (iter != pattern_singletons->end()) { 509 return URLMatcherCondition(criterion, *iter); 510 } else { 511 StringPattern* new_pattern = 512 new StringPattern(pattern, id_counter_++); 513 pattern_singletons->insert(new_pattern); 514 return URLMatcherCondition(criterion, new_pattern); 515 } 516 } 517 518 std::string URLMatcherConditionFactory::CanonicalizeHostname( 519 const std::string& hostname) const { 520 if (!hostname.empty() && hostname[0] == '.') 521 return hostname; 522 else 523 return "." + hostname; 524 } 525 526 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( 527 StringPattern* lhs, 528 StringPattern* rhs) const { 529 if (lhs == NULL && rhs != NULL) return true; 530 if (lhs != NULL && rhs != NULL) 531 return lhs->pattern() < rhs->pattern(); 532 // Either both are NULL or only rhs is NULL. 533 return false; 534 } 535 536 // 537 // URLMatcherSchemeFilter 538 // 539 540 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) 541 : filters_(1) { 542 filters_.push_back(filter); 543 } 544 545 URLMatcherSchemeFilter::URLMatcherSchemeFilter( 546 const std::vector<std::string>& filters) 547 : filters_(filters) {} 548 549 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} 550 551 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { 552 return std::find(filters_.begin(), filters_.end(), url.scheme()) != 553 filters_.end(); 554 } 555 556 // 557 // URLMatcherPortFilter 558 // 559 560 URLMatcherPortFilter::URLMatcherPortFilter( 561 const std::vector<URLMatcherPortFilter::Range>& ranges) 562 : ranges_(ranges) {} 563 564 URLMatcherPortFilter::~URLMatcherPortFilter() {} 565 566 bool URLMatcherPortFilter::IsMatch(const GURL& url) const { 567 int port = url.EffectiveIntPort(); 568 for (std::vector<Range>::const_iterator i = ranges_.begin(); 569 i != ranges_.end(); ++i) { 570 if (i->first <= port && port <= i->second) 571 return true; 572 } 573 return false; 574 } 575 576 // static 577 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, 578 int to) { 579 return Range(from, to); 580 } 581 582 // static 583 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { 584 return Range(port, port); 585 } 586 587 // 588 // URLMatcherConditionSet 589 // 590 591 URLMatcherConditionSet::~URLMatcherConditionSet() {} 592 593 URLMatcherConditionSet::URLMatcherConditionSet( 594 ID id, 595 const Conditions& conditions) 596 : id_(id), 597 conditions_(conditions) {} 598 599 URLMatcherConditionSet::URLMatcherConditionSet( 600 ID id, 601 const Conditions& conditions, 602 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 603 scoped_ptr<URLMatcherPortFilter> port_filter) 604 : id_(id), 605 conditions_(conditions), 606 scheme_filter_(scheme_filter.Pass()), 607 port_filter_(port_filter.Pass()) {} 608 609 bool URLMatcherConditionSet::IsMatch( 610 const std::set<StringPattern::ID>& matching_patterns, 611 const GURL& url) const { 612 for (Conditions::const_iterator i = conditions_.begin(); 613 i != conditions_.end(); ++i) { 614 if (!i->IsMatch(matching_patterns, url)) 615 return false; 616 } 617 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) 618 return false; 619 if (port_filter_.get() && !port_filter_->IsMatch(url)) 620 return false; 621 return true; 622 } 623 624 // 625 // URLMatcher 626 // 627 628 URLMatcher::URLMatcher() {} 629 630 URLMatcher::~URLMatcher() {} 631 632 void URLMatcher::AddConditionSets( 633 const URLMatcherConditionSet::Vector& condition_sets) { 634 for (URLMatcherConditionSet::Vector::const_iterator i = 635 condition_sets.begin(); i != condition_sets.end(); ++i) { 636 DCHECK(url_matcher_condition_sets_.find((*i)->id()) == 637 url_matcher_condition_sets_.end()); 638 url_matcher_condition_sets_[(*i)->id()] = *i; 639 } 640 UpdateInternalDatastructures(); 641 } 642 643 void URLMatcher::RemoveConditionSets( 644 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { 645 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = 646 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { 647 DCHECK(url_matcher_condition_sets_.find(*i) != 648 url_matcher_condition_sets_.end()); 649 url_matcher_condition_sets_.erase(*i); 650 } 651 UpdateInternalDatastructures(); 652 } 653 654 void URLMatcher::ClearUnusedConditionSets() { 655 UpdateConditionFactory(); 656 } 657 658 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( 659 const GURL& url) const { 660 // Find all IDs of StringPatterns that match |url|. 661 // See URLMatcherConditionFactory for the canonicalization of URLs and the 662 // distinction between full url searches and url component searches. 663 std::set<StringPattern::ID> matches; 664 if (!full_url_matcher_.IsEmpty()) { 665 full_url_matcher_.Match( 666 condition_factory_.CanonicalizeURLForFullSearches(url), &matches); 667 } 668 if (!url_component_matcher_.IsEmpty()) { 669 url_component_matcher_.Match( 670 condition_factory_.CanonicalizeURLForComponentSearches(url), &matches); 671 } 672 if (!regex_set_matcher_.IsEmpty()) { 673 regex_set_matcher_.Match( 674 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); 675 } 676 if (!origin_and_path_regex_set_matcher_.IsEmpty()) { 677 origin_and_path_regex_set_matcher_.Match( 678 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), 679 &matches); 680 } 681 682 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions 683 // were fulfilled. 684 std::set<URLMatcherConditionSet::ID> result; 685 for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); 686 i != matches.end(); ++i) { 687 // For each URLMatcherConditionSet there is exactly one condition 688 // registered in substring_match_triggers_. This means that the following 689 // logic tests each URLMatcherConditionSet exactly once if it can be 690 // completely fulfilled. 691 StringPatternTriggers::const_iterator triggered_condition_sets_iter = 692 substring_match_triggers_.find(*i); 693 if (triggered_condition_sets_iter == substring_match_triggers_.end()) 694 continue; // Not all substring matches are triggers for a condition set. 695 const std::set<URLMatcherConditionSet::ID>& condition_sets = 696 triggered_condition_sets_iter->second; 697 for (std::set<URLMatcherConditionSet::ID>::const_iterator j = 698 condition_sets.begin(); j != condition_sets.end(); ++j) { 699 URLMatcherConditionSets::const_iterator condition_set_iter = 700 url_matcher_condition_sets_.find(*j); 701 DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); 702 if (condition_set_iter->second->IsMatch(matches, url)) 703 result.insert(*j); 704 } 705 } 706 707 return result; 708 } 709 710 bool URLMatcher::IsEmpty() const { 711 return condition_factory_.IsEmpty() && 712 url_matcher_condition_sets_.empty() && 713 substring_match_triggers_.empty() && 714 full_url_matcher_.IsEmpty() && 715 url_component_matcher_.IsEmpty() && 716 regex_set_matcher_.IsEmpty() && 717 origin_and_path_regex_set_matcher_.IsEmpty() && 718 registered_full_url_patterns_.empty() && 719 registered_url_component_patterns_.empty(); 720 } 721 722 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { 723 // The purpose of |full_url_conditions| is just that we need to execute 724 // the same logic once for Full URL searches and once for URL Component 725 // searches (see URLMatcherConditionFactory). 726 727 // Determine which patterns need to be registered when this function 728 // terminates. 729 std::set<const StringPattern*> new_patterns; 730 for (URLMatcherConditionSets::const_iterator condition_set_iter = 731 url_matcher_condition_sets_.begin(); 732 condition_set_iter != url_matcher_condition_sets_.end(); 733 ++condition_set_iter) { 734 const URLMatcherConditionSet::Conditions& conditions = 735 condition_set_iter->second->conditions(); 736 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 737 conditions.begin(); condition_iter != conditions.end(); 738 ++condition_iter) { 739 // If we are called to process Full URL searches, ignore others, and 740 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) 741 if (!condition_iter->IsRegexCondition() && 742 !condition_iter->IsOriginAndPathRegexCondition() && 743 full_url_conditions == condition_iter->IsFullURLCondition()) 744 new_patterns.insert(condition_iter->string_pattern()); 745 } 746 } 747 748 // This is the set of patterns that were registered before this function 749 // is called. 750 std::set<const StringPattern*>& registered_patterns = 751 full_url_conditions ? registered_full_url_patterns_ 752 : registered_url_component_patterns_; 753 754 // Add all patterns that are in new_patterns but not in registered_patterns. 755 std::vector<const StringPattern*> patterns_to_register; 756 std::set_difference( 757 new_patterns.begin(), new_patterns.end(), 758 registered_patterns.begin(), registered_patterns.end(), 759 std::back_inserter(patterns_to_register)); 760 761 // Remove all patterns that are in registered_patterns but not in 762 // new_patterns. 763 std::vector<const StringPattern*> patterns_to_unregister; 764 std::set_difference( 765 registered_patterns.begin(), registered_patterns.end(), 766 new_patterns.begin(), new_patterns.end(), 767 std::back_inserter(patterns_to_unregister)); 768 769 // Update the SubstringSetMatcher. 770 SubstringSetMatcher& url_matcher = 771 full_url_conditions ? full_url_matcher_ : url_component_matcher_; 772 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, 773 patterns_to_unregister); 774 775 // Update the set of registered_patterns for the next time this function 776 // is being called. 777 registered_patterns.swap(new_patterns); 778 } 779 780 void URLMatcher::UpdateRegexSetMatcher() { 781 std::vector<const StringPattern*> new_patterns; 782 std::vector<const StringPattern*> new_origin_and_path_patterns; 783 784 for (URLMatcherConditionSets::const_iterator condition_set_iter = 785 url_matcher_condition_sets_.begin(); 786 condition_set_iter != url_matcher_condition_sets_.end(); 787 ++condition_set_iter) { 788 const URLMatcherConditionSet::Conditions& conditions = 789 condition_set_iter->second->conditions(); 790 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 791 conditions.begin(); condition_iter != conditions.end(); 792 ++condition_iter) { 793 if (condition_iter->IsRegexCondition()) { 794 new_patterns.push_back(condition_iter->string_pattern()); 795 } else if (condition_iter->IsOriginAndPathRegexCondition()) { 796 new_origin_and_path_patterns.push_back( 797 condition_iter->string_pattern()); 798 } 799 } 800 } 801 802 // Start over from scratch. We can't really do better than this, since the 803 // FilteredRE2 backend doesn't support incremental updates. 804 regex_set_matcher_.ClearPatterns(); 805 regex_set_matcher_.AddPatterns(new_patterns); 806 origin_and_path_regex_set_matcher_.ClearPatterns(); 807 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); 808 } 809 810 void URLMatcher::UpdateTriggers() { 811 // Count substring pattern frequencies. 812 std::map<StringPattern::ID, size_t> substring_pattern_frequencies; 813 for (URLMatcherConditionSets::const_iterator condition_set_iter = 814 url_matcher_condition_sets_.begin(); 815 condition_set_iter != url_matcher_condition_sets_.end(); 816 ++condition_set_iter) { 817 const URLMatcherConditionSet::Conditions& conditions = 818 condition_set_iter->second->conditions(); 819 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 820 conditions.begin(); condition_iter != conditions.end(); 821 ++condition_iter) { 822 const StringPattern* pattern = condition_iter->string_pattern(); 823 substring_pattern_frequencies[pattern->id()]++; 824 } 825 } 826 827 // Update trigger conditions: Determine for each URLMatcherConditionSet which 828 // URLMatcherCondition contains a StringPattern that occurs least 829 // frequently in this URLMatcher. We assume that this condition is very 830 // specific and occurs rarely in URLs. If a match occurs for this 831 // URLMatcherCondition, we want to test all other URLMatcherCondition in the 832 // respective URLMatcherConditionSet as well to see whether the entire 833 // URLMatcherConditionSet is considered matching. 834 substring_match_triggers_.clear(); 835 for (URLMatcherConditionSets::const_iterator condition_set_iter = 836 url_matcher_condition_sets_.begin(); 837 condition_set_iter != url_matcher_condition_sets_.end(); 838 ++condition_set_iter) { 839 const URLMatcherConditionSet::Conditions& conditions = 840 condition_set_iter->second->conditions(); 841 if (conditions.empty()) 842 continue; 843 URLMatcherConditionSet::Conditions::const_iterator condition_iter = 844 conditions.begin(); 845 StringPattern::ID trigger = condition_iter->string_pattern()->id(); 846 // We skip the first element in the following loop. 847 ++condition_iter; 848 for (; condition_iter != conditions.end(); ++condition_iter) { 849 StringPattern::ID current_id = 850 condition_iter->string_pattern()->id(); 851 if (substring_pattern_frequencies[trigger] > 852 substring_pattern_frequencies[current_id]) { 853 trigger = current_id; 854 } 855 } 856 substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); 857 } 858 } 859 860 void URLMatcher::UpdateConditionFactory() { 861 std::set<StringPattern::ID> used_patterns; 862 for (URLMatcherConditionSets::const_iterator condition_set_iter = 863 url_matcher_condition_sets_.begin(); 864 condition_set_iter != url_matcher_condition_sets_.end(); 865 ++condition_set_iter) { 866 const URLMatcherConditionSet::Conditions& conditions = 867 condition_set_iter->second->conditions(); 868 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 869 conditions.begin(); condition_iter != conditions.end(); 870 ++condition_iter) { 871 used_patterns.insert(condition_iter->string_pattern()->id()); 872 } 873 } 874 condition_factory_.ForgetUnusedPatterns(used_patterns); 875 } 876 877 void URLMatcher::UpdateInternalDatastructures() { 878 UpdateSubstringSetMatcher(false); 879 UpdateSubstringSetMatcher(true); 880 UpdateRegexSetMatcher(); 881 UpdateTriggers(); 882 UpdateConditionFactory(); 883 } 884 885 } // namespace extensions 886