Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/url_matcher/url_matcher.h"
      6 
      7 #include <algorithm>
      8 #include <iterator>
      9 
     10 #include "base/logging.h"
     11 #include "url/gurl.h"
     12 #include "url/url_canon.h"
     13 
     14 namespace url_matcher {
     15 
     16 // This set of classes implement a mapping of URL Component Patterns, such as
     17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
     18 // for use in substring comparisons.
     19 //
     20 // The idea of this mapping is to reduce the problem of comparing many
     21 // URL Component Patterns against one URL to the problem of searching many
     22 // substrings in one string:
     23 //
     24 // ----------------------                    -----------------
     25 // | URL Query operator | ----translate----> | StringPattern |
     26 // ----------------------                    -----------------
     27 //                                                   ^
     28 //                                                   |
     29 //                                                compare
     30 //                                                   |
     31 //                                                   v
     32 // ----------------------                    -----------------
     33 // | URL to compare     |                    |               |
     34 // | to all URL Query   | ----translate----> | String        |
     35 // | operators          |                    |               |
     36 // ----------------------                    -----------------
     37 //
     38 // The reason for this problem reduction is that there are efficient algorithms
     39 // for searching many substrings in one string (see Aho-Corasick algorithm).
     40 //
     41 // Additionally, some of the same pieces are reused to implement regular
     42 // expression comparisons. The FilteredRE2 implementation for matching many
     43 // regular expressions against one string uses prefiltering, in which a set
     44 // of substrings (derived from the regexes) are first searched for, to reduce
     45 // the number of regular expressions to test; the prefiltering step also
     46 // uses Aho-Corasick.
     47 //
     48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
     49 // ==========================================================
     50 //
     51 // For searches in this class, we normalize URLs as follows:
     52 //
     53 // Step 1:
     54 // Remove scheme, port and segment from URL:
     55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
     56 //    www.example.com/index.html?search=foo
     57 //
     58 // We remove the scheme and port number because they can be checked later
     59 // in a secondary filter step. We remove the segment (the #... part) because
     60 // this is not guaranteed to be ASCII-7 encoded.
     61 //
     62 // Step 2:
     63 // Translate URL to String and add the following position markers:
     64 // - BU = Beginning of URL
     65 // - ED = End of Domain
     66 // - EP = End of Path
     67 // - EU = End of URL
     68 // Furthermore, the hostname is canonicalized to start with a ".".
     69 //
     70 // Position markers are represented as characters >127, which are therefore
     71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
     72 //
     73 // -> www.example.com/index.html?search=foo becomes
     74 // BU .www.example.com ED /index.html EP ?search=foo EU
     75 //
     76 // -> www.example.com/index.html becomes
     77 // BU .www.example.com ED /index.html EP EU
     78 //
     79 // Step 3:
     80 // Translate URL Component Patterns as follows:
     81 //
     82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
     83 // -> host_prefix("www.example") = BU .www.example
     84 //
     85 // host_suffix(suffix) = suffix ED
     86 // -> host_suffix("example.com") = example.com ED
     87 // -> host_suffix(".example.com") = .example.com ED
     88 //
     89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
     90 // -> host_equals("www.example.com") = BU .www.example.com ED
     91 //
     92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
     93 //
     94 // With this, we can search the StringPatterns in the normalized URL.
     95 //
     96 //
     97 // Case 2: url_{prefix,suffix,equals,contains} searches.
     98 // =====================================================
     99 //
    100 // Step 1: as above, except that
    101 // - the scheme is not removed
    102 // - the port is not removed if it is specified and does not match the default
    103 //   port for the given scheme.
    104 //
    105 // Step 2:
    106 // Translate URL to String and add the following position markers:
    107 // - BU = Beginning of URL
    108 // - EU = End of URL
    109 //
    110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
    111 // BU http://www.example.com:8080/index.html?search=foo EU
    112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
    113 // BU http://www.example.com/index.html?search=foo EU
    114 //
    115 // url_prefix(prefix) = BU prefix
    116 // -> url_prefix("http://www.example") = BU http://www.example
    117 //
    118 // url_contains(substring) = substring
    119 // -> url_contains("index") = index
    120 //
    121 //
    122 // Case 3: {host,path,query}_contains searches.
    123 // ============================================
    124 //
    125 // These kinds of searches are not supported directly but can be derived
    126 // by a combination of a url_contains() query followed by an explicit test:
    127 //
    128 // host_contains(str) = url_contains(str) followed by test whether str occurs
    129 //   in host component of original URL.
    130 // -> host_contains("example.co") = example.co
    131 //    followed by gurl.host().find("example.co");
    132 //
    133 // [similarly for path_contains and query_contains].
    134 //
    135 //
    136 // Regular expression matching (url_matches searches)
    137 // ==================================================
    138 //
    139 // This class also supports matching regular expressions (RE2 syntax)
    140 // against full URLs, which are transformed as in case 2.
    141 
    142 namespace {
    143 
    144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
    145   return criterion == URLMatcherCondition::URL_MATCHES;
    146 }
    147 
    148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
    149   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
    150 }
    151 
    152 }  // namespace
    153 
    154 //
    155 // URLMatcherCondition
    156 //
    157 
    158 URLMatcherCondition::URLMatcherCondition()
    159     : criterion_(HOST_PREFIX),
    160       string_pattern_(NULL) {}
    161 
    162 URLMatcherCondition::~URLMatcherCondition() {}
    163 
    164 URLMatcherCondition::URLMatcherCondition(
    165     Criterion criterion,
    166     const StringPattern* string_pattern)
    167     : criterion_(criterion),
    168       string_pattern_(string_pattern) {}
    169 
    170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    171     : criterion_(rhs.criterion_),
    172       string_pattern_(rhs.string_pattern_) {}
    173 
    174 URLMatcherCondition& URLMatcherCondition::operator=(
    175     const URLMatcherCondition& rhs) {
    176   criterion_ = rhs.criterion_;
    177   string_pattern_ = rhs.string_pattern_;
    178   return *this;
    179 }
    180 
    181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
    182   if (criterion_ < rhs.criterion_) return true;
    183   if (criterion_ > rhs.criterion_) return false;
    184   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    185     return *string_pattern_ < *rhs.string_pattern_;
    186   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
    187   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
    188   // or both are NULL.
    189   return false;
    190 }
    191 
    192 bool URLMatcherCondition::IsFullURLCondition() const {
    193   // For these criteria the SubstringMatcher needs to be executed on the
    194   // GURL that is canonicalized with
    195   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
    196   switch (criterion_) {
    197     case HOST_CONTAINS:
    198     case PATH_CONTAINS:
    199     case QUERY_CONTAINS:
    200     case URL_PREFIX:
    201     case URL_SUFFIX:
    202     case URL_CONTAINS:
    203     case URL_EQUALS:
    204       return true;
    205     default:
    206       break;
    207   }
    208   return false;
    209 }
    210 
    211 bool URLMatcherCondition::IsRegexCondition() const {
    212   return IsRegexCriterion(criterion_);
    213 }
    214 
    215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
    216   return IsOriginAndPathRegexCriterion(criterion_);
    217 }
    218 
    219 bool URLMatcherCondition::IsMatch(
    220     const std::set<StringPattern::ID>& matching_patterns,
    221     const GURL& url) const {
    222   DCHECK(string_pattern_);
    223   if (!ContainsKey(matching_patterns, string_pattern_->id()))
    224     return false;
    225   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
    226   // a substring match on the raw URL. In case of a match, we need to verify
    227   // that the match was found in the correct component of the URL.
    228   switch (criterion_) {
    229     case HOST_CONTAINS:
    230       return url.host().find(string_pattern_->pattern()) !=
    231           std::string::npos;
    232     case PATH_CONTAINS:
    233       return url.path().find(string_pattern_->pattern()) !=
    234           std::string::npos;
    235     case QUERY_CONTAINS:
    236       return url.query().find(string_pattern_->pattern()) !=
    237           std::string::npos;
    238     default:
    239       break;
    240   }
    241   return true;
    242 }
    243 
    244 //
    245 // URLMatcherConditionFactory
    246 //
    247 
    248 namespace {
    249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
    250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
    251 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
    252 const char kEndOfPath[] = {static_cast<char>(-3), 0};
    253 const char kEndOfURL[] = {static_cast<char>(-4), 0};
    254 }  // namespace
    255 
    256 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
    257 
    258 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
    259   STLDeleteElements(&substring_pattern_singletons_);
    260   STLDeleteElements(&regex_pattern_singletons_);
    261   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
    262 }
    263 
    264 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    265     const GURL& url) const {
    266   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
    267          url.path() + kEndOfPath +
    268          (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
    269 }
    270 
    271 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    272     const std::string& prefix) {
    273   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
    274       kBeginningOfURL + CanonicalizeHostname(prefix));
    275 }
    276 
    277 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    278     const std::string& suffix) {
    279   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
    280       suffix + kEndOfDomain);
    281 }
    282 
    283 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    284     const std::string& str) {
    285   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
    286 }
    287 
    288 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    289     const std::string& str) {
    290   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
    291       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
    292 }
    293 
    294 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    295     const std::string& prefix) {
    296   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
    297       kEndOfDomain + prefix);
    298 }
    299 
    300 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    301     const std::string& suffix) {
    302   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
    303 }
    304 
    305 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    306     const std::string& str) {
    307   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
    308 }
    309 
    310 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    311     const std::string& str) {
    312   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
    313       kEndOfDomain + str + kEndOfPath);
    314 }
    315 
    316 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    317     const std::string& prefix) {
    318   std::string pattern;
    319   if (!prefix.empty() && prefix[0] == '?')
    320     pattern = kEndOfPath + prefix;
    321   else
    322     pattern = kEndOfPath + ('?' + prefix);
    323 
    324   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
    325 }
    326 
    327 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    328     const std::string& suffix) {
    329   if (!suffix.empty() && suffix[0] == '?') {
    330     return CreateQueryEqualsCondition(suffix);
    331   } else {
    332     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
    333                            suffix + kEndOfURL);
    334   }
    335 }
    336 
    337 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    338     const std::string& str) {
    339   if (!str.empty() && str[0] == '?')
    340     return CreateQueryPrefixCondition(str);
    341   else
    342     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
    343 }
    344 
    345 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    346     const std::string& str) {
    347   std::string pattern;
    348   if (!str.empty() && str[0] == '?')
    349     pattern = kEndOfPath + str + kEndOfURL;
    350   else
    351     pattern = kEndOfPath + ('?' + str) + kEndOfURL;
    352 
    353   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
    354 }
    355 
    356 URLMatcherCondition
    357     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    358     const std::string& host_suffix,
    359     const std::string& path_prefix) {
    360   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
    361       host_suffix + kEndOfDomain + path_prefix);
    362 }
    363 
    364 URLMatcherCondition
    365 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    366     const std::string& host,
    367     const std::string& path_prefix) {
    368   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
    369       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
    370       path_prefix);
    371 }
    372 
    373 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    374     const GURL& url) const {
    375   GURL::Replacements replacements;
    376   replacements.ClearPassword();
    377   replacements.ClearUsername();
    378   replacements.ClearRef();
    379   // Clear port if it is implicit from scheme.
    380   if (url.has_port()) {
    381     const std::string& port = url.scheme();
    382     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
    383             url.EffectiveIntPort()) {
    384       replacements.ClearPort();
    385     }
    386   }
    387   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
    388       kEndOfURL;
    389 }
    390 
    391 static std::string CanonicalizeURLForRegexSearchesHelper(
    392     const GURL& url,
    393     bool clear_query) {
    394   GURL::Replacements replacements;
    395   replacements.ClearPassword();
    396   replacements.ClearUsername();
    397   replacements.ClearRef();
    398   if (clear_query)
    399     replacements.ClearQuery();
    400   // Clear port if it is implicit from scheme.
    401   if (url.has_port()) {
    402     const std::string& port = url.scheme();
    403     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
    404             url.EffectiveIntPort()) {
    405       replacements.ClearPort();
    406     }
    407   }
    408   return url.ReplaceComponents(replacements).spec();
    409 }
    410 
    411 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    412     const GURL& url) const {
    413   return CanonicalizeURLForRegexSearchesHelper(url, false);
    414 }
    415 
    416 std::string
    417 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    418     const GURL& url) const {
    419   return CanonicalizeURLForRegexSearchesHelper(url, true);
    420 }
    421 
    422 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    423     const std::string& prefix) {
    424   return CreateCondition(URLMatcherCondition::URL_PREFIX,
    425       kBeginningOfURL + prefix);
    426 }
    427 
    428 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    429     const std::string& suffix) {
    430   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
    431 }
    432 
    433 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    434     const std::string& str) {
    435   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
    436 }
    437 
    438 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    439     const std::string& str) {
    440   return CreateCondition(URLMatcherCondition::URL_EQUALS,
    441       kBeginningOfURL + str + kEndOfURL);
    442 }
    443 
    444 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    445     const std::string& regex) {
    446   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
    447 }
    448 
    449 URLMatcherCondition
    450 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    451     const std::string& regex) {
    452   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
    453 }
    454 
    455 void URLMatcherConditionFactory::ForgetUnusedPatterns(
    456       const std::set<StringPattern::ID>& used_patterns) {
    457   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
    458   while (i != substring_pattern_singletons_.end()) {
    459     if (ContainsKey(used_patterns, (*i)->id())) {
    460       ++i;
    461     } else {
    462       delete *i;
    463       substring_pattern_singletons_.erase(i++);
    464     }
    465   }
    466   i = regex_pattern_singletons_.begin();
    467   while (i != regex_pattern_singletons_.end()) {
    468     if (ContainsKey(used_patterns, (*i)->id())) {
    469       ++i;
    470     } else {
    471       delete *i;
    472       regex_pattern_singletons_.erase(i++);
    473     }
    474   }
    475   i = origin_and_path_regex_pattern_singletons_.begin();
    476   while (i != origin_and_path_regex_pattern_singletons_.end()) {
    477     if (ContainsKey(used_patterns, (*i)->id())) {
    478       ++i;
    479     } else {
    480       delete *i;
    481       origin_and_path_regex_pattern_singletons_.erase(i++);
    482     }
    483   }
    484 }
    485 
    486 bool URLMatcherConditionFactory::IsEmpty() const {
    487   return substring_pattern_singletons_.empty() &&
    488       regex_pattern_singletons_.empty() &&
    489       origin_and_path_regex_pattern_singletons_.empty();
    490 }
    491 
    492 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    493     URLMatcherCondition::Criterion criterion,
    494     const std::string& pattern) {
    495   StringPattern search_pattern(pattern, 0);
    496   PatternSingletons* pattern_singletons = NULL;
    497   if (IsRegexCriterion(criterion))
    498     pattern_singletons = &regex_pattern_singletons_;
    499   else if (IsOriginAndPathRegexCriterion(criterion))
    500     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
    501   else
    502     pattern_singletons = &substring_pattern_singletons_;
    503 
    504   PatternSingletons::const_iterator iter =
    505       pattern_singletons->find(&search_pattern);
    506 
    507   if (iter != pattern_singletons->end()) {
    508     return URLMatcherCondition(criterion, *iter);
    509   } else {
    510     StringPattern* new_pattern =
    511         new StringPattern(pattern, id_counter_++);
    512     pattern_singletons->insert(new_pattern);
    513     return URLMatcherCondition(criterion, new_pattern);
    514   }
    515 }
    516 
    517 std::string URLMatcherConditionFactory::CanonicalizeHostname(
    518     const std::string& hostname) const {
    519   if (!hostname.empty() && hostname[0] == '.')
    520     return hostname;
    521   else
    522     return "." + hostname;
    523 }
    524 
    525 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    526     StringPattern* lhs,
    527     StringPattern* rhs) const {
    528   if (lhs == NULL && rhs != NULL) return true;
    529   if (lhs != NULL && rhs != NULL)
    530     return lhs->pattern() < rhs->pattern();
    531   // Either both are NULL or only rhs is NULL.
    532   return false;
    533 }
    534 
    535 //
    536 // URLMatcherSchemeFilter
    537 //
    538 
    539 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    540     : filters_(1) {
    541   filters_.push_back(filter);
    542 }
    543 
    544 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    545     const std::vector<std::string>& filters)
    546     : filters_(filters) {}
    547 
    548 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
    549 
    550 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
    551   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
    552       filters_.end();
    553 }
    554 
    555 //
    556 // URLMatcherPortFilter
    557 //
    558 
    559 URLMatcherPortFilter::URLMatcherPortFilter(
    560     const std::vector<URLMatcherPortFilter::Range>& ranges)
    561     : ranges_(ranges) {}
    562 
    563 URLMatcherPortFilter::~URLMatcherPortFilter() {}
    564 
    565 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
    566   int port = url.EffectiveIntPort();
    567   for (std::vector<Range>::const_iterator i = ranges_.begin();
    568        i != ranges_.end(); ++i) {
    569     if (i->first <= port && port <= i->second)
    570       return true;
    571   }
    572   return false;
    573 }
    574 
    575 // static
    576 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
    577                                                               int to) {
    578   return Range(from, to);
    579 }
    580 
    581 // static
    582 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
    583   return Range(port, port);
    584 }
    585 
    586 //
    587 // URLMatcherConditionSet
    588 //
    589 
    590 URLMatcherConditionSet::~URLMatcherConditionSet() {}
    591 
    592 URLMatcherConditionSet::URLMatcherConditionSet(
    593     ID id,
    594     const Conditions& conditions)
    595     : id_(id),
    596       conditions_(conditions) {}
    597 
    598 URLMatcherConditionSet::URLMatcherConditionSet(
    599     ID id,
    600     const Conditions& conditions,
    601     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    602     scoped_ptr<URLMatcherPortFilter> port_filter)
    603     : id_(id),
    604       conditions_(conditions),
    605       scheme_filter_(scheme_filter.Pass()),
    606       port_filter_(port_filter.Pass()) {}
    607 
    608 bool URLMatcherConditionSet::IsMatch(
    609     const std::set<StringPattern::ID>& matching_patterns,
    610     const GURL& url) const {
    611   for (Conditions::const_iterator i = conditions_.begin();
    612        i != conditions_.end(); ++i) {
    613     if (!i->IsMatch(matching_patterns, url))
    614       return false;
    615   }
    616   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    617     return false;
    618   if (port_filter_.get() && !port_filter_->IsMatch(url))
    619     return false;
    620   return true;
    621 }
    622 
    623 //
    624 // URLMatcher
    625 //
    626 
    627 URLMatcher::URLMatcher() {}
    628 
    629 URLMatcher::~URLMatcher() {}
    630 
    631 void URLMatcher::AddConditionSets(
    632     const URLMatcherConditionSet::Vector& condition_sets) {
    633   for (URLMatcherConditionSet::Vector::const_iterator i =
    634        condition_sets.begin(); i != condition_sets.end(); ++i) {
    635     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
    636         url_matcher_condition_sets_.end());
    637     url_matcher_condition_sets_[(*i)->id()] = *i;
    638   }
    639   UpdateInternalDatastructures();
    640 }
    641 
    642 void URLMatcher::RemoveConditionSets(
    643     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
    644   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
    645        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    646     DCHECK(url_matcher_condition_sets_.find(*i) !=
    647         url_matcher_condition_sets_.end());
    648     url_matcher_condition_sets_.erase(*i);
    649   }
    650   UpdateInternalDatastructures();
    651 }
    652 
    653 void URLMatcher::ClearUnusedConditionSets() {
    654   UpdateConditionFactory();
    655 }
    656 
    657 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    658     const GURL& url) const {
    659   // Find all IDs of StringPatterns that match |url|.
    660   // See URLMatcherConditionFactory for the canonicalization of URLs and the
    661   // distinction between full url searches and url component searches.
    662   std::set<StringPattern::ID> matches;
    663   if (!full_url_matcher_.IsEmpty()) {
    664     full_url_matcher_.Match(
    665         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
    666   }
    667   if (!url_component_matcher_.IsEmpty()) {
    668     url_component_matcher_.Match(
    669         condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
    670   }
    671   if (!regex_set_matcher_.IsEmpty()) {
    672     regex_set_matcher_.Match(
    673         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
    674   }
    675   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    676     origin_and_path_regex_set_matcher_.Match(
    677         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
    678         &matches);
    679   }
    680 
    681   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
    682   // were fulfilled.
    683   std::set<URLMatcherConditionSet::ID> result;
    684   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
    685        i != matches.end(); ++i) {
    686     // For each URLMatcherConditionSet there is exactly one condition
    687     // registered in substring_match_triggers_. This means that the following
    688     // logic tests each URLMatcherConditionSet exactly once if it can be
    689     // completely fulfilled.
    690     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
    691         substring_match_triggers_.find(*i);
    692     if (triggered_condition_sets_iter == substring_match_triggers_.end())
    693       continue;  // Not all substring matches are triggers for a condition set.
    694     const std::set<URLMatcherConditionSet::ID>& condition_sets =
    695         triggered_condition_sets_iter->second;
    696     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
    697          condition_sets.begin(); j != condition_sets.end(); ++j) {
    698       URLMatcherConditionSets::const_iterator condition_set_iter =
    699           url_matcher_condition_sets_.find(*j);
    700       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
    701       if (condition_set_iter->second->IsMatch(matches, url))
    702         result.insert(*j);
    703     }
    704   }
    705 
    706   return result;
    707 }
    708 
    709 bool URLMatcher::IsEmpty() const {
    710   return condition_factory_.IsEmpty() &&
    711       url_matcher_condition_sets_.empty() &&
    712       substring_match_triggers_.empty() &&
    713       full_url_matcher_.IsEmpty() &&
    714       url_component_matcher_.IsEmpty() &&
    715       regex_set_matcher_.IsEmpty() &&
    716       origin_and_path_regex_set_matcher_.IsEmpty() &&
    717       registered_full_url_patterns_.empty() &&
    718       registered_url_component_patterns_.empty();
    719 }
    720 
    721 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
    722   // The purpose of |full_url_conditions| is just that we need to execute
    723   // the same logic once for Full URL searches and once for URL Component
    724   // searches (see URLMatcherConditionFactory).
    725 
    726   // Determine which patterns need to be registered when this function
    727   // terminates.
    728   std::set<const StringPattern*> new_patterns;
    729   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    730       url_matcher_condition_sets_.begin();
    731       condition_set_iter != url_matcher_condition_sets_.end();
    732       ++condition_set_iter) {
    733     const URLMatcherConditionSet::Conditions& conditions =
    734         condition_set_iter->second->conditions();
    735     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    736          conditions.begin(); condition_iter != conditions.end();
    737          ++condition_iter) {
    738       // If we are called to process Full URL searches, ignore others, and
    739       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
    740       if (!condition_iter->IsRegexCondition() &&
    741           !condition_iter->IsOriginAndPathRegexCondition() &&
    742           full_url_conditions == condition_iter->IsFullURLCondition())
    743         new_patterns.insert(condition_iter->string_pattern());
    744     }
    745   }
    746 
    747   // This is the set of patterns that were registered before this function
    748   // is called.
    749   std::set<const StringPattern*>& registered_patterns =
    750       full_url_conditions ? registered_full_url_patterns_
    751                           : registered_url_component_patterns_;
    752 
    753   // Add all patterns that are in new_patterns but not in registered_patterns.
    754   std::vector<const StringPattern*> patterns_to_register =
    755       base::STLSetDifference<std::vector<const StringPattern*> >(
    756           new_patterns, registered_patterns);
    757 
    758   // Remove all patterns that are in registered_patterns but not in
    759   // new_patterns.
    760   std::vector<const StringPattern*> patterns_to_unregister =
    761       base::STLSetDifference<std::vector<const StringPattern*> >(
    762            registered_patterns, new_patterns);
    763 
    764   // Update the SubstringSetMatcher.
    765   SubstringSetMatcher& url_matcher =
    766       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
    767   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
    768                                             patterns_to_unregister);
    769 
    770   // Update the set of registered_patterns for the next time this function
    771   // is being called.
    772   registered_patterns.swap(new_patterns);
    773 }
    774 
    775 void URLMatcher::UpdateRegexSetMatcher() {
    776   std::vector<const StringPattern*> new_patterns;
    777   std::vector<const StringPattern*> new_origin_and_path_patterns;
    778 
    779   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    780       url_matcher_condition_sets_.begin();
    781       condition_set_iter != url_matcher_condition_sets_.end();
    782       ++condition_set_iter) {
    783     const URLMatcherConditionSet::Conditions& conditions =
    784         condition_set_iter->second->conditions();
    785     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    786          conditions.begin(); condition_iter != conditions.end();
    787          ++condition_iter) {
    788       if (condition_iter->IsRegexCondition()) {
    789         new_patterns.push_back(condition_iter->string_pattern());
    790       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
    791         new_origin_and_path_patterns.push_back(
    792             condition_iter->string_pattern());
    793       }
    794     }
    795   }
    796 
    797   // Start over from scratch. We can't really do better than this, since the
    798   // FilteredRE2 backend doesn't support incremental updates.
    799   regex_set_matcher_.ClearPatterns();
    800   regex_set_matcher_.AddPatterns(new_patterns);
    801   origin_and_path_regex_set_matcher_.ClearPatterns();
    802   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
    803 }
    804 
    805 void URLMatcher::UpdateTriggers() {
    806   // Count substring pattern frequencies.
    807   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
    808   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    809       url_matcher_condition_sets_.begin();
    810       condition_set_iter != url_matcher_condition_sets_.end();
    811       ++condition_set_iter) {
    812     const URLMatcherConditionSet::Conditions& conditions =
    813         condition_set_iter->second->conditions();
    814     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    815          conditions.begin(); condition_iter != conditions.end();
    816          ++condition_iter) {
    817       const StringPattern* pattern = condition_iter->string_pattern();
    818       substring_pattern_frequencies[pattern->id()]++;
    819     }
    820   }
    821 
    822   // Update trigger conditions: Determine for each URLMatcherConditionSet which
    823   // URLMatcherCondition contains a StringPattern that occurs least
    824   // frequently in this URLMatcher. We assume that this condition is very
    825   // specific and occurs rarely in URLs. If a match occurs for this
    826   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
    827   // respective URLMatcherConditionSet as well to see whether the entire
    828   // URLMatcherConditionSet is considered matching.
    829   substring_match_triggers_.clear();
    830   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    831       url_matcher_condition_sets_.begin();
    832       condition_set_iter != url_matcher_condition_sets_.end();
    833       ++condition_set_iter) {
    834     const URLMatcherConditionSet::Conditions& conditions =
    835         condition_set_iter->second->conditions();
    836     if (conditions.empty())
    837       continue;
    838     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    839         conditions.begin();
    840     StringPattern::ID trigger = condition_iter->string_pattern()->id();
    841     // We skip the first element in the following loop.
    842     ++condition_iter;
    843     for (; condition_iter != conditions.end(); ++condition_iter) {
    844       StringPattern::ID current_id =
    845           condition_iter->string_pattern()->id();
    846       if (substring_pattern_frequencies[trigger] >
    847           substring_pattern_frequencies[current_id]) {
    848         trigger = current_id;
    849       }
    850     }
    851     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
    852   }
    853 }
    854 
    855 void URLMatcher::UpdateConditionFactory() {
    856   std::set<StringPattern::ID> used_patterns;
    857   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    858       url_matcher_condition_sets_.begin();
    859       condition_set_iter != url_matcher_condition_sets_.end();
    860       ++condition_set_iter) {
    861     const URLMatcherConditionSet::Conditions& conditions =
    862         condition_set_iter->second->conditions();
    863     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    864          conditions.begin(); condition_iter != conditions.end();
    865          ++condition_iter) {
    866       used_patterns.insert(condition_iter->string_pattern()->id());
    867     }
    868   }
    869   condition_factory_.ForgetUnusedPatterns(used_patterns);
    870 }
    871 
    872 void URLMatcher::UpdateInternalDatastructures() {
    873   UpdateSubstringSetMatcher(false);
    874   UpdateSubstringSetMatcher(true);
    875   UpdateRegexSetMatcher();
    876   UpdateTriggers();
    877   UpdateConditionFactory();
    878 }
    879 
    880 }  // namespace url_matcher
    881