Home | History | Annotate | Download | only in matcher
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "extensions/common/matcher/url_matcher.h"
      6 
      7 #include <algorithm>
      8 #include <iterator>
      9 
     10 #include "base/logging.h"
     11 #include "content/public/common/url_constants.h"
     12 #include "url/gurl.h"
     13 #include "url/url_canon.h"
     14 
     15 namespace extensions {
     16 
     17 // This set of classes implement a mapping of URL Component Patterns, such as
     18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
     19 // for use in substring comparisons.
     20 //
     21 // The idea of this mapping is to reduce the problem of comparing many
     22 // URL Component Patterns against one URL to the problem of searching many
     23 // substrings in one string:
     24 //
     25 // ----------------------                    -----------------
     26 // | URL Query operator | ----translate----> | StringPattern |
     27 // ----------------------                    -----------------
     28 //                                                   ^
     29 //                                                   |
     30 //                                                compare
     31 //                                                   |
     32 //                                                   v
     33 // ----------------------                    -----------------
     34 // | URL to compare     |                    |               |
     35 // | to all URL Query   | ----translate----> | String        |
     36 // | operators          |                    |               |
     37 // ----------------------                    -----------------
     38 //
     39 // The reason for this problem reduction is that there are efficient algorithms
     40 // for searching many substrings in one string (see Aho-Corasick algorithm).
     41 //
     42 // Additionally, some of the same pieces are reused to implement regular
     43 // expression comparisons. The FilteredRE2 implementation for matching many
     44 // regular expressions against one string uses prefiltering, in which a set
     45 // of substrings (derived from the regexes) are first searched for, to reduce
     46 // the number of regular expressions to test; the prefiltering step also
     47 // uses Aho-Corasick.
     48 //
     49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
     50 // ==========================================================
     51 //
     52 // For searches in this class, we normalize URLs as follows:
     53 //
     54 // Step 1:
     55 // Remove scheme, port and segment from URL:
     56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
     57 //    www.example.com/index.html?search=foo
     58 //
     59 // We remove the scheme and port number because they can be checked later
     60 // in a secondary filter step. We remove the segment (the #... part) because
     61 // this is not guaranteed to be ASCII-7 encoded.
     62 //
     63 // Step 2:
     64 // Translate URL to String and add the following position markers:
     65 // - BU = Beginning of URL
     66 // - ED = End of Domain
     67 // - EP = End of Path
     68 // - EU = End of URL
     69 // Furthermore, the hostname is canonicalized to start with a ".".
     70 //
     71 // Position markers are represented as characters >127, which are therefore
     72 // guaranteed not to be part of the ASCII-7 encoded URL character set.
     73 //
     74 // -> www.example.com/index.html?search=foo becomes
     75 // BU .www.example.com ED /index.html EP ?search=foo EU
     76 //
     77 // -> www.example.com/index.html becomes
     78 // BU .www.example.com ED /index.html EP EU
     79 //
     80 // Step 3:
     81 // Translate URL Component Patterns as follows:
     82 //
     83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
     84 // -> host_prefix("www.example") = BU .www.example
     85 //
     86 // host_suffix(suffix) = suffix ED
     87 // -> host_suffix("example.com") = example.com ED
     88 // -> host_suffix(".example.com") = .example.com ED
     89 //
     90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
     91 // -> host_equals("www.example.com") = BU .www.example.com ED
     92 //
     93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
     94 //
     95 // With this, we can search the StringPatterns in the normalized URL.
     96 //
     97 //
     98 // Case 2: url_{prefix,suffix,equals,contains} searches.
     99 // =====================================================
    100 //
    101 // Step 1: as above, except that
    102 // - the scheme is not removed
    103 // - the port is not removed if it is specified and does not match the default
    104 //   port for the given scheme.
    105 //
    106 // Step 2:
    107 // Translate URL to String and add the following position markers:
    108 // - BU = Beginning of URL
    109 // - EU = End of URL
    110 //
    111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
    112 // BU http://www.example.com:8080/index.html?search=foo EU
    113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
    114 // BU http://www.example.com/index.html?search=foo EU
    115 //
    116 // url_prefix(prefix) = BU prefix
    117 // -> url_prefix("http://www.example") = BU http://www.example
    118 //
    119 // url_contains(substring) = substring
    120 // -> url_contains("index") = index
    121 //
    122 //
    123 // Case 3: {host,path,query}_contains searches.
    124 // ============================================
    125 //
    126 // These kinds of searches are not supported directly but can be derived
    127 // by a combination of a url_contains() query followed by an explicit test:
    128 //
    129 // host_contains(str) = url_contains(str) followed by test whether str occurs
    130 //   in host component of original URL.
    131 // -> host_contains("example.co") = example.co
    132 //    followed by gurl.host().find("example.co");
    133 //
    134 // [similarly for path_contains and query_contains].
    135 //
    136 //
    137 // Regular expression matching (url_matches searches)
    138 // ==================================================
    139 //
    140 // This class also supports matching regular expressions (RE2 syntax)
    141 // against full URLs, which are transformed as in case 2.
    142 
    143 namespace {
    144 
    145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
    146   return criterion == URLMatcherCondition::URL_MATCHES;
    147 }
    148 
    149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
    150   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
    151 }
    152 
    153 }  // namespace
    154 
    155 //
    156 // URLMatcherCondition
    157 //
    158 
    159 URLMatcherCondition::URLMatcherCondition()
    160     : criterion_(HOST_PREFIX),
    161       string_pattern_(NULL) {}
    162 
    163 URLMatcherCondition::~URLMatcherCondition() {}
    164 
    165 URLMatcherCondition::URLMatcherCondition(
    166     Criterion criterion,
    167     const StringPattern* string_pattern)
    168     : criterion_(criterion),
    169       string_pattern_(string_pattern) {}
    170 
    171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    172     : criterion_(rhs.criterion_),
    173       string_pattern_(rhs.string_pattern_) {}
    174 
    175 URLMatcherCondition& URLMatcherCondition::operator=(
    176     const URLMatcherCondition& rhs) {
    177   criterion_ = rhs.criterion_;
    178   string_pattern_ = rhs.string_pattern_;
    179   return *this;
    180 }
    181 
    182 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
    183   if (criterion_ < rhs.criterion_) return true;
    184   if (criterion_ > rhs.criterion_) return false;
    185   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    186     return *string_pattern_ < *rhs.string_pattern_;
    187   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
    188   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
    189   // or both are NULL.
    190   return false;
    191 }
    192 
    193 bool URLMatcherCondition::IsFullURLCondition() const {
    194   // For these criteria the SubstringMatcher needs to be executed on the
    195   // GURL that is canonicalized with
    196   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
    197   switch (criterion_) {
    198     case HOST_CONTAINS:
    199     case PATH_CONTAINS:
    200     case QUERY_CONTAINS:
    201     case URL_PREFIX:
    202     case URL_SUFFIX:
    203     case URL_CONTAINS:
    204     case URL_EQUALS:
    205       return true;
    206     default:
    207       break;
    208   }
    209   return false;
    210 }
    211 
    212 bool URLMatcherCondition::IsRegexCondition() const {
    213   return IsRegexCriterion(criterion_);
    214 }
    215 
    216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
    217   return IsOriginAndPathRegexCriterion(criterion_);
    218 }
    219 
    220 bool URLMatcherCondition::IsMatch(
    221     const std::set<StringPattern::ID>& matching_patterns,
    222     const GURL& url) const {
    223   DCHECK(string_pattern_);
    224   if (!ContainsKey(matching_patterns, string_pattern_->id()))
    225     return false;
    226   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
    227   // a substring match on the raw URL. In case of a match, we need to verify
    228   // that the match was found in the correct component of the URL.
    229   switch (criterion_) {
    230     case HOST_CONTAINS:
    231       return url.host().find(string_pattern_->pattern()) !=
    232           std::string::npos;
    233     case PATH_CONTAINS:
    234       return url.path().find(string_pattern_->pattern()) !=
    235           std::string::npos;
    236     case QUERY_CONTAINS:
    237       return url.query().find(string_pattern_->pattern()) !=
    238           std::string::npos;
    239     default:
    240       break;
    241   }
    242   return true;
    243 }
    244 
    245 //
    246 // URLMatcherConditionFactory
    247 //
    248 
    249 namespace {
    250 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
    251 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
    252 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
    253 const char kEndOfPath[] = {static_cast<char>(-3), 0};
    254 const char kEndOfURL[] = {static_cast<char>(-4), 0};
    255 }  // namespace
    256 
    257 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
    258 
    259 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
    260   STLDeleteElements(&substring_pattern_singletons_);
    261   STLDeleteElements(&regex_pattern_singletons_);
    262   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
    263 }
    264 
    265 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    266     const GURL& url) const {
    267   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
    268          url.path() + kEndOfPath +
    269          (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
    270 }
    271 
    272 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    273     const std::string& prefix) {
    274   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
    275       kBeginningOfURL + CanonicalizeHostname(prefix));
    276 }
    277 
    278 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    279     const std::string& suffix) {
    280   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
    281       suffix + kEndOfDomain);
    282 }
    283 
    284 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    285     const std::string& str) {
    286   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
    287 }
    288 
    289 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    290     const std::string& str) {
    291   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
    292       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
    293 }
    294 
    295 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    296     const std::string& prefix) {
    297   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
    298       kEndOfDomain + prefix);
    299 }
    300 
    301 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    302     const std::string& suffix) {
    303   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
    304 }
    305 
    306 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    307     const std::string& str) {
    308   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
    309 }
    310 
    311 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    312     const std::string& str) {
    313   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
    314       kEndOfDomain + str + kEndOfPath);
    315 }
    316 
    317 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    318     const std::string& prefix) {
    319   std::string pattern;
    320   if (!prefix.empty() && prefix[0] == '?')
    321     pattern = kEndOfPath + prefix;
    322   else
    323     pattern = kEndOfPath + ('?' + prefix);
    324 
    325   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
    326 }
    327 
    328 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    329     const std::string& suffix) {
    330   if (!suffix.empty() && suffix[0] == '?') {
    331     return CreateQueryEqualsCondition(suffix);
    332   } else {
    333     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
    334                            suffix + kEndOfURL);
    335   }
    336 }
    337 
    338 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    339     const std::string& str) {
    340   if (!str.empty() && str[0] == '?')
    341     return CreateQueryPrefixCondition(str);
    342   else
    343     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
    344 }
    345 
    346 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    347     const std::string& str) {
    348   std::string pattern;
    349   if (!str.empty() && str[0] == '?')
    350     pattern = kEndOfPath + str + kEndOfURL;
    351   else
    352     pattern = kEndOfPath + ('?' + str) + kEndOfURL;
    353 
    354   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
    355 }
    356 
    357 URLMatcherCondition
    358     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    359     const std::string& host_suffix,
    360     const std::string& path_prefix) {
    361   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
    362       host_suffix + kEndOfDomain + path_prefix);
    363 }
    364 
    365 URLMatcherCondition
    366 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    367     const std::string& host,
    368     const std::string& path_prefix) {
    369   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
    370       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
    371       path_prefix);
    372 }
    373 
    374 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    375     const GURL& url) const {
    376   GURL::Replacements replacements;
    377   replacements.ClearPassword();
    378   replacements.ClearUsername();
    379   replacements.ClearRef();
    380   // Clear port if it is implicit from scheme.
    381   if (url.has_port()) {
    382     const std::string& port = url.scheme();
    383     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
    384             url.EffectiveIntPort()) {
    385       replacements.ClearPort();
    386     }
    387   }
    388   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
    389       kEndOfURL;
    390 }
    391 
    392 static std::string CanonicalizeURLForRegexSearchesHelper(
    393     const GURL& url,
    394     bool clear_query) {
    395   GURL::Replacements replacements;
    396   replacements.ClearPassword();
    397   replacements.ClearUsername();
    398   replacements.ClearRef();
    399   if (clear_query)
    400     replacements.ClearQuery();
    401   // Clear port if it is implicit from scheme.
    402   if (url.has_port()) {
    403     const std::string& port = url.scheme();
    404     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
    405             url.EffectiveIntPort()) {
    406       replacements.ClearPort();
    407     }
    408   }
    409   return url.ReplaceComponents(replacements).spec();
    410 }
    411 
    412 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    413     const GURL& url) const {
    414   return CanonicalizeURLForRegexSearchesHelper(url, false);
    415 }
    416 
    417 std::string
    418 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    419     const GURL& url) const {
    420   return CanonicalizeURLForRegexSearchesHelper(url, true);
    421 }
    422 
    423 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    424     const std::string& prefix) {
    425   return CreateCondition(URLMatcherCondition::URL_PREFIX,
    426       kBeginningOfURL + prefix);
    427 }
    428 
    429 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    430     const std::string& suffix) {
    431   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
    432 }
    433 
    434 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    435     const std::string& str) {
    436   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
    437 }
    438 
    439 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    440     const std::string& str) {
    441   return CreateCondition(URLMatcherCondition::URL_EQUALS,
    442       kBeginningOfURL + str + kEndOfURL);
    443 }
    444 
    445 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    446     const std::string& regex) {
    447   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
    448 }
    449 
    450 URLMatcherCondition
    451 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    452     const std::string& regex) {
    453   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
    454 }
    455 
    456 void URLMatcherConditionFactory::ForgetUnusedPatterns(
    457       const std::set<StringPattern::ID>& used_patterns) {
    458   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
    459   while (i != substring_pattern_singletons_.end()) {
    460     if (ContainsKey(used_patterns, (*i)->id())) {
    461       ++i;
    462     } else {
    463       delete *i;
    464       substring_pattern_singletons_.erase(i++);
    465     }
    466   }
    467   i = regex_pattern_singletons_.begin();
    468   while (i != regex_pattern_singletons_.end()) {
    469     if (ContainsKey(used_patterns, (*i)->id())) {
    470       ++i;
    471     } else {
    472       delete *i;
    473       regex_pattern_singletons_.erase(i++);
    474     }
    475   }
    476   i = origin_and_path_regex_pattern_singletons_.begin();
    477   while (i != origin_and_path_regex_pattern_singletons_.end()) {
    478     if (ContainsKey(used_patterns, (*i)->id())) {
    479       ++i;
    480     } else {
    481       delete *i;
    482       origin_and_path_regex_pattern_singletons_.erase(i++);
    483     }
    484   }
    485 }
    486 
    487 bool URLMatcherConditionFactory::IsEmpty() const {
    488   return substring_pattern_singletons_.empty() &&
    489       regex_pattern_singletons_.empty() &&
    490       origin_and_path_regex_pattern_singletons_.empty();
    491 }
    492 
    493 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    494     URLMatcherCondition::Criterion criterion,
    495     const std::string& pattern) {
    496   StringPattern search_pattern(pattern, 0);
    497   PatternSingletons* pattern_singletons = NULL;
    498   if (IsRegexCriterion(criterion))
    499     pattern_singletons = &regex_pattern_singletons_;
    500   else if (IsOriginAndPathRegexCriterion(criterion))
    501     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
    502   else
    503     pattern_singletons = &substring_pattern_singletons_;
    504 
    505   PatternSingletons::const_iterator iter =
    506       pattern_singletons->find(&search_pattern);
    507 
    508   if (iter != pattern_singletons->end()) {
    509     return URLMatcherCondition(criterion, *iter);
    510   } else {
    511     StringPattern* new_pattern =
    512         new StringPattern(pattern, id_counter_++);
    513     pattern_singletons->insert(new_pattern);
    514     return URLMatcherCondition(criterion, new_pattern);
    515   }
    516 }
    517 
    518 std::string URLMatcherConditionFactory::CanonicalizeHostname(
    519     const std::string& hostname) const {
    520   if (!hostname.empty() && hostname[0] == '.')
    521     return hostname;
    522   else
    523     return "." + hostname;
    524 }
    525 
    526 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    527     StringPattern* lhs,
    528     StringPattern* rhs) const {
    529   if (lhs == NULL && rhs != NULL) return true;
    530   if (lhs != NULL && rhs != NULL)
    531     return lhs->pattern() < rhs->pattern();
    532   // Either both are NULL or only rhs is NULL.
    533   return false;
    534 }
    535 
    536 //
    537 // URLMatcherSchemeFilter
    538 //
    539 
    540 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    541     : filters_(1) {
    542   filters_.push_back(filter);
    543 }
    544 
    545 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    546     const std::vector<std::string>& filters)
    547     : filters_(filters) {}
    548 
    549 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
    550 
    551 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
    552   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
    553       filters_.end();
    554 }
    555 
    556 //
    557 // URLMatcherPortFilter
    558 //
    559 
    560 URLMatcherPortFilter::URLMatcherPortFilter(
    561     const std::vector<URLMatcherPortFilter::Range>& ranges)
    562     : ranges_(ranges) {}
    563 
    564 URLMatcherPortFilter::~URLMatcherPortFilter() {}
    565 
    566 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
    567   int port = url.EffectiveIntPort();
    568   for (std::vector<Range>::const_iterator i = ranges_.begin();
    569        i != ranges_.end(); ++i) {
    570     if (i->first <= port && port <= i->second)
    571       return true;
    572   }
    573   return false;
    574 }
    575 
    576 // static
    577 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
    578                                                               int to) {
    579   return Range(from, to);
    580 }
    581 
    582 // static
    583 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
    584   return Range(port, port);
    585 }
    586 
    587 //
    588 // URLMatcherConditionSet
    589 //
    590 
    591 URLMatcherConditionSet::~URLMatcherConditionSet() {}
    592 
    593 URLMatcherConditionSet::URLMatcherConditionSet(
    594     ID id,
    595     const Conditions& conditions)
    596     : id_(id),
    597       conditions_(conditions) {}
    598 
    599 URLMatcherConditionSet::URLMatcherConditionSet(
    600     ID id,
    601     const Conditions& conditions,
    602     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    603     scoped_ptr<URLMatcherPortFilter> port_filter)
    604     : id_(id),
    605       conditions_(conditions),
    606       scheme_filter_(scheme_filter.Pass()),
    607       port_filter_(port_filter.Pass()) {}
    608 
    609 bool URLMatcherConditionSet::IsMatch(
    610     const std::set<StringPattern::ID>& matching_patterns,
    611     const GURL& url) const {
    612   for (Conditions::const_iterator i = conditions_.begin();
    613        i != conditions_.end(); ++i) {
    614     if (!i->IsMatch(matching_patterns, url))
    615       return false;
    616   }
    617   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    618     return false;
    619   if (port_filter_.get() && !port_filter_->IsMatch(url))
    620     return false;
    621   return true;
    622 }
    623 
    624 //
    625 // URLMatcher
    626 //
    627 
    628 URLMatcher::URLMatcher() {}
    629 
    630 URLMatcher::~URLMatcher() {}
    631 
    632 void URLMatcher::AddConditionSets(
    633     const URLMatcherConditionSet::Vector& condition_sets) {
    634   for (URLMatcherConditionSet::Vector::const_iterator i =
    635        condition_sets.begin(); i != condition_sets.end(); ++i) {
    636     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
    637         url_matcher_condition_sets_.end());
    638     url_matcher_condition_sets_[(*i)->id()] = *i;
    639   }
    640   UpdateInternalDatastructures();
    641 }
    642 
    643 void URLMatcher::RemoveConditionSets(
    644     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
    645   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
    646        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    647     DCHECK(url_matcher_condition_sets_.find(*i) !=
    648         url_matcher_condition_sets_.end());
    649     url_matcher_condition_sets_.erase(*i);
    650   }
    651   UpdateInternalDatastructures();
    652 }
    653 
    654 void URLMatcher::ClearUnusedConditionSets() {
    655   UpdateConditionFactory();
    656 }
    657 
    658 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    659     const GURL& url) const {
    660   // Find all IDs of StringPatterns that match |url|.
    661   // See URLMatcherConditionFactory for the canonicalization of URLs and the
    662   // distinction between full url searches and url component searches.
    663   std::set<StringPattern::ID> matches;
    664   if (!full_url_matcher_.IsEmpty()) {
    665     full_url_matcher_.Match(
    666         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
    667   }
    668   if (!url_component_matcher_.IsEmpty()) {
    669     url_component_matcher_.Match(
    670         condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
    671   }
    672   if (!regex_set_matcher_.IsEmpty()) {
    673     regex_set_matcher_.Match(
    674         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
    675   }
    676   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    677     origin_and_path_regex_set_matcher_.Match(
    678         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
    679         &matches);
    680   }
    681 
    682   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
    683   // were fulfilled.
    684   std::set<URLMatcherConditionSet::ID> result;
    685   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
    686        i != matches.end(); ++i) {
    687     // For each URLMatcherConditionSet there is exactly one condition
    688     // registered in substring_match_triggers_. This means that the following
    689     // logic tests each URLMatcherConditionSet exactly once if it can be
    690     // completely fulfilled.
    691     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
    692         substring_match_triggers_.find(*i);
    693     if (triggered_condition_sets_iter == substring_match_triggers_.end())
    694       continue;  // Not all substring matches are triggers for a condition set.
    695     const std::set<URLMatcherConditionSet::ID>& condition_sets =
    696         triggered_condition_sets_iter->second;
    697     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
    698          condition_sets.begin(); j != condition_sets.end(); ++j) {
    699       URLMatcherConditionSets::const_iterator condition_set_iter =
    700           url_matcher_condition_sets_.find(*j);
    701       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
    702       if (condition_set_iter->second->IsMatch(matches, url))
    703         result.insert(*j);
    704     }
    705   }
    706 
    707   return result;
    708 }
    709 
    710 bool URLMatcher::IsEmpty() const {
    711   return condition_factory_.IsEmpty() &&
    712       url_matcher_condition_sets_.empty() &&
    713       substring_match_triggers_.empty() &&
    714       full_url_matcher_.IsEmpty() &&
    715       url_component_matcher_.IsEmpty() &&
    716       regex_set_matcher_.IsEmpty() &&
    717       origin_and_path_regex_set_matcher_.IsEmpty() &&
    718       registered_full_url_patterns_.empty() &&
    719       registered_url_component_patterns_.empty();
    720 }
    721 
    722 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
    723   // The purpose of |full_url_conditions| is just that we need to execute
    724   // the same logic once for Full URL searches and once for URL Component
    725   // searches (see URLMatcherConditionFactory).
    726 
    727   // Determine which patterns need to be registered when this function
    728   // terminates.
    729   std::set<const StringPattern*> new_patterns;
    730   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    731       url_matcher_condition_sets_.begin();
    732       condition_set_iter != url_matcher_condition_sets_.end();
    733       ++condition_set_iter) {
    734     const URLMatcherConditionSet::Conditions& conditions =
    735         condition_set_iter->second->conditions();
    736     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    737          conditions.begin(); condition_iter != conditions.end();
    738          ++condition_iter) {
    739       // If we are called to process Full URL searches, ignore others, and
    740       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
    741       if (!condition_iter->IsRegexCondition() &&
    742           !condition_iter->IsOriginAndPathRegexCondition() &&
    743           full_url_conditions == condition_iter->IsFullURLCondition())
    744         new_patterns.insert(condition_iter->string_pattern());
    745     }
    746   }
    747 
    748   // This is the set of patterns that were registered before this function
    749   // is called.
    750   std::set<const StringPattern*>& registered_patterns =
    751       full_url_conditions ? registered_full_url_patterns_
    752                           : registered_url_component_patterns_;
    753 
    754   // Add all patterns that are in new_patterns but not in registered_patterns.
    755   std::vector<const StringPattern*> patterns_to_register;
    756   std::set_difference(
    757       new_patterns.begin(), new_patterns.end(),
    758       registered_patterns.begin(), registered_patterns.end(),
    759       std::back_inserter(patterns_to_register));
    760 
    761   // Remove all patterns that are in registered_patterns but not in
    762   // new_patterns.
    763   std::vector<const StringPattern*> patterns_to_unregister;
    764   std::set_difference(
    765       registered_patterns.begin(), registered_patterns.end(),
    766       new_patterns.begin(), new_patterns.end(),
    767       std::back_inserter(patterns_to_unregister));
    768 
    769   // Update the SubstringSetMatcher.
    770   SubstringSetMatcher& url_matcher =
    771       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
    772   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
    773                                             patterns_to_unregister);
    774 
    775   // Update the set of registered_patterns for the next time this function
    776   // is being called.
    777   registered_patterns.swap(new_patterns);
    778 }
    779 
    780 void URLMatcher::UpdateRegexSetMatcher() {
    781   std::vector<const StringPattern*> new_patterns;
    782   std::vector<const StringPattern*> new_origin_and_path_patterns;
    783 
    784   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    785       url_matcher_condition_sets_.begin();
    786       condition_set_iter != url_matcher_condition_sets_.end();
    787       ++condition_set_iter) {
    788     const URLMatcherConditionSet::Conditions& conditions =
    789         condition_set_iter->second->conditions();
    790     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    791          conditions.begin(); condition_iter != conditions.end();
    792          ++condition_iter) {
    793       if (condition_iter->IsRegexCondition()) {
    794         new_patterns.push_back(condition_iter->string_pattern());
    795       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
    796         new_origin_and_path_patterns.push_back(
    797             condition_iter->string_pattern());
    798       }
    799     }
    800   }
    801 
    802   // Start over from scratch. We can't really do better than this, since the
    803   // FilteredRE2 backend doesn't support incremental updates.
    804   regex_set_matcher_.ClearPatterns();
    805   regex_set_matcher_.AddPatterns(new_patterns);
    806   origin_and_path_regex_set_matcher_.ClearPatterns();
    807   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
    808 }
    809 
    810 void URLMatcher::UpdateTriggers() {
    811   // Count substring pattern frequencies.
    812   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
    813   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    814       url_matcher_condition_sets_.begin();
    815       condition_set_iter != url_matcher_condition_sets_.end();
    816       ++condition_set_iter) {
    817     const URLMatcherConditionSet::Conditions& conditions =
    818         condition_set_iter->second->conditions();
    819     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    820          conditions.begin(); condition_iter != conditions.end();
    821          ++condition_iter) {
    822       const StringPattern* pattern = condition_iter->string_pattern();
    823       substring_pattern_frequencies[pattern->id()]++;
    824     }
    825   }
    826 
    827   // Update trigger conditions: Determine for each URLMatcherConditionSet which
    828   // URLMatcherCondition contains a StringPattern that occurs least
    829   // frequently in this URLMatcher. We assume that this condition is very
    830   // specific and occurs rarely in URLs. If a match occurs for this
    831   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
    832   // respective URLMatcherConditionSet as well to see whether the entire
    833   // URLMatcherConditionSet is considered matching.
    834   substring_match_triggers_.clear();
    835   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    836       url_matcher_condition_sets_.begin();
    837       condition_set_iter != url_matcher_condition_sets_.end();
    838       ++condition_set_iter) {
    839     const URLMatcherConditionSet::Conditions& conditions =
    840         condition_set_iter->second->conditions();
    841     if (conditions.empty())
    842       continue;
    843     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    844         conditions.begin();
    845     StringPattern::ID trigger = condition_iter->string_pattern()->id();
    846     // We skip the first element in the following loop.
    847     ++condition_iter;
    848     for (; condition_iter != conditions.end(); ++condition_iter) {
    849       StringPattern::ID current_id =
    850           condition_iter->string_pattern()->id();
    851       if (substring_pattern_frequencies[trigger] >
    852           substring_pattern_frequencies[current_id]) {
    853         trigger = current_id;
    854       }
    855     }
    856     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
    857   }
    858 }
    859 
    860 void URLMatcher::UpdateConditionFactory() {
    861   std::set<StringPattern::ID> used_patterns;
    862   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    863       url_matcher_condition_sets_.begin();
    864       condition_set_iter != url_matcher_condition_sets_.end();
    865       ++condition_set_iter) {
    866     const URLMatcherConditionSet::Conditions& conditions =
    867         condition_set_iter->second->conditions();
    868     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    869          conditions.begin(); condition_iter != conditions.end();
    870          ++condition_iter) {
    871       used_patterns.insert(condition_iter->string_pattern()->id());
    872     }
    873   }
    874   condition_factory_.ForgetUnusedPatterns(used_patterns);
    875 }
    876 
    877 void URLMatcher::UpdateInternalDatastructures() {
    878   UpdateSubstringSetMatcher(false);
    879   UpdateSubstringSetMatcher(true);
    880   UpdateRegexSetMatcher();
    881   UpdateTriggers();
    882   UpdateConditionFactory();
    883 }
    884 
    885 }  // namespace extensions
    886