Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/url_matcher/url_matcher.h"
      6 
      7 #include <algorithm>
      8 #include <iterator>
      9 
     10 #include "base/logging.h"
     11 #include "url/gurl.h"
     12 #include "url/url_canon.h"
     13 
     14 namespace url_matcher {
     15 
     16 // This set of classes implement a mapping of URL Component Patterns, such as
     17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
     18 // for use in substring comparisons.
     19 //
     20 // The idea of this mapping is to reduce the problem of comparing many
     21 // URL Component Patterns against one URL to the problem of searching many
     22 // substrings in one string:
     23 //
     24 // ----------------------                    -----------------
     25 // | URL Query operator | ----translate----> | StringPattern |
     26 // ----------------------                    -----------------
     27 //                                                   ^
     28 //                                                   |
     29 //                                                compare
     30 //                                                   |
     31 //                                                   v
     32 // ----------------------                    -----------------
     33 // | URL to compare     |                    |               |
     34 // | to all URL Query   | ----translate----> | String        |
     35 // | operators          |                    |               |
     36 // ----------------------                    -----------------
     37 //
     38 // The reason for this problem reduction is that there are efficient algorithms
     39 // for searching many substrings in one string (see Aho-Corasick algorithm).
     40 //
     41 // Additionally, some of the same pieces are reused to implement regular
     42 // expression comparisons. The FilteredRE2 implementation for matching many
     43 // regular expressions against one string uses prefiltering, in which a set
     44 // of substrings (derived from the regexes) are first searched for, to reduce
     45 // the number of regular expressions to test; the prefiltering step also
     46 // uses Aho-Corasick.
     47 //
     48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
     49 // ==========================================================
     50 //
     51 // For searches in this class, we normalize URLs as follows:
     52 //
     53 // Step 1:
     54 // Remove scheme, port and segment from URL:
     55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
     56 //    www.example.com/index.html?search=foo
     57 //
     58 // We remove the scheme and port number because they can be checked later
     59 // in a secondary filter step. We remove the segment (the #... part) because
     60 // this is not guaranteed to be ASCII-7 encoded.
     61 //
     62 // Step 2:
     63 // Translate URL to String and add the following position markers:
     64 // - BU = Beginning of URL
     65 // - ED = End of Domain
     66 // - EP = End of Path
     67 // - EU = End of URL
     68 // Furthermore, the hostname is canonicalized to start with a ".".
     69 //
     70 // Position markers are represented as characters >127, which are therefore
     71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
     72 //
     73 // -> www.example.com/index.html?search=foo becomes
     74 // BU .www.example.com ED /index.html EP ?search=foo EU
     75 //
     76 // -> www.example.com/index.html becomes
     77 // BU .www.example.com ED /index.html EP EU
     78 //
     79 // Step 3:
     80 // Translate URL Component Patterns as follows:
     81 //
     82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
     83 // -> host_prefix("www.example") = BU .www.example
     84 //
     85 // host_suffix(suffix) = suffix ED
     86 // -> host_suffix("example.com") = example.com ED
     87 // -> host_suffix(".example.com") = .example.com ED
     88 //
     89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
     90 // -> host_equals("www.example.com") = BU .www.example.com ED
     91 //
     92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
     93 //
     94 // With this, we can search the StringPatterns in the normalized URL.
     95 //
     96 //
     97 // Case 2: url_{prefix,suffix,equals,contains} searches.
     98 // =====================================================
     99 //
    100 // Step 1: as above, except that
    101 // - the scheme is not removed
    102 // - the port is not removed if it is specified and does not match the default
    103 //   port for the given scheme.
    104 //
    105 // Step 2:
    106 // Translate URL to String and add the following position markers:
    107 // - BU = Beginning of URL
    108 // - EU = End of URL
    109 //
    110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
    111 // BU http://www.example.com:8080/index.html?search=foo EU
    112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
    113 // BU http://www.example.com/index.html?search=foo EU
    114 //
    115 // url_prefix(prefix) = BU prefix
    116 // -> url_prefix("http://www.example") = BU http://www.example
    117 //
    118 // url_contains(substring) = substring
    119 // -> url_contains("index") = index
    120 //
    121 //
    122 // Case 3: {host,path,query}_contains searches.
    123 // ============================================
    124 //
    125 // These kinds of searches are not supported directly but can be derived
    126 // by a combination of a url_contains() query followed by an explicit test:
    127 //
    128 // host_contains(str) = url_contains(str) followed by test whether str occurs
    129 //   in host component of original URL.
    130 // -> host_contains("example.co") = example.co
    131 //    followed by gurl.host().find("example.co");
    132 //
    133 // [similarly for path_contains and query_contains].
    134 //
    135 //
    136 // Regular expression matching (url_matches searches)
    137 // ==================================================
    138 //
    139 // This class also supports matching regular expressions (RE2 syntax)
    140 // against full URLs, which are transformed as in case 2.
    141 
    142 namespace {
    143 
    144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
    145   return criterion == URLMatcherCondition::URL_MATCHES;
    146 }
    147 
    148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
    149   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
    150 }
    151 
    152 }  // namespace
    153 
    154 //
    155 // URLMatcherCondition
    156 //
    157 
    158 URLMatcherCondition::URLMatcherCondition()
    159     : criterion_(HOST_PREFIX),
    160       string_pattern_(NULL) {}
    161 
    162 URLMatcherCondition::~URLMatcherCondition() {}
    163 
    164 URLMatcherCondition::URLMatcherCondition(
    165     Criterion criterion,
    166     const StringPattern* string_pattern)
    167     : criterion_(criterion),
    168       string_pattern_(string_pattern) {}
    169 
    170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    171     : criterion_(rhs.criterion_),
    172       string_pattern_(rhs.string_pattern_) {}
    173 
    174 URLMatcherCondition& URLMatcherCondition::operator=(
    175     const URLMatcherCondition& rhs) {
    176   criterion_ = rhs.criterion_;
    177   string_pattern_ = rhs.string_pattern_;
    178   return *this;
    179 }
    180 
    181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
    182   if (criterion_ < rhs.criterion_) return true;
    183   if (criterion_ > rhs.criterion_) return false;
    184   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    185     return *string_pattern_ < *rhs.string_pattern_;
    186   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
    187   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
    188   // or both are NULL.
    189   return false;
    190 }
    191 
    192 bool URLMatcherCondition::IsFullURLCondition() const {
    193   // For these criteria the SubstringMatcher needs to be executed on the
    194   // GURL that is canonicalized with
    195   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
    196   switch (criterion_) {
    197     case HOST_CONTAINS:
    198     case PATH_CONTAINS:
    199     case QUERY_CONTAINS:
    200     case URL_PREFIX:
    201     case URL_SUFFIX:
    202     case URL_CONTAINS:
    203     case URL_EQUALS:
    204       return true;
    205     default:
    206       break;
    207   }
    208   return false;
    209 }
    210 
    211 bool URLMatcherCondition::IsRegexCondition() const {
    212   return IsRegexCriterion(criterion_);
    213 }
    214 
    215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
    216   return IsOriginAndPathRegexCriterion(criterion_);
    217 }
    218 
    219 bool URLMatcherCondition::IsMatch(
    220     const std::set<StringPattern::ID>& matching_patterns,
    221     const GURL& url) const {
    222   DCHECK(string_pattern_);
    223   if (!ContainsKey(matching_patterns, string_pattern_->id()))
    224     return false;
    225   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
    226   // a substring match on the raw URL. In case of a match, we need to verify
    227   // that the match was found in the correct component of the URL.
    228   switch (criterion_) {
    229     case HOST_CONTAINS:
    230       return url.host().find(string_pattern_->pattern()) !=
    231           std::string::npos;
    232     case PATH_CONTAINS:
    233       return url.path().find(string_pattern_->pattern()) !=
    234           std::string::npos;
    235     case QUERY_CONTAINS:
    236       return url.query().find(string_pattern_->pattern()) !=
    237           std::string::npos;
    238     default:
    239       break;
    240   }
    241   return true;
    242 }
    243 
    244 //
    245 // URLMatcherConditionFactory
    246 //
    247 
    248 namespace {
    249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
    250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
    251 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
    252 const char kEndOfPath[] = {static_cast<char>(-3), 0};
    253 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
    254 const char kEndOfURL[] = {static_cast<char>(-5), 0};
    255 
    256 // The delimiter for query parameters
    257 const char kQuerySeparator = '&';
    258 }  // namespace
    259 
    260 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
    261 
    262 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
    263   STLDeleteElements(&substring_pattern_singletons_);
    264   STLDeleteElements(&regex_pattern_singletons_);
    265   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
    266 }
    267 
    268 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    269     const GURL& url) const {
    270   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
    271          url.path() + kEndOfPath +
    272          (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
    273                           : std::string()) +
    274          kEndOfURL;
    275 }
    276 
    277 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    278     const std::string& prefix) {
    279   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
    280       kBeginningOfURL + CanonicalizeHostname(prefix));
    281 }
    282 
    283 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    284     const std::string& suffix) {
    285   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
    286       suffix + kEndOfDomain);
    287 }
    288 
    289 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    290     const std::string& str) {
    291   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
    292 }
    293 
    294 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    295     const std::string& str) {
    296   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
    297       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
    298 }
    299 
    300 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    301     const std::string& prefix) {
    302   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
    303       kEndOfDomain + prefix);
    304 }
    305 
    306 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    307     const std::string& suffix) {
    308   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
    309 }
    310 
    311 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    312     const std::string& str) {
    313   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
    314 }
    315 
    316 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    317     const std::string& str) {
    318   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
    319       kEndOfDomain + str + kEndOfPath);
    320 }
    321 
    322 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    323     const std::string& prefix) {
    324   std::string pattern;
    325   if (!prefix.empty() && prefix[0] == '?')
    326     pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
    327   else
    328     pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
    329 
    330   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
    331 }
    332 
    333 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    334     const std::string& suffix) {
    335   if (!suffix.empty() && suffix[0] == '?') {
    336     return CreateQueryEqualsCondition(suffix);
    337   } else {
    338     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
    339                            CanonicalizeQuery(suffix, false, true) + kEndOfURL);
    340   }
    341 }
    342 
    343 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    344     const std::string& str) {
    345   if (!str.empty() && str[0] == '?')
    346     return CreateQueryPrefixCondition(str);
    347   else
    348     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
    349 }
    350 
    351 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    352     const std::string& str) {
    353   std::string pattern;
    354   if (!str.empty() && str[0] == '?')
    355     pattern =
    356         kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
    357   else
    358     pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
    359 
    360   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
    361 }
    362 
    363 URLMatcherCondition
    364     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    365     const std::string& host_suffix,
    366     const std::string& path_prefix) {
    367   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
    368       host_suffix + kEndOfDomain + path_prefix);
    369 }
    370 
    371 URLMatcherCondition
    372 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    373     const std::string& host,
    374     const std::string& path_prefix) {
    375   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
    376       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
    377       path_prefix);
    378 }
    379 
    380 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    381     const GURL& url) const {
    382   GURL::Replacements replacements;
    383   replacements.ClearPassword();
    384   replacements.ClearUsername();
    385   replacements.ClearRef();
    386   // Clear port if it is implicit from scheme.
    387   if (url.has_port()) {
    388     const std::string& port = url.scheme();
    389     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
    390         url.EffectiveIntPort()) {
    391       replacements.ClearPort();
    392     }
    393   }
    394   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
    395       kEndOfURL;
    396 }
    397 
    398 static std::string CanonicalizeURLForRegexSearchesHelper(
    399     const GURL& url,
    400     bool clear_query) {
    401   GURL::Replacements replacements;
    402   replacements.ClearPassword();
    403   replacements.ClearUsername();
    404   replacements.ClearRef();
    405   if (clear_query)
    406     replacements.ClearQuery();
    407   // Clear port if it is implicit from scheme.
    408   if (url.has_port()) {
    409     const std::string& port = url.scheme();
    410     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
    411         url.EffectiveIntPort()) {
    412       replacements.ClearPort();
    413     }
    414   }
    415   return url.ReplaceComponents(replacements).spec();
    416 }
    417 
    418 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    419     const GURL& url) const {
    420   return CanonicalizeURLForRegexSearchesHelper(url, false);
    421 }
    422 
    423 std::string
    424 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    425     const GURL& url) const {
    426   return CanonicalizeURLForRegexSearchesHelper(url, true);
    427 }
    428 
    429 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    430     const std::string& prefix) {
    431   return CreateCondition(URLMatcherCondition::URL_PREFIX,
    432       kBeginningOfURL + prefix);
    433 }
    434 
    435 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    436     const std::string& suffix) {
    437   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
    438 }
    439 
    440 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    441     const std::string& str) {
    442   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
    443 }
    444 
    445 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    446     const std::string& str) {
    447   return CreateCondition(URLMatcherCondition::URL_EQUALS,
    448       kBeginningOfURL + str + kEndOfURL);
    449 }
    450 
    451 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    452     const std::string& regex) {
    453   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
    454 }
    455 
    456 URLMatcherCondition
    457 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    458     const std::string& regex) {
    459   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
    460 }
    461 
    462 void URLMatcherConditionFactory::ForgetUnusedPatterns(
    463       const std::set<StringPattern::ID>& used_patterns) {
    464   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
    465   while (i != substring_pattern_singletons_.end()) {
    466     if (ContainsKey(used_patterns, (*i)->id())) {
    467       ++i;
    468     } else {
    469       delete *i;
    470       substring_pattern_singletons_.erase(i++);
    471     }
    472   }
    473   i = regex_pattern_singletons_.begin();
    474   while (i != regex_pattern_singletons_.end()) {
    475     if (ContainsKey(used_patterns, (*i)->id())) {
    476       ++i;
    477     } else {
    478       delete *i;
    479       regex_pattern_singletons_.erase(i++);
    480     }
    481   }
    482   i = origin_and_path_regex_pattern_singletons_.begin();
    483   while (i != origin_and_path_regex_pattern_singletons_.end()) {
    484     if (ContainsKey(used_patterns, (*i)->id())) {
    485       ++i;
    486     } else {
    487       delete *i;
    488       origin_and_path_regex_pattern_singletons_.erase(i++);
    489     }
    490   }
    491 }
    492 
    493 bool URLMatcherConditionFactory::IsEmpty() const {
    494   return substring_pattern_singletons_.empty() &&
    495       regex_pattern_singletons_.empty() &&
    496       origin_and_path_regex_pattern_singletons_.empty();
    497 }
    498 
    499 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    500     URLMatcherCondition::Criterion criterion,
    501     const std::string& pattern) {
    502   StringPattern search_pattern(pattern, 0);
    503   PatternSingletons* pattern_singletons = NULL;
    504   if (IsRegexCriterion(criterion))
    505     pattern_singletons = &regex_pattern_singletons_;
    506   else if (IsOriginAndPathRegexCriterion(criterion))
    507     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
    508   else
    509     pattern_singletons = &substring_pattern_singletons_;
    510 
    511   PatternSingletons::const_iterator iter =
    512       pattern_singletons->find(&search_pattern);
    513 
    514   if (iter != pattern_singletons->end()) {
    515     return URLMatcherCondition(criterion, *iter);
    516   } else {
    517     StringPattern* new_pattern =
    518         new StringPattern(pattern, id_counter_++);
    519     pattern_singletons->insert(new_pattern);
    520     return URLMatcherCondition(criterion, new_pattern);
    521   }
    522 }
    523 
    524 std::string URLMatcherConditionFactory::CanonicalizeHostname(
    525     const std::string& hostname) const {
    526   if (!hostname.empty() && hostname[0] == '.')
    527     return hostname;
    528   else
    529     return "." + hostname;
    530 }
    531 
    532 // This function prepares the query string by replacing query separator with a
    533 // magic value (|kQueryComponentDelimiter|). When the boolean
    534 // |prepend_beginning_of_query_component| is true the function prepends the
    535 // query with the same magic. This is done to locate the start of a key value
    536 // pair in the query string. The parameter |query| is passed by value
    537 // intentionally, since it is locally modified.
    538 std::string URLMatcherConditionFactory::CanonicalizeQuery(
    539     std::string query,
    540     bool prepend_beginning_of_query_component,
    541     bool append_end_of_query_component) const {
    542   for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
    543     if (*it == kQuerySeparator)
    544       *it = kQueryComponentDelimiter[0];
    545   }
    546   if (prepend_beginning_of_query_component)
    547     query = kQueryComponentDelimiter + query;
    548   if (append_end_of_query_component)
    549     query += kQueryComponentDelimiter;
    550   return query;
    551 }
    552 
    553 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    554     StringPattern* lhs,
    555     StringPattern* rhs) const {
    556   if (lhs == NULL && rhs != NULL) return true;
    557   if (lhs != NULL && rhs != NULL)
    558     return lhs->pattern() < rhs->pattern();
    559   // Either both are NULL or only rhs is NULL.
    560   return false;
    561 }
    562 
    563 //
    564 // URLQueryElementMatcherCondition
    565 //
    566 
    567 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
    568     const std::string& key,
    569     const std::string& value,
    570     QueryValueMatchType query_value_match_type,
    571     QueryElementType query_element_type,
    572     Type match_type,
    573     URLMatcherConditionFactory* factory) {
    574   match_type_ = match_type;
    575 
    576   if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
    577     key_ = kQueryComponentDelimiter + key + "=";
    578     value_ = value;
    579   } else {
    580     key_ = kQueryComponentDelimiter + key;
    581     value_ = std::string();
    582   }
    583 
    584   if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
    585     value_ += kQueryComponentDelimiter;
    586 
    587   // If |value_| is empty no need to find the |key_| and verify if the value
    588   // matches. Simply checking the presence of key is sufficient, which is done
    589   // by MATCH_ANY
    590   if (value_.empty())
    591     match_type_ = MATCH_ANY;
    592 
    593   URLMatcherCondition condition;
    594   // If |match_type_| is MATCH_ANY, then we could simply look for the
    595   // combination of |key_| + |value_|, which can be efficiently done by
    596   // SubstringMatcher
    597   if (match_type_ == MATCH_ANY)
    598     condition = factory->CreateQueryContainsCondition(key_ + value_);
    599   else
    600     condition = factory->CreateQueryContainsCondition(key_);
    601   string_pattern_ = condition.string_pattern();
    602 
    603   key_length_ = key_.length();
    604   value_length_ = value_.length();
    605 }
    606 
    607 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
    608 
    609 bool URLQueryElementMatcherCondition::operator<(
    610     const URLQueryElementMatcherCondition& rhs) const {
    611   if (match_type_ != rhs.match_type_)
    612     return match_type_ < rhs.match_type_;
    613   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    614     return *string_pattern_ < *rhs.string_pattern_;
    615   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL)
    616     return true;
    617   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
    618   // or both are NULL.
    619   return false;
    620 }
    621 
    622 bool URLQueryElementMatcherCondition::IsMatch(
    623     const std::string& url_for_component_searches) const {
    624   switch (match_type_) {
    625     case MATCH_ANY: {
    626       // For MATCH_ANY, no additional verification step is needed. We can trust
    627       // the SubstringMatcher to do the verification.
    628       return true;
    629     }
    630     case MATCH_ALL: {
    631       size_t start = 0;
    632       int found = 0;
    633       size_t offset;
    634       while ((offset = url_for_component_searches.find(key_, start)) !=
    635              std::string::npos) {
    636         if (url_for_component_searches.compare(
    637                 offset + key_length_, value_length_, value_) != 0) {
    638           return false;
    639         } else {
    640           ++found;
    641         }
    642         start = offset + key_length_ + value_length_ - 1;
    643       }
    644       return !!found;
    645     }
    646     case MATCH_FIRST: {
    647       size_t offset = url_for_component_searches.find(key_);
    648       return url_for_component_searches.compare(
    649                  offset + key_length_, value_length_, value_) == 0;
    650     }
    651     case MATCH_LAST: {
    652       size_t offset = url_for_component_searches.rfind(key_);
    653       return url_for_component_searches.compare(
    654                  offset + key_length_, value_length_, value_) == 0;
    655     }
    656   }
    657   NOTREACHED();
    658   return false;
    659 }
    660 
    661 //
    662 // URLMatcherSchemeFilter
    663 //
    664 
    665 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    666     : filters_(1) {
    667   filters_.push_back(filter);
    668 }
    669 
    670 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    671     const std::vector<std::string>& filters)
    672     : filters_(filters) {}
    673 
    674 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
    675 
    676 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
    677   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
    678       filters_.end();
    679 }
    680 
    681 //
    682 // URLMatcherPortFilter
    683 //
    684 
    685 URLMatcherPortFilter::URLMatcherPortFilter(
    686     const std::vector<URLMatcherPortFilter::Range>& ranges)
    687     : ranges_(ranges) {}
    688 
    689 URLMatcherPortFilter::~URLMatcherPortFilter() {}
    690 
    691 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
    692   int port = url.EffectiveIntPort();
    693   for (std::vector<Range>::const_iterator i = ranges_.begin();
    694        i != ranges_.end(); ++i) {
    695     if (i->first <= port && port <= i->second)
    696       return true;
    697   }
    698   return false;
    699 }
    700 
    701 // static
    702 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
    703                                                               int to) {
    704   return Range(from, to);
    705 }
    706 
    707 // static
    708 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
    709   return Range(port, port);
    710 }
    711 
    712 //
    713 // URLMatcherConditionSet
    714 //
    715 
    716 URLMatcherConditionSet::~URLMatcherConditionSet() {}
    717 
    718 URLMatcherConditionSet::URLMatcherConditionSet(
    719     ID id,
    720     const Conditions& conditions)
    721     : id_(id),
    722       conditions_(conditions) {}
    723 
    724 URLMatcherConditionSet::URLMatcherConditionSet(
    725     ID id,
    726     const Conditions& conditions,
    727     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    728     scoped_ptr<URLMatcherPortFilter> port_filter)
    729     : id_(id),
    730       conditions_(conditions),
    731       scheme_filter_(scheme_filter.Pass()),
    732       port_filter_(port_filter.Pass()) {}
    733 
    734 URLMatcherConditionSet::URLMatcherConditionSet(
    735     ID id,
    736     const Conditions& conditions,
    737     const QueryConditions& query_conditions,
    738     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    739     scoped_ptr<URLMatcherPortFilter> port_filter)
    740     : id_(id),
    741       conditions_(conditions),
    742       query_conditions_(query_conditions),
    743       scheme_filter_(scheme_filter.Pass()),
    744       port_filter_(port_filter.Pass()) {}
    745 
    746 bool URLMatcherConditionSet::IsMatch(
    747     const std::set<StringPattern::ID>& matching_patterns,
    748     const GURL& url) const {
    749   return IsMatch(matching_patterns, url, std::string());
    750 }
    751 
    752 bool URLMatcherConditionSet::IsMatch(
    753     const std::set<StringPattern::ID>& matching_patterns,
    754     const GURL& url,
    755     const std::string& url_for_component_searches) const {
    756   for (Conditions::const_iterator i = conditions_.begin();
    757        i != conditions_.end(); ++i) {
    758     if (!i->IsMatch(matching_patterns, url))
    759       return false;
    760   }
    761   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    762     return false;
    763   if (port_filter_.get() && !port_filter_->IsMatch(url))
    764     return false;
    765   if (query_conditions_.empty())
    766     return true;
    767   // The loop is duplicated below for performance reasons. If not all query
    768   // elements are found, no need to verify match that is expected to take more
    769   // cycles.
    770   for (QueryConditions::const_iterator i = query_conditions_.begin();
    771        i != query_conditions_.end();
    772        ++i) {
    773     if (!ContainsKey(matching_patterns, i->string_pattern()->id()))
    774       return false;
    775   }
    776   for (QueryConditions::const_iterator i = query_conditions_.begin();
    777        i != query_conditions_.end();
    778        ++i) {
    779     if (!i->IsMatch(url_for_component_searches))
    780       return false;
    781   }
    782   return true;
    783 }
    784 
    785 //
    786 // URLMatcher
    787 //
    788 
    789 URLMatcher::URLMatcher() {}
    790 
    791 URLMatcher::~URLMatcher() {}
    792 
    793 void URLMatcher::AddConditionSets(
    794     const URLMatcherConditionSet::Vector& condition_sets) {
    795   for (URLMatcherConditionSet::Vector::const_iterator i =
    796        condition_sets.begin(); i != condition_sets.end(); ++i) {
    797     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
    798         url_matcher_condition_sets_.end());
    799     url_matcher_condition_sets_[(*i)->id()] = *i;
    800   }
    801   UpdateInternalDatastructures();
    802 }
    803 
    804 void URLMatcher::RemoveConditionSets(
    805     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
    806   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
    807        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    808     DCHECK(url_matcher_condition_sets_.find(*i) !=
    809         url_matcher_condition_sets_.end());
    810     url_matcher_condition_sets_.erase(*i);
    811   }
    812   UpdateInternalDatastructures();
    813 }
    814 
    815 void URLMatcher::ClearUnusedConditionSets() {
    816   UpdateConditionFactory();
    817 }
    818 
    819 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    820     const GURL& url) const {
    821   // Find all IDs of StringPatterns that match |url|.
    822   // See URLMatcherConditionFactory for the canonicalization of URLs and the
    823   // distinction between full url searches and url component searches.
    824   std::set<StringPattern::ID> matches;
    825   std::string url_for_component_searches;
    826 
    827   if (!full_url_matcher_.IsEmpty()) {
    828     full_url_matcher_.Match(
    829         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
    830   }
    831   if (!url_component_matcher_.IsEmpty()) {
    832     url_for_component_searches =
    833         condition_factory_.CanonicalizeURLForComponentSearches(url);
    834     url_component_matcher_.Match(url_for_component_searches, &matches);
    835   }
    836   if (!regex_set_matcher_.IsEmpty()) {
    837     regex_set_matcher_.Match(
    838         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
    839   }
    840   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    841     origin_and_path_regex_set_matcher_.Match(
    842         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
    843         &matches);
    844   }
    845 
    846   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
    847   // were fulfilled.
    848   std::set<URLMatcherConditionSet::ID> result;
    849   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
    850        i != matches.end(); ++i) {
    851     // For each URLMatcherConditionSet there is exactly one condition
    852     // registered in substring_match_triggers_. This means that the following
    853     // logic tests each URLMatcherConditionSet exactly once if it can be
    854     // completely fulfilled.
    855     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
    856         substring_match_triggers_.find(*i);
    857     if (triggered_condition_sets_iter == substring_match_triggers_.end())
    858       continue;  // Not all substring matches are triggers for a condition set.
    859     const std::set<URLMatcherConditionSet::ID>& condition_sets =
    860         triggered_condition_sets_iter->second;
    861     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
    862          condition_sets.begin(); j != condition_sets.end(); ++j) {
    863       URLMatcherConditionSets::const_iterator condition_set_iter =
    864           url_matcher_condition_sets_.find(*j);
    865       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
    866       if (condition_set_iter->second->IsMatch(
    867               matches, url, url_for_component_searches))
    868         result.insert(*j);
    869     }
    870   }
    871 
    872   return result;
    873 }
    874 
    875 bool URLMatcher::IsEmpty() const {
    876   return condition_factory_.IsEmpty() &&
    877       url_matcher_condition_sets_.empty() &&
    878       substring_match_triggers_.empty() &&
    879       full_url_matcher_.IsEmpty() &&
    880       url_component_matcher_.IsEmpty() &&
    881       regex_set_matcher_.IsEmpty() &&
    882       origin_and_path_regex_set_matcher_.IsEmpty() &&
    883       registered_full_url_patterns_.empty() &&
    884       registered_url_component_patterns_.empty();
    885 }
    886 
    887 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
    888   // The purpose of |full_url_conditions| is just that we need to execute
    889   // the same logic once for Full URL searches and once for URL Component
    890   // searches (see URLMatcherConditionFactory).
    891 
    892   // Determine which patterns need to be registered when this function
    893   // terminates.
    894   std::set<const StringPattern*> new_patterns;
    895   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    896       url_matcher_condition_sets_.begin();
    897       condition_set_iter != url_matcher_condition_sets_.end();
    898       ++condition_set_iter) {
    899     const URLMatcherConditionSet::Conditions& conditions =
    900         condition_set_iter->second->conditions();
    901     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    902          conditions.begin(); condition_iter != conditions.end();
    903          ++condition_iter) {
    904       // If we are called to process Full URL searches, ignore others, and
    905       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
    906       if (!condition_iter->IsRegexCondition() &&
    907           !condition_iter->IsOriginAndPathRegexCondition() &&
    908           full_url_conditions == condition_iter->IsFullURLCondition())
    909         new_patterns.insert(condition_iter->string_pattern());
    910     }
    911 
    912     if (full_url_conditions)
    913       continue;
    914 
    915     const URLMatcherConditionSet::QueryConditions& query_conditions =
    916         condition_set_iter->second->query_conditions();
    917     for (URLMatcherConditionSet::QueryConditions::const_iterator
    918              query_condition_iter = query_conditions.begin();
    919          query_condition_iter != query_conditions.end();
    920          ++query_condition_iter) {
    921       new_patterns.insert(query_condition_iter->string_pattern());
    922     }
    923   }
    924 
    925   // This is the set of patterns that were registered before this function
    926   // is called.
    927   std::set<const StringPattern*>& registered_patterns =
    928       full_url_conditions ? registered_full_url_patterns_
    929                           : registered_url_component_patterns_;
    930 
    931   // Add all patterns that are in new_patterns but not in registered_patterns.
    932   std::vector<const StringPattern*> patterns_to_register =
    933       base::STLSetDifference<std::vector<const StringPattern*> >(
    934           new_patterns, registered_patterns);
    935 
    936   // Remove all patterns that are in registered_patterns but not in
    937   // new_patterns.
    938   std::vector<const StringPattern*> patterns_to_unregister =
    939       base::STLSetDifference<std::vector<const StringPattern*> >(
    940            registered_patterns, new_patterns);
    941 
    942   // Update the SubstringSetMatcher.
    943   SubstringSetMatcher& url_matcher =
    944       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
    945   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
    946                                             patterns_to_unregister);
    947 
    948   // Update the set of registered_patterns for the next time this function
    949   // is being called.
    950   registered_patterns.swap(new_patterns);
    951 }
    952 
    953 void URLMatcher::UpdateRegexSetMatcher() {
    954   std::vector<const StringPattern*> new_patterns;
    955   std::vector<const StringPattern*> new_origin_and_path_patterns;
    956 
    957   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    958       url_matcher_condition_sets_.begin();
    959       condition_set_iter != url_matcher_condition_sets_.end();
    960       ++condition_set_iter) {
    961     const URLMatcherConditionSet::Conditions& conditions =
    962         condition_set_iter->second->conditions();
    963     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    964          conditions.begin(); condition_iter != conditions.end();
    965          ++condition_iter) {
    966       if (condition_iter->IsRegexCondition()) {
    967         new_patterns.push_back(condition_iter->string_pattern());
    968       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
    969         new_origin_and_path_patterns.push_back(
    970             condition_iter->string_pattern());
    971       }
    972     }
    973   }
    974 
    975   // Start over from scratch. We can't really do better than this, since the
    976   // FilteredRE2 backend doesn't support incremental updates.
    977   regex_set_matcher_.ClearPatterns();
    978   regex_set_matcher_.AddPatterns(new_patterns);
    979   origin_and_path_regex_set_matcher_.ClearPatterns();
    980   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
    981 }
    982 
    983 void URLMatcher::UpdateTriggers() {
    984   // Count substring pattern frequencies.
    985   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
    986   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    987       url_matcher_condition_sets_.begin();
    988       condition_set_iter != url_matcher_condition_sets_.end();
    989       ++condition_set_iter) {
    990     const URLMatcherConditionSet::Conditions& conditions =
    991         condition_set_iter->second->conditions();
    992     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    993          conditions.begin(); condition_iter != conditions.end();
    994          ++condition_iter) {
    995       const StringPattern* pattern = condition_iter->string_pattern();
    996       substring_pattern_frequencies[pattern->id()]++;
    997     }
    998 
    999     const URLMatcherConditionSet::QueryConditions& query_conditions =
   1000         condition_set_iter->second->query_conditions();
   1001     for (URLMatcherConditionSet::QueryConditions::const_iterator
   1002              query_condition_iter = query_conditions.begin();
   1003          query_condition_iter != query_conditions.end();
   1004          ++query_condition_iter) {
   1005       const StringPattern* pattern = query_condition_iter->string_pattern();
   1006       substring_pattern_frequencies[pattern->id()]++;
   1007     }
   1008   }
   1009 
   1010   // Update trigger conditions: Determine for each URLMatcherConditionSet which
   1011   // URLMatcherCondition contains a StringPattern that occurs least
   1012   // frequently in this URLMatcher. We assume that this condition is very
   1013   // specific and occurs rarely in URLs. If a match occurs for this
   1014   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
   1015   // respective URLMatcherConditionSet as well to see whether the entire
   1016   // URLMatcherConditionSet is considered matching.
   1017   substring_match_triggers_.clear();
   1018   for (URLMatcherConditionSets::const_iterator condition_set_iter =
   1019       url_matcher_condition_sets_.begin();
   1020       condition_set_iter != url_matcher_condition_sets_.end();
   1021       ++condition_set_iter) {
   1022     const URLMatcherConditionSet::Conditions& conditions =
   1023         condition_set_iter->second->conditions();
   1024     if (conditions.empty())
   1025       continue;
   1026     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
   1027         conditions.begin();
   1028     StringPattern::ID trigger = condition_iter->string_pattern()->id();
   1029     // We skip the first element in the following loop.
   1030     ++condition_iter;
   1031     for (; condition_iter != conditions.end(); ++condition_iter) {
   1032       StringPattern::ID current_id =
   1033           condition_iter->string_pattern()->id();
   1034       if (substring_pattern_frequencies[trigger] >
   1035           substring_pattern_frequencies[current_id]) {
   1036         trigger = current_id;
   1037       }
   1038     }
   1039 
   1040     const URLMatcherConditionSet::QueryConditions& query_conditions =
   1041         condition_set_iter->second->query_conditions();
   1042     for (URLMatcherConditionSet::QueryConditions::const_iterator
   1043              query_condition_iter = query_conditions.begin();
   1044          query_condition_iter != query_conditions.end();
   1045          ++query_condition_iter) {
   1046       StringPattern::ID current_id =
   1047           query_condition_iter->string_pattern()->id();
   1048       if (substring_pattern_frequencies[trigger] >
   1049           substring_pattern_frequencies[current_id]) {
   1050         trigger = current_id;
   1051       }
   1052     }
   1053 
   1054     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
   1055   }
   1056 }
   1057 
   1058 void URLMatcher::UpdateConditionFactory() {
   1059   std::set<StringPattern::ID> used_patterns;
   1060   for (URLMatcherConditionSets::const_iterator condition_set_iter =
   1061       url_matcher_condition_sets_.begin();
   1062       condition_set_iter != url_matcher_condition_sets_.end();
   1063       ++condition_set_iter) {
   1064     const URLMatcherConditionSet::Conditions& conditions =
   1065         condition_set_iter->second->conditions();
   1066     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
   1067          conditions.begin(); condition_iter != conditions.end();
   1068          ++condition_iter) {
   1069       used_patterns.insert(condition_iter->string_pattern()->id());
   1070     }
   1071     const URLMatcherConditionSet::QueryConditions& query_conditions =
   1072         condition_set_iter->second->query_conditions();
   1073     for (URLMatcherConditionSet::QueryConditions::const_iterator
   1074              query_condition_iter = query_conditions.begin();
   1075          query_condition_iter != query_conditions.end();
   1076          ++query_condition_iter) {
   1077       used_patterns.insert(query_condition_iter->string_pattern()->id());
   1078     }
   1079   }
   1080   condition_factory_.ForgetUnusedPatterns(used_patterns);
   1081 }
   1082 
   1083 void URLMatcher::UpdateInternalDatastructures() {
   1084   UpdateSubstringSetMatcher(false);
   1085   UpdateSubstringSetMatcher(true);
   1086   UpdateRegexSetMatcher();
   1087   UpdateTriggers();
   1088   UpdateConditionFactory();
   1089 }
   1090 
   1091 }  // namespace url_matcher
   1092