Home | History | Annotate | Download | only in url_matcher
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/url_matcher/url_matcher.h"
      6 
      7 #include <algorithm>
      8 #include <iterator>
      9 
     10 #include "base/logging.h"
     11 #include "base/profiler/scoped_profile.h"
     12 #include "url/gurl.h"
     13 #include "url/url_canon.h"
     14 
     15 namespace url_matcher {
     16 
     17 // This set of classes implement a mapping of URL Component Patterns, such as
     18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
     19 // for use in substring comparisons.
     20 //
     21 // The idea of this mapping is to reduce the problem of comparing many
     22 // URL Component Patterns against one URL to the problem of searching many
     23 // substrings in one string:
     24 //
     25 // ----------------------                    -----------------
     26 // | URL Query operator | ----translate----> | StringPattern |
     27 // ----------------------                    -----------------
     28 //                                                   ^
     29 //                                                   |
     30 //                                                compare
     31 //                                                   |
     32 //                                                   v
     33 // ----------------------                    -----------------
     34 // | URL to compare     |                    |               |
     35 // | to all URL Query   | ----translate----> | String        |
     36 // | operators          |                    |               |
     37 // ----------------------                    -----------------
     38 //
     39 // The reason for this problem reduction is that there are efficient algorithms
     40 // for searching many substrings in one string (see Aho-Corasick algorithm).
     41 //
     42 // Additionally, some of the same pieces are reused to implement regular
     43 // expression comparisons. The FilteredRE2 implementation for matching many
     44 // regular expressions against one string uses prefiltering, in which a set
     45 // of substrings (derived from the regexes) are first searched for, to reduce
     46 // the number of regular expressions to test; the prefiltering step also
     47 // uses Aho-Corasick.
     48 //
     49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
     50 // ==========================================================
     51 //
     52 // For searches in this class, we normalize URLs as follows:
     53 //
     54 // Step 1:
     55 // Remove scheme, port and segment from URL:
     56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
     57 //    www.example.com/index.html?search=foo
     58 //
     59 // We remove the scheme and port number because they can be checked later
     60 // in a secondary filter step. We remove the segment (the #... part) because
     61 // this is not guaranteed to be ASCII-7 encoded.
     62 //
     63 // Step 2:
     64 // Translate URL to String and add the following position markers:
     65 // - BU = Beginning of URL
     66 // - ED = End of Domain
     67 // - EP = End of Path
     68 // - EU = End of URL
     69 // Furthermore, the hostname is canonicalized to start with a ".".
     70 //
     71 // Position markers are represented as characters >127, which are therefore
     72 // guaranteed not to be part of the ASCII-7 encoded URL character set.
     73 //
     74 // -> www.example.com/index.html?search=foo becomes
     75 // BU .www.example.com ED /index.html EP ?search=foo EU
     76 //
     77 // -> www.example.com/index.html becomes
     78 // BU .www.example.com ED /index.html EP EU
     79 //
     80 // Step 3:
     81 // Translate URL Component Patterns as follows:
     82 //
     83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
     84 // -> host_prefix("www.example") = BU .www.example
     85 //
     86 // host_suffix(suffix) = suffix ED
     87 // -> host_suffix("example.com") = example.com ED
     88 // -> host_suffix(".example.com") = .example.com ED
     89 //
     90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
     91 // -> host_equals("www.example.com") = BU .www.example.com ED
     92 //
     93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
     94 //
     95 // With this, we can search the StringPatterns in the normalized URL.
     96 //
     97 //
     98 // Case 2: url_{prefix,suffix,equals,contains} searches.
     99 // =====================================================
    100 //
    101 // Step 1: as above, except that
    102 // - the scheme is not removed
    103 // - the port is not removed if it is specified and does not match the default
    104 //   port for the given scheme.
    105 //
    106 // Step 2:
    107 // Translate URL to String and add the following position markers:
    108 // - BU = Beginning of URL
    109 // - EU = End of URL
    110 //
    111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
    112 // BU http://www.example.com:8080/index.html?search=foo EU
    113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
    114 // BU http://www.example.com/index.html?search=foo EU
    115 //
    116 // url_prefix(prefix) = BU prefix
    117 // -> url_prefix("http://www.example") = BU http://www.example
    118 //
    119 // url_contains(substring) = substring
    120 // -> url_contains("index") = index
    121 //
    122 //
    123 // Case 3: {host,path,query}_contains searches.
    124 // ============================================
    125 //
    126 // These kinds of searches are not supported directly but can be derived
    127 // by a combination of a url_contains() query followed by an explicit test:
    128 //
    129 // host_contains(str) = url_contains(str) followed by test whether str occurs
    130 //   in host component of original URL.
    131 // -> host_contains("example.co") = example.co
    132 //    followed by gurl.host().find("example.co");
    133 //
    134 // [similarly for path_contains and query_contains].
    135 //
    136 //
    137 // Regular expression matching (url_matches searches)
    138 // ==================================================
    139 //
    140 // This class also supports matching regular expressions (RE2 syntax)
    141 // against full URLs, which are transformed as in case 2.
    142 
    143 namespace {
    144 
    145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
    146   return criterion == URLMatcherCondition::URL_MATCHES;
    147 }
    148 
    149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
    150   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
    151 }
    152 
    153 }  // namespace
    154 
    155 //
    156 // URLMatcherCondition
    157 //
    158 
    159 URLMatcherCondition::URLMatcherCondition()
    160     : criterion_(HOST_PREFIX),
    161       string_pattern_(NULL) {}
    162 
    163 URLMatcherCondition::~URLMatcherCondition() {}
    164 
    165 URLMatcherCondition::URLMatcherCondition(
    166     Criterion criterion,
    167     const StringPattern* string_pattern)
    168     : criterion_(criterion),
    169       string_pattern_(string_pattern) {}
    170 
    171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
    172     : criterion_(rhs.criterion_),
    173       string_pattern_(rhs.string_pattern_) {}
    174 
    175 URLMatcherCondition& URLMatcherCondition::operator=(
    176     const URLMatcherCondition& rhs) {
    177   criterion_ = rhs.criterion_;
    178   string_pattern_ = rhs.string_pattern_;
    179   return *this;
    180 }
    181 
    182 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
    183   if (criterion_ < rhs.criterion_) return true;
    184   if (criterion_ > rhs.criterion_) return false;
    185   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    186     return *string_pattern_ < *rhs.string_pattern_;
    187   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
    188   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
    189   // or both are NULL.
    190   return false;
    191 }
    192 
    193 bool URLMatcherCondition::IsFullURLCondition() const {
    194   // For these criteria the SubstringMatcher needs to be executed on the
    195   // GURL that is canonicalized with
    196   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
    197   switch (criterion_) {
    198     case HOST_CONTAINS:
    199     case PATH_CONTAINS:
    200     case QUERY_CONTAINS:
    201     case URL_PREFIX:
    202     case URL_SUFFIX:
    203     case URL_CONTAINS:
    204     case URL_EQUALS:
    205       return true;
    206     default:
    207       break;
    208   }
    209   return false;
    210 }
    211 
    212 bool URLMatcherCondition::IsRegexCondition() const {
    213   return IsRegexCriterion(criterion_);
    214 }
    215 
    216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
    217   return IsOriginAndPathRegexCriterion(criterion_);
    218 }
    219 
    220 bool URLMatcherCondition::IsMatch(
    221     const std::set<StringPattern::ID>& matching_patterns,
    222     const GURL& url) const {
    223   DCHECK(string_pattern_);
    224   if (!ContainsKey(matching_patterns, string_pattern_->id()))
    225     return false;
    226   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
    227   // a substring match on the raw URL. In case of a match, we need to verify
    228   // that the match was found in the correct component of the URL.
    229   switch (criterion_) {
    230     case HOST_CONTAINS:
    231       return url.host().find(string_pattern_->pattern()) !=
    232           std::string::npos;
    233     case PATH_CONTAINS:
    234       return url.path().find(string_pattern_->pattern()) !=
    235           std::string::npos;
    236     case QUERY_CONTAINS:
    237       return url.query().find(string_pattern_->pattern()) !=
    238           std::string::npos;
    239     default:
    240       break;
    241   }
    242   return true;
    243 }
    244 
    245 //
    246 // URLMatcherConditionFactory
    247 //
    248 
    249 namespace {
    250 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
    251 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
    252 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
    253 const char kEndOfPath[] = {static_cast<char>(-3), 0};
    254 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
    255 const char kEndOfURL[] = {static_cast<char>(-5), 0};
    256 
    257 // The delimiter for query parameters
    258 const char kQuerySeparator = '&';
    259 }  // namespace
    260 
    261 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
    262 
    263 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
    264   STLDeleteElements(&substring_pattern_singletons_);
    265   STLDeleteElements(&regex_pattern_singletons_);
    266   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
    267 }
    268 
    269 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
    270     const GURL& url) const {
    271   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
    272          url.path() + kEndOfPath +
    273          (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
    274                           : std::string()) +
    275          kEndOfURL;
    276 }
    277 
    278 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
    279     const std::string& prefix) {
    280   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
    281       kBeginningOfURL + CanonicalizeHostname(prefix));
    282 }
    283 
    284 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
    285     const std::string& suffix) {
    286   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
    287       suffix + kEndOfDomain);
    288 }
    289 
    290 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
    291     const std::string& str) {
    292   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
    293 }
    294 
    295 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
    296     const std::string& str) {
    297   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
    298       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
    299 }
    300 
    301 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
    302     const std::string& prefix) {
    303   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
    304       kEndOfDomain + prefix);
    305 }
    306 
    307 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
    308     const std::string& suffix) {
    309   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
    310 }
    311 
    312 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
    313     const std::string& str) {
    314   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
    315 }
    316 
    317 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
    318     const std::string& str) {
    319   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
    320       kEndOfDomain + str + kEndOfPath);
    321 }
    322 
    323 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
    324     const std::string& prefix) {
    325   std::string pattern;
    326   if (!prefix.empty() && prefix[0] == '?')
    327     pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
    328   else
    329     pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
    330 
    331   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
    332 }
    333 
    334 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
    335     const std::string& suffix) {
    336   if (!suffix.empty() && suffix[0] == '?') {
    337     return CreateQueryEqualsCondition(suffix);
    338   } else {
    339     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
    340                            CanonicalizeQuery(suffix, false, true) + kEndOfURL);
    341   }
    342 }
    343 
    344 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
    345     const std::string& str) {
    346   if (!str.empty() && str[0] == '?')
    347     return CreateQueryPrefixCondition(str);
    348   else
    349     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
    350 }
    351 
    352 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
    353     const std::string& str) {
    354   std::string pattern;
    355   if (!str.empty() && str[0] == '?')
    356     pattern =
    357         kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
    358   else
    359     pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
    360 
    361   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
    362 }
    363 
    364 URLMatcherCondition
    365     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
    366     const std::string& host_suffix,
    367     const std::string& path_prefix) {
    368   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
    369       host_suffix + kEndOfDomain + path_prefix);
    370 }
    371 
    372 URLMatcherCondition
    373 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
    374     const std::string& host,
    375     const std::string& path_prefix) {
    376   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
    377       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
    378       path_prefix);
    379 }
    380 
    381 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
    382     const GURL& url) const {
    383   GURL::Replacements replacements;
    384   replacements.ClearPassword();
    385   replacements.ClearUsername();
    386   replacements.ClearRef();
    387   // Clear port if it is implicit from scheme.
    388   if (url.has_port()) {
    389     const std::string& port = url.scheme();
    390     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
    391         url.EffectiveIntPort()) {
    392       replacements.ClearPort();
    393     }
    394   }
    395   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
    396       kEndOfURL;
    397 }
    398 
    399 static std::string CanonicalizeURLForRegexSearchesHelper(
    400     const GURL& url,
    401     bool clear_query) {
    402   GURL::Replacements replacements;
    403   replacements.ClearPassword();
    404   replacements.ClearUsername();
    405   replacements.ClearRef();
    406   if (clear_query)
    407     replacements.ClearQuery();
    408   // Clear port if it is implicit from scheme.
    409   if (url.has_port()) {
    410     const std::string& port = url.scheme();
    411     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
    412         url.EffectiveIntPort()) {
    413       replacements.ClearPort();
    414     }
    415   }
    416   return url.ReplaceComponents(replacements).spec();
    417 }
    418 
    419 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
    420     const GURL& url) const {
    421   return CanonicalizeURLForRegexSearchesHelper(url, false);
    422 }
    423 
    424 std::string
    425 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
    426     const GURL& url) const {
    427   return CanonicalizeURLForRegexSearchesHelper(url, true);
    428 }
    429 
    430 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
    431     const std::string& prefix) {
    432   return CreateCondition(URLMatcherCondition::URL_PREFIX,
    433       kBeginningOfURL + prefix);
    434 }
    435 
    436 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
    437     const std::string& suffix) {
    438   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
    439 }
    440 
    441 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
    442     const std::string& str) {
    443   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
    444 }
    445 
    446 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
    447     const std::string& str) {
    448   return CreateCondition(URLMatcherCondition::URL_EQUALS,
    449       kBeginningOfURL + str + kEndOfURL);
    450 }
    451 
    452 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
    453     const std::string& regex) {
    454   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
    455 }
    456 
    457 URLMatcherCondition
    458 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
    459     const std::string& regex) {
    460   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
    461 }
    462 
    463 void URLMatcherConditionFactory::ForgetUnusedPatterns(
    464       const std::set<StringPattern::ID>& used_patterns) {
    465   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
    466   while (i != substring_pattern_singletons_.end()) {
    467     if (ContainsKey(used_patterns, (*i)->id())) {
    468       ++i;
    469     } else {
    470       delete *i;
    471       substring_pattern_singletons_.erase(i++);
    472     }
    473   }
    474   i = regex_pattern_singletons_.begin();
    475   while (i != regex_pattern_singletons_.end()) {
    476     if (ContainsKey(used_patterns, (*i)->id())) {
    477       ++i;
    478     } else {
    479       delete *i;
    480       regex_pattern_singletons_.erase(i++);
    481     }
    482   }
    483   i = origin_and_path_regex_pattern_singletons_.begin();
    484   while (i != origin_and_path_regex_pattern_singletons_.end()) {
    485     if (ContainsKey(used_patterns, (*i)->id())) {
    486       ++i;
    487     } else {
    488       delete *i;
    489       origin_and_path_regex_pattern_singletons_.erase(i++);
    490     }
    491   }
    492 }
    493 
    494 bool URLMatcherConditionFactory::IsEmpty() const {
    495   return substring_pattern_singletons_.empty() &&
    496       regex_pattern_singletons_.empty() &&
    497       origin_and_path_regex_pattern_singletons_.empty();
    498 }
    499 
    500 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
    501     URLMatcherCondition::Criterion criterion,
    502     const std::string& pattern) {
    503   StringPattern search_pattern(pattern, 0);
    504   PatternSingletons* pattern_singletons = NULL;
    505   if (IsRegexCriterion(criterion))
    506     pattern_singletons = &regex_pattern_singletons_;
    507   else if (IsOriginAndPathRegexCriterion(criterion))
    508     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
    509   else
    510     pattern_singletons = &substring_pattern_singletons_;
    511 
    512   PatternSingletons::const_iterator iter =
    513       pattern_singletons->find(&search_pattern);
    514 
    515   if (iter != pattern_singletons->end()) {
    516     return URLMatcherCondition(criterion, *iter);
    517   } else {
    518     StringPattern* new_pattern =
    519         new StringPattern(pattern, id_counter_++);
    520     pattern_singletons->insert(new_pattern);
    521     return URLMatcherCondition(criterion, new_pattern);
    522   }
    523 }
    524 
    525 std::string URLMatcherConditionFactory::CanonicalizeHostname(
    526     const std::string& hostname) const {
    527   if (!hostname.empty() && hostname[0] == '.')
    528     return hostname;
    529   else
    530     return "." + hostname;
    531 }
    532 
    533 // This function prepares the query string by replacing query separator with a
    534 // magic value (|kQueryComponentDelimiter|). When the boolean
    535 // |prepend_beginning_of_query_component| is true the function prepends the
    536 // query with the same magic. This is done to locate the start of a key value
    537 // pair in the query string. The parameter |query| is passed by value
    538 // intentionally, since it is locally modified.
    539 std::string URLMatcherConditionFactory::CanonicalizeQuery(
    540     std::string query,
    541     bool prepend_beginning_of_query_component,
    542     bool append_end_of_query_component) const {
    543   for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
    544     if (*it == kQuerySeparator)
    545       *it = kQueryComponentDelimiter[0];
    546   }
    547   if (prepend_beginning_of_query_component)
    548     query = kQueryComponentDelimiter + query;
    549   if (append_end_of_query_component)
    550     query += kQueryComponentDelimiter;
    551   return query;
    552 }
    553 
    554 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
    555     StringPattern* lhs,
    556     StringPattern* rhs) const {
    557   if (lhs == NULL && rhs != NULL) return true;
    558   if (lhs != NULL && rhs != NULL)
    559     return lhs->pattern() < rhs->pattern();
    560   // Either both are NULL or only rhs is NULL.
    561   return false;
    562 }
    563 
    564 //
    565 // URLQueryElementMatcherCondition
    566 //
    567 
    568 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
    569     const std::string& key,
    570     const std::string& value,
    571     QueryValueMatchType query_value_match_type,
    572     QueryElementType query_element_type,
    573     Type match_type,
    574     URLMatcherConditionFactory* factory) {
    575   match_type_ = match_type;
    576 
    577   if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
    578     key_ = kQueryComponentDelimiter + key + "=";
    579     value_ = value;
    580   } else {
    581     key_ = kQueryComponentDelimiter + key;
    582     value_ = std::string();
    583   }
    584 
    585   if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
    586     value_ += kQueryComponentDelimiter;
    587 
    588   // If |value_| is empty no need to find the |key_| and verify if the value
    589   // matches. Simply checking the presence of key is sufficient, which is done
    590   // by MATCH_ANY
    591   if (value_.empty())
    592     match_type_ = MATCH_ANY;
    593 
    594   URLMatcherCondition condition;
    595   // If |match_type_| is MATCH_ANY, then we could simply look for the
    596   // combination of |key_| + |value_|, which can be efficiently done by
    597   // SubstringMatcher
    598   if (match_type_ == MATCH_ANY)
    599     condition = factory->CreateQueryContainsCondition(key_ + value_);
    600   else
    601     condition = factory->CreateQueryContainsCondition(key_);
    602   string_pattern_ = condition.string_pattern();
    603 
    604   key_length_ = key_.length();
    605   value_length_ = value_.length();
    606 }
    607 
    608 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
    609 
    610 bool URLQueryElementMatcherCondition::operator<(
    611     const URLQueryElementMatcherCondition& rhs) const {
    612   if (match_type_ != rhs.match_type_)
    613     return match_type_ < rhs.match_type_;
    614   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
    615     return *string_pattern_ < *rhs.string_pattern_;
    616   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL)
    617     return true;
    618   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
    619   // or both are NULL.
    620   return false;
    621 }
    622 
    623 bool URLQueryElementMatcherCondition::IsMatch(
    624     const std::string& url_for_component_searches) const {
    625   switch (match_type_) {
    626     case MATCH_ANY: {
    627       // For MATCH_ANY, no additional verification step is needed. We can trust
    628       // the SubstringMatcher to do the verification.
    629       return true;
    630     }
    631     case MATCH_ALL: {
    632       size_t start = 0;
    633       int found = 0;
    634       size_t offset;
    635       while ((offset = url_for_component_searches.find(key_, start)) !=
    636              std::string::npos) {
    637         if (url_for_component_searches.compare(
    638                 offset + key_length_, value_length_, value_) != 0) {
    639           return false;
    640         } else {
    641           ++found;
    642         }
    643         start = offset + key_length_ + value_length_ - 1;
    644       }
    645       return !!found;
    646     }
    647     case MATCH_FIRST: {
    648       size_t offset = url_for_component_searches.find(key_);
    649       return url_for_component_searches.compare(
    650                  offset + key_length_, value_length_, value_) == 0;
    651     }
    652     case MATCH_LAST: {
    653       size_t offset = url_for_component_searches.rfind(key_);
    654       return url_for_component_searches.compare(
    655                  offset + key_length_, value_length_, value_) == 0;
    656     }
    657   }
    658   NOTREACHED();
    659   return false;
    660 }
    661 
    662 //
    663 // URLMatcherSchemeFilter
    664 //
    665 
    666 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
    667     : filters_(1) {
    668   filters_.push_back(filter);
    669 }
    670 
    671 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
    672     const std::vector<std::string>& filters)
    673     : filters_(filters) {}
    674 
    675 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
    676 
    677 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
    678   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
    679       filters_.end();
    680 }
    681 
    682 //
    683 // URLMatcherPortFilter
    684 //
    685 
    686 URLMatcherPortFilter::URLMatcherPortFilter(
    687     const std::vector<URLMatcherPortFilter::Range>& ranges)
    688     : ranges_(ranges) {}
    689 
    690 URLMatcherPortFilter::~URLMatcherPortFilter() {}
    691 
    692 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
    693   int port = url.EffectiveIntPort();
    694   for (std::vector<Range>::const_iterator i = ranges_.begin();
    695        i != ranges_.end(); ++i) {
    696     if (i->first <= port && port <= i->second)
    697       return true;
    698   }
    699   return false;
    700 }
    701 
    702 // static
    703 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
    704                                                               int to) {
    705   return Range(from, to);
    706 }
    707 
    708 // static
    709 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
    710   return Range(port, port);
    711 }
    712 
    713 //
    714 // URLMatcherConditionSet
    715 //
    716 
    717 URLMatcherConditionSet::~URLMatcherConditionSet() {}
    718 
    719 URLMatcherConditionSet::URLMatcherConditionSet(
    720     ID id,
    721     const Conditions& conditions)
    722     : id_(id),
    723       conditions_(conditions) {}
    724 
    725 URLMatcherConditionSet::URLMatcherConditionSet(
    726     ID id,
    727     const Conditions& conditions,
    728     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    729     scoped_ptr<URLMatcherPortFilter> port_filter)
    730     : id_(id),
    731       conditions_(conditions),
    732       scheme_filter_(scheme_filter.Pass()),
    733       port_filter_(port_filter.Pass()) {}
    734 
    735 URLMatcherConditionSet::URLMatcherConditionSet(
    736     ID id,
    737     const Conditions& conditions,
    738     const QueryConditions& query_conditions,
    739     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
    740     scoped_ptr<URLMatcherPortFilter> port_filter)
    741     : id_(id),
    742       conditions_(conditions),
    743       query_conditions_(query_conditions),
    744       scheme_filter_(scheme_filter.Pass()),
    745       port_filter_(port_filter.Pass()) {}
    746 
    747 bool URLMatcherConditionSet::IsMatch(
    748     const std::set<StringPattern::ID>& matching_patterns,
    749     const GURL& url) const {
    750   return IsMatch(matching_patterns, url, std::string());
    751 }
    752 
    753 bool URLMatcherConditionSet::IsMatch(
    754     const std::set<StringPattern::ID>& matching_patterns,
    755     const GURL& url,
    756     const std::string& url_for_component_searches) const {
    757   for (Conditions::const_iterator i = conditions_.begin();
    758        i != conditions_.end(); ++i) {
    759     if (!i->IsMatch(matching_patterns, url))
    760       return false;
    761   }
    762   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
    763     return false;
    764   if (port_filter_.get() && !port_filter_->IsMatch(url))
    765     return false;
    766   if (query_conditions_.empty())
    767     return true;
    768   // The loop is duplicated below for performance reasons. If not all query
    769   // elements are found, no need to verify match that is expected to take more
    770   // cycles.
    771   for (QueryConditions::const_iterator i = query_conditions_.begin();
    772        i != query_conditions_.end();
    773        ++i) {
    774     if (!ContainsKey(matching_patterns, i->string_pattern()->id()))
    775       return false;
    776   }
    777   for (QueryConditions::const_iterator i = query_conditions_.begin();
    778        i != query_conditions_.end();
    779        ++i) {
    780     if (!i->IsMatch(url_for_component_searches))
    781       return false;
    782   }
    783   return true;
    784 }
    785 
    786 //
    787 // URLMatcher
    788 //
    789 
    790 URLMatcher::URLMatcher() {}
    791 
    792 URLMatcher::~URLMatcher() {}
    793 
    794 void URLMatcher::AddConditionSets(
    795     const URLMatcherConditionSet::Vector& condition_sets) {
    796   for (URLMatcherConditionSet::Vector::const_iterator i =
    797        condition_sets.begin(); i != condition_sets.end(); ++i) {
    798     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
    799         url_matcher_condition_sets_.end());
    800     url_matcher_condition_sets_[(*i)->id()] = *i;
    801   }
    802   UpdateInternalDatastructures();
    803 }
    804 
    805 void URLMatcher::RemoveConditionSets(
    806     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
    807   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
    808        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
    809     DCHECK(url_matcher_condition_sets_.find(*i) !=
    810         url_matcher_condition_sets_.end());
    811     url_matcher_condition_sets_.erase(*i);
    812   }
    813   UpdateInternalDatastructures();
    814 }
    815 
    816 void URLMatcher::ClearUnusedConditionSets() {
    817   UpdateConditionFactory();
    818 }
    819 
    820 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
    821     const GURL& url) const {
    822   // Find all IDs of StringPatterns that match |url|.
    823   // See URLMatcherConditionFactory for the canonicalization of URLs and the
    824   // distinction between full url searches and url component searches.
    825   std::set<StringPattern::ID> matches;
    826   std::string url_for_component_searches;
    827 
    828   if (!full_url_matcher_.IsEmpty()) {
    829     full_url_matcher_.Match(
    830         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
    831   }
    832   if (!url_component_matcher_.IsEmpty()) {
    833     url_for_component_searches =
    834         condition_factory_.CanonicalizeURLForComponentSearches(url);
    835     url_component_matcher_.Match(url_for_component_searches, &matches);
    836   }
    837   if (!regex_set_matcher_.IsEmpty()) {
    838     regex_set_matcher_.Match(
    839         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
    840   }
    841   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
    842     origin_and_path_regex_set_matcher_.Match(
    843         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
    844         &matches);
    845   }
    846 
    847   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
    848   // were fulfilled.
    849   std::set<URLMatcherConditionSet::ID> result;
    850   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
    851        i != matches.end(); ++i) {
    852     // For each URLMatcherConditionSet there is exactly one condition
    853     // registered in substring_match_triggers_. This means that the following
    854     // logic tests each URLMatcherConditionSet exactly once if it can be
    855     // completely fulfilled.
    856     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
    857         substring_match_triggers_.find(*i);
    858     if (triggered_condition_sets_iter == substring_match_triggers_.end())
    859       continue;  // Not all substring matches are triggers for a condition set.
    860     const std::set<URLMatcherConditionSet::ID>& condition_sets =
    861         triggered_condition_sets_iter->second;
    862     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
    863          condition_sets.begin(); j != condition_sets.end(); ++j) {
    864       URLMatcherConditionSets::const_iterator condition_set_iter =
    865           url_matcher_condition_sets_.find(*j);
    866       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
    867       if (condition_set_iter->second->IsMatch(
    868               matches, url, url_for_component_searches))
    869         result.insert(*j);
    870     }
    871   }
    872 
    873   return result;
    874 }
    875 
    876 bool URLMatcher::IsEmpty() const {
    877   return condition_factory_.IsEmpty() &&
    878       url_matcher_condition_sets_.empty() &&
    879       substring_match_triggers_.empty() &&
    880       full_url_matcher_.IsEmpty() &&
    881       url_component_matcher_.IsEmpty() &&
    882       regex_set_matcher_.IsEmpty() &&
    883       origin_and_path_regex_set_matcher_.IsEmpty() &&
    884       registered_full_url_patterns_.empty() &&
    885       registered_url_component_patterns_.empty();
    886 }
    887 
    888 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
    889   // The purpose of |full_url_conditions| is just that we need to execute
    890   // the same logic once for Full URL searches and once for URL Component
    891   // searches (see URLMatcherConditionFactory).
    892 
    893   // Determine which patterns need to be registered when this function
    894   // terminates.
    895   std::set<const StringPattern*> new_patterns;
    896   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    897       url_matcher_condition_sets_.begin();
    898       condition_set_iter != url_matcher_condition_sets_.end();
    899       ++condition_set_iter) {
    900     const URLMatcherConditionSet::Conditions& conditions =
    901         condition_set_iter->second->conditions();
    902     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    903          conditions.begin(); condition_iter != conditions.end();
    904          ++condition_iter) {
    905       // If we are called to process Full URL searches, ignore others, and
    906       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
    907       if (!condition_iter->IsRegexCondition() &&
    908           !condition_iter->IsOriginAndPathRegexCondition() &&
    909           full_url_conditions == condition_iter->IsFullURLCondition())
    910         new_patterns.insert(condition_iter->string_pattern());
    911     }
    912 
    913     if (full_url_conditions)
    914       continue;
    915 
    916     const URLMatcherConditionSet::QueryConditions& query_conditions =
    917         condition_set_iter->second->query_conditions();
    918     for (URLMatcherConditionSet::QueryConditions::const_iterator
    919              query_condition_iter = query_conditions.begin();
    920          query_condition_iter != query_conditions.end();
    921          ++query_condition_iter) {
    922       new_patterns.insert(query_condition_iter->string_pattern());
    923     }
    924   }
    925 
    926   // This is the set of patterns that were registered before this function
    927   // is called.
    928   std::set<const StringPattern*>& registered_patterns =
    929       full_url_conditions ? registered_full_url_patterns_
    930                           : registered_url_component_patterns_;
    931 
    932   // Add all patterns that are in new_patterns but not in registered_patterns.
    933   std::vector<const StringPattern*> patterns_to_register =
    934       base::STLSetDifference<std::vector<const StringPattern*> >(
    935           new_patterns, registered_patterns);
    936 
    937   // Remove all patterns that are in registered_patterns but not in
    938   // new_patterns.
    939   std::vector<const StringPattern*> patterns_to_unregister =
    940       base::STLSetDifference<std::vector<const StringPattern*> >(
    941            registered_patterns, new_patterns);
    942 
    943   // Update the SubstringSetMatcher.
    944   SubstringSetMatcher& url_matcher =
    945       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
    946   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
    947                                             patterns_to_unregister);
    948 
    949   // Update the set of registered_patterns for the next time this function
    950   // is being called.
    951   registered_patterns.swap(new_patterns);
    952 }
    953 
    954 void URLMatcher::UpdateRegexSetMatcher() {
    955   std::vector<const StringPattern*> new_patterns;
    956   std::vector<const StringPattern*> new_origin_and_path_patterns;
    957 
    958   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    959       url_matcher_condition_sets_.begin();
    960       condition_set_iter != url_matcher_condition_sets_.end();
    961       ++condition_set_iter) {
    962     const URLMatcherConditionSet::Conditions& conditions =
    963         condition_set_iter->second->conditions();
    964     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    965          conditions.begin(); condition_iter != conditions.end();
    966          ++condition_iter) {
    967       if (condition_iter->IsRegexCondition()) {
    968         new_patterns.push_back(condition_iter->string_pattern());
    969       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
    970         new_origin_and_path_patterns.push_back(
    971             condition_iter->string_pattern());
    972       }
    973     }
    974   }
    975 
    976   // Start over from scratch. We can't really do better than this, since the
    977   // FilteredRE2 backend doesn't support incremental updates.
    978   regex_set_matcher_.ClearPatterns();
    979   regex_set_matcher_.AddPatterns(new_patterns);
    980   origin_and_path_regex_set_matcher_.ClearPatterns();
    981   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
    982 }
    983 
    984 void URLMatcher::UpdateTriggers() {
    985   // Count substring pattern frequencies.
    986   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
    987   for (URLMatcherConditionSets::const_iterator condition_set_iter =
    988       url_matcher_condition_sets_.begin();
    989       condition_set_iter != url_matcher_condition_sets_.end();
    990       ++condition_set_iter) {
    991     const URLMatcherConditionSet::Conditions& conditions =
    992         condition_set_iter->second->conditions();
    993     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
    994          conditions.begin(); condition_iter != conditions.end();
    995          ++condition_iter) {
    996       const StringPattern* pattern = condition_iter->string_pattern();
    997       substring_pattern_frequencies[pattern->id()]++;
    998     }
    999 
   1000     const URLMatcherConditionSet::QueryConditions& query_conditions =
   1001         condition_set_iter->second->query_conditions();
   1002     for (URLMatcherConditionSet::QueryConditions::const_iterator
   1003              query_condition_iter = query_conditions.begin();
   1004          query_condition_iter != query_conditions.end();
   1005          ++query_condition_iter) {
   1006       const StringPattern* pattern = query_condition_iter->string_pattern();
   1007       substring_pattern_frequencies[pattern->id()]++;
   1008     }
   1009   }
   1010 
   1011   // Update trigger conditions: Determine for each URLMatcherConditionSet which
   1012   // URLMatcherCondition contains a StringPattern that occurs least
   1013   // frequently in this URLMatcher. We assume that this condition is very
   1014   // specific and occurs rarely in URLs. If a match occurs for this
   1015   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
   1016   // respective URLMatcherConditionSet as well to see whether the entire
   1017   // URLMatcherConditionSet is considered matching.
   1018   substring_match_triggers_.clear();
   1019   for (URLMatcherConditionSets::const_iterator condition_set_iter =
   1020       url_matcher_condition_sets_.begin();
   1021       condition_set_iter != url_matcher_condition_sets_.end();
   1022       ++condition_set_iter) {
   1023     const URLMatcherConditionSet::Conditions& conditions =
   1024         condition_set_iter->second->conditions();
   1025     if (conditions.empty())
   1026       continue;
   1027     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
   1028         conditions.begin();
   1029     StringPattern::ID trigger = condition_iter->string_pattern()->id();
   1030     // We skip the first element in the following loop.
   1031     ++condition_iter;
   1032     for (; condition_iter != conditions.end(); ++condition_iter) {
   1033       StringPattern::ID current_id =
   1034           condition_iter->string_pattern()->id();
   1035       if (substring_pattern_frequencies[trigger] >
   1036           substring_pattern_frequencies[current_id]) {
   1037         trigger = current_id;
   1038       }
   1039     }
   1040 
   1041     const URLMatcherConditionSet::QueryConditions& query_conditions =
   1042         condition_set_iter->second->query_conditions();
   1043     for (URLMatcherConditionSet::QueryConditions::const_iterator
   1044              query_condition_iter = query_conditions.begin();
   1045          query_condition_iter != query_conditions.end();
   1046          ++query_condition_iter) {
   1047       StringPattern::ID current_id =
   1048           query_condition_iter->string_pattern()->id();
   1049       if (substring_pattern_frequencies[trigger] >
   1050           substring_pattern_frequencies[current_id]) {
   1051         trigger = current_id;
   1052       }
   1053     }
   1054 
   1055     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
   1056   }
   1057 }
   1058 
   1059 void URLMatcher::UpdateConditionFactory() {
   1060   std::set<StringPattern::ID> used_patterns;
   1061   for (URLMatcherConditionSets::const_iterator condition_set_iter =
   1062       url_matcher_condition_sets_.begin();
   1063       condition_set_iter != url_matcher_condition_sets_.end();
   1064       ++condition_set_iter) {
   1065     const URLMatcherConditionSet::Conditions& conditions =
   1066         condition_set_iter->second->conditions();
   1067     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
   1068          conditions.begin(); condition_iter != conditions.end();
   1069          ++condition_iter) {
   1070       used_patterns.insert(condition_iter->string_pattern()->id());
   1071     }
   1072     const URLMatcherConditionSet::QueryConditions& query_conditions =
   1073         condition_set_iter->second->query_conditions();
   1074     for (URLMatcherConditionSet::QueryConditions::const_iterator
   1075              query_condition_iter = query_conditions.begin();
   1076          query_condition_iter != query_conditions.end();
   1077          ++query_condition_iter) {
   1078       used_patterns.insert(query_condition_iter->string_pattern()->id());
   1079     }
   1080   }
   1081   condition_factory_.ForgetUnusedPatterns(used_patterns);
   1082 }
   1083 
   1084 void URLMatcher::UpdateInternalDatastructures() {
   1085   // TODO(vadimt): Remove ScopedProfile below once crbug.com/417106 is fixed.
   1086   tracked_objects::ScopedProfile tracking_profile(
   1087       FROM_HERE_WITH_EXPLICIT_FUNCTION(
   1088           "URLMatcher_UpdateInternalDatastructures"));
   1089   UpdateSubstringSetMatcher(false);
   1090   UpdateSubstringSetMatcher(true);
   1091   UpdateRegexSetMatcher();
   1092   UpdateTriggers();
   1093   UpdateConditionFactory();
   1094 }
   1095 
   1096 }  // namespace url_matcher
   1097