Home | History | Annotate | Download | only in common
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "extensions/common/url_pattern.h"
      6 
      7 #include <ostream>
      8 
      9 #include "base/strings/string_number_conversions.h"
     10 #include "base/strings/string_piece.h"
     11 #include "base/strings/string_split.h"
     12 #include "base/strings/string_util.h"
     13 #include "base/strings/stringprintf.h"
     14 #include "content/public/common/url_constants.h"
     15 #include "extensions/common/constants.h"
     16 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
     17 #include "url/gurl.h"
     18 #include "url/url_util.h"
     19 
// Special pattern string that Parse() treats as "match every URL" and that
// GetAsString() returns when |match_all_urls_| is set.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
     21 
     22 namespace {
     23 
// Schemes that URLPattern knows how to validate. Entry i pairs with
// kValidSchemeMasks[i]: IsValidScheme() accepts kValidSchemes[i] only when the
// matching mask bit is set in |valid_schemes_|.
// TODO(aa): What about more obscure schemes like data: and javascript: ?
// Note: keep this array in sync with kValidSchemeMasks.
const char* kValidSchemes[] = {
    url::kHttpScheme,
    url::kHttpsScheme,
    url::kFileScheme,
    url::kFtpScheme,
    content::kChromeUIScheme,
    extensions::kExtensionScheme,
    url::kFileSystemScheme,
};

// URLPattern::SCHEME_* bit for the scheme stored at the same index in
// kValidSchemes.
const int kValidSchemeMasks[] = {
  URLPattern::SCHEME_HTTP,
  URLPattern::SCHEME_HTTPS,
  URLPattern::SCHEME_FILE,
  URLPattern::SCHEME_FTP,
  URLPattern::SCHEME_CHROMEUI,
  URLPattern::SCHEME_EXTENSION,
  URLPattern::SCHEME_FILESYSTEM,
};

// Breaks the build if the two parallel arrays above differ in length.
COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
               must_keep_these_arrays_in_sync);
     48 
// Human-readable text for each parse outcome. These strings are referenced
// from kParseResultMessages below.
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorInvalidPort[] = "Invalid port.";
const char kParseErrorInvalidHost[] = "Invalid host.";

// Message explaining each URLPattern::ParseResult.
// GetParseResultString() indexes this table directly with the ParseResult
// value, so the entries must stay in the same order as the enum.
const char* const kParseResultMessages[] = {
  kParseSuccess,
  kParseErrorMissingSchemeSeparator,
  kParseErrorInvalidScheme,
  kParseErrorWrongSchemeType,
  kParseErrorEmptyHost,
  kParseErrorInvalidHostWildcard,
  kParseErrorEmptyPath,
  kParseErrorInvalidPort,
  kParseErrorInvalidHost,
};

// Breaks the build if a ParseResult is added without a message (or vice
// versa).
COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
               must_add_message_for_each_parse_result);
     74 
// Separator Parse() searches for to find where the host ends and the path
// begins.
const char kPathSeparator[] = "/";
     76 
     77 bool IsStandardScheme(const std::string& scheme) {
     78   // "*" gets the same treatment as a standard scheme.
     79   if (scheme == "*")
     80     return true;
     81 
     82   return url::IsStandard(scheme.c_str(),
     83                          url::Component(0, static_cast<int>(scheme.length())));
     84 }
     85 
     86 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
     87   if (port == "*")
     88     return true;
     89 
     90   // Only accept non-wildcard ports if the scheme uses ports.
     91   if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
     92       url::PORT_UNSPECIFIED) {
     93     return false;
     94   }
     95 
     96   int parsed_port = url::PORT_UNSPECIFIED;
     97   if (!base::StringToInt(port, &parsed_port))
     98     return false;
     99   return (parsed_port >= 0) && (parsed_port < 65536);
    100 }
    101 
    102 // Returns |path| with the trailing wildcard stripped if one existed.
    103 //
    104 // The functions that rely on this (OverlapsWith and Contains) are only
    105 // called for the patterns inside URLPatternSet. In those cases, we know that
    106 // the path will have only a single wildcard at the end. This makes figuring
    107 // out overlap much easier. It seems like there is probably a computer-sciency
    108 // way to solve the general case, but we don't need that yet.
    109 std::string StripTrailingWildcard(const std::string& path) {
    110   size_t wildcard_index = path.find('*');
    111   size_t path_last = path.size() - 1;
    112   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
    113   return wildcard_index == path_last ? path.substr(0, path_last) : path;
    114 }
    115 
    116 }  // namespace
    117 
    118 // static
    119 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
    120   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    121     if (scheme == kValidSchemes[i])
    122       return true;
    123   }
    124   return false;
    125 }
    126 
// Creates an empty pattern that accepts no schemes (SCHEME_NONE), does not
// match subdomains, and uses the wildcard port.
URLPattern::URLPattern()
    : valid_schemes_(SCHEME_NONE),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
    132 
// Creates an empty pattern restricted to the schemes in the |valid_schemes|
// bit mask. Nothing is parsed; the port starts out as the wildcard.
URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
    138 
    139 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
    140     // Strict error checking is used, because this constructor is only
    141     // appropriate when we know |pattern| is valid.
    142     : valid_schemes_(valid_schemes),
    143       match_all_urls_(false),
    144       match_subdomains_(false),
    145       port_("*") {
    146   ParseResult result = Parse(pattern);
    147   if (PARSE_SUCCESS != result)
    148     NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
    149 }
    150 
// Nothing to release explicitly; the body is intentionally empty.
URLPattern::~URLPattern() {
}
    153 
    154 bool URLPattern::operator<(const URLPattern& other) const {
    155   return GetAsString() < other.GetAsString();
    156 }
    157 
    158 bool URLPattern::operator>(const URLPattern& other) const {
    159   return GetAsString() > other.GetAsString();
    160 }
    161 
    162 bool URLPattern::operator==(const URLPattern& other) const {
    163   return GetAsString() == other.GetAsString();
    164 }
    165 
    166 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
    167   return out << '"' << url_pattern.GetAsString() << '"';
    168 }
    169 
    170 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
    171   spec_.clear();
    172   SetMatchAllURLs(false);
    173   SetMatchSubdomains(false);
    174   SetPort("*");
    175 
    176   // Special case pattern to match every valid URL.
    177   if (pattern == kAllUrlsPattern) {
    178     SetMatchAllURLs(true);
    179     return PARSE_SUCCESS;
    180   }
    181 
    182   // Parse out the scheme.
    183   size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
    184   bool has_standard_scheme_separator = true;
    185 
    186   // Some urls also use ':' alone as the scheme separator.
    187   if (scheme_end_pos == std::string::npos) {
    188     scheme_end_pos = pattern.find(':');
    189     has_standard_scheme_separator = false;
    190   }
    191 
    192   if (scheme_end_pos == std::string::npos)
    193     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
    194 
    195   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
    196     return PARSE_ERROR_INVALID_SCHEME;
    197 
    198   bool standard_scheme = IsStandardScheme(scheme_);
    199   if (standard_scheme != has_standard_scheme_separator)
    200     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
    201 
    202   // Advance past the scheme separator.
    203   scheme_end_pos +=
    204       (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
    205   if (scheme_end_pos >= pattern.size())
    206     return PARSE_ERROR_EMPTY_HOST;
    207 
    208   // Parse out the host and path.
    209   size_t host_start_pos = scheme_end_pos;
    210   size_t path_start_pos = 0;
    211 
    212   if (!standard_scheme) {
    213     path_start_pos = host_start_pos;
    214   } else if (scheme_ == url::kFileScheme) {
    215     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    216     if (host_end_pos == std::string::npos) {
    217       // Allow hostname omission.
    218       // e.g. file://* is interpreted as file:///*,
    219       // file://foo* is interpreted as file:///foo*.
    220       path_start_pos = host_start_pos - 1;
    221     } else {
    222       // Ignore hostname if scheme is file://.
    223       // e.g. file://localhost/foo is equal to file:///foo.
    224       path_start_pos = host_end_pos;
    225     }
    226   } else {
    227     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    228 
    229     // Host is required.
    230     if (host_start_pos == host_end_pos)
    231       return PARSE_ERROR_EMPTY_HOST;
    232 
    233     if (host_end_pos == std::string::npos)
    234       return PARSE_ERROR_EMPTY_PATH;
    235 
    236     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
    237 
    238     // The first component can optionally be '*' to match all subdomains.
    239     std::vector<std::string> host_components;
    240     base::SplitString(host_, '.', &host_components);
    241 
    242     // Could be empty if the host only consists of whitespace characters.
    243     if (host_components.empty())
    244       return PARSE_ERROR_EMPTY_HOST;
    245 
    246     if (host_components[0] == "*") {
    247       match_subdomains_ = true;
    248       host_components.erase(host_components.begin(),
    249                             host_components.begin() + 1);
    250     }
    251     host_ = JoinString(host_components, '.');
    252 
    253     path_start_pos = host_end_pos;
    254   }
    255 
    256   SetPath(pattern.substr(path_start_pos));
    257 
    258   size_t port_pos = host_.find(':');
    259   if (port_pos != std::string::npos) {
    260     if (!SetPort(host_.substr(port_pos + 1)))
    261       return PARSE_ERROR_INVALID_PORT;
    262     host_ = host_.substr(0, port_pos);
    263   }
    264 
    265   // No other '*' can occur in the host, though. This isn't necessary, but is
    266   // done as a convenience to developers who might otherwise be confused and
    267   // think '*' works as a glob in the host.
    268   if (host_.find('*') != std::string::npos)
    269     return PARSE_ERROR_INVALID_HOST_WILDCARD;
    270 
    271   // Null characters are not allowed in hosts.
    272   if (host_.find('\0') != std::string::npos)
    273     return PARSE_ERROR_INVALID_HOST;
    274 
    275   return PARSE_SUCCESS;
    276 }
    277 
// Replaces the bit mask of schemes this pattern may match. The cached string
// form is discarded so GetAsString() rebuilds it on next use.
void URLPattern::SetValidSchemes(int valid_schemes) {
  spec_.clear();
  valid_schemes_ = valid_schemes;
}
    282 
// Stores |host| verbatim (no validation is performed here) and discards the
// cached string form.
void URLPattern::SetHost(const std::string& host) {
  spec_.clear();
  host_ = host;
}
    287 
    288 void URLPattern::SetMatchAllURLs(bool val) {
    289   spec_.clear();
    290   match_all_urls_ = val;
    291 
    292   if (val) {
    293     match_subdomains_ = true;
    294     scheme_ = "*";
    295     host_.clear();
    296     SetPath("/*");
    297   }
    298 }
    299 
// Sets whether subdomains of |host_| also match, and discards the cached
// string form.
void URLPattern::SetMatchSubdomains(bool val) {
  spec_.clear();
  match_subdomains_ = val;
}
    304 
    305 bool URLPattern::SetScheme(const std::string& scheme) {
    306   spec_.clear();
    307   scheme_ = scheme;
    308   if (scheme_ == "*") {
    309     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
    310   } else if (!IsValidScheme(scheme_)) {
    311     return false;
    312   }
    313   return true;
    314 }
    315 
    316 bool URLPattern::IsValidScheme(const std::string& scheme) const {
    317   if (valid_schemes_ == SCHEME_ALL)
    318     return true;
    319 
    320   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    321     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
    322       return true;
    323   }
    324 
    325   return false;
    326 }
    327 
    328 void URLPattern::SetPath(const std::string& path) {
    329   spec_.clear();
    330   path_ = path;
    331   path_escaped_ = path_;
    332   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
    333   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
    334 }
    335 
    336 bool URLPattern::SetPort(const std::string& port) {
    337   spec_.clear();
    338   if (IsValidPortForScheme(scheme_, port)) {
    339     port_ = port;
    340     return true;
    341   }
    342   return false;
    343 }
    344 
    345 bool URLPattern::MatchesURL(const GURL& test) const {
    346   const GURL* test_url = &test;
    347   bool has_inner_url = test.inner_url() != NULL;
    348 
    349   if (has_inner_url) {
    350     if (!test.SchemeIsFileSystem())
    351       return false;  // The only nested URLs we handle are filesystem URLs.
    352     test_url = test.inner_url();
    353   }
    354 
    355   if (!MatchesScheme(test_url->scheme()))
    356     return false;
    357 
    358   if (match_all_urls_)
    359     return true;
    360 
    361   std::string path_for_request = test.PathForRequest();
    362   if (has_inner_url)
    363     path_for_request = test_url->path() + path_for_request;
    364 
    365   return MatchesSecurityOriginHelper(*test_url) &&
    366          MatchesPath(path_for_request);
    367 }
    368 
    369 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
    370   const GURL* test_url = &test;
    371   bool has_inner_url = test.inner_url() != NULL;
    372 
    373   if (has_inner_url) {
    374     if (!test.SchemeIsFileSystem())
    375       return false;  // The only nested URLs we handle are filesystem URLs.
    376     test_url = test.inner_url();
    377   }
    378 
    379   if (!MatchesScheme(test_url->scheme()))
    380     return false;
    381 
    382   if (match_all_urls_)
    383     return true;
    384 
    385   return MatchesSecurityOriginHelper(*test_url);
    386 }
    387 
    388 bool URLPattern::MatchesScheme(const std::string& test) const {
    389   if (!IsValidScheme(test))
    390     return false;
    391 
    392   return scheme_ == "*" || test == scheme_;
    393 }
    394 
    395 bool URLPattern::MatchesHost(const std::string& host) const {
    396   std::string test(url::kHttpScheme);
    397   test += url::kStandardSchemeSeparator;
    398   test += host;
    399   test += "/";
    400   return MatchesHost(GURL(test));
    401 }
    402 
    403 bool URLPattern::MatchesHost(const GURL& test) const {
    404   // If the hosts are exactly equal, we have a match.
    405   if (test.host() == host_)
    406     return true;
    407 
    408   // If we're matching subdomains, and we have no host in the match pattern,
    409   // that means that we're matching all hosts, which means we have a match no
    410   // matter what the test host is.
    411   if (match_subdomains_ && host_.empty())
    412     return true;
    413 
    414   // Otherwise, we can only match if our match pattern matches subdomains.
    415   if (!match_subdomains_)
    416     return false;
    417 
    418   // We don't do subdomain matching against IP addresses, so we can give up now
    419   // if the test host is an IP address.
    420   if (test.HostIsIPAddress())
    421     return false;
    422 
    423   // Check if the test host is a subdomain of our host.
    424   if (test.host().length() <= (host_.length() + 1))
    425     return false;
    426 
    427   if (test.host().compare(test.host().length() - host_.length(),
    428                           host_.length(), host_) != 0)
    429     return false;
    430 
    431   return test.host()[test.host().length() - host_.length() - 1] == '.';
    432 }
    433 
    434 bool URLPattern::ImpliesAllHosts() const {
    435   // Check if it matches all urls or is a pattern like http://*/*.
    436   if (match_all_urls_ ||
    437       (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
    438     return true;
    439   }
    440 
    441   // If this doesn't even match subdomains, it can't possibly imply all hosts.
    442   if (!match_subdomains_)
    443     return false;
    444 
    445   // If |host_| is a recognized TLD, this will be 0. We don't include private
    446   // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
    447   size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
    448       host_,
    449       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
    450       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
    451   // If there was more than just a TLD in the host (e.g., *.foobar.com), it
    452   // doesn't imply all hosts.
    453   if (registry_length > 0)
    454     return false;
    455 
    456   // At this point the host could either be just a TLD ("com") or some unknown
    457   // TLD-like string ("notatld"). To disambiguate between them construct a
    458   // fake URL, and check the registry. This returns 0 if the TLD is
    459   // unrecognized, or the length of the recognized TLD.
    460   registry_length = net::registry_controlled_domains::GetRegistryLength(
    461       base::StringPrintf("foo.%s", host_.c_str()),
    462       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
    463       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
    464   // If we recognized this TLD, then this is a pattern like *.com, and it
    465   // should imply all hosts. Otherwise, this doesn't imply all hosts.
    466   return registry_length > 0;
    467 }
    468 
    469 bool URLPattern::MatchesSingleOrigin() const {
    470   // Strictly speaking, the port is part of the origin, but in URLPattern it
    471   // defaults to *. It's not very interesting anyway, so leave it out.
    472   return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
    473 }
    474 
    475 bool URLPattern::MatchesPath(const std::string& test) const {
    476   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
    477   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
    478   if (test + "/*" == path_escaped_)
    479     return true;
    480 
    481   return MatchPattern(test, path_escaped_);
    482 }
    483 
    484 const std::string& URLPattern::GetAsString() const {
    485   if (!spec_.empty())
    486     return spec_;
    487 
    488   if (match_all_urls_) {
    489     spec_ = kAllUrlsPattern;
    490     return spec_;
    491   }
    492 
    493   bool standard_scheme = IsStandardScheme(scheme_);
    494 
    495   std::string spec = scheme_ +
    496       (standard_scheme ? url::kStandardSchemeSeparator : ":");
    497 
    498   if (scheme_ != url::kFileScheme && standard_scheme) {
    499     if (match_subdomains_) {
    500       spec += "*";
    501       if (!host_.empty())
    502         spec += ".";
    503     }
    504 
    505     if (!host_.empty())
    506       spec += host_;
    507 
    508     if (port_ != "*") {
    509       spec += ":";
    510       spec += port_;
    511     }
    512   }
    513 
    514   if (!path_.empty())
    515     spec += path_;
    516 
    517   spec_ = spec;
    518   return spec_;
    519 }
    520 
    521 bool URLPattern::OverlapsWith(const URLPattern& other) const {
    522   if (match_all_urls() || other.match_all_urls())
    523     return true;
    524   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
    525           other.MatchesAnyScheme(GetExplicitSchemes()))
    526       && (MatchesHost(other.host()) || other.MatchesHost(host()))
    527       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
    528       && (MatchesPath(StripTrailingWildcard(other.path())) ||
    529           other.MatchesPath(StripTrailingWildcard(path())));
    530 }
    531 
    532 bool URLPattern::Contains(const URLPattern& other) const {
    533   if (match_all_urls())
    534     return true;
    535   return MatchesAllSchemes(other.GetExplicitSchemes())
    536       && MatchesHost(other.host())
    537       && MatchesPortPattern(other.port())
    538       && MatchesPath(StripTrailingWildcard(other.path()));
    539 }
    540 
    541 bool URLPattern::MatchesAnyScheme(
    542     const std::vector<std::string>& schemes) const {
    543   for (std::vector<std::string>::const_iterator i = schemes.begin();
    544        i != schemes.end(); ++i) {
    545     if (MatchesScheme(*i))
    546       return true;
    547   }
    548 
    549   return false;
    550 }
    551 
    552 bool URLPattern::MatchesAllSchemes(
    553     const std::vector<std::string>& schemes) const {
    554   for (std::vector<std::string>::const_iterator i = schemes.begin();
    555        i != schemes.end(); ++i) {
    556     if (!MatchesScheme(*i))
    557       return false;
    558   }
    559 
    560   return true;
    561 }
    562 
    563 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
    564   // Ignore hostname if scheme is file://.
    565   if (scheme_ != url::kFileScheme && !MatchesHost(test))
    566     return false;
    567 
    568   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
    569     return false;
    570 
    571   return true;
    572 }
    573 
    574 bool URLPattern::MatchesPortPattern(const std::string& port) const {
    575   return port_ == "*" || port_ == port;
    576 }
    577 
    578 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
    579   std::vector<std::string> result;
    580 
    581   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
    582     result.push_back(scheme_);
    583     return result;
    584   }
    585 
    586   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    587     if (MatchesScheme(kValidSchemes[i])) {
    588       result.push_back(kValidSchemes[i]);
    589     }
    590   }
    591 
    592   return result;
    593 }
    594 
    595 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
    596   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
    597   std::vector<URLPattern> result;
    598 
    599   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
    600        i != explicit_schemes.end(); ++i) {
    601     URLPattern temp = *this;
    602     temp.SetScheme(*i);
    603     temp.SetMatchAllURLs(false);
    604     result.push_back(temp);
    605   }
    606 
    607   return result;
    608 }
    609 
    610 // static
    611 const char* URLPattern::GetParseResultString(
    612     URLPattern::ParseResult parse_result) {
    613   return kParseResultMessages[parse_result];
    614 }
    615