Home | History | Annotate | Download | only in common
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "extensions/common/url_pattern.h"
      6 
      7 #include "base/strings/string_number_conversions.h"
      8 #include "base/strings/string_piece.h"
      9 #include "base/strings/string_split.h"
     10 #include "base/strings/string_util.h"
     11 #include "content/public/common/url_constants.h"
     12 #include "extensions/common/constants.h"
     13 #include "url/gurl.h"
     14 #include "url/url_util.h"
     15 
// Pattern string with special meaning: Parse() turns it into a pattern with
// match_all_urls_ set, and GetAsString() emits it for such patterns.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
     17 
     18 namespace {
     19 
     20 // TODO(aa): What about more obscure schemes like data: and javascript: ?
     21 // Note: keep this array in sync with kValidSchemeMasks.
     22 const char* kValidSchemes[] = {
     23     url::kHttpScheme,
     24     url::kHttpsScheme,
     25     url::kFileScheme,
     26     url::kFtpScheme,
     27     content::kChromeUIScheme,
     28     extensions::kExtensionScheme,
     29     url::kFileSystemScheme,
     30 };
     31 
     32 const int kValidSchemeMasks[] = {
     33   URLPattern::SCHEME_HTTP,
     34   URLPattern::SCHEME_HTTPS,
     35   URLPattern::SCHEME_FILE,
     36   URLPattern::SCHEME_FTP,
     37   URLPattern::SCHEME_CHROMEUI,
     38   URLPattern::SCHEME_EXTENSION,
     39   URLPattern::SCHEME_FILESYSTEM,
     40 };
     41 
     42 COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
     43                must_keep_these_arrays_in_sync);
     44 
     45 const char kParseSuccess[] = "Success.";
     46 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
     47 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
     48 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
     49 const char kParseErrorEmptyHost[] = "Host can not be empty.";
     50 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
     51 const char kParseErrorEmptyPath[] = "Empty path.";
     52 const char kParseErrorInvalidPort[] = "Invalid port.";
     53 const char kParseErrorInvalidHost[] = "Invalid host.";
     54 
     55 // Message explaining each URLPattern::ParseResult.
     56 const char* const kParseResultMessages[] = {
     57   kParseSuccess,
     58   kParseErrorMissingSchemeSeparator,
     59   kParseErrorInvalidScheme,
     60   kParseErrorWrongSchemeType,
     61   kParseErrorEmptyHost,
     62   kParseErrorInvalidHostWildcard,
     63   kParseErrorEmptyPath,
     64   kParseErrorInvalidPort,
     65   kParseErrorInvalidHost,
     66 };
     67 
     68 COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
     69                must_add_message_for_each_parse_result);
     70 
     71 const char kPathSeparator[] = "/";
     72 
     73 bool IsStandardScheme(const std::string& scheme) {
     74   // "*" gets the same treatment as a standard scheme.
     75   if (scheme == "*")
     76     return true;
     77 
     78   return url::IsStandard(scheme.c_str(),
     79                          url::Component(0, static_cast<int>(scheme.length())));
     80 }
     81 
     82 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
     83   if (port == "*")
     84     return true;
     85 
     86   // Only accept non-wildcard ports if the scheme uses ports.
     87   if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
     88       url::PORT_UNSPECIFIED) {
     89     return false;
     90   }
     91 
     92   int parsed_port = url::PORT_UNSPECIFIED;
     93   if (!base::StringToInt(port, &parsed_port))
     94     return false;
     95   return (parsed_port >= 0) && (parsed_port < 65536);
     96 }
     97 
     98 // Returns |path| with the trailing wildcard stripped if one existed.
     99 //
    100 // The functions that rely on this (OverlapsWith and Contains) are only
    101 // called for the patterns inside URLPatternSet. In those cases, we know that
    102 // the path will have only a single wildcard at the end. This makes figuring
    103 // out overlap much easier. It seems like there is probably a computer-sciency
    104 // way to solve the general case, but we don't need that yet.
    105 std::string StripTrailingWildcard(const std::string& path) {
    106   size_t wildcard_index = path.find('*');
    107   size_t path_last = path.size() - 1;
    108   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
    109   return wildcard_index == path_last ? path.substr(0, path_last) : path;
    110 }
    111 
    112 }  // namespace
    113 
    114 // static
    115 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
    116   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    117     if (scheme == kValidSchemes[i])
    118       return true;
    119   }
    120   return false;
    121 }
    122 
// Creates a pattern that accepts no schemes (SCHEME_NONE), does not match all
// URLs or subdomains, and has the wildcard port "*".
URLPattern::URLPattern()
    : valid_schemes_(SCHEME_NONE),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
    128 
// Creates an unparsed pattern restricted to the scheme mask |valid_schemes|.
// All other fields start out as in the default constructor: no match-all, no
// subdomain matching, wildcard port "*".
URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
    134 
    135 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
    136     // Strict error checking is used, because this constructor is only
    137     // appropriate when we know |pattern| is valid.
    138     : valid_schemes_(valid_schemes),
    139       match_all_urls_(false),
    140       match_subdomains_(false),
    141       port_("*") {
    142   ParseResult result = Parse(pattern);
    143   if (PARSE_SUCCESS != result)
    144     NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
    145 }
    146 
// Nothing to release explicitly; the body is intentionally empty.
URLPattern::~URLPattern() {
}
    149 
    150 bool URLPattern::operator<(const URLPattern& other) const {
    151   return GetAsString() < other.GetAsString();
    152 }
    153 
    154 bool URLPattern::operator>(const URLPattern& other) const {
    155   return GetAsString() > other.GetAsString();
    156 }
    157 
    158 bool URLPattern::operator==(const URLPattern& other) const {
    159   return GetAsString() == other.GetAsString();
    160 }
    161 
    162 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
    163   spec_.clear();
    164   SetMatchAllURLs(false);
    165   SetMatchSubdomains(false);
    166   SetPort("*");
    167 
    168   // Special case pattern to match every valid URL.
    169   if (pattern == kAllUrlsPattern) {
    170     SetMatchAllURLs(true);
    171     return PARSE_SUCCESS;
    172   }
    173 
    174   // Parse out the scheme.
    175   size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
    176   bool has_standard_scheme_separator = true;
    177 
    178   // Some urls also use ':' alone as the scheme separator.
    179   if (scheme_end_pos == std::string::npos) {
    180     scheme_end_pos = pattern.find(':');
    181     has_standard_scheme_separator = false;
    182   }
    183 
    184   if (scheme_end_pos == std::string::npos)
    185     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
    186 
    187   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
    188     return PARSE_ERROR_INVALID_SCHEME;
    189 
    190   bool standard_scheme = IsStandardScheme(scheme_);
    191   if (standard_scheme != has_standard_scheme_separator)
    192     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
    193 
    194   // Advance past the scheme separator.
    195   scheme_end_pos +=
    196       (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
    197   if (scheme_end_pos >= pattern.size())
    198     return PARSE_ERROR_EMPTY_HOST;
    199 
    200   // Parse out the host and path.
    201   size_t host_start_pos = scheme_end_pos;
    202   size_t path_start_pos = 0;
    203 
    204   if (!standard_scheme) {
    205     path_start_pos = host_start_pos;
    206   } else if (scheme_ == url::kFileScheme) {
    207     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    208     if (host_end_pos == std::string::npos) {
    209       // Allow hostname omission.
    210       // e.g. file://* is interpreted as file:///*,
    211       // file://foo* is interpreted as file:///foo*.
    212       path_start_pos = host_start_pos - 1;
    213     } else {
    214       // Ignore hostname if scheme is file://.
    215       // e.g. file://localhost/foo is equal to file:///foo.
    216       path_start_pos = host_end_pos;
    217     }
    218   } else {
    219     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    220 
    221     // Host is required.
    222     if (host_start_pos == host_end_pos)
    223       return PARSE_ERROR_EMPTY_HOST;
    224 
    225     if (host_end_pos == std::string::npos)
    226       return PARSE_ERROR_EMPTY_PATH;
    227 
    228     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
    229 
    230     // The first component can optionally be '*' to match all subdomains.
    231     std::vector<std::string> host_components;
    232     base::SplitString(host_, '.', &host_components);
    233     if (host_components[0] == "*") {
    234       match_subdomains_ = true;
    235       host_components.erase(host_components.begin(),
    236                             host_components.begin() + 1);
    237     }
    238     host_ = JoinString(host_components, '.');
    239 
    240     path_start_pos = host_end_pos;
    241   }
    242 
    243   SetPath(pattern.substr(path_start_pos));
    244 
    245   size_t port_pos = host_.find(':');
    246   if (port_pos != std::string::npos) {
    247     if (!SetPort(host_.substr(port_pos + 1)))
    248       return PARSE_ERROR_INVALID_PORT;
    249     host_ = host_.substr(0, port_pos);
    250   }
    251 
    252   // No other '*' can occur in the host, though. This isn't necessary, but is
    253   // done as a convenience to developers who might otherwise be confused and
    254   // think '*' works as a glob in the host.
    255   if (host_.find('*') != std::string::npos)
    256     return PARSE_ERROR_INVALID_HOST_WILDCARD;
    257 
    258   // Null characters are not allowed in hosts.
    259   if (host_.find('\0') != std::string::npos)
    260     return PARSE_ERROR_INVALID_HOST;
    261 
    262   return PARSE_SUCCESS;
    263 }
    264 
// Replaces the mask of schemes this pattern is allowed to match. The cached
// string form is dropped so GetAsString() rebuilds it on next use.
void URLPattern::SetValidSchemes(int valid_schemes) {
  spec_.clear();
  valid_schemes_ = valid_schemes;
}
    269 
// Replaces the host stored in this pattern with |host| as given; no validation
// is performed here. The cached string form is dropped so GetAsString()
// rebuilds it on next use.
void URLPattern::SetHost(const std::string& host) {
  spec_.clear();
  host_ = host;
}
    274 
    275 void URLPattern::SetMatchAllURLs(bool val) {
    276   spec_.clear();
    277   match_all_urls_ = val;
    278 
    279   if (val) {
    280     match_subdomains_ = true;
    281     scheme_ = "*";
    282     host_.clear();
    283     SetPath("/*");
    284   }
    285 }
    286 
// Sets whether the pattern also matches subdomains of its host (see
// MatchesHost()). The cached string form is dropped so GetAsString() rebuilds
// it on next use.
void URLPattern::SetMatchSubdomains(bool val) {
  spec_.clear();
  match_subdomains_ = val;
}
    291 
    292 bool URLPattern::SetScheme(const std::string& scheme) {
    293   spec_.clear();
    294   scheme_ = scheme;
    295   if (scheme_ == "*") {
    296     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
    297   } else if (!IsValidScheme(scheme_)) {
    298     return false;
    299   }
    300   return true;
    301 }
    302 
    303 bool URLPattern::IsValidScheme(const std::string& scheme) const {
    304   if (valid_schemes_ == SCHEME_ALL)
    305     return true;
    306 
    307   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    308     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
    309       return true;
    310   }
    311 
    312   return false;
    313 }
    314 
    315 void URLPattern::SetPath(const std::string& path) {
    316   spec_.clear();
    317   path_ = path;
    318   path_escaped_ = path_;
    319   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
    320   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
    321 }
    322 
    323 bool URLPattern::SetPort(const std::string& port) {
    324   spec_.clear();
    325   if (IsValidPortForScheme(scheme_, port)) {
    326     port_ = port;
    327     return true;
    328   }
    329   return false;
    330 }
    331 
    332 bool URLPattern::MatchesURL(const GURL& test) const {
    333   const GURL* test_url = &test;
    334   bool has_inner_url = test.inner_url() != NULL;
    335 
    336   if (has_inner_url) {
    337     if (!test.SchemeIsFileSystem())
    338       return false;  // The only nested URLs we handle are filesystem URLs.
    339     test_url = test.inner_url();
    340   }
    341 
    342   if (!MatchesScheme(test_url->scheme()))
    343     return false;
    344 
    345   if (match_all_urls_)
    346     return true;
    347 
    348   std::string path_for_request = test.PathForRequest();
    349   if (has_inner_url)
    350     path_for_request = test_url->path() + path_for_request;
    351 
    352   return MatchesSecurityOriginHelper(*test_url) &&
    353          MatchesPath(path_for_request);
    354 }
    355 
    356 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
    357   const GURL* test_url = &test;
    358   bool has_inner_url = test.inner_url() != NULL;
    359 
    360   if (has_inner_url) {
    361     if (!test.SchemeIsFileSystem())
    362       return false;  // The only nested URLs we handle are filesystem URLs.
    363     test_url = test.inner_url();
    364   }
    365 
    366   if (!MatchesScheme(test_url->scheme()))
    367     return false;
    368 
    369   if (match_all_urls_)
    370     return true;
    371 
    372   return MatchesSecurityOriginHelper(*test_url);
    373 }
    374 
    375 bool URLPattern::MatchesScheme(const std::string& test) const {
    376   if (!IsValidScheme(test))
    377     return false;
    378 
    379   return scheme_ == "*" || test == scheme_;
    380 }
    381 
    382 bool URLPattern::MatchesHost(const std::string& host) const {
    383   std::string test(url::kHttpScheme);
    384   test += url::kStandardSchemeSeparator;
    385   test += host;
    386   test += "/";
    387   return MatchesHost(GURL(test));
    388 }
    389 
    390 bool URLPattern::MatchesHost(const GURL& test) const {
    391   // If the hosts are exactly equal, we have a match.
    392   if (test.host() == host_)
    393     return true;
    394 
    395   // If we're matching subdomains, and we have no host in the match pattern,
    396   // that means that we're matching all hosts, which means we have a match no
    397   // matter what the test host is.
    398   if (match_subdomains_ && host_.empty())
    399     return true;
    400 
    401   // Otherwise, we can only match if our match pattern matches subdomains.
    402   if (!match_subdomains_)
    403     return false;
    404 
    405   // We don't do subdomain matching against IP addresses, so we can give up now
    406   // if the test host is an IP address.
    407   if (test.HostIsIPAddress())
    408     return false;
    409 
    410   // Check if the test host is a subdomain of our host.
    411   if (test.host().length() <= (host_.length() + 1))
    412     return false;
    413 
    414   if (test.host().compare(test.host().length() - host_.length(),
    415                           host_.length(), host_) != 0)
    416     return false;
    417 
    418   return test.host()[test.host().length() - host_.length() - 1] == '.';
    419 }
    420 
    421 bool URLPattern::MatchesPath(const std::string& test) const {
    422   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
    423   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
    424   if (test + "/*" == path_escaped_)
    425     return true;
    426 
    427   return MatchPattern(test, path_escaped_);
    428 }
    429 
    430 const std::string& URLPattern::GetAsString() const {
    431   if (!spec_.empty())
    432     return spec_;
    433 
    434   if (match_all_urls_) {
    435     spec_ = kAllUrlsPattern;
    436     return spec_;
    437   }
    438 
    439   bool standard_scheme = IsStandardScheme(scheme_);
    440 
    441   std::string spec = scheme_ +
    442       (standard_scheme ? url::kStandardSchemeSeparator : ":");
    443 
    444   if (scheme_ != url::kFileScheme && standard_scheme) {
    445     if (match_subdomains_) {
    446       spec += "*";
    447       if (!host_.empty())
    448         spec += ".";
    449     }
    450 
    451     if (!host_.empty())
    452       spec += host_;
    453 
    454     if (port_ != "*") {
    455       spec += ":";
    456       spec += port_;
    457     }
    458   }
    459 
    460   if (!path_.empty())
    461     spec += path_;
    462 
    463   spec_ = spec;
    464   return spec_;
    465 }
    466 
    467 bool URLPattern::OverlapsWith(const URLPattern& other) const {
    468   if (match_all_urls() || other.match_all_urls())
    469     return true;
    470   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
    471           other.MatchesAnyScheme(GetExplicitSchemes()))
    472       && (MatchesHost(other.host()) || other.MatchesHost(host()))
    473       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
    474       && (MatchesPath(StripTrailingWildcard(other.path())) ||
    475           other.MatchesPath(StripTrailingWildcard(path())));
    476 }
    477 
    478 bool URLPattern::Contains(const URLPattern& other) const {
    479   if (match_all_urls())
    480     return true;
    481   return MatchesAllSchemes(other.GetExplicitSchemes())
    482       && MatchesHost(other.host())
    483       && MatchesPortPattern(other.port())
    484       && MatchesPath(StripTrailingWildcard(other.path()));
    485 }
    486 
    487 bool URLPattern::MatchesAnyScheme(
    488     const std::vector<std::string>& schemes) const {
    489   for (std::vector<std::string>::const_iterator i = schemes.begin();
    490        i != schemes.end(); ++i) {
    491     if (MatchesScheme(*i))
    492       return true;
    493   }
    494 
    495   return false;
    496 }
    497 
    498 bool URLPattern::MatchesAllSchemes(
    499     const std::vector<std::string>& schemes) const {
    500   for (std::vector<std::string>::const_iterator i = schemes.begin();
    501        i != schemes.end(); ++i) {
    502     if (!MatchesScheme(*i))
    503       return false;
    504   }
    505 
    506   return true;
    507 }
    508 
    509 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
    510   // Ignore hostname if scheme is file://.
    511   if (scheme_ != url::kFileScheme && !MatchesHost(test))
    512     return false;
    513 
    514   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
    515     return false;
    516 
    517   return true;
    518 }
    519 
    520 bool URLPattern::MatchesPortPattern(const std::string& port) const {
    521   return port_ == "*" || port_ == port;
    522 }
    523 
    524 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
    525   std::vector<std::string> result;
    526 
    527   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
    528     result.push_back(scheme_);
    529     return result;
    530   }
    531 
    532   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    533     if (MatchesScheme(kValidSchemes[i])) {
    534       result.push_back(kValidSchemes[i]);
    535     }
    536   }
    537 
    538   return result;
    539 }
    540 
    541 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
    542   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
    543   std::vector<URLPattern> result;
    544 
    545   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
    546        i != explicit_schemes.end(); ++i) {
    547     URLPattern temp = *this;
    548     temp.SetScheme(*i);
    549     temp.SetMatchAllURLs(false);
    550     result.push_back(temp);
    551   }
    552 
    553   return result;
    554 }
    555 
    556 // static
    557 const char* URLPattern::GetParseResultString(
    558     URLPattern::ParseResult parse_result) {
    559   return kParseResultMessages[parse_result];
    560 }
    561