Home | History | Annotate | Download | only in common
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "extensions/common/url_pattern.h"
      6 
      7 #include "base/strings/string_number_conversions.h"
      8 #include "base/strings/string_piece.h"
      9 #include "base/strings/string_split.h"
     10 #include "base/strings/string_util.h"
     11 #include "content/public/common/url_constants.h"
     12 #include "extensions/common/constants.h"
     13 #include "url/gurl.h"
     14 #include "url/url_util.h"
     15 
     16 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
     17 
     18 namespace {
     19 
     20 // TODO(aa): What about more obscure schemes like data: and javascript: ?
     21 // Note: keep this array in sync with kValidSchemeMasks.
     22 const char* kValidSchemes[] = {
     23   content::kHttpScheme,
     24   content::kHttpsScheme,
     25   chrome::kFileScheme,
     26   content::kFtpScheme,
     27   chrome::kChromeUIScheme,
     28   extensions::kExtensionScheme,
     29   chrome::kFileSystemScheme,
     30 };
     31 
     32 const int kValidSchemeMasks[] = {
     33   URLPattern::SCHEME_HTTP,
     34   URLPattern::SCHEME_HTTPS,
     35   URLPattern::SCHEME_FILE,
     36   URLPattern::SCHEME_FTP,
     37   URLPattern::SCHEME_CHROMEUI,
     38   URLPattern::SCHEME_EXTENSION,
     39   URLPattern::SCHEME_FILESYSTEM,
     40 };
     41 
     42 COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
     43                must_keep_these_arrays_in_sync);
     44 
     45 const char kParseSuccess[] = "Success.";
     46 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
     47 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
     48 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
     49 const char kParseErrorEmptyHost[] = "Host can not be empty.";
     50 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
     51 const char kParseErrorEmptyPath[] = "Empty path.";
     52 const char kParseErrorInvalidPort[] = "Invalid port.";
     53 
     54 // Message explaining each URLPattern::ParseResult.
     55 const char* const kParseResultMessages[] = {
     56   kParseSuccess,
     57   kParseErrorMissingSchemeSeparator,
     58   kParseErrorInvalidScheme,
     59   kParseErrorWrongSchemeType,
     60   kParseErrorEmptyHost,
     61   kParseErrorInvalidHostWildcard,
     62   kParseErrorEmptyPath,
     63   kParseErrorInvalidPort,
     64 };
     65 
     66 COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
     67                must_add_message_for_each_parse_result);
     68 
     69 const char kPathSeparator[] = "/";
     70 
     71 bool IsStandardScheme(const std::string& scheme) {
     72   // "*" gets the same treatment as a standard scheme.
     73   if (scheme == "*")
     74     return true;
     75 
     76   return url_util::IsStandard(scheme.c_str(),
     77       url_parse::Component(0, static_cast<int>(scheme.length())));
     78 }
     79 
     80 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
     81   if (port == "*")
     82     return true;
     83 
     84   // Only accept non-wildcard ports if the scheme uses ports.
     85   if (url_canon::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
     86       url_parse::PORT_UNSPECIFIED) {
     87     return false;
     88   }
     89 
     90   int parsed_port = url_parse::PORT_UNSPECIFIED;
     91   if (!base::StringToInt(port, &parsed_port))
     92     return false;
     93   return (parsed_port >= 0) && (parsed_port < 65536);
     94 }
     95 
     96 // Returns |path| with the trailing wildcard stripped if one existed.
     97 //
     98 // The functions that rely on this (OverlapsWith and Contains) are only
     99 // called for the patterns inside URLPatternSet. In those cases, we know that
    100 // the path will have only a single wildcard at the end. This makes figuring
    101 // out overlap much easier. It seems like there is probably a computer-sciency
    102 // way to solve the general case, but we don't need that yet.
    103 std::string StripTrailingWildcard(const std::string& path) {
    104   size_t wildcard_index = path.find('*');
    105   size_t path_last = path.size() - 1;
    106   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
    107   return wildcard_index == path_last ? path.substr(0, path_last) : path;
    108 }
    109 
    110 }  // namespace
    111 
    112 URLPattern::URLPattern()
    113     : valid_schemes_(SCHEME_NONE),
    114       match_all_urls_(false),
    115       match_subdomains_(false),
    116       port_("*") {}
    117 
    118 URLPattern::URLPattern(int valid_schemes)
    119     : valid_schemes_(valid_schemes),
    120       match_all_urls_(false),
    121       match_subdomains_(false),
    122       port_("*") {}
    123 
    124 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
    125     // Strict error checking is used, because this constructor is only
    126     // appropriate when we know |pattern| is valid.
    127     : valid_schemes_(valid_schemes),
    128       match_all_urls_(false),
    129       match_subdomains_(false),
    130       port_("*") {
    131   if (PARSE_SUCCESS != Parse(pattern))
    132     NOTREACHED() << "URLPattern is invalid: " << pattern;
    133 }
    134 
    135 URLPattern::~URLPattern() {
    136 }
    137 
    138 bool URLPattern::operator<(const URLPattern& other) const {
    139   return GetAsString() < other.GetAsString();
    140 }
    141 
    142 bool URLPattern::operator>(const URLPattern& other) const {
    143   return GetAsString() > other.GetAsString();
    144 }
    145 
    146 bool URLPattern::operator==(const URLPattern& other) const {
    147   return GetAsString() == other.GetAsString();
    148 }
    149 
    150 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
    151   spec_.clear();
    152   SetMatchAllURLs(false);
    153   SetMatchSubdomains(false);
    154   SetPort("*");
    155 
    156   // Special case pattern to match every valid URL.
    157   if (pattern == kAllUrlsPattern) {
    158     SetMatchAllURLs(true);
    159     return PARSE_SUCCESS;
    160   }
    161 
    162   // Parse out the scheme.
    163   size_t scheme_end_pos = pattern.find(content::kStandardSchemeSeparator);
    164   bool has_standard_scheme_separator = true;
    165 
    166   // Some urls also use ':' alone as the scheme separator.
    167   if (scheme_end_pos == std::string::npos) {
    168     scheme_end_pos = pattern.find(':');
    169     has_standard_scheme_separator = false;
    170   }
    171 
    172   if (scheme_end_pos == std::string::npos)
    173     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
    174 
    175   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
    176     return PARSE_ERROR_INVALID_SCHEME;
    177 
    178   bool standard_scheme = IsStandardScheme(scheme_);
    179   if (standard_scheme != has_standard_scheme_separator)
    180     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
    181 
    182   // Advance past the scheme separator.
    183   scheme_end_pos +=
    184       (standard_scheme ? strlen(content::kStandardSchemeSeparator) : 1);
    185   if (scheme_end_pos >= pattern.size())
    186     return PARSE_ERROR_EMPTY_HOST;
    187 
    188   // Parse out the host and path.
    189   size_t host_start_pos = scheme_end_pos;
    190   size_t path_start_pos = 0;
    191 
    192   if (!standard_scheme) {
    193     path_start_pos = host_start_pos;
    194   } else if (scheme_ == chrome::kFileScheme) {
    195     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    196     if (host_end_pos == std::string::npos) {
    197       // Allow hostname omission.
    198       // e.g. file://* is interpreted as file:///*,
    199       // file://foo* is interpreted as file:///foo*.
    200       path_start_pos = host_start_pos - 1;
    201     } else {
    202       // Ignore hostname if scheme is file://.
    203       // e.g. file://localhost/foo is equal to file:///foo.
    204       path_start_pos = host_end_pos;
    205     }
    206   } else {
    207     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    208 
    209     // Host is required.
    210     if (host_start_pos == host_end_pos)
    211       return PARSE_ERROR_EMPTY_HOST;
    212 
    213     if (host_end_pos == std::string::npos)
    214       return PARSE_ERROR_EMPTY_PATH;
    215 
    216     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
    217 
    218     // The first component can optionally be '*' to match all subdomains.
    219     std::vector<std::string> host_components;
    220     base::SplitString(host_, '.', &host_components);
    221     if (host_components[0] == "*") {
    222       match_subdomains_ = true;
    223       host_components.erase(host_components.begin(),
    224                             host_components.begin() + 1);
    225     }
    226     host_ = JoinString(host_components, '.');
    227 
    228     path_start_pos = host_end_pos;
    229   }
    230 
    231   SetPath(pattern.substr(path_start_pos));
    232 
    233   size_t port_pos = host_.find(':');
    234   if (port_pos != std::string::npos) {
    235     if (!SetPort(host_.substr(port_pos + 1)))
    236       return PARSE_ERROR_INVALID_PORT;
    237     host_ = host_.substr(0, port_pos);
    238   }
    239 
    240   // No other '*' can occur in the host, though. This isn't necessary, but is
    241   // done as a convenience to developers who might otherwise be confused and
    242   // think '*' works as a glob in the host.
    243   if (host_.find('*') != std::string::npos)
    244     return PARSE_ERROR_INVALID_HOST_WILDCARD;
    245 
    246   return PARSE_SUCCESS;
    247 }
    248 
    249 void URLPattern::SetValidSchemes(int valid_schemes) {
    250   spec_.clear();
    251   valid_schemes_ = valid_schemes;
    252 }
    253 
    254 void URLPattern::SetHost(const std::string& host) {
    255   spec_.clear();
    256   host_ = host;
    257 }
    258 
    259 void URLPattern::SetMatchAllURLs(bool val) {
    260   spec_.clear();
    261   match_all_urls_ = val;
    262 
    263   if (val) {
    264     match_subdomains_ = true;
    265     scheme_ = "*";
    266     host_.clear();
    267     SetPath("/*");
    268   }
    269 }
    270 
    271 void URLPattern::SetMatchSubdomains(bool val) {
    272   spec_.clear();
    273   match_subdomains_ = val;
    274 }
    275 
    276 bool URLPattern::SetScheme(const std::string& scheme) {
    277   spec_.clear();
    278   scheme_ = scheme;
    279   if (scheme_ == "*") {
    280     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
    281   } else if (!IsValidScheme(scheme_)) {
    282     return false;
    283   }
    284   return true;
    285 }
    286 
    287 bool URLPattern::IsValidScheme(const std::string& scheme) const {
    288   if (valid_schemes_ == SCHEME_ALL)
    289     return true;
    290 
    291   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    292     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
    293       return true;
    294   }
    295 
    296   return false;
    297 }
    298 
    299 void URLPattern::SetPath(const std::string& path) {
    300   spec_.clear();
    301   path_ = path;
    302   path_escaped_ = path_;
    303   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
    304   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
    305 }
    306 
    307 bool URLPattern::SetPort(const std::string& port) {
    308   spec_.clear();
    309   if (IsValidPortForScheme(scheme_, port)) {
    310     port_ = port;
    311     return true;
    312   }
    313   return false;
    314 }
    315 
    316 bool URLPattern::MatchesURL(const GURL& test) const {
    317   const GURL* test_url = &test;
    318   bool has_inner_url = test.inner_url() != NULL;
    319 
    320   if (has_inner_url) {
    321     if (!test.SchemeIsFileSystem())
    322       return false;  // The only nested URLs we handle are filesystem URLs.
    323     test_url = test.inner_url();
    324   }
    325 
    326   if (!MatchesScheme(test_url->scheme()))
    327     return false;
    328 
    329   if (match_all_urls_)
    330     return true;
    331 
    332   std::string path_for_request = test.PathForRequest();
    333   if (has_inner_url)
    334     path_for_request = test_url->path() + path_for_request;
    335 
    336   return MatchesSecurityOriginHelper(*test_url) &&
    337          MatchesPath(path_for_request);
    338 }
    339 
    340 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
    341   const GURL* test_url = &test;
    342   bool has_inner_url = test.inner_url() != NULL;
    343 
    344   if (has_inner_url) {
    345     if (!test.SchemeIsFileSystem())
    346       return false;  // The only nested URLs we handle are filesystem URLs.
    347     test_url = test.inner_url();
    348   }
    349 
    350   if (!MatchesScheme(test_url->scheme()))
    351     return false;
    352 
    353   if (match_all_urls_)
    354     return true;
    355 
    356   return MatchesSecurityOriginHelper(*test_url);
    357 }
    358 
    359 bool URLPattern::MatchesScheme(const std::string& test) const {
    360   if (!IsValidScheme(test))
    361     return false;
    362 
    363   return scheme_ == "*" || test == scheme_;
    364 }
    365 
    366 bool URLPattern::MatchesHost(const std::string& host) const {
    367   std::string test(content::kHttpScheme);
    368   test += content::kStandardSchemeSeparator;
    369   test += host;
    370   test += "/";
    371   return MatchesHost(GURL(test));
    372 }
    373 
    374 bool URLPattern::MatchesHost(const GURL& test) const {
    375   // If the hosts are exactly equal, we have a match.
    376   if (test.host() == host_)
    377     return true;
    378 
    379   // If we're matching subdomains, and we have no host in the match pattern,
    380   // that means that we're matching all hosts, which means we have a match no
    381   // matter what the test host is.
    382   if (match_subdomains_ && host_.empty())
    383     return true;
    384 
    385   // Otherwise, we can only match if our match pattern matches subdomains.
    386   if (!match_subdomains_)
    387     return false;
    388 
    389   // We don't do subdomain matching against IP addresses, so we can give up now
    390   // if the test host is an IP address.
    391   if (test.HostIsIPAddress())
    392     return false;
    393 
    394   // Check if the test host is a subdomain of our host.
    395   if (test.host().length() <= (host_.length() + 1))
    396     return false;
    397 
    398   if (test.host().compare(test.host().length() - host_.length(),
    399                           host_.length(), host_) != 0)
    400     return false;
    401 
    402   return test.host()[test.host().length() - host_.length() - 1] == '.';
    403 }
    404 
    405 bool URLPattern::MatchesPath(const std::string& test) const {
    406   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
    407   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
    408   if (test + "/*" == path_escaped_)
    409     return true;
    410 
    411   return MatchPattern(test, path_escaped_);
    412 }
    413 
    414 const std::string& URLPattern::GetAsString() const {
    415   if (!spec_.empty())
    416     return spec_;
    417 
    418   if (match_all_urls_) {
    419     spec_ = kAllUrlsPattern;
    420     return spec_;
    421   }
    422 
    423   bool standard_scheme = IsStandardScheme(scheme_);
    424 
    425   std::string spec = scheme_ +
    426       (standard_scheme ? content::kStandardSchemeSeparator : ":");
    427 
    428   if (scheme_ != chrome::kFileScheme && standard_scheme) {
    429     if (match_subdomains_) {
    430       spec += "*";
    431       if (!host_.empty())
    432         spec += ".";
    433     }
    434 
    435     if (!host_.empty())
    436       spec += host_;
    437 
    438     if (port_ != "*") {
    439       spec += ":";
    440       spec += port_;
    441     }
    442   }
    443 
    444   if (!path_.empty())
    445     spec += path_;
    446 
    447   spec_ = spec;
    448   return spec_;
    449 }
    450 
    451 bool URLPattern::OverlapsWith(const URLPattern& other) const {
    452   if (match_all_urls() || other.match_all_urls())
    453     return true;
    454   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
    455           other.MatchesAnyScheme(GetExplicitSchemes()))
    456       && (MatchesHost(other.host()) || other.MatchesHost(host()))
    457       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
    458       && (MatchesPath(StripTrailingWildcard(other.path())) ||
    459           other.MatchesPath(StripTrailingWildcard(path())));
    460 }
    461 
    462 bool URLPattern::Contains(const URLPattern& other) const {
    463   if (match_all_urls())
    464     return true;
    465   return MatchesAllSchemes(other.GetExplicitSchemes())
    466       && MatchesHost(other.host())
    467       && MatchesPortPattern(other.port())
    468       && MatchesPath(StripTrailingWildcard(other.path()));
    469 }
    470 
    471 bool URLPattern::MatchesAnyScheme(
    472     const std::vector<std::string>& schemes) const {
    473   for (std::vector<std::string>::const_iterator i = schemes.begin();
    474        i != schemes.end(); ++i) {
    475     if (MatchesScheme(*i))
    476       return true;
    477   }
    478 
    479   return false;
    480 }
    481 
    482 bool URLPattern::MatchesAllSchemes(
    483     const std::vector<std::string>& schemes) const {
    484   for (std::vector<std::string>::const_iterator i = schemes.begin();
    485        i != schemes.end(); ++i) {
    486     if (!MatchesScheme(*i))
    487       return false;
    488   }
    489 
    490   return true;
    491 }
    492 
    493 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
    494   // Ignore hostname if scheme is file://.
    495   if (scheme_ != chrome::kFileScheme && !MatchesHost(test))
    496     return false;
    497 
    498   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
    499     return false;
    500 
    501   return true;
    502 }
    503 
    504 bool URLPattern::MatchesPortPattern(const std::string& port) const {
    505   return port_ == "*" || port_ == port;
    506 }
    507 
    508 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
    509   std::vector<std::string> result;
    510 
    511   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
    512     result.push_back(scheme_);
    513     return result;
    514   }
    515 
    516   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    517     if (MatchesScheme(kValidSchemes[i])) {
    518       result.push_back(kValidSchemes[i]);
    519     }
    520   }
    521 
    522   return result;
    523 }
    524 
    525 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
    526   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
    527   std::vector<URLPattern> result;
    528 
    529   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
    530        i != explicit_schemes.end(); ++i) {
    531     URLPattern temp = *this;
    532     temp.SetScheme(*i);
    533     temp.SetMatchAllURLs(false);
    534     result.push_back(temp);
    535   }
    536 
    537   return result;
    538 }
    539 
    540 // static
    541 const char* URLPattern::GetParseResultString(
    542     URLPattern::ParseResult parse_result) {
    543   return kParseResultMessages[parse_result];
    544 }
    545