Home | History | Annotate | Download | only in extensions
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/common/extensions/url_pattern.h"
      6 
      7 #include "base/string_piece.h"
      8 #include "base/string_split.h"
      9 #include "base/string_util.h"
     10 #include "chrome/common/url_constants.h"
     11 #include "googleurl/src/gurl.h"
     12 #include "googleurl/src/url_util.h"
     13 
     14 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
     15 
     16 namespace {
     17 
     18 // TODO(aa): Consider adding chrome-extension? What about more obscure ones
     19 // like data: and javascript: ?
     20 // Note: keep this array in sync with kValidSchemeMasks.
     21 const char* kValidSchemes[] = {
     22   chrome::kHttpScheme,
     23   chrome::kHttpsScheme,
     24   chrome::kFileScheme,
     25   chrome::kFtpScheme,
     26   chrome::kChromeUIScheme,
     27   chrome::kFileSystemScheme,
     28 };
     29 
     30 const int kValidSchemeMasks[] = {
     31   URLPattern::SCHEME_HTTP,
     32   URLPattern::SCHEME_HTTPS,
     33   URLPattern::SCHEME_FILE,
     34   URLPattern::SCHEME_FTP,
     35   URLPattern::SCHEME_CHROMEUI,
     36   URLPattern::SCHEME_FILESYSTEM,
     37 };
     38 
     39 COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
     40                must_keep_these_arrays_in_sync);
     41 
     42 const char* kParseSuccess = "Success.";
     43 const char* kParseErrorMissingSchemeSeparator = "Missing scheme separator.";
     44 const char* kParseErrorInvalidScheme = "Invalid scheme.";
     45 const char* kParseErrorWrongSchemeType = "Wrong scheme type.";
     46 const char* kParseErrorEmptyHost = "Host can not be empty.";
     47 const char* kParseErrorInvalidHostWildcard = "Invalid host wildcard.";
     48 const char* kParseErrorEmptyPath = "Empty path.";
     49 const char* kParseErrorHasColon =
     50     "Ports are not supported in URL patterns. ':' may not be used in a host.";
     51 
     52 // Message explaining each URLPattern::ParseResult.
     53 const char* kParseResultMessages[] = {
     54   kParseSuccess,
     55   kParseErrorMissingSchemeSeparator,
     56   kParseErrorInvalidScheme,
     57   kParseErrorWrongSchemeType,
     58   kParseErrorEmptyHost,
     59   kParseErrorInvalidHostWildcard,
     60   kParseErrorEmptyPath,
     61   kParseErrorHasColon
     62 };
     63 
     64 COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
     65                must_add_message_for_each_parse_result);
     66 
     67 const char kPathSeparator[] = "/";
     68 
     69 bool IsStandardScheme(const std::string& scheme) {
     70   // "*" gets the same treatment as a standard scheme.
     71   if (scheme == "*")
     72     return true;
     73 
     74   return url_util::IsStandard(scheme.c_str(),
     75       url_parse::Component(0, static_cast<int>(scheme.length())));
     76 }
     77 
     78 }  // namespace
     79 
     80 URLPattern::URLPattern()
     81     : valid_schemes_(SCHEME_NONE),
     82       match_all_urls_(false),
     83       match_subdomains_(false) {}
     84 
     85 URLPattern::URLPattern(int valid_schemes)
     86     : valid_schemes_(valid_schemes), match_all_urls_(false),
     87       match_subdomains_(false) {}
     88 
     89 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
     90     : valid_schemes_(valid_schemes), match_all_urls_(false),
     91       match_subdomains_(false) {
     92 
     93   // Strict error checking is used, because this constructor is only
     94   // appropriate when we know |pattern| is valid.
     95   if (PARSE_SUCCESS != Parse(pattern, PARSE_STRICT))
     96     NOTREACHED() << "URLPattern is invalid: " << pattern;
     97 }
     98 
     99 URLPattern::~URLPattern() {
    100 }
    101 
    102 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern,
    103                                           ParseOption strictness) {
    104   CHECK(strictness == PARSE_LENIENT ||
    105         strictness == PARSE_STRICT);
    106 
    107   // Special case pattern to match every valid URL.
    108   if (pattern == kAllUrlsPattern) {
    109     match_all_urls_ = true;
    110     match_subdomains_ = true;
    111     scheme_ = "*";
    112     host_.clear();
    113     SetPath("/*");
    114     return PARSE_SUCCESS;
    115   }
    116 
    117   // Parse out the scheme.
    118   size_t scheme_end_pos = pattern.find(chrome::kStandardSchemeSeparator);
    119   bool has_standard_scheme_separator = true;
    120 
    121   // Some urls also use ':' alone as the scheme separator.
    122   if (scheme_end_pos == std::string::npos) {
    123     scheme_end_pos = pattern.find(':');
    124     has_standard_scheme_separator = false;
    125   }
    126 
    127   if (scheme_end_pos == std::string::npos)
    128     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
    129 
    130   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
    131     return PARSE_ERROR_INVALID_SCHEME;
    132 
    133   bool standard_scheme = IsStandardScheme(scheme_);
    134   if (standard_scheme != has_standard_scheme_separator)
    135     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
    136 
    137   // Advance past the scheme separator.
    138   scheme_end_pos +=
    139       (standard_scheme ? strlen(chrome::kStandardSchemeSeparator) : 1);
    140   if (scheme_end_pos >= pattern.size())
    141     return PARSE_ERROR_EMPTY_HOST;
    142 
    143   // Parse out the host and path.
    144   size_t host_start_pos = scheme_end_pos;
    145   size_t path_start_pos = 0;
    146 
    147   // File URLs are special because they have no host.
    148   if (scheme_ == chrome::kFileScheme || !standard_scheme) {
    149     path_start_pos = host_start_pos;
    150   } else {
    151     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    152 
    153     // Host is required.
    154     if (host_start_pos == host_end_pos)
    155       return PARSE_ERROR_EMPTY_HOST;
    156 
    157     if (host_end_pos == std::string::npos)
    158       return PARSE_ERROR_EMPTY_PATH;
    159 
    160     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
    161 
    162     // The first component can optionally be '*' to match all subdomains.
    163     std::vector<std::string> host_components;
    164     base::SplitString(host_, '.', &host_components);
    165     if (host_components[0] == "*") {
    166       match_subdomains_ = true;
    167       host_components.erase(host_components.begin(),
    168                             host_components.begin() + 1);
    169     }
    170     host_ = JoinString(host_components, '.');
    171 
    172     // No other '*' can occur in the host, though. This isn't necessary, but is
    173     // done as a convenience to developers who might otherwise be confused and
    174     // think '*' works as a glob in the host.
    175     if (host_.find('*') != std::string::npos)
    176       return PARSE_ERROR_INVALID_HOST_WILDCARD;
    177 
    178     path_start_pos = host_end_pos;
    179   }
    180 
    181   SetPath(pattern.substr(path_start_pos));
    182 
    183   if (strictness == PARSE_STRICT && host_.find(':') != std::string::npos)
    184     return PARSE_ERROR_HAS_COLON;
    185 
    186   return PARSE_SUCCESS;
    187 }
    188 
    189 bool URLPattern::SetScheme(const std::string& scheme) {
    190   scheme_ = scheme;
    191   if (scheme_ == "*") {
    192     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
    193   } else if (!IsValidScheme(scheme_)) {
    194     return false;
    195   }
    196   return true;
    197 }
    198 
    199 bool URLPattern::IsValidScheme(const std::string& scheme) const {
    200   if (valid_schemes_ == SCHEME_ALL)
    201     return true;
    202 
    203   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    204     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
    205       return true;
    206   }
    207 
    208   return false;
    209 }
    210 
    211 void URLPattern::SetPath(const std::string& path) {
    212   path_ = path;
    213   path_escaped_ = path_;
    214   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
    215   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
    216 }
    217 
    218 bool URLPattern::MatchesUrl(const GURL &test) const {
    219   if (!MatchesScheme(test.scheme()))
    220     return false;
    221 
    222   if (match_all_urls_)
    223     return true;
    224 
    225   if (!MatchesHost(test))
    226     return false;
    227 
    228   if (!MatchesPath(test.PathForRequest()))
    229     return false;
    230 
    231   return true;
    232 }
    233 
    234 bool URLPattern::MatchesScheme(const std::string& test) const {
    235   if (!IsValidScheme(test))
    236     return false;
    237 
    238   return scheme_ == "*" || test == scheme_;
    239 }
    240 
    241 bool URLPattern::MatchesHost(const std::string& host) const {
    242   std::string test(chrome::kHttpScheme);
    243   test += chrome::kStandardSchemeSeparator;
    244   test += host;
    245   test += "/";
    246   return MatchesHost(GURL(test));
    247 }
    248 
    249 bool URLPattern::MatchesHost(const GURL& test) const {
    250   // If the hosts are exactly equal, we have a match.
    251   if (test.host() == host_)
    252     return true;
    253 
    254   // If we're matching subdomains, and we have no host in the match pattern,
    255   // that means that we're matching all hosts, which means we have a match no
    256   // matter what the test host is.
    257   if (match_subdomains_ && host_.empty())
    258     return true;
    259 
    260   // Otherwise, we can only match if our match pattern matches subdomains.
    261   if (!match_subdomains_)
    262     return false;
    263 
    264   // We don't do subdomain matching against IP addresses, so we can give up now
    265   // if the test host is an IP address.
    266   if (test.HostIsIPAddress())
    267     return false;
    268 
    269   // Check if the test host is a subdomain of our host.
    270   if (test.host().length() <= (host_.length() + 1))
    271     return false;
    272 
    273   if (test.host().compare(test.host().length() - host_.length(),
    274                           host_.length(), host_) != 0)
    275     return false;
    276 
    277   return test.host()[test.host().length() - host_.length() - 1] == '.';
    278 }
    279 
    280 bool URLPattern::MatchesPath(const std::string& test) const {
    281   if (!MatchPattern(test, path_escaped_))
    282     return false;
    283 
    284   return true;
    285 }
    286 
    287 std::string URLPattern::GetAsString() const {
    288   if (match_all_urls_)
    289     return kAllUrlsPattern;
    290 
    291   bool standard_scheme = IsStandardScheme(scheme_);
    292 
    293   std::string spec = scheme_ +
    294       (standard_scheme ? chrome::kStandardSchemeSeparator : ":");
    295 
    296   if (scheme_ != chrome::kFileScheme && standard_scheme) {
    297     if (match_subdomains_) {
    298       spec += "*";
    299       if (!host_.empty())
    300         spec += ".";
    301     }
    302 
    303     if (!host_.empty())
    304       spec += host_;
    305   }
    306 
    307   if (!path_.empty())
    308     spec += path_;
    309 
    310   return spec;
    311 }
    312 
    313 bool URLPattern::OverlapsWith(const URLPattern& other) const {
    314   if (!MatchesScheme(other.scheme_) && !other.MatchesScheme(scheme_))
    315     return false;
    316 
    317   if (!MatchesHost(other.host()) && !other.MatchesHost(host_))
    318     return false;
    319 
    320   // We currently only use OverlapsWith() for the patterns inside
    321   // ExtensionExtent. In those cases, we know that the path will have only a
    322   // single wildcard at the end. This makes figuring out overlap much easier. It
    323   // seems like there is probably a computer-sciency way to solve the general
    324   // case, but we don't need that yet.
    325   DCHECK(path_.find('*') == path_.size() - 1);
    326   DCHECK(other.path().find('*') == other.path().size() - 1);
    327 
    328   if (!MatchesPath(other.path().substr(0, other.path().size() - 1)) &&
    329       !other.MatchesPath(path_.substr(0, path_.size() - 1)))
    330     return false;
    331 
    332   return true;
    333 }
    334 
    335 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
    336   std::vector<URLPattern> result;
    337 
    338   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
    339     result.push_back(*this);
    340     return result;
    341   }
    342 
    343   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    344     if (MatchesScheme(kValidSchemes[i])) {
    345       URLPattern temp = *this;
    346       temp.SetScheme(kValidSchemes[i]);
    347       temp.set_match_all_urls(false);
    348       result.push_back(temp);
    349     }
    350   }
    351 
    352   return result;
    353 }
    354 
    355 // static
    356 const char* URLPattern::GetParseResultString(
    357     URLPattern::ParseResult parse_result) {
    358   return kParseResultMessages[parse_result];
    359 }
    360