1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "extensions/common/url_pattern.h" 6 7 #include <ostream> 8 9 #include "base/strings/string_number_conversions.h" 10 #include "base/strings/string_piece.h" 11 #include "base/strings/string_split.h" 12 #include "base/strings/string_util.h" 13 #include "base/strings/stringprintf.h" 14 #include "content/public/common/url_constants.h" 15 #include "extensions/common/constants.h" 16 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" 17 #include "url/gurl.h" 18 #include "url/url_util.h" 19 20 const char URLPattern::kAllUrlsPattern[] = "<all_urls>"; 21 22 namespace { 23 24 // TODO(aa): What about more obscure schemes like data: and javascript: ? 25 // Note: keep this array in sync with kValidSchemeMasks. 26 const char* kValidSchemes[] = { 27 url::kHttpScheme, 28 url::kHttpsScheme, 29 url::kFileScheme, 30 url::kFtpScheme, 31 content::kChromeUIScheme, 32 extensions::kExtensionScheme, 33 url::kFileSystemScheme, 34 }; 35 36 const int kValidSchemeMasks[] = { 37 URLPattern::SCHEME_HTTP, 38 URLPattern::SCHEME_HTTPS, 39 URLPattern::SCHEME_FILE, 40 URLPattern::SCHEME_FTP, 41 URLPattern::SCHEME_CHROMEUI, 42 URLPattern::SCHEME_EXTENSION, 43 URLPattern::SCHEME_FILESYSTEM, 44 }; 45 46 COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks), 47 must_keep_these_arrays_in_sync); 48 49 const char kParseSuccess[] = "Success."; 50 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator."; 51 const char kParseErrorInvalidScheme[] = "Invalid scheme."; 52 const char kParseErrorWrongSchemeType[] = "Wrong scheme type."; 53 const char kParseErrorEmptyHost[] = "Host can not be empty."; 54 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard."; 55 const char kParseErrorEmptyPath[] = "Empty path."; 56 const char kParseErrorInvalidPort[] = "Invalid port."; 57 const char kParseErrorInvalidHost[] = "Invalid host."; 58 59 // Message explaining each URLPattern::ParseResult. 60 const char* const kParseResultMessages[] = { 61 kParseSuccess, 62 kParseErrorMissingSchemeSeparator, 63 kParseErrorInvalidScheme, 64 kParseErrorWrongSchemeType, 65 kParseErrorEmptyHost, 66 kParseErrorInvalidHostWildcard, 67 kParseErrorEmptyPath, 68 kParseErrorInvalidPort, 69 kParseErrorInvalidHost, 70 }; 71 72 COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages), 73 must_add_message_for_each_parse_result); 74 75 const char kPathSeparator[] = "/"; 76 77 bool IsStandardScheme(const std::string& scheme) { 78 // "*" gets the same treatment as a standard scheme. 79 if (scheme == "*") 80 return true; 81 82 return url::IsStandard(scheme.c_str(), 83 url::Component(0, static_cast<int>(scheme.length()))); 84 } 85 86 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) { 87 if (port == "*") 88 return true; 89 90 // Only accept non-wildcard ports if the scheme uses ports. 91 if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) == 92 url::PORT_UNSPECIFIED) { 93 return false; 94 } 95 96 int parsed_port = url::PORT_UNSPECIFIED; 97 if (!base::StringToInt(port, &parsed_port)) 98 return false; 99 return (parsed_port >= 0) && (parsed_port < 65536); 100 } 101 102 // Returns |path| with the trailing wildcard stripped if one existed. 103 // 104 // The functions that rely on this (OverlapsWith and Contains) are only 105 // called for the patterns inside URLPatternSet. In those cases, we know that 106 // the path will have only a single wildcard at the end. This makes figuring 107 // out overlap much easier. It seems like there is probably a computer-sciency 108 // way to solve the general case, but we don't need that yet. 109 std::string StripTrailingWildcard(const std::string& path) { 110 size_t wildcard_index = path.find('*'); 111 size_t path_last = path.size() - 1; 112 DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last); 113 return wildcard_index == path_last ? path.substr(0, path_last) : path; 114 } 115 116 } // namespace 117 118 // static 119 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) { 120 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) { 121 if (scheme == kValidSchemes[i]) 122 return true; 123 } 124 return false; 125 } 126 127 URLPattern::URLPattern() 128 : valid_schemes_(SCHEME_NONE), 129 match_all_urls_(false), 130 match_subdomains_(false), 131 port_("*") {} 132 133 URLPattern::URLPattern(int valid_schemes) 134 : valid_schemes_(valid_schemes), 135 match_all_urls_(false), 136 match_subdomains_(false), 137 port_("*") {} 138 139 URLPattern::URLPattern(int valid_schemes, const std::string& pattern) 140 // Strict error checking is used, because this constructor is only 141 // appropriate when we know |pattern| is valid. 142 : valid_schemes_(valid_schemes), 143 match_all_urls_(false), 144 match_subdomains_(false), 145 port_("*") { 146 ParseResult result = Parse(pattern); 147 if (PARSE_SUCCESS != result) 148 NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result; 149 } 150 151 URLPattern::~URLPattern() { 152 } 153 154 bool URLPattern::operator<(const URLPattern& other) const { 155 return GetAsString() < other.GetAsString(); 156 } 157 158 bool URLPattern::operator>(const URLPattern& other) const { 159 return GetAsString() > other.GetAsString(); 160 } 161 162 bool URLPattern::operator==(const URLPattern& other) const { 163 return GetAsString() == other.GetAsString(); 164 } 165 166 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) { 167 return out << '"' << url_pattern.GetAsString() << '"'; 168 } 169 170 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) { 171 spec_.clear(); 172 SetMatchAllURLs(false); 173 SetMatchSubdomains(false); 174 SetPort("*"); 175 176 // Special case pattern to match every valid URL. 177 if (pattern == kAllUrlsPattern) { 178 SetMatchAllURLs(true); 179 return PARSE_SUCCESS; 180 } 181 182 // Parse out the scheme. 183 size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator); 184 bool has_standard_scheme_separator = true; 185 186 // Some urls also use ':' alone as the scheme separator. 187 if (scheme_end_pos == std::string::npos) { 188 scheme_end_pos = pattern.find(':'); 189 has_standard_scheme_separator = false; 190 } 191 192 if (scheme_end_pos == std::string::npos) 193 return PARSE_ERROR_MISSING_SCHEME_SEPARATOR; 194 195 if (!SetScheme(pattern.substr(0, scheme_end_pos))) 196 return PARSE_ERROR_INVALID_SCHEME; 197 198 bool standard_scheme = IsStandardScheme(scheme_); 199 if (standard_scheme != has_standard_scheme_separator) 200 return PARSE_ERROR_WRONG_SCHEME_SEPARATOR; 201 202 // Advance past the scheme separator. 203 scheme_end_pos += 204 (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1); 205 if (scheme_end_pos >= pattern.size()) 206 return PARSE_ERROR_EMPTY_HOST; 207 208 // Parse out the host and path. 209 size_t host_start_pos = scheme_end_pos; 210 size_t path_start_pos = 0; 211 212 if (!standard_scheme) { 213 path_start_pos = host_start_pos; 214 } else if (scheme_ == url::kFileScheme) { 215 size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos); 216 if (host_end_pos == std::string::npos) { 217 // Allow hostname omission. 218 // e.g. file://* is interpreted as file:///*, 219 // file://foo* is interpreted as file:///foo*. 220 path_start_pos = host_start_pos - 1; 221 } else { 222 // Ignore hostname if scheme is file://. 223 // e.g. file://localhost/foo is equal to file:///foo. 224 path_start_pos = host_end_pos; 225 } 226 } else { 227 size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos); 228 229 // Host is required. 230 if (host_start_pos == host_end_pos) 231 return PARSE_ERROR_EMPTY_HOST; 232 233 if (host_end_pos == std::string::npos) 234 return PARSE_ERROR_EMPTY_PATH; 235 236 host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos); 237 238 // The first component can optionally be '*' to match all subdomains. 239 std::vector<std::string> host_components; 240 base::SplitString(host_, '.', &host_components); 241 242 // Could be empty if the host only consists of whitespace characters. 243 if (host_components.empty()) 244 return PARSE_ERROR_EMPTY_HOST; 245 246 if (host_components[0] == "*") { 247 match_subdomains_ = true; 248 host_components.erase(host_components.begin(), 249 host_components.begin() + 1); 250 } 251 host_ = JoinString(host_components, '.'); 252 253 path_start_pos = host_end_pos; 254 } 255 256 SetPath(pattern.substr(path_start_pos)); 257 258 size_t port_pos = host_.find(':'); 259 if (port_pos != std::string::npos) { 260 if (!SetPort(host_.substr(port_pos + 1))) 261 return PARSE_ERROR_INVALID_PORT; 262 host_ = host_.substr(0, port_pos); 263 } 264 265 // No other '*' can occur in the host, though. This isn't necessary, but is 266 // done as a convenience to developers who might otherwise be confused and 267 // think '*' works as a glob in the host. 268 if (host_.find('*') != std::string::npos) 269 return PARSE_ERROR_INVALID_HOST_WILDCARD; 270 271 // Null characters are not allowed in hosts. 272 if (host_.find('\0') != std::string::npos) 273 return PARSE_ERROR_INVALID_HOST; 274 275 return PARSE_SUCCESS; 276 } 277 278 void URLPattern::SetValidSchemes(int valid_schemes) { 279 spec_.clear(); 280 valid_schemes_ = valid_schemes; 281 } 282 283 void URLPattern::SetHost(const std::string& host) { 284 spec_.clear(); 285 host_ = host; 286 } 287 288 void URLPattern::SetMatchAllURLs(bool val) { 289 spec_.clear(); 290 match_all_urls_ = val; 291 292 if (val) { 293 match_subdomains_ = true; 294 scheme_ = "*"; 295 host_.clear(); 296 SetPath("/*"); 297 } 298 } 299 300 void URLPattern::SetMatchSubdomains(bool val) { 301 spec_.clear(); 302 match_subdomains_ = val; 303 } 304 305 bool URLPattern::SetScheme(const std::string& scheme) { 306 spec_.clear(); 307 scheme_ = scheme; 308 if (scheme_ == "*") { 309 valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS); 310 } else if (!IsValidScheme(scheme_)) { 311 return false; 312 } 313 return true; 314 } 315 316 bool URLPattern::IsValidScheme(const std::string& scheme) const { 317 if (valid_schemes_ == SCHEME_ALL) 318 return true; 319 320 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) { 321 if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i])) 322 return true; 323 } 324 325 return false; 326 } 327 328 void URLPattern::SetPath(const std::string& path) { 329 spec_.clear(); 330 path_ = path; 331 path_escaped_ = path_; 332 ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\"); 333 ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?"); 334 } 335 336 bool URLPattern::SetPort(const std::string& port) { 337 spec_.clear(); 338 if (IsValidPortForScheme(scheme_, port)) { 339 port_ = port; 340 return true; 341 } 342 return false; 343 } 344 345 bool URLPattern::MatchesURL(const GURL& test) const { 346 const GURL* test_url = &test; 347 bool has_inner_url = test.inner_url() != NULL; 348 349 if (has_inner_url) { 350 if (!test.SchemeIsFileSystem()) 351 return false; // The only nested URLs we handle are filesystem URLs. 352 test_url = test.inner_url(); 353 } 354 355 if (!MatchesScheme(test_url->scheme())) 356 return false; 357 358 if (match_all_urls_) 359 return true; 360 361 std::string path_for_request = test.PathForRequest(); 362 if (has_inner_url) 363 path_for_request = test_url->path() + path_for_request; 364 365 return MatchesSecurityOriginHelper(*test_url) && 366 MatchesPath(path_for_request); 367 } 368 369 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const { 370 const GURL* test_url = &test; 371 bool has_inner_url = test.inner_url() != NULL; 372 373 if (has_inner_url) { 374 if (!test.SchemeIsFileSystem()) 375 return false; // The only nested URLs we handle are filesystem URLs. 376 test_url = test.inner_url(); 377 } 378 379 if (!MatchesScheme(test_url->scheme())) 380 return false; 381 382 if (match_all_urls_) 383 return true; 384 385 return MatchesSecurityOriginHelper(*test_url); 386 } 387 388 bool URLPattern::MatchesScheme(const std::string& test) const { 389 if (!IsValidScheme(test)) 390 return false; 391 392 return scheme_ == "*" || test == scheme_; 393 } 394 395 bool URLPattern::MatchesHost(const std::string& host) const { 396 std::string test(url::kHttpScheme); 397 test += url::kStandardSchemeSeparator; 398 test += host; 399 test += "/"; 400 return MatchesHost(GURL(test)); 401 } 402 403 bool URLPattern::MatchesHost(const GURL& test) const { 404 // If the hosts are exactly equal, we have a match. 405 if (test.host() == host_) 406 return true; 407 408 // If we're matching subdomains, and we have no host in the match pattern, 409 // that means that we're matching all hosts, which means we have a match no 410 // matter what the test host is. 411 if (match_subdomains_ && host_.empty()) 412 return true; 413 414 // Otherwise, we can only match if our match pattern matches subdomains. 415 if (!match_subdomains_) 416 return false; 417 418 // We don't do subdomain matching against IP addresses, so we can give up now 419 // if the test host is an IP address. 420 if (test.HostIsIPAddress()) 421 return false; 422 423 // Check if the test host is a subdomain of our host. 424 if (test.host().length() <= (host_.length() + 1)) 425 return false; 426 427 if (test.host().compare(test.host().length() - host_.length(), 428 host_.length(), host_) != 0) 429 return false; 430 431 return test.host()[test.host().length() - host_.length() - 1] == '.'; 432 } 433 434 bool URLPattern::ImpliesAllHosts() const { 435 // Check if it matches all urls or is a pattern like http://*/*. 436 if (match_all_urls_ || 437 (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) { 438 return true; 439 } 440 441 // If this doesn't even match subdomains, it can't possibly imply all hosts. 442 if (!match_subdomains_) 443 return false; 444 445 // If |host_| is a recognized TLD, this will be 0. We don't include private 446 // TLDs, so that, e.g., *.appspot.com does not imply all hosts. 447 size_t registry_length = net::registry_controlled_domains::GetRegistryLength( 448 host_, 449 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, 450 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 451 // If there was more than just a TLD in the host (e.g., *.foobar.com), it 452 // doesn't imply all hosts. 453 if (registry_length > 0) 454 return false; 455 456 // At this point the host could either be just a TLD ("com") or some unknown 457 // TLD-like string ("notatld"). To disambiguate between them construct a 458 // fake URL, and check the registry. This returns 0 if the TLD is 459 // unrecognized, or the length of the recognized TLD. 460 registry_length = net::registry_controlled_domains::GetRegistryLength( 461 base::StringPrintf("foo.%s", host_.c_str()), 462 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, 463 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 464 // If we recognized this TLD, then this is a pattern like *.com, and it 465 // should imply all hosts. Otherwise, this doesn't imply all hosts. 466 return registry_length > 0; 467 } 468 469 bool URLPattern::MatchesSingleOrigin() const { 470 // Strictly speaking, the port is part of the origin, but in URLPattern it 471 // defaults to *. It's not very interesting anyway, so leave it out. 472 return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_; 473 } 474 475 bool URLPattern::MatchesPath(const std::string& test) const { 476 // Make the behaviour of OverlapsWith consistent with MatchesURL, which is 477 // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'. 478 if (test + "/*" == path_escaped_) 479 return true; 480 481 return MatchPattern(test, path_escaped_); 482 } 483 484 const std::string& URLPattern::GetAsString() const { 485 if (!spec_.empty()) 486 return spec_; 487 488 if (match_all_urls_) { 489 spec_ = kAllUrlsPattern; 490 return spec_; 491 } 492 493 bool standard_scheme = IsStandardScheme(scheme_); 494 495 std::string spec = scheme_ + 496 (standard_scheme ? url::kStandardSchemeSeparator : ":"); 497 498 if (scheme_ != url::kFileScheme && standard_scheme) { 499 if (match_subdomains_) { 500 spec += "*"; 501 if (!host_.empty()) 502 spec += "."; 503 } 504 505 if (!host_.empty()) 506 spec += host_; 507 508 if (port_ != "*") { 509 spec += ":"; 510 spec += port_; 511 } 512 } 513 514 if (!path_.empty()) 515 spec += path_; 516 517 spec_ = spec; 518 return spec_; 519 } 520 521 bool URLPattern::OverlapsWith(const URLPattern& other) const { 522 if (match_all_urls() || other.match_all_urls()) 523 return true; 524 return (MatchesAnyScheme(other.GetExplicitSchemes()) || 525 other.MatchesAnyScheme(GetExplicitSchemes())) 526 && (MatchesHost(other.host()) || other.MatchesHost(host())) 527 && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port())) 528 && (MatchesPath(StripTrailingWildcard(other.path())) || 529 other.MatchesPath(StripTrailingWildcard(path()))); 530 } 531 532 bool URLPattern::Contains(const URLPattern& other) const { 533 if (match_all_urls()) 534 return true; 535 return MatchesAllSchemes(other.GetExplicitSchemes()) 536 && MatchesHost(other.host()) 537 && MatchesPortPattern(other.port()) 538 && MatchesPath(StripTrailingWildcard(other.path())); 539 } 540 541 bool URLPattern::MatchesAnyScheme( 542 const std::vector<std::string>& schemes) const { 543 for (std::vector<std::string>::const_iterator i = schemes.begin(); 544 i != schemes.end(); ++i) { 545 if (MatchesScheme(*i)) 546 return true; 547 } 548 549 return false; 550 } 551 552 bool URLPattern::MatchesAllSchemes( 553 const std::vector<std::string>& schemes) const { 554 for (std::vector<std::string>::const_iterator i = schemes.begin(); 555 i != schemes.end(); ++i) { 556 if (!MatchesScheme(*i)) 557 return false; 558 } 559 560 return true; 561 } 562 563 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const { 564 // Ignore hostname if scheme is file://. 565 if (scheme_ != url::kFileScheme && !MatchesHost(test)) 566 return false; 567 568 if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort()))) 569 return false; 570 571 return true; 572 } 573 574 bool URLPattern::MatchesPortPattern(const std::string& port) const { 575 return port_ == "*" || port_ == port; 576 } 577 578 std::vector<std::string> URLPattern::GetExplicitSchemes() const { 579 std::vector<std::string> result; 580 581 if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) { 582 result.push_back(scheme_); 583 return result; 584 } 585 586 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) { 587 if (MatchesScheme(kValidSchemes[i])) { 588 result.push_back(kValidSchemes[i]); 589 } 590 } 591 592 return result; 593 } 594 595 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const { 596 std::vector<std::string> explicit_schemes = GetExplicitSchemes(); 597 std::vector<URLPattern> result; 598 599 for (std::vector<std::string>::const_iterator i = explicit_schemes.begin(); 600 i != explicit_schemes.end(); ++i) { 601 URLPattern temp = *this; 602 temp.SetScheme(*i); 603 temp.SetMatchAllURLs(false); 604 result.push_back(temp); 605 } 606 607 return result; 608 } 609 610 // static 611 const char* URLPattern::GetParseResultString( 612 URLPattern::ParseResult parse_result) { 613 return kParseResultMessages[parse_result]; 614 } 615