1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifdef WIN32 6 #include <windows.h> 7 #else 8 #include <pthread.h> 9 #endif 10 11 #include <algorithm> 12 #include <ostream> 13 14 #include "url/gurl.h" 15 16 #include "base/logging.h" 17 #include "url/url_canon_stdstring.h" 18 #include "url/url_util.h" 19 20 namespace { 21 22 // External template that can handle initialization of either character type. 23 // The input spec is given, and the canonical version will be placed in 24 // |*canonical|, along with the parsing of the canonical spec in |*parsed|. 25 template<typename STR> 26 bool InitCanonical(const STR& input_spec, 27 std::string* canonical, 28 url_parse::Parsed* parsed) { 29 // Reserve enough room in the output for the input, plus some extra so that 30 // we have room if we have to escape a few things without reallocating. 31 canonical->reserve(input_spec.size() + 32); 32 url_canon::StdStringCanonOutput output(canonical); 33 bool success = url_util::Canonicalize( 34 input_spec.data(), static_cast<int>(input_spec.length()), 35 NULL, &output, parsed); 36 37 output.Complete(); // Must be done before using string. 38 return success; 39 } 40 41 static std::string* empty_string = NULL; 42 static GURL* empty_gurl = NULL; 43 44 #ifdef WIN32 45 46 // Returns a static reference to an empty string for returning a reference 47 // when there is no underlying string. 48 const std::string& EmptyStringForGURL() { 49 // Avoid static object construction/destruction on startup/shutdown. 50 if (!empty_string) { 51 // Create the string. Be careful that we don't break in the case that this 52 // is being called from multiple threads. Statics are not threadsafe. 53 std::string* new_empty_string = new std::string; 54 if (InterlockedCompareExchangePointer( 55 reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) { 56 // The old value was non-NULL, so no replacement was done. Another 57 // thread did the initialization out from under us. 58 delete new_empty_string; 59 } 60 } 61 return *empty_string; 62 } 63 64 #else 65 66 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT; 67 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT; 68 69 void EmptyStringForGURLOnce(void) { 70 empty_string = new std::string; 71 } 72 73 const std::string& EmptyStringForGURL() { 74 // Avoid static object construction/destruction on startup/shutdown. 75 pthread_once(&empty_string_once, EmptyStringForGURLOnce); 76 return *empty_string; 77 } 78 79 #endif // WIN32 80 81 } // namespace 82 83 GURL::GURL() : is_valid_(false), inner_url_(NULL) { 84 } 85 86 GURL::GURL(const GURL& other) 87 : spec_(other.spec_), 88 is_valid_(other.is_valid_), 89 parsed_(other.parsed_), 90 inner_url_(NULL) { 91 if (other.inner_url_) 92 inner_url_ = new GURL(*other.inner_url_); 93 // Valid filesystem urls should always have an inner_url_. 94 DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_); 95 } 96 97 GURL::GURL(const std::string& url_string) : inner_url_(NULL) { 98 is_valid_ = InitCanonical(url_string, &spec_, &parsed_); 99 if (is_valid_ && SchemeIsFileSystem()) { 100 inner_url_ = 101 new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true); 102 } 103 } 104 105 GURL::GURL(const base::string16& url_string) : inner_url_(NULL) { 106 is_valid_ = InitCanonical(url_string, &spec_, &parsed_); 107 if (is_valid_ && SchemeIsFileSystem()) { 108 inner_url_ = 109 new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true); 110 } 111 } 112 113 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len, 114 const url_parse::Parsed& parsed, bool is_valid) 115 : spec_(canonical_spec, canonical_spec_len), 116 is_valid_(is_valid), 117 parsed_(parsed), 118 inner_url_(NULL) { 119 InitializeFromCanonicalSpec(); 120 } 121 122 GURL::GURL(std::string canonical_spec, 123 const url_parse::Parsed& parsed, bool is_valid) 124 : is_valid_(is_valid), 125 parsed_(parsed), 126 inner_url_(NULL) { 127 spec_.swap(canonical_spec); 128 InitializeFromCanonicalSpec(); 129 } 130 131 void GURL::InitializeFromCanonicalSpec() { 132 if (is_valid_ && SchemeIsFileSystem()) { 133 inner_url_ = 134 new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true); 135 } 136 137 #ifndef NDEBUG 138 // For testing purposes, check that the parsed canonical URL is identical to 139 // what we would have produced. Skip checking for invalid URLs have no meaning 140 // and we can't always canonicalize them reproducibly. 141 // Skip checking for non-standard URLs as they may have trailing white-space 142 // and we can't always canonicalize them exactly. TODO(joth): see if we 143 // can do a better job on this e.g. by not stripping trailing white-space 144 // for non-standard URLs in this validation path. http://crbug.com/291747. 145 if (is_valid_ && IsStandard()) { 146 url_parse::Component scheme; 147 if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(), 148 "filesystem", &scheme) || 149 scheme.begin == parsed_.scheme.begin) { 150 // We can't do this check on the inner_url of a filesystem URL, as 151 // canonical_spec actually points to the start of the outer URL, so we'd 152 // end up with infinite recursion in this constructor. 153 GURL test_url(spec_); 154 155 DCHECK(test_url.is_valid_ == is_valid_); 156 DCHECK(test_url.spec_ == spec_); 157 158 DCHECK(test_url.parsed_.scheme == parsed_.scheme); 159 DCHECK(test_url.parsed_.username == parsed_.username); 160 DCHECK(test_url.parsed_.password == parsed_.password); 161 DCHECK(test_url.parsed_.host == parsed_.host); 162 DCHECK(test_url.parsed_.port == parsed_.port); 163 DCHECK(test_url.parsed_.path == parsed_.path); 164 DCHECK(test_url.parsed_.query == parsed_.query); 165 DCHECK(test_url.parsed_.ref == parsed_.ref); 166 } 167 } 168 #endif 169 } 170 171 GURL::~GURL() { 172 delete inner_url_; 173 } 174 175 GURL& GURL::operator=(const GURL& other) { 176 spec_ = other.spec_; 177 is_valid_ = other.is_valid_; 178 parsed_ = other.parsed_; 179 delete inner_url_; 180 inner_url_ = NULL; 181 if (other.inner_url_) 182 inner_url_ = new GURL(*other.inner_url_); 183 // Valid filesystem urls should always have an inner_url_. 184 DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_); 185 return *this; 186 } 187 188 const std::string& GURL::spec() const { 189 if (is_valid_ || spec_.empty()) 190 return spec_; 191 192 DCHECK(false) << "Trying to get the spec of an invalid URL!"; 193 return EmptyStringForGURL(); 194 } 195 196 GURL GURL::Resolve(const std::string& relative) const { 197 return ResolveWithCharsetConverter(relative, NULL); 198 } 199 GURL GURL::Resolve(const base::string16& relative) const { 200 return ResolveWithCharsetConverter(relative, NULL); 201 } 202 203 // Note: code duplicated below (it's inconvenient to use a template here). 204 GURL GURL::ResolveWithCharsetConverter( 205 const std::string& relative, 206 url_canon::CharsetConverter* charset_converter) const { 207 // Not allowed for invalid URLs. 208 if (!is_valid_) 209 return GURL(); 210 211 GURL result; 212 213 // Reserve enough room in the output for the input, plus some extra so that 214 // we have room if we have to escape a few things without reallocating. 215 result.spec_.reserve(spec_.size() + 32); 216 url_canon::StdStringCanonOutput output(&result.spec_); 217 218 if (!url_util::ResolveRelative( 219 spec_.data(), static_cast<int>(spec_.length()), parsed_, 220 relative.data(), static_cast<int>(relative.length()), 221 charset_converter, &output, &result.parsed_)) { 222 // Error resolving, return an empty URL. 223 return GURL(); 224 } 225 226 output.Complete(); 227 result.is_valid_ = true; 228 if (result.SchemeIsFileSystem()) { 229 result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(), 230 *result.parsed_.inner_parsed(), true); 231 } 232 return result; 233 } 234 235 // Note: code duplicated above (it's inconvenient to use a template here). 236 GURL GURL::ResolveWithCharsetConverter( 237 const base::string16& relative, 238 url_canon::CharsetConverter* charset_converter) const { 239 // Not allowed for invalid URLs. 240 if (!is_valid_) 241 return GURL(); 242 243 GURL result; 244 245 // Reserve enough room in the output for the input, plus some extra so that 246 // we have room if we have to escape a few things without reallocating. 247 result.spec_.reserve(spec_.size() + 32); 248 url_canon::StdStringCanonOutput output(&result.spec_); 249 250 if (!url_util::ResolveRelative( 251 spec_.data(), static_cast<int>(spec_.length()), parsed_, 252 relative.data(), static_cast<int>(relative.length()), 253 charset_converter, &output, &result.parsed_)) { 254 // Error resolving, return an empty URL. 255 return GURL(); 256 } 257 258 output.Complete(); 259 result.is_valid_ = true; 260 if (result.SchemeIsFileSystem()) { 261 result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(), 262 *result.parsed_.inner_parsed(), true); 263 } 264 return result; 265 } 266 267 // Note: code duplicated below (it's inconvenient to use a template here). 268 GURL GURL::ReplaceComponents( 269 const url_canon::Replacements<char>& replacements) const { 270 GURL result; 271 272 // Not allowed for invalid URLs. 273 if (!is_valid_) 274 return GURL(); 275 276 // Reserve enough room in the output for the input, plus some extra so that 277 // we have room if we have to escape a few things without reallocating. 278 result.spec_.reserve(spec_.size() + 32); 279 url_canon::StdStringCanonOutput output(&result.spec_); 280 281 result.is_valid_ = url_util::ReplaceComponents( 282 spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, 283 NULL, &output, &result.parsed_); 284 285 output.Complete(); 286 if (result.is_valid_ && result.SchemeIsFileSystem()) { 287 result.inner_url_ = new GURL(spec_.data(), result.parsed_.Length(), 288 *result.parsed_.inner_parsed(), true); 289 } 290 return result; 291 } 292 293 // Note: code duplicated above (it's inconvenient to use a template here). 294 GURL GURL::ReplaceComponents( 295 const url_canon::Replacements<base::char16>& replacements) const { 296 GURL result; 297 298 // Not allowed for invalid URLs. 299 if (!is_valid_) 300 return GURL(); 301 302 // Reserve enough room in the output for the input, plus some extra so that 303 // we have room if we have to escape a few things without reallocating. 304 result.spec_.reserve(spec_.size() + 32); 305 url_canon::StdStringCanonOutput output(&result.spec_); 306 307 result.is_valid_ = url_util::ReplaceComponents( 308 spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, 309 NULL, &output, &result.parsed_); 310 311 output.Complete(); 312 if (result.is_valid_ && result.SchemeIsFileSystem()) { 313 result.inner_url_ = new GURL(spec_.data(), result.parsed_.Length(), 314 *result.parsed_.inner_parsed(), true); 315 } 316 return result; 317 } 318 319 GURL GURL::GetOrigin() const { 320 // This doesn't make sense for invalid or nonstandard URLs, so return 321 // the empty URL 322 if (!is_valid_ || !IsStandard()) 323 return GURL(); 324 325 if (SchemeIsFileSystem()) 326 return inner_url_->GetOrigin(); 327 328 url_canon::Replacements<char> replacements; 329 replacements.ClearUsername(); 330 replacements.ClearPassword(); 331 replacements.ClearPath(); 332 replacements.ClearQuery(); 333 replacements.ClearRef(); 334 335 return ReplaceComponents(replacements); 336 } 337 338 GURL GURL::GetWithEmptyPath() const { 339 // This doesn't make sense for invalid or nonstandard URLs, so return 340 // the empty URL. 341 if (!is_valid_ || !IsStandard()) 342 return GURL(); 343 344 // We could optimize this since we know that the URL is canonical, and we are 345 // appending a canonical path, so avoiding re-parsing. 346 GURL other(*this); 347 if (parsed_.path.len == 0) 348 return other; 349 350 // Clear everything after the path. 351 other.parsed_.query.reset(); 352 other.parsed_.ref.reset(); 353 354 // Set the path, since the path is longer than one, we can just set the 355 // first character and resize. 356 other.spec_[other.parsed_.path.begin] = '/'; 357 other.parsed_.path.len = 1; 358 other.spec_.resize(other.parsed_.path.begin + 1); 359 return other; 360 } 361 362 bool GURL::IsStandard() const { 363 return url_util::IsStandard(spec_.data(), parsed_.scheme); 364 } 365 366 bool GURL::SchemeIs(const char* lower_ascii_scheme) const { 367 if (parsed_.scheme.len <= 0) 368 return lower_ascii_scheme == NULL; 369 return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin, 370 spec_.data() + parsed_.scheme.end(), 371 lower_ascii_scheme); 372 } 373 374 int GURL::IntPort() const { 375 if (parsed_.port.is_nonempty()) 376 return url_parse::ParsePort(spec_.data(), parsed_.port); 377 return url_parse::PORT_UNSPECIFIED; 378 } 379 380 int GURL::EffectiveIntPort() const { 381 int int_port = IntPort(); 382 if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard()) 383 return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin, 384 parsed_.scheme.len); 385 return int_port; 386 } 387 388 std::string GURL::ExtractFileName() const { 389 url_parse::Component file_component; 390 url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component); 391 return ComponentString(file_component); 392 } 393 394 std::string GURL::PathForRequest() const { 395 DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty"; 396 if (parsed_.ref.len >= 0) { 397 // Clip off the reference when it exists. The reference starts after the # 398 // sign, so we have to subtract one to also remove it. 399 return std::string(spec_, parsed_.path.begin, 400 parsed_.ref.begin - parsed_.path.begin - 1); 401 } 402 // Compute the actual path length, rather than depending on the spec's 403 // terminator. If we're an inner_url, our spec continues on into our outer 404 // url's path/query/ref. 405 int path_len = parsed_.path.len; 406 if (parsed_.query.is_valid()) 407 path_len = parsed_.query.end() - parsed_.path.begin; 408 409 return std::string(spec_, parsed_.path.begin, path_len); 410 } 411 412 std::string GURL::HostNoBrackets() const { 413 // If host looks like an IPv6 literal, strip the square brackets. 414 url_parse::Component h(parsed_.host); 415 if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') { 416 h.begin++; 417 h.len -= 2; 418 } 419 return ComponentString(h); 420 } 421 422 std::string GURL::GetContent() const { 423 return is_valid_ ? ComponentString(parsed_.GetContent()) : ""; 424 } 425 426 bool GURL::HostIsIPAddress() const { 427 if (!is_valid_ || spec_.empty()) 428 return false; 429 430 url_canon::RawCanonOutputT<char, 128> ignored_output; 431 url_canon::CanonHostInfo host_info; 432 url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, 433 &ignored_output, &host_info); 434 return host_info.IsIPAddress(); 435 } 436 437 #ifdef WIN32 438 439 const GURL& GURL::EmptyGURL() { 440 // Avoid static object construction/destruction on startup/shutdown. 441 if (!empty_gurl) { 442 // Create the string. Be careful that we don't break in the case that this 443 // is being called from multiple threads. 444 GURL* new_empty_gurl = new GURL; 445 if (InterlockedCompareExchangePointer( 446 reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) { 447 // The old value was non-NULL, so no replacement was done. Another 448 // thread did the initialization out from under us. 449 delete new_empty_gurl; 450 } 451 } 452 return *empty_gurl; 453 } 454 455 #else 456 457 void EmptyGURLOnce(void) { 458 empty_gurl = new GURL; 459 } 460 461 const GURL& GURL::EmptyGURL() { 462 // Avoid static object construction/destruction on startup/shutdown. 463 pthread_once(&empty_gurl_once, EmptyGURLOnce); 464 return *empty_gurl; 465 } 466 467 #endif // WIN32 468 469 bool GURL::DomainIs(const char* lower_ascii_domain, 470 int domain_len) const { 471 // Return false if this URL is not valid or domain is empty. 472 if (!is_valid_ || !domain_len) 473 return false; 474 475 // FileSystem URLs have empty parsed_.host, so check this first. 476 if (SchemeIsFileSystem() && inner_url_) 477 return inner_url_->DomainIs(lower_ascii_domain, domain_len); 478 479 if (!parsed_.host.is_nonempty()) 480 return false; 481 482 // Check whether the host name is end with a dot. If yes, treat it 483 // the same as no-dot unless the input comparison domain is end 484 // with dot. 485 const char* last_pos = spec_.data() + parsed_.host.end() - 1; 486 int host_len = parsed_.host.len; 487 if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) { 488 last_pos--; 489 host_len--; 490 } 491 492 // Return false if host's length is less than domain's length. 493 if (host_len < domain_len) 494 return false; 495 496 // Compare this url whether belong specific domain. 497 const char* start_pos = spec_.data() + parsed_.host.begin + 498 host_len - domain_len; 499 500 if (!url_util::LowerCaseEqualsASCII(start_pos, 501 last_pos + 1, 502 lower_ascii_domain, 503 lower_ascii_domain + domain_len)) 504 return false; 505 506 // Check whether host has right domain start with dot, make sure we got 507 // right domain range. For example www.google.com has domain 508 // "google.com" but www.iamnotgoogle.com does not. 509 if ('.' != lower_ascii_domain[0] && host_len > domain_len && 510 '.' != *(start_pos - 1)) 511 return false; 512 513 return true; 514 } 515 516 void GURL::Swap(GURL* other) { 517 spec_.swap(other->spec_); 518 std::swap(is_valid_, other->is_valid_); 519 std::swap(parsed_, other->parsed_); 520 std::swap(inner_url_, other->inner_url_); 521 } 522 523 std::ostream& operator<<(std::ostream& out, const GURL& url) { 524 return out << url.possibly_invalid_spec(); 525 } 526