Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifdef WIN32
      6 #include <windows.h>
      7 #else
      8 #include <pthread.h>
      9 #endif
     10 
     11 #include <algorithm>
     12 #include <ostream>
     13 
     14 #include "url/gurl.h"
     15 
     16 #include "base/logging.h"
     17 #include "url/url_canon_stdstring.h"
     18 #include "url/url_util.h"
     19 
     20 namespace {
     21 
     22 // External template that can handle initialization of either character type.
     23 // The input spec is given, and the canonical version will be placed in
     24 // |*canonical|, along with the parsing of the canonical spec in |*parsed|.
     25 template<typename STR>
     26 bool InitCanonical(const STR& input_spec,
     27                    std::string* canonical,
     28                    url_parse::Parsed* parsed) {
     29   // Reserve enough room in the output for the input, plus some extra so that
     30   // we have room if we have to escape a few things without reallocating.
     31   canonical->reserve(input_spec.size() + 32);
     32   url_canon::StdStringCanonOutput output(canonical);
     33   bool success = url_util::Canonicalize(
     34       input_spec.data(), static_cast<int>(input_spec.length()),
     35       NULL, &output, parsed);
     36 
     37   output.Complete();  // Must be done before using string.
     38   return success;
     39 }
     40 
// Lazily created singletons returned by EmptyStringForGURL() and
// GURL::EmptyGURL(). Both start out NULL and are allocated on first use.
static std::string* empty_string = NULL;
static GURL* empty_gurl = NULL;
     43 
     44 #ifdef WIN32
     45 
     46 // Returns a static reference to an empty string for returning a reference
     47 // when there is no underlying string.
     48 const std::string& EmptyStringForGURL() {
     49   // Avoid static object construction/destruction on startup/shutdown.
     50   if (!empty_string) {
     51     // Create the string. Be careful that we don't break in the case that this
     52     // is being called from multiple threads. Statics are not threadsafe.
     53     std::string* new_empty_string = new std::string;
     54     if (InterlockedCompareExchangePointer(
     55         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
     56       // The old value was non-NULL, so no replacement was done. Another
     57       // thread did the initialization out from under us.
     58       delete new_empty_string;
     59     }
     60   }
     61   return *empty_string;
     62 }
     63 
     64 #else
     65 
// pthread_once() control objects guarding the one-time allocation of
// |empty_string| and |empty_gurl| respectively.
static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
     68 
     69 void EmptyStringForGURLOnce(void) {
     70   empty_string = new std::string;
     71 }
     72 
     73 const std::string& EmptyStringForGURL() {
     74   // Avoid static object construction/destruction on startup/shutdown.
     75   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
     76   return *empty_string;
     77 }
     78 
     79 #endif  // WIN32
     80 
     81 } // namespace
     82 
     83 GURL::GURL() : is_valid_(false), inner_url_(NULL) {
     84 }
     85 
     86 GURL::GURL(const GURL& other)
     87     : spec_(other.spec_),
     88       is_valid_(other.is_valid_),
     89       parsed_(other.parsed_),
     90       inner_url_(NULL) {
     91   if (other.inner_url_)
     92     inner_url_ = new GURL(*other.inner_url_);
     93   // Valid filesystem urls should always have an inner_url_.
     94   DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
     95 }
     96 
     97 GURL::GURL(const std::string& url_string) : inner_url_(NULL) {
     98   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
     99   if (is_valid_ && SchemeIsFileSystem()) {
    100     inner_url_ =
    101         new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
    102   }
    103 }
    104 
    105 GURL::GURL(const base::string16& url_string) : inner_url_(NULL) {
    106   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
    107   if (is_valid_ && SchemeIsFileSystem()) {
    108     inner_url_ =
    109         new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
    110   }
    111 }
    112 
    113 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
    114            const url_parse::Parsed& parsed, bool is_valid)
    115     : spec_(canonical_spec, canonical_spec_len),
    116       is_valid_(is_valid),
    117       parsed_(parsed),
    118       inner_url_(NULL) {
    119   InitializeFromCanonicalSpec();
    120 }
    121 
    122 GURL::GURL(std::string canonical_spec,
    123            const url_parse::Parsed& parsed, bool is_valid)
    124     : is_valid_(is_valid),
    125       parsed_(parsed),
    126       inner_url_(NULL) {
    127   spec_.swap(canonical_spec);
    128   InitializeFromCanonicalSpec();
    129 }
    130 
    131 void GURL::InitializeFromCanonicalSpec() {
    132   if (is_valid_ && SchemeIsFileSystem()) {
    133     inner_url_ =
    134         new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
    135   }
    136 
    137 #ifndef NDEBUG
    138   // For testing purposes, check that the parsed canonical URL is identical to
    139   // what we would have produced. Skip checking for invalid URLs have no meaning
    140   // and we can't always canonicalize them reproducibly.
    141   // Skip checking for non-standard URLs as they may have trailing white-space
    142   // and we can't always canonicalize them exactly. TODO(joth): see if we
    143   // can do a better job on this e.g. by not stripping trailing white-space
    144   // for non-standard URLs in this validation path. http://crbug.com/291747.
    145   if (is_valid_ && IsStandard()) {
    146     url_parse::Component scheme;
    147     if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(),
    148                                         "filesystem", &scheme) ||
    149         scheme.begin == parsed_.scheme.begin) {
    150       // We can't do this check on the inner_url of a filesystem URL, as
    151       // canonical_spec actually points to the start of the outer URL, so we'd
    152       // end up with infinite recursion in this constructor.
    153       GURL test_url(spec_);
    154 
    155       DCHECK(test_url.is_valid_ == is_valid_);
    156       DCHECK(test_url.spec_ == spec_);
    157 
    158       DCHECK(test_url.parsed_.scheme == parsed_.scheme);
    159       DCHECK(test_url.parsed_.username == parsed_.username);
    160       DCHECK(test_url.parsed_.password == parsed_.password);
    161       DCHECK(test_url.parsed_.host == parsed_.host);
    162       DCHECK(test_url.parsed_.port == parsed_.port);
    163       DCHECK(test_url.parsed_.path == parsed_.path);
    164       DCHECK(test_url.parsed_.query == parsed_.query);
    165       DCHECK(test_url.parsed_.ref == parsed_.ref);
    166     }
    167   }
    168 #endif
    169 }
    170 
    171 GURL::~GURL() {
    172   delete inner_url_;
    173 }
    174 
    175 GURL& GURL::operator=(const GURL& other) {
    176   spec_ = other.spec_;
    177   is_valid_ = other.is_valid_;
    178   parsed_ = other.parsed_;
    179   delete inner_url_;
    180   inner_url_ = NULL;
    181   if (other.inner_url_)
    182     inner_url_ = new GURL(*other.inner_url_);
    183   // Valid filesystem urls should always have an inner_url_.
    184   DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
    185   return *this;
    186 }
    187 
    188 const std::string& GURL::spec() const {
    189   if (is_valid_ || spec_.empty())
    190     return spec_;
    191 
    192   DCHECK(false) << "Trying to get the spec of an invalid URL!";
    193   return EmptyStringForGURL();
    194 }
    195 
    196 GURL GURL::Resolve(const std::string& relative) const {
    197   return ResolveWithCharsetConverter(relative, NULL);
    198 }
    199 GURL GURL::Resolve(const base::string16& relative) const {
    200   return ResolveWithCharsetConverter(relative, NULL);
    201 }
    202 
    203 // Note: code duplicated below (it's inconvenient to use a template here).
    204 GURL GURL::ResolveWithCharsetConverter(
    205     const std::string& relative,
    206     url_canon::CharsetConverter* charset_converter) const {
    207   // Not allowed for invalid URLs.
    208   if (!is_valid_)
    209     return GURL();
    210 
    211   GURL result;
    212 
    213   // Reserve enough room in the output for the input, plus some extra so that
    214   // we have room if we have to escape a few things without reallocating.
    215   result.spec_.reserve(spec_.size() + 32);
    216   url_canon::StdStringCanonOutput output(&result.spec_);
    217 
    218   if (!url_util::ResolveRelative(
    219           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    220           relative.data(), static_cast<int>(relative.length()),
    221           charset_converter, &output, &result.parsed_)) {
    222     // Error resolving, return an empty URL.
    223     return GURL();
    224   }
    225 
    226   output.Complete();
    227   result.is_valid_ = true;
    228   if (result.SchemeIsFileSystem()) {
    229     result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(),
    230                                  *result.parsed_.inner_parsed(), true);
    231   }
    232   return result;
    233 }
    234 
    235 // Note: code duplicated above (it's inconvenient to use a template here).
    236 GURL GURL::ResolveWithCharsetConverter(
    237     const base::string16& relative,
    238     url_canon::CharsetConverter* charset_converter) const {
    239   // Not allowed for invalid URLs.
    240   if (!is_valid_)
    241     return GURL();
    242 
    243   GURL result;
    244 
    245   // Reserve enough room in the output for the input, plus some extra so that
    246   // we have room if we have to escape a few things without reallocating.
    247   result.spec_.reserve(spec_.size() + 32);
    248   url_canon::StdStringCanonOutput output(&result.spec_);
    249 
    250   if (!url_util::ResolveRelative(
    251           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    252           relative.data(), static_cast<int>(relative.length()),
    253           charset_converter, &output, &result.parsed_)) {
    254     // Error resolving, return an empty URL.
    255     return GURL();
    256   }
    257 
    258   output.Complete();
    259   result.is_valid_ = true;
    260   if (result.SchemeIsFileSystem()) {
    261     result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(),
    262                                  *result.parsed_.inner_parsed(), true);
    263   }
    264   return result;
    265 }
    266 
    267 // Note: code duplicated below (it's inconvenient to use a template here).
    268 GURL GURL::ReplaceComponents(
    269     const url_canon::Replacements<char>& replacements) const {
    270   GURL result;
    271 
    272   // Not allowed for invalid URLs.
    273   if (!is_valid_)
    274     return GURL();
    275 
    276   // Reserve enough room in the output for the input, plus some extra so that
    277   // we have room if we have to escape a few things without reallocating.
    278   result.spec_.reserve(spec_.size() + 32);
    279   url_canon::StdStringCanonOutput output(&result.spec_);
    280 
    281   result.is_valid_ = url_util::ReplaceComponents(
    282       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    283       NULL, &output, &result.parsed_);
    284 
    285   output.Complete();
    286   if (result.is_valid_ && result.SchemeIsFileSystem()) {
    287     result.inner_url_ = new GURL(spec_.data(), result.parsed_.Length(),
    288                                  *result.parsed_.inner_parsed(), true);
    289   }
    290   return result;
    291 }
    292 
    293 // Note: code duplicated above (it's inconvenient to use a template here).
    294 GURL GURL::ReplaceComponents(
    295     const url_canon::Replacements<base::char16>& replacements) const {
    296   GURL result;
    297 
    298   // Not allowed for invalid URLs.
    299   if (!is_valid_)
    300     return GURL();
    301 
    302   // Reserve enough room in the output for the input, plus some extra so that
    303   // we have room if we have to escape a few things without reallocating.
    304   result.spec_.reserve(spec_.size() + 32);
    305   url_canon::StdStringCanonOutput output(&result.spec_);
    306 
    307   result.is_valid_ = url_util::ReplaceComponents(
    308       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    309       NULL, &output, &result.parsed_);
    310 
    311   output.Complete();
    312   if (result.is_valid_ && result.SchemeIsFileSystem()) {
    313     result.inner_url_ = new GURL(spec_.data(), result.parsed_.Length(),
    314                                  *result.parsed_.inner_parsed(), true);
    315   }
    316   return result;
    317 }
    318 
    319 GURL GURL::GetOrigin() const {
    320   // This doesn't make sense for invalid or nonstandard URLs, so return
    321   // the empty URL
    322   if (!is_valid_ || !IsStandard())
    323     return GURL();
    324 
    325   if (SchemeIsFileSystem())
    326     return inner_url_->GetOrigin();
    327 
    328   url_canon::Replacements<char> replacements;
    329   replacements.ClearUsername();
    330   replacements.ClearPassword();
    331   replacements.ClearPath();
    332   replacements.ClearQuery();
    333   replacements.ClearRef();
    334 
    335   return ReplaceComponents(replacements);
    336 }
    337 
    338 GURL GURL::GetWithEmptyPath() const {
    339   // This doesn't make sense for invalid or nonstandard URLs, so return
    340   // the empty URL.
    341   if (!is_valid_ || !IsStandard())
    342     return GURL();
    343 
    344   // We could optimize this since we know that the URL is canonical, and we are
    345   // appending a canonical path, so avoiding re-parsing.
    346   GURL other(*this);
    347   if (parsed_.path.len == 0)
    348     return other;
    349 
    350   // Clear everything after the path.
    351   other.parsed_.query.reset();
    352   other.parsed_.ref.reset();
    353 
    354   // Set the path, since the path is longer than one, we can just set the
    355   // first character and resize.
    356   other.spec_[other.parsed_.path.begin] = '/';
    357   other.parsed_.path.len = 1;
    358   other.spec_.resize(other.parsed_.path.begin + 1);
    359   return other;
    360 }
    361 
    362 bool GURL::IsStandard() const {
    363   return url_util::IsStandard(spec_.data(), parsed_.scheme);
    364 }
    365 
    366 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
    367   if (parsed_.scheme.len <= 0)
    368     return lower_ascii_scheme == NULL;
    369   return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
    370                                         spec_.data() + parsed_.scheme.end(),
    371                                         lower_ascii_scheme);
    372 }
    373 
    374 int GURL::IntPort() const {
    375   if (parsed_.port.is_nonempty())
    376     return url_parse::ParsePort(spec_.data(), parsed_.port);
    377   return url_parse::PORT_UNSPECIFIED;
    378 }
    379 
    380 int GURL::EffectiveIntPort() const {
    381   int int_port = IntPort();
    382   if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
    383     return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
    384                                            parsed_.scheme.len);
    385   return int_port;
    386 }
    387 
    388 std::string GURL::ExtractFileName() const {
    389   url_parse::Component file_component;
    390   url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
    391   return ComponentString(file_component);
    392 }
    393 
    394 std::string GURL::PathForRequest() const {
    395   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
    396   if (parsed_.ref.len >= 0) {
    397     // Clip off the reference when it exists. The reference starts after the #
    398     // sign, so we have to subtract one to also remove it.
    399     return std::string(spec_, parsed_.path.begin,
    400                        parsed_.ref.begin - parsed_.path.begin - 1);
    401   }
    402   // Compute the actual path length, rather than depending on the spec's
    403   // terminator.  If we're an inner_url, our spec continues on into our outer
    404   // url's path/query/ref.
    405   int path_len = parsed_.path.len;
    406   if (parsed_.query.is_valid())
    407     path_len = parsed_.query.end() - parsed_.path.begin;
    408 
    409   return std::string(spec_, parsed_.path.begin, path_len);
    410 }
    411 
    412 std::string GURL::HostNoBrackets() const {
    413   // If host looks like an IPv6 literal, strip the square brackets.
    414   url_parse::Component h(parsed_.host);
    415   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
    416     h.begin++;
    417     h.len -= 2;
    418   }
    419   return ComponentString(h);
    420 }
    421 
    422 std::string GURL::GetContent() const {
    423   return is_valid_ ? ComponentString(parsed_.GetContent()) : "";
    424 }
    425 
    426 bool GURL::HostIsIPAddress() const {
    427   if (!is_valid_ || spec_.empty())
    428      return false;
    429 
    430   url_canon::RawCanonOutputT<char, 128> ignored_output;
    431   url_canon::CanonHostInfo host_info;
    432   url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
    433                                    &ignored_output, &host_info);
    434   return host_info.IsIPAddress();
    435 }
    436 
    437 #ifdef WIN32
    438 
    439 const GURL& GURL::EmptyGURL() {
    440   // Avoid static object construction/destruction on startup/shutdown.
    441   if (!empty_gurl) {
    442     // Create the string. Be careful that we don't break in the case that this
    443     // is being called from multiple threads.
    444     GURL* new_empty_gurl = new GURL;
    445     if (InterlockedCompareExchangePointer(
    446         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
    447       // The old value was non-NULL, so no replacement was done. Another
    448       // thread did the initialization out from under us.
    449       delete new_empty_gurl;
    450     }
    451   }
    452   return *empty_gurl;
    453 }
    454 
    455 #else
    456 
    457 void EmptyGURLOnce(void) {
    458   empty_gurl = new GURL;
    459 }
    460 
    461 const GURL& GURL::EmptyGURL() {
    462   // Avoid static object construction/destruction on startup/shutdown.
    463   pthread_once(&empty_gurl_once, EmptyGURLOnce);
    464   return *empty_gurl;
    465 }
    466 
    467 #endif  // WIN32
    468 
    469 bool GURL::DomainIs(const char* lower_ascii_domain,
    470                     int domain_len) const {
    471   // Return false if this URL is not valid or domain is empty.
    472   if (!is_valid_ || !domain_len)
    473     return false;
    474 
    475   // FileSystem URLs have empty parsed_.host, so check this first.
    476   if (SchemeIsFileSystem() && inner_url_)
    477     return inner_url_->DomainIs(lower_ascii_domain, domain_len);
    478 
    479   if (!parsed_.host.is_nonempty())
    480     return false;
    481 
    482   // Check whether the host name is end with a dot. If yes, treat it
    483   // the same as no-dot unless the input comparison domain is end
    484   // with dot.
    485   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
    486   int host_len = parsed_.host.len;
    487   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
    488     last_pos--;
    489     host_len--;
    490   }
    491 
    492   // Return false if host's length is less than domain's length.
    493   if (host_len < domain_len)
    494     return false;
    495 
    496   // Compare this url whether belong specific domain.
    497   const char* start_pos = spec_.data() + parsed_.host.begin +
    498                           host_len - domain_len;
    499 
    500   if (!url_util::LowerCaseEqualsASCII(start_pos,
    501                                       last_pos + 1,
    502                                       lower_ascii_domain,
    503                                       lower_ascii_domain + domain_len))
    504     return false;
    505 
    506   // Check whether host has right domain start with dot, make sure we got
    507   // right domain range. For example www.google.com has domain
    508   // "google.com" but www.iamnotgoogle.com does not.
    509   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
    510       '.' != *(start_pos - 1))
    511     return false;
    512 
    513   return true;
    514 }
    515 
    516 void GURL::Swap(GURL* other) {
    517   spec_.swap(other->spec_);
    518   std::swap(is_valid_, other->is_valid_);
    519   std::swap(parsed_, other->parsed_);
    520   std::swap(inner_url_, other->inner_url_);
    521 }
    522 
    523 std::ostream& operator<<(std::ostream& out, const GURL& url) {
    524   return out << url.possibly_invalid_spec();
    525 }
    526