Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifdef WIN32
      6 #include <windows.h>
      7 #else
      8 #include <pthread.h>
      9 #endif
     10 
     11 #include <algorithm>
     12 #include <ostream>
     13 
     14 #include "url/gurl.h"
     15 
     16 #include "base/logging.h"
     17 #include "url/url_canon_stdstring.h"
     18 #include "url/url_util.h"
     19 
     20 namespace {
     21 
     22 static std::string* empty_string = NULL;
     23 static GURL* empty_gurl = NULL;
     24 
     25 #ifdef WIN32
     26 
     27 // Returns a static reference to an empty string for returning a reference
     28 // when there is no underlying string.
     29 const std::string& EmptyStringForGURL() {
     30   // Avoid static object construction/destruction on startup/shutdown.
     31   if (!empty_string) {
     32     // Create the string. Be careful that we don't break in the case that this
     33     // is being called from multiple threads. Statics are not threadsafe.
     34     std::string* new_empty_string = new std::string;
     35     if (InterlockedCompareExchangePointer(
     36         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
     37       // The old value was non-NULL, so no replacement was done. Another
     38       // thread did the initialization out from under us.
     39       delete new_empty_string;
     40     }
     41   }
     42   return *empty_string;
     43 }
     44 
     45 #else
     46 
     47 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
     48 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
     49 
     50 void EmptyStringForGURLOnce(void) {
     51   empty_string = new std::string;
     52 }
     53 
     54 const std::string& EmptyStringForGURL() {
     55   // Avoid static object construction/destruction on startup/shutdown.
     56   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
     57   return *empty_string;
     58 }
     59 
     60 #endif  // WIN32
     61 
     62 } // namespace
     63 
     64 GURL::GURL() : is_valid_(false) {
     65 }
     66 
     67 GURL::GURL(const GURL& other)
     68     : spec_(other.spec_),
     69       is_valid_(other.is_valid_),
     70       parsed_(other.parsed_) {
     71   if (other.inner_url_)
     72     inner_url_.reset(new GURL(*other.inner_url_));
     73   // Valid filesystem urls should always have an inner_url_.
     74   DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
     75 }
     76 
     77 GURL::GURL(const std::string& url_string) {
     78   InitCanonical(url_string, true);
     79 }
     80 
     81 GURL::GURL(const base::string16& url_string) {
     82   InitCanonical(url_string, true);
     83 }
     84 
     85 GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
     86   InitCanonical(url_string, false);
     87 }
     88 
     89 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
     90            const url_parse::Parsed& parsed, bool is_valid)
     91     : spec_(canonical_spec, canonical_spec_len),
     92       is_valid_(is_valid),
     93       parsed_(parsed) {
     94   InitializeFromCanonicalSpec();
     95 }
     96 
     97 GURL::GURL(std::string canonical_spec,
     98            const url_parse::Parsed& parsed, bool is_valid)
     99     : is_valid_(is_valid),
    100       parsed_(parsed) {
    101   spec_.swap(canonical_spec);
    102   InitializeFromCanonicalSpec();
    103 }
    104 
    105 template<typename STR>
    106 void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
    107   // Reserve enough room in the output for the input, plus some extra so that
    108   // we have room if we have to escape a few things without reallocating.
    109   spec_.reserve(input_spec.size() + 32);
    110   url_canon::StdStringCanonOutput output(&spec_);
    111   is_valid_ = url_util::Canonicalize(
    112       input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
    113       NULL, &output, &parsed_);
    114 
    115   output.Complete();  // Must be done before using string.
    116   if (is_valid_ && SchemeIsFileSystem()) {
    117     inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
    118                               *parsed_.inner_parsed(), true));
    119   }
    120 }
    121 
    122 void GURL::InitializeFromCanonicalSpec() {
    123   if (is_valid_ && SchemeIsFileSystem()) {
    124     inner_url_.reset(
    125         new GURL(spec_.data(), parsed_.Length(),
    126                  *parsed_.inner_parsed(), true));
    127   }
    128 
    129 #ifndef NDEBUG
    130   // For testing purposes, check that the parsed canonical URL is identical to
    131   // what we would have produced. Skip checking for invalid URLs have no meaning
    132   // and we can't always canonicalize then reproducabely.
    133   if (is_valid_) {
    134     url_parse::Component scheme;
    135     // We can't do this check on the inner_url of a filesystem URL, as
    136     // canonical_spec actually points to the start of the outer URL, so we'd
    137     // end up with infinite recursion in this constructor.
    138     if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(),
    139                                         "filesystem", &scheme) ||
    140         scheme.begin == parsed_.scheme.begin) {
    141       // We need to retain trailing whitespace on path URLs, as the |parsed_|
    142       // spec we originally received may legitimately contain trailing white-
    143       // space on the path or  components e.g. if the #ref has been
    144       // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
    145       GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
    146 
    147       DCHECK(test_url.is_valid_ == is_valid_);
    148       DCHECK(test_url.spec_ == spec_);
    149 
    150       DCHECK(test_url.parsed_.scheme == parsed_.scheme);
    151       DCHECK(test_url.parsed_.username == parsed_.username);
    152       DCHECK(test_url.parsed_.password == parsed_.password);
    153       DCHECK(test_url.parsed_.host == parsed_.host);
    154       DCHECK(test_url.parsed_.port == parsed_.port);
    155       DCHECK(test_url.parsed_.path == parsed_.path);
    156       DCHECK(test_url.parsed_.query == parsed_.query);
    157       DCHECK(test_url.parsed_.ref == parsed_.ref);
    158     }
    159   }
    160 #endif
    161 }
    162 
    163 GURL::~GURL() {
    164 }
    165 
    166 GURL& GURL::operator=(GURL other) {
    167   Swap(&other);
    168   return *this;
    169 }
    170 
    171 const std::string& GURL::spec() const {
    172   if (is_valid_ || spec_.empty())
    173     return spec_;
    174 
    175   DCHECK(false) << "Trying to get the spec of an invalid URL!";
    176   return EmptyStringForGURL();
    177 }
    178 
    179 GURL GURL::Resolve(const std::string& relative) const {
    180   return ResolveWithCharsetConverter(relative, NULL);
    181 }
    182 GURL GURL::Resolve(const base::string16& relative) const {
    183   return ResolveWithCharsetConverter(relative, NULL);
    184 }
    185 
    186 // Note: code duplicated below (it's inconvenient to use a template here).
    187 GURL GURL::ResolveWithCharsetConverter(
    188     const std::string& relative,
    189     url_canon::CharsetConverter* charset_converter) const {
    190   // Not allowed for invalid URLs.
    191   if (!is_valid_)
    192     return GURL();
    193 
    194   GURL result;
    195 
    196   // Reserve enough room in the output for the input, plus some extra so that
    197   // we have room if we have to escape a few things without reallocating.
    198   result.spec_.reserve(spec_.size() + 32);
    199   url_canon::StdStringCanonOutput output(&result.spec_);
    200 
    201   if (!url_util::ResolveRelative(
    202           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    203           relative.data(), static_cast<int>(relative.length()),
    204           charset_converter, &output, &result.parsed_)) {
    205     // Error resolving, return an empty URL.
    206     return GURL();
    207   }
    208 
    209   output.Complete();
    210   result.is_valid_ = true;
    211   if (result.SchemeIsFileSystem()) {
    212     result.inner_url_.reset(
    213         new GURL(result.spec_.data(), result.parsed_.Length(),
    214                  *result.parsed_.inner_parsed(), true));
    215   }
    216   return result;
    217 }
    218 
    219 // Note: code duplicated above (it's inconvenient to use a template here).
    220 GURL GURL::ResolveWithCharsetConverter(
    221     const base::string16& relative,
    222     url_canon::CharsetConverter* charset_converter) const {
    223   // Not allowed for invalid URLs.
    224   if (!is_valid_)
    225     return GURL();
    226 
    227   GURL result;
    228 
    229   // Reserve enough room in the output for the input, plus some extra so that
    230   // we have room if we have to escape a few things without reallocating.
    231   result.spec_.reserve(spec_.size() + 32);
    232   url_canon::StdStringCanonOutput output(&result.spec_);
    233 
    234   if (!url_util::ResolveRelative(
    235           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    236           relative.data(), static_cast<int>(relative.length()),
    237           charset_converter, &output, &result.parsed_)) {
    238     // Error resolving, return an empty URL.
    239     return GURL();
    240   }
    241 
    242   output.Complete();
    243   result.is_valid_ = true;
    244   if (result.SchemeIsFileSystem()) {
    245     result.inner_url_.reset(
    246         new GURL(result.spec_.data(), result.parsed_.Length(),
    247                  *result.parsed_.inner_parsed(), true));
    248   }
    249   return result;
    250 }
    251 
    252 // Note: code duplicated below (it's inconvenient to use a template here).
    253 GURL GURL::ReplaceComponents(
    254     const url_canon::Replacements<char>& replacements) const {
    255   GURL result;
    256 
    257   // Not allowed for invalid URLs.
    258   if (!is_valid_)
    259     return GURL();
    260 
    261   // Reserve enough room in the output for the input, plus some extra so that
    262   // we have room if we have to escape a few things without reallocating.
    263   result.spec_.reserve(spec_.size() + 32);
    264   url_canon::StdStringCanonOutput output(&result.spec_);
    265 
    266   result.is_valid_ = url_util::ReplaceComponents(
    267       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    268       NULL, &output, &result.parsed_);
    269 
    270   output.Complete();
    271   if (result.is_valid_ && result.SchemeIsFileSystem()) {
    272     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
    273                                      *result.parsed_.inner_parsed(), true));
    274   }
    275   return result;
    276 }
    277 
    278 // Note: code duplicated above (it's inconvenient to use a template here).
    279 GURL GURL::ReplaceComponents(
    280     const url_canon::Replacements<base::char16>& replacements) const {
    281   GURL result;
    282 
    283   // Not allowed for invalid URLs.
    284   if (!is_valid_)
    285     return GURL();
    286 
    287   // Reserve enough room in the output for the input, plus some extra so that
    288   // we have room if we have to escape a few things without reallocating.
    289   result.spec_.reserve(spec_.size() + 32);
    290   url_canon::StdStringCanonOutput output(&result.spec_);
    291 
    292   result.is_valid_ = url_util::ReplaceComponents(
    293       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    294       NULL, &output, &result.parsed_);
    295 
    296   output.Complete();
    297   if (result.is_valid_ && result.SchemeIsFileSystem()) {
    298     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
    299                                      *result.parsed_.inner_parsed(), true));
    300   }
    301   return result;
    302 }
    303 
    304 GURL GURL::GetOrigin() const {
    305   // This doesn't make sense for invalid or nonstandard URLs, so return
    306   // the empty URL
    307   if (!is_valid_ || !IsStandard())
    308     return GURL();
    309 
    310   if (SchemeIsFileSystem())
    311     return inner_url_->GetOrigin();
    312 
    313   url_canon::Replacements<char> replacements;
    314   replacements.ClearUsername();
    315   replacements.ClearPassword();
    316   replacements.ClearPath();
    317   replacements.ClearQuery();
    318   replacements.ClearRef();
    319 
    320   return ReplaceComponents(replacements);
    321 }
    322 
    323 GURL GURL::GetWithEmptyPath() const {
    324   // This doesn't make sense for invalid or nonstandard URLs, so return
    325   // the empty URL.
    326   if (!is_valid_ || !IsStandard())
    327     return GURL();
    328 
    329   // We could optimize this since we know that the URL is canonical, and we are
    330   // appending a canonical path, so avoiding re-parsing.
    331   GURL other(*this);
    332   if (parsed_.path.len == 0)
    333     return other;
    334 
    335   // Clear everything after the path.
    336   other.parsed_.query.reset();
    337   other.parsed_.ref.reset();
    338 
    339   // Set the path, since the path is longer than one, we can just set the
    340   // first character and resize.
    341   other.spec_[other.parsed_.path.begin] = '/';
    342   other.parsed_.path.len = 1;
    343   other.spec_.resize(other.parsed_.path.begin + 1);
    344   return other;
    345 }
    346 
    347 bool GURL::IsStandard() const {
    348   return url_util::IsStandard(spec_.data(), parsed_.scheme);
    349 }
    350 
    351 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
    352   if (parsed_.scheme.len <= 0)
    353     return lower_ascii_scheme == NULL;
    354   return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
    355                                         spec_.data() + parsed_.scheme.end(),
    356                                         lower_ascii_scheme);
    357 }
    358 
    359 bool GURL::SchemeIsHTTPOrHTTPS() const {
    360   return SchemeIs("http") || SchemeIs("https");
    361 }
    362 
    363 bool GURL::SchemeIsWSOrWSS() const {
    364   return SchemeIs("ws") || SchemeIs("wss");
    365 }
    366 
    367 int GURL::IntPort() const {
    368   if (parsed_.port.is_nonempty())
    369     return url_parse::ParsePort(spec_.data(), parsed_.port);
    370   return url_parse::PORT_UNSPECIFIED;
    371 }
    372 
    373 int GURL::EffectiveIntPort() const {
    374   int int_port = IntPort();
    375   if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
    376     return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
    377                                            parsed_.scheme.len);
    378   return int_port;
    379 }
    380 
    381 std::string GURL::ExtractFileName() const {
    382   url_parse::Component file_component;
    383   url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
    384   return ComponentString(file_component);
    385 }
    386 
    387 std::string GURL::PathForRequest() const {
    388   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
    389   if (parsed_.ref.len >= 0) {
    390     // Clip off the reference when it exists. The reference starts after the #
    391     // sign, so we have to subtract one to also remove it.
    392     return std::string(spec_, parsed_.path.begin,
    393                        parsed_.ref.begin - parsed_.path.begin - 1);
    394   }
    395   // Compute the actual path length, rather than depending on the spec's
    396   // terminator.  If we're an inner_url, our spec continues on into our outer
    397   // url's path/query/ref.
    398   int path_len = parsed_.path.len;
    399   if (parsed_.query.is_valid())
    400     path_len = parsed_.query.end() - parsed_.path.begin;
    401 
    402   return std::string(spec_, parsed_.path.begin, path_len);
    403 }
    404 
    405 std::string GURL::HostNoBrackets() const {
    406   // If host looks like an IPv6 literal, strip the square brackets.
    407   url_parse::Component h(parsed_.host);
    408   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
    409     h.begin++;
    410     h.len -= 2;
    411   }
    412   return ComponentString(h);
    413 }
    414 
    415 std::string GURL::GetContent() const {
    416   return is_valid_ ? ComponentString(parsed_.GetContent()) : std::string();
    417 }
    418 
    419 bool GURL::HostIsIPAddress() const {
    420   if (!is_valid_ || spec_.empty())
    421      return false;
    422 
    423   url_canon::RawCanonOutputT<char, 128> ignored_output;
    424   url_canon::CanonHostInfo host_info;
    425   url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
    426                                    &ignored_output, &host_info);
    427   return host_info.IsIPAddress();
    428 }
    429 
    430 #ifdef WIN32
    431 
    432 const GURL& GURL::EmptyGURL() {
    433   // Avoid static object construction/destruction on startup/shutdown.
    434   if (!empty_gurl) {
    435     // Create the string. Be careful that we don't break in the case that this
    436     // is being called from multiple threads.
    437     GURL* new_empty_gurl = new GURL;
    438     if (InterlockedCompareExchangePointer(
    439         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
    440       // The old value was non-NULL, so no replacement was done. Another
    441       // thread did the initialization out from under us.
    442       delete new_empty_gurl;
    443     }
    444   }
    445   return *empty_gurl;
    446 }
    447 
    448 #else
    449 
    450 void EmptyGURLOnce(void) {
    451   empty_gurl = new GURL;
    452 }
    453 
    454 const GURL& GURL::EmptyGURL() {
    455   // Avoid static object construction/destruction on startup/shutdown.
    456   pthread_once(&empty_gurl_once, EmptyGURLOnce);
    457   return *empty_gurl;
    458 }
    459 
    460 #endif  // WIN32
    461 
    462 bool GURL::DomainIs(const char* lower_ascii_domain,
    463                     int domain_len) const {
    464   // Return false if this URL is not valid or domain is empty.
    465   if (!is_valid_ || !domain_len)
    466     return false;
    467 
    468   // FileSystem URLs have empty parsed_.host, so check this first.
    469   if (SchemeIsFileSystem() && inner_url_)
    470     return inner_url_->DomainIs(lower_ascii_domain, domain_len);
    471 
    472   if (!parsed_.host.is_nonempty())
    473     return false;
    474 
    475   // Check whether the host name is end with a dot. If yes, treat it
    476   // the same as no-dot unless the input comparison domain is end
    477   // with dot.
    478   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
    479   int host_len = parsed_.host.len;
    480   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
    481     last_pos--;
    482     host_len--;
    483   }
    484 
    485   // Return false if host's length is less than domain's length.
    486   if (host_len < domain_len)
    487     return false;
    488 
    489   // Compare this url whether belong specific domain.
    490   const char* start_pos = spec_.data() + parsed_.host.begin +
    491                           host_len - domain_len;
    492 
    493   if (!url_util::LowerCaseEqualsASCII(start_pos,
    494                                       last_pos + 1,
    495                                       lower_ascii_domain,
    496                                       lower_ascii_domain + domain_len))
    497     return false;
    498 
    499   // Check whether host has right domain start with dot, make sure we got
    500   // right domain range. For example www.google.com has domain
    501   // "google.com" but www.iamnotgoogle.com does not.
    502   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
    503       '.' != *(start_pos - 1))
    504     return false;
    505 
    506   return true;
    507 }
    508 
    509 void GURL::Swap(GURL* other) {
    510   spec_.swap(other->spec_);
    511   std::swap(is_valid_, other->is_valid_);
    512   std::swap(parsed_, other->parsed_);
    513   inner_url_.swap(other->inner_url_);
    514 }
    515 
    516 std::ostream& operator<<(std::ostream& out, const GURL& url) {
    517   return out << url.possibly_invalid_spec();
    518 }
    519