Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 #ifdef WIN32
     31 #include <windows.h>
     32 #else
     33 #include <pthread.h>
     34 #endif
     35 
     36 #include <algorithm>
     37 #include <iostream>
     38 
     39 #include "googleurl/src/gurl.h"
     40 
     41 #include "base/logging.h"
     42 #include "googleurl/src/url_canon_stdstring.h"
     43 #include "googleurl/src/url_util.h"
     44 
     45 namespace {
     46 
     47 // External template that can handle initialization of either character type.
     48 // The input spec is given, and the canonical version will be placed in
     49 // |*canonical|, along with the parsing of the canonical spec in |*parsed|.
     50 template<typename STR>
     51 bool InitCanonical(const STR& input_spec,
     52                    std::string* canonical,
     53                    url_parse::Parsed* parsed) {
     54   // Reserve enough room in the output for the input, plus some extra so that
     55   // we have room if we have to escape a few things without reallocating.
     56   canonical->reserve(input_spec.size() + 32);
     57   url_canon::StdStringCanonOutput output(canonical);
     58   bool success = url_util::Canonicalize(
     59       input_spec.data(), static_cast<int>(input_spec.length()),
     60       NULL, &output, parsed);
     61 
     62   output.Complete();  // Must be done before using string.
     63   return success;
     64 }
     65 
     66 static std::string* empty_string = NULL;
     67 static GURL* empty_gurl = NULL;
     68 
     69 #ifdef WIN32
     70 
     71 // Returns a static reference to an empty string for returning a reference
     72 // when there is no underlying string.
     73 const std::string& EmptyStringForGURL() {
     74   // Avoid static object construction/destruction on startup/shutdown.
     75   if (!empty_string) {
     76     // Create the string. Be careful that we don't break in the case that this
     77     // is being called from multiple threads. Statics are not threadsafe.
     78     std::string* new_empty_string = new std::string;
     79     if (InterlockedCompareExchangePointer(
     80         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
     81       // The old value was non-NULL, so no replacement was done. Another
     82       // thread did the initialization out from under us.
     83       delete new_empty_string;
     84     }
     85   }
     86   return *empty_string;
     87 }
     88 
     89 #else
     90 
     91 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
     92 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
     93 
     94 void EmptyStringForGURLOnce(void) {
     95   empty_string = new std::string;
     96 }
     97 
     98 const std::string& EmptyStringForGURL() {
     99   // Avoid static object construction/destruction on startup/shutdown.
    100   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
    101   return *empty_string;
    102 }
    103 
    104 #endif  // WIN32
    105 
    106 } // namespace
    107 
    108 GURL::GURL() : is_valid_(false) {
    109 }
    110 
    111 GURL::GURL(const GURL& other)
    112     : spec_(other.spec_),
    113       is_valid_(other.is_valid_),
    114       parsed_(other.parsed_) {
    115 }
    116 
    117 GURL::GURL(const std::string& url_string) {
    118   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
    119 }
    120 
    121 GURL::GURL(const string16& url_string) {
    122   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
    123 }
    124 
    125 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
    126            const url_parse::Parsed& parsed, bool is_valid)
    127     : spec_(canonical_spec, canonical_spec_len),
    128       is_valid_(is_valid),
    129       parsed_(parsed) {
    130 #ifndef NDEBUG
    131   // For testing purposes, check that the parsed canonical URL is identical to
    132   // what we would have produced. Skip checking for invalid URLs have no meaning
    133   // and we can't always canonicalize then reproducabely.
    134   if (is_valid_) {
    135     GURL test_url(spec_);
    136 
    137     DCHECK(test_url.is_valid_ == is_valid_);
    138     DCHECK(test_url.spec_ == spec_);
    139 
    140     DCHECK(test_url.parsed_.scheme == parsed_.scheme);
    141     DCHECK(test_url.parsed_.username == parsed_.username);
    142     DCHECK(test_url.parsed_.password == parsed_.password);
    143     DCHECK(test_url.parsed_.host == parsed_.host);
    144     DCHECK(test_url.parsed_.port == parsed_.port);
    145     DCHECK(test_url.parsed_.path == parsed_.path);
    146     DCHECK(test_url.parsed_.query == parsed_.query);
    147     DCHECK(test_url.parsed_.ref == parsed_.ref);
    148   }
    149 #endif
    150 }
    151 
    152 GURL& GURL::operator=(const GURL& other) {
    153   spec_ = other.spec_;
    154   is_valid_ = other.is_valid_;
    155   parsed_ = other.parsed_;
    156   return *this;
    157 }
    158 
    159 const std::string& GURL::spec() const {
    160   if (is_valid_ || spec_.empty())
    161     return spec_;
    162 
    163   DCHECK(false) << "Trying to get the spec of an invalid URL!";
    164   return EmptyStringForGURL();
    165 }
    166 
    167 GURL GURL::Resolve(const std::string& relative) const {
    168   return ResolveWithCharsetConverter(relative, NULL);
    169 }
    170 GURL GURL::Resolve(const string16& relative) const {
    171   return ResolveWithCharsetConverter(relative, NULL);
    172 }
    173 
    174 // Note: code duplicated below (it's inconvenient to use a template here).
    175 GURL GURL::ResolveWithCharsetConverter(
    176     const std::string& relative,
    177     url_canon::CharsetConverter* charset_converter) const {
    178   // Not allowed for invalid URLs.
    179   if (!is_valid_)
    180     return GURL();
    181 
    182   GURL result;
    183 
    184   // Reserve enough room in the output for the input, plus some extra so that
    185   // we have room if we have to escape a few things without reallocating.
    186   result.spec_.reserve(spec_.size() + 32);
    187   url_canon::StdStringCanonOutput output(&result.spec_);
    188 
    189   if (!url_util::ResolveRelative(
    190           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    191           relative.data(), static_cast<int>(relative.length()),
    192           charset_converter, &output, &result.parsed_)) {
    193     // Error resolving, return an empty URL.
    194     return GURL();
    195   }
    196 
    197   output.Complete();
    198   result.is_valid_ = true;
    199   return result;
    200 }
    201 
    202 // Note: code duplicated above (it's inconvenient to use a template here).
    203 GURL GURL::ResolveWithCharsetConverter(
    204     const string16& relative,
    205     url_canon::CharsetConverter* charset_converter) const {
    206   // Not allowed for invalid URLs.
    207   if (!is_valid_)
    208     return GURL();
    209 
    210   GURL result;
    211 
    212   // Reserve enough room in the output for the input, plus some extra so that
    213   // we have room if we have to escape a few things without reallocating.
    214   result.spec_.reserve(spec_.size() + 32);
    215   url_canon::StdStringCanonOutput output(&result.spec_);
    216 
    217   if (!url_util::ResolveRelative(
    218           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    219           relative.data(), static_cast<int>(relative.length()),
    220           charset_converter, &output, &result.parsed_)) {
    221     // Error resolving, return an empty URL.
    222     return GURL();
    223   }
    224 
    225   output.Complete();
    226   result.is_valid_ = true;
    227   return result;
    228 }
    229 
    230 // Note: code duplicated below (it's inconvenient to use a template here).
    231 GURL GURL::ReplaceComponents(
    232     const url_canon::Replacements<char>& replacements) const {
    233   GURL result;
    234 
    235   // Not allowed for invalid URLs.
    236   if (!is_valid_)
    237     return GURL();
    238 
    239   // Reserve enough room in the output for the input, plus some extra so that
    240   // we have room if we have to escape a few things without reallocating.
    241   result.spec_.reserve(spec_.size() + 32);
    242   url_canon::StdStringCanonOutput output(&result.spec_);
    243 
    244   result.is_valid_ = url_util::ReplaceComponents(
    245       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    246       NULL, &output, &result.parsed_);
    247 
    248   output.Complete();
    249   return result;
    250 }
    251 
    252 // Note: code duplicated above (it's inconvenient to use a template here).
    253 GURL GURL::ReplaceComponents(
    254     const url_canon::Replacements<char16>& replacements) const {
    255   GURL result;
    256 
    257   // Not allowed for invalid URLs.
    258   if (!is_valid_)
    259     return GURL();
    260 
    261   // Reserve enough room in the output for the input, plus some extra so that
    262   // we have room if we have to escape a few things without reallocating.
    263   result.spec_.reserve(spec_.size() + 32);
    264   url_canon::StdStringCanonOutput output(&result.spec_);
    265 
    266   result.is_valid_ = url_util::ReplaceComponents(
    267       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    268       NULL, &output, &result.parsed_);
    269 
    270   output.Complete();
    271   return result;
    272 }
    273 
    274 GURL GURL::GetOrigin() const {
    275   // This doesn't make sense for invalid or nonstandard URLs, so return
    276   // the empty URL
    277   if (!is_valid_ || !IsStandard())
    278     return GURL();
    279 
    280   url_canon::Replacements<char> replacements;
    281   replacements.ClearUsername();
    282   replacements.ClearPassword();
    283   replacements.ClearPath();
    284   replacements.ClearQuery();
    285   replacements.ClearRef();
    286 
    287   return ReplaceComponents(replacements);
    288 }
    289 
    290 GURL GURL::GetWithEmptyPath() const {
    291   // This doesn't make sense for invalid or nonstandard URLs, so return
    292   // the empty URL.
    293   if (!is_valid_ || !IsStandard())
    294     return GURL();
    295 
    296   // We could optimize this since we know that the URL is canonical, and we are
    297   // appending a canonical path, so avoiding re-parsing.
    298   GURL other(*this);
    299   if (parsed_.path.len == 0)
    300     return other;
    301 
    302   // Clear everything after the path.
    303   other.parsed_.query.reset();
    304   other.parsed_.ref.reset();
    305 
    306   // Set the path, since the path is longer than one, we can just set the
    307   // first character and resize.
    308   other.spec_[other.parsed_.path.begin] = '/';
    309   other.parsed_.path.len = 1;
    310   other.spec_.resize(other.parsed_.path.begin + 1);
    311   return other;
    312 }
    313 
    314 bool GURL::IsStandard() const {
    315   return url_util::IsStandard(spec_.data(), parsed_.scheme);
    316 }
    317 
    318 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
    319   if (parsed_.scheme.len <= 0)
    320     return lower_ascii_scheme == NULL;
    321   return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
    322                                         spec_.data() + parsed_.scheme.end(),
    323                                         lower_ascii_scheme);
    324 }
    325 
    326 int GURL::IntPort() const {
    327   if (parsed_.port.is_nonempty())
    328     return url_parse::ParsePort(spec_.data(), parsed_.port);
    329   return url_parse::PORT_UNSPECIFIED;
    330 }
    331 
    332 int GURL::EffectiveIntPort() const {
    333   int int_port = IntPort();
    334   if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
    335     return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
    336                                            parsed_.scheme.len);
    337   return int_port;
    338 }
    339 
    340 std::string GURL::ExtractFileName() const {
    341   url_parse::Component file_component;
    342   url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
    343   return ComponentString(file_component);
    344 }
    345 
    346 std::string GURL::PathForRequest() const {
    347   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
    348   if (parsed_.ref.len >= 0) {
    349     // Clip off the reference when it exists. The reference starts after the #
    350     // sign, so we have to subtract one to also remove it.
    351     return std::string(spec_, parsed_.path.begin,
    352                        parsed_.ref.begin - parsed_.path.begin - 1);
    353   }
    354 
    355   // Use everything form the path to the end.
    356   return std::string(spec_, parsed_.path.begin);
    357 }
    358 
    359 std::string GURL::HostNoBrackets() const {
    360   // If host looks like an IPv6 literal, strip the square brackets.
    361   url_parse::Component h(parsed_.host);
    362   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
    363     h.begin++;
    364     h.len -= 2;
    365   }
    366   return ComponentString(h);
    367 }
    368 
    369 bool GURL::HostIsIPAddress() const {
    370   if (!is_valid_ || spec_.empty())
    371      return false;
    372 
    373   url_canon::RawCanonOutputT<char, 128> ignored_output;
    374   url_canon::CanonHostInfo host_info;
    375   url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
    376                                    &ignored_output, &host_info);
    377   return host_info.IsIPAddress();
    378 }
    379 
    380 #ifdef WIN32
    381 
    382 const GURL& GURL::EmptyGURL() {
    383   // Avoid static object construction/destruction on startup/shutdown.
    384   if (!empty_gurl) {
    385     // Create the string. Be careful that we don't break in the case that this
    386     // is being called from multiple threads.
    387     GURL* new_empty_gurl = new GURL;
    388     if (InterlockedCompareExchangePointer(
    389         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
    390       // The old value was non-NULL, so no replacement was done. Another
    391       // thread did the initialization out from under us.
    392       delete new_empty_gurl;
    393     }
    394   }
    395   return *empty_gurl;
    396 }
    397 
    398 #else
    399 
    400 void EmptyGURLOnce(void) {
    401   empty_gurl = new GURL;
    402 }
    403 
    404 const GURL& GURL::EmptyGURL() {
    405   // Avoid static object construction/destruction on startup/shutdown.
    406   pthread_once(&empty_gurl_once, EmptyGURLOnce);
    407   return *empty_gurl;
    408 }
    409 
    410 #endif  // WIN32
    411 
    412 bool GURL::DomainIs(const char* lower_ascii_domain,
    413                     int domain_len) const {
    414   // Return false if this URL is not valid or domain is empty.
    415   if (!is_valid_ || !parsed_.host.is_nonempty() || !domain_len)
    416     return false;
    417 
    418   // Check whether the host name is end with a dot. If yes, treat it
    419   // the same as no-dot unless the input comparison domain is end
    420   // with dot.
    421   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
    422   int host_len = parsed_.host.len;
    423   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
    424     last_pos--;
    425     host_len--;
    426   }
    427 
    428   // Return false if host's length is less than domain's length.
    429   if (host_len < domain_len)
    430     return false;
    431 
    432   // Compare this url whether belong specific domain.
    433   const char* start_pos = spec_.data() + parsed_.host.begin +
    434                           host_len - domain_len;
    435 
    436   if (!url_util::LowerCaseEqualsASCII(start_pos,
    437                                       last_pos + 1,
    438                                       lower_ascii_domain,
    439                                       lower_ascii_domain + domain_len))
    440     return false;
    441 
    442   // Check whether host has right domain start with dot, make sure we got
    443   // right domain range. For example www.google.com has domain
    444   // "google.com" but www.iamnotgoogle.com does not.
    445   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
    446       '.' != *(start_pos - 1))
    447     return false;
    448 
    449   return true;
    450 }
    451 
    452 void GURL::Swap(GURL* other) {
    453   spec_.swap(other->spec_);
    454   std::swap(is_valid_, other->is_valid_);
    455   std::swap(parsed_, other->parsed_);
    456 }
    457 
    458 std::ostream& operator<<(std::ostream& out, const GURL& url) {
    459   return out << url.possibly_invalid_spec();
    460 }
    461