Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 #ifdef WIN32
     31 #include <windows.h>
     32 #else
     33 #include <pthread.h>
     34 #endif
     35 
     36 #include <algorithm>
     37 
     38 #include "googleurl/src/gurl.h"
     39 
     40 #include "base/logging.h"
     41 #include "googleurl/src/url_canon_stdstring.h"
     42 #include "googleurl/src/url_util.h"
     43 
     44 namespace {
     45 
     46 // External template that can handle initialization of either character type.
     47 // The input spec is given, and the canonical version will be placed in
     48 // |*canonical|, along with the parsing of the canonical spec in |*parsed|.
     49 template<typename STR>
     50 bool InitCanonical(const STR& input_spec,
     51                    std::string* canonical,
     52                    url_parse::Parsed* parsed) {
     53   // Reserve enough room in the output for the input, plus some extra so that
     54   // we have room if we have to escape a few things without reallocating.
     55   canonical->reserve(input_spec.size() + 32);
     56   url_canon::StdStringCanonOutput output(canonical);
     57   bool success = url_util::Canonicalize(
     58       input_spec.data(), static_cast<int>(input_spec.length()),
     59       NULL, &output, parsed);
     60 
     61   output.Complete();  // Must be done before using string.
     62   return success;
     63 }
     64 
     65 static std::string* empty_string = NULL;
     66 static GURL* empty_gurl = NULL;
     67 
     68 #ifdef WIN32
     69 
     70 // Returns a static reference to an empty string for returning a reference
     71 // when there is no underlying string.
     72 const std::string& EmptyStringForGURL() {
     73   // Avoid static object construction/destruction on startup/shutdown.
     74   if (!empty_string) {
     75     // Create the string. Be careful that we don't break in the case that this
     76     // is being called from multiple threads. Statics are not threadsafe.
     77     std::string* new_empty_string = new std::string;
     78     if (InterlockedCompareExchangePointer(
     79         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
     80       // The old value was non-NULL, so no replacement was done. Another
     81       // thread did the initialization out from under us.
     82       delete new_empty_string;
     83     }
     84   }
     85   return *empty_string;
     86 }
     87 
     88 #else
     89 
     90 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
     91 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
     92 
     93 void EmptyStringForGURLOnce(void) {
     94   empty_string = new std::string;
     95 }
     96 
     97 const std::string& EmptyStringForGURL() {
     98   // Avoid static object construction/destruction on startup/shutdown.
     99   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
    100   return *empty_string;
    101 }
    102 
    103 #endif  // WIN32
    104 
    105 } // namespace
    106 
    107 GURL::GURL() : is_valid_(false) {
    108 }
    109 
    110 GURL::GURL(const GURL& other)
    111     : spec_(other.spec_),
    112       is_valid_(other.is_valid_),
    113       parsed_(other.parsed_) {
    114 }
    115 
    116 GURL::GURL(const std::string& url_string) {
    117   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
    118 }
    119 
    120 GURL::GURL(const string16& url_string) {
    121   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
    122 }
    123 
    124 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
    125            const url_parse::Parsed& parsed, bool is_valid)
    126     : spec_(canonical_spec, canonical_spec_len),
    127       is_valid_(is_valid),
    128       parsed_(parsed) {
    129 #ifndef NDEBUG
    130   // For testing purposes, check that the parsed canonical URL is identical to
    131   // what we would have produced. Skip checking for invalid URLs have no meaning
    132   // and we can't always canonicalize then reproducabely.
    133   if (is_valid_) {
    134     GURL test_url(spec_);
    135 
    136     DCHECK(test_url.is_valid_ == is_valid_);
    137     DCHECK(test_url.spec_ == spec_);
    138 
    139     DCHECK(test_url.parsed_.scheme == parsed_.scheme);
    140     DCHECK(test_url.parsed_.username == parsed_.username);
    141     DCHECK(test_url.parsed_.password == parsed_.password);
    142     DCHECK(test_url.parsed_.host == parsed_.host);
    143     DCHECK(test_url.parsed_.port == parsed_.port);
    144     DCHECK(test_url.parsed_.path == parsed_.path);
    145     DCHECK(test_url.parsed_.query == parsed_.query);
    146     DCHECK(test_url.parsed_.ref == parsed_.ref);
    147   }
    148 #endif
    149 }
    150 
    151 const std::string& GURL::spec() const {
    152   if (is_valid_ || spec_.empty())
    153     return spec_;
    154 
    155   DCHECK(false) << "Trying to get the spec of an invalid URL!";
    156   return EmptyStringForGURL();
    157 }
    158 
    159 GURL GURL::Resolve(const std::string& relative) const {
    160   return ResolveWithCharsetConverter(relative, NULL);
    161 }
    162 GURL GURL::Resolve(const string16& relative) const {
    163   return ResolveWithCharsetConverter(relative, NULL);
    164 }
    165 
    166 // Note: code duplicated below (it's inconvenient to use a template here).
    167 GURL GURL::ResolveWithCharsetConverter(
    168     const std::string& relative,
    169     url_canon::CharsetConverter* charset_converter) const {
    170   // Not allowed for invalid URLs.
    171   if (!is_valid_)
    172     return GURL();
    173 
    174   GURL result;
    175 
    176   // Reserve enough room in the output for the input, plus some extra so that
    177   // we have room if we have to escape a few things without reallocating.
    178   result.spec_.reserve(spec_.size() + 32);
    179   url_canon::StdStringCanonOutput output(&result.spec_);
    180 
    181   if (!url_util::ResolveRelative(
    182           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    183           relative.data(), static_cast<int>(relative.length()),
    184           charset_converter, &output, &result.parsed_)) {
    185     // Error resolving, return an empty URL.
    186     return GURL();
    187   }
    188 
    189   output.Complete();
    190   result.is_valid_ = true;
    191   return result;
    192 }
    193 
    194 // Note: code duplicated above (it's inconvenient to use a template here).
    195 GURL GURL::ResolveWithCharsetConverter(
    196     const string16& relative,
    197     url_canon::CharsetConverter* charset_converter) const {
    198   // Not allowed for invalid URLs.
    199   if (!is_valid_)
    200     return GURL();
    201 
    202   GURL result;
    203 
    204   // Reserve enough room in the output for the input, plus some extra so that
    205   // we have room if we have to escape a few things without reallocating.
    206   result.spec_.reserve(spec_.size() + 32);
    207   url_canon::StdStringCanonOutput output(&result.spec_);
    208 
    209   if (!url_util::ResolveRelative(
    210           spec_.data(), static_cast<int>(spec_.length()), parsed_,
    211           relative.data(), static_cast<int>(relative.length()),
    212           charset_converter, &output, &result.parsed_)) {
    213     // Error resolving, return an empty URL.
    214     return GURL();
    215   }
    216 
    217   output.Complete();
    218   result.is_valid_ = true;
    219   return result;
    220 }
    221 
    222 // Note: code duplicated below (it's inconvenient to use a template here).
    223 GURL GURL::ReplaceComponents(
    224     const url_canon::Replacements<char>& replacements) const {
    225   GURL result;
    226 
    227   // Not allowed for invalid URLs.
    228   if (!is_valid_)
    229     return GURL();
    230 
    231   // Reserve enough room in the output for the input, plus some extra so that
    232   // we have room if we have to escape a few things without reallocating.
    233   result.spec_.reserve(spec_.size() + 32);
    234   url_canon::StdStringCanonOutput output(&result.spec_);
    235 
    236   result.is_valid_ = url_util::ReplaceComponents(
    237       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    238       NULL, &output, &result.parsed_);
    239 
    240   output.Complete();
    241   return result;
    242 }
    243 
    244 // Note: code duplicated above (it's inconvenient to use a template here).
    245 GURL GURL::ReplaceComponents(
    246     const url_canon::Replacements<char16>& replacements) const {
    247   GURL result;
    248 
    249   // Not allowed for invalid URLs.
    250   if (!is_valid_)
    251     return GURL();
    252 
    253   // Reserve enough room in the output for the input, plus some extra so that
    254   // we have room if we have to escape a few things without reallocating.
    255   result.spec_.reserve(spec_.size() + 32);
    256   url_canon::StdStringCanonOutput output(&result.spec_);
    257 
    258   result.is_valid_ = url_util::ReplaceComponents(
    259       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
    260       NULL, &output, &result.parsed_);
    261 
    262   output.Complete();
    263   return result;
    264 }
    265 
    266 GURL GURL::GetOrigin() const {
    267   // This doesn't make sense for invalid or nonstandard URLs, so return
    268   // the empty URL
    269   if (!is_valid_ || !IsStandard())
    270     return GURL();
    271 
    272   url_canon::Replacements<char> replacements;
    273   replacements.ClearUsername();
    274   replacements.ClearPassword();
    275   replacements.ClearPath();
    276   replacements.ClearQuery();
    277   replacements.ClearRef();
    278 
    279   return ReplaceComponents(replacements);
    280 }
    281 
    282 GURL GURL::GetWithEmptyPath() const {
    283   // This doesn't make sense for invalid or nonstandard URLs, so return
    284   // the empty URL.
    285   if (!is_valid_ || !IsStandard())
    286     return GURL();
    287 
    288   // We could optimize this since we know that the URL is canonical, and we are
    289   // appending a canonical path, so avoiding re-parsing.
    290   GURL other(*this);
    291   if (parsed_.path.len == 0)
    292     return other;
    293 
    294   // Clear everything after the path.
    295   other.parsed_.query.reset();
    296   other.parsed_.ref.reset();
    297 
    298   // Set the path, since the path is longer than one, we can just set the
    299   // first character and resize.
    300   other.spec_[other.parsed_.path.begin] = '/';
    301   other.parsed_.path.len = 1;
    302   other.spec_.resize(other.parsed_.path.begin + 1);
    303   return other;
    304 }
    305 
    306 bool GURL::IsStandard() const {
    307   return url_util::IsStandard(spec_.data(), static_cast<int>(spec_.length()),
    308                               parsed_.scheme);
    309 }
    310 
    311 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
    312   if (parsed_.scheme.len <= 0)
    313     return lower_ascii_scheme == NULL;
    314   return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
    315                                         spec_.data() + parsed_.scheme.end(),
    316                                         lower_ascii_scheme);
    317 }
    318 
    319 int GURL::IntPort() const {
    320   if (parsed_.port.is_nonempty())
    321     return url_parse::ParsePort(spec_.data(), parsed_.port);
    322   return url_parse::PORT_UNSPECIFIED;
    323 }
    324 
    325 int GURL::EffectiveIntPort() const {
    326   int int_port = IntPort();
    327   if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
    328     return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
    329                                            parsed_.scheme.len);
    330   return int_port;
    331 }
    332 
    333 std::string GURL::ExtractFileName() const {
    334   url_parse::Component file_component;
    335   url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
    336   return ComponentString(file_component);
    337 }
    338 
    339 std::string GURL::PathForRequest() const {
    340   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
    341   if (parsed_.ref.len >= 0) {
    342     // Clip off the reference when it exists. The reference starts after the #
    343     // sign, so we have to subtract one to also remove it.
    344     return std::string(spec_, parsed_.path.begin,
    345                        parsed_.ref.begin - parsed_.path.begin - 1);
    346   }
    347 
    348   // Use everything form the path to the end.
    349   return std::string(spec_, parsed_.path.begin);
    350 }
    351 
    352 std::string GURL::HostNoBrackets() const {
    353   // If host looks like an IPv6 literal, strip the square brackets.
    354   url_parse::Component h(parsed_.host);
    355   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
    356     h.begin++;
    357     h.len -= 2;
    358   }
    359   return ComponentString(h);
    360 }
    361 
    362 bool GURL::HostIsIPAddress() const {
    363   if (!is_valid_ || spec_.empty())
    364      return false;
    365 
    366   url_canon::RawCanonOutputT<char, 128> ignored_output;
    367   url_canon::CanonHostInfo host_info;
    368   url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
    369                                    &ignored_output, &host_info);
    370   return host_info.IsIPAddress();
    371 }
    372 
    373 #ifdef WIN32
    374 
    375 const GURL& GURL::EmptyGURL() {
    376   // Avoid static object construction/destruction on startup/shutdown.
    377   if (!empty_gurl) {
    378     // Create the string. Be careful that we don't break in the case that this
    379     // is being called from multiple threads.
    380     GURL* new_empty_gurl = new GURL;
    381     if (InterlockedCompareExchangePointer(
    382         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
    383       // The old value was non-NULL, so no replacement was done. Another
    384       // thread did the initialization out from under us.
    385       delete new_empty_gurl;
    386     }
    387   }
    388   return *empty_gurl;
    389 }
    390 
    391 #else
    392 
    393 void EmptyGURLOnce(void) {
    394   empty_gurl = new GURL;
    395 }
    396 
    397 const GURL& GURL::EmptyGURL() {
    398   // Avoid static object construction/destruction on startup/shutdown.
    399   pthread_once(&empty_gurl_once, EmptyGURLOnce);
    400   return *empty_gurl;
    401 }
    402 
    403 #endif  // WIN32
    404 
    405 bool GURL::DomainIs(const char* lower_ascii_domain,
    406                     int domain_len) const {
    407   // Return false if this URL is not valid or domain is empty.
    408   if (!is_valid_ || !parsed_.host.is_nonempty() || !domain_len)
    409     return false;
    410 
    411   // Check whether the host name is end with a dot. If yes, treat it
    412   // the same as no-dot unless the input comparison domain is end
    413   // with dot.
    414   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
    415   int host_len = parsed_.host.len;
    416   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
    417     last_pos--;
    418     host_len--;
    419   }
    420 
    421   // Return false if host's length is less than domain's length.
    422   if (host_len < domain_len)
    423     return false;
    424 
    425   // Compare this url whether belong specific domain.
    426   const char* start_pos = spec_.data() + parsed_.host.begin +
    427                           host_len - domain_len;
    428 
    429   if (!url_util::LowerCaseEqualsASCII(start_pos,
    430                                       last_pos + 1,
    431                                       lower_ascii_domain,
    432                                       lower_ascii_domain + domain_len))
    433     return false;
    434 
    435   // Check whether host has right domain start with dot, make sure we got
    436   // right domain range. For example www.google.com has domain
    437   // "google.com" but www.iamnotgoogle.com does not.
    438   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
    439       '.' != *(start_pos - 1))
    440     return false;
    441 
    442   return true;
    443 }
    444 
    445 void GURL::Swap(GURL* other) {
    446   spec_.swap(other->spec_);
    447   std::swap(is_valid_, other->is_valid_);
    448   std::swap(parsed_, other->parsed_);
    449 }
    450 
    451