Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 #include <string.h>
     31 #include <vector>
     32 
     33 #include "googleurl/src/url_util.h"
     34 
     35 #include "base/logging.h"
     36 #include "googleurl/src/url_file.h"
     37 
     38 namespace url_util {
     39 
     40 namespace {
     41 
     42 // ASCII-specific tolower.  The standard library's tolower is locale sensitive,
     43 // so we don't want to use it here.
     44 template <class Char> inline Char ToLowerASCII(Char c) {
     45   return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
     46 }
     47 
     48 // Backend for LowerCaseEqualsASCII.
     49 template<typename Iter>
     50 inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
     51   for (Iter it = a_begin; it != a_end; ++it, ++b) {
     52     if (!*b || ToLowerASCII(*it) != *b)
     53       return false;
     54   }
     55   return *b == 0;
     56 }
     57 
     58 const char kFileScheme[] = "file";  // Used in a number of places.
     59 const char kMailtoScheme[] = "mailto";
     60 
     61 const int kNumStandardURLSchemes = 5;
     62 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
     63   "http",
     64   "https",
     65   kFileScheme,  // Yes, file urls can have a hostname!
     66   "ftp",
     67   "gopher",
     68 };
     69 
     70 // List of the currently installed standard schemes. This list is lazily
     71 // initialized by InitStandardSchemes and is leaked on shutdown to prevent
     72 // any destructors from being called that will slow us down or cause problems.
     73 std::vector<const char*>* standard_schemes = NULL;
     74 
     75 // Ensures that the standard_schemes list is initialized, does nothing if it
     76 // already has values.
     77 void InitStandardSchemes() {
     78   if (standard_schemes)
     79     return;
     80   standard_schemes = new std::vector<const char*>;
     81   for (int i = 0; i < kNumStandardURLSchemes; i++)
     82     standard_schemes->push_back(kStandardURLSchemes[i]);
     83 }
     84 
     85 // Given a string and a range inside the string, compares it to the given
     86 // lower-case |compare_to| buffer.
     87 template<typename CHAR>
     88 inline bool CompareSchemeComponent(const CHAR* spec,
     89                                    const url_parse::Component& component,
     90                                    const char* compare_to) {
     91   if (!component.is_nonempty())
     92     return compare_to[0] == 0;  // When component is empty, match empty scheme.
     93   return LowerCaseEqualsASCII(&spec[component.begin],
     94                               &spec[component.end()],
     95                               compare_to);
     96 }
     97 
     98 // Returns true if the given scheme identified by |scheme| within |spec| is one
     99 // of the registered "standard" schemes. Note that this does not check for
    100 // "://", use IsStandard for that.
    101 template<typename CHAR>
    102 bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
    103   if (!scheme.is_nonempty())
    104     return false;  // Empty or invalid schemes are non-standard.
    105 
    106   InitStandardSchemes();
    107   for (size_t i = 0; i < standard_schemes->size(); i++) {
    108     if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
    109                              standard_schemes->at(i)))
    110       return true;
    111   }
    112   return false;
    113 }
    114 
    115 // Returns true if the stuff following the scheme in the given spec indicates
    116 // a "standard" URL. The presence of "://" after the scheme indicates that
    117 // there is a hostname, etc. which we call a standard URL.
    118 template<typename CHAR>
    119 bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len,
    120                                 const url_parse::Component& scheme) {
    121   int after_scheme = scheme.end();
    122   if (spec_len < after_scheme + 3)
    123     return false;
    124   return spec[after_scheme] == ':' &&
    125          spec[after_scheme + 1] == '/' &&
    126          spec[after_scheme + 2] == '/';
    127 }
    128 
    129 template<typename CHAR>
    130 bool DoIsStandard(const CHAR* spec, int spec_len,
    131                   const url_parse::Component& scheme) {
    132   return HasStandardSchemeSeparator(spec, spec_len, scheme) ||
    133          IsStandardScheme(spec, scheme);
    134 }
    135 
    136 template<typename CHAR>
    137 bool DoFindAndCompareScheme(const CHAR* str,
    138                             int str_len,
    139                             const char* compare,
    140                             url_parse::Component* found_scheme) {
    141   url_parse::Component our_scheme;
    142   if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {
    143     // No scheme.
    144     if (found_scheme)
    145       *found_scheme = url_parse::Component();
    146     return false;
    147   }
    148   if (found_scheme)
    149     *found_scheme = our_scheme;
    150   return CompareSchemeComponent(str, our_scheme, compare);
    151 }
    152 
    153 template<typename CHAR>
    154 bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
    155                     url_canon::CharsetConverter* charset_converter,
    156                     url_canon::CanonOutput* output,
    157                     url_parse::Parsed* output_parsed) {
    158   // Remove any whitespace from the middle of the relative URL, possibly
    159   // copying to the new buffer.
    160   url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
    161   int spec_len;
    162   const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
    163                                          &whitespace_buffer, &spec_len);
    164 
    165   url_parse::Parsed parsed_input;
    166 #ifdef WIN32
    167   // For Windows, we allow things that look like absolute Windows paths to be
    168   // fixed up magically to file URLs. This is done for IE compatability. For
    169   // example, this will change "c:/foo" into a file URL rather than treating
    170   // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
    171   // There is similar logic in url_canon_relative.cc for
    172   //
    173   // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
    174   // has no meaning as an absolute path name. This is because browsers on Mac
    175   // & Unix don't generally do this, so there is no compatibility reason for
    176   // doing so.
    177   if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||
    178       url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
    179     url_parse::ParseFileURL(spec, spec_len, &parsed_input);
    180     return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
    181                                            charset_converter,
    182                                            output, output_parsed);
    183   }
    184 #endif
    185 
    186   url_parse::Component scheme;
    187   if(!url_parse::ExtractScheme(spec, spec_len, &scheme))
    188     return false;
    189 
    190   // This is the parsed version of the input URL, we have to canonicalize it
    191   // before storing it in our object.
    192   bool success;
    193   if (CompareSchemeComponent(spec, scheme, kFileScheme)) {
    194     // File URLs are special.
    195     url_parse::ParseFileURL(spec, spec_len, &parsed_input);
    196     success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
    197                                              charset_converter,
    198                                              output, output_parsed);
    199 
    200   } else if (IsStandard(spec, spec_len, scheme)) {
    201     // All "normal" URLs.
    202     url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
    203     success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
    204                                                  charset_converter,
    205                                                  output, output_parsed);
    206 
    207   } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {
    208     // Mailto are treated like a standard url with only a scheme, path, query
    209     url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
    210     success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
    211                                                output, output_parsed);
    212 
    213   } else {
    214     // "Weird" URLs like data: and javascript:
    215     url_parse::ParsePathURL(spec, spec_len, &parsed_input);
    216     success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
    217                                              output, output_parsed);
    218   }
    219   return success;
    220 }
    221 
    222 template<typename CHAR>
    223 bool DoResolveRelative(const char* base_spec,
    224                        int base_spec_len,
    225                        const url_parse::Parsed& base_parsed,
    226                        const CHAR* in_relative,
    227                        int in_relative_length,
    228                        url_canon::CharsetConverter* charset_converter,
    229                        url_canon::CanonOutput* output,
    230                        url_parse::Parsed* output_parsed) {
    231   // Remove any whitespace from the middle of the relative URL, possibly
    232   // copying to the new buffer.
    233   url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
    234   int relative_length;
    235   const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
    236                                              &whitespace_buffer,
    237                                              &relative_length);
    238 
    239   // See if our base URL should be treated as "standard".
    240   bool standard_base_scheme =
    241       base_parsed.scheme.is_nonempty() &&
    242       IsStandard(base_spec, base_spec_len, base_parsed.scheme);
    243 
    244   bool is_relative;
    245   url_parse::Component relative_component;
    246   if (!url_canon::IsRelativeURL(base_spec, base_parsed,
    247                                 relative, relative_length,
    248                                 standard_base_scheme,
    249                                 &is_relative,
    250                                 &relative_component)) {
    251     // Error resolving.
    252     return false;
    253   }
    254 
    255   if (is_relative) {
    256     // Relative, resolve and canonicalize.
    257     bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
    258         CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
    259     return url_canon::ResolveRelativeURL(base_spec, base_parsed,
    260                                          file_base_scheme, relative,
    261                                          relative_component, charset_converter,
    262                                          output, output_parsed);
    263   }
    264 
    265   // Not relative, canonicalize the input.
    266   return DoCanonicalize(relative, relative_length, charset_converter,
    267                         output, output_parsed);
    268 }
    269 
    270 template<typename CHAR>
    271 bool DoReplaceComponents(const char* spec,
    272                          int spec_len,
    273                          const url_parse::Parsed& parsed,
    274                          const url_canon::Replacements<CHAR>& replacements,
    275                          url_canon::CharsetConverter* charset_converter,
    276                          url_canon::CanonOutput* output,
    277                          url_parse::Parsed* out_parsed) {
    278   // Note that we dispatch to the parser according the the scheme type of
    279   // the OUTPUT URL. Normally, this is the same as our scheme, but if the
    280   // scheme is being overridden, we need to test that.
    281 
    282   if (// Either the scheme is not replaced and the old one is a file,
    283       (!replacements.IsSchemeOverridden() &&
    284        CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||
    285       // ...or it is being replaced and the new one is a file.
    286       (replacements.IsSchemeOverridden() &&
    287        CompareSchemeComponent(replacements.sources().scheme,
    288                               replacements.components().scheme,
    289                               kFileScheme))) {
    290     return url_canon::ReplaceFileURL(spec, parsed, replacements,
    291                                      charset_converter, output, out_parsed);
    292   }
    293 
    294   if (// Either the scheme is not replaced and the old one is standard,
    295       (!replacements.IsSchemeOverridden() &&
    296        IsStandard(spec, spec_len, parsed.scheme)) ||
    297       // ...or it is being replaced and the new one is standard.
    298       (replacements.IsSchemeOverridden() &&
    299        IsStandardScheme(replacements.sources().scheme,
    300                         replacements.components().scheme))) {
    301     // Standard URL with all parts.
    302     return url_canon::ReplaceStandardURL(spec, parsed, replacements,
    303                                          charset_converter, output, out_parsed);
    304   }
    305 
    306   if (// Either the scheme is not replaced and the old one is mailto,
    307       (!replacements.IsSchemeOverridden() &&
    308        CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||
    309       // ...or it is being replaced and the new one is a mailto.
    310       (replacements.IsSchemeOverridden() &&
    311        CompareSchemeComponent(replacements.sources().scheme,
    312                               replacements.components().scheme,
    313                               kMailtoScheme))) {
    314      return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
    315                                         output, out_parsed);
    316   }
    317 
    318   return url_canon::ReplacePathURL(spec, parsed, replacements,
    319                                    output, out_parsed);
    320 }
    321 
    322 }  // namespace
    323 
    324 void AddStandardScheme(const char* new_scheme) {
    325   size_t scheme_len = strlen(new_scheme);
    326   if (scheme_len == 0)
    327     return;
    328 
    329   // Dulicate the scheme into a new buffer and add it to the list of standard
    330   // schemes. This pointer will be leaked on shutdown.
    331   char* dup_scheme = new char[scheme_len + 1];
    332   memcpy(dup_scheme, new_scheme, scheme_len + 1);
    333 
    334   InitStandardSchemes();
    335   standard_schemes->push_back(dup_scheme);
    336 }
    337 
    338 bool IsStandard(const char* spec, int spec_len,
    339                 const url_parse::Component& scheme) {
    340   return DoIsStandard(spec, spec_len, scheme);
    341 }
    342 
    343 bool IsStandard(const char16* spec, int spec_len,
    344                 const url_parse::Component& scheme) {
    345   return DoIsStandard(spec, spec_len, scheme);
    346 }
    347 
    348 bool FindAndCompareScheme(const char* str,
    349                           int str_len,
    350                           const char* compare,
    351                           url_parse::Component* found_scheme) {
    352   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
    353 }
    354 
    355 bool FindAndCompareScheme(const char16* str,
    356                           int str_len,
    357                           const char* compare,
    358                           url_parse::Component* found_scheme) {
    359   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
    360 }
    361 
    362 bool Canonicalize(const char* spec,
    363                   int spec_len,
    364                   url_canon::CharsetConverter* charset_converter,
    365                   url_canon::CanonOutput* output,
    366                   url_parse::Parsed* output_parsed) {
    367   return DoCanonicalize(spec, spec_len, charset_converter,
    368                         output, output_parsed);
    369 }
    370 
    371 bool Canonicalize(const char16* spec,
    372                   int spec_len,
    373                   url_canon::CharsetConverter* charset_converter,
    374                   url_canon::CanonOutput* output,
    375                   url_parse::Parsed* output_parsed) {
    376   return DoCanonicalize(spec, spec_len, charset_converter,
    377                         output, output_parsed);
    378 }
    379 
    380 bool ResolveRelative(const char* base_spec,
    381                      int base_spec_len,
    382                      const url_parse::Parsed& base_parsed,
    383                      const char* relative,
    384                      int relative_length,
    385                      url_canon::CharsetConverter* charset_converter,
    386                      url_canon::CanonOutput* output,
    387                      url_parse::Parsed* output_parsed) {
    388   return DoResolveRelative(base_spec, base_spec_len, base_parsed,
    389                            relative, relative_length,
    390                            charset_converter, output, output_parsed);
    391 }
    392 
    393 bool ResolveRelative(const char* base_spec,
    394                      int base_spec_len,
    395                      const url_parse::Parsed& base_parsed,
    396                      const char16* relative,
    397                      int relative_length,
    398                      url_canon::CharsetConverter* charset_converter,
    399                      url_canon::CanonOutput* output,
    400                      url_parse::Parsed* output_parsed) {
    401   return DoResolveRelative(base_spec, base_spec_len, base_parsed,
    402                            relative, relative_length,
    403                            charset_converter, output, output_parsed);
    404 }
    405 
    406 bool ReplaceComponents(const char* spec,
    407                        int spec_len,
    408                        const url_parse::Parsed& parsed,
    409                        const url_canon::Replacements<char>& replacements,
    410                        url_canon::CharsetConverter* charset_converter,
    411                        url_canon::CanonOutput* output,
    412                        url_parse::Parsed* out_parsed) {
    413   return DoReplaceComponents(spec, spec_len, parsed, replacements,
    414                              charset_converter, output, out_parsed);
    415 }
    416 
    417 bool ReplaceComponents(const char* spec,
    418                        int spec_len,
    419                        const url_parse::Parsed& parsed,
    420                        const url_canon::Replacements<char16>& replacements,
    421                        url_canon::CharsetConverter* charset_converter,
    422                        url_canon::CanonOutput* output,
    423                        url_parse::Parsed* out_parsed) {
    424   return DoReplaceComponents(spec, spec_len, parsed, replacements,
    425                              charset_converter, output, out_parsed);
    426 }
    427 
    428 // Front-ends for LowerCaseEqualsASCII.
    429 bool LowerCaseEqualsASCII(const char* a_begin,
    430                           const char* a_end,
    431                           const char* b) {
    432   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
    433 }
    434 
    435 bool LowerCaseEqualsASCII(const char* a_begin,
    436                           const char* a_end,
    437                           const char* b_begin,
    438                           const char* b_end) {
    439   while (a_begin != a_end && b_begin != b_end &&
    440          ToLowerASCII(*a_begin) == *b_begin) {
    441     a_begin++;
    442     b_begin++;
    443   }
    444   return a_begin == a_end && b_begin == b_end;
    445 }
    446 
    447 bool LowerCaseEqualsASCII(const char16* a_begin,
    448                           const char16* a_end,
    449                           const char* b) {
    450   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
    451 }
    452 
    453 }  // namespace url_util
    454