Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "url/url_util.h"
      6 
      7 #include <string.h>
      8 #include <vector>
      9 
     10 #include "base/debug/leak_annotations.h"
     11 #include "base/logging.h"
     12 #include "url/url_canon_internal.h"
     13 #include "url/url_file.h"
     14 #include "url/url_util_internal.h"
     15 
     16 namespace url {
     17 
     18 namespace {
     19 
     20 // ASCII-specific tolower.  The standard library's tolower is locale sensitive,
     21 // so we don't want to use it here.
     22 template<class Char>
     23 inline Char ToLowerASCII(Char c) {
     24   return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
     25 }
     26 
     27 // Backend for LowerCaseEqualsASCII.
     28 template<typename Iter>
     29 inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
     30   for (Iter it = a_begin; it != a_end; ++it, ++b) {
     31     if (!*b || ToLowerASCII(*it) != *b)
     32       return false;
     33   }
     34   return *b == 0;
     35 }
     36 
     37 const int kNumStandardURLSchemes = 8;
     38 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
     39   kHttpScheme,
     40   kHttpsScheme,
     41   kFileScheme,  // Yes, file urls can have a hostname!
     42   kFtpScheme,
     43   kGopherScheme,
     44   kWsScheme,    // WebSocket.
     45   kWssScheme,   // WebSocket secure.
     46   kFileSystemScheme,
     47 };
     48 
// List of the currently installed standard schemes. It stays NULL until
// InitStandardSchemes creates it lazily. The vector is only freed when
// Shutdown() is called; otherwise it is deliberately left allocated at exit to
// prevent any destructors from being called that will slow us down or cause
// problems.
std::vector<const char*>* standard_schemes = NULL;

// Set to true by LockStandardSchemes; AddStandardScheme DCHECKs that it is
// still false. See the LockStandardSchemes declaration in the header.
bool standard_schemes_locked = false;
     56 
     57 // Ensures that the standard_schemes list is initialized, does nothing if it
     58 // already has values.
     59 void InitStandardSchemes() {
     60   if (standard_schemes)
     61     return;
     62   standard_schemes = new std::vector<const char*>;
     63   for (int i = 0; i < kNumStandardURLSchemes; i++)
     64     standard_schemes->push_back(kStandardURLSchemes[i]);
     65 }
     66 
     67 // Given a string and a range inside the string, compares it to the given
     68 // lower-case |compare_to| buffer.
     69 template<typename CHAR>
     70 inline bool DoCompareSchemeComponent(const CHAR* spec,
     71                                      const Component& component,
     72                                      const char* compare_to) {
     73   if (!component.is_nonempty())
     74     return compare_to[0] == 0;  // When component is empty, match empty scheme.
     75   return LowerCaseEqualsASCII(&spec[component.begin],
     76                               &spec[component.end()],
     77                               compare_to);
     78 }
     79 
     80 // Returns true if the given scheme identified by |scheme| within |spec| is one
     81 // of the registered "standard" schemes.
     82 template<typename CHAR>
     83 bool DoIsStandard(const CHAR* spec, const Component& scheme) {
     84   if (!scheme.is_nonempty())
     85     return false;  // Empty or invalid schemes are non-standard.
     86 
     87   InitStandardSchemes();
     88   for (size_t i = 0; i < standard_schemes->size(); i++) {
     89     if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
     90                              standard_schemes->at(i)))
     91       return true;
     92   }
     93   return false;
     94 }
     95 
     96 template<typename CHAR>
     97 bool DoFindAndCompareScheme(const CHAR* str,
     98                             int str_len,
     99                             const char* compare,
    100                             Component* found_scheme) {
    101   // Before extracting scheme, canonicalize the URL to remove any whitespace.
    102   // This matches the canonicalization done in DoCanonicalize function.
    103   RawCanonOutputT<CHAR> whitespace_buffer;
    104   int spec_len;
    105   const CHAR* spec = RemoveURLWhitespace(str, str_len,
    106                                          &whitespace_buffer, &spec_len);
    107 
    108   Component our_scheme;
    109   if (!ExtractScheme(spec, spec_len, &our_scheme)) {
    110     // No scheme.
    111     if (found_scheme)
    112       *found_scheme = Component();
    113     return false;
    114   }
    115   if (found_scheme)
    116     *found_scheme = our_scheme;
    117   return DoCompareSchemeComponent(spec, our_scheme, compare);
    118 }
    119 
    120 template<typename CHAR>
    121 bool DoCanonicalize(const CHAR* in_spec,
    122                     int in_spec_len,
    123                     bool trim_path_end,
    124                     CharsetConverter* charset_converter,
    125                     CanonOutput* output,
    126                     Parsed* output_parsed) {
    127   // Remove any whitespace from the middle of the relative URL, possibly
    128   // copying to the new buffer.
    129   RawCanonOutputT<CHAR> whitespace_buffer;
    130   int spec_len;
    131   const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
    132                                          &whitespace_buffer, &spec_len);
    133 
    134   Parsed parsed_input;
    135 #ifdef WIN32
    136   // For Windows, we allow things that look like absolute Windows paths to be
    137   // fixed up magically to file URLs. This is done for IE compatability. For
    138   // example, this will change "c:/foo" into a file URL rather than treating
    139   // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
    140   // There is similar logic in url_canon_relative.cc for
    141   //
    142   // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
    143   // has no meaning as an absolute path name. This is because browsers on Mac
    144   // & Unix don't generally do this, so there is no compatibility reason for
    145   // doing so.
    146   if (DoesBeginUNCPath(spec, 0, spec_len, false) ||
    147       DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
    148     ParseFileURL(spec, spec_len, &parsed_input);
    149     return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter,
    150                                output, output_parsed);
    151   }
    152 #endif
    153 
    154   Component scheme;
    155   if (!ExtractScheme(spec, spec_len, &scheme))
    156     return false;
    157 
    158   // This is the parsed version of the input URL, we have to canonicalize it
    159   // before storing it in our object.
    160   bool success;
    161   if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
    162     // File URLs are special.
    163     ParseFileURL(spec, spec_len, &parsed_input);
    164     success = CanonicalizeFileURL(spec, spec_len, parsed_input,
    165                                   charset_converter, output, output_parsed);
    166   } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) {
    167     // Filesystem URLs are special.
    168     ParseFileSystemURL(spec, spec_len, &parsed_input);
    169     success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input,
    170                                         charset_converter, output,
    171                                         output_parsed);
    172 
    173   } else if (DoIsStandard(spec, scheme)) {
    174     // All "normal" URLs.
    175     ParseStandardURL(spec, spec_len, &parsed_input);
    176     success = CanonicalizeStandardURL(spec, spec_len, parsed_input,
    177                                       charset_converter, output, output_parsed);
    178 
    179   } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
    180     // Mailto are treated like a standard url with only a scheme, path, query
    181     ParseMailtoURL(spec, spec_len, &parsed_input);
    182     success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output,
    183                                     output_parsed);
    184 
    185   } else {
    186     // "Weird" URLs like data: and javascript:
    187     ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
    188     success = CanonicalizePathURL(spec, spec_len, parsed_input, output,
    189                                   output_parsed);
    190   }
    191   return success;
    192 }
    193 
    194 template<typename CHAR>
    195 bool DoResolveRelative(const char* base_spec,
    196                        int base_spec_len,
    197                        const Parsed& base_parsed,
    198                        const CHAR* in_relative,
    199                        int in_relative_length,
    200                        CharsetConverter* charset_converter,
    201                        CanonOutput* output,
    202                        Parsed* output_parsed) {
    203   // Remove any whitespace from the middle of the relative URL, possibly
    204   // copying to the new buffer.
    205   RawCanonOutputT<CHAR> whitespace_buffer;
    206   int relative_length;
    207   const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
    208                                              &whitespace_buffer,
    209                                              &relative_length);
    210   bool base_is_authority_based = false;
    211   bool base_is_hierarchical = false;
    212   if (base_spec &&
    213       base_parsed.scheme.is_nonempty()) {
    214     int after_scheme = base_parsed.scheme.end() + 1;  // Skip past the colon.
    215     int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme,
    216                                               base_spec_len);
    217     base_is_authority_based = num_slashes > 1;
    218     base_is_hierarchical = num_slashes > 0;
    219   }
    220 
    221   bool standard_base_scheme =
    222       base_parsed.scheme.is_nonempty() &&
    223       DoIsStandard(base_spec, base_parsed.scheme);
    224 
    225   bool is_relative;
    226   Component relative_component;
    227   if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length,
    228                      (base_is_hierarchical || standard_base_scheme),
    229                      &is_relative, &relative_component)) {
    230     // Error resolving.
    231     return false;
    232   }
    233 
    234   // Pretend for a moment that |base_spec| is a standard URL. Normally
    235   // non-standard URLs are treated as PathURLs, but if the base has an
    236   // authority we would like to preserve it.
    237   if (is_relative && base_is_authority_based && !standard_base_scheme) {
    238     Parsed base_parsed_authority;
    239     ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority);
    240     if (base_parsed_authority.host.is_nonempty()) {
    241       bool did_resolve_succeed =
    242           ResolveRelativeURL(base_spec, base_parsed_authority, false, relative,
    243                              relative_component, charset_converter, output,
    244                              output_parsed);
    245       // The output_parsed is incorrect at this point (because it was built
    246       // based on base_parsed_authority instead of base_parsed) and needs to be
    247       // re-created.
    248       ParsePathURL(output->data(), output->length(), true,
    249                    output_parsed);
    250       return did_resolve_succeed;
    251     }
    252   } else if (is_relative) {
    253     // Relative, resolve and canonicalize.
    254     bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
    255         DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
    256     return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative,
    257                               relative_component, charset_converter, output,
    258                               output_parsed);
    259   }
    260 
    261   // Not relative, canonicalize the input.
    262   return DoCanonicalize(relative, relative_length, true, charset_converter,
    263                         output, output_parsed);
    264 }
    265 
    266 template<typename CHAR>
    267 bool DoReplaceComponents(const char* spec,
    268                          int spec_len,
    269                          const Parsed& parsed,
    270                          const Replacements<CHAR>& replacements,
    271                          CharsetConverter* charset_converter,
    272                          CanonOutput* output,
    273                          Parsed* out_parsed) {
    274   // If the scheme is overridden, just do a simple string substitution and
    275   // reparse the whole thing. There are lots of edge cases that we really don't
    276   // want to deal with. Like what happens if I replace "http://e:8080/foo"
    277   // with a file. Does it become "file:///E:/8080/foo" where the port number
    278   // becomes part of the path? Parsing that string as a file URL says "yes"
    279   // but almost no sane rule for dealing with the components individually would
    280   // come up with that.
    281   //
    282   // Why allow these crazy cases at all? Programatically, there is almost no
    283   // case for replacing the scheme. The most common case for hitting this is
    284   // in JS when building up a URL using the location object. In this case, the
    285   // JS code expects the string substitution behavior:
    286   //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
    287   if (replacements.IsSchemeOverridden()) {
    288     // Canonicalize the new scheme so it is 8-bit and can be concatenated with
    289     // the existing spec.
    290     RawCanonOutput<128> scheme_replaced;
    291     Component scheme_replaced_parsed;
    292     CanonicalizeScheme(replacements.sources().scheme,
    293                        replacements.components().scheme,
    294                        &scheme_replaced, &scheme_replaced_parsed);
    295 
    296     // We can assume that the input is canonicalized, which means it always has
    297     // a colon after the scheme (or where the scheme would be).
    298     int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
    299                                                     : 1;
    300     if (spec_len - spec_after_colon > 0) {
    301       scheme_replaced.Append(&spec[spec_after_colon],
    302                              spec_len - spec_after_colon);
    303     }
    304 
    305     // We now need to completely re-parse the resulting string since its meaning
    306     // may have changed with the different scheme.
    307     RawCanonOutput<128> recanonicalized;
    308     Parsed recanonicalized_parsed;
    309     DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
    310                    charset_converter,
    311                    &recanonicalized, &recanonicalized_parsed);
    312 
    313     // Recurse using the version with the scheme already replaced. This will now
    314     // use the replacement rules for the new scheme.
    315     //
    316     // Warning: this code assumes that ReplaceComponents will re-check all
    317     // components for validity. This is because we can't fail if DoCanonicalize
    318     // failed above since theoretically the thing making it fail could be
    319     // getting replaced here. If ReplaceComponents didn't re-check everything,
    320     // we wouldn't know if something *not* getting replaced is a problem.
    321     // If the scheme-specific replacers are made more intelligent so they don't
    322     // re-check everything, we should instead recanonicalize the whole thing
    323     // after this call to check validity (this assumes replacing the scheme is
    324     // much much less common than other types of replacements, like clearing the
    325     // ref).
    326     Replacements<CHAR> replacements_no_scheme = replacements;
    327     replacements_no_scheme.SetScheme(NULL, Component());
    328     return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
    329                                recanonicalized_parsed, replacements_no_scheme,
    330                                charset_converter, output, out_parsed);
    331   }
    332 
    333   // If we get here, then we know the scheme doesn't need to be replaced, so can
    334   // just key off the scheme in the spec to know how to do the replacements.
    335   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) {
    336     return ReplaceFileURL(spec, parsed, replacements, charset_converter, output,
    337                           out_parsed);
    338   }
    339   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) {
    340     return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
    341                                 output, out_parsed);
    342   }
    343   if (DoIsStandard(spec, parsed.scheme)) {
    344     return ReplaceStandardURL(spec, parsed, replacements, charset_converter,
    345                               output, out_parsed);
    346   }
    347   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) {
    348     return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed);
    349   }
    350 
    351   // Default is a path URL.
    352   return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
    353 }
    354 
    355 }  // namespace
    356 
// Makes sure the list of standard schemes has been created by calling
// InitStandardSchemes, which is a no-op if the list already exists.
void Initialize() {
  InitStandardSchemes();
}
    360 
    361 void Shutdown() {
    362   if (standard_schemes) {
    363     delete standard_schemes;
    364     standard_schemes = NULL;
    365   }
    366 }
    367 
    368 void AddStandardScheme(const char* new_scheme) {
    369   // If this assert triggers, it means you've called AddStandardScheme after
    370   // LockStandardSchemes have been called (see the header file for
    371   // LockStandardSchemes for more).
    372   //
    373   // This normally means you're trying to set up a new standard scheme too late
    374   // in your application's init process. Locate where your app does this
    375   // initialization and calls LockStandardScheme, and add your new standard
    376   // scheme there.
    377   DCHECK(!standard_schemes_locked) <<
    378       "Trying to add a standard scheme after the list has been locked.";
    379 
    380   size_t scheme_len = strlen(new_scheme);
    381   if (scheme_len == 0)
    382     return;
    383 
    384   // Dulicate the scheme into a new buffer and add it to the list of standard
    385   // schemes. This pointer will be leaked on shutdown.
    386   char* dup_scheme = new char[scheme_len + 1];
    387   ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme);
    388   memcpy(dup_scheme, new_scheme, scheme_len + 1);
    389 
    390   InitStandardSchemes();
    391   standard_schemes->push_back(dup_scheme);
    392 }
    393 
// Marks the standard scheme list as locked. After this call the DCHECK in
// AddStandardScheme fires if a further scheme is added.
void LockStandardSchemes() {
  standard_schemes_locked = true;
}
    397 
// char overload: returns true if the |scheme| range of |spec| names one of the
// registered standard schemes. Forwards to DoIsStandard.
bool IsStandard(const char* spec, const Component& scheme) {
  return DoIsStandard(spec, scheme);
}
    401 
// base::char16 overload: returns true if the |scheme| range of |spec| names
// one of the registered standard schemes. Forwards to DoIsStandard.
bool IsStandard(const base::char16* spec, const Component& scheme) {
  return DoIsStandard(spec, scheme);
}
    405 
// char overload: extracts the scheme of |str| and compares it with the
// lower-case string |compare|. |found_scheme|, when non-NULL, receives the
// scheme range (or an empty Component if there is no scheme). Forwards to
// DoFindAndCompareScheme.
bool FindAndCompareScheme(const char* str,
                          int str_len,
                          const char* compare,
                          Component* found_scheme) {
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
}
    412 
// base::char16 overload: extracts the scheme of |str| and compares it with the
// lower-case string |compare|. |found_scheme|, when non-NULL, receives the
// scheme range (or an empty Component if there is no scheme). Forwards to
// DoFindAndCompareScheme.
bool FindAndCompareScheme(const base::char16* str,
                          int str_len,
                          const char* compare,
                          Component* found_scheme) {
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
}
    419 
// char overload: canonicalizes |spec| into |output| and fills |output_parsed|.
// Forwards every argument to DoCanonicalize and returns its result.
bool Canonicalize(const char* spec,
                  int spec_len,
                  bool trim_path_end,
                  CharsetConverter* charset_converter,
                  CanonOutput* output,
                  Parsed* output_parsed) {
  return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
                        output, output_parsed);
}
    429 
// base::char16 overload: canonicalizes |spec| into |output| and fills
// |output_parsed|. Forwards every argument to DoCanonicalize and returns its
// result.
bool Canonicalize(const base::char16* spec,
                  int spec_len,
                  bool trim_path_end,
                  CharsetConverter* charset_converter,
                  CanonOutput* output,
                  Parsed* output_parsed) {
  return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
                        output, output_parsed);
}
    439 
// Overload for a char |relative| string: resolves |relative| against the base
// URL given by |base_spec| / |base_parsed|, writing to |output| and
// |output_parsed|. Forwards to DoResolveRelative and returns its result.
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const Parsed& base_parsed,
                     const char* relative,
                     int relative_length,
                     CharsetConverter* charset_converter,
                     CanonOutput* output,
                     Parsed* output_parsed) {
  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
                           relative, relative_length,
                           charset_converter, output, output_parsed);
}
    452 
// Overload for a base::char16 |relative| string (the base is still char):
// resolves |relative| against the base URL given by |base_spec| /
// |base_parsed|, writing to |output| and |output_parsed|. Forwards to
// DoResolveRelative and returns its result.
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const Parsed& base_parsed,
                     const base::char16* relative,
                     int relative_length,
                     CharsetConverter* charset_converter,
                     CanonOutput* output,
                     Parsed* output_parsed) {
  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
                           relative, relative_length,
                           charset_converter, output, output_parsed);
}
    465 
// Overload for char replacement strings: applies |replacements| to the URL
// |spec| / |parsed| and writes the result to |output| and |out_parsed|.
// Forwards to DoReplaceComponents and returns its result.
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const Parsed& parsed,
                       const Replacements<char>& replacements,
                       CharsetConverter* charset_converter,
                       CanonOutput* output,
                       Parsed* out_parsed) {
  return DoReplaceComponents(spec, spec_len, parsed, replacements,
                             charset_converter, output, out_parsed);
}
    476 
// Overload for base::char16 replacement strings (|spec| itself is still char):
// applies |replacements| to the URL |spec| / |parsed| and writes the result to
// |output| and |out_parsed|. Forwards to DoReplaceComponents and returns its
// result.
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const Parsed& parsed,
                       const Replacements<base::char16>& replacements,
                       CharsetConverter* charset_converter,
                       CanonOutput* output,
                       Parsed* out_parsed) {
  return DoReplaceComponents(spec, spec_len, parsed, replacements,
                             charset_converter, output, out_parsed);
}
    487 
// Front-ends for LowerCaseEqualsASCII.

// char range versus NUL-terminated |b|: returns true if [a_begin, a_end),
// lower-cased per ASCII, equals |b| exactly. Forwards to
// DoLowerCaseEqualsASCII.
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
    494 
    495 bool LowerCaseEqualsASCII(const char* a_begin,
    496                           const char* a_end,
    497                           const char* b_begin,
    498                           const char* b_end) {
    499   while (a_begin != a_end && b_begin != b_end &&
    500          ToLowerASCII(*a_begin) == *b_begin) {
    501     a_begin++;
    502     b_begin++;
    503   }
    504   return a_begin == a_end && b_begin == b_end;
    505 }
    506 
// base::char16 range versus NUL-terminated |b|: returns true if
// [a_begin, a_end), lower-cased per ASCII, equals |b| exactly. Forwards to
// DoLowerCaseEqualsASCII.
bool LowerCaseEqualsASCII(const base::char16* a_begin,
                          const base::char16* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
    512 
    513 void DecodeURLEscapeSequences(const char* input,
    514                               int length,
    515                               CanonOutputW* output) {
    516   RawCanonOutputT<char> unescaped_chars;
    517   for (int i = 0; i < length; i++) {
    518     if (input[i] == '%') {
    519       unsigned char ch;
    520       if (DecodeEscaped(input, &i, length, &ch)) {
    521         unescaped_chars.push_back(ch);
    522       } else {
    523         // Invalid escape sequence, copy the percent literal.
    524         unescaped_chars.push_back('%');
    525       }
    526     } else {
    527       // Regular non-escaped 8-bit character.
    528       unescaped_chars.push_back(input[i]);
    529     }
    530   }
    531 
    532   // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
    533   // JavaScript URLs, but Firefox and Safari do.
    534   for (int i = 0; i < unescaped_chars.length(); i++) {
    535     unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
    536     if (uch < 0x80) {
    537       // Non-UTF-8, just append directly
    538       output->push_back(uch);
    539     } else {
    540       // next_ch will point to the last character of the decoded
    541       // character.
    542       int next_character = i;
    543       unsigned code_point;
    544       if (ReadUTFChar(unescaped_chars.data(), &next_character,
    545                       unescaped_chars.length(), &code_point)) {
    546         // Valid UTF-8 character, convert to UTF-16.
    547         AppendUTF16Value(code_point, output);
    548         i = next_character;
    549       } else {
    550         // If there are any sequences that are not valid UTF-8, we keep
    551         // invalid code points and promote to UTF-16. We copy all characters
    552         // from the current position to the end of the identified sequence.
    553         while (i < next_character) {
    554           output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
    555           i++;
    556         }
    557         output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
    558       }
    559     }
    560   }
    561 }
    562 
    563 void EncodeURIComponent(const char* input, int length, CanonOutput* output) {
    564   for (int i = 0; i < length; ++i) {
    565     unsigned char c = static_cast<unsigned char>(input[i]);
    566     if (IsComponentChar(c))
    567       output->push_back(c);
    568     else
    569       AppendEscapedChar(c, output);
    570   }
    571 }
    572 
// char overload: compares the |component| range of |spec| with the lower-case
// string |compare_to|; an empty component only matches an empty |compare_to|.
// Forwards to DoCompareSchemeComponent.
bool CompareSchemeComponent(const char* spec,
                            const Component& component,
                            const char* compare_to) {
  return DoCompareSchemeComponent(spec, component, compare_to);
}
    578 
// base::char16 overload: compares the |component| range of |spec| with the
// lower-case string |compare_to|; an empty component only matches an empty
// |compare_to|. Forwards to DoCompareSchemeComponent.
bool CompareSchemeComponent(const base::char16* spec,
                            const Component& component,
                            const char* compare_to) {
  return DoCompareSchemeComponent(spec, component, compare_to);
}
    584 
    585 }  // namespace url
    586