Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Functions to canonicalize "standard" URLs, which are ones that have an
      6 // authority section including a host name.
      7 
      8 #include "url/url_canon.h"
      9 #include "url/url_canon_internal.h"
     10 
     11 namespace url_canon {
     12 
     13 namespace {
     14 
     15 template<typename CHAR, typename UCHAR>
     16 bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
     17                                const url_parse::Parsed& parsed,
     18                                CharsetConverter* query_converter,
     19                                CanonOutput* output,
     20                                url_parse::Parsed* new_parsed) {
     21   // Scheme: this will append the colon.
     22   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
     23                                     output, &new_parsed->scheme);
     24 
     25   // Authority (username, password, host, port)
     26   bool have_authority;
     27   if (parsed.username.is_valid() || parsed.password.is_valid() ||
     28       parsed.host.is_nonempty() || parsed.port.is_valid()) {
     29     have_authority = true;
     30 
     31     // Only write the authority separators when we have a scheme.
     32     if (parsed.scheme.is_valid()) {
     33       output->push_back('/');
     34       output->push_back('/');
     35     }
     36 
     37     // User info: the canonicalizer will handle the : and @.
     38     success &= CanonicalizeUserInfo(source.username, parsed.username,
     39                                     source.password, parsed.password,
     40                                     output,
     41                                     &new_parsed->username,
     42                                     &new_parsed->password);
     43 
     44     success &= CanonicalizeHost(source.host, parsed.host,
     45                                 output, &new_parsed->host);
     46 
     47     // Host must not be empty for standard URLs.
     48     if (!parsed.host.is_nonempty())
     49       success = false;
     50 
     51     // Port: the port canonicalizer will handle the colon.
     52     int default_port = DefaultPortForScheme(
     53         &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
     54     success &= CanonicalizePort(source.port, parsed.port, default_port,
     55                                 output, &new_parsed->port);
     56   } else {
     57     // No authority, clear the components.
     58     have_authority = false;
     59     new_parsed->host.reset();
     60     new_parsed->username.reset();
     61     new_parsed->password.reset();
     62     new_parsed->port.reset();
     63     success = false;  // Standard URLs must have an authority.
     64   }
     65 
     66   // Path
     67   if (parsed.path.is_valid()) {
     68     success &= CanonicalizePath(source.path, parsed.path,
     69                                 output, &new_parsed->path);
     70   } else if (have_authority ||
     71              parsed.query.is_valid() || parsed.ref.is_valid()) {
     72     // When we have an empty path, make up a path when we have an authority
     73     // or something following the path. The only time we allow an empty
     74     // output path is when there is nothing else.
     75     new_parsed->path = url_parse::Component(output->length(), 1);
     76     output->push_back('/');
     77   } else {
     78     // No path at all
     79     new_parsed->path.reset();
     80   }
     81 
     82   // Query
     83   CanonicalizeQuery(source.query, parsed.query, query_converter,
     84                     output, &new_parsed->query);
     85 
     86   // Ref: ignore failure for this, since the page can probably still be loaded.
     87   CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
     88 
     89   return success;
     90 }
     91 
     92 }  // namespace
     93 
     94 
     95 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
     96 // if the scheme is unknown.
     97 int DefaultPortForScheme(const char* scheme, int scheme_len) {
     98   int default_port = url_parse::PORT_UNSPECIFIED;
     99   switch (scheme_len) {
    100     case 4:
    101       if (!strncmp(scheme, "http", scheme_len))
    102         default_port = 80;
    103       break;
    104     case 5:
    105       if (!strncmp(scheme, "https", scheme_len))
    106         default_port = 443;
    107       break;
    108     case 3:
    109       if (!strncmp(scheme, "ftp", scheme_len))
    110         default_port = 21;
    111       else if (!strncmp(scheme, "wss", scheme_len))
    112         default_port = 443;
    113       break;
    114     case 6:
    115       if (!strncmp(scheme, "gopher", scheme_len))
    116         default_port = 70;
    117       break;
    118     case 2:
    119       if (!strncmp(scheme, "ws", scheme_len))
    120         default_port = 80;
    121       break;
    122   }
    123   return default_port;
    124 }
    125 
    126 bool CanonicalizeStandardURL(const char* spec,
    127                              int spec_len,
    128                              const url_parse::Parsed& parsed,
    129                              CharsetConverter* query_converter,
    130                              CanonOutput* output,
    131                              url_parse::Parsed* new_parsed) {
    132   return DoCanonicalizeStandardURL<char, unsigned char>(
    133       URLComponentSource<char>(spec), parsed, query_converter,
    134       output, new_parsed);
    135 }
    136 
    137 bool CanonicalizeStandardURL(const base::char16* spec,
    138                              int spec_len,
    139                              const url_parse::Parsed& parsed,
    140                              CharsetConverter* query_converter,
    141                              CanonOutput* output,
    142                              url_parse::Parsed* new_parsed) {
    143   return DoCanonicalizeStandardURL<base::char16, base::char16>(
    144       URLComponentSource<base::char16>(spec), parsed, query_converter,
    145       output, new_parsed);
    146 }
    147 
    148 // It might be nice in the future to optimize this so unchanged components don't
    149 // need to be recanonicalized. This is especially true since the common case for
    150 // ReplaceComponents is removing things we don't want, like reference fragments
    151 // and usernames. These cases can become more efficient if we can assume the
    152 // rest of the URL is OK with these removed (or only the modified parts
    153 // recanonicalized). This would be much more complex to implement, however.
    154 //
    155 // You would also need to update DoReplaceComponents in url_util.cc which
    156 // relies on this re-checking everything (see the comment there for why).
    157 bool ReplaceStandardURL(const char* base,
    158                         const url_parse::Parsed& base_parsed,
    159                         const Replacements<char>& replacements,
    160                         CharsetConverter* query_converter,
    161                         CanonOutput* output,
    162                         url_parse::Parsed* new_parsed) {
    163   URLComponentSource<char> source(base);
    164   url_parse::Parsed parsed(base_parsed);
    165   SetupOverrideComponents(base, replacements, &source, &parsed);
    166   return DoCanonicalizeStandardURL<char, unsigned char>(
    167       source, parsed, query_converter, output, new_parsed);
    168 }
    169 
    170 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
    171 // regular codepath can be used.
    172 bool ReplaceStandardURL(const char* base,
    173                         const url_parse::Parsed& base_parsed,
    174                         const Replacements<base::char16>& replacements,
    175                         CharsetConverter* query_converter,
    176                         CanonOutput* output,
    177                         url_parse::Parsed* new_parsed) {
    178   RawCanonOutput<1024> utf8;
    179   URLComponentSource<char> source(base);
    180   url_parse::Parsed parsed(base_parsed);
    181   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
    182   return DoCanonicalizeStandardURL<char, unsigned char>(
    183       source, parsed, query_converter, output, new_parsed);
    184 }
    185 
    186 }  // namespace url_canon
    187