Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 // Functions to canonicalize "standard" URLs, which are ones that have an
     31 // authority section including a host name.
     32 
     33 #include "googleurl/src/url_canon.h"
     34 #include "googleurl/src/url_canon_internal.h"
     35 
     36 namespace url_canon {
     37 
     38 namespace {
     39 
     40 template<typename CHAR, typename UCHAR>
     41 bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
     42                                const url_parse::Parsed& parsed,
     43                                CharsetConverter* query_converter,
     44                                CanonOutput* output,
     45                                url_parse::Parsed* new_parsed) {
     46   // Scheme: this will append the colon.
     47   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
     48                                     output, &new_parsed->scheme);
     49 
     50   // Authority (username, password, host, port)
     51   bool have_authority;
     52   if (parsed.username.is_valid() || parsed.password.is_valid() ||
     53       parsed.host.is_nonempty() || parsed.port.is_valid()) {
     54     have_authority = true;
     55 
     56     // Only write the authority separators when we have a scheme.
     57     if (parsed.scheme.is_valid()) {
     58       output->push_back('/');
     59       output->push_back('/');
     60     }
     61 
     62     // User info: the canonicalizer will handle the : and @.
     63     success &= CanonicalizeUserInfo(source.username, parsed.username,
     64                                     source.password, parsed.password,
     65                                     output,
     66                                     &new_parsed->username,
     67                                     &new_parsed->password);
     68 
     69     success &= CanonicalizeHost(source.host, parsed.host,
     70                                 output, &new_parsed->host);
     71 
     72     // Host must not be empty for standard URLs.
     73     if (!parsed.host.is_nonempty())
     74       success = false;
     75 
     76     // Port: the port canonicalizer will handle the colon.
     77     int default_port = DefaultPortForScheme(
     78         &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
     79     success &= CanonicalizePort(source.port, parsed.port, default_port,
     80                                 output, &new_parsed->port);
     81   } else {
     82     // No authority, clear the components.
     83     have_authority = false;
     84     new_parsed->host.reset();
     85     new_parsed->username.reset();
     86     new_parsed->password.reset();
     87     new_parsed->port.reset();
     88     success = false;  // Standard URLs must have an authority.
     89   }
     90 
     91   // Path
     92   if (parsed.path.is_valid()) {
     93     success &= CanonicalizePath(source.path, parsed.path,
     94                                 output, &new_parsed->path);
     95   } else if (have_authority ||
     96              parsed.query.is_valid() || parsed.ref.is_valid()) {
     97     // When we have an empty path, make up a path when we have an authority
     98     // or something following the path. The only time we allow an empty
     99     // output path is when there is nothing else.
    100     new_parsed->path = url_parse::Component(output->length(), 1);
    101     output->push_back('/');
    102   } else {
    103     // No path at all
    104     new_parsed->path.reset();
    105   }
    106 
    107   // Query
    108   CanonicalizeQuery(source.query, parsed.query, query_converter,
    109                     output, &new_parsed->query);
    110 
    111   // Ref: ignore failure for this, since the page can probably still be loaded.
    112   CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
    113 
    114   return success;
    115 }
    116 
    117 }  // namespace
    118 
    119 
    120 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
    121 // if the scheme is unknown.
    122 int DefaultPortForScheme(const char* scheme, int scheme_len) {
    123   int default_port = url_parse::PORT_UNSPECIFIED;
    124   switch (scheme_len) {
    125     case 4:
    126       if (!strncmp(scheme, "http", scheme_len))
    127         default_port = 80;
    128       break;
    129     case 5:
    130       if (!strncmp(scheme, "https", scheme_len))
    131         default_port = 443;
    132       break;
    133     case 3:
    134       if (!strncmp(scheme, "ftp", scheme_len))
    135         default_port = 21;
    136       else if (!strncmp(scheme, "wss", scheme_len))
    137         default_port = 443;
    138       break;
    139     case 6:
    140       if (!strncmp(scheme, "gopher", scheme_len))
    141         default_port = 70;
    142       break;
    143     case 2:
    144       if (!strncmp(scheme, "ws", scheme_len))
    145         default_port = 80;
    146       break;
    147   }
    148   return default_port;
    149 }
    150 
    151 bool CanonicalizeStandardURL(const char* spec,
    152                              int spec_len,
    153                              const url_parse::Parsed& parsed,
    154                              CharsetConverter* query_converter,
    155                              CanonOutput* output,
    156                              url_parse::Parsed* new_parsed) {
    157   return DoCanonicalizeStandardURL<char, unsigned char>(
    158       URLComponentSource<char>(spec), parsed, query_converter,
    159       output, new_parsed);
    160 }
    161 
    162 bool CanonicalizeStandardURL(const char16* spec,
    163                              int spec_len,
    164                              const url_parse::Parsed& parsed,
    165                              CharsetConverter* query_converter,
    166                              CanonOutput* output,
    167                              url_parse::Parsed* new_parsed) {
    168   return DoCanonicalizeStandardURL<char16, char16>(
    169       URLComponentSource<char16>(spec), parsed, query_converter,
    170       output, new_parsed);
    171 }
    172 
    173 // It might be nice in the future to optimize this so unchanged components don't
    174 // need to be recanonicalized. This is especially true since the common case for
    175 // ReplaceComponents is removing things we don't want, like reference fragments
    176 // and usernames. These cases can become more efficient if we can assume the
    177 // rest of the URL is OK with these removed (or only the modified parts
    178 // recanonicalized). This would be much more complex to implement, however.
    179 //
    180 // You would also need to update DoReplaceComponents in url_util.cc which
    181 // relies on this re-checking everything (see the comment there for why).
    182 bool ReplaceStandardURL(const char* base,
    183                         const url_parse::Parsed& base_parsed,
    184                         const Replacements<char>& replacements,
    185                         CharsetConverter* query_converter,
    186                         CanonOutput* output,
    187                         url_parse::Parsed* new_parsed) {
    188   URLComponentSource<char> source(base);
    189   url_parse::Parsed parsed(base_parsed);
    190   SetupOverrideComponents(base, replacements, &source, &parsed);
    191   return DoCanonicalizeStandardURL<char, unsigned char>(
    192       source, parsed, query_converter, output, new_parsed);
    193 }
    194 
    195 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
    196 // regular codepath can be used.
    197 bool ReplaceStandardURL(const char* base,
    198                         const url_parse::Parsed& base_parsed,
    199                         const Replacements<char16>& replacements,
    200                         CharsetConverter* query_converter,
    201                         CanonOutput* output,
    202                         url_parse::Parsed* new_parsed) {
    203   RawCanonOutput<1024> utf8;
    204   URLComponentSource<char> source(base);
    205   url_parse::Parsed parsed(base_parsed);
    206   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
    207   return DoCanonicalizeStandardURL<char, unsigned char>(
    208       source, parsed, query_converter, output, new_parsed);
    209 }
    210 
    211 }  // namespace url_canon
    212