Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Functions for canonicalizing "path" URLs. Not to be confused with the path
      6 // of a URL, these are URLs that have no authority section, only a path. For
      7 // example, "javascript:" and "data:".
      8 
      9 #include "url/url_canon.h"
     10 #include "url/url_canon_internal.h"
     11 
     12 namespace url_canon {
     13 
     14 namespace {
     15 
     16 // Canonicalize the given |component| from |source| into |output| and
     17 // |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
     18 // prior to the canonicalized component; i.e. for the '?' or '#' characters.
     19 template<typename CHAR, typename UCHAR>
     20 bool DoCanonicalizePathComponent(const CHAR* source,
     21                                  const url_parse::Component& component,
     22                                  CHAR seperator,
     23                                  CanonOutput* output,
     24                                  url_parse::Component* new_component) {
     25   bool success = true;
     26   if (component.is_valid()) {
     27     if (seperator)
     28       output->push_back(seperator);
     29     // Copy the path using path URL's more lax escaping rules (think for
     30     // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
     31     // ASCII characters alone. This helps readability of JavaStript.
     32     new_component->begin = output->length();
     33     int end = component.end();
     34     for (int i = component.begin; i < end; i++) {
     35       UCHAR uch = static_cast<UCHAR>(source[i]);
     36       if (uch < 0x20 || uch >= 0x80)
     37         success &= AppendUTF8EscapedChar(source, &i, end, output);
     38       else
     39         output->push_back(static_cast<char>(uch));
     40     }
     41     new_component->len = output->length() - new_component->begin;
     42   } else {
     43     // Empty part.
     44     new_component->reset();
     45   }
     46   return success;
     47 }
     48 
     49 template<typename CHAR, typename UCHAR>
     50 bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
     51                            const url_parse::Parsed& parsed,
     52                            CanonOutput* output,
     53                            url_parse::Parsed* new_parsed) {
     54   // Scheme: this will append the colon.
     55   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
     56                                     output, &new_parsed->scheme);
     57 
     58   // We assume there's no authority for path URLs. Note that hosts should never
     59   // have -1 length.
     60   new_parsed->username.reset();
     61   new_parsed->password.reset();
     62   new_parsed->host.reset();
     63   new_parsed->port.reset();
     64   // We allow path URLs to have the path, query and fragment components, but we
     65   // will canonicalize each of the via the weaker path URL rules.
     66   success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
     67       source.path, parsed.path, 0, output, &new_parsed->path);
     68   success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
     69       source.query, parsed.query, '?', output, &new_parsed->query);
     70   success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
     71       source.ref, parsed.ref, '#', output, &new_parsed->ref);
     72 
     73   return success;
     74 }
     75 
     76 }  // namespace
     77 
     78 bool CanonicalizePathURL(const char* spec,
     79                          int spec_len,
     80                          const url_parse::Parsed& parsed,
     81                          CanonOutput* output,
     82                          url_parse::Parsed* new_parsed) {
     83   return DoCanonicalizePathURL<char, unsigned char>(
     84       URLComponentSource<char>(spec), parsed, output, new_parsed);
     85 }
     86 
     87 bool CanonicalizePathURL(const base::char16* spec,
     88                          int spec_len,
     89                          const url_parse::Parsed& parsed,
     90                          CanonOutput* output,
     91                          url_parse::Parsed* new_parsed) {
     92   return DoCanonicalizePathURL<base::char16, base::char16>(
     93       URLComponentSource<base::char16>(spec), parsed, output, new_parsed);
     94 }
     95 
     96 bool ReplacePathURL(const char* base,
     97                     const url_parse::Parsed& base_parsed,
     98                     const Replacements<char>& replacements,
     99                     CanonOutput* output,
    100                     url_parse::Parsed* new_parsed) {
    101   URLComponentSource<char> source(base);
    102   url_parse::Parsed parsed(base_parsed);
    103   SetupOverrideComponents(base, replacements, &source, &parsed);
    104   return DoCanonicalizePathURL<char, unsigned char>(
    105       source, parsed, output, new_parsed);
    106 }
    107 
    108 bool ReplacePathURL(const char* base,
    109                     const url_parse::Parsed& base_parsed,
    110                     const Replacements<base::char16>& replacements,
    111                     CanonOutput* output,
    112                     url_parse::Parsed* new_parsed) {
    113   RawCanonOutput<1024> utf8;
    114   URLComponentSource<char> source(base);
    115   url_parse::Parsed parsed(base_parsed);
    116   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
    117   return DoCanonicalizePathURL<char, unsigned char>(
    118       source, parsed, output, new_parsed);
    119 }
    120 
    121 }  // namespace url_canon
    122