1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Functions to canonicalize "standard" URLs, which are ones that have an 6 // authority section including a host name. 7 8 #include "url/url_canon.h" 9 #include "url/url_canon_internal.h" 10 #include "url/url_constants.h" 11 12 namespace url { 13 14 namespace { 15 16 template<typename CHAR, typename UCHAR> 17 bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source, 18 const Parsed& parsed, 19 CharsetConverter* query_converter, 20 CanonOutput* output, 21 Parsed* new_parsed) { 22 // Scheme: this will append the colon. 23 bool success = CanonicalizeScheme(source.scheme, parsed.scheme, 24 output, &new_parsed->scheme); 25 26 // Authority (username, password, host, port) 27 bool have_authority; 28 if (parsed.username.is_valid() || parsed.password.is_valid() || 29 parsed.host.is_nonempty() || parsed.port.is_valid()) { 30 have_authority = true; 31 32 // Only write the authority separators when we have a scheme. 33 if (parsed.scheme.is_valid()) { 34 output->push_back('/'); 35 output->push_back('/'); 36 } 37 38 // User info: the canonicalizer will handle the : and @. 39 success &= CanonicalizeUserInfo(source.username, parsed.username, 40 source.password, parsed.password, 41 output, 42 &new_parsed->username, 43 &new_parsed->password); 44 45 success &= CanonicalizeHost(source.host, parsed.host, 46 output, &new_parsed->host); 47 48 // Host must not be empty for standard URLs. 49 if (!parsed.host.is_nonempty()) 50 success = false; 51 52 // Port: the port canonicalizer will handle the colon. 53 int default_port = DefaultPortForScheme( 54 &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len); 55 success &= CanonicalizePort(source.port, parsed.port, default_port, 56 output, &new_parsed->port); 57 } else { 58 // No authority, clear the components. 59 have_authority = false; 60 new_parsed->host.reset(); 61 new_parsed->username.reset(); 62 new_parsed->password.reset(); 63 new_parsed->port.reset(); 64 success = false; // Standard URLs must have an authority. 65 } 66 67 // Path 68 if (parsed.path.is_valid()) { 69 success &= CanonicalizePath(source.path, parsed.path, 70 output, &new_parsed->path); 71 } else if (have_authority || 72 parsed.query.is_valid() || parsed.ref.is_valid()) { 73 // When we have an empty path, make up a path when we have an authority 74 // or something following the path. The only time we allow an empty 75 // output path is when there is nothing else. 76 new_parsed->path = Component(output->length(), 1); 77 output->push_back('/'); 78 } else { 79 // No path at all 80 new_parsed->path.reset(); 81 } 82 83 // Query 84 CanonicalizeQuery(source.query, parsed.query, query_converter, 85 output, &new_parsed->query); 86 87 // Ref: ignore failure for this, since the page can probably still be loaded. 88 CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); 89 90 return success; 91 } 92 93 } // namespace 94 95 96 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED 97 // if the scheme is unknown. 98 int DefaultPortForScheme(const char* scheme, int scheme_len) { 99 int default_port = PORT_UNSPECIFIED; 100 switch (scheme_len) { 101 case 4: 102 if (!strncmp(scheme, kHttpScheme, scheme_len)) 103 default_port = 80; 104 break; 105 case 5: 106 if (!strncmp(scheme, kHttpsScheme, scheme_len)) 107 default_port = 443; 108 break; 109 case 3: 110 if (!strncmp(scheme, kFtpScheme, scheme_len)) 111 default_port = 21; 112 else if (!strncmp(scheme, kWssScheme, scheme_len)) 113 default_port = 443; 114 break; 115 case 6: 116 if (!strncmp(scheme, kGopherScheme, scheme_len)) 117 default_port = 70; 118 break; 119 case 2: 120 if (!strncmp(scheme, kWsScheme, scheme_len)) 121 default_port = 80; 122 break; 123 } 124 return default_port; 125 } 126 127 bool CanonicalizeStandardURL(const char* spec, 128 int spec_len, 129 const Parsed& parsed, 130 CharsetConverter* query_converter, 131 CanonOutput* output, 132 Parsed* new_parsed) { 133 return DoCanonicalizeStandardURL<char, unsigned char>( 134 URLComponentSource<char>(spec), parsed, query_converter, 135 output, new_parsed); 136 } 137 138 bool CanonicalizeStandardURL(const base::char16* spec, 139 int spec_len, 140 const Parsed& parsed, 141 CharsetConverter* query_converter, 142 CanonOutput* output, 143 Parsed* new_parsed) { 144 return DoCanonicalizeStandardURL<base::char16, base::char16>( 145 URLComponentSource<base::char16>(spec), parsed, query_converter, 146 output, new_parsed); 147 } 148 149 // It might be nice in the future to optimize this so unchanged components don't 150 // need to be recanonicalized. This is especially true since the common case for 151 // ReplaceComponents is removing things we don't want, like reference fragments 152 // and usernames. These cases can become more efficient if we can assume the 153 // rest of the URL is OK with these removed (or only the modified parts 154 // recanonicalized). This would be much more complex to implement, however. 155 // 156 // You would also need to update DoReplaceComponents in url_util.cc which 157 // relies on this re-checking everything (see the comment there for why). 158 bool ReplaceStandardURL(const char* base, 159 const Parsed& base_parsed, 160 const Replacements<char>& replacements, 161 CharsetConverter* query_converter, 162 CanonOutput* output, 163 Parsed* new_parsed) { 164 URLComponentSource<char> source(base); 165 Parsed parsed(base_parsed); 166 SetupOverrideComponents(base, replacements, &source, &parsed); 167 return DoCanonicalizeStandardURL<char, unsigned char>( 168 source, parsed, query_converter, output, new_parsed); 169 } 170 171 // For 16-bit replacements, we turn all the replacements into UTF-8 so the 172 // regular codepath can be used. 173 bool ReplaceStandardURL(const char* base, 174 const Parsed& base_parsed, 175 const Replacements<base::char16>& replacements, 176 CharsetConverter* query_converter, 177 CanonOutput* output, 178 Parsed* new_parsed) { 179 RawCanonOutput<1024> utf8; 180 URLComponentSource<char> source(base); 181 Parsed parsed(base_parsed); 182 SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); 183 return DoCanonicalizeStandardURL<char, unsigned char>( 184 source, parsed, query_converter, output, new_parsed); 185 } 186 187 } // namespace url 188