1 // Copyright 2007, Google Inc. 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions are 6 // met: 7 // 8 // * Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // * Redistributions in binary form must reproduce the above 11 // copyright notice, this list of conditions and the following disclaimer 12 // in the documentation and/or other materials provided with the 13 // distribution. 14 // * Neither the name of Google Inc. nor the names of its 15 // contributors may be used to endorse or promote products derived from 16 // this software without specific prior written permission. 17 // 18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30 #include <string.h> 31 #include <vector> 32 33 #include "googleurl/src/url_util.h" 34 35 #include "base/logging.h" 36 #include "googleurl/src/url_file.h" 37 38 namespace url_util { 39 40 namespace { 41 42 // ASCII-specific tolower. The standard library's tolower is locale sensitive, 43 // so we don't want to use it here. 44 template <class Char> inline Char ToLowerASCII(Char c) { 45 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; 46 } 47 48 // Backend for LowerCaseEqualsASCII. 49 template<typename Iter> 50 inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) { 51 for (Iter it = a_begin; it != a_end; ++it, ++b) { 52 if (!*b || ToLowerASCII(*it) != *b) 53 return false; 54 } 55 return *b == 0; 56 } 57 58 const char kFileScheme[] = "file"; // Used in a number of places. 59 const char kMailtoScheme[] = "mailto"; 60 61 const int kNumStandardURLSchemes = 5; 62 const char* kStandardURLSchemes[kNumStandardURLSchemes] = { 63 "http", 64 "https", 65 kFileScheme, // Yes, file urls can have a hostname! 66 "ftp", 67 "gopher", 68 }; 69 70 // List of the currently installed standard schemes. This list is lazily 71 // initialized by InitStandardSchemes and is leaked on shutdown to prevent 72 // any destructors from being called that will slow us down or cause problems. 73 std::vector<const char*>* standard_schemes = NULL; 74 75 // Ensures that the standard_schemes list is initialized, does nothing if it 76 // already has values. 77 void InitStandardSchemes() { 78 if (standard_schemes) 79 return; 80 standard_schemes = new std::vector<const char*>; 81 for (int i = 0; i < kNumStandardURLSchemes; i++) 82 standard_schemes->push_back(kStandardURLSchemes[i]); 83 } 84 85 // Given a string and a range inside the string, compares it to the given 86 // lower-case |compare_to| buffer. 87 template<typename CHAR> 88 inline bool CompareSchemeComponent(const CHAR* spec, 89 const url_parse::Component& component, 90 const char* compare_to) { 91 if (!component.is_nonempty()) 92 return compare_to[0] == 0; // When component is empty, match empty scheme. 93 return LowerCaseEqualsASCII(&spec[component.begin], 94 &spec[component.end()], 95 compare_to); 96 } 97 98 // Returns true if the given scheme identified by |scheme| within |spec| is one 99 // of the registered "standard" schemes. Note that this does not check for 100 // "://", use IsStandard for that. 101 template<typename CHAR> 102 bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) { 103 if (!scheme.is_nonempty()) 104 return false; // Empty or invalid schemes are non-standard. 105 106 InitStandardSchemes(); 107 for (size_t i = 0; i < standard_schemes->size(); i++) { 108 if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()], 109 standard_schemes->at(i))) 110 return true; 111 } 112 return false; 113 } 114 115 // Returns true if the stuff following the scheme in the given spec indicates 116 // a "standard" URL. The presence of "://" after the scheme indicates that 117 // there is a hostname, etc. which we call a standard URL. 118 template<typename CHAR> 119 bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len, 120 const url_parse::Component& scheme) { 121 int after_scheme = scheme.end(); 122 if (spec_len < after_scheme + 3) 123 return false; 124 return spec[after_scheme] == ':' && 125 spec[after_scheme + 1] == '/' && 126 spec[after_scheme + 2] == '/'; 127 } 128 129 template<typename CHAR> 130 bool DoIsStandard(const CHAR* spec, int spec_len, 131 const url_parse::Component& scheme) { 132 return HasStandardSchemeSeparator(spec, spec_len, scheme) || 133 IsStandardScheme(spec, scheme); 134 } 135 136 template<typename CHAR> 137 bool DoFindAndCompareScheme(const CHAR* str, 138 int str_len, 139 const char* compare, 140 url_parse::Component* found_scheme) { 141 url_parse::Component our_scheme; 142 if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) { 143 // No scheme. 144 if (found_scheme) 145 *found_scheme = url_parse::Component(); 146 return false; 147 } 148 if (found_scheme) 149 *found_scheme = our_scheme; 150 return CompareSchemeComponent(str, our_scheme, compare); 151 } 152 153 template<typename CHAR> 154 bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, 155 url_canon::CharsetConverter* charset_converter, 156 url_canon::CanonOutput* output, 157 url_parse::Parsed* output_parsed) { 158 // Remove any whitespace from the middle of the relative URL, possibly 159 // copying to the new buffer. 160 url_canon::RawCanonOutputT<CHAR> whitespace_buffer; 161 int spec_len; 162 const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len, 163 &whitespace_buffer, &spec_len); 164 165 url_parse::Parsed parsed_input; 166 #ifdef WIN32 167 // For Windows, we allow things that look like absolute Windows paths to be 168 // fixed up magically to file URLs. This is done for IE compatability. For 169 // example, this will change "c:/foo" into a file URL rather than treating 170 // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt"). 171 // There is similar logic in url_canon_relative.cc for 172 // 173 // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which 174 // has no meaning as an absolute path name. This is because browsers on Mac 175 // & Unix don't generally do this, so there is no compatibility reason for 176 // doing so. 177 if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) || 178 url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) { 179 url_parse::ParseFileURL(spec, spec_len, &parsed_input); 180 return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input, 181 charset_converter, 182 output, output_parsed); 183 } 184 #endif 185 186 url_parse::Component scheme; 187 if(!url_parse::ExtractScheme(spec, spec_len, &scheme)) 188 return false; 189 190 // This is the parsed version of the input URL, we have to canonicalize it 191 // before storing it in our object. 192 bool success; 193 if (CompareSchemeComponent(spec, scheme, kFileScheme)) { 194 // File URLs are special. 195 url_parse::ParseFileURL(spec, spec_len, &parsed_input); 196 success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input, 197 charset_converter, 198 output, output_parsed); 199 200 } else if (IsStandard(spec, spec_len, scheme)) { 201 // All "normal" URLs. 202 url_parse::ParseStandardURL(spec, spec_len, &parsed_input); 203 success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input, 204 charset_converter, 205 output, output_parsed); 206 207 } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) { 208 // Mailto are treated like a standard url with only a scheme, path, query 209 url_parse::ParseMailtoURL(spec, spec_len, &parsed_input); 210 success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input, 211 output, output_parsed); 212 213 } else { 214 // "Weird" URLs like data: and javascript: 215 url_parse::ParsePathURL(spec, spec_len, &parsed_input); 216 success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input, 217 output, output_parsed); 218 } 219 return success; 220 } 221 222 template<typename CHAR> 223 bool DoResolveRelative(const char* base_spec, 224 int base_spec_len, 225 const url_parse::Parsed& base_parsed, 226 const CHAR* in_relative, 227 int in_relative_length, 228 url_canon::CharsetConverter* charset_converter, 229 url_canon::CanonOutput* output, 230 url_parse::Parsed* output_parsed) { 231 // Remove any whitespace from the middle of the relative URL, possibly 232 // copying to the new buffer. 233 url_canon::RawCanonOutputT<CHAR> whitespace_buffer; 234 int relative_length; 235 const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length, 236 &whitespace_buffer, 237 &relative_length); 238 239 // See if our base URL should be treated as "standard". 240 bool standard_base_scheme = 241 base_parsed.scheme.is_nonempty() && 242 IsStandard(base_spec, base_spec_len, base_parsed.scheme); 243 244 bool is_relative; 245 url_parse::Component relative_component; 246 if (!url_canon::IsRelativeURL(base_spec, base_parsed, 247 relative, relative_length, 248 standard_base_scheme, 249 &is_relative, 250 &relative_component)) { 251 // Error resolving. 252 return false; 253 } 254 255 if (is_relative) { 256 // Relative, resolve and canonicalize. 257 bool file_base_scheme = base_parsed.scheme.is_nonempty() && 258 CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme); 259 return url_canon::ResolveRelativeURL(base_spec, base_parsed, 260 file_base_scheme, relative, 261 relative_component, charset_converter, 262 output, output_parsed); 263 } 264 265 // Not relative, canonicalize the input. 266 return DoCanonicalize(relative, relative_length, charset_converter, 267 output, output_parsed); 268 } 269 270 template<typename CHAR> 271 bool DoReplaceComponents(const char* spec, 272 int spec_len, 273 const url_parse::Parsed& parsed, 274 const url_canon::Replacements<CHAR>& replacements, 275 url_canon::CharsetConverter* charset_converter, 276 url_canon::CanonOutput* output, 277 url_parse::Parsed* out_parsed) { 278 // Note that we dispatch to the parser according the the scheme type of 279 // the OUTPUT URL. Normally, this is the same as our scheme, but if the 280 // scheme is being overridden, we need to test that. 281 282 if (// Either the scheme is not replaced and the old one is a file, 283 (!replacements.IsSchemeOverridden() && 284 CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) || 285 // ...or it is being replaced and the new one is a file. 286 (replacements.IsSchemeOverridden() && 287 CompareSchemeComponent(replacements.sources().scheme, 288 replacements.components().scheme, 289 kFileScheme))) { 290 return url_canon::ReplaceFileURL(spec, parsed, replacements, 291 charset_converter, output, out_parsed); 292 } 293 294 if (// Either the scheme is not replaced and the old one is standard, 295 (!replacements.IsSchemeOverridden() && 296 IsStandard(spec, spec_len, parsed.scheme)) || 297 // ...or it is being replaced and the new one is standard. 298 (replacements.IsSchemeOverridden() && 299 IsStandardScheme(replacements.sources().scheme, 300 replacements.components().scheme))) { 301 // Standard URL with all parts. 302 return url_canon::ReplaceStandardURL(spec, parsed, replacements, 303 charset_converter, output, out_parsed); 304 } 305 306 if (// Either the scheme is not replaced and the old one is mailto, 307 (!replacements.IsSchemeOverridden() && 308 CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) || 309 // ...or it is being replaced and the new one is a mailto. 310 (replacements.IsSchemeOverridden() && 311 CompareSchemeComponent(replacements.sources().scheme, 312 replacements.components().scheme, 313 kMailtoScheme))) { 314 return url_canon::ReplaceMailtoURL(spec, parsed, replacements, 315 output, out_parsed); 316 } 317 318 return url_canon::ReplacePathURL(spec, parsed, replacements, 319 output, out_parsed); 320 } 321 322 } // namespace 323 324 void AddStandardScheme(const char* new_scheme) { 325 size_t scheme_len = strlen(new_scheme); 326 if (scheme_len == 0) 327 return; 328 329 // Dulicate the scheme into a new buffer and add it to the list of standard 330 // schemes. This pointer will be leaked on shutdown. 331 char* dup_scheme = new char[scheme_len + 1]; 332 memcpy(dup_scheme, new_scheme, scheme_len + 1); 333 334 InitStandardSchemes(); 335 standard_schemes->push_back(dup_scheme); 336 } 337 338 bool IsStandard(const char* spec, int spec_len, 339 const url_parse::Component& scheme) { 340 return DoIsStandard(spec, spec_len, scheme); 341 } 342 343 bool IsStandard(const char16* spec, int spec_len, 344 const url_parse::Component& scheme) { 345 return DoIsStandard(spec, spec_len, scheme); 346 } 347 348 bool FindAndCompareScheme(const char* str, 349 int str_len, 350 const char* compare, 351 url_parse::Component* found_scheme) { 352 return DoFindAndCompareScheme(str, str_len, compare, found_scheme); 353 } 354 355 bool FindAndCompareScheme(const char16* str, 356 int str_len, 357 const char* compare, 358 url_parse::Component* found_scheme) { 359 return DoFindAndCompareScheme(str, str_len, compare, found_scheme); 360 } 361 362 bool Canonicalize(const char* spec, 363 int spec_len, 364 url_canon::CharsetConverter* charset_converter, 365 url_canon::CanonOutput* output, 366 url_parse::Parsed* output_parsed) { 367 return DoCanonicalize(spec, spec_len, charset_converter, 368 output, output_parsed); 369 } 370 371 bool Canonicalize(const char16* spec, 372 int spec_len, 373 url_canon::CharsetConverter* charset_converter, 374 url_canon::CanonOutput* output, 375 url_parse::Parsed* output_parsed) { 376 return DoCanonicalize(spec, spec_len, charset_converter, 377 output, output_parsed); 378 } 379 380 bool ResolveRelative(const char* base_spec, 381 int base_spec_len, 382 const url_parse::Parsed& base_parsed, 383 const char* relative, 384 int relative_length, 385 url_canon::CharsetConverter* charset_converter, 386 url_canon::CanonOutput* output, 387 url_parse::Parsed* output_parsed) { 388 return DoResolveRelative(base_spec, base_spec_len, base_parsed, 389 relative, relative_length, 390 charset_converter, output, output_parsed); 391 } 392 393 bool ResolveRelative(const char* base_spec, 394 int base_spec_len, 395 const url_parse::Parsed& base_parsed, 396 const char16* relative, 397 int relative_length, 398 url_canon::CharsetConverter* charset_converter, 399 url_canon::CanonOutput* output, 400 url_parse::Parsed* output_parsed) { 401 return DoResolveRelative(base_spec, base_spec_len, base_parsed, 402 relative, relative_length, 403 charset_converter, output, output_parsed); 404 } 405 406 bool ReplaceComponents(const char* spec, 407 int spec_len, 408 const url_parse::Parsed& parsed, 409 const url_canon::Replacements<char>& replacements, 410 url_canon::CharsetConverter* charset_converter, 411 url_canon::CanonOutput* output, 412 url_parse::Parsed* out_parsed) { 413 return DoReplaceComponents(spec, spec_len, parsed, replacements, 414 charset_converter, output, out_parsed); 415 } 416 417 bool ReplaceComponents(const char* spec, 418 int spec_len, 419 const url_parse::Parsed& parsed, 420 const url_canon::Replacements<char16>& replacements, 421 url_canon::CharsetConverter* charset_converter, 422 url_canon::CanonOutput* output, 423 url_parse::Parsed* out_parsed) { 424 return DoReplaceComponents(spec, spec_len, parsed, replacements, 425 charset_converter, output, out_parsed); 426 } 427 428 // Front-ends for LowerCaseEqualsASCII. 429 bool LowerCaseEqualsASCII(const char* a_begin, 430 const char* a_end, 431 const char* b) { 432 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 433 } 434 435 bool LowerCaseEqualsASCII(const char* a_begin, 436 const char* a_end, 437 const char* b_begin, 438 const char* b_end) { 439 while (a_begin != a_end && b_begin != b_end && 440 ToLowerASCII(*a_begin) == *b_begin) { 441 a_begin++; 442 b_begin++; 443 } 444 return a_begin == a_end && b_begin == b_end; 445 } 446 447 bool LowerCaseEqualsASCII(const char16* a_begin, 448 const char16* a_end, 449 const char* b) { 450 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 451 } 452 453 } // namespace url_util 454