1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "url/url_util.h" 6 7 #include <string.h> 8 #include <vector> 9 10 #include "base/logging.h" 11 #include "url/url_canon_internal.h" 12 #include "url/url_file.h" 13 #include "url/url_util_internal.h" 14 15 namespace url { 16 17 namespace { 18 19 // ASCII-specific tolower. The standard library's tolower is locale sensitive, 20 // so we don't want to use it here. 21 template<class Char> 22 inline Char ToLowerASCII(Char c) { 23 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; 24 } 25 26 // Backend for LowerCaseEqualsASCII. 27 template<typename Iter> 28 inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) { 29 for (Iter it = a_begin; it != a_end; ++it, ++b) { 30 if (!*b || ToLowerASCII(*it) != *b) 31 return false; 32 } 33 return *b == 0; 34 } 35 36 const int kNumStandardURLSchemes = 8; 37 const char* kStandardURLSchemes[kNumStandardURLSchemes] = { 38 kHttpScheme, 39 kHttpsScheme, 40 kFileScheme, // Yes, file urls can have a hostname! 41 kFtpScheme, 42 kGopherScheme, 43 kWsScheme, // WebSocket. 44 kWssScheme, // WebSocket secure. 45 kFileSystemScheme, 46 }; 47 48 // List of the currently installed standard schemes. This list is lazily 49 // initialized by InitStandardSchemes and is leaked on shutdown to prevent 50 // any destructors from being called that will slow us down or cause problems. 51 std::vector<const char*>* standard_schemes = NULL; 52 53 // See the LockStandardSchemes declaration in the header. 54 bool standard_schemes_locked = false; 55 56 // Ensures that the standard_schemes list is initialized, does nothing if it 57 // already has values. 58 void InitStandardSchemes() { 59 if (standard_schemes) 60 return; 61 standard_schemes = new std::vector<const char*>; 62 for (int i = 0; i < kNumStandardURLSchemes; i++) 63 standard_schemes->push_back(kStandardURLSchemes[i]); 64 } 65 66 // Given a string and a range inside the string, compares it to the given 67 // lower-case |compare_to| buffer. 68 template<typename CHAR> 69 inline bool DoCompareSchemeComponent(const CHAR* spec, 70 const Component& component, 71 const char* compare_to) { 72 if (!component.is_nonempty()) 73 return compare_to[0] == 0; // When component is empty, match empty scheme. 74 return LowerCaseEqualsASCII(&spec[component.begin], 75 &spec[component.end()], 76 compare_to); 77 } 78 79 // Returns true if the given scheme identified by |scheme| within |spec| is one 80 // of the registered "standard" schemes. 81 template<typename CHAR> 82 bool DoIsStandard(const CHAR* spec, const Component& scheme) { 83 if (!scheme.is_nonempty()) 84 return false; // Empty or invalid schemes are non-standard. 85 86 InitStandardSchemes(); 87 for (size_t i = 0; i < standard_schemes->size(); i++) { 88 if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()], 89 standard_schemes->at(i))) 90 return true; 91 } 92 return false; 93 } 94 95 template<typename CHAR> 96 bool DoFindAndCompareScheme(const CHAR* str, 97 int str_len, 98 const char* compare, 99 Component* found_scheme) { 100 // Before extracting scheme, canonicalize the URL to remove any whitespace. 101 // This matches the canonicalization done in DoCanonicalize function. 102 RawCanonOutputT<CHAR> whitespace_buffer; 103 int spec_len; 104 const CHAR* spec = RemoveURLWhitespace(str, str_len, 105 &whitespace_buffer, &spec_len); 106 107 Component our_scheme; 108 if (!ExtractScheme(spec, spec_len, &our_scheme)) { 109 // No scheme. 110 if (found_scheme) 111 *found_scheme = Component(); 112 return false; 113 } 114 if (found_scheme) 115 *found_scheme = our_scheme; 116 return DoCompareSchemeComponent(spec, our_scheme, compare); 117 } 118 119 template<typename CHAR> 120 bool DoCanonicalize(const CHAR* in_spec, 121 int in_spec_len, 122 bool trim_path_end, 123 CharsetConverter* charset_converter, 124 CanonOutput* output, 125 Parsed* output_parsed) { 126 // Remove any whitespace from the middle of the relative URL, possibly 127 // copying to the new buffer. 128 RawCanonOutputT<CHAR> whitespace_buffer; 129 int spec_len; 130 const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len, 131 &whitespace_buffer, &spec_len); 132 133 Parsed parsed_input; 134 #ifdef WIN32 135 // For Windows, we allow things that look like absolute Windows paths to be 136 // fixed up magically to file URLs. This is done for IE compatability. For 137 // example, this will change "c:/foo" into a file URL rather than treating 138 // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt"). 139 // There is similar logic in url_canon_relative.cc for 140 // 141 // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which 142 // has no meaning as an absolute path name. This is because browsers on Mac 143 // & Unix don't generally do this, so there is no compatibility reason for 144 // doing so. 145 if (DoesBeginUNCPath(spec, 0, spec_len, false) || 146 DoesBeginWindowsDriveSpec(spec, 0, spec_len)) { 147 ParseFileURL(spec, spec_len, &parsed_input); 148 return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter, 149 output, output_parsed); 150 } 151 #endif 152 153 Component scheme; 154 if (!ExtractScheme(spec, spec_len, &scheme)) 155 return false; 156 157 // This is the parsed version of the input URL, we have to canonicalize it 158 // before storing it in our object. 159 bool success; 160 if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) { 161 // File URLs are special. 162 ParseFileURL(spec, spec_len, &parsed_input); 163 success = CanonicalizeFileURL(spec, spec_len, parsed_input, 164 charset_converter, output, output_parsed); 165 } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) { 166 // Filesystem URLs are special. 167 ParseFileSystemURL(spec, spec_len, &parsed_input); 168 success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input, 169 charset_converter, output, 170 output_parsed); 171 172 } else if (DoIsStandard(spec, scheme)) { 173 // All "normal" URLs. 174 ParseStandardURL(spec, spec_len, &parsed_input); 175 success = CanonicalizeStandardURL(spec, spec_len, parsed_input, 176 charset_converter, output, output_parsed); 177 178 } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) { 179 // Mailto are treated like a standard url with only a scheme, path, query 180 ParseMailtoURL(spec, spec_len, &parsed_input); 181 success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output, 182 output_parsed); 183 184 } else { 185 // "Weird" URLs like data: and javascript: 186 ParsePathURL(spec, spec_len, trim_path_end, &parsed_input); 187 success = CanonicalizePathURL(spec, spec_len, parsed_input, output, 188 output_parsed); 189 } 190 return success; 191 } 192 193 template<typename CHAR> 194 bool DoResolveRelative(const char* base_spec, 195 int base_spec_len, 196 const Parsed& base_parsed, 197 const CHAR* in_relative, 198 int in_relative_length, 199 CharsetConverter* charset_converter, 200 CanonOutput* output, 201 Parsed* output_parsed) { 202 // Remove any whitespace from the middle of the relative URL, possibly 203 // copying to the new buffer. 204 RawCanonOutputT<CHAR> whitespace_buffer; 205 int relative_length; 206 const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length, 207 &whitespace_buffer, 208 &relative_length); 209 bool base_is_authority_based = false; 210 bool base_is_hierarchical = false; 211 if (base_spec && 212 base_parsed.scheme.is_nonempty()) { 213 int after_scheme = base_parsed.scheme.end() + 1; // Skip past the colon. 214 int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme, 215 base_spec_len); 216 base_is_authority_based = num_slashes > 1; 217 base_is_hierarchical = num_slashes > 0; 218 } 219 220 bool standard_base_scheme = 221 base_parsed.scheme.is_nonempty() && 222 DoIsStandard(base_spec, base_parsed.scheme); 223 224 bool is_relative; 225 Component relative_component; 226 if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length, 227 (base_is_hierarchical || standard_base_scheme), 228 &is_relative, &relative_component)) { 229 // Error resolving. 230 return false; 231 } 232 233 // Pretend for a moment that |base_spec| is a standard URL. Normally 234 // non-standard URLs are treated as PathURLs, but if the base has an 235 // authority we would like to preserve it. 236 if (is_relative && base_is_authority_based && !standard_base_scheme) { 237 Parsed base_parsed_authority; 238 ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority); 239 if (base_parsed_authority.host.is_nonempty()) { 240 bool did_resolve_succeed = 241 ResolveRelativeURL(base_spec, base_parsed_authority, false, relative, 242 relative_component, charset_converter, output, 243 output_parsed); 244 // The output_parsed is incorrect at this point (because it was built 245 // based on base_parsed_authority instead of base_parsed) and needs to be 246 // re-created. 247 ParsePathURL(output->data(), output->length(), true, 248 output_parsed); 249 return did_resolve_succeed; 250 } 251 } else if (is_relative) { 252 // Relative, resolve and canonicalize. 253 bool file_base_scheme = base_parsed.scheme.is_nonempty() && 254 DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme); 255 return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative, 256 relative_component, charset_converter, output, 257 output_parsed); 258 } 259 260 // Not relative, canonicalize the input. 261 return DoCanonicalize(relative, relative_length, true, charset_converter, 262 output, output_parsed); 263 } 264 265 template<typename CHAR> 266 bool DoReplaceComponents(const char* spec, 267 int spec_len, 268 const Parsed& parsed, 269 const Replacements<CHAR>& replacements, 270 CharsetConverter* charset_converter, 271 CanonOutput* output, 272 Parsed* out_parsed) { 273 // If the scheme is overridden, just do a simple string substitution and 274 // reparse the whole thing. There are lots of edge cases that we really don't 275 // want to deal with. Like what happens if I replace "http://e:8080/foo" 276 // with a file. Does it become "file:///E:/8080/foo" where the port number 277 // becomes part of the path? Parsing that string as a file URL says "yes" 278 // but almost no sane rule for dealing with the components individually would 279 // come up with that. 280 // 281 // Why allow these crazy cases at all? Programatically, there is almost no 282 // case for replacing the scheme. The most common case for hitting this is 283 // in JS when building up a URL using the location object. In this case, the 284 // JS code expects the string substitution behavior: 285 // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3 286 if (replacements.IsSchemeOverridden()) { 287 // Canonicalize the new scheme so it is 8-bit and can be concatenated with 288 // the existing spec. 289 RawCanonOutput<128> scheme_replaced; 290 Component scheme_replaced_parsed; 291 CanonicalizeScheme(replacements.sources().scheme, 292 replacements.components().scheme, 293 &scheme_replaced, &scheme_replaced_parsed); 294 295 // We can assume that the input is canonicalized, which means it always has 296 // a colon after the scheme (or where the scheme would be). 297 int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1 298 : 1; 299 if (spec_len - spec_after_colon > 0) { 300 scheme_replaced.Append(&spec[spec_after_colon], 301 spec_len - spec_after_colon); 302 } 303 304 // We now need to completely re-parse the resulting string since its meaning 305 // may have changed with the different scheme. 306 RawCanonOutput<128> recanonicalized; 307 Parsed recanonicalized_parsed; 308 DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true, 309 charset_converter, 310 &recanonicalized, &recanonicalized_parsed); 311 312 // Recurse using the version with the scheme already replaced. This will now 313 // use the replacement rules for the new scheme. 314 // 315 // Warning: this code assumes that ReplaceComponents will re-check all 316 // components for validity. This is because we can't fail if DoCanonicalize 317 // failed above since theoretically the thing making it fail could be 318 // getting replaced here. If ReplaceComponents didn't re-check everything, 319 // we wouldn't know if something *not* getting replaced is a problem. 320 // If the scheme-specific replacers are made more intelligent so they don't 321 // re-check everything, we should instead recanonicalize the whole thing 322 // after this call to check validity (this assumes replacing the scheme is 323 // much much less common than other types of replacements, like clearing the 324 // ref). 325 Replacements<CHAR> replacements_no_scheme = replacements; 326 replacements_no_scheme.SetScheme(NULL, Component()); 327 return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(), 328 recanonicalized_parsed, replacements_no_scheme, 329 charset_converter, output, out_parsed); 330 } 331 332 // If we get here, then we know the scheme doesn't need to be replaced, so can 333 // just key off the scheme in the spec to know how to do the replacements. 334 if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) { 335 return ReplaceFileURL(spec, parsed, replacements, charset_converter, output, 336 out_parsed); 337 } 338 if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) { 339 return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter, 340 output, out_parsed); 341 } 342 if (DoIsStandard(spec, parsed.scheme)) { 343 return ReplaceStandardURL(spec, parsed, replacements, charset_converter, 344 output, out_parsed); 345 } 346 if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) { 347 return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed); 348 } 349 350 // Default is a path URL. 351 return ReplacePathURL(spec, parsed, replacements, output, out_parsed); 352 } 353 354 } // namespace 355 356 void Initialize() { 357 InitStandardSchemes(); 358 } 359 360 void Shutdown() { 361 if (standard_schemes) { 362 delete standard_schemes; 363 standard_schemes = NULL; 364 } 365 } 366 367 void AddStandardScheme(const char* new_scheme) { 368 // If this assert triggers, it means you've called AddStandardScheme after 369 // LockStandardSchemes have been called (see the header file for 370 // LockStandardSchemes for more). 371 // 372 // This normally means you're trying to set up a new standard scheme too late 373 // in your application's init process. Locate where your app does this 374 // initialization and calls LockStandardScheme, and add your new standard 375 // scheme there. 376 DCHECK(!standard_schemes_locked) << 377 "Trying to add a standard scheme after the list has been locked."; 378 379 size_t scheme_len = strlen(new_scheme); 380 if (scheme_len == 0) 381 return; 382 383 // Dulicate the scheme into a new buffer and add it to the list of standard 384 // schemes. This pointer will be leaked on shutdown. 385 char* dup_scheme = new char[scheme_len + 1]; 386 memcpy(dup_scheme, new_scheme, scheme_len + 1); 387 388 InitStandardSchemes(); 389 standard_schemes->push_back(dup_scheme); 390 } 391 392 void LockStandardSchemes() { 393 standard_schemes_locked = true; 394 } 395 396 bool IsStandard(const char* spec, const Component& scheme) { 397 return DoIsStandard(spec, scheme); 398 } 399 400 bool IsStandard(const base::char16* spec, const Component& scheme) { 401 return DoIsStandard(spec, scheme); 402 } 403 404 bool FindAndCompareScheme(const char* str, 405 int str_len, 406 const char* compare, 407 Component* found_scheme) { 408 return DoFindAndCompareScheme(str, str_len, compare, found_scheme); 409 } 410 411 bool FindAndCompareScheme(const base::char16* str, 412 int str_len, 413 const char* compare, 414 Component* found_scheme) { 415 return DoFindAndCompareScheme(str, str_len, compare, found_scheme); 416 } 417 418 bool Canonicalize(const char* spec, 419 int spec_len, 420 bool trim_path_end, 421 CharsetConverter* charset_converter, 422 CanonOutput* output, 423 Parsed* output_parsed) { 424 return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter, 425 output, output_parsed); 426 } 427 428 bool Canonicalize(const base::char16* spec, 429 int spec_len, 430 bool trim_path_end, 431 CharsetConverter* charset_converter, 432 CanonOutput* output, 433 Parsed* output_parsed) { 434 return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter, 435 output, output_parsed); 436 } 437 438 bool ResolveRelative(const char* base_spec, 439 int base_spec_len, 440 const Parsed& base_parsed, 441 const char* relative, 442 int relative_length, 443 CharsetConverter* charset_converter, 444 CanonOutput* output, 445 Parsed* output_parsed) { 446 return DoResolveRelative(base_spec, base_spec_len, base_parsed, 447 relative, relative_length, 448 charset_converter, output, output_parsed); 449 } 450 451 bool ResolveRelative(const char* base_spec, 452 int base_spec_len, 453 const Parsed& base_parsed, 454 const base::char16* relative, 455 int relative_length, 456 CharsetConverter* charset_converter, 457 CanonOutput* output, 458 Parsed* output_parsed) { 459 return DoResolveRelative(base_spec, base_spec_len, base_parsed, 460 relative, relative_length, 461 charset_converter, output, output_parsed); 462 } 463 464 bool ReplaceComponents(const char* spec, 465 int spec_len, 466 const Parsed& parsed, 467 const Replacements<char>& replacements, 468 CharsetConverter* charset_converter, 469 CanonOutput* output, 470 Parsed* out_parsed) { 471 return DoReplaceComponents(spec, spec_len, parsed, replacements, 472 charset_converter, output, out_parsed); 473 } 474 475 bool ReplaceComponents(const char* spec, 476 int spec_len, 477 const Parsed& parsed, 478 const Replacements<base::char16>& replacements, 479 CharsetConverter* charset_converter, 480 CanonOutput* output, 481 Parsed* out_parsed) { 482 return DoReplaceComponents(spec, spec_len, parsed, replacements, 483 charset_converter, output, out_parsed); 484 } 485 486 // Front-ends for LowerCaseEqualsASCII. 487 bool LowerCaseEqualsASCII(const char* a_begin, 488 const char* a_end, 489 const char* b) { 490 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 491 } 492 493 bool LowerCaseEqualsASCII(const char* a_begin, 494 const char* a_end, 495 const char* b_begin, 496 const char* b_end) { 497 while (a_begin != a_end && b_begin != b_end && 498 ToLowerASCII(*a_begin) == *b_begin) { 499 a_begin++; 500 b_begin++; 501 } 502 return a_begin == a_end && b_begin == b_end; 503 } 504 505 bool LowerCaseEqualsASCII(const base::char16* a_begin, 506 const base::char16* a_end, 507 const char* b) { 508 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 509 } 510 511 void DecodeURLEscapeSequences(const char* input, 512 int length, 513 CanonOutputW* output) { 514 RawCanonOutputT<char> unescaped_chars; 515 for (int i = 0; i < length; i++) { 516 if (input[i] == '%') { 517 unsigned char ch; 518 if (DecodeEscaped(input, &i, length, &ch)) { 519 unescaped_chars.push_back(ch); 520 } else { 521 // Invalid escape sequence, copy the percent literal. 522 unescaped_chars.push_back('%'); 523 } 524 } else { 525 // Regular non-escaped 8-bit character. 526 unescaped_chars.push_back(input[i]); 527 } 528 } 529 530 // Convert that 8-bit to UTF-16. It's not clear IE does this at all to 531 // JavaScript URLs, but Firefox and Safari do. 532 for (int i = 0; i < unescaped_chars.length(); i++) { 533 unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i)); 534 if (uch < 0x80) { 535 // Non-UTF-8, just append directly 536 output->push_back(uch); 537 } else { 538 // next_ch will point to the last character of the decoded 539 // character. 540 int next_character = i; 541 unsigned code_point; 542 if (ReadUTFChar(unescaped_chars.data(), &next_character, 543 unescaped_chars.length(), &code_point)) { 544 // Valid UTF-8 character, convert to UTF-16. 545 AppendUTF16Value(code_point, output); 546 i = next_character; 547 } else { 548 // If there are any sequences that are not valid UTF-8, we keep 549 // invalid code points and promote to UTF-16. We copy all characters 550 // from the current position to the end of the identified sequence. 551 while (i < next_character) { 552 output->push_back(static_cast<unsigned char>(unescaped_chars.at(i))); 553 i++; 554 } 555 output->push_back(static_cast<unsigned char>(unescaped_chars.at(i))); 556 } 557 } 558 } 559 } 560 561 void EncodeURIComponent(const char* input, int length, CanonOutput* output) { 562 for (int i = 0; i < length; ++i) { 563 unsigned char c = static_cast<unsigned char>(input[i]); 564 if (IsComponentChar(c)) 565 output->push_back(c); 566 else 567 AppendEscapedChar(c, output); 568 } 569 } 570 571 bool CompareSchemeComponent(const char* spec, 572 const Component& component, 573 const char* compare_to) { 574 return DoCompareSchemeComponent(spec, component, compare_to); 575 } 576 577 bool CompareSchemeComponent(const base::char16* spec, 578 const Component& component, 579 const char* compare_to) { 580 return DoCompareSchemeComponent(spec, component, compare_to); 581 } 582 583 } // namespace url 584