1 /* Based on nsURLParsers.cc from Mozilla 2 * ------------------------------------- 3 * The contents of this file are subject to the Mozilla Public License Version 4 * 1.1 (the "License"); you may not use this file except in compliance with 5 * the License. You may obtain a copy of the License at 6 * http://www.mozilla.org/MPL/ 7 * 8 * Software distributed under the License is distributed on an "AS IS" basis, 9 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 * for the specific language governing rights and limitations under the 11 * License. 12 * 13 * The Original Code is mozilla.org code. 14 * 15 * The Initial Developer of the Original Code is 16 * Netscape Communications Corporation. 17 * Portions created by the Initial Developer are Copyright (C) 1998 18 * the Initial Developer. All Rights Reserved. 19 * 20 * Contributor(s): 21 * Darin Fisher (original author) 22 * 23 * Alternatively, the contents of this file may be used under the terms of 24 * either the GNU General Public License Version 2 or later (the "GPL"), or 25 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 * in which case the provisions of the GPL or the LGPL are applicable instead 27 * of those above. If you wish to allow use of your version of this file only 28 * under the terms of either the GPL or the LGPL, and not to allow others to 29 * use your version of this file under the terms of the MPL, indicate your 30 * decision by deleting the provisions above and replace them with the notice 31 * and other provisions required by the GPL or the LGPL. If you do not delete 32 * the provisions above, a recipient may use your version of this file under 33 * the terms of any one of the MPL, the GPL or the LGPL. 34 * 35 * ***** END LICENSE BLOCK ***** */ 36 37 #include "url/third_party/mozilla/url_parse.h" 38 39 #include <stdlib.h> 40 41 #include "base/logging.h" 42 #include "url/url_parse_internal.h" 43 #include "url/url_util.h" 44 #include "url/url_util_internal.h" 45 46 namespace url { 47 48 namespace { 49 50 // Returns true if the given character is a valid digit to use in a port. 51 inline bool IsPortDigit(base::char16 ch) { 52 return ch >= '0' && ch <= '9'; 53 } 54 55 // Returns the offset of the next authority terminator in the input starting 56 // from start_offset. If no terminator is found, the return value will be equal 57 // to spec_len. 58 template<typename CHAR> 59 int FindNextAuthorityTerminator(const CHAR* spec, 60 int start_offset, 61 int spec_len) { 62 for (int i = start_offset; i < spec_len; i++) { 63 if (IsAuthorityTerminator(spec[i])) 64 return i; 65 } 66 return spec_len; // Not found. 67 } 68 69 template<typename CHAR> 70 void ParseUserInfo(const CHAR* spec, 71 const Component& user, 72 Component* username, 73 Component* password) { 74 // Find the first colon in the user section, which separates the username and 75 // password. 76 int colon_offset = 0; 77 while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') 78 colon_offset++; 79 80 if (colon_offset < user.len) { 81 // Found separator: <username>:<password> 82 *username = Component(user.begin, colon_offset); 83 *password = MakeRange(user.begin + colon_offset + 1, 84 user.begin + user.len); 85 } else { 86 // No separator, treat everything as the username 87 *username = user; 88 *password = Component(); 89 } 90 } 91 92 template<typename CHAR> 93 void ParseServerInfo(const CHAR* spec, 94 const Component& serverinfo, 95 Component* hostname, 96 Component* port_num) { 97 if (serverinfo.len == 0) { 98 // No server info, host name is empty. 99 hostname->reset(); 100 port_num->reset(); 101 return; 102 } 103 104 // If the host starts with a left-bracket, assume the entire host is an 105 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. 106 // This assumption will be overridden if we find a right-bracket. 107 // 108 // Our IPv6 address canonicalization code requires both brackets to exist, 109 // but the ability to locate an incomplete address can still be useful. 110 int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; 111 int colon = -1; 112 113 // Find the last right-bracket, and the last colon. 114 for (int i = serverinfo.begin; i < serverinfo.end(); i++) { 115 switch (spec[i]) { 116 case ']': 117 ipv6_terminator = i; 118 break; 119 case ':': 120 colon = i; 121 break; 122 } 123 } 124 125 if (colon > ipv6_terminator) { 126 // Found a port number: <hostname>:<port> 127 *hostname = MakeRange(serverinfo.begin, colon); 128 if (hostname->len == 0) 129 hostname->reset(); 130 *port_num = MakeRange(colon + 1, serverinfo.end()); 131 } else { 132 // No port: <hostname> 133 *hostname = serverinfo; 134 port_num->reset(); 135 } 136 } 137 138 // Given an already-identified auth section, breaks it into its consituent 139 // parts. The port number will be parsed and the resulting integer will be 140 // filled into the given *port variable, or -1 if there is no port number or it 141 // is invalid. 142 template<typename CHAR> 143 void DoParseAuthority(const CHAR* spec, 144 const Component& auth, 145 Component* username, 146 Component* password, 147 Component* hostname, 148 Component* port_num) { 149 DCHECK(auth.is_valid()) << "We should always get an authority"; 150 if (auth.len == 0) { 151 username->reset(); 152 password->reset(); 153 hostname->reset(); 154 port_num->reset(); 155 return; 156 } 157 158 // Search backwards for @, which is the separator between the user info and 159 // the server info. 160 int i = auth.begin + auth.len - 1; 161 while (i > auth.begin && spec[i] != '@') 162 i--; 163 164 if (spec[i] == '@') { 165 // Found user info: <user-info>@<server-info> 166 ParseUserInfo(spec, Component(auth.begin, i - auth.begin), 167 username, password); 168 ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), 169 hostname, port_num); 170 } else { 171 // No user info, everything is server info. 172 username->reset(); 173 password->reset(); 174 ParseServerInfo(spec, auth, hostname, port_num); 175 } 176 } 177 178 template<typename CHAR> 179 void ParsePath(const CHAR* spec, 180 const Component& path, 181 Component* filepath, 182 Component* query, 183 Component* ref) { 184 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> 185 186 // Special case when there is no path. 187 if (path.len == -1) { 188 filepath->reset(); 189 query->reset(); 190 ref->reset(); 191 return; 192 } 193 DCHECK(path.len > 0) << "We should never have 0 length paths"; 194 195 // Search for first occurrence of either ? or #. 196 int path_end = path.begin + path.len; 197 198 int query_separator = -1; // Index of the '?' 199 int ref_separator = -1; // Index of the '#' 200 for (int i = path.begin; i < path_end; i++) { 201 switch (spec[i]) { 202 case '?': 203 // Only match the query string if it precedes the reference fragment 204 // and when we haven't found one already. 205 if (ref_separator < 0 && query_separator < 0) 206 query_separator = i; 207 break; 208 case '#': 209 // Record the first # sign only. 210 if (ref_separator < 0) 211 ref_separator = i; 212 break; 213 } 214 } 215 216 // Markers pointing to the character after each of these corresponding 217 // components. The code below words from the end back to the beginning, 218 // and will update these indices as it finds components that exist. 219 int file_end, query_end; 220 221 // Ref fragment: from the # to the end of the path. 222 if (ref_separator >= 0) { 223 file_end = query_end = ref_separator; 224 *ref = MakeRange(ref_separator + 1, path_end); 225 } else { 226 file_end = query_end = path_end; 227 ref->reset(); 228 } 229 230 // Query fragment: everything from the ? to the next boundary (either the end 231 // of the path or the ref fragment). 232 if (query_separator >= 0) { 233 file_end = query_separator; 234 *query = MakeRange(query_separator + 1, query_end); 235 } else { 236 query->reset(); 237 } 238 239 // File path: treat an empty file path as no file path. 240 if (file_end != path.begin) 241 *filepath = MakeRange(path.begin, file_end); 242 else 243 filepath->reset(); 244 } 245 246 template<typename CHAR> 247 bool DoExtractScheme(const CHAR* url, 248 int url_len, 249 Component* scheme) { 250 // Skip leading whitespace and control characters. 251 int begin = 0; 252 while (begin < url_len && ShouldTrimFromURL(url[begin])) 253 begin++; 254 if (begin == url_len) 255 return false; // Input is empty or all whitespace. 256 257 // Find the first colon character. 258 for (int i = begin; i < url_len; i++) { 259 if (url[i] == ':') { 260 *scheme = MakeRange(begin, i); 261 return true; 262 } 263 } 264 return false; // No colon found: no scheme 265 } 266 267 // Fills in all members of the Parsed structure except for the scheme. 268 // 269 // |spec| is the full spec being parsed, of length |spec_len|. 270 // |after_scheme| is the character immediately following the scheme (after the 271 // colon) where we'll begin parsing. 272 // 273 // Compatability data points. I list "host", "path" extracted: 274 // Input IE6 Firefox Us 275 // ----- -------------- -------------- -------------- 276 // http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 277 // http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 278 // http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" 279 // http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" 280 // http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 281 // 282 // (*) Interestingly, although IE fails to load these URLs, its history 283 // canonicalizer handles them, meaning if you've been to the corresponding 284 // "http://foo.com/" link, it will be colored. 285 template <typename CHAR> 286 void DoParseAfterScheme(const CHAR* spec, 287 int spec_len, 288 int after_scheme, 289 Parsed* parsed) { 290 int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); 291 int after_slashes = after_scheme + num_slashes; 292 293 // First split into two main parts, the authority (username, password, host, 294 // and port) and the full path (path, query, and reference). 295 Component authority; 296 Component full_path; 297 298 // Found "//<some data>", looks like an authority section. Treat everything 299 // from there to the next slash (or end of spec) to be the authority. Note 300 // that we ignore the number of slashes and treat it as the authority. 301 int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); 302 authority = Component(after_slashes, end_auth - after_slashes); 303 304 if (end_auth == spec_len) // No beginning of path found. 305 full_path = Component(); 306 else // Everything starting from the slash to the end is the path. 307 full_path = Component(end_auth, spec_len - end_auth); 308 309 // Now parse those two sub-parts. 310 DoParseAuthority(spec, authority, &parsed->username, &parsed->password, 311 &parsed->host, &parsed->port); 312 ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); 313 } 314 315 // The main parsing function for standard URLs. Standard URLs have a scheme, 316 // host, path, etc. 317 template<typename CHAR> 318 void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { 319 DCHECK(spec_len >= 0); 320 321 // Strip leading & trailing spaces and control characters. 322 int begin = 0; 323 TrimURL(spec, &begin, &spec_len); 324 325 int after_scheme; 326 if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { 327 after_scheme = parsed->scheme.end() + 1; // Skip past the colon. 328 } else { 329 // Say there's no scheme when there is no colon. We could also say that 330 // everything is the scheme. Both would produce an invalid URL, but this way 331 // seems less wrong in more cases. 332 parsed->scheme.reset(); 333 after_scheme = begin; 334 } 335 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 336 } 337 338 template<typename CHAR> 339 void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { 340 DCHECK(spec_len >= 0); 341 342 // Get the unused parts of the URL out of the way. 343 parsed->username.reset(); 344 parsed->password.reset(); 345 parsed->host.reset(); 346 parsed->port.reset(); 347 parsed->path.reset(); // May use this; reset for convenience. 348 parsed->ref.reset(); // May use this; reset for convenience. 349 parsed->query.reset(); // May use this; reset for convenience. 350 parsed->clear_inner_parsed(); // May use this; reset for convenience. 351 352 // Strip leading & trailing spaces and control characters. 353 int begin = 0; 354 TrimURL(spec, &begin, &spec_len); 355 356 // Handle empty specs or ones that contain only whitespace or control chars. 357 if (begin == spec_len) { 358 parsed->scheme.reset(); 359 return; 360 } 361 362 int inner_start = -1; 363 364 // Extract the scheme. We also handle the case where there is no scheme. 365 if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 366 // Offset the results since we gave ExtractScheme a substring. 367 parsed->scheme.begin += begin; 368 369 if (parsed->scheme.end() == spec_len - 1) 370 return; 371 372 inner_start = parsed->scheme.end() + 1; 373 } else { 374 // No scheme found; that's not valid for filesystem URLs. 375 parsed->scheme.reset(); 376 return; 377 } 378 379 Component inner_scheme; 380 const CHAR* inner_spec = &spec[inner_start]; 381 int inner_spec_len = spec_len - inner_start; 382 383 if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { 384 // Offset the results since we gave ExtractScheme a substring. 385 inner_scheme.begin += inner_start; 386 387 if (inner_scheme.end() == spec_len - 1) 388 return; 389 } else { 390 // No scheme found; that's not valid for filesystem URLs. 391 // The best we can do is return "filesystem://". 392 return; 393 } 394 395 Parsed inner_parsed; 396 397 if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) { 398 // File URLs are special. 399 ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); 400 } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) { 401 // Filesystem URLs don't nest. 402 return; 403 } else if (IsStandard(spec, inner_scheme)) { 404 // All "normal" URLs. 405 DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); 406 } else { 407 return; 408 } 409 410 // All members of inner_parsed need to be offset by inner_start. 411 // If we had any scheme that supported nesting more than one level deep, 412 // we'd have to recurse into the inner_parsed's inner_parsed when 413 // adjusting by inner_start. 414 inner_parsed.scheme.begin += inner_start; 415 inner_parsed.username.begin += inner_start; 416 inner_parsed.password.begin += inner_start; 417 inner_parsed.host.begin += inner_start; 418 inner_parsed.port.begin += inner_start; 419 inner_parsed.query.begin += inner_start; 420 inner_parsed.ref.begin += inner_start; 421 inner_parsed.path.begin += inner_start; 422 423 // Query and ref move from inner_parsed to parsed. 424 parsed->query = inner_parsed.query; 425 inner_parsed.query.reset(); 426 parsed->ref = inner_parsed.ref; 427 inner_parsed.ref.reset(); 428 429 parsed->set_inner_parsed(inner_parsed); 430 if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || 431 inner_parsed.inner_parsed()) { 432 return; 433 } 434 435 // The path in inner_parsed should start with a slash, then have a filesystem 436 // type followed by a slash. From the first slash up to but excluding the 437 // second should be what it keeps; the rest goes to parsed. If the path ends 438 // before the second slash, it's still pretty clear what the user meant, so 439 // we'll let that through. 440 if (!IsURLSlash(spec[inner_parsed.path.begin])) { 441 return; 442 } 443 int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash 444 while (inner_path_end < spec_len && 445 !IsURLSlash(spec[inner_path_end])) 446 ++inner_path_end; 447 parsed->path.begin = inner_path_end; 448 int new_inner_path_length = inner_path_end - inner_parsed.path.begin; 449 parsed->path.len = inner_parsed.path.len - new_inner_path_length; 450 parsed->inner_parsed()->path.len = new_inner_path_length; 451 } 452 453 // Initializes a path URL which is merely a scheme followed by a path. Examples 454 // include "about:foo" and "javascript:alert('bar');" 455 template<typename CHAR> 456 void DoParsePathURL(const CHAR* spec, int spec_len, 457 bool trim_path_end, 458 Parsed* parsed) { 459 // Get the non-path and non-scheme parts of the URL out of the way, we never 460 // use them. 461 parsed->username.reset(); 462 parsed->password.reset(); 463 parsed->host.reset(); 464 parsed->port.reset(); 465 parsed->path.reset(); 466 parsed->query.reset(); 467 parsed->ref.reset(); 468 469 // Strip leading & trailing spaces and control characters. 470 int scheme_begin = 0; 471 TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); 472 473 // Handle empty specs or ones that contain only whitespace or control chars. 474 if (scheme_begin == spec_len) { 475 parsed->scheme.reset(); 476 parsed->path.reset(); 477 return; 478 } 479 480 int path_begin; 481 // Extract the scheme, with the path being everything following. We also 482 // handle the case where there is no scheme. 483 if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, 484 &parsed->scheme)) { 485 // Offset the results since we gave ExtractScheme a substring. 486 parsed->scheme.begin += scheme_begin; 487 path_begin = parsed->scheme.end() + 1; 488 } else { 489 // No scheme case. 490 parsed->scheme.reset(); 491 path_begin = scheme_begin; 492 } 493 494 if (path_begin == spec_len) 495 return; 496 DCHECK_LT(path_begin, spec_len); 497 498 ParsePath(spec, 499 MakeRange(path_begin, spec_len), 500 &parsed->path, 501 &parsed->query, 502 &parsed->ref); 503 } 504 505 template<typename CHAR> 506 void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { 507 DCHECK(spec_len >= 0); 508 509 // Get the non-path and non-scheme parts of the URL out of the way, we never 510 // use them. 511 parsed->username.reset(); 512 parsed->password.reset(); 513 parsed->host.reset(); 514 parsed->port.reset(); 515 parsed->ref.reset(); 516 parsed->query.reset(); // May use this; reset for convenience. 517 518 // Strip leading & trailing spaces and control characters. 519 int begin = 0; 520 TrimURL(spec, &begin, &spec_len); 521 522 // Handle empty specs or ones that contain only whitespace or control chars. 523 if (begin == spec_len) { 524 parsed->scheme.reset(); 525 parsed->path.reset(); 526 return; 527 } 528 529 int path_begin = -1; 530 int path_end = -1; 531 532 // Extract the scheme, with the path being everything following. We also 533 // handle the case where there is no scheme. 534 if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 535 // Offset the results since we gave ExtractScheme a substring. 536 parsed->scheme.begin += begin; 537 538 if (parsed->scheme.end() != spec_len - 1) { 539 path_begin = parsed->scheme.end() + 1; 540 path_end = spec_len; 541 } 542 } else { 543 // No scheme found, just path. 544 parsed->scheme.reset(); 545 path_begin = begin; 546 path_end = spec_len; 547 } 548 549 // Split [path_begin, path_end) into a path + query. 550 for (int i = path_begin; i < path_end; ++i) { 551 if (spec[i] == '?') { 552 parsed->query = MakeRange(i + 1, path_end); 553 path_end = i; 554 break; 555 } 556 } 557 558 // For compatability with the standard URL parser, treat no path as 559 // -1, rather than having a length of 0 560 if (path_begin == path_end) { 561 parsed->path.reset(); 562 } else { 563 parsed->path = MakeRange(path_begin, path_end); 564 } 565 } 566 567 // Converts a port number in a string to an integer. We'd like to just call 568 // sscanf but our input is not NULL-terminated, which sscanf requires. Instead, 569 // we copy the digits to a small stack buffer (since we know the maximum number 570 // of digits in a valid port number) that we can NULL terminate. 571 template<typename CHAR> 572 int DoParsePort(const CHAR* spec, const Component& component) { 573 // Easy success case when there is no port. 574 const int kMaxDigits = 5; 575 if (!component.is_nonempty()) 576 return PORT_UNSPECIFIED; 577 578 // Skip over any leading 0s. 579 Component digits_comp(component.end(), 0); 580 for (int i = 0; i < component.len; i++) { 581 if (spec[component.begin + i] != '0') { 582 digits_comp = MakeRange(component.begin + i, component.end()); 583 break; 584 } 585 } 586 if (digits_comp.len == 0) 587 return 0; // All digits were 0. 588 589 // Verify we don't have too many digits (we'll be copying to our buffer so 590 // we need to double-check). 591 if (digits_comp.len > kMaxDigits) 592 return PORT_INVALID; 593 594 // Copy valid digits to the buffer. 595 char digits[kMaxDigits + 1]; // +1 for null terminator 596 for (int i = 0; i < digits_comp.len; i++) { 597 CHAR ch = spec[digits_comp.begin + i]; 598 if (!IsPortDigit(ch)) { 599 // Invalid port digit, fail. 600 return PORT_INVALID; 601 } 602 digits[i] = static_cast<char>(ch); 603 } 604 605 // Null-terminate the string and convert to integer. Since we guarantee 606 // only digits, atoi's lack of error handling is OK. 607 digits[digits_comp.len] = 0; 608 int port = atoi(digits); 609 if (port > 65535) 610 return PORT_INVALID; // Out of range. 611 return port; 612 } 613 614 template<typename CHAR> 615 void DoExtractFileName(const CHAR* spec, 616 const Component& path, 617 Component* file_name) { 618 // Handle empty paths: they have no file names. 619 if (!path.is_nonempty()) { 620 file_name->reset(); 621 return; 622 } 623 624 // Search backwards for a parameter, which is a normally unused field in a 625 // URL delimited by a semicolon. We parse the parameter as part of the 626 // path, but here, we don't want to count it. The last semicolon is the 627 // parameter. The path should start with a slash, so we don't need to check 628 // the first one. 629 int file_end = path.end(); 630 for (int i = path.end() - 1; i > path.begin; i--) { 631 if (spec[i] == ';') { 632 file_end = i; 633 break; 634 } 635 } 636 637 // Now search backwards from the filename end to the previous slash 638 // to find the beginning of the filename. 639 for (int i = file_end - 1; i >= path.begin; i--) { 640 if (IsURLSlash(spec[i])) { 641 // File name is everything following this character to the end 642 *file_name = MakeRange(i + 1, file_end); 643 return; 644 } 645 } 646 647 // No slash found, this means the input was degenerate (generally paths 648 // will start with a slash). Let's call everything the file name. 649 *file_name = MakeRange(path.begin, file_end); 650 return; 651 } 652 653 template<typename CHAR> 654 bool DoExtractQueryKeyValue(const CHAR* spec, 655 Component* query, 656 Component* key, 657 Component* value) { 658 if (!query->is_nonempty()) 659 return false; 660 661 int start = query->begin; 662 int cur = start; 663 int end = query->end(); 664 665 // We assume the beginning of the input is the beginning of the "key" and we 666 // skip to the end of it. 667 key->begin = cur; 668 while (cur < end && spec[cur] != '&' && spec[cur] != '=') 669 cur++; 670 key->len = cur - key->begin; 671 672 // Skip the separator after the key (if any). 673 if (cur < end && spec[cur] == '=') 674 cur++; 675 676 // Find the value part. 677 value->begin = cur; 678 while (cur < end && spec[cur] != '&') 679 cur++; 680 value->len = cur - value->begin; 681 682 // Finally skip the next separator if any 683 if (cur < end && spec[cur] == '&') 684 cur++; 685 686 // Save the new query 687 *query = MakeRange(cur, end); 688 return true; 689 } 690 691 } // namespace 692 693 Parsed::Parsed() : inner_parsed_(NULL) { 694 } 695 696 Parsed::Parsed(const Parsed& other) : 697 scheme(other.scheme), 698 username(other.username), 699 password(other.password), 700 host(other.host), 701 port(other.port), 702 path(other.path), 703 query(other.query), 704 ref(other.ref), 705 inner_parsed_(NULL) { 706 if (other.inner_parsed_) 707 set_inner_parsed(*other.inner_parsed_); 708 } 709 710 Parsed& Parsed::operator=(const Parsed& other) { 711 if (this != &other) { 712 scheme = other.scheme; 713 username = other.username; 714 password = other.password; 715 host = other.host; 716 port = other.port; 717 path = other.path; 718 query = other.query; 719 ref = other.ref; 720 if (other.inner_parsed_) 721 set_inner_parsed(*other.inner_parsed_); 722 else 723 clear_inner_parsed(); 724 } 725 return *this; 726 } 727 728 Parsed::~Parsed() { 729 delete inner_parsed_; 730 } 731 732 int Parsed::Length() const { 733 if (ref.is_valid()) 734 return ref.end(); 735 return CountCharactersBefore(REF, false); 736 } 737 738 int Parsed::CountCharactersBefore(ComponentType type, 739 bool include_delimiter) const { 740 if (type == SCHEME) 741 return scheme.begin; 742 743 // There will be some characters after the scheme like "://" and we don't 744 // know how many. Search forwards for the next thing until we find one. 745 int cur = 0; 746 if (scheme.is_valid()) 747 cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. 748 749 if (username.is_valid()) { 750 if (type <= USERNAME) 751 return username.begin; 752 cur = username.end() + 1; // Advance over the '@' or ':' at the end. 753 } 754 755 if (password.is_valid()) { 756 if (type <= PASSWORD) 757 return password.begin; 758 cur = password.end() + 1; // Advance over the '@' at the end. 759 } 760 761 if (host.is_valid()) { 762 if (type <= HOST) 763 return host.begin; 764 cur = host.end(); 765 } 766 767 if (port.is_valid()) { 768 if (type < PORT || (type == PORT && include_delimiter)) 769 return port.begin - 1; // Back over delimiter. 770 if (type == PORT) 771 return port.begin; // Don't want delimiter counted. 772 cur = port.end(); 773 } 774 775 if (path.is_valid()) { 776 if (type <= PATH) 777 return path.begin; 778 cur = path.end(); 779 } 780 781 if (query.is_valid()) { 782 if (type < QUERY || (type == QUERY && include_delimiter)) 783 return query.begin - 1; // Back over delimiter. 784 if (type == QUERY) 785 return query.begin; // Don't want delimiter counted. 786 cur = query.end(); 787 } 788 789 if (ref.is_valid()) { 790 if (type == REF && !include_delimiter) 791 return ref.begin; // Back over delimiter. 792 793 // When there is a ref and we get here, the component we wanted was before 794 // this and not found, so we always know the beginning of the ref is right. 795 return ref.begin - 1; // Don't want delimiter counted. 796 } 797 798 return cur; 799 } 800 801 Component Parsed::GetContent() const { 802 const int begin = CountCharactersBefore(USERNAME, false); 803 const int len = Length() - begin; 804 // For compatability with the standard URL parser, we treat no content as 805 // -1, rather than having a length of 0 (we normally wouldn't care so 806 // much for these non-standard URLs). 807 return len ? Component(begin, len) : Component(); 808 } 809 810 bool ExtractScheme(const char* url, int url_len, Component* scheme) { 811 return DoExtractScheme(url, url_len, scheme); 812 } 813 814 bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) { 815 return DoExtractScheme(url, url_len, scheme); 816 } 817 818 // This handles everything that may be an authority terminator, including 819 // backslash. For special backslash handling see DoParseAfterScheme. 820 bool IsAuthorityTerminator(base::char16 ch) { 821 return IsURLSlash(ch) || ch == '?' || ch == '#'; 822 } 823 824 void ExtractFileName(const char* url, 825 const Component& path, 826 Component* file_name) { 827 DoExtractFileName(url, path, file_name); 828 } 829 830 void ExtractFileName(const base::char16* url, 831 const Component& path, 832 Component* file_name) { 833 DoExtractFileName(url, path, file_name); 834 } 835 836 bool ExtractQueryKeyValue(const char* url, 837 Component* query, 838 Component* key, 839 Component* value) { 840 return DoExtractQueryKeyValue(url, query, key, value); 841 } 842 843 bool ExtractQueryKeyValue(const base::char16* url, 844 Component* query, 845 Component* key, 846 Component* value) { 847 return DoExtractQueryKeyValue(url, query, key, value); 848 } 849 850 void ParseAuthority(const char* spec, 851 const Component& auth, 852 Component* username, 853 Component* password, 854 Component* hostname, 855 Component* port_num) { 856 DoParseAuthority(spec, auth, username, password, hostname, port_num); 857 } 858 859 void ParseAuthority(const base::char16* spec, 860 const Component& auth, 861 Component* username, 862 Component* password, 863 Component* hostname, 864 Component* port_num) { 865 DoParseAuthority(spec, auth, username, password, hostname, port_num); 866 } 867 868 int ParsePort(const char* url, const Component& port) { 869 return DoParsePort(url, port); 870 } 871 872 int ParsePort(const base::char16* url, const Component& port) { 873 return DoParsePort(url, port); 874 } 875 876 void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { 877 DoParseStandardURL(url, url_len, parsed); 878 } 879 880 void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { 881 DoParseStandardURL(url, url_len, parsed); 882 } 883 884 void ParsePathURL(const char* url, 885 int url_len, 886 bool trim_path_end, 887 Parsed* parsed) { 888 DoParsePathURL(url, url_len, trim_path_end, parsed); 889 } 890 891 void ParsePathURL(const base::char16* url, 892 int url_len, 893 bool trim_path_end, 894 Parsed* parsed) { 895 DoParsePathURL(url, url_len, trim_path_end, parsed); 896 } 897 898 void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { 899 DoParseFileSystemURL(url, url_len, parsed); 900 } 901 902 void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) { 903 DoParseFileSystemURL(url, url_len, parsed); 904 } 905 906 void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { 907 DoParseMailtoURL(url, url_len, parsed); 908 } 909 910 void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) { 911 DoParseMailtoURL(url, url_len, parsed); 912 } 913 914 void ParsePathInternal(const char* spec, 915 const Component& path, 916 Component* filepath, 917 Component* query, 918 Component* ref) { 919 ParsePath(spec, path, filepath, query, ref); 920 } 921 922 void ParsePathInternal(const base::char16* spec, 923 const Component& path, 924 Component* filepath, 925 Component* query, 926 Component* ref) { 927 ParsePath(spec, path, filepath, query, ref); 928 } 929 930 void ParseAfterScheme(const char* spec, 931 int spec_len, 932 int after_scheme, 933 Parsed* parsed) { 934 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 935 } 936 937 void ParseAfterScheme(const base::char16* spec, 938 int spec_len, 939 int after_scheme, 940 Parsed* parsed) { 941 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 942 } 943 944 } // namespace url 945