1 /* Based on nsURLParsers.cc from Mozilla 2 * ------------------------------------- 3 * The contents of this file are subject to the Mozilla Public License Version 4 * 1.1 (the "License"); you may not use this file except in compliance with 5 * the License. You may obtain a copy of the License at 6 * http://www.mozilla.org/MPL/ 7 * 8 * Software distributed under the License is distributed on an "AS IS" basis, 9 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 * for the specific language governing rights and limitations under the 11 * License. 12 * 13 * The Original Code is mozilla.org code. 14 * 15 * The Initial Developer of the Original Code is 16 * Netscape Communications Corporation. 17 * Portions created by the Initial Developer are Copyright (C) 1998 18 * the Initial Developer. All Rights Reserved. 19 * 20 * Contributor(s): 21 * Darin Fisher (original author) 22 * 23 * Alternatively, the contents of this file may be used under the terms of 24 * either the GNU General Public License Version 2 or later (the "GPL"), or 25 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 * in which case the provisions of the GPL or the LGPL are applicable instead 27 * of those above. If you wish to allow use of your version of this file only 28 * under the terms of either the GPL or the LGPL, and not to allow others to 29 * use your version of this file under the terms of the MPL, indicate your 30 * decision by deleting the provisions above and replace them with the notice 31 * and other provisions required by the GPL or the LGPL. If you do not delete 32 * the provisions above, a recipient may use your version of this file under 33 * the terms of any one of the MPL, the GPL or the LGPL. 34 * 35 * ***** END LICENSE BLOCK ***** */ 36 37 #include "url/third_party/mozilla/url_parse.h" 38 39 #include <stdlib.h> 40 41 #include "base/logging.h" 42 #include "url/url_parse_internal.h" 43 #include "url/url_util.h" 44 #include "url/url_util_internal.h" 45 46 namespace url_parse { 47 48 namespace { 49 50 // Returns true if the given character is a valid digit to use in a port. 51 inline bool IsPortDigit(base::char16 ch) { 52 return ch >= '0' && ch <= '9'; 53 } 54 55 // Returns the offset of the next authority terminator in the input starting 56 // from start_offset. If no terminator is found, the return value will be equal 57 // to spec_len. 58 template<typename CHAR> 59 int FindNextAuthorityTerminator(const CHAR* spec, 60 int start_offset, 61 int spec_len) { 62 for (int i = start_offset; i < spec_len; i++) { 63 if (IsAuthorityTerminator(spec[i])) 64 return i; 65 } 66 return spec_len; // Not found. 67 } 68 69 template<typename CHAR> 70 void ParseUserInfo(const CHAR* spec, 71 const Component& user, 72 Component* username, 73 Component* password) { 74 // Find the first colon in the user section, which separates the username and 75 // password. 76 int colon_offset = 0; 77 while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') 78 colon_offset++; 79 80 if (colon_offset < user.len) { 81 // Found separator: <username>:<password> 82 *username = Component(user.begin, colon_offset); 83 *password = MakeRange(user.begin + colon_offset + 1, 84 user.begin + user.len); 85 } else { 86 // No separator, treat everything as the username 87 *username = user; 88 *password = Component(); 89 } 90 } 91 92 template<typename CHAR> 93 void ParseServerInfo(const CHAR* spec, 94 const Component& serverinfo, 95 Component* hostname, 96 Component* port_num) { 97 if (serverinfo.len == 0) { 98 // No server info, host name is empty. 99 hostname->reset(); 100 port_num->reset(); 101 return; 102 } 103 104 // If the host starts with a left-bracket, assume the entire host is an 105 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. 106 // This assumption will be overridden if we find a right-bracket. 107 // 108 // Our IPv6 address canonicalization code requires both brackets to exist, 109 // but the ability to locate an incomplete address can still be useful. 110 int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; 111 int colon = -1; 112 113 // Find the last right-bracket, and the last colon. 114 for (int i = serverinfo.begin; i < serverinfo.end(); i++) { 115 switch (spec[i]) { 116 case ']': 117 ipv6_terminator = i; 118 break; 119 case ':': 120 colon = i; 121 break; 122 } 123 } 124 125 if (colon > ipv6_terminator) { 126 // Found a port number: <hostname>:<port> 127 *hostname = MakeRange(serverinfo.begin, colon); 128 if (hostname->len == 0) 129 hostname->reset(); 130 *port_num = MakeRange(colon + 1, serverinfo.end()); 131 } else { 132 // No port: <hostname> 133 *hostname = serverinfo; 134 port_num->reset(); 135 } 136 } 137 138 // Given an already-identified auth section, breaks it into its consituent 139 // parts. The port number will be parsed and the resulting integer will be 140 // filled into the given *port variable, or -1 if there is no port number or it 141 // is invalid. 142 template<typename CHAR> 143 void DoParseAuthority(const CHAR* spec, 144 const Component& auth, 145 Component* username, 146 Component* password, 147 Component* hostname, 148 Component* port_num) { 149 DCHECK(auth.is_valid()) << "We should always get an authority"; 150 if (auth.len == 0) { 151 username->reset(); 152 password->reset(); 153 hostname->reset(); 154 port_num->reset(); 155 return; 156 } 157 158 // Search backwards for @, which is the separator between the user info and 159 // the server info. 160 int i = auth.begin + auth.len - 1; 161 while (i > auth.begin && spec[i] != '@') 162 i--; 163 164 if (spec[i] == '@') { 165 // Found user info: <user-info>@<server-info> 166 ParseUserInfo(spec, Component(auth.begin, i - auth.begin), 167 username, password); 168 ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), 169 hostname, port_num); 170 } else { 171 // No user info, everything is server info. 172 username->reset(); 173 password->reset(); 174 ParseServerInfo(spec, auth, hostname, port_num); 175 } 176 } 177 178 template<typename CHAR> 179 void ParsePath(const CHAR* spec, 180 const Component& path, 181 Component* filepath, 182 Component* query, 183 Component* ref) { 184 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> 185 186 // Special case when there is no path. 187 if (path.len == -1) { 188 filepath->reset(); 189 query->reset(); 190 ref->reset(); 191 return; 192 } 193 DCHECK(path.len > 0) << "We should never have 0 length paths"; 194 195 // Search for first occurrence of either ? or #. 196 int path_end = path.begin + path.len; 197 198 int query_separator = -1; // Index of the '?' 199 int ref_separator = -1; // Index of the '#' 200 for (int i = path.begin; i < path_end; i++) { 201 switch (spec[i]) { 202 case '?': 203 // Only match the query string if it precedes the reference fragment 204 // and when we haven't found one already. 205 if (ref_separator < 0 && query_separator < 0) 206 query_separator = i; 207 break; 208 case '#': 209 // Record the first # sign only. 210 if (ref_separator < 0) 211 ref_separator = i; 212 break; 213 } 214 } 215 216 // Markers pointing to the character after each of these corresponding 217 // components. The code below words from the end back to the beginning, 218 // and will update these indices as it finds components that exist. 219 int file_end, query_end; 220 221 // Ref fragment: from the # to the end of the path. 222 if (ref_separator >= 0) { 223 file_end = query_end = ref_separator; 224 *ref = MakeRange(ref_separator + 1, path_end); 225 } else { 226 file_end = query_end = path_end; 227 ref->reset(); 228 } 229 230 // Query fragment: everything from the ? to the next boundary (either the end 231 // of the path or the ref fragment). 232 if (query_separator >= 0) { 233 file_end = query_separator; 234 *query = MakeRange(query_separator + 1, query_end); 235 } else { 236 query->reset(); 237 } 238 239 // File path: treat an empty file path as no file path. 240 if (file_end != path.begin) 241 *filepath = MakeRange(path.begin, file_end); 242 else 243 filepath->reset(); 244 } 245 246 template<typename CHAR> 247 bool DoExtractScheme(const CHAR* url, 248 int url_len, 249 Component* scheme) { 250 // Skip leading whitespace and control characters. 251 int begin = 0; 252 while (begin < url_len && ShouldTrimFromURL(url[begin])) 253 begin++; 254 if (begin == url_len) 255 return false; // Input is empty or all whitespace. 256 257 // Find the first colon character. 258 for (int i = begin; i < url_len; i++) { 259 if (url[i] == ':') { 260 *scheme = MakeRange(begin, i); 261 return true; 262 } 263 } 264 return false; // No colon found: no scheme 265 } 266 267 // Fills in all members of the Parsed structure except for the scheme. 268 // 269 // |spec| is the full spec being parsed, of length |spec_len|. 270 // |after_scheme| is the character immediately following the scheme (after the 271 // colon) where we'll begin parsing. 272 // 273 // Compatability data points. I list "host", "path" extracted: 274 // Input IE6 Firefox Us 275 // ----- -------------- -------------- -------------- 276 // http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 277 // http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 278 // http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" 279 // http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" 280 // http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 281 // 282 // (*) Interestingly, although IE fails to load these URLs, its history 283 // canonicalizer handles them, meaning if you've been to the corresponding 284 // "http://foo.com/" link, it will be colored. 285 template <typename CHAR> 286 void DoParseAfterScheme(const CHAR* spec, 287 int spec_len, 288 int after_scheme, 289 Parsed* parsed) { 290 int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); 291 int after_slashes = after_scheme + num_slashes; 292 293 // First split into two main parts, the authority (username, password, host, 294 // and port) and the full path (path, query, and reference). 295 Component authority; 296 Component full_path; 297 298 // Found "//<some data>", looks like an authority section. Treat everything 299 // from there to the next slash (or end of spec) to be the authority. Note 300 // that we ignore the number of slashes and treat it as the authority. 301 int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); 302 authority = Component(after_slashes, end_auth - after_slashes); 303 304 if (end_auth == spec_len) // No beginning of path found. 305 full_path = Component(); 306 else // Everything starting from the slash to the end is the path. 307 full_path = Component(end_auth, spec_len - end_auth); 308 309 // Now parse those two sub-parts. 310 DoParseAuthority(spec, authority, &parsed->username, &parsed->password, 311 &parsed->host, &parsed->port); 312 ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); 313 } 314 315 // The main parsing function for standard URLs. Standard URLs have a scheme, 316 // host, path, etc. 317 template<typename CHAR> 318 void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { 319 DCHECK(spec_len >= 0); 320 321 // Strip leading & trailing spaces and control characters. 322 int begin = 0; 323 TrimURL(spec, &begin, &spec_len); 324 325 int after_scheme; 326 if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { 327 after_scheme = parsed->scheme.end() + 1; // Skip past the colon. 328 } else { 329 // Say there's no scheme when there is no colon. We could also say that 330 // everything is the scheme. Both would produce an invalid URL, but this way 331 // seems less wrong in more cases. 332 parsed->scheme.reset(); 333 after_scheme = begin; 334 } 335 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 336 } 337 338 template<typename CHAR> 339 void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { 340 DCHECK(spec_len >= 0); 341 342 // Get the unused parts of the URL out of the way. 343 parsed->username.reset(); 344 parsed->password.reset(); 345 parsed->host.reset(); 346 parsed->port.reset(); 347 parsed->path.reset(); // May use this; reset for convenience. 348 parsed->ref.reset(); // May use this; reset for convenience. 349 parsed->query.reset(); // May use this; reset for convenience. 350 parsed->clear_inner_parsed(); // May use this; reset for convenience. 351 352 // Strip leading & trailing spaces and control characters. 353 int begin = 0; 354 TrimURL(spec, &begin, &spec_len); 355 356 // Handle empty specs or ones that contain only whitespace or control chars. 357 if (begin == spec_len) { 358 parsed->scheme.reset(); 359 return; 360 } 361 362 int inner_start = -1; 363 364 // Extract the scheme. We also handle the case where there is no scheme. 365 if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 366 // Offset the results since we gave ExtractScheme a substring. 367 parsed->scheme.begin += begin; 368 369 if (parsed->scheme.end() == spec_len - 1) 370 return; 371 372 inner_start = parsed->scheme.end() + 1; 373 } else { 374 // No scheme found; that's not valid for filesystem URLs. 375 parsed->scheme.reset(); 376 return; 377 } 378 379 url_parse::Component inner_scheme; 380 const CHAR* inner_spec = &spec[inner_start]; 381 int inner_spec_len = spec_len - inner_start; 382 383 if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { 384 // Offset the results since we gave ExtractScheme a substring. 385 inner_scheme.begin += inner_start; 386 387 if (inner_scheme.end() == spec_len - 1) 388 return; 389 } else { 390 // No scheme found; that's not valid for filesystem URLs. 391 // The best we can do is return "filesystem://". 392 return; 393 } 394 395 Parsed inner_parsed; 396 397 if (url_util::CompareSchemeComponent( 398 spec, inner_scheme, url_util::kFileScheme)) { 399 // File URLs are special. 400 ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); 401 } else if (url_util::CompareSchemeComponent(spec, inner_scheme, 402 url_util::kFileSystemScheme)) { 403 // Filesystem URLs don't nest. 404 return; 405 } else if (url_util::IsStandard(spec, inner_scheme)) { 406 // All "normal" URLs. 407 DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); 408 } else { 409 return; 410 } 411 412 // All members of inner_parsed need to be offset by inner_start. 413 // If we had any scheme that supported nesting more than one level deep, 414 // we'd have to recurse into the inner_parsed's inner_parsed when 415 // adjusting by inner_start. 416 inner_parsed.scheme.begin += inner_start; 417 inner_parsed.username.begin += inner_start; 418 inner_parsed.password.begin += inner_start; 419 inner_parsed.host.begin += inner_start; 420 inner_parsed.port.begin += inner_start; 421 inner_parsed.query.begin += inner_start; 422 inner_parsed.ref.begin += inner_start; 423 inner_parsed.path.begin += inner_start; 424 425 // Query and ref move from inner_parsed to parsed. 426 parsed->query = inner_parsed.query; 427 inner_parsed.query.reset(); 428 parsed->ref = inner_parsed.ref; 429 inner_parsed.ref.reset(); 430 431 parsed->set_inner_parsed(inner_parsed); 432 if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || 433 inner_parsed.inner_parsed()) { 434 return; 435 } 436 437 // The path in inner_parsed should start with a slash, then have a filesystem 438 // type followed by a slash. From the first slash up to but excluding the 439 // second should be what it keeps; the rest goes to parsed. If the path ends 440 // before the second slash, it's still pretty clear what the user meant, so 441 // we'll let that through. 442 if (!IsURLSlash(spec[inner_parsed.path.begin])) { 443 return; 444 } 445 int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash 446 while (inner_path_end < spec_len && 447 !IsURLSlash(spec[inner_path_end])) 448 ++inner_path_end; 449 parsed->path.begin = inner_path_end; 450 int new_inner_path_length = inner_path_end - inner_parsed.path.begin; 451 parsed->path.len = inner_parsed.path.len - new_inner_path_length; 452 parsed->inner_parsed()->path.len = new_inner_path_length; 453 } 454 455 // Initializes a path URL which is merely a scheme followed by a path. Examples 456 // include "about:foo" and "javascript:alert('bar');" 457 template<typename CHAR> 458 void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) { 459 // Get the non-path and non-scheme parts of the URL out of the way, we never 460 // use them. 461 parsed->username.reset(); 462 parsed->password.reset(); 463 parsed->host.reset(); 464 parsed->port.reset(); 465 parsed->path.reset(); 466 parsed->query.reset(); 467 parsed->ref.reset(); 468 469 // Strip leading & trailing spaces and control characters. 470 int begin = 0; 471 TrimURL(spec, &begin, &spec_len); 472 473 // Handle empty specs or ones that contain only whitespace or control chars. 474 if (begin == spec_len) { 475 parsed->scheme.reset(); 476 parsed->path.reset(); 477 return; 478 } 479 480 // Extract the scheme, with the path being everything following. We also 481 // handle the case where there is no scheme. 482 if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 483 // Offset the results since we gave ExtractScheme a substring. 484 parsed->scheme.begin += begin; 485 begin = parsed->scheme.end() + 1; 486 } else { 487 parsed->scheme.reset(); 488 } 489 490 if (begin == spec_len) 491 return; 492 DCHECK_LT(begin, spec_len); 493 494 ParsePath(spec, 495 MakeRange(begin, spec_len), 496 &parsed->path, 497 &parsed->query, 498 &parsed->ref); 499 } 500 501 template<typename CHAR> 502 void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { 503 DCHECK(spec_len >= 0); 504 505 // Get the non-path and non-scheme parts of the URL out of the way, we never 506 // use them. 507 parsed->username.reset(); 508 parsed->password.reset(); 509 parsed->host.reset(); 510 parsed->port.reset(); 511 parsed->ref.reset(); 512 parsed->query.reset(); // May use this; reset for convenience. 513 514 // Strip leading & trailing spaces and control characters. 515 int begin = 0; 516 TrimURL(spec, &begin, &spec_len); 517 518 // Handle empty specs or ones that contain only whitespace or control chars. 519 if (begin == spec_len) { 520 parsed->scheme.reset(); 521 parsed->path.reset(); 522 return; 523 } 524 525 int path_begin = -1; 526 int path_end = -1; 527 528 // Extract the scheme, with the path being everything following. We also 529 // handle the case where there is no scheme. 530 if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 531 // Offset the results since we gave ExtractScheme a substring. 532 parsed->scheme.begin += begin; 533 534 if (parsed->scheme.end() != spec_len - 1) { 535 path_begin = parsed->scheme.end() + 1; 536 path_end = spec_len; 537 } 538 } else { 539 // No scheme found, just path. 540 parsed->scheme.reset(); 541 path_begin = begin; 542 path_end = spec_len; 543 } 544 545 // Split [path_begin, path_end) into a path + query. 546 for (int i = path_begin; i < path_end; ++i) { 547 if (spec[i] == '?') { 548 parsed->query = MakeRange(i + 1, path_end); 549 path_end = i; 550 break; 551 } 552 } 553 554 // For compatability with the standard URL parser, treat no path as 555 // -1, rather than having a length of 0 556 if (path_begin == path_end) { 557 parsed->path.reset(); 558 } else { 559 parsed->path = MakeRange(path_begin, path_end); 560 } 561 } 562 563 // Converts a port number in a string to an integer. We'd like to just call 564 // sscanf but our input is not NULL-terminated, which sscanf requires. Instead, 565 // we copy the digits to a small stack buffer (since we know the maximum number 566 // of digits in a valid port number) that we can NULL terminate. 567 template<typename CHAR> 568 int DoParsePort(const CHAR* spec, const Component& component) { 569 // Easy success case when there is no port. 570 const int kMaxDigits = 5; 571 if (!component.is_nonempty()) 572 return PORT_UNSPECIFIED; 573 574 // Skip over any leading 0s. 575 Component digits_comp(component.end(), 0); 576 for (int i = 0; i < component.len; i++) { 577 if (spec[component.begin + i] != '0') { 578 digits_comp = MakeRange(component.begin + i, component.end()); 579 break; 580 } 581 } 582 if (digits_comp.len == 0) 583 return 0; // All digits were 0. 584 585 // Verify we don't have too many digits (we'll be copying to our buffer so 586 // we need to double-check). 587 if (digits_comp.len > kMaxDigits) 588 return PORT_INVALID; 589 590 // Copy valid digits to the buffer. 591 char digits[kMaxDigits + 1]; // +1 for null terminator 592 for (int i = 0; i < digits_comp.len; i++) { 593 CHAR ch = spec[digits_comp.begin + i]; 594 if (!IsPortDigit(ch)) { 595 // Invalid port digit, fail. 596 return PORT_INVALID; 597 } 598 digits[i] = static_cast<char>(ch); 599 } 600 601 // Null-terminate the string and convert to integer. Since we guarantee 602 // only digits, atoi's lack of error handling is OK. 603 digits[digits_comp.len] = 0; 604 int port = atoi(digits); 605 if (port > 65535) 606 return PORT_INVALID; // Out of range. 607 return port; 608 } 609 610 template<typename CHAR> 611 void DoExtractFileName(const CHAR* spec, 612 const Component& path, 613 Component* file_name) { 614 // Handle empty paths: they have no file names. 615 if (!path.is_nonempty()) { 616 file_name->reset(); 617 return; 618 } 619 620 // Search backwards for a parameter, which is a normally unused field in a 621 // URL delimited by a semicolon. We parse the parameter as part of the 622 // path, but here, we don't want to count it. The last semicolon is the 623 // parameter. The path should start with a slash, so we don't need to check 624 // the first one. 625 int file_end = path.end(); 626 for (int i = path.end() - 1; i > path.begin; i--) { 627 if (spec[i] == ';') { 628 file_end = i; 629 break; 630 } 631 } 632 633 // Now search backwards from the filename end to the previous slash 634 // to find the beginning of the filename. 635 for (int i = file_end - 1; i >= path.begin; i--) { 636 if (IsURLSlash(spec[i])) { 637 // File name is everything following this character to the end 638 *file_name = MakeRange(i + 1, file_end); 639 return; 640 } 641 } 642 643 // No slash found, this means the input was degenerate (generally paths 644 // will start with a slash). Let's call everything the file name. 645 *file_name = MakeRange(path.begin, file_end); 646 return; 647 } 648 649 template<typename CHAR> 650 bool DoExtractQueryKeyValue(const CHAR* spec, 651 Component* query, 652 Component* key, 653 Component* value) { 654 if (!query->is_nonempty()) 655 return false; 656 657 int start = query->begin; 658 int cur = start; 659 int end = query->end(); 660 661 // We assume the beginning of the input is the beginning of the "key" and we 662 // skip to the end of it. 663 key->begin = cur; 664 while (cur < end && spec[cur] != '&' && spec[cur] != '=') 665 cur++; 666 key->len = cur - key->begin; 667 668 // Skip the separator after the key (if any). 669 if (cur < end && spec[cur] == '=') 670 cur++; 671 672 // Find the value part. 673 value->begin = cur; 674 while (cur < end && spec[cur] != '&') 675 cur++; 676 value->len = cur - value->begin; 677 678 // Finally skip the next separator if any 679 if (cur < end && spec[cur] == '&') 680 cur++; 681 682 // Save the new query 683 *query = url_parse::MakeRange(cur, end); 684 return true; 685 } 686 687 } // namespace 688 689 Parsed::Parsed() : inner_parsed_(NULL) { 690 } 691 692 Parsed::Parsed(const Parsed& other) : 693 scheme(other.scheme), 694 username(other.username), 695 password(other.password), 696 host(other.host), 697 port(other.port), 698 path(other.path), 699 query(other.query), 700 ref(other.ref), 701 inner_parsed_(NULL) { 702 if (other.inner_parsed_) 703 set_inner_parsed(*other.inner_parsed_); 704 } 705 706 Parsed& Parsed::operator=(const Parsed& other) { 707 if (this != &other) { 708 scheme = other.scheme; 709 username = other.username; 710 password = other.password; 711 host = other.host; 712 port = other.port; 713 path = other.path; 714 query = other.query; 715 ref = other.ref; 716 if (other.inner_parsed_) 717 set_inner_parsed(*other.inner_parsed_); 718 else 719 clear_inner_parsed(); 720 } 721 return *this; 722 } 723 724 Parsed::~Parsed() { 725 delete inner_parsed_; 726 } 727 728 int Parsed::Length() const { 729 if (ref.is_valid()) 730 return ref.end(); 731 return CountCharactersBefore(REF, false); 732 } 733 734 int Parsed::CountCharactersBefore(ComponentType type, 735 bool include_delimiter) const { 736 if (type == SCHEME) 737 return scheme.begin; 738 739 // There will be some characters after the scheme like "://" and we don't 740 // know how many. Search forwards for the next thing until we find one. 741 int cur = 0; 742 if (scheme.is_valid()) 743 cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. 744 745 if (username.is_valid()) { 746 if (type <= USERNAME) 747 return username.begin; 748 cur = username.end() + 1; // Advance over the '@' or ':' at the end. 749 } 750 751 if (password.is_valid()) { 752 if (type <= PASSWORD) 753 return password.begin; 754 cur = password.end() + 1; // Advance over the '@' at the end. 755 } 756 757 if (host.is_valid()) { 758 if (type <= HOST) 759 return host.begin; 760 cur = host.end(); 761 } 762 763 if (port.is_valid()) { 764 if (type < PORT || (type == PORT && include_delimiter)) 765 return port.begin - 1; // Back over delimiter. 766 if (type == PORT) 767 return port.begin; // Don't want delimiter counted. 768 cur = port.end(); 769 } 770 771 if (path.is_valid()) { 772 if (type <= PATH) 773 return path.begin; 774 cur = path.end(); 775 } 776 777 if (query.is_valid()) { 778 if (type < QUERY || (type == QUERY && include_delimiter)) 779 return query.begin - 1; // Back over delimiter. 780 if (type == QUERY) 781 return query.begin; // Don't want delimiter counted. 782 cur = query.end(); 783 } 784 785 if (ref.is_valid()) { 786 if (type == REF && !include_delimiter) 787 return ref.begin; // Back over delimiter. 788 789 // When there is a ref and we get here, the component we wanted was before 790 // this and not found, so we always know the beginning of the ref is right. 791 return ref.begin - 1; // Don't want delimiter counted. 792 } 793 794 return cur; 795 } 796 797 Component Parsed::GetContent() const { 798 const int begin = CountCharactersBefore(USERNAME, false); 799 const int len = Length() - begin; 800 // For compatability with the standard URL parser, we treat no content as 801 // -1, rather than having a length of 0 (we normally wouldn't care so 802 // much for these non-standard URLs). 803 return len ? Component(begin, len) : Component(); 804 } 805 806 bool ExtractScheme(const char* url, int url_len, Component* scheme) { 807 return DoExtractScheme(url, url_len, scheme); 808 } 809 810 bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) { 811 return DoExtractScheme(url, url_len, scheme); 812 } 813 814 // This handles everything that may be an authority terminator, including 815 // backslash. For special backslash handling see DoParseAfterScheme. 816 bool IsAuthorityTerminator(base::char16 ch) { 817 return IsURLSlash(ch) || ch == '?' || ch == '#'; 818 } 819 820 void ExtractFileName(const char* url, 821 const Component& path, 822 Component* file_name) { 823 DoExtractFileName(url, path, file_name); 824 } 825 826 void ExtractFileName(const base::char16* url, 827 const Component& path, 828 Component* file_name) { 829 DoExtractFileName(url, path, file_name); 830 } 831 832 bool ExtractQueryKeyValue(const char* url, 833 Component* query, 834 Component* key, 835 Component* value) { 836 return DoExtractQueryKeyValue(url, query, key, value); 837 } 838 839 bool ExtractQueryKeyValue(const base::char16* url, 840 Component* query, 841 Component* key, 842 Component* value) { 843 return DoExtractQueryKeyValue(url, query, key, value); 844 } 845 846 void ParseAuthority(const char* spec, 847 const Component& auth, 848 Component* username, 849 Component* password, 850 Component* hostname, 851 Component* port_num) { 852 DoParseAuthority(spec, auth, username, password, hostname, port_num); 853 } 854 855 void ParseAuthority(const base::char16* spec, 856 const Component& auth, 857 Component* username, 858 Component* password, 859 Component* hostname, 860 Component* port_num) { 861 DoParseAuthority(spec, auth, username, password, hostname, port_num); 862 } 863 864 int ParsePort(const char* url, const Component& port) { 865 return DoParsePort(url, port); 866 } 867 868 int ParsePort(const base::char16* url, const Component& port) { 869 return DoParsePort(url, port); 870 } 871 872 void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { 873 DoParseStandardURL(url, url_len, parsed); 874 } 875 876 void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { 877 DoParseStandardURL(url, url_len, parsed); 878 } 879 880 void ParsePathURL(const char* url, int url_len, Parsed* parsed) { 881 DoParsePathURL(url, url_len, parsed); 882 } 883 884 void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) { 885 DoParsePathURL(url, url_len, parsed); 886 } 887 888 void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { 889 DoParseFileSystemURL(url, url_len, parsed); 890 } 891 892 void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) { 893 DoParseFileSystemURL(url, url_len, parsed); 894 } 895 896 void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { 897 DoParseMailtoURL(url, url_len, parsed); 898 } 899 900 void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) { 901 DoParseMailtoURL(url, url_len, parsed); 902 } 903 904 void ParsePathInternal(const char* spec, 905 const Component& path, 906 Component* filepath, 907 Component* query, 908 Component* ref) { 909 ParsePath(spec, path, filepath, query, ref); 910 } 911 912 void ParsePathInternal(const base::char16* spec, 913 const Component& path, 914 Component* filepath, 915 Component* query, 916 Component* ref) { 917 ParsePath(spec, path, filepath, query, ref); 918 } 919 920 void ParseAfterScheme(const char* spec, 921 int spec_len, 922 int after_scheme, 923 Parsed* parsed) { 924 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 925 } 926 927 void ParseAfterScheme(const base::char16* spec, 928 int spec_len, 929 int after_scheme, 930 Parsed* parsed) { 931 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 932 } 933 934 } // namespace url_parse 935