1 /* Based on nsURLParsers.cc from Mozilla 2 * ------------------------------------- 3 * The contents of this file are subject to the Mozilla Public License Version 4 * 1.1 (the "License"); you may not use this file except in compliance with 5 * the License. You may obtain a copy of the License at 6 * http://www.mozilla.org/MPL/ 7 * 8 * Software distributed under the License is distributed on an "AS IS" basis, 9 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 10 * for the specific language governing rights and limitations under the 11 * License. 12 * 13 * The Original Code is mozilla.org code. 14 * 15 * The Initial Developer of the Original Code is 16 * Netscape Communications Corporation. 17 * Portions created by the Initial Developer are Copyright (C) 1998 18 * the Initial Developer. All Rights Reserved. 19 * 20 * Contributor(s): 21 * Darin Fisher (original author) 22 * 23 * Alternatively, the contents of this file may be used under the terms of 24 * either the GNU General Public License Version 2 or later (the "GPL"), or 25 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 26 * in which case the provisions of the GPL or the LGPL are applicable instead 27 * of those above. If you wish to allow use of your version of this file only 28 * under the terms of either the GPL or the LGPL, and not to allow others to 29 * use your version of this file under the terms of the MPL, indicate your 30 * decision by deleting the provisions above and replace them with the notice 31 * and other provisions required by the GPL or the LGPL. If you do not delete 32 * the provisions above, a recipient may use your version of this file under 33 * the terms of any one of the MPL, the GPL or the LGPL. 34 * 35 * ***** END LICENSE BLOCK ***** */ 36 37 #include "url/third_party/mozilla/url_parse.h" 38 39 #include <stdlib.h> 40 41 #include "base/logging.h" 42 #include "url/url_parse_internal.h" 43 #include "url/url_util.h" 44 #include "url/url_util_internal.h" 45 46 namespace url_parse { 47 48 namespace { 49 50 // Returns true if the given character is a valid digit to use in a port. 51 inline bool IsPortDigit(base::char16 ch) { 52 return ch >= '0' && ch <= '9'; 53 } 54 55 // Returns the offset of the next authority terminator in the input starting 56 // from start_offset. If no terminator is found, the return value will be equal 57 // to spec_len. 58 template<typename CHAR> 59 int FindNextAuthorityTerminator(const CHAR* spec, 60 int start_offset, 61 int spec_len) { 62 for (int i = start_offset; i < spec_len; i++) { 63 if (IsAuthorityTerminator(spec[i])) 64 return i; 65 } 66 return spec_len; // Not found. 67 } 68 69 template<typename CHAR> 70 void ParseUserInfo(const CHAR* spec, 71 const Component& user, 72 Component* username, 73 Component* password) { 74 // Find the first colon in the user section, which separates the username and 75 // password. 76 int colon_offset = 0; 77 while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') 78 colon_offset++; 79 80 if (colon_offset < user.len) { 81 // Found separator: <username>:<password> 82 *username = Component(user.begin, colon_offset); 83 *password = MakeRange(user.begin + colon_offset + 1, 84 user.begin + user.len); 85 } else { 86 // No separator, treat everything as the username 87 *username = user; 88 *password = Component(); 89 } 90 } 91 92 template<typename CHAR> 93 void ParseServerInfo(const CHAR* spec, 94 const Component& serverinfo, 95 Component* hostname, 96 Component* port_num) { 97 if (serverinfo.len == 0) { 98 // No server info, host name is empty. 99 hostname->reset(); 100 port_num->reset(); 101 return; 102 } 103 104 // If the host starts with a left-bracket, assume the entire host is an 105 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. 106 // This assumption will be overridden if we find a right-bracket. 107 // 108 // Our IPv6 address canonicalization code requires both brackets to exist, 109 // but the ability to locate an incomplete address can still be useful. 110 int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; 111 int colon = -1; 112 113 // Find the last right-bracket, and the last colon. 114 for (int i = serverinfo.begin; i < serverinfo.end(); i++) { 115 switch (spec[i]) { 116 case ']': 117 ipv6_terminator = i; 118 break; 119 case ':': 120 colon = i; 121 break; 122 } 123 } 124 125 if (colon > ipv6_terminator) { 126 // Found a port number: <hostname>:<port> 127 *hostname = MakeRange(serverinfo.begin, colon); 128 if (hostname->len == 0) 129 hostname->reset(); 130 *port_num = MakeRange(colon + 1, serverinfo.end()); 131 } else { 132 // No port: <hostname> 133 *hostname = serverinfo; 134 port_num->reset(); 135 } 136 } 137 138 // Given an already-identified auth section, breaks it into its consituent 139 // parts. The port number will be parsed and the resulting integer will be 140 // filled into the given *port variable, or -1 if there is no port number or it 141 // is invalid. 142 template<typename CHAR> 143 void DoParseAuthority(const CHAR* spec, 144 const Component& auth, 145 Component* username, 146 Component* password, 147 Component* hostname, 148 Component* port_num) { 149 DCHECK(auth.is_valid()) << "We should always get an authority"; 150 if (auth.len == 0) { 151 username->reset(); 152 password->reset(); 153 hostname->reset(); 154 port_num->reset(); 155 return; 156 } 157 158 // Search backwards for @, which is the separator between the user info and 159 // the server info. 160 int i = auth.begin + auth.len - 1; 161 while (i > auth.begin && spec[i] != '@') 162 i--; 163 164 if (spec[i] == '@') { 165 // Found user info: <user-info>@<server-info> 166 ParseUserInfo(spec, Component(auth.begin, i - auth.begin), 167 username, password); 168 ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), 169 hostname, port_num); 170 } else { 171 // No user info, everything is server info. 172 username->reset(); 173 password->reset(); 174 ParseServerInfo(spec, auth, hostname, port_num); 175 } 176 } 177 178 template<typename CHAR> 179 void ParsePath(const CHAR* spec, 180 const Component& path, 181 Component* filepath, 182 Component* query, 183 Component* ref) { 184 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> 185 186 // Special case when there is no path. 187 if (path.len == -1) { 188 filepath->reset(); 189 query->reset(); 190 ref->reset(); 191 return; 192 } 193 DCHECK(path.len > 0) << "We should never have 0 length paths"; 194 195 // Search for first occurrence of either ? or #. 196 int path_end = path.begin + path.len; 197 198 int query_separator = -1; // Index of the '?' 199 int ref_separator = -1; // Index of the '#' 200 for (int i = path.begin; i < path_end; i++) { 201 switch (spec[i]) { 202 case '?': 203 // Only match the query string if it precedes the reference fragment 204 // and when we haven't found one already. 205 if (ref_separator < 0 && query_separator < 0) 206 query_separator = i; 207 break; 208 case '#': 209 // Record the first # sign only. 210 if (ref_separator < 0) 211 ref_separator = i; 212 break; 213 } 214 } 215 216 // Markers pointing to the character after each of these corresponding 217 // components. The code below words from the end back to the beginning, 218 // and will update these indices as it finds components that exist. 219 int file_end, query_end; 220 221 // Ref fragment: from the # to the end of the path. 222 if (ref_separator >= 0) { 223 file_end = query_end = ref_separator; 224 *ref = MakeRange(ref_separator + 1, path_end); 225 } else { 226 file_end = query_end = path_end; 227 ref->reset(); 228 } 229 230 // Query fragment: everything from the ? to the next boundary (either the end 231 // of the path or the ref fragment). 232 if (query_separator >= 0) { 233 file_end = query_separator; 234 *query = MakeRange(query_separator + 1, query_end); 235 } else { 236 query->reset(); 237 } 238 239 // File path: treat an empty file path as no file path. 240 if (file_end != path.begin) 241 *filepath = MakeRange(path.begin, file_end); 242 else 243 filepath->reset(); 244 } 245 246 template<typename CHAR> 247 bool DoExtractScheme(const CHAR* url, 248 int url_len, 249 Component* scheme) { 250 // Skip leading whitespace and control characters. 251 int begin = 0; 252 while (begin < url_len && ShouldTrimFromURL(url[begin])) 253 begin++; 254 if (begin == url_len) 255 return false; // Input is empty or all whitespace. 256 257 // Find the first colon character. 258 for (int i = begin; i < url_len; i++) { 259 if (url[i] == ':') { 260 *scheme = MakeRange(begin, i); 261 return true; 262 } 263 } 264 return false; // No colon found: no scheme 265 } 266 267 // Fills in all members of the Parsed structure except for the scheme. 268 // 269 // |spec| is the full spec being parsed, of length |spec_len|. 270 // |after_scheme| is the character immediately following the scheme (after the 271 // colon) where we'll begin parsing. 272 // 273 // Compatability data points. I list "host", "path" extracted: 274 // Input IE6 Firefox Us 275 // ----- -------------- -------------- -------------- 276 // http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 277 // http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 278 // http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" 279 // http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" 280 // http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" 281 // 282 // (*) Interestingly, although IE fails to load these URLs, its history 283 // canonicalizer handles them, meaning if you've been to the corresponding 284 // "http://foo.com/" link, it will be colored. 285 template <typename CHAR> 286 void DoParseAfterScheme(const CHAR* spec, 287 int spec_len, 288 int after_scheme, 289 Parsed* parsed) { 290 int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); 291 int after_slashes = after_scheme + num_slashes; 292 293 // First split into two main parts, the authority (username, password, host, 294 // and port) and the full path (path, query, and reference). 295 Component authority; 296 Component full_path; 297 298 // Found "//<some data>", looks like an authority section. Treat everything 299 // from there to the next slash (or end of spec) to be the authority. Note 300 // that we ignore the number of slashes and treat it as the authority. 301 int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); 302 authority = Component(after_slashes, end_auth - after_slashes); 303 304 if (end_auth == spec_len) // No beginning of path found. 305 full_path = Component(); 306 else // Everything starting from the slash to the end is the path. 307 full_path = Component(end_auth, spec_len - end_auth); 308 309 // Now parse those two sub-parts. 310 DoParseAuthority(spec, authority, &parsed->username, &parsed->password, 311 &parsed->host, &parsed->port); 312 ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); 313 } 314 315 // The main parsing function for standard URLs. Standard URLs have a scheme, 316 // host, path, etc. 317 template<typename CHAR> 318 void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { 319 DCHECK(spec_len >= 0); 320 321 // Strip leading & trailing spaces and control characters. 322 int begin = 0; 323 TrimURL(spec, &begin, &spec_len); 324 325 int after_scheme; 326 if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { 327 after_scheme = parsed->scheme.end() + 1; // Skip past the colon. 328 } else { 329 // Say there's no scheme when there is no colon. We could also say that 330 // everything is the scheme. Both would produce an invalid URL, but this way 331 // seems less wrong in more cases. 332 parsed->scheme.reset(); 333 after_scheme = begin; 334 } 335 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 336 } 337 338 template<typename CHAR> 339 void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { 340 DCHECK(spec_len >= 0); 341 342 // Get the unused parts of the URL out of the way. 343 parsed->username.reset(); 344 parsed->password.reset(); 345 parsed->host.reset(); 346 parsed->port.reset(); 347 parsed->path.reset(); // May use this; reset for convenience. 348 parsed->ref.reset(); // May use this; reset for convenience. 349 parsed->query.reset(); // May use this; reset for convenience. 350 parsed->clear_inner_parsed(); // May use this; reset for convenience. 351 352 // Strip leading & trailing spaces and control characters. 353 int begin = 0; 354 TrimURL(spec, &begin, &spec_len); 355 356 // Handle empty specs or ones that contain only whitespace or control chars. 357 if (begin == spec_len) { 358 parsed->scheme.reset(); 359 return; 360 } 361 362 int inner_start = -1; 363 364 // Extract the scheme. We also handle the case where there is no scheme. 365 if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 366 // Offset the results since we gave ExtractScheme a substring. 367 parsed->scheme.begin += begin; 368 369 if (parsed->scheme.end() == spec_len - 1) 370 return; 371 372 inner_start = parsed->scheme.end() + 1; 373 } else { 374 // No scheme found; that's not valid for filesystem URLs. 375 parsed->scheme.reset(); 376 return; 377 } 378 379 url_parse::Component inner_scheme; 380 const CHAR* inner_spec = &spec[inner_start]; 381 int inner_spec_len = spec_len - inner_start; 382 383 if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { 384 // Offset the results since we gave ExtractScheme a substring. 385 inner_scheme.begin += inner_start; 386 387 if (inner_scheme.end() == spec_len - 1) 388 return; 389 } else { 390 // No scheme found; that's not valid for filesystem URLs. 391 // The best we can do is return "filesystem://". 392 return; 393 } 394 395 Parsed inner_parsed; 396 397 if (url_util::CompareSchemeComponent( 398 spec, inner_scheme, url_util::kFileScheme)) { 399 // File URLs are special. 400 ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); 401 } else if (url_util::CompareSchemeComponent(spec, inner_scheme, 402 url_util::kFileSystemScheme)) { 403 // Filesystem URLs don't nest. 404 return; 405 } else if (url_util::IsStandard(spec, inner_scheme)) { 406 // All "normal" URLs. 407 DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); 408 } else { 409 return; 410 } 411 412 // All members of inner_parsed need to be offset by inner_start. 413 // If we had any scheme that supported nesting more than one level deep, 414 // we'd have to recurse into the inner_parsed's inner_parsed when 415 // adjusting by inner_start. 416 inner_parsed.scheme.begin += inner_start; 417 inner_parsed.username.begin += inner_start; 418 inner_parsed.password.begin += inner_start; 419 inner_parsed.host.begin += inner_start; 420 inner_parsed.port.begin += inner_start; 421 inner_parsed.query.begin += inner_start; 422 inner_parsed.ref.begin += inner_start; 423 inner_parsed.path.begin += inner_start; 424 425 // Query and ref move from inner_parsed to parsed. 426 parsed->query = inner_parsed.query; 427 inner_parsed.query.reset(); 428 parsed->ref = inner_parsed.ref; 429 inner_parsed.ref.reset(); 430 431 parsed->set_inner_parsed(inner_parsed); 432 if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || 433 inner_parsed.inner_parsed()) { 434 return; 435 } 436 437 // The path in inner_parsed should start with a slash, then have a filesystem 438 // type followed by a slash. From the first slash up to but excluding the 439 // second should be what it keeps; the rest goes to parsed. If the path ends 440 // before the second slash, it's still pretty clear what the user meant, so 441 // we'll let that through. 442 if (!IsURLSlash(spec[inner_parsed.path.begin])) { 443 return; 444 } 445 int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash 446 while (inner_path_end < spec_len && 447 !IsURLSlash(spec[inner_path_end])) 448 ++inner_path_end; 449 parsed->path.begin = inner_path_end; 450 int new_inner_path_length = inner_path_end - inner_parsed.path.begin; 451 parsed->path.len = inner_parsed.path.len - new_inner_path_length; 452 parsed->inner_parsed()->path.len = new_inner_path_length; 453 } 454 455 // Initializes a path URL which is merely a scheme followed by a path. Examples 456 // include "about:foo" and "javascript:alert('bar');" 457 template<typename CHAR> 458 void DoParsePathURL(const CHAR* spec, int spec_len, 459 bool trim_path_end, 460 Parsed* parsed) { 461 // Get the non-path and non-scheme parts of the URL out of the way, we never 462 // use them. 463 parsed->username.reset(); 464 parsed->password.reset(); 465 parsed->host.reset(); 466 parsed->port.reset(); 467 parsed->path.reset(); 468 parsed->query.reset(); 469 parsed->ref.reset(); 470 471 // Strip leading & trailing spaces and control characters. 472 int scheme_begin = 0; 473 TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); 474 475 // Handle empty specs or ones that contain only whitespace or control chars. 476 if (scheme_begin == spec_len) { 477 parsed->scheme.reset(); 478 parsed->path.reset(); 479 return; 480 } 481 482 int path_begin; 483 // Extract the scheme, with the path being everything following. We also 484 // handle the case where there is no scheme. 485 if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, 486 &parsed->scheme)) { 487 // Offset the results since we gave ExtractScheme a substring. 488 parsed->scheme.begin += scheme_begin; 489 path_begin = parsed->scheme.end() + 1; 490 } else { 491 // No scheme case. 492 parsed->scheme.reset(); 493 path_begin = scheme_begin; 494 } 495 496 if (path_begin == spec_len) 497 return; 498 DCHECK_LT(path_begin, spec_len); 499 500 ParsePath(spec, 501 MakeRange(path_begin, spec_len), 502 &parsed->path, 503 &parsed->query, 504 &parsed->ref); 505 } 506 507 template<typename CHAR> 508 void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { 509 DCHECK(spec_len >= 0); 510 511 // Get the non-path and non-scheme parts of the URL out of the way, we never 512 // use them. 513 parsed->username.reset(); 514 parsed->password.reset(); 515 parsed->host.reset(); 516 parsed->port.reset(); 517 parsed->ref.reset(); 518 parsed->query.reset(); // May use this; reset for convenience. 519 520 // Strip leading & trailing spaces and control characters. 521 int begin = 0; 522 TrimURL(spec, &begin, &spec_len); 523 524 // Handle empty specs or ones that contain only whitespace or control chars. 525 if (begin == spec_len) { 526 parsed->scheme.reset(); 527 parsed->path.reset(); 528 return; 529 } 530 531 int path_begin = -1; 532 int path_end = -1; 533 534 // Extract the scheme, with the path being everything following. We also 535 // handle the case where there is no scheme. 536 if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { 537 // Offset the results since we gave ExtractScheme a substring. 538 parsed->scheme.begin += begin; 539 540 if (parsed->scheme.end() != spec_len - 1) { 541 path_begin = parsed->scheme.end() + 1; 542 path_end = spec_len; 543 } 544 } else { 545 // No scheme found, just path. 546 parsed->scheme.reset(); 547 path_begin = begin; 548 path_end = spec_len; 549 } 550 551 // Split [path_begin, path_end) into a path + query. 552 for (int i = path_begin; i < path_end; ++i) { 553 if (spec[i] == '?') { 554 parsed->query = MakeRange(i + 1, path_end); 555 path_end = i; 556 break; 557 } 558 } 559 560 // For compatability with the standard URL parser, treat no path as 561 // -1, rather than having a length of 0 562 if (path_begin == path_end) { 563 parsed->path.reset(); 564 } else { 565 parsed->path = MakeRange(path_begin, path_end); 566 } 567 } 568 569 // Converts a port number in a string to an integer. We'd like to just call 570 // sscanf but our input is not NULL-terminated, which sscanf requires. Instead, 571 // we copy the digits to a small stack buffer (since we know the maximum number 572 // of digits in a valid port number) that we can NULL terminate. 573 template<typename CHAR> 574 int DoParsePort(const CHAR* spec, const Component& component) { 575 // Easy success case when there is no port. 576 const int kMaxDigits = 5; 577 if (!component.is_nonempty()) 578 return PORT_UNSPECIFIED; 579 580 // Skip over any leading 0s. 581 Component digits_comp(component.end(), 0); 582 for (int i = 0; i < component.len; i++) { 583 if (spec[component.begin + i] != '0') { 584 digits_comp = MakeRange(component.begin + i, component.end()); 585 break; 586 } 587 } 588 if (digits_comp.len == 0) 589 return 0; // All digits were 0. 590 591 // Verify we don't have too many digits (we'll be copying to our buffer so 592 // we need to double-check). 593 if (digits_comp.len > kMaxDigits) 594 return PORT_INVALID; 595 596 // Copy valid digits to the buffer. 597 char digits[kMaxDigits + 1]; // +1 for null terminator 598 for (int i = 0; i < digits_comp.len; i++) { 599 CHAR ch = spec[digits_comp.begin + i]; 600 if (!IsPortDigit(ch)) { 601 // Invalid port digit, fail. 602 return PORT_INVALID; 603 } 604 digits[i] = static_cast<char>(ch); 605 } 606 607 // Null-terminate the string and convert to integer. Since we guarantee 608 // only digits, atoi's lack of error handling is OK. 609 digits[digits_comp.len] = 0; 610 int port = atoi(digits); 611 if (port > 65535) 612 return PORT_INVALID; // Out of range. 613 return port; 614 } 615 616 template<typename CHAR> 617 void DoExtractFileName(const CHAR* spec, 618 const Component& path, 619 Component* file_name) { 620 // Handle empty paths: they have no file names. 621 if (!path.is_nonempty()) { 622 file_name->reset(); 623 return; 624 } 625 626 // Search backwards for a parameter, which is a normally unused field in a 627 // URL delimited by a semicolon. We parse the parameter as part of the 628 // path, but here, we don't want to count it. The last semicolon is the 629 // parameter. The path should start with a slash, so we don't need to check 630 // the first one. 631 int file_end = path.end(); 632 for (int i = path.end() - 1; i > path.begin; i--) { 633 if (spec[i] == ';') { 634 file_end = i; 635 break; 636 } 637 } 638 639 // Now search backwards from the filename end to the previous slash 640 // to find the beginning of the filename. 641 for (int i = file_end - 1; i >= path.begin; i--) { 642 if (IsURLSlash(spec[i])) { 643 // File name is everything following this character to the end 644 *file_name = MakeRange(i + 1, file_end); 645 return; 646 } 647 } 648 649 // No slash found, this means the input was degenerate (generally paths 650 // will start with a slash). Let's call everything the file name. 651 *file_name = MakeRange(path.begin, file_end); 652 return; 653 } 654 655 template<typename CHAR> 656 bool DoExtractQueryKeyValue(const CHAR* spec, 657 Component* query, 658 Component* key, 659 Component* value) { 660 if (!query->is_nonempty()) 661 return false; 662 663 int start = query->begin; 664 int cur = start; 665 int end = query->end(); 666 667 // We assume the beginning of the input is the beginning of the "key" and we 668 // skip to the end of it. 669 key->begin = cur; 670 while (cur < end && spec[cur] != '&' && spec[cur] != '=') 671 cur++; 672 key->len = cur - key->begin; 673 674 // Skip the separator after the key (if any). 675 if (cur < end && spec[cur] == '=') 676 cur++; 677 678 // Find the value part. 679 value->begin = cur; 680 while (cur < end && spec[cur] != '&') 681 cur++; 682 value->len = cur - value->begin; 683 684 // Finally skip the next separator if any 685 if (cur < end && spec[cur] == '&') 686 cur++; 687 688 // Save the new query 689 *query = url_parse::MakeRange(cur, end); 690 return true; 691 } 692 693 } // namespace 694 695 Parsed::Parsed() : inner_parsed_(NULL) { 696 } 697 698 Parsed::Parsed(const Parsed& other) : 699 scheme(other.scheme), 700 username(other.username), 701 password(other.password), 702 host(other.host), 703 port(other.port), 704 path(other.path), 705 query(other.query), 706 ref(other.ref), 707 inner_parsed_(NULL) { 708 if (other.inner_parsed_) 709 set_inner_parsed(*other.inner_parsed_); 710 } 711 712 Parsed& Parsed::operator=(const Parsed& other) { 713 if (this != &other) { 714 scheme = other.scheme; 715 username = other.username; 716 password = other.password; 717 host = other.host; 718 port = other.port; 719 path = other.path; 720 query = other.query; 721 ref = other.ref; 722 if (other.inner_parsed_) 723 set_inner_parsed(*other.inner_parsed_); 724 else 725 clear_inner_parsed(); 726 } 727 return *this; 728 } 729 730 Parsed::~Parsed() { 731 delete inner_parsed_; 732 } 733 734 int Parsed::Length() const { 735 if (ref.is_valid()) 736 return ref.end(); 737 return CountCharactersBefore(REF, false); 738 } 739 740 int Parsed::CountCharactersBefore(ComponentType type, 741 bool include_delimiter) const { 742 if (type == SCHEME) 743 return scheme.begin; 744 745 // There will be some characters after the scheme like "://" and we don't 746 // know how many. Search forwards for the next thing until we find one. 747 int cur = 0; 748 if (scheme.is_valid()) 749 cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. 750 751 if (username.is_valid()) { 752 if (type <= USERNAME) 753 return username.begin; 754 cur = username.end() + 1; // Advance over the '@' or ':' at the end. 755 } 756 757 if (password.is_valid()) { 758 if (type <= PASSWORD) 759 return password.begin; 760 cur = password.end() + 1; // Advance over the '@' at the end. 761 } 762 763 if (host.is_valid()) { 764 if (type <= HOST) 765 return host.begin; 766 cur = host.end(); 767 } 768 769 if (port.is_valid()) { 770 if (type < PORT || (type == PORT && include_delimiter)) 771 return port.begin - 1; // Back over delimiter. 772 if (type == PORT) 773 return port.begin; // Don't want delimiter counted. 774 cur = port.end(); 775 } 776 777 if (path.is_valid()) { 778 if (type <= PATH) 779 return path.begin; 780 cur = path.end(); 781 } 782 783 if (query.is_valid()) { 784 if (type < QUERY || (type == QUERY && include_delimiter)) 785 return query.begin - 1; // Back over delimiter. 786 if (type == QUERY) 787 return query.begin; // Don't want delimiter counted. 788 cur = query.end(); 789 } 790 791 if (ref.is_valid()) { 792 if (type == REF && !include_delimiter) 793 return ref.begin; // Back over delimiter. 794 795 // When there is a ref and we get here, the component we wanted was before 796 // this and not found, so we always know the beginning of the ref is right. 797 return ref.begin - 1; // Don't want delimiter counted. 798 } 799 800 return cur; 801 } 802 803 Component Parsed::GetContent() const { 804 const int begin = CountCharactersBefore(USERNAME, false); 805 const int len = Length() - begin; 806 // For compatability with the standard URL parser, we treat no content as 807 // -1, rather than having a length of 0 (we normally wouldn't care so 808 // much for these non-standard URLs). 809 return len ? Component(begin, len) : Component(); 810 } 811 812 bool ExtractScheme(const char* url, int url_len, Component* scheme) { 813 return DoExtractScheme(url, url_len, scheme); 814 } 815 816 bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) { 817 return DoExtractScheme(url, url_len, scheme); 818 } 819 820 // This handles everything that may be an authority terminator, including 821 // backslash. For special backslash handling see DoParseAfterScheme. 822 bool IsAuthorityTerminator(base::char16 ch) { 823 return IsURLSlash(ch) || ch == '?' || ch == '#'; 824 } 825 826 void ExtractFileName(const char* url, 827 const Component& path, 828 Component* file_name) { 829 DoExtractFileName(url, path, file_name); 830 } 831 832 void ExtractFileName(const base::char16* url, 833 const Component& path, 834 Component* file_name) { 835 DoExtractFileName(url, path, file_name); 836 } 837 838 bool ExtractQueryKeyValue(const char* url, 839 Component* query, 840 Component* key, 841 Component* value) { 842 return DoExtractQueryKeyValue(url, query, key, value); 843 } 844 845 bool ExtractQueryKeyValue(const base::char16* url, 846 Component* query, 847 Component* key, 848 Component* value) { 849 return DoExtractQueryKeyValue(url, query, key, value); 850 } 851 852 void ParseAuthority(const char* spec, 853 const Component& auth, 854 Component* username, 855 Component* password, 856 Component* hostname, 857 Component* port_num) { 858 DoParseAuthority(spec, auth, username, password, hostname, port_num); 859 } 860 861 void ParseAuthority(const base::char16* spec, 862 const Component& auth, 863 Component* username, 864 Component* password, 865 Component* hostname, 866 Component* port_num) { 867 DoParseAuthority(spec, auth, username, password, hostname, port_num); 868 } 869 870 int ParsePort(const char* url, const Component& port) { 871 return DoParsePort(url, port); 872 } 873 874 int ParsePort(const base::char16* url, const Component& port) { 875 return DoParsePort(url, port); 876 } 877 878 void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { 879 DoParseStandardURL(url, url_len, parsed); 880 } 881 882 void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { 883 DoParseStandardURL(url, url_len, parsed); 884 } 885 886 void ParsePathURL(const char* url, 887 int url_len, 888 bool trim_path_end, 889 Parsed* parsed) { 890 DoParsePathURL(url, url_len, trim_path_end, parsed); 891 } 892 893 void ParsePathURL(const base::char16* url, 894 int url_len, 895 bool trim_path_end, 896 Parsed* parsed) { 897 DoParsePathURL(url, url_len, trim_path_end, parsed); 898 } 899 900 void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { 901 DoParseFileSystemURL(url, url_len, parsed); 902 } 903 904 void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) { 905 DoParseFileSystemURL(url, url_len, parsed); 906 } 907 908 void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { 909 DoParseMailtoURL(url, url_len, parsed); 910 } 911 912 void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) { 913 DoParseMailtoURL(url, url_len, parsed); 914 } 915 916 void ParsePathInternal(const char* spec, 917 const Component& path, 918 Component* filepath, 919 Component* query, 920 Component* ref) { 921 ParsePath(spec, path, filepath, query, ref); 922 } 923 924 void ParsePathInternal(const base::char16* spec, 925 const Component& path, 926 Component* filepath, 927 Component* query, 928 Component* ref) { 929 ParsePath(spec, path, filepath, query, ref); 930 } 931 932 void ParseAfterScheme(const char* spec, 933 int spec_len, 934 int after_scheme, 935 Parsed* parsed) { 936 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 937 } 938 939 void ParseAfterScheme(const base::char16* spec, 940 int spec_len, 941 int after_scheme, 942 Parsed* parsed) { 943 DoParseAfterScheme(spec, spec_len, after_scheme, parsed); 944 } 945 946 } // namespace url_parse 947