1 /* 2 * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are 7 * met: 8 * 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following disclaimer 13 * in the documentation and/or other materials provided with the 14 * distribution. 15 * * Neither the name of Google Inc. nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include "config.h" 33 34 #if USE(GOOGLEURL) 35 #include "KURL.h" 36 37 #ifndef NDEBUG 38 #include <stdio.h> 39 #endif 40 41 #include <algorithm> 42 43 #include "NotImplemented.h" 44 #include "TextEncoding.h" 45 #include <wtf/HashMap.h> 46 #include <wtf/Vector.h> 47 #include <wtf/StdLibExtras.h> 48 #include <wtf/text/CString.h> 49 #include <wtf/text/StringHash.h> 50 51 #include <googleurl/src/url_util.h> 52 53 using WTF::isASCIILower; 54 using WTF::toASCIILower; 55 using std::binary_search; 56 57 namespace WebCore { 58 59 static const int maximumValidPortNumber = 0xFFFE; 60 static const int invalidPortNumber = 0xFFFF; 61 62 // Wraps WebCore's text encoding in a character set converter for the 63 // canonicalizer. 64 class KURLCharsetConverter : public url_canon::CharsetConverter { 65 public: 66 // The encoding parameter may be 0, but in this case the object must not be called. 67 KURLCharsetConverter(const TextEncoding* encoding) 68 : m_encoding(encoding) 69 { 70 } 71 72 virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength, 73 url_canon::CanonOutput* output) 74 { 75 CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables); 76 output->Append(encoded.data(), static_cast<int>(encoded.length())); 77 } 78 79 private: 80 const TextEncoding* m_encoding; 81 }; 82 83 // Note that this function must be named differently than the one in KURL.cpp 84 // since our unit tests evilly include both files, and their local definition 85 // will be ambiguous. 86 static inline void assertProtocolIsGood(const char* protocol) 87 { 88 #ifndef NDEBUG 89 const char* p = protocol; 90 while (*p) { 91 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 92 ++p; 93 } 94 #endif 95 } 96 97 // Returns the characters for the given string, or a pointer to a static empty 98 // string if the input string is null. This will always ensure we have a non- 99 // null character pointer since ReplaceComponents has special meaning for null. 100 static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str) 101 { 102 static const url_parse::UTF16Char zero = 0; 103 return str.characters() ? 104 reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) : 105 &zero; 106 } 107 108 static inline bool isUnicodeEncoding(const TextEncoding* encoding) 109 { 110 return encoding->encodingForFormSubmission() == UTF8Encoding(); 111 } 112 113 static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str) 114 { 115 while (begin != end && *str) { 116 ASSERT(toASCIILower(*str) == *str); 117 if (toASCIILower(*begin++) != *str++) 118 return false; 119 } 120 121 // Both strings are equal (ignoring case) if and only if all of the characters were equal, 122 // and the end of both has been reached. 123 return begin == end && !*str; 124 } 125 126 static inline bool isSchemeFirstChar(char c) 127 { 128 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 129 } 130 131 static inline bool isSchemeChar(char c) 132 { 133 return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '*'; 134 } 135 136 bool isValidProtocol(const String& protocol) 137 { 138 // NOTE This is a copy of the function in KURL.cpp. 139 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 140 if (protocol.isEmpty()) 141 return false; 142 if (!isSchemeFirstChar(protocol[0])) 143 return false; 144 unsigned protocolLength = protocol.length(); 145 for (unsigned i = 1; i < protocolLength; i++) { 146 if (!isSchemeChar(protocol[i])) 147 return false; 148 } 149 return true; 150 } 151 152 153 // KURLGooglePrivate ----------------------------------------------------------- 154 155 KURLGooglePrivate::KURLGooglePrivate() 156 : m_isValid(false) 157 , m_protocolInHTTPFamily(false) 158 , m_utf8IsASCII(true) 159 , m_stringIsValid(false) 160 { 161 } 162 163 KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid) 164 : m_isValid(isValid) 165 , m_protocolInHTTPFamily(false) 166 , m_parsed(parsed) 167 , m_utf8IsASCII(true) 168 , m_stringIsValid(false) 169 { 170 } 171 172 KURLGooglePrivate::KURLGooglePrivate(WTF::HashTableDeletedValueType) 173 : m_string(WTF::HashTableDeletedValue) 174 { 175 } 176 177 // Setters for the data. Using the ASCII version when you know the 178 // data is ASCII will be slightly more efficient. The UTF-8 version 179 // will always be correct if the caller is unsure. 180 void KURLGooglePrivate::setUtf8(const CString& str) 181 { 182 const char* data = str.data(); 183 unsigned dataLength = str.length(); 184 185 // The m_utf8IsASCII must always be correct since the DeprecatedString 186 // getter must create it with the proper constructor. This test can be 187 // removed when DeprecatedString is gone, but it still might be a 188 // performance win. 189 m_utf8IsASCII = true; 190 for (unsigned i = 0; i < dataLength; i++) { 191 if (static_cast<unsigned char>(data[i]) >= 0x80) { 192 m_utf8IsASCII = false; 193 break; 194 } 195 } 196 197 m_utf8 = str; 198 m_stringIsValid = false; 199 initProtocolInHTTPFamily(); 200 } 201 202 void KURLGooglePrivate::setAscii(const CString& str) 203 { 204 m_utf8 = str; 205 m_utf8IsASCII = true; 206 m_stringIsValid = false; 207 initProtocolInHTTPFamily(); 208 } 209 210 void KURLGooglePrivate::init(const KURL& base, 211 const String& relative, 212 const TextEncoding* queryEncoding) 213 { 214 init(base, relative.characters(), relative.length(), queryEncoding); 215 } 216 217 template <typename CHAR> 218 void KURLGooglePrivate::init(const KURL& base, const CHAR* rel, int relLength, 219 const TextEncoding* queryEncoding) 220 { 221 // As a performance optimization, we do not use the charset converter 222 // if encoding is UTF-8 or other Unicode encodings. Note that this is 223 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more 224 // efficient with no charset converter object because it can do UTF-8 225 // internally with no extra copies. 226 227 // We feel free to make the charset converter object every time since it's 228 // just a wrapper around a reference. 229 KURLCharsetConverter charsetConverterObject(queryEncoding); 230 KURLCharsetConverter* charsetConverter = 231 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 : 232 &charsetConverterObject; 233 234 url_canon::RawCanonOutputT<char> output; 235 const CString& baseStr = base.m_url.utf8String(); 236 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(), 237 base.m_url.m_parsed, rel, relLength, 238 charsetConverter, 239 &output, &m_parsed); 240 241 // See FIXME in KURLGooglePrivate in the header. If canonicalization has not 242 // changed the string, we can avoid an extra allocation by using assignment. 243 // 244 // When KURL encounters an error such that the URL is invalid and empty 245 // (for example, resolving a relative URL on a non-hierarchical base), it 246 // will produce an isNull URL, and calling setUtf8 will produce an empty 247 // non-null URL. This is unlikely to affect anything, but we preserve this 248 // just in case. 249 if (m_isValid || output.length()) { 250 // Without ref, the whole url is guaranteed to be ASCII-only. 251 if (m_parsed.ref.is_nonempty()) 252 setUtf8(CString(output.data(), output.length())); 253 else 254 setAscii(CString(output.data(), output.length())); 255 } else { 256 // WebCore expects resolved URLs to be empty rather than null. 257 setUtf8(CString("", 0)); 258 } 259 } 260 261 void KURLGooglePrivate::initProtocolInHTTPFamily() 262 { 263 if (!m_isValid) { 264 m_protocolInHTTPFamily = false; 265 return; 266 } 267 268 const char* scheme = m_utf8.data() + m_parsed.scheme.begin; 269 if (m_parsed.scheme.len == 4) 270 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http"); 271 else if (m_parsed.scheme.len == 5) 272 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https"); 273 else 274 m_protocolInHTTPFamily = false; 275 } 276 277 void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const 278 { 279 dest->m_isValid = m_isValid; 280 dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily; 281 dest->m_parsed = m_parsed; 282 283 // Don't copy the 16-bit string since that will be regenerated as needed. 284 dest->m_utf8 = CString(m_utf8.data(), m_utf8.length()); 285 dest->m_utf8IsASCII = m_utf8IsASCII; 286 dest->m_stringIsValid = false; 287 } 288 289 String KURLGooglePrivate::componentString(const url_parse::Component& comp) const 290 { 291 if (!m_isValid || comp.len <= 0) { 292 // KURL returns a null string if the URL is itself a null string, and an 293 // empty string for other nonexistent entities. 294 if (utf8String().isNull()) 295 return String(); 296 return String("", 0); 297 } 298 // begin and len are in terms of bytes which do not match 299 // if string() is UTF-16 and input contains non-ASCII characters. 300 // However, the only part in urlString that can contain non-ASCII 301 // characters is 'ref' at the end of the string. In that case, 302 // begin will always match the actual value and len (in terms of 303 // byte) will be longer than what's needed by 'mid'. However, mid 304 // truncates len to avoid go past the end of a string so that we can 305 // get away withtout doing anything here. 306 return string().substring(comp.begin, comp.len); 307 } 308 309 void KURLGooglePrivate::replaceComponents(const Replacements& replacements) 310 { 311 url_canon::RawCanonOutputT<char> output; 312 url_parse::Parsed newParsed; 313 314 m_isValid = url_util::ReplaceComponents(utf8String().data(), 315 utf8String().length(), m_parsed, replacements, 0, &output, &newParsed); 316 317 m_parsed = newParsed; 318 if (m_parsed.ref.is_nonempty()) 319 setUtf8(CString(output.data(), output.length())); 320 else 321 setAscii(CString(output.data(), output.length())); 322 } 323 324 const String& KURLGooglePrivate::string() const 325 { 326 if (!m_stringIsValid) { 327 // Handle the null case separately. Otherwise, constructing 328 // the string like we do below would generate the empty string, 329 // not the null string. 330 if (m_utf8.isNull()) 331 m_string = String(); 332 else if (m_utf8IsASCII) 333 m_string = String(m_utf8.data(), m_utf8.length()); 334 else 335 m_string = String::fromUTF8(m_utf8.data(), m_utf8.length()); 336 m_stringIsValid = true; 337 } 338 return m_string; 339 } 340 341 // KURL ------------------------------------------------------------------------ 342 343 // Creates with null-terminated string input representing an absolute URL. 344 // WebCore generally calls this only with hardcoded strings, so the input is 345 // ASCII. We treat it as UTF-8 just in case. 346 KURL::KURL(ParsedURLStringTag, const char *url) 347 { 348 // FIXME The Mac code checks for beginning with a slash and converts it to 349 // file: URL. We will want to add this as well once we can compile on a 350 // system like that. 351 m_url.init(KURL(), url, strlen(url), 0); 352 353 // The one-argument constructors should never generate a null string. 354 // This is a funny quirk of KURL.cpp (probably a bug) which we preserve. 355 if (m_url.utf8String().isNull()) 356 m_url.setAscii(CString("", 0)); 357 } 358 359 // Initializes with a string representing an absolute URL. No encoding 360 // information is specified. This generally happens when a KURL is converted 361 // to a string and then converted back. In this case, the URL is already 362 // canonical and in proper escaped form so needs no encoding. We treat it as 363 // UTF-8 just in case. 364 KURL::KURL(ParsedURLStringTag, const String& url) 365 { 366 if (!url.isNull()) 367 m_url.init(KURL(), url, 0); 368 else { 369 // WebCore expects us to preserve the nullness of strings when this 370 // constructor is used. In all other cases, it expects a non-null 371 // empty string, which is what init() will create. 372 m_url.m_isValid = false; 373 m_url.m_protocolInHTTPFamily = false; 374 } 375 } 376 377 // Constructs a new URL given a base URL and a possibly relative input URL. 378 // This assumes UTF-8 encoding. 379 KURL::KURL(const KURL& base, const String& relative) 380 { 381 m_url.init(base, relative, 0); 382 } 383 384 // Constructs a new URL given a base URL and a possibly relative input URL. 385 // Any query portion of the relative URL will be encoded in the given encoding. 386 KURL::KURL(const KURL& base, 387 const String& relative, 388 const TextEncoding& encoding) 389 { 390 m_url.init(base, relative, &encoding.encodingForFormSubmission()); 391 } 392 393 KURL::KURL(const CString& canonicalSpec, 394 const url_parse::Parsed& parsed, bool isValid) 395 : m_url(parsed, isValid) 396 { 397 // We know the reference fragment is the only part that can be UTF-8, so 398 // we know it's ASCII when there is no ref. 399 if (parsed.ref.is_nonempty()) 400 m_url.setUtf8(canonicalSpec); 401 else 402 m_url.setAscii(canonicalSpec); 403 } 404 405 #if USE(CF) 406 KURL::KURL(CFURLRef) 407 { 408 notImplemented(); 409 invalidate(); 410 } 411 412 CFURLRef KURL::createCFURL() const 413 { 414 notImplemented(); 415 return 0; 416 } 417 #endif 418 419 KURL KURL::copy() const 420 { 421 KURL result = *this; 422 m_url.copyTo(&result.m_url); 423 return result; 424 } 425 426 bool KURL::isNull() const 427 { 428 return m_url.utf8String().isNull(); 429 } 430 431 bool KURL::isEmpty() const 432 { 433 return !m_url.utf8String().length(); 434 } 435 436 bool KURL::isValid() const 437 { 438 return m_url.m_isValid; 439 } 440 441 bool KURL::hasPort() const 442 { 443 return hostEnd() < pathStart(); 444 } 445 446 bool KURL::protocolInHTTPFamily() const 447 { 448 return m_url.m_protocolInHTTPFamily; 449 } 450 451 bool KURL::hasPath() const 452 { 453 // Note that http://www.google.com/" has a path, the path is "/". This can 454 // return false only for invalid or nonstandard URLs. 455 return m_url.m_parsed.path.len >= 0; 456 } 457 458 // We handle "parameters" separated by a semicolon, while KURL.cpp does not, 459 // which can lead to different results in some cases. 460 String KURL::lastPathComponent() const 461 { 462 // When the output ends in a slash, WebCore has different expectations than 463 // the GoogleURL library. For "/foo/bar/" the library will return the empty 464 // string, but WebCore wants "bar". 465 url_parse::Component path = m_url.m_parsed.path; 466 if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/') 467 path.len--; 468 469 url_parse::Component file; 470 url_parse::ExtractFileName(m_url.utf8String().data(), path, &file); 471 472 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 473 // a null string when the path is empty, which we duplicate here. 474 if (!file.is_nonempty()) 475 return String(); 476 return m_url.componentString(file); 477 } 478 479 String KURL::protocol() const 480 { 481 return m_url.componentString(m_url.m_parsed.scheme); 482 } 483 484 String KURL::host() const 485 { 486 // Note: KURL.cpp unescapes here. 487 return m_url.componentString(m_url.m_parsed.host); 488 } 489 490 // Returns 0 when there is no port. 491 // 492 // We treat URL's with out-of-range port numbers as invalid URLs, and they will 493 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but 494 // return invalidPortNumber from this port() function, so we mirror that behavior here. 495 unsigned short KURL::port() const 496 { 497 if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0) 498 return 0; 499 int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port); 500 ASSERT(port != url_parse::PORT_UNSPECIFIED); // Checked port.len <= 0 before. 501 502 if (port == url_parse::PORT_INVALID || port > maximumValidPortNumber) // Mimic KURL::port() 503 port = invalidPortNumber; 504 505 return static_cast<unsigned short>(port); 506 } 507 508 // Returns the empty string if there is no password. 509 String KURL::pass() const 510 { 511 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 512 // a null string when the password is empty, which we duplicate here. 513 if (!m_url.m_parsed.password.is_nonempty()) 514 return String(); 515 516 // Note: KURL.cpp unescapes here. 517 return m_url.componentString(m_url.m_parsed.password); 518 } 519 520 // Returns the empty string if there is no username. 521 String KURL::user() const 522 { 523 // Note: KURL.cpp unescapes here. 524 return m_url.componentString(m_url.m_parsed.username); 525 } 526 527 String KURL::fragmentIdentifier() const 528 { 529 // Empty but present refs ("foo.com/bar#") should result in the empty 530 // string, which m_url.componentString will produce. Nonexistent refs 531 // should be the null string. 532 if (!m_url.m_parsed.ref.is_valid()) 533 return String(); 534 535 // Note: KURL.cpp unescapes here. 536 return m_url.componentString(m_url.m_parsed.ref); 537 } 538 539 bool KURL::hasFragmentIdentifier() const 540 { 541 // Note: KURL.cpp unescapes here. 542 // FIXME determine if KURL.cpp agrees about an empty ref 543 return m_url.m_parsed.ref.len >= 0; 544 } 545 546 void KURL::copyParsedQueryTo(ParsedURLParameters& parameters) const 547 { 548 String query = m_url.componentString(m_url.m_parsed.query); 549 const UChar* pos = query.characters(); 550 const UChar* end = query.characters() + query.length(); 551 while (pos < end) { 552 const UChar* parameterStart = pos; 553 while (pos < end && *pos != '&') 554 ++pos; 555 const UChar* parameterEnd = pos; 556 if (pos < end) { 557 ASSERT(*pos == '&'); 558 ++pos; 559 } 560 if (parameterStart == parameterEnd) 561 continue; 562 const UChar* nameStart = parameterStart; 563 const UChar* equalSign = parameterStart; 564 while (equalSign < parameterEnd && *equalSign != '=') 565 ++equalSign; 566 if (equalSign == nameStart) 567 continue; 568 String name(nameStart, equalSign - nameStart); 569 String value = equalSign == parameterEnd ? String() : String(equalSign + 1, parameterEnd - equalSign - 1); 570 parameters.set(name, value); 571 } 572 } 573 574 String KURL::baseAsString() const 575 { 576 // FIXME: There is probably a more efficient way to do this? 577 return string().left(pathAfterLastSlash()); 578 } 579 580 String KURL::query() const 581 { 582 if (m_url.m_parsed.query.len >= 0) 583 return m_url.componentString(m_url.m_parsed.query); 584 585 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 586 // an empty string when the query is empty rather than a null (not sure 587 // which is right). 588 // Returns a null if the query is not specified, instead of empty. 589 if (m_url.m_parsed.query.is_valid()) 590 return String("", 0); 591 return String(); 592 } 593 594 String KURL::path() const 595 { 596 // Note: KURL.cpp unescapes here. 597 return m_url.componentString(m_url.m_parsed.path); 598 } 599 600 bool KURL::setProtocol(const String& protocol) 601 { 602 // Firefox and IE remove everything after the first ':'. 603 int separatorPosition = protocol.find(':'); 604 String newProtocol = protocol.substring(0, separatorPosition); 605 606 // If KURL is given an invalid scheme, it returns failure without modifying 607 // the URL at all. This is in contrast to most other setters which modify 608 // the URL and set "m_isValid." 609 url_canon::RawCanonOutputT<char> canonProtocol; 610 url_parse::Component protocolComponent; 611 if (!url_canon::CanonicalizeScheme(newProtocol.characters(), 612 url_parse::Component(0, newProtocol.length()), 613 &canonProtocol, &protocolComponent) 614 || !protocolComponent.is_nonempty()) 615 return false; 616 617 KURLGooglePrivate::Replacements replacements; 618 replacements.SetScheme(CharactersOrEmpty(newProtocol), 619 url_parse::Component(0, newProtocol.length())); 620 m_url.replaceComponents(replacements); 621 622 // isValid could be false but we still return true here. This is because 623 // WebCore or JS scripts can build up a URL by setting individual 624 // components, and a JS exception is based on the return value of this 625 // function. We want to throw the exception and stop the script only when 626 // its trying to set a bad protocol, and not when it maybe just hasn't 627 // finished building up its final scheme. 628 return true; 629 } 630 631 void KURL::setHost(const String& host) 632 { 633 KURLGooglePrivate::Replacements replacements; 634 replacements.SetHost(CharactersOrEmpty(host), 635 url_parse::Component(0, host.length())); 636 m_url.replaceComponents(replacements); 637 } 638 639 void KURL::setHostAndPort(const String& s) 640 { 641 String host = s; 642 String port; 643 int hostEnd = s.find(":"); 644 if (hostEnd != -1) { 645 host = s.left(hostEnd); 646 port = s.substring(hostEnd + 1); 647 } 648 649 KURLGooglePrivate::Replacements replacements; 650 // Host can't be removed, so we always set. 651 replacements.SetHost(CharactersOrEmpty(host), 652 url_parse::Component(0, host.length())); 653 654 if (port.isEmpty()) // Port may be removed, so we support clearing. 655 replacements.ClearPort(); 656 else 657 replacements.SetPort(CharactersOrEmpty(port), url_parse::Component(0, port.length())); 658 m_url.replaceComponents(replacements); 659 } 660 661 void KURL::removePort() 662 { 663 if (hasPort()) { 664 String urlWithoutPort = m_url.string().left(hostEnd()) + m_url.string().substring(pathStart()); 665 m_url.setUtf8(urlWithoutPort.utf8()); 666 } 667 } 668 669 void KURL::setPort(unsigned short i) 670 { 671 KURLGooglePrivate::Replacements replacements; 672 String portStr; 673 if (i) { 674 portStr = String::number(i); 675 replacements.SetPort( 676 reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()), 677 url_parse::Component(0, portStr.length())); 678 679 } else { 680 // Clear any existing port when it is set to 0. 681 replacements.ClearPort(); 682 } 683 m_url.replaceComponents(replacements); 684 } 685 686 void KURL::setUser(const String& user) 687 { 688 // This function is commonly called to clear the username, which we 689 // normally don't have, so we optimize this case. 690 if (user.isEmpty() && !m_url.m_parsed.username.is_valid()) 691 return; 692 693 // The canonicalizer will clear any usernames that are empty, so we 694 // don't have to explicitly call ClearUsername() here. 695 KURLGooglePrivate::Replacements replacements; 696 replacements.SetUsername(CharactersOrEmpty(user), 697 url_parse::Component(0, user.length())); 698 m_url.replaceComponents(replacements); 699 } 700 701 void KURL::setPass(const String& pass) 702 { 703 // This function is commonly called to clear the password, which we 704 // normally don't have, so we optimize this case. 705 if (pass.isEmpty() && !m_url.m_parsed.password.is_valid()) 706 return; 707 708 // The canonicalizer will clear any passwords that are empty, so we 709 // don't have to explicitly call ClearUsername() here. 710 KURLGooglePrivate::Replacements replacements; 711 replacements.SetPassword(CharactersOrEmpty(pass), 712 url_parse::Component(0, pass.length())); 713 m_url.replaceComponents(replacements); 714 } 715 716 void KURL::setFragmentIdentifier(const String& s) 717 { 718 // This function is commonly called to clear the ref, which we 719 // normally don't have, so we optimize this case. 720 if (s.isNull() && !m_url.m_parsed.ref.is_valid()) 721 return; 722 723 KURLGooglePrivate::Replacements replacements; 724 if (s.isNull()) 725 replacements.ClearRef(); 726 else 727 replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length())); 728 m_url.replaceComponents(replacements); 729 } 730 731 void KURL::removeFragmentIdentifier() 732 { 733 KURLGooglePrivate::Replacements replacements; 734 replacements.ClearRef(); 735 m_url.replaceComponents(replacements); 736 } 737 738 void KURL::setQuery(const String& query) 739 { 740 KURLGooglePrivate::Replacements replacements; 741 if (query.isNull()) { 742 // KURL.cpp sets to null to clear any query. 743 replacements.ClearQuery(); 744 } else if (query.length() > 0 && query[0] == '?') { 745 // WebCore expects the query string to begin with a question mark, but 746 // GoogleURL doesn't. So we trim off the question mark when setting. 747 replacements.SetQuery(CharactersOrEmpty(query), 748 url_parse::Component(1, query.length() - 1)); 749 } else { 750 // When set with the empty string or something that doesn't begin with 751 // a question mark, KURL.cpp will add a question mark for you. The only 752 // way this isn't compatible is if you call this function with an empty 753 // string. KURL.cpp will leave a '?' with nothing following it in the 754 // URL, whereas we'll clear it. 755 // FIXME We should eliminate this difference. 756 replacements.SetQuery(CharactersOrEmpty(query), 757 url_parse::Component(0, query.length())); 758 } 759 m_url.replaceComponents(replacements); 760 } 761 762 void KURL::setPath(const String& path) 763 { 764 // Empty paths will be canonicalized to "/", so we don't have to worry 765 // about calling ClearPath(). 766 KURLGooglePrivate::Replacements replacements; 767 replacements.SetPath(CharactersOrEmpty(path), 768 url_parse::Component(0, path.length())); 769 m_url.replaceComponents(replacements); 770 } 771 772 // On Mac, this just seems to return the same URL, but with "/foo/bar" for 773 // file: URLs instead of file:///foo/bar. We don't bother with any of this, 774 // at least for now. 775 String KURL::prettyURL() const 776 { 777 if (!m_url.m_isValid) 778 return String(); 779 return m_url.string(); 780 } 781 782 String decodeURLEscapeSequences(const String& str) 783 { 784 return decodeURLEscapeSequences(str, UTF8Encoding()); 785 } 786 787 // In KURL.cpp's implementation, this is called by every component getter. 788 // It will unescape every character, including '\0'. This is scary, and may 789 // cause security holes. We never call this function for components, and 790 // just return the ASCII versions instead. 791 // 792 // This function is also used to decode javascript: URLs and as a general 793 // purpose unescaping function. 794 // 795 // FIXME These should be merged to the KURL.cpp implementation. 796 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) 797 { 798 // FIXME We can probably use KURL.cpp's version of this function 799 // without modification. However, I'm concerned about 800 // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old 801 // custom code for now. Using their version will also fix the bug that 802 // we ignore the encoding. 803 // 804 // FIXME b/1350291: This does not get called very often. We just convert 805 // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of 806 // sucks, and we don't use the encoding properly, which will make some 807 // obscure anchor navigations fail. 808 CString cstr = str.utf8(); 809 810 const char* input = cstr.data(); 811 int inputLength = cstr.length(); 812 813 url_canon::RawCanonOutputT<url_parse::UTF16Char> unescaped; 814 815 url_util::DecodeURLEscapeSequences(input, inputLength, &unescaped); 816 817 return String(reinterpret_cast<UChar*>(unescaped.data()), 818 unescaped.length()); 819 } 820 821 bool KURL::protocolIs(const char* protocol) const 822 { 823 assertProtocolIsGood(protocol); 824 825 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. 826 // The free function protocolIsJavaScript() should be used instead. 827 // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript")); 828 829 if (m_url.m_parsed.scheme.len <= 0) 830 return !protocol; 831 return lowerCaseEqualsASCII( 832 m_url.utf8String().data() + m_url.m_parsed.scheme.begin, 833 m_url.utf8String().data() + m_url.m_parsed.scheme.end(), 834 protocol); 835 } 836 837 // This is called to escape a URL string. It is only used externally when 838 // constructing mailto: links to set the query section. Since our query setter 839 // will automatically do the correct escaping, this function does not have to 840 // do any work. 841 // 842 // There is a possibility that a future caller may use this function in other 843 // ways, and may expect to get a valid URL string. The dangerous thing we want 844 // to protect against here is accidentally getting '\0' characters in a string 845 // that is not supposed to have them. Therefore, we escape these characters. 846 String encodeWithURLEscapeSequences(const String& notEncodedString) 847 { 848 CString utf8 = UTF8Encoding().encode( 849 reinterpret_cast<const UChar*>(notEncodedString.characters()), 850 notEncodedString.length(), 851 URLEncodedEntitiesForUnencodables); 852 const char* input = utf8.data(); 853 int inputLength = utf8.length(); 854 855 Vector<char, 2048> buffer; 856 for (int i = 0; i < inputLength; i++) { 857 if (!input[i]) 858 buffer.append("%00", 3); 859 else 860 buffer.append(input[i]); 861 } 862 return String(buffer.data(), buffer.size()); 863 } 864 865 bool KURL::isHierarchical() const 866 { 867 if (!m_url.m_parsed.scheme.is_nonempty()) 868 return false; 869 return url_util::IsStandard( 870 &m_url.utf8String().data()[m_url.m_parsed.scheme.begin], 871 m_url.m_parsed.scheme); 872 } 873 874 #ifndef NDEBUG 875 void KURL::print() const 876 { 877 printf("%s\n", m_url.utf8String().data()); 878 } 879 #endif 880 881 void KURL::invalidate() 882 { 883 // This is only called from the constructor so resetting the (automatically 884 // initialized) string and parsed structure would be a waste of time. 885 m_url.m_isValid = false; 886 m_url.m_protocolInHTTPFamily = false; 887 } 888 889 // Equal up to reference fragments, if any. 890 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) 891 { 892 // Compute the length of each URL without its ref. Note that the reference 893 // begin (if it exists) points to the character *after* the '#', so we need 894 // to subtract one. 895 int aLength = a.m_url.utf8String().length(); 896 if (a.m_url.m_parsed.ref.len >= 0) 897 aLength = a.m_url.m_parsed.ref.begin - 1; 898 899 int bLength = b.m_url.utf8String().length(); 900 if (b.m_url.m_parsed.ref.len >= 0) 901 bLength = b.m_url.m_parsed.ref.begin - 1; 902 903 return aLength == bLength 904 && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength); 905 } 906 907 unsigned KURL::hostStart() const 908 { 909 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false); 910 } 911 912 unsigned KURL::hostEnd() const 913 { 914 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true); 915 } 916 917 unsigned KURL::pathStart() const 918 { 919 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); 920 } 921 922 unsigned KURL::pathEnd() const 923 { 924 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true); 925 } 926 927 unsigned KURL::pathAfterLastSlash() const 928 { 929 // When there's no path, ask for what would be the beginning of it. 930 if (!m_url.m_parsed.path.is_valid()) 931 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); 932 933 url_parse::Component filename; 934 url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path, 935 &filename); 936 return filename.begin; 937 } 938 939 bool protocolIs(const String& url, const char* protocol) 940 { 941 // Do the comparison without making a new string object. 942 assertProtocolIsGood(protocol); 943 944 // Check the scheme like GURL does. 945 return url_util::FindAndCompareScheme(url.characters(), url.length(), 946 protocol, 0); 947 } 948 949 inline bool KURL::protocolIs(const String& string, const char* protocol) 950 { 951 return WebCore::protocolIs(string, protocol); 952 } 953 954 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b) 955 { 956 if (a.parsed().scheme.end() != b.parsed().scheme.end()) 957 return false; 958 959 int hostStartA = a.hostStart(); 960 int hostLengthA = a.hostEnd() - hostStartA; 961 int hostStartB = b.hostStart(); 962 int hostLengthB = b.hostEnd() - b.hostStart(); 963 if (hostLengthA != hostLengthB) 964 return false; 965 966 // Check the scheme 967 for (int i = 0; i < a.parsed().scheme.end(); ++i) 968 if (a.string()[i] != b.string()[i]) 969 return false; 970 971 // And the host 972 for (int i = 0; i < hostLengthA; ++i) 973 if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) 974 return false; 975 976 if (a.port() != b.port()) 977 return false; 978 979 return true; 980 } 981 982 } // namespace WebCore 983 984 #endif // USE(GOOGLEURL) 985