Home | History | Annotate | Download | only in platform
      1 /*
      2  * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved.
      3  * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions are
      7  * met:
      8  *
      9  *     * Redistributions of source code must retain the above copyright
     10  * notice, this list of conditions and the following disclaimer.
     11  *     * Redistributions in binary form must reproduce the above
     12  * copyright notice, this list of conditions and the following disclaimer
     13  * in the documentation and/or other materials provided with the
     14  * distribution.
     15  *     * Neither the name of Google Inc. nor the names of its
     16  * contributors may be used to endorse or promote products derived from
     17  * this software without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include "config.h"
     33 
     34 #if USE(GOOGLEURL)
     35 #include "KURL.h"
     36 
     37 #ifndef NDEBUG
     38 #include <stdio.h>
     39 #endif
     40 
     41 #include <algorithm>
     42 
     43 #include "NotImplemented.h"
     44 #include "TextEncoding.h"
     45 #include <wtf/HashMap.h>
     46 #include <wtf/Vector.h>
     47 #include <wtf/StdLibExtras.h>
     48 #include <wtf/text/CString.h>
     49 #include <wtf/text/StringHash.h>
     50 
     51 #include <googleurl/src/url_util.h>
     52 
     53 using WTF::isASCIILower;
     54 using WTF::toASCIILower;
     55 using std::binary_search;
     56 
     57 namespace WebCore {
     58 
     59 static const int maximumValidPortNumber = 0xFFFE;
     60 static const int invalidPortNumber = 0xFFFF;
     61 
     62 // Wraps WebCore's text encoding in a character set converter for the
     63 // canonicalizer.
     64 class KURLCharsetConverter : public url_canon::CharsetConverter {
     65 public:
     66     // The encoding parameter may be 0, but in this case the object must not be called.
     67     KURLCharsetConverter(const TextEncoding* encoding)
     68         : m_encoding(encoding)
     69     {
     70     }
     71 
     72     virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength,
     73                                   url_canon::CanonOutput* output)
     74     {
     75         CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables);
     76         output->Append(encoded.data(), static_cast<int>(encoded.length()));
     77     }
     78 
     79 private:
     80     const TextEncoding* m_encoding;
     81 };
     82 
     83 // Note that this function must be named differently than the one in KURL.cpp
     84 // since our unit tests evilly include both files, and their local definition
     85 // will be ambiguous.
     86 static inline void assertProtocolIsGood(const char* protocol)
     87 {
     88 #ifndef NDEBUG
     89     const char* p = protocol;
     90     while (*p) {
     91         ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
     92         ++p;
     93     }
     94 #endif
     95 }
     96 
     97 // Returns the characters for the given string, or a pointer to a static empty
     98 // string if the input string is null. This will always ensure we have a non-
     99 // null character pointer since ReplaceComponents has special meaning for null.
    100 static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str)
    101 {
    102     static const url_parse::UTF16Char zero = 0;
    103     return str.characters() ?
    104            reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) :
    105            &zero;
    106 }
    107 
    108 static inline bool isUnicodeEncoding(const TextEncoding* encoding)
    109 {
    110     return encoding->encodingForFormSubmission() == UTF8Encoding();
    111 }
    112 
    113 static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str)
    114 {
    115     while (begin != end && *str) {
    116         ASSERT(toASCIILower(*str) == *str);
    117         if (toASCIILower(*begin++) != *str++)
    118             return false;
    119     }
    120 
    121     // Both strings are equal (ignoring case) if and only if all of the characters were equal,
    122     // and the end of both has been reached.
    123     return begin == end && !*str;
    124 }
    125 
    126 static inline bool isSchemeFirstChar(char c)
    127 {
    128     return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    129 }
    130 
    131 static inline bool isSchemeChar(char c)
    132 {
    133     return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '*';
    134 }
    135 
    136 bool isValidProtocol(const String& protocol)
    137 {
    138     // NOTE This is a copy of the function in KURL.cpp.
    139     // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    140     if (protocol.isEmpty())
    141         return false;
    142     if (!isSchemeFirstChar(protocol[0]))
    143         return false;
    144     unsigned protocolLength = protocol.length();
    145     for (unsigned i = 1; i < protocolLength; i++) {
    146         if (!isSchemeChar(protocol[i]))
    147             return false;
    148     }
    149     return true;
    150 }
    151 
    152 
    153 // KURLGooglePrivate -----------------------------------------------------------
    154 
    155 KURLGooglePrivate::KURLGooglePrivate()
    156     : m_isValid(false)
    157     , m_protocolInHTTPFamily(false)
    158     , m_utf8IsASCII(true)
    159     , m_stringIsValid(false)
    160 {
    161 }
    162 
    163 KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid)
    164     : m_isValid(isValid)
    165     , m_protocolInHTTPFamily(false)
    166     , m_parsed(parsed)
    167     , m_utf8IsASCII(true)
    168     , m_stringIsValid(false)
    169 {
    170 }
    171 
    172 KURLGooglePrivate::KURLGooglePrivate(WTF::HashTableDeletedValueType)
    173     : m_string(WTF::HashTableDeletedValue)
    174 {
    175 }
    176 
    177 // Setters for the data. Using the ASCII version when you know the
    178 // data is ASCII will be slightly more efficient. The UTF-8 version
    179 // will always be correct if the caller is unsure.
    180 void KURLGooglePrivate::setUtf8(const CString& str)
    181 {
    182     const char* data = str.data();
    183     unsigned dataLength = str.length();
    184 
    185     // The m_utf8IsASCII must always be correct since the DeprecatedString
    186     // getter must create it with the proper constructor. This test can be
    187     // removed when DeprecatedString is gone, but it still might be a
    188     // performance win.
    189     m_utf8IsASCII = true;
    190     for (unsigned i = 0; i < dataLength; i++) {
    191         if (static_cast<unsigned char>(data[i]) >= 0x80) {
    192             m_utf8IsASCII = false;
    193             break;
    194         }
    195     }
    196 
    197     m_utf8 = str;
    198     m_stringIsValid = false;
    199     initProtocolInHTTPFamily();
    200 }
    201 
    202 void KURLGooglePrivate::setAscii(const CString& str)
    203 {
    204     m_utf8 = str;
    205     m_utf8IsASCII = true;
    206     m_stringIsValid = false;
    207     initProtocolInHTTPFamily();
    208 }
    209 
    210 void KURLGooglePrivate::init(const KURL& base,
    211                              const String& relative,
    212                              const TextEncoding* queryEncoding)
    213 {
    214     init(base, relative.characters(), relative.length(), queryEncoding);
    215 }
    216 
    217 template <typename CHAR>
    218 void KURLGooglePrivate::init(const KURL& base, const CHAR* rel, int relLength,
    219                              const TextEncoding* queryEncoding)
    220 {
    221     // As a performance optimization, we do not use the charset converter
    222     // if encoding is UTF-8 or other Unicode encodings. Note that this is
    223     // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more
    224     // efficient with no charset converter object because it can do UTF-8
    225     // internally with no extra copies.
    226 
    227     // We feel free to make the charset converter object every time since it's
    228     // just a wrapper around a reference.
    229     KURLCharsetConverter charsetConverterObject(queryEncoding);
    230     KURLCharsetConverter* charsetConverter =
    231         (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 :
    232         &charsetConverterObject;
    233 
    234     url_canon::RawCanonOutputT<char> output;
    235     const CString& baseStr = base.m_url.utf8String();
    236     m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(),
    237                                           base.m_url.m_parsed, rel, relLength,
    238                                           charsetConverter,
    239                                           &output, &m_parsed);
    240 
    241     // See FIXME in KURLGooglePrivate in the header. If canonicalization has not
    242     // changed the string, we can avoid an extra allocation by using assignment.
    243     //
    244     // When KURL encounters an error such that the URL is invalid and empty
    245     // (for example, resolving a relative URL on a non-hierarchical base), it
    246     // will produce an isNull URL, and calling setUtf8 will produce an empty
    247     // non-null URL. This is unlikely to affect anything, but we preserve this
    248     // just in case.
    249     if (m_isValid || output.length()) {
    250         // Without ref, the whole url is guaranteed to be ASCII-only.
    251         if (m_parsed.ref.is_nonempty())
    252             setUtf8(CString(output.data(), output.length()));
    253         else
    254             setAscii(CString(output.data(), output.length()));
    255     } else {
    256         // WebCore expects resolved URLs to be empty rather than null.
    257         setUtf8(CString("", 0));
    258     }
    259 }
    260 
    261 void KURLGooglePrivate::initProtocolInHTTPFamily()
    262 {
    263     if (!m_isValid) {
    264         m_protocolInHTTPFamily = false;
    265         return;
    266     }
    267 
    268     const char* scheme = m_utf8.data() + m_parsed.scheme.begin;
    269     if (m_parsed.scheme.len == 4)
    270         m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http");
    271     else if (m_parsed.scheme.len == 5)
    272         m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https");
    273     else
    274         m_protocolInHTTPFamily = false;
    275 }
    276 
    277 void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const
    278 {
    279     dest->m_isValid = m_isValid;
    280     dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily;
    281     dest->m_parsed = m_parsed;
    282 
    283     // Don't copy the 16-bit string since that will be regenerated as needed.
    284     dest->m_utf8 = CString(m_utf8.data(), m_utf8.length());
    285     dest->m_utf8IsASCII = m_utf8IsASCII;
    286     dest->m_stringIsValid = false;
    287 }
    288 
    289 String KURLGooglePrivate::componentString(const url_parse::Component& comp) const
    290 {
    291     if (!m_isValid || comp.len <= 0) {
    292         // KURL returns a null string if the URL is itself a null string, and an
    293         // empty string for other nonexistent entities.
    294         if (utf8String().isNull())
    295             return String();
    296         return String("", 0);
    297     }
    298     // begin and len are in terms of bytes which do not match
    299     // if string() is UTF-16 and input contains non-ASCII characters.
    300     // However, the only part in urlString that can contain non-ASCII
    301     // characters is 'ref' at the end of the string. In that case,
    302     // begin will always match the actual value and len (in terms of
    303     // byte) will be longer than what's needed by 'mid'. However, mid
    304     // truncates len to avoid go past the end of a string so that we can
    305     // get away withtout doing anything here.
    306     return string().substring(comp.begin, comp.len);
    307 }
    308 
    309 void KURLGooglePrivate::replaceComponents(const Replacements& replacements)
    310 {
    311     url_canon::RawCanonOutputT<char> output;
    312     url_parse::Parsed newParsed;
    313 
    314     m_isValid = url_util::ReplaceComponents(utf8String().data(),
    315                                             utf8String().length(), m_parsed, replacements, 0, &output, &newParsed);
    316 
    317     m_parsed = newParsed;
    318     if (m_parsed.ref.is_nonempty())
    319         setUtf8(CString(output.data(), output.length()));
    320     else
    321         setAscii(CString(output.data(), output.length()));
    322 }
    323 
    324 const String& KURLGooglePrivate::string() const
    325 {
    326     if (!m_stringIsValid) {
    327         // Handle the null case separately. Otherwise, constructing
    328         // the string like we do below would generate the empty string,
    329         // not the null string.
    330         if (m_utf8.isNull())
    331             m_string = String();
    332         else if (m_utf8IsASCII)
    333             m_string = String(m_utf8.data(), m_utf8.length());
    334         else
    335             m_string = String::fromUTF8(m_utf8.data(), m_utf8.length());
    336         m_stringIsValid = true;
    337     }
    338     return m_string;
    339 }
    340 
    341 // KURL ------------------------------------------------------------------------
    342 
    343 // Creates with null-terminated string input representing an absolute URL.
    344 // WebCore generally calls this only with hardcoded strings, so the input is
    345 // ASCII. We treat it as UTF-8 just in case.
    346 KURL::KURL(ParsedURLStringTag, const char *url)
    347 {
    348     // FIXME The Mac code checks for beginning with a slash and converts it to
    349     // file: URL. We will want to add this as well once we can compile on a
    350     // system like that.
    351     m_url.init(KURL(), url, strlen(url), 0);
    352 
    353     // The one-argument constructors should never generate a null string.
    354     // This is a funny quirk of KURL.cpp (probably a bug) which we preserve.
    355     if (m_url.utf8String().isNull())
    356         m_url.setAscii(CString("", 0));
    357 }
    358 
    359 // Initializes with a string representing an absolute URL. No encoding
    360 // information is specified. This generally happens when a KURL is converted
    361 // to a string and then converted back. In this case, the URL is already
    362 // canonical and in proper escaped form so needs no encoding. We treat it as
    363 // UTF-8 just in case.
    364 KURL::KURL(ParsedURLStringTag, const String& url)
    365 {
    366     if (!url.isNull())
    367         m_url.init(KURL(), url, 0);
    368     else {
    369         // WebCore expects us to preserve the nullness of strings when this
    370         // constructor is used. In all other cases, it expects a non-null
    371         // empty string, which is what init() will create.
    372         m_url.m_isValid = false;
    373         m_url.m_protocolInHTTPFamily = false;
    374     }
    375 }
    376 
    377 // Constructs a new URL given a base URL and a possibly relative input URL.
    378 // This assumes UTF-8 encoding.
    379 KURL::KURL(const KURL& base, const String& relative)
    380 {
    381     m_url.init(base, relative, 0);
    382 }
    383 
    384 // Constructs a new URL given a base URL and a possibly relative input URL.
    385 // Any query portion of the relative URL will be encoded in the given encoding.
    386 KURL::KURL(const KURL& base,
    387            const String& relative,
    388            const TextEncoding& encoding)
    389 {
    390     m_url.init(base, relative, &encoding.encodingForFormSubmission());
    391 }
    392 
    393 KURL::KURL(const CString& canonicalSpec,
    394            const url_parse::Parsed& parsed, bool isValid)
    395     : m_url(parsed, isValid)
    396 {
    397     // We know the reference fragment is the only part that can be UTF-8, so
    398     // we know it's ASCII when there is no ref.
    399     if (parsed.ref.is_nonempty())
    400         m_url.setUtf8(canonicalSpec);
    401     else
    402         m_url.setAscii(canonicalSpec);
    403 }
    404 
    405 #if USE(CF)
    406 KURL::KURL(CFURLRef)
    407 {
    408     notImplemented();
    409     invalidate();
    410 }
    411 
    412 CFURLRef KURL::createCFURL() const
    413 {
    414     notImplemented();
    415     return 0;
    416 }
    417 #endif
    418 
    419 KURL KURL::copy() const
    420 {
    421     KURL result = *this;
    422     m_url.copyTo(&result.m_url);
    423     return result;
    424 }
    425 
    426 bool KURL::isNull() const
    427 {
    428     return m_url.utf8String().isNull();
    429 }
    430 
    431 bool KURL::isEmpty() const
    432 {
    433     return !m_url.utf8String().length();
    434 }
    435 
    436 bool KURL::isValid() const
    437 {
    438     return m_url.m_isValid;
    439 }
    440 
    441 bool KURL::hasPort() const
    442 {
    443     return hostEnd() < pathStart();
    444 }
    445 
    446 bool KURL::protocolInHTTPFamily() const
    447 {
    448     return m_url.m_protocolInHTTPFamily;
    449 }
    450 
    451 bool KURL::hasPath() const
    452 {
    453     // Note that http://www.google.com/" has a path, the path is "/". This can
    454     // return false only for invalid or nonstandard URLs.
    455     return m_url.m_parsed.path.len >= 0;
    456 }
    457 
    458 // We handle "parameters" separated by a semicolon, while KURL.cpp does not,
    459 // which can lead to different results in some cases.
    460 String KURL::lastPathComponent() const
    461 {
    462     // When the output ends in a slash, WebCore has different expectations than
    463     // the GoogleURL library. For "/foo/bar/" the library will return the empty
    464     // string, but WebCore wants "bar".
    465     url_parse::Component path = m_url.m_parsed.path;
    466     if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/')
    467         path.len--;
    468 
    469     url_parse::Component file;
    470     url_parse::ExtractFileName(m_url.utf8String().data(), path, &file);
    471 
    472     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
    473     // a null string when the path is empty, which we duplicate here.
    474     if (!file.is_nonempty())
    475         return String();
    476     return m_url.componentString(file);
    477 }
    478 
    479 String KURL::protocol() const
    480 {
    481     return m_url.componentString(m_url.m_parsed.scheme);
    482 }
    483 
    484 String KURL::host() const
    485 {
    486     // Note: KURL.cpp unescapes here.
    487     return m_url.componentString(m_url.m_parsed.host);
    488 }
    489 
    490 // Returns 0 when there is no port.
    491 //
    492 // We treat URL's with out-of-range port numbers as invalid URLs, and they will
    493 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but
    494 // return invalidPortNumber from this port() function, so we mirror that behavior here.
    495 unsigned short KURL::port() const
    496 {
    497     if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0)
    498         return 0;
    499     int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port);
    500     ASSERT(port != url_parse::PORT_UNSPECIFIED); // Checked port.len <= 0 before.
    501 
    502     if (port == url_parse::PORT_INVALID || port > maximumValidPortNumber) // Mimic KURL::port()
    503         port = invalidPortNumber;
    504 
    505     return static_cast<unsigned short>(port);
    506 }
    507 
    508 // Returns the empty string if there is no password.
    509 String KURL::pass() const
    510 {
    511     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
    512     // a null string when the password is empty, which we duplicate here.
    513     if (!m_url.m_parsed.password.is_nonempty())
    514         return String();
    515 
    516     // Note: KURL.cpp unescapes here.
    517     return m_url.componentString(m_url.m_parsed.password);
    518 }
    519 
    520 // Returns the empty string if there is no username.
    521 String KURL::user() const
    522 {
    523     // Note: KURL.cpp unescapes here.
    524     return m_url.componentString(m_url.m_parsed.username);
    525 }
    526 
    527 String KURL::fragmentIdentifier() const
    528 {
    529     // Empty but present refs ("foo.com/bar#") should result in the empty
    530     // string, which m_url.componentString will produce. Nonexistent refs
    531     // should be the null string.
    532     if (!m_url.m_parsed.ref.is_valid())
    533         return String();
    534 
    535     // Note: KURL.cpp unescapes here.
    536     return m_url.componentString(m_url.m_parsed.ref);
    537 }
    538 
    539 bool KURL::hasFragmentIdentifier() const
    540 {
    541     // Note: KURL.cpp unescapes here.
    542     // FIXME determine if KURL.cpp agrees about an empty ref
    543     return m_url.m_parsed.ref.len >= 0;
    544 }
    545 
    546 void KURL::copyParsedQueryTo(ParsedURLParameters& parameters) const
    547 {
    548     String query = m_url.componentString(m_url.m_parsed.query);
    549     const UChar* pos = query.characters();
    550     const UChar* end = query.characters() + query.length();
    551     while (pos < end) {
    552         const UChar* parameterStart = pos;
    553         while (pos < end && *pos != '&')
    554             ++pos;
    555         const UChar* parameterEnd = pos;
    556         if (pos < end) {
    557             ASSERT(*pos == '&');
    558             ++pos;
    559         }
    560         if (parameterStart == parameterEnd)
    561             continue;
    562         const UChar* nameStart = parameterStart;
    563         const UChar* equalSign = parameterStart;
    564         while (equalSign < parameterEnd && *equalSign != '=')
    565             ++equalSign;
    566         if (equalSign == nameStart)
    567             continue;
    568         String name(nameStart, equalSign - nameStart);
    569         String value = equalSign == parameterEnd ? String() : String(equalSign + 1, parameterEnd - equalSign - 1);
    570         parameters.set(name, value);
    571     }
    572 }
    573 
    574 String KURL::baseAsString() const
    575 {
    576     // FIXME: There is probably a more efficient way to do this?
    577     return string().left(pathAfterLastSlash());
    578 }
    579 
    580 String KURL::query() const
    581 {
    582     if (m_url.m_parsed.query.len >= 0)
    583         return m_url.componentString(m_url.m_parsed.query);
    584 
    585     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
    586     // an empty string when the query is empty rather than a null (not sure
    587     // which is right).
    588     // Returns a null if the query is not specified, instead of empty.
    589     if (m_url.m_parsed.query.is_valid())
    590         return String("", 0);
    591     return String();
    592 }
    593 
    594 String KURL::path() const
    595 {
    596     // Note: KURL.cpp unescapes here.
    597     return m_url.componentString(m_url.m_parsed.path);
    598 }
    599 
    600 bool KURL::setProtocol(const String& protocol)
    601 {
    602     // Firefox and IE remove everything after the first ':'.
    603     int separatorPosition = protocol.find(':');
    604     String newProtocol = protocol.substring(0, separatorPosition);
    605 
    606     // If KURL is given an invalid scheme, it returns failure without modifying
    607     // the URL at all. This is in contrast to most other setters which modify
    608     // the URL and set "m_isValid."
    609     url_canon::RawCanonOutputT<char> canonProtocol;
    610     url_parse::Component protocolComponent;
    611     if (!url_canon::CanonicalizeScheme(newProtocol.characters(),
    612                                        url_parse::Component(0, newProtocol.length()),
    613                                        &canonProtocol, &protocolComponent)
    614         || !protocolComponent.is_nonempty())
    615         return false;
    616 
    617     KURLGooglePrivate::Replacements replacements;
    618     replacements.SetScheme(CharactersOrEmpty(newProtocol),
    619                            url_parse::Component(0, newProtocol.length()));
    620     m_url.replaceComponents(replacements);
    621 
    622     // isValid could be false but we still return true here. This is because
    623     // WebCore or JS scripts can build up a URL by setting individual
    624     // components, and a JS exception is based on the return value of this
    625     // function. We want to throw the exception and stop the script only when
    626     // its trying to set a bad protocol, and not when it maybe just hasn't
    627     // finished building up its final scheme.
    628     return true;
    629 }
    630 
    631 void KURL::setHost(const String& host)
    632 {
    633     KURLGooglePrivate::Replacements replacements;
    634     replacements.SetHost(CharactersOrEmpty(host),
    635                          url_parse::Component(0, host.length()));
    636     m_url.replaceComponents(replacements);
    637 }
    638 
    639 void KURL::setHostAndPort(const String& s)
    640 {
    641     String host = s;
    642     String port;
    643     int hostEnd = s.find(":");
    644     if (hostEnd != -1) {
    645         host = s.left(hostEnd);
    646         port = s.substring(hostEnd + 1);
    647     }
    648 
    649     KURLGooglePrivate::Replacements replacements;
    650     // Host can't be removed, so we always set.
    651     replacements.SetHost(CharactersOrEmpty(host),
    652                          url_parse::Component(0, host.length()));
    653 
    654     if (port.isEmpty())  // Port may be removed, so we support clearing.
    655         replacements.ClearPort();
    656     else
    657         replacements.SetPort(CharactersOrEmpty(port), url_parse::Component(0, port.length()));
    658     m_url.replaceComponents(replacements);
    659 }
    660 
    661 void KURL::removePort()
    662 {
    663     if (hasPort()) {
    664         String urlWithoutPort = m_url.string().left(hostEnd()) + m_url.string().substring(pathStart());
    665         m_url.setUtf8(urlWithoutPort.utf8());
    666     }
    667 }
    668 
    669 void KURL::setPort(unsigned short i)
    670 {
    671     KURLGooglePrivate::Replacements replacements;
    672     String portStr;
    673     if (i) {
    674         portStr = String::number(i);
    675         replacements.SetPort(
    676             reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()),
    677             url_parse::Component(0, portStr.length()));
    678 
    679     } else {
    680         // Clear any existing port when it is set to 0.
    681         replacements.ClearPort();
    682     }
    683     m_url.replaceComponents(replacements);
    684 }
    685 
    686 void KURL::setUser(const String& user)
    687 {
    688     // This function is commonly called to clear the username, which we
    689     // normally don't have, so we optimize this case.
    690     if (user.isEmpty() && !m_url.m_parsed.username.is_valid())
    691         return;
    692 
    693     // The canonicalizer will clear any usernames that are empty, so we
    694     // don't have to explicitly call ClearUsername() here.
    695     KURLGooglePrivate::Replacements replacements;
    696     replacements.SetUsername(CharactersOrEmpty(user),
    697                              url_parse::Component(0, user.length()));
    698     m_url.replaceComponents(replacements);
    699 }
    700 
    701 void KURL::setPass(const String& pass)
    702 {
    703     // This function is commonly called to clear the password, which we
    704     // normally don't have, so we optimize this case.
    705     if (pass.isEmpty() && !m_url.m_parsed.password.is_valid())
    706         return;
    707 
    708     // The canonicalizer will clear any passwords that are empty, so we
    709     // don't have to explicitly call ClearUsername() here.
    710     KURLGooglePrivate::Replacements replacements;
    711     replacements.SetPassword(CharactersOrEmpty(pass),
    712                              url_parse::Component(0, pass.length()));
    713     m_url.replaceComponents(replacements);
    714 }
    715 
    716 void KURL::setFragmentIdentifier(const String& s)
    717 {
    718     // This function is commonly called to clear the ref, which we
    719     // normally don't have, so we optimize this case.
    720     if (s.isNull() && !m_url.m_parsed.ref.is_valid())
    721         return;
    722 
    723     KURLGooglePrivate::Replacements replacements;
    724     if (s.isNull())
    725         replacements.ClearRef();
    726     else
    727         replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length()));
    728     m_url.replaceComponents(replacements);
    729 }
    730 
    731 void KURL::removeFragmentIdentifier()
    732 {
    733     KURLGooglePrivate::Replacements replacements;
    734     replacements.ClearRef();
    735     m_url.replaceComponents(replacements);
    736 }
    737 
    738 void KURL::setQuery(const String& query)
    739 {
    740     KURLGooglePrivate::Replacements replacements;
    741     if (query.isNull()) {
    742         // KURL.cpp sets to null to clear any query.
    743         replacements.ClearQuery();
    744     } else if (query.length() > 0 && query[0] == '?') {
    745         // WebCore expects the query string to begin with a question mark, but
    746         // GoogleURL doesn't. So we trim off the question mark when setting.
    747         replacements.SetQuery(CharactersOrEmpty(query),
    748                               url_parse::Component(1, query.length() - 1));
    749     } else {
    750         // When set with the empty string or something that doesn't begin with
    751         // a question mark, KURL.cpp will add a question mark for you. The only
    752         // way this isn't compatible is if you call this function with an empty
    753         // string. KURL.cpp will leave a '?' with nothing following it in the
    754         // URL, whereas we'll clear it.
    755         // FIXME We should eliminate this difference.
    756         replacements.SetQuery(CharactersOrEmpty(query),
    757                               url_parse::Component(0, query.length()));
    758     }
    759     m_url.replaceComponents(replacements);
    760 }
    761 
    762 void KURL::setPath(const String& path)
    763 {
    764     // Empty paths will be canonicalized to "/", so we don't have to worry
    765     // about calling ClearPath().
    766     KURLGooglePrivate::Replacements replacements;
    767     replacements.SetPath(CharactersOrEmpty(path),
    768                          url_parse::Component(0, path.length()));
    769     m_url.replaceComponents(replacements);
    770 }
    771 
    772 // On Mac, this just seems to return the same URL, but with "/foo/bar" for
    773 // file: URLs instead of file:///foo/bar. We don't bother with any of this,
    774 // at least for now.
    775 String KURL::prettyURL() const
    776 {
    777     if (!m_url.m_isValid)
    778         return String();
    779     return m_url.string();
    780 }
    781 
    782 String decodeURLEscapeSequences(const String& str)
    783 {
    784     return decodeURLEscapeSequences(str, UTF8Encoding());
    785 }
    786 
    787 // In KURL.cpp's implementation, this is called by every component getter.
    788 // It will unescape every character, including '\0'. This is scary, and may
    789 // cause security holes. We never call this function for components, and
    790 // just return the ASCII versions instead.
    791 //
    792 // This function is also used to decode javascript: URLs and as a general
    793 // purpose unescaping function.
    794 //
    795 // FIXME These should be merged to the KURL.cpp implementation.
    796 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding)
    797 {
    798     // FIXME We can probably use KURL.cpp's version of this function
    799     // without modification. However, I'm concerned about
    800     // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old
    801     // custom code for now. Using their version will also fix the bug that
    802     // we ignore the encoding.
    803     //
    804     // FIXME b/1350291: This does not get called very often. We just convert
    805     // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of
    806     // sucks, and we don't use the encoding properly, which will make some
    807     // obscure anchor navigations fail.
    808     CString cstr = str.utf8();
    809 
    810     const char* input = cstr.data();
    811     int inputLength = cstr.length();
    812 
    813     url_canon::RawCanonOutputT<url_parse::UTF16Char> unescaped;
    814 
    815     url_util::DecodeURLEscapeSequences(input, inputLength, &unescaped);
    816 
    817     return String(reinterpret_cast<UChar*>(unescaped.data()),
    818                   unescaped.length());
    819 }
    820 
    821 bool KURL::protocolIs(const char* protocol) const
    822 {
    823     assertProtocolIsGood(protocol);
    824 
    825     // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
    826     // The free function protocolIsJavaScript() should be used instead.
    827     // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript"));
    828 
    829     if (m_url.m_parsed.scheme.len <= 0)
    830         return !protocol;
    831     return lowerCaseEqualsASCII(
    832         m_url.utf8String().data() + m_url.m_parsed.scheme.begin,
    833         m_url.utf8String().data() + m_url.m_parsed.scheme.end(),
    834         protocol);
    835 }
    836 
    837 // This is called to escape a URL string. It is only used externally when
    838 // constructing mailto: links to set the query section. Since our query setter
    839 // will automatically do the correct escaping, this function does not have to
    840 // do any work.
    841 //
    842 // There is a possibility that a future caller may use this function in other
    843 // ways, and may expect to get a valid URL string. The dangerous thing we want
    844 // to protect against here is accidentally getting '\0' characters in a string
    845 // that is not supposed to have them. Therefore, we escape these characters.
    846 String encodeWithURLEscapeSequences(const String& notEncodedString)
    847 {
    848     CString utf8 = UTF8Encoding().encode(
    849         reinterpret_cast<const UChar*>(notEncodedString.characters()),
    850         notEncodedString.length(),
    851         URLEncodedEntitiesForUnencodables);
    852     const char* input = utf8.data();
    853     int inputLength = utf8.length();
    854 
    855     Vector<char, 2048> buffer;
    856     for (int i = 0; i < inputLength; i++) {
    857         if (!input[i])
    858             buffer.append("%00", 3);
    859         else
    860             buffer.append(input[i]);
    861     }
    862     return String(buffer.data(), buffer.size());
    863 }
    864 
    865 bool KURL::isHierarchical() const
    866 {
    867     if (!m_url.m_parsed.scheme.is_nonempty())
    868         return false;
    869     return url_util::IsStandard(
    870         &m_url.utf8String().data()[m_url.m_parsed.scheme.begin],
    871         m_url.m_parsed.scheme);
    872 }
    873 
    874 #ifndef NDEBUG
    875 void KURL::print() const
    876 {
    877     printf("%s\n", m_url.utf8String().data());
    878 }
    879 #endif
    880 
    881 void KURL::invalidate()
    882 {
    883     // This is only called from the constructor so resetting the (automatically
    884     // initialized) string and parsed structure would be a waste of time.
    885     m_url.m_isValid = false;
    886     m_url.m_protocolInHTTPFamily = false;
    887 }
    888 
    889 // Equal up to reference fragments, if any.
    890 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
    891 {
    892     // Compute the length of each URL without its ref. Note that the reference
    893     // begin (if it exists) points to the character *after* the '#', so we need
    894     // to subtract one.
    895     int aLength = a.m_url.utf8String().length();
    896     if (a.m_url.m_parsed.ref.len >= 0)
    897         aLength = a.m_url.m_parsed.ref.begin - 1;
    898 
    899     int bLength = b.m_url.utf8String().length();
    900     if (b.m_url.m_parsed.ref.len >= 0)
    901         bLength = b.m_url.m_parsed.ref.begin - 1;
    902 
    903     return aLength == bLength
    904         && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength);
    905 }
    906 
    907 unsigned KURL::hostStart() const
    908 {
    909     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false);
    910 }
    911 
    912 unsigned KURL::hostEnd() const
    913 {
    914     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true);
    915 }
    916 
    917 unsigned KURL::pathStart() const
    918 {
    919     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
    920 }
    921 
    922 unsigned KURL::pathEnd() const
    923 {
    924     return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true);
    925 }
    926 
    927 unsigned KURL::pathAfterLastSlash() const
    928 {
    929     // When there's no path, ask for what would be the beginning of it.
    930     if (!m_url.m_parsed.path.is_valid())
    931         return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false);
    932 
    933     url_parse::Component filename;
    934     url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path,
    935                                &filename);
    936     return filename.begin;
    937 }
    938 
    939 bool protocolIs(const String& url, const char* protocol)
    940 {
    941     // Do the comparison without making a new string object.
    942     assertProtocolIsGood(protocol);
    943 
    944     // Check the scheme like GURL does.
    945     return url_util::FindAndCompareScheme(url.characters(), url.length(),
    946         protocol, 0);
    947 }
    948 
    949 inline bool KURL::protocolIs(const String& string, const char* protocol)
    950 {
    951     return WebCore::protocolIs(string, protocol);
    952 }
    953 
    954 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
    955 {
    956     if (a.parsed().scheme.end() != b.parsed().scheme.end())
    957         return false;
    958 
    959     int hostStartA = a.hostStart();
    960     int hostLengthA = a.hostEnd() - hostStartA;
    961     int hostStartB = b.hostStart();
    962     int hostLengthB = b.hostEnd() - b.hostStart();
    963     if (hostLengthA != hostLengthB)
    964         return false;
    965 
    966     // Check the scheme
    967     for (int i = 0; i < a.parsed().scheme.end(); ++i)
    968         if (a.string()[i] != b.string()[i])
    969             return false;
    970 
    971     // And the host
    972     for (int i = 0; i < hostLengthA; ++i)
    973         if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
    974             return false;
    975 
    976     if (a.port() != b.port())
    977         return false;
    978 
    979     return true;
    980 }
    981 
    982 } // namespace WebCore
    983 
    984 #endif // USE(GOOGLEURL)
    985