Home | History | Annotate | Download | only in weborigin
      1 /*
      2  * Copyright (C) 2004, 2007, 2008, 2011, 2012 Apple Inc. All rights reserved.
      3  * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
      4  * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "platform/weborigin/KURL.h"
     30 
     31 #include "platform/weborigin/KnownPorts.h"
     32 #include "wtf/StdLibExtras.h"
     33 #include "wtf/text/CString.h"
     34 #include "wtf/text/StringHash.h"
     35 #include "wtf/text/StringUTF8Adaptor.h"
     36 #include "wtf/text/TextEncoding.h"
     37 #include <algorithm>
     38 #include <url/url_util.h>
     39 #ifndef NDEBUG
     40 #include <stdio.h>
     41 #endif
     42 
     43 namespace WebCore {
     44 
     45 static const int maximumValidPortNumber = 0xFFFE;
     46 static const int invalidPortNumber = 0xFFFF;
     47 
     48 static void assertProtocolIsGood(const char* protocol)
     49 {
     50 #ifndef NDEBUG
     51     const char* p = protocol;
     52     while (*p) {
     53         ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
     54         ++p;
     55     }
     56 #endif
     57 }
     58 
     59 // Note: You must ensure that |spec| is a valid canonicalized URL before calling this function.
     60 static const char* asURLChar8Subtle(const String& spec)
     61 {
     62     ASSERT(spec.is8Bit());
     63     // characters8 really return characters in Latin-1, but because we canonicalize
     64     // URL strings, we know that everything before the fragment identifier will
     65     // actually be ASCII, which means this cast is safe as long as you don't look
     66     // at the fragment component.
     67     return reinterpret_cast<const char*>(spec.characters8());
     68 }
     69 
     70 // Returns the characters for the given string, or a pointer to a static empty
     71 // string if the input string is null. This will always ensure we have a non-
     72 // null character pointer since ReplaceComponents has special meaning for null.
     73 static const char* charactersOrEmpty(const StringUTF8Adaptor& string)
     74 {
     75     static const char zero = 0;
     76     return string.data() ? string.data() : &zero;
     77 }
     78 
     79 static bool isSchemeFirstChar(char c)
     80 {
     81     return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
     82 }
     83 
     84 static bool isSchemeChar(char c)
     85 {
     86     return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '+';
     87 }
     88 
     89 static bool isUnicodeEncoding(const WTF::TextEncoding* encoding)
     90 {
     91     return encoding->encodingForFormSubmission() == UTF8Encoding();
     92 }
     93 
     94 namespace {
     95 
     96 class KURLCharsetConverter FINAL : public url::CharsetConverter {
     97 public:
     98     // The encoding parameter may be 0, but in this case the object must not be called.
     99     explicit KURLCharsetConverter(const WTF::TextEncoding* encoding)
    100         : m_encoding(encoding)
    101     {
    102     }
    103 
    104     virtual void ConvertFromUTF16(const url::UTF16Char* input, int inputLength, url::CanonOutput* output) OVERRIDE
    105     {
    106         CString encoded = m_encoding->normalizeAndEncode(String(input, inputLength), WTF::URLEncodedEntitiesForUnencodables);
    107         output->Append(encoded.data(), static_cast<int>(encoded.length()));
    108     }
    109 
    110 private:
    111     const WTF::TextEncoding* m_encoding;
    112 };
    113 
    114 } // namespace
    115 
    116 bool isValidProtocol(const String& protocol)
    117 {
    118     // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    119     if (protocol.isEmpty())
    120         return false;
    121     if (!isSchemeFirstChar(protocol[0]))
    122         return false;
    123     unsigned protocolLength = protocol.length();
    124     for (unsigned i = 1; i < protocolLength; i++) {
    125         if (!isSchemeChar(protocol[i]))
    126             return false;
    127     }
    128     return true;
    129 }
    130 
    131 String KURL::strippedForUseAsReferrer() const
    132 {
    133     if (protocolIsAbout() || protocolIs("data") || protocolIs("javascript"))
    134         return String();
    135 
    136     if (m_parsed.username.is_nonempty() || m_parsed.password.is_nonempty() || m_parsed.ref.is_nonempty()) {
    137         KURL referrer(*this);
    138         referrer.setUser(String());
    139         referrer.setPass(String());
    140         referrer.removeFragmentIdentifier();
    141         return referrer.string();
    142     }
    143     return string();
    144 }
    145 
    146 bool KURL::isLocalFile() const
    147 {
    148     // Including feed here might be a bad idea since drag and drop uses this check
    149     // and including feed would allow feeds to potentially let someone's blog
    150     // read the contents of the clipboard on a drag, even without a drop.
    151     // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
    152     return protocolIs("file");
    153 }
    154 
    155 bool protocolIsJavaScript(const String& url)
    156 {
    157     return protocolIs(url, "javascript");
    158 }
    159 
    160 const KURL& blankURL()
    161 {
    162     DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank"));
    163     return staticBlankURL;
    164 }
    165 
    166 bool KURL::isAboutBlankURL() const
    167 {
    168     return *this == blankURL();
    169 }
    170 
    171 String KURL::elidedString() const
    172 {
    173     if (string().length() <= 1024)
    174         return string();
    175 
    176     return string().left(511) + "..." + string().right(510);
    177 }
    178 
    179 KURL::KURL()
    180     : m_isValid(false)
    181     , m_protocolIsInHTTPFamily(false)
    182 {
    183 }
    184 
    185 // Initializes with a string representing an absolute URL. No encoding
    186 // information is specified. This generally happens when a KURL is converted
    187 // to a string and then converted back. In this case, the URL is already
    188 // canonical and in proper escaped form so needs no encoding. We treat it as
    189 // UTF-8 just in case.
    190 KURL::KURL(ParsedURLStringTag, const String& url)
    191 {
    192     if (!url.isNull())
    193         init(KURL(), url, 0);
    194     else {
    195         // WebCore expects us to preserve the nullness of strings when this
    196         // constructor is used. In all other cases, it expects a non-null
    197         // empty string, which is what init() will create.
    198         m_isValid = false;
    199         m_protocolIsInHTTPFamily = false;
    200     }
    201 }
    202 
    203 KURL KURL::createIsolated(ParsedURLStringTag, const String& url)
    204 {
    205     // FIXME: We should be able to skip this extra copy and created an
    206     // isolated KURL more efficiently.
    207     return KURL(ParsedURLString, url).copy();
    208 }
    209 
    210 // Constructs a new URL given a base URL and a possibly relative input URL.
    211 // This assumes UTF-8 encoding.
    212 KURL::KURL(const KURL& base, const String& relative)
    213 {
    214     init(base, relative, 0);
    215 }
    216 
    217 // Constructs a new URL given a base URL and a possibly relative input URL.
    218 // Any query portion of the relative URL will be encoded in the given encoding.
    219 KURL::KURL(const KURL& base, const String& relative, const WTF::TextEncoding& encoding)
    220 {
    221     init(base, relative, &encoding.encodingForFormSubmission());
    222 }
    223 
    224 KURL::KURL(const AtomicString& canonicalString, const url::Parsed& parsed, bool isValid)
    225     : m_isValid(isValid)
    226     , m_protocolIsInHTTPFamily(false)
    227     , m_parsed(parsed)
    228     , m_string(canonicalString)
    229 {
    230     initProtocolIsInHTTPFamily();
    231     initInnerURL();
    232 }
    233 
    234 KURL::KURL(WTF::HashTableDeletedValueType)
    235     : m_isValid(false)
    236     , m_protocolIsInHTTPFamily(false)
    237     , m_string(WTF::HashTableDeletedValue)
    238 {
    239 }
    240 
    241 KURL::KURL(const KURL& other)
    242     : m_isValid(other.m_isValid)
    243     , m_protocolIsInHTTPFamily(other.m_protocolIsInHTTPFamily)
    244     , m_parsed(other.m_parsed)
    245     , m_string(other.m_string)
    246 {
    247     if (other.m_innerURL.get())
    248         m_innerURL = adoptPtr(new KURL(other.m_innerURL->copy()));
    249 }
    250 
    251 KURL& KURL::operator=(const KURL& other)
    252 {
    253     m_isValid = other.m_isValid;
    254     m_protocolIsInHTTPFamily = other.m_protocolIsInHTTPFamily;
    255     m_parsed = other.m_parsed;
    256     m_string = other.m_string;
    257     if (other.m_innerURL)
    258         m_innerURL = adoptPtr(new KURL(other.m_innerURL->copy()));
    259     else
    260         m_innerURL.clear();
    261     return *this;
    262 }
    263 
    264 #if COMPILER_SUPPORTS(CXX_RVALUE_REFERENCES)
    265 KURL::KURL(KURL&& other)
    266     : m_isValid(other.m_isValid)
    267     , m_protocolIsInHTTPFamily(other.m_protocolIsInHTTPFamily)
    268     , m_parsed(other.m_parsed)
    269     // FIXME: Instead of explicitly casting to String&& here, we should use std::move, but that requires us to
    270     // have a standard library that supports move semantics.
    271     , m_string(static_cast<String&&>(other.m_string))
    272     , m_innerURL(other.m_innerURL.release())
    273 {
    274 }
    275 
    276 KURL& KURL::operator=(KURL&& other)
    277 {
    278     m_isValid = other.m_isValid;
    279     m_protocolIsInHTTPFamily = other.m_protocolIsInHTTPFamily;
    280     m_parsed = other.m_parsed;
    281     // FIXME: Instead of explicitly casting to String&& here, we should use std::move, but that requires us to
    282     // have a standard library that supports move semantics.
    283     m_string = static_cast<String&&>(other.m_string);
    284     m_innerURL = other.m_innerURL.release();
    285     return *this;
    286 }
    287 #endif
    288 
    289 KURL KURL::copy() const
    290 {
    291     KURL result;
    292     result.m_isValid = m_isValid;
    293     result.m_protocolIsInHTTPFamily = m_protocolIsInHTTPFamily;
    294     result.m_parsed = m_parsed;
    295     result.m_string = m_string.isolatedCopy();
    296     if (m_innerURL)
    297         result.m_innerURL = adoptPtr(new KURL(m_innerURL->copy()));
    298     return result;
    299 }
    300 
    301 bool KURL::isNull() const
    302 {
    303     return m_string.isNull();
    304 }
    305 
    306 bool KURL::isEmpty() const
    307 {
    308     return m_string.isEmpty();
    309 }
    310 
    311 bool KURL::isValid() const
    312 {
    313     return m_isValid;
    314 }
    315 
    316 bool KURL::hasPort() const
    317 {
    318     return hostEnd() < pathStart();
    319 }
    320 
    321 bool KURL::protocolIsInHTTPFamily() const
    322 {
    323     return m_protocolIsInHTTPFamily;
    324 }
    325 
    326 bool KURL::hasPath() const
    327 {
    328     // Note that http://www.google.com/" has a path, the path is "/". This can
    329     // return false only for invalid or nonstandard URLs.
    330     return m_parsed.path.len >= 0;
    331 }
    332 
    333 // We handle "parameters" separated by a semicolon, while KURL.cpp does not,
    334 // which can lead to different results in some cases.
    335 String KURL::lastPathComponent() const
    336 {
    337     if (!m_isValid)
    338         return stringForInvalidComponent();
    339     ASSERT(!m_string.isNull());
    340 
    341     // When the output ends in a slash, WebCore has different expectations than
    342     // the GoogleURL library. For "/foo/bar/" the library will return the empty
    343     // string, but WebCore wants "bar".
    344     url::Component path = m_parsed.path;
    345     if (path.len > 0 && m_string[path.end() - 1] == '/')
    346         path.len--;
    347 
    348     url::Component file;
    349     if (m_string.is8Bit())
    350         url::ExtractFileName(asURLChar8Subtle(m_string), path, &file);
    351     else
    352         url::ExtractFileName(m_string.characters16(), path, &file);
    353 
    354     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
    355     // a null string when the path is empty, which we duplicate here.
    356     if (!file.is_nonempty())
    357         return String();
    358     return componentString(file);
    359 }
    360 
    361 String KURL::protocol() const
    362 {
    363     return componentString(m_parsed.scheme);
    364 }
    365 
    366 String KURL::host() const
    367 {
    368     return componentString(m_parsed.host);
    369 }
    370 
    371 // Returns 0 when there is no port.
    372 //
    373 // We treat URL's with out-of-range port numbers as invalid URLs, and they will
    374 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but
    375 // return invalidPortNumber from this port() function, so we mirror that behavior here.
    376 unsigned short KURL::port() const
    377 {
    378     if (!m_isValid || m_parsed.port.len <= 0)
    379         return 0;
    380     ASSERT(!m_string.isNull());
    381     int port = m_string.is8Bit() ?
    382         url::ParsePort(asURLChar8Subtle(m_string), m_parsed.port) :
    383         url::ParsePort(m_string.characters16(), m_parsed.port);
    384     ASSERT(port != url::PORT_UNSPECIFIED); // Checked port.len <= 0 before.
    385 
    386     if (port == url::PORT_INVALID || port > maximumValidPortNumber) // Mimic KURL::port()
    387         port = invalidPortNumber;
    388 
    389     return static_cast<unsigned short>(port);
    390 }
    391 
    392 String KURL::pass() const
    393 {
    394     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
    395     // a null string when the password is empty, which we duplicate here.
    396     if (!m_parsed.password.is_nonempty())
    397         return String();
    398     return componentString(m_parsed.password);
    399 }
    400 
    401 String KURL::user() const
    402 {
    403     return componentString(m_parsed.username);
    404 }
    405 
    406 String KURL::fragmentIdentifier() const
    407 {
    408     // Empty but present refs ("foo.com/bar#") should result in the empty
    409     // string, which componentString will produce. Nonexistent refs
    410     // should be the null string.
    411     if (!m_parsed.ref.is_valid())
    412         return String();
    413     return componentString(m_parsed.ref);
    414 }
    415 
    416 bool KURL::hasFragmentIdentifier() const
    417 {
    418     return m_parsed.ref.len >= 0;
    419 }
    420 
    421 String KURL::baseAsString() const
    422 {
    423     // FIXME: There is probably a more efficient way to do this?
    424     return m_string.left(pathAfterLastSlash());
    425 }
    426 
    427 String KURL::query() const
    428 {
    429     if (m_parsed.query.len >= 0)
    430         return componentString(m_parsed.query);
    431 
    432     // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns
    433     // an empty string when the query is empty rather than a null (not sure
    434     // which is right).
    435     // Returns a null if the query is not specified, instead of empty.
    436     if (m_parsed.query.is_valid())
    437         return emptyString();
    438     return String();
    439 }
    440 
    441 String KURL::path() const
    442 {
    443     return componentString(m_parsed.path);
    444 }
    445 
    446 bool KURL::setProtocol(const String& protocol)
    447 {
    448     // Firefox and IE remove everything after the first ':'.
    449     int separatorPosition = protocol.find(':');
    450     String newProtocol = protocol.substring(0, separatorPosition);
    451     StringUTF8Adaptor newProtocolUTF8(newProtocol);
    452 
    453     // If KURL is given an invalid scheme, it returns failure without modifying
    454     // the URL at all. This is in contrast to most other setters which modify
    455     // the URL and set "m_isValid."
    456     url::RawCanonOutputT<char> canonProtocol;
    457     url::Component protocolComponent;
    458     if (!url::CanonicalizeScheme(newProtocolUTF8.data(), url::Component(0, newProtocolUTF8.length()), &canonProtocol, &protocolComponent)
    459         || !protocolComponent.is_nonempty())
    460         return false;
    461 
    462     url::Replacements<char> replacements;
    463     replacements.SetScheme(charactersOrEmpty(newProtocolUTF8), url::Component(0, newProtocolUTF8.length()));
    464     replaceComponents(replacements);
    465 
    466     // isValid could be false but we still return true here. This is because
    467     // WebCore or JS scripts can build up a URL by setting individual
    468     // components, and a JS exception is based on the return value of this
    469     // function. We want to throw the exception and stop the script only when
    470     // its trying to set a bad protocol, and not when it maybe just hasn't
    471     // finished building up its final scheme.
    472     return true;
    473 }
    474 
    475 void KURL::setHost(const String& host)
    476 {
    477     StringUTF8Adaptor hostUTF8(host);
    478     url::Replacements<char> replacements;
    479     replacements.SetHost(charactersOrEmpty(hostUTF8), url::Component(0, hostUTF8.length()));
    480     replaceComponents(replacements);
    481 }
    482 
    483 static String parsePortFromStringPosition(const String& value, unsigned portStart)
    484 {
    485     // "008080junk" needs to be treated as port "8080" and "000" as "0".
    486     size_t length = value.length();
    487     unsigned portEnd = portStart;
    488     while (isASCIIDigit(value[portEnd]) && portEnd < length)
    489         ++portEnd;
    490     while (value[portStart] == '0' && portStart < portEnd - 1)
    491         ++portStart;
    492 
    493     // Required for backwards compat.
    494     // https://www.w3.org/Bugs/Public/show_bug.cgi?id=23463
    495     if (portStart == portEnd)
    496         return "0";
    497 
    498     return value.substring(portStart, portEnd - portStart);
    499 }
    500 
    501 void KURL::setHostAndPort(const String& hostAndPort)
    502 {
    503     size_t separator = hostAndPort.find(':');
    504     if (!separator)
    505         return;
    506 
    507     if (separator == kNotFound) {
    508         url::Replacements<char> replacements;
    509         StringUTF8Adaptor hostUTF8(hostAndPort);
    510         replacements.SetHost(charactersOrEmpty(hostUTF8), url::Component(0, hostUTF8.length()));
    511         replaceComponents(replacements);
    512         return;
    513     }
    514 
    515     String host = hostAndPort.substring(0, separator);
    516     String port = parsePortFromStringPosition(hostAndPort, separator + 1);
    517 
    518     StringUTF8Adaptor hostUTF8(host);
    519     StringUTF8Adaptor portUTF8(port);
    520 
    521     url::Replacements<char> replacements;
    522     replacements.SetHost(charactersOrEmpty(hostUTF8), url::Component(0, hostUTF8.length()));
    523     replacements.SetPort(charactersOrEmpty(portUTF8), url::Component(0, portUTF8.length()));
    524     replaceComponents(replacements);
    525 }
    526 
    527 void KURL::removePort()
    528 {
    529     if (!hasPort())
    530         return;
    531     url::Replacements<char> replacements;
    532     replacements.ClearPort();
    533     replaceComponents(replacements);
    534 }
    535 
    536 void KURL::setPort(const String& port)
    537 {
    538     String parsedPort = parsePortFromStringPosition(port, 0);
    539     setPort(parsedPort.toUInt());
    540 }
    541 
    542 void KURL::setPort(unsigned short port)
    543 {
    544     if (isDefaultPortForProtocol(port, protocol())) {
    545         removePort();
    546         return;
    547     }
    548 
    549     String portString = String::number(port);
    550     ASSERT(portString.is8Bit());
    551 
    552     url::Replacements<char> replacements;
    553     replacements.SetPort(reinterpret_cast<const char*>(portString.characters8()), url::Component(0, portString.length()));
    554     replaceComponents(replacements);
    555 }
    556 
    557 void KURL::setUser(const String& user)
    558 {
    559     // This function is commonly called to clear the username, which we
    560     // normally don't have, so we optimize this case.
    561     if (user.isEmpty() && !m_parsed.username.is_valid())
    562         return;
    563 
    564     // The canonicalizer will clear any usernames that are empty, so we
    565     // don't have to explicitly call ClearUsername() here.
    566     StringUTF8Adaptor userUTF8(user);
    567     url::Replacements<char> replacements;
    568     replacements.SetUsername(charactersOrEmpty(userUTF8), url::Component(0, userUTF8.length()));
    569     replaceComponents(replacements);
    570 }
    571 
    572 void KURL::setPass(const String& pass)
    573 {
    574     // This function is commonly called to clear the password, which we
    575     // normally don't have, so we optimize this case.
    576     if (pass.isEmpty() && !m_parsed.password.is_valid())
    577         return;
    578 
    579     // The canonicalizer will clear any passwords that are empty, so we
    580     // don't have to explicitly call ClearUsername() here.
    581     StringUTF8Adaptor passUTF8(pass);
    582     url::Replacements<char> replacements;
    583     replacements.SetPassword(charactersOrEmpty(passUTF8), url::Component(0, passUTF8.length()));
    584     replaceComponents(replacements);
    585 }
    586 
    587 void KURL::setFragmentIdentifier(const String& fragment)
    588 {
    589     // This function is commonly called to clear the ref, which we
    590     // normally don't have, so we optimize this case.
    591     if (fragment.isNull() && !m_parsed.ref.is_valid())
    592         return;
    593 
    594     StringUTF8Adaptor fragmentUTF8(fragment);
    595 
    596     url::Replacements<char> replacements;
    597     if (fragment.isNull())
    598         replacements.ClearRef();
    599     else
    600         replacements.SetRef(charactersOrEmpty(fragmentUTF8), url::Component(0, fragmentUTF8.length()));
    601     replaceComponents(replacements);
    602 }
    603 
    604 void KURL::removeFragmentIdentifier()
    605 {
    606     url::Replacements<char> replacements;
    607     replacements.ClearRef();
    608     replaceComponents(replacements);
    609 }
    610 
    611 void KURL::setQuery(const String& query)
    612 {
    613     StringUTF8Adaptor queryUTF8(query);
    614     url::Replacements<char> replacements;
    615     if (query.isNull()) {
    616         // KURL.cpp sets to null to clear any query.
    617         replacements.ClearQuery();
    618     } else if (query.length() > 0 && query[0] == '?') {
    619         // WebCore expects the query string to begin with a question mark, but
    620         // GoogleURL doesn't. So we trim off the question mark when setting.
    621         replacements.SetQuery(charactersOrEmpty(queryUTF8), url::Component(1, queryUTF8.length() - 1));
    622     } else {
    623         // When set with the empty string or something that doesn't begin with
    624         // a question mark, KURL.cpp will add a question mark for you. The only
    625         // way this isn't compatible is if you call this function with an empty
    626         // string. KURL.cpp will leave a '?' with nothing following it in the
    627         // URL, whereas we'll clear it.
    628         // FIXME We should eliminate this difference.
    629         replacements.SetQuery(charactersOrEmpty(queryUTF8), url::Component(0, queryUTF8.length()));
    630     }
    631     replaceComponents(replacements);
    632 }
    633 
    634 void KURL::setPath(const String& path)
    635 {
    636     // Empty paths will be canonicalized to "/", so we don't have to worry
    637     // about calling ClearPath().
    638     StringUTF8Adaptor pathUTF8(path);
    639     url::Replacements<char> replacements;
    640     replacements.SetPath(charactersOrEmpty(pathUTF8), url::Component(0, pathUTF8.length()));
    641     replaceComponents(replacements);
    642 }
    643 
    644 String decodeURLEscapeSequences(const String& string)
    645 {
    646     return decodeURLEscapeSequences(string, UTF8Encoding());
    647 }
    648 
    649 // In KURL.cpp's implementation, this is called by every component getter.
    650 // It will unescape every character, including '\0'. This is scary, and may
    651 // cause security holes. We never call this function for components, and
    652 // just return the ASCII versions instead.
    653 //
    654 // This function is also used to decode javascript: URLs and as a general
    655 // purpose unescaping function.
    656 //
    657 // FIXME These should be merged to the KURL.cpp implementation.
    658 String decodeURLEscapeSequences(const String& string, const WTF::TextEncoding& encoding)
    659 {
    660     // FIXME We can probably use KURL.cpp's version of this function
    661     // without modification. However, I'm concerned about
    662     // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old
    663     // custom code for now. Using their version will also fix the bug that
    664     // we ignore the encoding.
    665     //
    666     // FIXME b/1350291: This does not get called very often. We just convert
    667     // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of
    668     // sucks, and we don't use the encoding properly, which will make some
    669     // obscure anchor navigations fail.
    670     StringUTF8Adaptor stringUTF8(string);
    671     url::RawCanonOutputT<url::UTF16Char> unescaped;
    672     url::DecodeURLEscapeSequences(stringUTF8.data(), stringUTF8.length(), &unescaped);
    673     return StringImpl::create8BitIfPossible(reinterpret_cast<UChar*>(unescaped.data()), unescaped.length());
    674 }
    675 
    676 String encodeWithURLEscapeSequences(const String& notEncodedString)
    677 {
    678     CString utf8 = UTF8Encoding().normalizeAndEncode(notEncodedString, WTF::URLEncodedEntitiesForUnencodables);
    679 
    680     url::RawCanonOutputT<char> buffer;
    681     int inputLength = utf8.length();
    682     if (buffer.length() < inputLength * 3)
    683         buffer.Resize(inputLength * 3);
    684 
    685     url::EncodeURIComponent(utf8.data(), inputLength, &buffer);
    686     String escaped(buffer.data(), buffer.length());
    687     // Unescape '/'; it's safe and much prettier.
    688     escaped.replace("%2F", "/");
    689     return escaped;
    690 }
    691 
    692 bool KURL::isHierarchical() const
    693 {
    694     if (m_string.isNull() || !m_parsed.scheme.is_nonempty())
    695         return false;
    696     return m_string.is8Bit() ?
    697         url::IsStandard(asURLChar8Subtle(m_string), m_parsed.scheme) :
    698         url::IsStandard(m_string.characters16(), m_parsed.scheme);
    699 }
    700 
    701 #ifndef NDEBUG
    702 void KURL::print() const
    703 {
    704     printf("%s\n", m_string.utf8().data());
    705 }
    706 #endif
    707 
    708 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
    709 {
    710     // Compute the length of each URL without its ref. Note that the reference
    711     // begin (if it exists) points to the character *after* the '#', so we need
    712     // to subtract one.
    713     int aLength = a.m_string.length();
    714     if (a.m_parsed.ref.len >= 0)
    715         aLength = a.m_parsed.ref.begin - 1;
    716 
    717     int bLength = b.m_string.length();
    718     if (b.m_parsed.ref.len >= 0)
    719         bLength = b.m_parsed.ref.begin - 1;
    720 
    721     if (aLength != bLength)
    722         return false;
    723 
    724     const String& aString = a.m_string;
    725     const String& bString = b.m_string;
    726     // FIXME: Abstraction this into a function in WTFString.h.
    727     for (int i = 0; i < aLength; ++i) {
    728         if (aString[i] != bString[i])
    729             return false;
    730     }
    731     return true;
    732 }
    733 
    734 unsigned KURL::hostStart() const
    735 {
    736     return m_parsed.CountCharactersBefore(url::Parsed::HOST, false);
    737 }
    738 
    739 unsigned KURL::hostEnd() const
    740 {
    741     return m_parsed.CountCharactersBefore(url::Parsed::PORT, true);
    742 }
    743 
    744 unsigned KURL::pathStart() const
    745 {
    746     return m_parsed.CountCharactersBefore(url::Parsed::PATH, false);
    747 }
    748 
    749 unsigned KURL::pathEnd() const
    750 {
    751     return m_parsed.CountCharactersBefore(url::Parsed::QUERY, true);
    752 }
    753 
    754 unsigned KURL::pathAfterLastSlash() const
    755 {
    756     if (m_string.isNull())
    757         return 0;
    758     if (!m_isValid || !m_parsed.path.is_valid())
    759         return m_parsed.CountCharactersBefore(url::Parsed::PATH, false);
    760     url::Component filename;
    761     if (m_string.is8Bit())
    762         url::ExtractFileName(asURLChar8Subtle(m_string), m_parsed.path, &filename);
    763     else
    764         url::ExtractFileName(m_string.characters16(), m_parsed.path, &filename);
    765     return filename.begin;
    766 }
    767 
    768 bool protocolIs(const String& url, const char* protocol)
    769 {
    770     assertProtocolIsGood(protocol);
    771     if (url.isNull())
    772         return false;
    773     if (url.is8Bit())
    774         return url::FindAndCompareScheme(asURLChar8Subtle(url), url.length(), protocol, 0);
    775     return url::FindAndCompareScheme(url.characters16(), url.length(), protocol, 0);
    776 }
    777 
    778 void KURL::init(const KURL& base, const String& relative, const WTF::TextEncoding* queryEncoding)
    779 {
    780     if (!relative.isNull() && relative.is8Bit()) {
    781         StringUTF8Adaptor relativeUTF8(relative);
    782         init(base, relativeUTF8.data(), relativeUTF8.length(), queryEncoding);
    783     } else
    784         init(base, relative.characters16(), relative.length(), queryEncoding);
    785     initProtocolIsInHTTPFamily();
    786     initInnerURL();
    787 }
    788 
    789 template <typename CHAR>
    790 void KURL::init(const KURL& base, const CHAR* relative, int relativeLength, const WTF::TextEncoding* queryEncoding)
    791 {
    792     // As a performance optimization, we do not use the charset converter
    793     // if encoding is UTF-8 or other Unicode encodings. Note that this is
    794     // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more
    795     // efficient with no charset converter object because it can do UTF-8
    796     // internally with no extra copies.
    797 
    798     // We feel free to make the charset converter object every time since it's
    799     // just a wrapper around a reference.
    800     KURLCharsetConverter charsetConverterObject(queryEncoding);
    801     KURLCharsetConverter* charsetConverter = (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 : &charsetConverterObject;
    802 
    803     StringUTF8Adaptor baseUTF8(base.string());
    804 
    805     url::RawCanonOutputT<char> output;
    806     m_isValid = url::ResolveRelative(baseUTF8.data(), baseUTF8.length(), base.m_parsed, relative, relativeLength, charsetConverter, &output, &m_parsed);
    807 
    808     // See FIXME in KURLPrivate in the header. If canonicalization has not
    809     // changed the string, we can avoid an extra allocation by using assignment.
    810     m_string = AtomicString::fromUTF8(output.data(), output.length());
    811 }
    812 
    813 void KURL::initInnerURL()
    814 {
    815     if (!m_isValid) {
    816         m_innerURL.clear();
    817         return;
    818     }
    819     if (url::Parsed* innerParsed = m_parsed.inner_parsed())
    820         m_innerURL = adoptPtr(new KURL(ParsedURLString, m_string.substring(innerParsed->scheme.begin, innerParsed->Length() - innerParsed->scheme.begin)));
    821     else
    822         m_innerURL.clear();
    823 }
    824 
    825 template<typename CHAR>
    826 bool internalProtocolIs(const url::Component& scheme, const CHAR* spec, const char* protocol)
    827 {
    828     const CHAR* begin = spec + scheme.begin;
    829     const CHAR* end = begin + scheme.len;
    830 
    831     while (begin != end && *protocol) {
    832         ASSERT(toASCIILower(*protocol) == *protocol);
    833         if (toASCIILower(*begin++) != *protocol++)
    834             return false;
    835     }
    836 
    837     // Both strings are equal (ignoring case) if and only if all of the characters were equal,
    838     // and the end of both has been reached.
    839     return begin == end && !*protocol;
    840 }
    841 
    842 template<typename CHAR>
    843 bool checkIfProtocolIsInHTTPFamily(const url::Component& scheme, const CHAR* spec)
    844 {
    845     if (scheme.len == 4)
    846         return internalProtocolIs(scheme, spec, "http");
    847     if (scheme.len == 5)
    848         return internalProtocolIs(scheme, spec, "https");
    849     return false;
    850 }
    851 
    852 void KURL::initProtocolIsInHTTPFamily()
    853 {
    854     if (!m_isValid) {
    855         m_protocolIsInHTTPFamily = false;
    856         return;
    857     }
    858 
    859     ASSERT(!m_string.isNull());
    860     m_protocolIsInHTTPFamily = m_string.is8Bit() ?
    861         checkIfProtocolIsInHTTPFamily(m_parsed.scheme, m_string.characters8()) :
    862         checkIfProtocolIsInHTTPFamily(m_parsed.scheme, m_string.characters16());
    863 }
    864 
    865 bool KURL::protocolIs(const char* protocol) const
    866 {
    867     assertProtocolIsGood(protocol);
    868 
    869     // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
    870     // The free function protocolIsJavaScript() should be used instead.
    871     // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript"));
    872 
    873     if (m_string.isNull() || m_parsed.scheme.len <= 0)
    874         return *protocol == '\0';
    875 
    876     return m_string.is8Bit() ?
    877         internalProtocolIs(m_parsed.scheme, m_string.characters8(), protocol) :
    878         internalProtocolIs(m_parsed.scheme, m_string.characters16(), protocol);
    879 }
    880 
    881 String KURL::stringForInvalidComponent() const
    882 {
    883     if (m_string.isNull())
    884         return String();
    885     return emptyString();
    886 }
    887 
    888 String KURL::componentString(const url::Component& component) const
    889 {
    890     if (!m_isValid || component.len <= 0)
    891         return stringForInvalidComponent();
    892     // begin and len are in terms of bytes which do not match
    893     // if string() is UTF-16 and input contains non-ASCII characters.
    894     // However, the only part in urlString that can contain non-ASCII
    895     // characters is 'ref' at the end of the string. In that case,
    896     // begin will always match the actual value and len (in terms of
    897     // byte) will be longer than what's needed by 'mid'. However, mid
    898     // truncates len to avoid go past the end of a string so that we can
    899     // get away without doing anything here.
    900     return string().substring(component.begin, component.len);
    901 }
    902 
    903 template<typename CHAR>
    904 void KURL::replaceComponents(const url::Replacements<CHAR>& replacements)
    905 {
    906     url::RawCanonOutputT<char> output;
    907     url::Parsed newParsed;
    908 
    909     StringUTF8Adaptor utf8(m_string);
    910     m_isValid = url::ReplaceComponents(utf8.data(), utf8.length(), m_parsed, replacements, 0, &output, &newParsed);
    911 
    912     m_parsed = newParsed;
    913     m_string = AtomicString::fromUTF8(output.data(), output.length());
    914 }
    915 
    916 bool KURL::isSafeToSendToAnotherThread() const
    917 {
    918     return m_string.isSafeToSendToAnotherThread()
    919         && (!m_innerURL || m_innerURL->isSafeToSendToAnotherThread());
    920 }
    921 
    922 } // namespace WebCore
    923