1 /* 2 * Copyright (C) 2004, 2007, 2008 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "config.h"

#include "KURL.h"

#include "TextEncoding.h"
#include <wtf/text/CString.h>
#include <wtf/HashMap.h>
#include <wtf/HexNumber.h>
#include <wtf/StdLibExtras.h>
#include <wtf/text/StringHash.h>

#if USE(ICU_UNICODE)
#include <unicode/uidna.h>
#elif USE(QT4_UNICODE)
#include <QUrl>
#elif USE(GLIB_UNICODE)
#include <glib.h>
#include "GOwnPtr.h"
#endif

#include <stdio.h>

using namespace std;
using namespace WTF;

namespace WebCore {

// Scratch buffer types for 8-bit and UChar data.
// NOTE(review): the second template argument (512) is presumably the Vector's
// inline capacity - confirm against wtf/Vector.h.
typedef Vector<char, 512> CharBuffer;
typedef Vector<UChar, 512> UCharBuffer;

// KURL::port() returns invalidPortNumber when the port text does not parse
// strictly as an unsigned integer or is larger than maximumValidPortNumber.
static const unsigned maximumValidPortNumber = 0xFFFE;
static const unsigned invalidPortNumber = 0xFFFF;

#if !USE(GOOGLEURL)

// FIXME: This file makes too much use of the + operator on String.
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use Vector<UChar> instead.

// Bit flags naming the URL character classes a byte can belong to. The flags
// for each byte value are OR-ed together in characterClassTable below.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 1 << 0,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 1 << 1,

    // mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved  = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 1 << 2,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 1 << 3,

    // hexdigit | ":" | "%"
    IPv6Char = 1 << 4,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 1 << 5,

    // not allowed in path
    BadChar = 1 << 6
};

// Lookup table indexed by byte value (0-255). Each entry is the OR of the
// URLCharacterClasses flags that apply to that byte; 0 means the byte belongs
// to none of the classes. Entries are positional: the leading comment on each
// entry gives its index and character.
static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
    /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
    /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
    /* 8 bs */ BadChar,     /* 9 ht */ BadChar,     /* 10 nl */ BadChar,    /* 11 vt */ BadChar,
    /* 12 np */ BadChar,    /* 13 cr */ BadChar,    /* 14 so */ BadChar,    /* 15 si */ BadChar,
    /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
    /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
    /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
    /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
    /* 32 sp */ BadChar,    /* 33 ! */ UserInfoChar,
    /* 34 " */ BadChar,     /* 35 # */ PathSegmentEndChar | BadChar,
    /* 36 $ */ UserInfoChar,    /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38 & */ UserInfoChar,    /* 39 ' */ UserInfoChar,
    /* 40 ( */ UserInfoChar,    /* 41 ) */ UserInfoChar,
    /* 42 * */ UserInfoChar,    /* 43 + */ SchemeChar | UserInfoChar,
    /* 44 , */ UserInfoChar,
    /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47 / */ PathSegmentEndChar,
    /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 58 : */ UserInfoChar | IPv6Char,    /* 59 ; */ UserInfoChar,
    /* 60 < */ BadChar,    /* 61 = */ UserInfoChar,
    /* 62 > */ BadChar,    /* 63 ? */ PathSegmentEndChar | BadChar,
    /* 64 @ */ 0,
    /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 91 [ */ 0,
    /* 92 \ */ 0,    /* 93 ] */ 0,
    /* 94 ^ */ 0,
    /* 95 _ */ UserInfoChar | HostnameChar,
    /* 96 ` */ 0,
    /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 123 { */ 0,
    /* 124 | */ 0,   /* 125 } */ 0,   /* 126 ~ */ UserInfoChar,   /* 127 del */ BadChar,
    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};

// Forward declarations of file-local helpers defined later in this file.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
static String substituteBackslashes(const String&);

// Character-class predicates backed by characterClassTable.
// The char overloads convert through unsigned char before indexing, so the
// index is always in the range 0-255 regardless of the signedness of char.
// The UChar overloads report false for any code unit above 0xff, since the
// table only covers 256 entries.
static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }

// Returns the numeric value (0-15) of an ASCII hex digit. The argument must
// satisfy isASCIIHexDigit(); this is asserted, not checked in release builds.
static inline int hexDigitValue(UChar c)
{
    ASSERT(isASCIIHexDigit(c));
    // '0'-'9' all sort below 'A'.
    if (c < 'A')
        return c - '0';
    // 'a' - 'A' is 32, a multiple of 16, so masking with 0xF gives the same
    // result for a lower-case letter as for its upper-case counterpart.
    return (c - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
}

// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
246 static void copyASCII(const UChar* src, int length, char* dest) 247 { 248 for (int i = 0; i < length; i++) 249 dest[i] = static_cast<char>(src[i]); 250 } 251 252 static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer) 253 { 254 buffer.resize(base.length() + len + 1); 255 copyASCII(base.characters(), base.length(), buffer.data()); 256 memcpy(buffer.data() + base.length(), rel, len); 257 buffer[buffer.size() - 1] = '\0'; 258 } 259 260 // FIXME: Move to PlatformString.h eventually. 261 // Returns the index of the first index in string |s| of any of the characters 262 // in |toFind|. |toFind| should be a null-terminated string, all characters up 263 // to the null will be searched. Returns int if not found. 264 static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind) 265 { 266 for (int i = startPos; i < sLen; i++) { 267 const char* cur = toFind; 268 while (*cur) { 269 if (s[i] == *(cur++)) 270 return i; 271 } 272 } 273 return -1; 274 } 275 276 #ifndef NDEBUG 277 static void checkEncodedString(const String& url) 278 { 279 for (unsigned i = 0; i < url.length(); ++i) 280 ASSERT(!(url[i] & ~0x7F)); 281 282 ASSERT(!url.length() || isSchemeFirstChar(url[0])); 283 } 284 #else 285 static inline void checkEncodedString(const String&) 286 { 287 } 288 #endif 289 290 inline bool KURL::protocolIs(const String& string, const char* protocol) 291 { 292 return WebCore::protocolIs(string, protocol); 293 } 294 295 void KURL::invalidate() 296 { 297 m_isValid = false; 298 m_protocolInHTTPFamily = false; 299 m_schemeEnd = 0; 300 m_userStart = 0; 301 m_userEnd = 0; 302 m_passwordEnd = 0; 303 m_hostEnd = 0; 304 m_portEnd = 0; 305 m_pathEnd = 0; 306 m_pathAfterLastSlash = 0; 307 m_queryEnd = 0; 308 m_fragmentEnd = 0; 309 } 310 311 KURL::KURL(ParsedURLStringTag, const char* url) 312 { 313 parse(url, 0); 314 ASSERT(url == m_string); 315 } 316 317 KURL::KURL(ParsedURLStringTag, const String& url) 318 { 319 parse(url); 320 
ASSERT(url == m_string); 321 } 322 323 KURL::KURL(ParsedURLStringTag, const URLString& url) 324 { 325 parse(url.string()); 326 ASSERT(url.string() == m_string); 327 } 328 329 KURL::KURL(const KURL& base, const String& relative) 330 { 331 init(base, relative, UTF8Encoding()); 332 } 333 334 KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding) 335 { 336 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as 337 // we do when submitting a form. A form with GET method 338 // has its contents added to a URL as query params and it makes sense 339 // to be consistent. 340 init(base, relative, encoding.encodingForFormSubmission()); 341 } 342 343 static bool shouldTrimFromURL(unsigned char c) 344 { 345 // Browsers ignore leading/trailing whitespace and control 346 // characters from URLs. Note that c is an *unsigned* char here 347 // so this comparison should only catch control characters. 348 return c <= ' '; 349 } 350 351 void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding) 352 { 353 // Allow resolutions with a null or empty base URL, but not with any other invalid one. 354 // FIXME: Is this a good rule? 355 if (!base.m_isValid && !base.isEmpty()) { 356 m_string = relative; 357 invalidate(); 358 return; 359 } 360 361 // For compatibility with Win IE, treat backslashes as if they were slashes, 362 // as long as we're not dealing with javascript: or data: URLs. 
363 String rel = relative; 364 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data"))) 365 rel = substituteBackslashes(rel); 366 367 String* originalString = &rel; 368 369 bool allASCII = charactersAreAllASCII(rel.characters(), rel.length()); 370 CharBuffer strBuffer; 371 char* str; 372 size_t len; 373 if (allASCII) { 374 len = rel.length(); 375 strBuffer.resize(len + 1); 376 copyASCII(rel.characters(), len, strBuffer.data()); 377 strBuffer[len] = 0; 378 str = strBuffer.data(); 379 } else { 380 originalString = 0; 381 encodeRelativeString(rel, encoding, strBuffer); 382 str = strBuffer.data(); 383 len = strlen(str); 384 } 385 386 // Get rid of leading whitespace and control characters. 387 while (len && shouldTrimFromURL(*str)) { 388 originalString = 0; 389 str++; 390 --len; 391 } 392 393 // Get rid of trailing whitespace and control characters. 394 while (len && shouldTrimFromURL(str[len - 1])) { 395 originalString = 0; 396 str[--len] = '\0'; 397 } 398 399 // According to the RFC, the reference should be interpreted as an 400 // absolute URI if possible, using the "leftmost, longest" 401 // algorithm. If the URI reference is absolute it will have a 402 // scheme, meaning that it will have a colon before the first 403 // non-scheme element. 404 bool absolute = false; 405 char* p = str; 406 if (isSchemeFirstChar(*p)) { 407 ++p; 408 while (isSchemeChar(*p)) { 409 ++p; 410 } 411 if (*p == ':') { 412 if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) { 413 str = p + 1; 414 originalString = 0; 415 } else 416 absolute = true; 417 } 418 } 419 420 CharBuffer parseBuffer; 421 422 if (absolute) { 423 parse(str, originalString); 424 } else { 425 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid 426 // unless the relative URL is a single fragment. 
427 if (!base.isHierarchical()) { 428 if (str[0] == '#') { 429 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 430 parse(parseBuffer.data(), 0); 431 } else { 432 m_string = relative; 433 invalidate(); 434 } 435 return; 436 } 437 438 switch (str[0]) { 439 case '\0': 440 // The reference is empty, so this is a reference to the same document with any fragment identifier removed. 441 *this = base; 442 removeFragmentIdentifier(); 443 break; 444 case '#': { 445 // must be fragment-only reference 446 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 447 parse(parseBuffer.data(), 0); 448 break; 449 } 450 case '?': { 451 // query-only reference, special case needed for non-URL results 452 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer); 453 parse(parseBuffer.data(), 0); 454 break; 455 } 456 case '/': 457 // must be net-path or absolute-path reference 458 if (str[1] == '/') { 459 // net-path 460 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer); 461 parse(parseBuffer.data(), 0); 462 } else { 463 // abs-path 464 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer); 465 parse(parseBuffer.data(), 0); 466 } 467 break; 468 default: 469 { 470 // must be relative-path reference 471 472 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte. 
473 parseBuffer.resize(base.m_pathEnd + 1 + len + 1); 474 475 char* bufferPos = parseBuffer.data(); 476 477 // first copy everything before the path from the base 478 unsigned baseLength = base.m_string.length(); 479 const UChar* baseCharacters = base.m_string.characters(); 480 CharBuffer baseStringBuffer(baseLength); 481 copyASCII(baseCharacters, baseLength, baseStringBuffer.data()); 482 const char* baseString = baseStringBuffer.data(); 483 const char* baseStringStart = baseString; 484 const char* pathStart = baseStringStart + base.m_portEnd; 485 while (baseStringStart < pathStart) 486 *bufferPos++ = *baseStringStart++; 487 char* bufferPathStart = bufferPos; 488 489 // now copy the base path 490 const char* baseStringEnd = baseString + base.m_pathEnd; 491 492 // go back to the last slash 493 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/') 494 baseStringEnd--; 495 496 if (baseStringEnd == baseStringStart) { 497 // no path in base, add a path separator if necessary 498 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#') 499 *bufferPos++ = '/'; 500 } else { 501 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart); 502 } 503 504 const char* relStringStart = str; 505 const char* relStringPos = relStringStart; 506 507 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') { 508 if (relStringPos[0] == '.' && bufferPos[-1] == '/') { 509 if (isPathSegmentEndChar(relStringPos[1])) { 510 // skip over "." segment 511 relStringPos += 1; 512 if (relStringPos[0] == '/') 513 relStringPos++; 514 continue; 515 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) { 516 // skip over ".." segment and rewind the last segment 517 // the RFC leaves it up to the app to decide what to do with excess 518 // ".." segments - we choose to drop them since some web content 519 // relies on this. 
520 relStringPos += 2; 521 if (relStringPos[0] == '/') 522 relStringPos++; 523 if (bufferPos > bufferPathStart + 1) 524 bufferPos--; 525 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/') 526 bufferPos--; 527 continue; 528 } 529 } 530 531 *bufferPos = *relStringPos; 532 relStringPos++; 533 bufferPos++; 534 } 535 536 // all done with the path work, now copy any remainder 537 // of the relative reference; this will also add a null terminator 538 strcpy(bufferPos, relStringPos); 539 540 parse(parseBuffer.data(), 0); 541 542 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size()); 543 break; 544 } 545 } 546 } 547 } 548 549 KURL KURL::copy() const 550 { 551 KURL result = *this; 552 result.m_string = result.m_string.crossThreadString(); 553 return result; 554 } 555 556 bool KURL::hasPath() const 557 { 558 return m_pathEnd != m_portEnd; 559 } 560 561 String KURL::lastPathComponent() const 562 { 563 if (!hasPath()) 564 return String(); 565 566 unsigned end = m_pathEnd - 1; 567 if (m_string[end] == '/') 568 --end; 569 570 size_t start = m_string.reverseFind('/', end); 571 if (start < static_cast<unsigned>(m_portEnd)) 572 return String(); 573 ++start; 574 575 return m_string.substring(start, end - start + 1); 576 } 577 578 String KURL::protocol() const 579 { 580 return m_string.left(m_schemeEnd); 581 } 582 583 String KURL::host() const 584 { 585 int start = hostStart(); 586 return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start)); 587 } 588 589 unsigned short KURL::port() const 590 { 591 // We return a port of 0 if there is no port specified. This can happen in two situations: 592 // 1) The URL contains no colon after the host name and before the path component of the URL. 593 // 2) The URL contains a colon but there's no port number before the path component of the URL begins. 
594 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1) 595 return 0; 596 597 const UChar* stringData = m_string.characters(); 598 bool ok = false; 599 unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); 600 if (!ok || number > maximumValidPortNumber) 601 return invalidPortNumber; 602 return number; 603 } 604 605 String KURL::pass() const 606 { 607 if (m_passwordEnd == m_userEnd) 608 return String(); 609 610 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1)); 611 } 612 613 String KURL::user() const 614 { 615 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart)); 616 } 617 618 String KURL::fragmentIdentifier() const 619 { 620 if (m_fragmentEnd == m_queryEnd) 621 return String(); 622 623 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1)); 624 } 625 626 bool KURL::hasFragmentIdentifier() const 627 { 628 return m_fragmentEnd != m_queryEnd; 629 } 630 631 void KURL::copyParsedQueryTo(ParsedURLParameters& parameters) const 632 { 633 const UChar* pos = m_string.characters() + m_pathEnd + 1; 634 const UChar* end = m_string.characters() + m_queryEnd; 635 while (pos < end) { 636 const UChar* parameterStart = pos; 637 while (pos < end && *pos != '&') 638 ++pos; 639 const UChar* parameterEnd = pos; 640 if (pos < end) { 641 ASSERT(*pos == '&'); 642 ++pos; 643 } 644 if (parameterStart == parameterEnd) 645 continue; 646 const UChar* nameStart = parameterStart; 647 const UChar* equalSign = parameterStart; 648 while (equalSign < parameterEnd && *equalSign != '=') 649 ++equalSign; 650 if (equalSign == nameStart) 651 continue; 652 String name(nameStart, equalSign - nameStart); 653 String value = equalSign == parameterEnd ? 
String() : String(equalSign + 1, parameterEnd - equalSign - 1); 654 parameters.set(name, value); 655 } 656 } 657 658 String KURL::baseAsString() const 659 { 660 return m_string.left(m_pathAfterLastSlash); 661 } 662 663 #ifdef NDEBUG 664 665 static inline void assertProtocolIsGood(const char*) 666 { 667 } 668 669 #else 670 671 static void assertProtocolIsGood(const char* protocol) 672 { 673 const char* p = protocol; 674 while (*p) { 675 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 676 ++p; 677 } 678 } 679 680 #endif 681 682 bool KURL::protocolIs(const char* protocol) const 683 { 684 assertProtocolIsGood(protocol); 685 686 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. 687 // The free function protocolIsJavaScript() should be used instead. 688 ASSERT(!equalIgnoringCase(protocol, String("javascript"))); 689 690 if (!m_isValid) 691 return false; 692 693 // Do the comparison without making a new string object. 694 for (int i = 0; i < m_schemeEnd; ++i) { 695 if (!protocol[i] || toASCIILower(m_string[i]) != protocol[i]) 696 return false; 697 } 698 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument. 699 } 700 701 String KURL::query() const 702 { 703 if (m_queryEnd == m_pathEnd) 704 return String(); 705 706 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); 707 } 708 709 String KURL::path() const 710 { 711 return decodeURLEscapeSequences(m_string.substring(m_portEnd, m_pathEnd - m_portEnd)); 712 } 713 714 bool KURL::setProtocol(const String& s) 715 { 716 // Firefox and IE remove everything after the first ':'. 
717 size_t separatorPosition = s.find(':'); 718 String newProtocol = s.substring(0, separatorPosition); 719 720 if (!isValidProtocol(newProtocol)) 721 return false; 722 723 if (!m_isValid) { 724 parse(newProtocol + ":" + m_string); 725 return true; 726 } 727 728 parse(newProtocol + m_string.substring(m_schemeEnd)); 729 return true; 730 } 731 732 void KURL::setHost(const String& s) 733 { 734 if (!m_isValid) 735 return; 736 737 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 738 // and to avoid changing more than just the host. 739 740 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 741 742 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd)); 743 } 744 745 void KURL::removePort() 746 { 747 if (m_hostEnd == m_portEnd) 748 return; 749 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd)); 750 } 751 752 void KURL::setPort(unsigned short i) 753 { 754 if (!m_isValid) 755 return; 756 757 bool colonNeeded = m_portEnd == m_hostEnd; 758 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1); 759 760 parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd)); 761 } 762 763 void KURL::setHostAndPort(const String& hostAndPort) 764 { 765 if (!m_isValid) 766 return; 767 768 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 769 // and to avoid changing more than just host and port. 770 771 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 772 773 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd)); 774 } 775 776 void KURL::setUser(const String& user) 777 { 778 if (!m_isValid) 779 return; 780 781 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 782 // and to avoid changing more than just the user login. 
783 String u; 784 int end = m_userEnd; 785 if (!user.isEmpty()) { 786 u = user; 787 if (m_userStart == m_schemeEnd + 1) 788 u = "//" + u; 789 // Add '@' if we didn't have one before. 790 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@')) 791 u.append('@'); 792 } else { 793 // Remove '@' if we now have neither user nor password. 794 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@') 795 end += 1; 796 } 797 parse(m_string.left(m_userStart) + u + m_string.substring(end)); 798 } 799 800 void KURL::setPass(const String& password) 801 { 802 if (!m_isValid) 803 return; 804 805 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 806 // and to avoid changing more than just the user password. 807 String p; 808 int end = m_passwordEnd; 809 if (!password.isEmpty()) { 810 p = ":" + password + "@"; 811 if (m_userEnd == m_schemeEnd + 1) 812 p = "//" + p; 813 // Eat the existing '@' since we are going to add our own. 814 if (end != m_hostEnd && m_string[end] == '@') 815 end += 1; 816 } else { 817 // Remove '@' if we now have neither user nor password. 818 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@') 819 end += 1; 820 } 821 parse(m_string.left(m_userEnd) + p + m_string.substring(end)); 822 } 823 824 void KURL::setFragmentIdentifier(const String& s) 825 { 826 if (!m_isValid) 827 return; 828 829 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations. 830 parse(m_string.left(m_queryEnd) + "#" + s); 831 } 832 833 void KURL::removeFragmentIdentifier() 834 { 835 if (!m_isValid) 836 return; 837 parse(m_string.left(m_queryEnd)); 838 } 839 840 void KURL::setQuery(const String& query) 841 { 842 if (!m_isValid) 843 return; 844 845 // FIXME: '#' and non-ASCII characters must be encoded and escaped. 846 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have 847 // access to the document in this function. 
848 if ((query.isEmpty() || query[0] != '?') && !query.isNull()) 849 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd)); 850 else 851 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd)); 852 853 } 854 855 void KURL::setPath(const String& s) 856 { 857 if (!m_isValid) 858 return; 859 860 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts 861 // may be inadvertently affected. 862 String path = s; 863 if (path.isEmpty() || path[0] != '/') 864 path = "/" + path; 865 866 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd)); 867 } 868 869 String KURL::prettyURL() const 870 { 871 if (!m_isValid) 872 return m_string; 873 874 Vector<UChar> result; 875 876 append(result, protocol()); 877 result.append(':'); 878 879 Vector<UChar> authority; 880 881 if (m_hostEnd != m_passwordEnd) { 882 if (m_userEnd != m_userStart) { 883 append(authority, user()); 884 authority.append('@'); 885 } 886 append(authority, host()); 887 if (hasPort()) { 888 authority.append(':'); 889 append(authority, String::number(port())); 890 } 891 } 892 893 if (!authority.isEmpty()) { 894 result.append('/'); 895 result.append('/'); 896 result.append(authority); 897 } else if (protocolIs("file")) { 898 result.append('/'); 899 result.append('/'); 900 } 901 902 append(result, path()); 903 904 if (m_pathEnd != m_queryEnd) { 905 result.append('?'); 906 append(result, query()); 907 } 908 909 if (m_fragmentEnd != m_queryEnd) { 910 result.append('#'); 911 append(result, fragmentIdentifier()); 912 } 913 914 return String::adopt(result); 915 } 916 917 String decodeURLEscapeSequences(const String& str) 918 { 919 return decodeURLEscapeSequences(str, UTF8Encoding()); 920 } 921 922 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) 923 { 924 Vector<UChar> result; 925 926 CharBuffer buffer; 927 928 unsigned length = str.length(); 929 unsigned 
decodedPosition = 0; 930 unsigned searchPosition = 0; 931 size_t encodedRunPosition; 932 while ((encodedRunPosition = str.find('%', searchPosition)) != notFound) { 933 // Find the sequence of %-escape codes. 934 unsigned encodedRunEnd = encodedRunPosition; 935 while (length - encodedRunEnd >= 3 936 && str[encodedRunEnd] == '%' 937 && isASCIIHexDigit(str[encodedRunEnd + 1]) 938 && isASCIIHexDigit(str[encodedRunEnd + 2])) 939 encodedRunEnd += 3; 940 searchPosition = encodedRunEnd; 941 if (encodedRunEnd == encodedRunPosition) { 942 ++searchPosition; 943 continue; 944 } 945 946 // Decode the %-escapes into bytes. 947 unsigned runLength = (encodedRunEnd - encodedRunPosition) / 3; 948 buffer.resize(runLength); 949 char* p = buffer.data(); 950 const UChar* q = str.characters() + encodedRunPosition; 951 for (unsigned i = 0; i < runLength; ++i) { 952 *p++ = (hexDigitValue(q[1]) << 4) | hexDigitValue(q[2]); 953 q += 3; 954 } 955 956 // Decode the bytes into Unicode characters. 957 String decoded = (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data()); 958 if (decoded.isEmpty()) 959 continue; 960 961 // Build up the string with what we just skipped and what we just decoded. 962 result.append(str.characters() + decodedPosition, encodedRunPosition - decodedPosition); 963 result.append(decoded.characters(), decoded.length()); 964 decodedPosition = encodedRunEnd; 965 } 966 967 result.append(str.characters() + decodedPosition, length - decodedPosition); 968 969 return String::adopt(result); 970 } 971 972 // Caution: This function does not bounds check. 
973 static void appendEscapedChar(char*& buffer, unsigned char c) 974 { 975 *buffer++ = '%'; 976 placeByteAsHex(c, buffer); 977 } 978 979 static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length) 980 { 981 char* p = buffer; 982 983 const char* str = strStart; 984 const char* strEnd = strStart + length; 985 while (str < strEnd) { 986 unsigned char c = *str++; 987 if (isBadChar(c)) { 988 if (c == '%' || c == '?') 989 *p++ = c; 990 else if (c != 0x09 && c != 0x0a && c != 0x0d) 991 appendEscapedChar(p, c); 992 } else 993 *p++ = c; 994 } 995 996 buffer = p; 997 } 998 999 static void escapeAndAppendFragment(char*& buffer, const char* strStart, size_t length) 1000 { 1001 char* p = buffer; 1002 1003 const char* str = strStart; 1004 const char* strEnd = strStart + length; 1005 while (str < strEnd) { 1006 unsigned char c = *str++; 1007 // Strip CR, LF and Tab from fragments, per: 1008 // https://bugs.webkit.org/show_bug.cgi?id=8770 1009 if (c == 0x09 || c == 0x0a || c == 0x0d) 1010 continue; 1011 1012 // Chrome and IE allow non-ascii characters in fragments, however doing 1013 // so would hit an ASSERT in checkEncodedString, so for now we don't. 1014 if (c < 0x20 || c >= 127) { 1015 appendEscapedChar(p, c); 1016 continue; 1017 } 1018 *p++ = c; 1019 } 1020 1021 buffer = p; 1022 } 1023 1024 // copy a path, accounting for "." and ".." 
segments 1025 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd) 1026 { 1027 char* bufferPathStart = dst; 1028 1029 // empty path is a special case, and need not have a leading slash 1030 if (srcStart != srcEnd) { 1031 const char* baseStringStart = src + srcStart; 1032 const char* baseStringEnd = src + srcEnd; 1033 const char* baseStringPos = baseStringStart; 1034 1035 // this code is unprepared for paths that do not begin with a 1036 // slash and we should always have one in the source string 1037 ASSERT(baseStringPos[0] == '/'); 1038 1039 // copy the leading slash into the destination 1040 *dst = *baseStringPos; 1041 baseStringPos++; 1042 dst++; 1043 1044 while (baseStringPos < baseStringEnd) { 1045 if (baseStringPos[0] == '.' && dst[-1] == '/') { 1046 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) { 1047 // skip over "." segment 1048 baseStringPos += 2; 1049 continue; 1050 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' || 1051 baseStringPos + 2 == baseStringEnd)) { 1052 // skip over ".." segment and rewind the last segment 1053 // the RFC leaves it up to the app to decide what to do with excess 1054 // ".." segments - we choose to drop them since some web content 1055 // relies on this. 1056 baseStringPos += 3; 1057 if (dst > bufferPathStart + 1) 1058 dst--; 1059 while (dst > bufferPathStart && dst[-1] != '/') 1060 dst--; 1061 continue; 1062 } 1063 } 1064 1065 *dst = *baseStringPos; 1066 baseStringPos++; 1067 dst++; 1068 } 1069 } 1070 *dst = '\0'; 1071 return dst - bufferPathStart; 1072 } 1073 1074 static inline bool hasSlashDotOrDotDot(const char* str) 1075 { 1076 const unsigned char* p = reinterpret_cast<const unsigned char*>(str); 1077 if (!*p) 1078 return false; 1079 unsigned char pc = *p; 1080 while (unsigned char c = *++p) { 1081 if (c == '.' 
&& (pc == '/' || pc == '.')) 1082 return true; 1083 pc = c; 1084 } 1085 return false; 1086 } 1087 1088 static inline bool matchLetter(char c, char lowercaseLetter) 1089 { 1090 return (c | 0x20) == lowercaseLetter; 1091 } 1092 1093 void KURL::parse(const String& string) 1094 { 1095 checkEncodedString(string); 1096 1097 CharBuffer buffer(string.length() + 1); 1098 copyASCII(string.characters(), string.length(), buffer.data()); 1099 buffer[string.length()] = '\0'; 1100 parse(buffer.data(), &string); 1101 } 1102 1103 static inline bool equal(const char* a, size_t lenA, const char* b, size_t lenB) 1104 { 1105 if (lenA != lenB) 1106 return false; 1107 return !strncmp(a, b, lenA); 1108 } 1109 1110 // List of default schemes is taken from google-url: 1111 // http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120 1112 static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength) 1113 { 1114 // This switch is theoretically a performance optimization. It came over when 1115 // the code was moved from google-url, but may be removed later. 
1116 switch (schemeLength) { 1117 case 2: 1118 return equal("ws", 2, scheme, schemeLength) && equal("80", 2, port, portLength); 1119 case 3: 1120 if (equal("ftp", 3, scheme, schemeLength)) 1121 return equal("21", 2, port, portLength); 1122 if (equal("wss", 3, scheme, schemeLength)) 1123 return equal("443", 3, port, portLength); 1124 break; 1125 case 4: 1126 return equal("http", 4, scheme, schemeLength) && equal("80", 2, port, portLength); 1127 case 5: 1128 return equal("https", 5, scheme, schemeLength) && equal("443", 3, port, portLength); 1129 case 6: 1130 return equal("gopher", 6, scheme, schemeLength) && equal("70", 2, port, portLength); 1131 } 1132 return false; 1133 } 1134 1135 static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userEndChar) 1136 { 1137 return userEndChar == '@' && hostStart == portEnd; 1138 } 1139 1140 void KURL::parse(const char* url, const String* originalString) 1141 { 1142 if (!url || url[0] == '\0') { 1143 // valid URL must be non-empty 1144 m_string = originalString ? *originalString : url; 1145 invalidate(); 1146 return; 1147 } 1148 1149 if (!isSchemeFirstChar(url[0])) { 1150 // scheme must start with an alphabetic character 1151 m_string = originalString ? *originalString : url; 1152 invalidate(); 1153 return; 1154 } 1155 1156 int schemeEnd = 0; 1157 while (isSchemeChar(url[schemeEnd])) 1158 schemeEnd++; 1159 1160 if (url[schemeEnd] != ':') { 1161 m_string = originalString ? 
*originalString : url; 1162 invalidate(); 1163 return; 1164 } 1165 1166 int userStart = schemeEnd + 1; 1167 int userEnd; 1168 int passwordStart; 1169 int passwordEnd; 1170 int hostStart; 1171 int hostEnd; 1172 int portStart; 1173 int portEnd; 1174 1175 bool hierarchical = url[schemeEnd + 1] == '/'; 1176 1177 bool isFile = schemeEnd == 4 1178 && matchLetter(url[0], 'f') 1179 && matchLetter(url[1], 'i') 1180 && matchLetter(url[2], 'l') 1181 && matchLetter(url[3], 'e'); 1182 1183 m_protocolInHTTPFamily = matchLetter(url[0], 'h') 1184 && matchLetter(url[1], 't') 1185 && matchLetter(url[2], 't') 1186 && matchLetter(url[3], 'p') 1187 && (url[4] == ':' || (matchLetter(url[4], 's') && url[5] == ':')); 1188 1189 if (hierarchical && url[schemeEnd + 2] == '/') { 1190 // The part after the scheme is either a net_path or an abs_path whose first path segment is empty. 1191 // Attempt to find an authority. 1192 1193 // FIXME: Authority characters may be scanned twice, and it would be nice to be faster. 1194 userStart += 2; 1195 userEnd = userStart; 1196 1197 int colonPos = 0; 1198 while (isUserInfoChar(url[userEnd])) { 1199 if (url[userEnd] == ':' && colonPos == 0) 1200 colonPos = userEnd; 1201 userEnd++; 1202 } 1203 1204 if (url[userEnd] == '@') { 1205 // actual end of the userinfo, start on the host 1206 if (colonPos != 0) { 1207 passwordEnd = userEnd; 1208 userEnd = colonPos; 1209 passwordStart = colonPos + 1; 1210 } else 1211 passwordStart = passwordEnd = userEnd; 1212 1213 hostStart = passwordEnd + 1; 1214 } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) { 1215 // hit the end of the authority, must have been no user 1216 // or looks like an IPv6 hostname 1217 // either way, try to parse it as a hostname 1218 userEnd = userStart; 1219 passwordStart = passwordEnd = userEnd; 1220 hostStart = userStart; 1221 } else { 1222 // invalid character 1223 m_string = originalString ? 
*originalString : url; 1224 invalidate(); 1225 return; 1226 } 1227 1228 hostEnd = hostStart; 1229 1230 // IPV6 IP address 1231 if (url[hostEnd] == '[') { 1232 hostEnd++; 1233 while (isIPv6Char(url[hostEnd])) 1234 hostEnd++; 1235 if (url[hostEnd] == ']') 1236 hostEnd++; 1237 else { 1238 // invalid character 1239 m_string = originalString ? *originalString : url; 1240 invalidate(); 1241 return; 1242 } 1243 } else { 1244 while (isHostnameChar(url[hostEnd])) 1245 hostEnd++; 1246 } 1247 1248 if (url[hostEnd] == ':') { 1249 portStart = portEnd = hostEnd + 1; 1250 1251 // possible start of port 1252 portEnd = portStart; 1253 while (isASCIIDigit(url[portEnd])) 1254 portEnd++; 1255 } else 1256 portStart = portEnd = hostEnd; 1257 1258 if (!isPathSegmentEndChar(url[portEnd])) { 1259 // invalid character 1260 m_string = originalString ? *originalString : url; 1261 invalidate(); 1262 return; 1263 } 1264 1265 if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[userEnd])) { 1266 // in this circumstance, act as if there is an erroneous hostname containing an '@' 1267 userEnd = userStart; 1268 hostStart = userEnd; 1269 } 1270 1271 if (userStart == portEnd && !m_protocolInHTTPFamily && !isFile) { 1272 // No authority found, which means that this is not a net_path, but rather an abs_path whose first two 1273 // path segments are empty. For file, http and https only, an empty authority is allowed. 1274 userStart -= 2; 1275 userEnd = userStart; 1276 passwordStart = userEnd; 1277 passwordEnd = passwordStart; 1278 hostStart = passwordEnd; 1279 hostEnd = hostStart; 1280 portStart = hostEnd; 1281 portEnd = hostEnd; 1282 } 1283 } else { 1284 // the part after the scheme must be an opaque_part or an abs_path 1285 userEnd = userStart; 1286 passwordStart = passwordEnd = userEnd; 1287 hostStart = hostEnd = passwordEnd; 1288 portStart = portEnd = hostEnd; 1289 } 1290 1291 int pathStart = portEnd; 1292 int pathEnd = pathStart; 1293 while (url[pathEnd] && url[pathEnd] != '?' 
&& url[pathEnd] != '#') 1294 pathEnd++; 1295 1296 int queryStart = pathEnd; 1297 int queryEnd = queryStart; 1298 if (url[queryStart] == '?') { 1299 while (url[queryEnd] && url[queryEnd] != '#') 1300 queryEnd++; 1301 } 1302 1303 int fragmentStart = queryEnd; 1304 int fragmentEnd = fragmentStart; 1305 if (url[fragmentStart] == '#') { 1306 fragmentStart++; 1307 fragmentEnd = fragmentStart; 1308 while (url[fragmentEnd]) 1309 fragmentEnd++; 1310 } 1311 1312 // assemble it all, remembering the real ranges 1313 1314 Vector<char, 4096> buffer(fragmentEnd * 3 + 1); 1315 1316 char *p = buffer.data(); 1317 const char *strPtr = url; 1318 1319 // copy in the scheme 1320 const char *schemeEndPtr = url + schemeEnd; 1321 while (strPtr < schemeEndPtr) 1322 *p++ = toASCIILower(*strPtr++); 1323 m_schemeEnd = p - buffer.data(); 1324 1325 bool hostIsLocalHost = portEnd - userStart == 9 1326 && matchLetter(url[userStart], 'l') 1327 && matchLetter(url[userStart+1], 'o') 1328 && matchLetter(url[userStart+2], 'c') 1329 && matchLetter(url[userStart+3], 'a') 1330 && matchLetter(url[userStart+4], 'l') 1331 && matchLetter(url[userStart+5], 'h') 1332 && matchLetter(url[userStart+6], 'o') 1333 && matchLetter(url[userStart+7], 's') 1334 && matchLetter(url[userStart+8], 't'); 1335 1336 // File URLs need a host part unless it is just file:// or file://localhost 1337 bool degenFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost); 1338 1339 bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || portStart != portEnd; 1340 1341 // add ":" after scheme 1342 *p++ = ':'; 1343 1344 // if we have at least one authority part or a file URL - add "//" and authority 1345 if (isFile ? 
!degenFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) { 1346 *p++ = '/'; 1347 *p++ = '/'; 1348 1349 m_userStart = p - buffer.data(); 1350 1351 // copy in the user 1352 strPtr = url + userStart; 1353 const char* userEndPtr = url + userEnd; 1354 while (strPtr < userEndPtr) 1355 *p++ = *strPtr++; 1356 m_userEnd = p - buffer.data(); 1357 1358 // copy in the password 1359 if (passwordEnd != passwordStart) { 1360 *p++ = ':'; 1361 strPtr = url + passwordStart; 1362 const char* passwordEndPtr = url + passwordEnd; 1363 while (strPtr < passwordEndPtr) 1364 *p++ = *strPtr++; 1365 } 1366 m_passwordEnd = p - buffer.data(); 1367 1368 // If we had any user info, add "@" 1369 if (p - buffer.data() != m_userStart) 1370 *p++ = '@'; 1371 1372 // copy in the host, except in the case of a file URL with authority="localhost" 1373 if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) { 1374 strPtr = url + hostStart; 1375 const char* hostEndPtr = url + hostEnd; 1376 while (strPtr < hostEndPtr) 1377 *p++ = *strPtr++; 1378 } 1379 m_hostEnd = p - buffer.data(); 1380 1381 // Copy in the port if the URL has one (and it's not default). 1382 if (hostEnd != portStart) { 1383 const char* portStr = url + portStart; 1384 size_t portLength = portEnd - portStart; 1385 if (portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd)) { 1386 *p++ = ':'; 1387 const char* portEndPtr = url + portEnd; 1388 while (portStr < portEndPtr) 1389 *p++ = *portStr++; 1390 } 1391 } 1392 m_portEnd = p - buffer.data(); 1393 } else 1394 m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data(); 1395 1396 // For canonicalization, ensure we have a '/' for no path. 1397 // Do this only for hierarchical URL with protocol http or https. 
1398 if (m_protocolInHTTPFamily && hierarchical && pathEnd == pathStart) 1399 *p++ = '/'; 1400 1401 // add path, escaping bad characters 1402 if (!hierarchical || !hasSlashDotOrDotDot(url)) 1403 appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart); 1404 else { 1405 CharBuffer pathBuffer(pathEnd - pathStart + 1); 1406 size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd); 1407 appendEscapingBadChars(p, pathBuffer.data(), length); 1408 } 1409 1410 m_pathEnd = p - buffer.data(); 1411 1412 // Find the position after the last slash in the path, or 1413 // the position before the path if there are no slashes in it. 1414 int i; 1415 for (i = m_pathEnd; i > m_portEnd; --i) { 1416 if (buffer[i - 1] == '/') 1417 break; 1418 } 1419 m_pathAfterLastSlash = i; 1420 1421 // add query, escaping bad characters 1422 appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart); 1423 m_queryEnd = p - buffer.data(); 1424 1425 // add fragment, escaping bad characters 1426 if (fragmentEnd != queryEnd) { 1427 *p++ = '#'; 1428 escapeAndAppendFragment(p, url + fragmentStart, fragmentEnd - fragmentStart); 1429 } 1430 m_fragmentEnd = p - buffer.data(); 1431 1432 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); 1433 1434 // If we didn't end up actually changing the original string and 1435 // it was already in a String, reuse it to avoid extra allocation. 
1436 if (originalString && originalString->length() == static_cast<unsigned>(m_fragmentEnd) && strncmp(buffer.data(), url, m_fragmentEnd) == 0) 1437 m_string = *originalString; 1438 else 1439 m_string = String(buffer.data(), m_fragmentEnd); 1440 1441 m_isValid = true; 1442 } 1443 1444 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) 1445 { 1446 if (a.m_queryEnd != b.m_queryEnd) 1447 return false; 1448 unsigned queryLength = a.m_queryEnd; 1449 for (unsigned i = 0; i < queryLength; ++i) 1450 if (a.string()[i] != b.string()[i]) 1451 return false; 1452 return true; 1453 } 1454 1455 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b) 1456 { 1457 if (a.m_schemeEnd != b.m_schemeEnd) 1458 return false; 1459 1460 int hostStartA = a.hostStart(); 1461 int hostLengthA = a.hostEnd() - hostStartA; 1462 int hostStartB = b.hostStart(); 1463 int hostLengthB = b.hostEnd() - b.hostStart(); 1464 if (hostLengthA != hostLengthB) 1465 return false; 1466 1467 // Check the scheme 1468 for (int i = 0; i < a.m_schemeEnd; ++i) 1469 if (a.string()[i] != b.string()[i]) 1470 return false; 1471 1472 // And the host 1473 for (int i = 0; i < hostLengthA; ++i) 1474 if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) 1475 return false; 1476 1477 if (a.port() != b.port()) 1478 return false; 1479 1480 return true; 1481 } 1482 1483 String encodeWithURLEscapeSequences(const String& notEncodedString) 1484 { 1485 CString asUTF8 = notEncodedString.utf8(); 1486 1487 CharBuffer buffer(asUTF8.length() * 3 + 1); 1488 char* p = buffer.data(); 1489 1490 const char* str = asUTF8.data(); 1491 const char* strEnd = str + asUTF8.length(); 1492 while (str < strEnd) { 1493 unsigned char c = *str++; 1494 if (isBadChar(c)) 1495 appendEscapedChar(p, c); 1496 else 1497 *p++ = c; 1498 } 1499 1500 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); 1501 1502 return String(buffer.data(), p - buffer.data()); 1503 } 1504 1505 // Appends the punycoded hostname identified by the 
given string and length to 1506 // the output buffer. The result will not be null terminated. 1507 static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen) 1508 { 1509 // Needs to be big enough to hold an IDN-encoded name. 1510 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. 1511 const unsigned hostnameBufferLength = 2048; 1512 1513 if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) { 1514 buffer.append(str, strLen); 1515 return; 1516 } 1517 1518 #if USE(ICU_UNICODE) 1519 UChar hostnameBuffer[hostnameBufferLength]; 1520 UErrorCode error = U_ZERO_ERROR; 1521 int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer, 1522 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error); 1523 if (error == U_ZERO_ERROR) 1524 buffer.append(hostnameBuffer, numCharactersConverted); 1525 #elif USE(QT4_UNICODE) 1526 QByteArray result = QUrl::toAce(String(str, strLen)); 1527 buffer.append(result.constData(), result.length()); 1528 #elif USE(GLIB_UNICODE) 1529 GOwnPtr<gchar> utf8Hostname; 1530 GOwnPtr<GError> utf8Err; 1531 utf8Hostname.set(g_utf16_to_utf8(str, strLen, 0, 0, &utf8Err.outPtr())); 1532 if (utf8Err) 1533 return; 1534 1535 GOwnPtr<gchar> encodedHostname; 1536 encodedHostname.set(g_hostname_to_ascii(utf8Hostname.get())); 1537 if (!encodedHostname) 1538 return; 1539 1540 buffer.append(encodedHostname.get(), strlen(encodedHostname.get())); 1541 #endif 1542 } 1543 1544 static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges) 1545 { 1546 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character. 1547 // Skip quoted strings so that characters in them don't confuse us. 1548 // When we find a '?' character, we are past the part of the URL that contains host names. 
1549 1550 nameRanges.clear(); 1551 1552 int p = 0; 1553 while (1) { 1554 // Find start of host name or of quoted string. 1555 int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?"); 1556 if (hostnameOrStringStart == -1) 1557 return; 1558 UChar c = str[hostnameOrStringStart]; 1559 p = hostnameOrStringStart + 1; 1560 1561 if (c == '?') 1562 return; 1563 1564 if (c == '@') { 1565 // Find end of host name. 1566 int hostnameStart = p; 1567 int hostnameEnd = findFirstOf(str, strLen, p, ">,?"); 1568 bool done; 1569 if (hostnameEnd == -1) { 1570 hostnameEnd = strLen; 1571 done = true; 1572 } else { 1573 p = hostnameEnd; 1574 done = false; 1575 } 1576 1577 nameRanges.append(make_pair(hostnameStart, hostnameEnd)); 1578 1579 if (done) 1580 return; 1581 } else { 1582 // Skip quoted string. 1583 ASSERT(c == '"'); 1584 while (1) { 1585 int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\"); 1586 if (escapedCharacterOrStringEnd == -1) 1587 return; 1588 1589 c = str[escapedCharacterOrStringEnd]; 1590 p = escapedCharacterOrStringEnd + 1; 1591 1592 // If we are the end of the string, then break from the string loop back to the host name loop. 1593 if (c == '"') 1594 break; 1595 1596 // Skip escaped character. 1597 ASSERT(c == '\\'); 1598 if (p == strLen) 1599 return; 1600 1601 ++p; 1602 } 1603 } 1604 } 1605 } 1606 1607 static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset) 1608 { 1609 // Find the host name in a hierarchical URL. 1610 // It comes after a "://" sequence, with scheme characters preceding, and 1611 // this should be the first colon in the string. 1612 // It ends with the end of the string or a ":" or a path segment ending character. 1613 // If there is a "@" character, the host part is just the part after the "@". 
1614 int separator = findFirstOf(str, strLen, 0, ":"); 1615 if (separator == -1 || separator + 2 >= strLen || 1616 str[separator + 1] != '/' || str[separator + 2] != '/') 1617 return false; 1618 1619 // Check that all characters before the :// are valid scheme characters. 1620 if (!isSchemeFirstChar(str[0])) 1621 return false; 1622 for (int i = 1; i < separator; ++i) { 1623 if (!isSchemeChar(str[i])) 1624 return false; 1625 } 1626 1627 // Start after the separator. 1628 int authorityStart = separator + 3; 1629 1630 // Find terminating character. 1631 int hostnameEnd = strLen; 1632 for (int i = authorityStart; i < strLen; ++i) { 1633 UChar c = str[i]; 1634 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) { 1635 hostnameEnd = i; 1636 break; 1637 } 1638 } 1639 1640 // Find "@" for the start of the host name. 1641 int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@"); 1642 int hostnameStart; 1643 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd) 1644 hostnameStart = authorityStart; 1645 else 1646 hostnameStart = userInfoTerminator + 1; 1647 1648 startOffset = hostnameStart; 1649 endOffset = hostnameEnd; 1650 return true; 1651 } 1652 1653 // Converts all hostnames found in the given input to punycode, preserving the 1654 // rest of the URL unchanged. The output will NOT be null-terminated. 
1655 static void encodeHostnames(const String& str, UCharBuffer& output) 1656 { 1657 output.clear(); 1658 1659 if (protocolIs(str, "mailto")) { 1660 Vector<pair<int, int> > hostnameRanges; 1661 findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges); 1662 int n = hostnameRanges.size(); 1663 int p = 0; 1664 for (int i = 0; i < n; ++i) { 1665 const pair<int, int>& r = hostnameRanges[i]; 1666 output.append(&str.characters()[p], r.first - p); 1667 appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first); 1668 p = r.second; 1669 } 1670 // This will copy either everything after the last hostname, or the 1671 // whole thing if there is no hostname. 1672 output.append(&str.characters()[p], str.length() - p); 1673 } else { 1674 int hostStart, hostEnd; 1675 if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) { 1676 output.append(str.characters(), hostStart); // Before hostname. 1677 appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart); 1678 output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname. 1679 } else { 1680 // No hostname to encode, return the input. 1681 output.append(str.characters(), str.length()); 1682 } 1683 } 1684 } 1685 1686 static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output) 1687 { 1688 UCharBuffer s; 1689 encodeHostnames(rel, s); 1690 1691 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme. 1692 1693 int pathEnd = -1; 1694 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) { 1695 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise. 
1696 pathEnd = findFirstOf(s.data(), s.size(), 0, "#?"); 1697 } 1698 1699 if (pathEnd == -1) { 1700 CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables); 1701 output.resize(decoded.length()); 1702 memcpy(output.data(), decoded.data(), decoded.length()); 1703 } else { 1704 CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables); 1705 // Unencodable characters in URLs are represented by converting 1706 // them to XML entities and escaping non-alphanumeric characters. 1707 CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables); 1708 1709 output.resize(pathDecoded.length() + otherDecoded.length()); 1710 memcpy(output.data(), pathDecoded.data(), pathDecoded.length()); 1711 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length()); 1712 } 1713 output.append('\0'); // null-terminate the output. 1714 } 1715 1716 static String substituteBackslashes(const String& string) 1717 { 1718 size_t questionPos = string.find('?'); 1719 size_t hashPos = string.find('#'); 1720 unsigned pathEnd; 1721 1722 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos)) 1723 pathEnd = hashPos; 1724 else if (questionPos != notFound) 1725 pathEnd = questionPos; 1726 else 1727 pathEnd = string.length(); 1728 1729 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd); 1730 } 1731 1732 bool KURL::isHierarchical() const 1733 { 1734 if (!m_isValid) 1735 return false; 1736 ASSERT(m_string[m_schemeEnd] == ':'); 1737 return m_string[m_schemeEnd + 1] == '/'; 1738 } 1739 1740 void KURL::copyToBuffer(CharBuffer& buffer) const 1741 { 1742 // FIXME: This throws away the high bytes of all the characters in the string! 1743 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs. 
1744 buffer.resize(m_string.length()); 1745 copyASCII(m_string.characters(), m_string.length(), buffer.data()); 1746 } 1747 1748 bool protocolIs(const String& url, const char* protocol) 1749 { 1750 // Do the comparison without making a new string object. 1751 assertProtocolIsGood(protocol); 1752 for (int i = 0; ; ++i) { 1753 if (!protocol[i]) 1754 return url[i] == ':'; 1755 if (toASCIILower(url[i]) != protocol[i]) 1756 return false; 1757 } 1758 } 1759 1760 bool isValidProtocol(const String& protocol) 1761 { 1762 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 1763 if (protocol.isEmpty()) 1764 return false; 1765 if (!isSchemeFirstChar(protocol[0])) 1766 return false; 1767 unsigned protocolLength = protocol.length(); 1768 for (unsigned i = 1; i < protocolLength; i++) { 1769 if (!isSchemeChar(protocol[i])) 1770 return false; 1771 } 1772 return true; 1773 } 1774 1775 #ifndef NDEBUG 1776 void KURL::print() const 1777 { 1778 printf("%s\n", m_string.utf8().data()); 1779 } 1780 #endif 1781 1782 #endif // !USE(GOOGLEURL) 1783 1784 String KURL::strippedForUseAsReferrer() const 1785 { 1786 KURL referrer(*this); 1787 referrer.setUser(String()); 1788 referrer.setPass(String()); 1789 referrer.removeFragmentIdentifier(); 1790 return referrer.string(); 1791 } 1792 1793 bool KURL::isLocalFile() const 1794 { 1795 // Including feed here might be a bad idea since drag and drop uses this check 1796 // and including feed would allow feeds to potentially let someone's blog 1797 // read the contents of the clipboard on a drag, even without a drop. 1798 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. 
1799 return protocolIs("file"); 1800 } 1801 1802 bool protocolIsJavaScript(const String& url) 1803 { 1804 return protocolIs(url, "javascript"); 1805 } 1806 1807 const KURL& blankURL() 1808 { 1809 DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank")); 1810 return staticBlankURL; 1811 } 1812 1813 bool isDefaultPortForProtocol(unsigned short port, const String& protocol) 1814 { 1815 if (protocol.isEmpty()) 1816 return false; 1817 1818 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap; 1819 DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); 1820 if (defaultPorts.isEmpty()) { 1821 defaultPorts.set("http", 80); 1822 defaultPorts.set("https", 443); 1823 defaultPorts.set("ftp", 21); 1824 defaultPorts.set("ftps", 990); 1825 } 1826 return defaultPorts.get(protocol) == port; 1827 } 1828 1829 bool portAllowed(const KURL& url) 1830 { 1831 unsigned short port = url.port(); 1832 1833 // Since most URLs don't have a port, return early for the "no port" case. 1834 if (!port) 1835 return true; 1836 1837 // This blocked port list matches the port blocking that Mozilla implements. 1838 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. 
1839 static const unsigned short blockedPortList[] = { 1840 1, // tcpmux 1841 7, // echo 1842 9, // discard 1843 11, // systat 1844 13, // daytime 1845 15, // netstat 1846 17, // qotd 1847 19, // chargen 1848 20, // FTP-data 1849 21, // FTP-control 1850 22, // SSH 1851 23, // telnet 1852 25, // SMTP 1853 37, // time 1854 42, // name 1855 43, // nicname 1856 53, // domain 1857 77, // priv-rjs 1858 79, // finger 1859 87, // ttylink 1860 95, // supdup 1861 101, // hostriame 1862 102, // iso-tsap 1863 103, // gppitnp 1864 104, // acr-nema 1865 109, // POP2 1866 110, // POP3 1867 111, // sunrpc 1868 113, // auth 1869 115, // SFTP 1870 117, // uucp-path 1871 119, // nntp 1872 123, // NTP 1873 135, // loc-srv / epmap 1874 139, // netbios 1875 143, // IMAP2 1876 179, // BGP 1877 389, // LDAP 1878 465, // SMTP+SSL 1879 512, // print / exec 1880 513, // login 1881 514, // shell 1882 515, // printer 1883 526, // tempo 1884 530, // courier 1885 531, // Chat 1886 532, // netnews 1887 540, // UUCP 1888 556, // remotefs 1889 563, // NNTP+SSL 1890 587, // ESMTP 1891 601, // syslog-conn 1892 636, // LDAP+SSL 1893 993, // IMAP+SSL 1894 995, // POP3+SSL 1895 2049, // NFS 1896 3659, // apple-sasl / PasswordServer [Apple addition] 1897 4045, // lockd 1898 6000, // X11 1899 6665, // Alternate IRC [Apple addition] 1900 6666, // Alternate IRC [Apple addition] 1901 6667, // Standard IRC [Apple addition] 1902 6668, // Alternate IRC [Apple addition] 1903 6669, // Alternate IRC [Apple addition] 1904 invalidPortNumber, // Used to block all invalid port numbers 1905 }; 1906 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList); 1907 1908 #ifndef NDEBUG 1909 // The port list must be sorted for binary_search to work. 
1910 static bool checkedPortList = false; 1911 if (!checkedPortList) { 1912 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) 1913 ASSERT(*p < *(p + 1)); 1914 checkedPortList = true; 1915 } 1916 #endif 1917 1918 // If the port is not in the blocked port list, allow it. 1919 if (!binary_search(blockedPortList, blockedPortListEnd, port)) 1920 return true; 1921 1922 // Allow ports 21 and 22 for FTP URLs, as Mozilla does. 1923 if ((port == 21 || port == 22) && url.protocolIs("ftp")) 1924 return true; 1925 1926 // Allow any port number in a file URL, since the port number is ignored. 1927 if (url.protocolIs("file")) 1928 return true; 1929 1930 return false; 1931 } 1932 1933 String mimeTypeFromDataURL(const String& url) 1934 { 1935 ASSERT(protocolIs(url, "data")); 1936 size_t index = url.find(';'); 1937 if (index == notFound) 1938 index = url.find(','); 1939 if (index != notFound) { 1940 if (index > 5) 1941 return url.substring(5, index - 5); 1942 return "text/plain"; // Data URLs with no MIME type are considered text/plain. 1943 } 1944 return ""; 1945 } 1946 1947 } 1948