1 /* Based on nsURLParsers.cc from Mozilla 2 * ------------------------------------- 3 * Copyright (C) 1998 Netscape Communications Corporation. 4 * 5 * Other contributors: 6 * Darin Fisher (original author) 7 * 8 * This library is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * This library is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with this library; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 * 22 * Alternatively, the contents of this file may be used under the terms 23 * of either the Mozilla Public License Version 1.1, found at 24 * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public 25 * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html 26 * (the "GPL"), in which case the provisions of the MPL or the GPL are 27 * applicable instead of those above. If you wish to allow use of your 28 * version of this file only under the terms of one of those two 29 * licenses (the MPL or the GPL) and not to allow others to use your 30 * version of this file under the LGPL, indicate your decision by 31 * deletingthe provisions above and replace them with the notice and 32 * other provisions required by the MPL or the GPL, as the case may be. 33 * If you do not delete the provisions above, a recipient may use your 34 * version of this file under any of the LGPL, the MPL or the GPL. 35 */ 36 37 #ifndef URLParser_h 38 #define URLParser_h 39 40 #include "URLComponent.h" 41 #include "URLSegments.h" 42 43 namespace WTF { 44 45 template<typename CHAR> 46 class URLParser { 47 public: 48 enum SpecialPort { 49 UnspecifiedPort = -1, 50 InvalidPort = -2, 51 }; 52 53 // This handles everything that may be an authority terminator, including 54 // backslash. For special backslash handling see parseAfterScheme. 55 static bool isPossibleAuthorityTerminator(CHAR ch) 56 { 57 return isURLSlash(ch) || ch == '?' || ch == '#' || ch == ';'; 58 } 59 60 // Given an already-identified auth section, breaks it into its constituent 61 // parts. The port number will be parsed and the resulting integer will be 62 // filled into the given *port variable, or -1 if there is no port number 63 // or it is invalid. 64 static void parseAuthority(const CHAR* spec, const URLComponent& auth, URLComponent& username, URLComponent& password, URLComponent& host, URLComponent& port) 65 { 66 // FIXME: add ASSERT(auth.isValid()); // We should always get an authority. 67 if (!auth.length()) { 68 username.reset(); 69 password.reset(); 70 host.reset(); 71 port.reset(); 72 return; 73 } 74 75 // Search backwards for @, which is the separator between the user info 76 // and the server info. RFC 3986 forbids @ from occuring in auth, but 77 // someone might include it in a password unescaped. 78 int i = auth.begin() + auth.length() - 1; 79 while (i > auth.begin() && spec[i] != '@') 80 --i; 81 82 if (spec[i] == '@') { 83 // Found user info: <user-info>@<server-info> 84 parseUserInfo(spec, URLComponent(auth.begin(), i - auth.begin()), username, password); 85 parseServerInfo(spec, URLComponent::fromRange(i + 1, auth.begin() + auth.length()), host, port); 86 } else { 87 // No user info, everything is server info. 88 username.reset(); 89 password.reset(); 90 parseServerInfo(spec, auth, host, port); 91 } 92 } 93 94 static bool extractScheme(const CHAR* spec, int specLength, URLComponent& scheme) 95 { 96 // Skip leading whitespace and control characters. 97 int begin = 0; 98 while (begin < specLength && shouldTrimFromURL(spec[begin])) 99 begin++; 100 if (begin == specLength) 101 return false; // Input is empty or all whitespace. 102 103 // Find the first colon character. 104 for (int i = begin; i < specLength; i++) { 105 if (spec[i] == ':') { 106 scheme = URLComponent::fromRange(begin, i); 107 return true; 108 } 109 } 110 return false; // No colon found: no scheme 111 } 112 113 // Fills in all members of the URLSegments structure (except for the 114 // scheme) for standard URLs. 115 // 116 // |spec| is the full spec being parsed, of length |specLength|. 117 // |afterScheme| is the character immediately following the scheme (after 118 // the colon) where we'll begin parsing. 119 static void parseAfterScheme(const CHAR* spec, int specLength, int afterScheme, URLSegments& parsed) 120 { 121 int numberOfSlashes = consecutiveSlashes(spec, afterScheme, specLength); 122 int afterSlashes = afterScheme + numberOfSlashes; 123 124 // First split into two main parts, the authority (username, password, 125 // host, and port) and the full path (path, query, and reference). 126 URLComponent authority; 127 URLComponent fullPath; 128 129 // Found "//<some data>", looks like an authority section. Treat 130 // everything from there to the next slash (or end of spec) to be the 131 // authority. Note that we ignore the number of slashes and treat it as 132 // the authority. 133 int authEnd = nextAuthorityTerminator(spec, afterSlashes, specLength); 134 authority = URLComponent(afterSlashes, authEnd - afterSlashes); 135 136 if (authEnd == specLength) // No beginning of path found. 137 fullPath = URLComponent(); 138 else // Everything starting from the slash to the end is the path. 139 fullPath = URLComponent(authEnd, specLength - authEnd); 140 141 // Now parse those two sub-parts. 142 parseAuthority(spec, authority, parsed.username, parsed.password, parsed.host, parsed.port); 143 parsePath(spec, fullPath, parsed.path, parsed.query, parsed.fragment); 144 } 145 146 // The main parsing function for standard URLs. Standard URLs have a scheme, 147 // host, path, etc. 148 static void parseStandardURL(const CHAR* spec, int specLength, URLSegments& parsed) 149 { 150 // FIXME: add ASSERT(specLength >= 0); 151 152 // Strip leading & trailing spaces and control characters. 153 int begin = 0; 154 trimURL(spec, begin, specLength); 155 156 int afterScheme; 157 if (extractScheme(spec, specLength, parsed.scheme)) 158 afterScheme = parsed.scheme.end() + 1; // Skip past the colon. 159 else { 160 // Say there's no scheme when there is a colon. We could also say 161 // that everything is the scheme. Both would produce an invalid 162 // URL, but this way seems less wrong in more cases. 163 parsed.scheme.reset(); 164 afterScheme = begin; 165 } 166 parseAfterScheme(spec, specLength, afterScheme, parsed); 167 } 168 169 static void parsePath(const CHAR* spec, const URLComponent& path, URLComponent& filepath, URLComponent& query, URLComponent& fragment) 170 { 171 // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<fragment> 172 173 // Special case when there is no path. 174 if (!path.isValid()) { 175 filepath.reset(); 176 query.reset(); 177 fragment.reset(); 178 return; 179 } 180 // FIXME: add ASSERT(path.length() > 0); // We should never have 0 length paths. 181 182 // Search for first occurrence of either ? or #. 183 int pathEnd = path.begin() + path.length(); 184 185 int querySeparator = -1; // Index of the '?' 186 int refSeparator = -1; // Index of the '#' 187 for (int i = path.begin(); i < pathEnd; i++) { 188 switch (spec[i]) { 189 case '?': 190 if (querySeparator < 0) 191 querySeparator = i; 192 break; 193 case '#': 194 refSeparator = i; 195 i = pathEnd; // Break out of the loop. 196 break; 197 default: 198 break; 199 } 200 } 201 202 // Markers pointing to the character after each of these corresponding 203 // components. The code below works from the end back to the beginning, 204 // and will update these indices as it finds components that exist. 205 int fileEnd, queryEnd; 206 207 // Fragment: from the # to the end of the path. 208 if (refSeparator >= 0) { 209 fileEnd = refSeparator; 210 queryEnd = refSeparator; 211 fragment = URLComponent::fromRange(refSeparator + 1, pathEnd); 212 } else { 213 fileEnd = pathEnd; 214 queryEnd = pathEnd; 215 fragment.reset(); 216 } 217 218 // Query fragment: everything from the ? to the next boundary (either 219 // the end of the path or the fragment fragment). 220 if (querySeparator >= 0) { 221 fileEnd = querySeparator; 222 query = URLComponent::fromRange(querySeparator + 1, queryEnd); 223 } else 224 query.reset(); 225 226 // File path: treat an empty file path as no file path. 227 if (fileEnd != path.begin()) 228 filepath = URLComponent::fromRange(path.begin(), fileEnd); 229 else 230 filepath.reset(); 231 } 232 233 // Initializes a path URL which is merely a scheme followed by a path. 234 // Examples include "about:foo" and "javascript:alert('bar');" 235 static void parsePathURL(const CHAR* spec, int specLength, URLSegments& parsed) 236 { 237 // Get the non-path and non-scheme parts of the URL out of the way, we 238 // never use them. 239 parsed.username.reset(); 240 parsed.password.reset(); 241 parsed.host.reset(); 242 parsed.port.reset(); 243 parsed.query.reset(); 244 parsed.fragment.reset(); 245 246 // Strip leading & trailing spaces and control characters. 247 // FIXME: Perhaps this is unnecessary? 248 int begin = 0; 249 trimURL(spec, begin, specLength); 250 251 // Handle empty specs or ones that contain only whitespace or control 252 // chars. 253 if (begin == specLength) { 254 parsed.scheme.reset(); 255 parsed.path.reset(); 256 return; 257 } 258 259 // Extract the scheme, with the path being everything following. We also 260 // handle the case where there is no scheme. 261 if (extractScheme(&spec[begin], specLength - begin, parsed.scheme)) { 262 // Offset the results since we gave extractScheme a substring. 263 parsed.scheme.setBegin(parsed.scheme.begin() + begin); 264 265 // For compatibility with the standard URL parser, we treat no path 266 // as -1, rather than having a length of 0 (we normally wouldn't 267 // care so much for these non-standard URLs). 268 if (parsed.scheme.end() == specLength - 1) 269 parsed.path.reset(); 270 else 271 parsed.path = URLComponent::fromRange(parsed.scheme.end() + 1, specLength); 272 } else { 273 // No scheme found, just path. 274 parsed.scheme.reset(); 275 parsed.path = URLComponent::fromRange(begin, specLength); 276 } 277 } 278 279 static void parseMailtoURL(const CHAR* spec, int specLength, URLSegments& parsed) 280 { 281 // FIXME: add ASSERT(specLength >= 0); 282 283 // Get the non-path and non-scheme parts of the URL out of the way, we 284 // never use them. 285 parsed.username.reset(); 286 parsed.password.reset(); 287 parsed.host.reset(); 288 parsed.port.reset(); 289 parsed.fragment.reset(); 290 parsed.query.reset(); // May use this; reset for convenience. 291 292 // Strip leading & trailing spaces and control characters. 293 int begin = 0; 294 trimURL(spec, begin, specLength); 295 296 // Handle empty specs or ones that contain only whitespace or control 297 // chars. 298 if (begin == specLength) { 299 parsed.scheme.reset(); 300 parsed.path.reset(); 301 return; 302 } 303 304 int pathBegin = -1; 305 int pathEnd = -1; 306 307 // Extract the scheme, with the path being everything following. We also 308 // handle the case where there is no scheme. 309 if (extractScheme(&spec[begin], specLength - begin, parsed.scheme)) { 310 // Offset the results since we gave extractScheme a substring. 311 parsed.scheme.setBegin(parsed.scheme.begin() + begin); 312 313 if (parsed.scheme.end() != specLength - 1) { 314 pathBegin = parsed.scheme.end() + 1; 315 pathEnd = specLength; 316 } 317 } else { 318 // No scheme found, just path. 319 parsed.scheme.reset(); 320 pathBegin = begin; 321 pathEnd = specLength; 322 } 323 324 // Split [pathBegin, pathEnd) into a path + query. 325 for (int i = pathBegin; i < pathEnd; ++i) { 326 if (spec[i] == '?') { 327 parsed.query = URLComponent::fromRange(i + 1, pathEnd); 328 pathEnd = i; 329 break; 330 } 331 } 332 333 // For compatibility with the standard URL parser, treat no path as 334 // -1, rather than having a length of 0 335 if (pathBegin == pathEnd) 336 parsed.path.reset(); 337 else 338 parsed.path = URLComponent::fromRange(pathBegin, pathEnd); 339 } 340 341 static int parsePort(const CHAR* spec, const URLComponent& component) 342 { 343 // Easy success case when there is no port. 344 const int maxDigits = 5; 345 if (component.isEmptyOrInvalid()) 346 return UnspecifiedPort; 347 348 URLComponent nonZeroDigits(component.end(), 0); 349 for (int i = 0; i < component.length(); ++i) { 350 if (spec[component.begin() + i] != '0') { 351 nonZeroDigits = URLComponent::fromRange(component.begin() + i, component.end()); 352 break; 353 } 354 } 355 if (!nonZeroDigits.length()) 356 return 0; // All digits were 0. 357 358 if (nonZeroDigits.length() > maxDigits) 359 return InvalidPort; 360 361 int port = 0; 362 for (int i = 0; i < nonZeroDigits.length(); ++i) { 363 CHAR ch = spec[nonZeroDigits.begin() + i]; 364 if (!isPortDigit(ch)) 365 return InvalidPort; 366 port *= 10; 367 port += static_cast<char>(ch) - '0'; 368 } 369 if (port > 65535) 370 return InvalidPort; 371 return port; 372 } 373 374 static void extractFileName(const CHAR* spec, const URLComponent& path, URLComponent& fileName) 375 { 376 // Handle empty paths: they have no file names. 377 if (path.isEmptyOrInvalid()) { 378 fileName.reset(); 379 return; 380 } 381 382 // Search backwards for a parameter, which is a normally unused field 383 // in a URL delimited by a semicolon. We parse the parameter as part of 384 // the path, but here, we don't want to count it. The last semicolon is 385 // the parameter. 386 int fileEnd = path.end(); 387 for (int i = path.end() - 1; i > path.begin(); --i) { 388 if (spec[i] == ';') { 389 fileEnd = i; 390 break; 391 } 392 } 393 394 // Now search backwards from the filename end to the previous slash 395 // to find the beginning of the filename. 396 for (int i = fileEnd - 1; i >= path.begin(); --i) { 397 if (isURLSlash(spec[i])) { 398 // File name is everything following this character to the end 399 fileName = URLComponent::fromRange(i + 1, fileEnd); 400 return; 401 } 402 } 403 404 // No slash found, this means the input was degenerate (generally paths 405 // will start with a slash). Let's call everything the file name. 406 fileName = URLComponent::fromRange(path.begin(), fileEnd); 407 } 408 409 static bool extractQueryKeyValue(const CHAR* spec, URLComponent& query, URLComponent& key, URLComponent& value) 410 { 411 if (query.isEmptyOrInvalid()) 412 return false; 413 414 int start = query.begin(); 415 int current = start; 416 int end = query.end(); 417 418 // We assume the beginning of the input is the beginning of the "key" 419 // and we skip to the end of it. 420 key.setBegin(current); 421 while (current < end && spec[current] != '&' && spec[current] != '=') 422 ++current; 423 key.setLength(current - key.begin()); 424 425 // Skip the separator after the key (if any). 426 if (current < end && spec[current] == '=') 427 ++current; 428 429 // Find the value part. 430 value.setBegin(current); 431 while (current < end && spec[current] != '&') 432 ++current; 433 value.setLength(current - value.begin()); 434 435 // Finally skip the next separator if any 436 if (current < end && spec[current] == '&') 437 ++current; 438 439 // Save the new query 440 query = URLComponent::fromRange(current, end); 441 return true; 442 } 443 444 // FIXME: This should be protected or private. 445 public: 446 // We treat slashes and backslashes the same for IE compatibility. 447 static inline bool isURLSlash(CHAR ch) 448 { 449 return ch == '/' || ch == '\\'; 450 } 451 452 // Returns true if we should trim this character from the URL because it is 453 // a space or a control character. 454 static inline bool shouldTrimFromURL(CHAR ch) 455 { 456 return ch <= ' '; 457 } 458 459 // Given an already-initialized begin index and end index (the index after 460 // the last CHAR in spec), this shrinks the range to eliminate 461 // "should-be-trimmed" characters. 462 static inline void trimURL(const CHAR* spec, int& begin, int& end) 463 { 464 // Strip leading whitespace and control characters. 465 while (begin < end && shouldTrimFromURL(spec[begin])) 466 ++begin; 467 468 // Strip trailing whitespace and control characters. We need the >i 469 // test for when the input string is all blanks; we don't want to back 470 // past the input. 471 while (end > begin && shouldTrimFromURL(spec[end - 1])) 472 --end; 473 } 474 475 // Counts the number of consecutive slashes starting at the given offset 476 // in the given string of the given length. 477 static inline int consecutiveSlashes(const CHAR *string, int beginOffset, int stringLength) 478 { 479 int count = 0; 480 while (beginOffset + count < stringLength && isURLSlash(string[beginOffset + count])) 481 ++count; 482 return count; 483 } 484 485 private: 486 // URLParser cannot be constructed. 487 URLParser(); 488 489 // Returns true if the given character is a valid digit to use in a port. 490 static inline bool isPortDigit(CHAR ch) 491 { 492 return ch >= '0' && ch <= '9'; 493 } 494 495 // Returns the offset of the next authority terminator in the input starting 496 // from startOffset. If no terminator is found, the return value will be equal 497 // to specLength. 498 static int nextAuthorityTerminator(const CHAR* spec, int startOffset, int specLength) 499 { 500 for (int i = startOffset; i < specLength; i++) { 501 if (isPossibleAuthorityTerminator(spec[i])) 502 return i; 503 } 504 return specLength; // Not found. 505 } 506 507 static void parseUserInfo(const CHAR* spec, const URLComponent& user, URLComponent& username, URLComponent& password) 508 { 509 // Find the first colon in the user section, which separates the 510 // username and password. 511 int colonOffset = 0; 512 while (colonOffset < user.length() && spec[user.begin() + colonOffset] != ':') 513 ++colonOffset; 514 515 if (colonOffset < user.length()) { 516 // Found separator: <username>:<password> 517 username = URLComponent(user.begin(), colonOffset); 518 password = URLComponent::fromRange(user.begin() + colonOffset + 1, user.begin() + user.length()); 519 } else { 520 // No separator, treat everything as the username 521 username = user; 522 password = URLComponent(); 523 } 524 } 525 526 static void parseServerInfo(const CHAR* spec, const URLComponent& serverInfo, URLComponent& host, URLComponent& port) 527 { 528 if (!serverInfo.length()) { 529 // No server info, host name is empty. 530 host.reset(); 531 port.reset(); 532 return; 533 } 534 535 // If the host starts with a left-bracket, assume the entire host is an 536 // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. 537 // This assumption will be overridden if we find a right-bracket. 538 // 539 // Our IPv6 address canonicalization code requires both brackets to 540 // exist, but the ability to locate an incomplete address can still be 541 // useful. 542 int ipv6Terminator = spec[serverInfo.begin()] == '[' ? serverInfo.end() : -1; 543 int colon = -1; 544 545 // Find the last right-bracket, and the last colon. 546 for (int i = serverInfo.begin(); i < serverInfo.end(); i++) { 547 switch (spec[i]) { 548 case ']': 549 ipv6Terminator = i; 550 break; 551 case ':': 552 colon = i; 553 break; 554 default: 555 break; 556 } 557 } 558 559 if (colon > ipv6Terminator) { 560 // Found a port number: <hostname>:<port> 561 host = URLComponent::fromRange(serverInfo.begin(), colon); 562 if (!host.length()) 563 host.reset(); 564 port = URLComponent::fromRange(colon + 1, serverInfo.end()); 565 } else { 566 // No port: <hostname> 567 host = serverInfo; 568 port.reset(); 569 } 570 } 571 }; 572 573 } // namespace WTF 574 575 #endif // URLParser_h 576