Home | History | Annotate | Download | only in src
      1 /* Based on nsURLParsers.cc from Mozilla
      2  * -------------------------------------
      3  * Copyright (C) 1998 Netscape Communications Corporation.
      4  *
      5  * Other contributors:
      6  *   Darin Fisher (original author)
      7  *
      8  * This library is free software; you can redistribute it and/or
      9  * modify it under the terms of the GNU Lesser General Public
     10  * License as published by the Free Software Foundation; either
     11  * version 2.1 of the License, or (at your option) any later version.
     12  *
     13  * This library is distributed in the hope that it will be useful,
     14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16  * Lesser General Public License for more details.
     17  *
     18  * You should have received a copy of the GNU Lesser General Public
     19  * License along with this library; if not, write to the Free Software
     20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
     21  *
     22  * Alternatively, the contents of this file may be used under the terms
     23  * of either the Mozilla Public License Version 1.1, found at
     24  * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
     25  * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
     26  * (the "GPL"), in which case the provisions of the MPL or the GPL are
     27  * applicable instead of those above.  If you wish to allow use of your
     28  * version of this file only under the terms of one of those two
     29  * licenses (the MPL or the GPL) and not to allow others to use your
     30  * version of this file under the LGPL, indicate your decision by
     31  * deleting the provisions above and replace them with the notice and
     32  * other provisions required by the MPL or the GPL, as the case may be.
     33  * If you do not delete the provisions above, a recipient may use your
     34  * version of this file under any of the LGPL, the MPL or the GPL.
     35  */
     36 
     37 #ifndef URLParser_h
     38 #define URLParser_h
     39 
     40 #include "URLComponent.h"
     41 #include "URLSegments.h"
     42 
     43 namespace WTF {
     44 
     45 template<typename CHAR>
     46 class URLParser {
     47 public:
     48     enum SpecialPort {
     49         UnspecifiedPort = -1,
     50         InvalidPort = -2,
     51     };
     52 
     53     // This handles everything that may be an authority terminator, including
     54     // backslash. For special backslash handling see parseAfterScheme.
     55     static bool isPossibleAuthorityTerminator(CHAR ch)
     56     {
     57         return isURLSlash(ch) || ch == '?' || ch == '#' || ch == ';';
     58     }
     59 
     60     // Given an already-identified auth section, breaks it into its constituent
     61     // parts. The port number will be parsed and the resulting integer will be
     62     // filled into the given *port variable, or -1 if there is no port number
     63     // or it is invalid.
     64     static void parseAuthority(const CHAR* spec, const URLComponent& auth, URLComponent& username, URLComponent& password, URLComponent& host, URLComponent& port)
     65     {
     66         // FIXME: add ASSERT(auth.isValid()); // We should always get an authority.
     67         if (!auth.length()) {
     68             username.reset();
     69             password.reset();
     70             host.reset();
     71             port.reset();
     72             return;
     73         }
     74 
     75         // Search backwards for @, which is the separator between the user info
     76         // and the server info.  RFC 3986 forbids @ from occuring in auth, but
     77         // someone might include it in a password unescaped.
     78         int i = auth.begin() + auth.length() - 1;
     79         while (i > auth.begin() && spec[i] != '@')
     80             --i;
     81 
     82         if (spec[i] == '@') {
     83             // Found user info: <user-info>@<server-info>
     84             parseUserInfo(spec, URLComponent(auth.begin(), i - auth.begin()), username, password);
     85             parseServerInfo(spec, URLComponent::fromRange(i + 1, auth.begin() + auth.length()), host, port);
     86         } else {
     87             // No user info, everything is server info.
     88             username.reset();
     89             password.reset();
     90             parseServerInfo(spec, auth, host, port);
     91         }
     92     }
     93 
     94     static bool extractScheme(const CHAR* spec, int specLength, URLComponent& scheme)
     95     {
     96         // Skip leading whitespace and control characters.
     97         int begin = 0;
     98         while (begin < specLength && shouldTrimFromURL(spec[begin]))
     99             begin++;
    100         if (begin == specLength)
    101             return false; // Input is empty or all whitespace.
    102 
    103         // Find the first colon character.
    104         for (int i = begin; i < specLength; i++) {
    105             if (spec[i] == ':') {
    106                 scheme = URLComponent::fromRange(begin, i);
    107                 return true;
    108             }
    109         }
    110         return false; // No colon found: no scheme
    111     }
    112 
    113     // Fills in all members of the URLSegments structure (except for the
    114     // scheme) for standard URLs.
    115     //
    116     // |spec| is the full spec being parsed, of length |specLength|.
    117     // |afterScheme| is the character immediately following the scheme (after
    118     // the colon) where we'll begin parsing.
    119     static void parseAfterScheme(const CHAR* spec, int specLength, int afterScheme, URLSegments& parsed)
    120     {
    121         int numberOfSlashes = consecutiveSlashes(spec, afterScheme, specLength);
    122         int afterSlashes = afterScheme + numberOfSlashes;
    123 
    124         // First split into two main parts, the authority (username, password,
    125         // host, and port) and the full path (path, query, and reference).
    126         URLComponent authority;
    127         URLComponent fullPath;
    128 
    129         // Found "//<some data>", looks like an authority section. Treat
    130         // everything from there to the next slash (or end of spec) to be the
    131         // authority. Note that we ignore the number of slashes and treat it as
    132         // the authority.
    133         int authEnd = nextAuthorityTerminator(spec, afterSlashes, specLength);
    134         authority = URLComponent(afterSlashes, authEnd - afterSlashes);
    135 
    136         if (authEnd == specLength) // No beginning of path found.
    137             fullPath = URLComponent();
    138         else // Everything starting from the slash to the end is the path.
    139             fullPath = URLComponent(authEnd, specLength - authEnd);
    140 
    141         // Now parse those two sub-parts.
    142         parseAuthority(spec, authority, parsed.username, parsed.password, parsed.host, parsed.port);
    143         parsePath(spec, fullPath, parsed.path, parsed.query, parsed.fragment);
    144     }
    145 
    146     // The main parsing function for standard URLs. Standard URLs have a scheme,
    147     // host, path, etc.
    148     static void parseStandardURL(const CHAR* spec, int specLength, URLSegments& parsed)
    149     {
    150         // FIXME: add ASSERT(specLength >= 0);
    151 
    152         // Strip leading & trailing spaces and control characters.
    153         int begin = 0;
    154         trimURL(spec, begin, specLength);
    155 
    156         int afterScheme;
    157         if (extractScheme(spec, specLength, parsed.scheme))
    158             afterScheme = parsed.scheme.end() + 1; // Skip past the colon.
    159         else {
    160             // Say there's no scheme when there is a colon. We could also say
    161             // that everything is the scheme. Both would produce an invalid
    162             // URL, but this way seems less wrong in more cases.
    163             parsed.scheme.reset();
    164             afterScheme = begin;
    165         }
    166         parseAfterScheme(spec, specLength, afterScheme, parsed);
    167     }
    168 
    169     static void parsePath(const CHAR* spec, const URLComponent& path, URLComponent& filepath, URLComponent& query, URLComponent& fragment)
    170     {
    171         // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<fragment>
    172 
    173         // Special case when there is no path.
    174         if (!path.isValid()) {
    175             filepath.reset();
    176             query.reset();
    177             fragment.reset();
    178             return;
    179         }
    180         // FIXME: add ASSERT(path.length() > 0); // We should never have 0 length paths.
    181 
    182         // Search for first occurrence of either ? or #.
    183         int pathEnd = path.begin() + path.length();
    184 
    185         int querySeparator = -1; // Index of the '?'
    186         int refSeparator = -1; // Index of the '#'
    187         for (int i = path.begin(); i < pathEnd; i++) {
    188             switch (spec[i]) {
    189             case '?':
    190                 if (querySeparator < 0)
    191                     querySeparator = i;
    192                 break;
    193             case '#':
    194                 refSeparator = i;
    195                 i = pathEnd; // Break out of the loop.
    196                 break;
    197             default:
    198                 break;
    199             }
    200         }
    201 
    202         // Markers pointing to the character after each of these corresponding
    203         // components. The code below works from the end back to the beginning,
    204         // and will update these indices as it finds components that exist.
    205         int fileEnd, queryEnd;
    206 
    207         // Fragment: from the # to the end of the path.
    208         if (refSeparator >= 0) {
    209             fileEnd = refSeparator;
    210             queryEnd = refSeparator;
    211             fragment = URLComponent::fromRange(refSeparator + 1, pathEnd);
    212         } else {
    213             fileEnd = pathEnd;
    214             queryEnd = pathEnd;
    215             fragment.reset();
    216         }
    217 
    218         // Query fragment: everything from the ? to the next boundary (either
    219         // the end of the path or the fragment fragment).
    220         if (querySeparator >= 0) {
    221             fileEnd = querySeparator;
    222             query = URLComponent::fromRange(querySeparator + 1, queryEnd);
    223         } else
    224             query.reset();
    225 
    226         // File path: treat an empty file path as no file path.
    227         if (fileEnd != path.begin())
    228             filepath = URLComponent::fromRange(path.begin(), fileEnd);
    229         else
    230             filepath.reset();
    231     }
    232 
    233     // Initializes a path URL which is merely a scheme followed by a path.
    234     // Examples include "about:foo" and "javascript:alert('bar');"
    235     static void parsePathURL(const CHAR* spec, int specLength, URLSegments& parsed)
    236     {
    237         // Get the non-path and non-scheme parts of the URL out of the way, we
    238         // never use them.
    239         parsed.username.reset();
    240         parsed.password.reset();
    241         parsed.host.reset();
    242         parsed.port.reset();
    243         parsed.query.reset();
    244         parsed.fragment.reset();
    245 
    246         // Strip leading & trailing spaces and control characters.
    247         // FIXME: Perhaps this is unnecessary?
    248         int begin = 0;
    249         trimURL(spec, begin, specLength);
    250 
    251         // Handle empty specs or ones that contain only whitespace or control
    252         // chars.
    253         if (begin == specLength) {
    254             parsed.scheme.reset();
    255             parsed.path.reset();
    256             return;
    257         }
    258 
    259         // Extract the scheme, with the path being everything following. We also
    260         // handle the case where there is no scheme.
    261         if (extractScheme(&spec[begin], specLength - begin, parsed.scheme)) {
    262             // Offset the results since we gave extractScheme a substring.
    263             parsed.scheme.setBegin(parsed.scheme.begin() + begin);
    264 
    265             // For compatibility with the standard URL parser, we treat no path
    266             // as -1, rather than having a length of 0 (we normally wouldn't
    267             // care so much for these non-standard URLs).
    268             if (parsed.scheme.end() == specLength - 1)
    269                 parsed.path.reset();
    270             else
    271                 parsed.path = URLComponent::fromRange(parsed.scheme.end() + 1, specLength);
    272         } else {
    273             // No scheme found, just path.
    274             parsed.scheme.reset();
    275             parsed.path = URLComponent::fromRange(begin, specLength);
    276         }
    277     }
    278 
    279     static void parseMailtoURL(const CHAR* spec, int specLength, URLSegments& parsed)
    280     {
    281         // FIXME: add ASSERT(specLength >= 0);
    282 
    283         // Get the non-path and non-scheme parts of the URL out of the way, we
    284         // never use them.
    285         parsed.username.reset();
    286         parsed.password.reset();
    287         parsed.host.reset();
    288         parsed.port.reset();
    289         parsed.fragment.reset();
    290         parsed.query.reset(); // May use this; reset for convenience.
    291 
    292         // Strip leading & trailing spaces and control characters.
    293         int begin = 0;
    294         trimURL(spec, begin, specLength);
    295 
    296         // Handle empty specs or ones that contain only whitespace or control
    297         // chars.
    298         if (begin == specLength) {
    299             parsed.scheme.reset();
    300             parsed.path.reset();
    301             return;
    302         }
    303 
    304         int pathBegin = -1;
    305         int pathEnd = -1;
    306 
    307         // Extract the scheme, with the path being everything following. We also
    308         // handle the case where there is no scheme.
    309         if (extractScheme(&spec[begin], specLength - begin, parsed.scheme)) {
    310             // Offset the results since we gave extractScheme a substring.
    311             parsed.scheme.setBegin(parsed.scheme.begin() + begin);
    312 
    313             if (parsed.scheme.end() != specLength - 1) {
    314                 pathBegin = parsed.scheme.end() + 1;
    315                 pathEnd = specLength;
    316             }
    317         } else {
    318             // No scheme found, just path.
    319             parsed.scheme.reset();
    320             pathBegin = begin;
    321             pathEnd = specLength;
    322         }
    323 
    324         // Split [pathBegin, pathEnd) into a path + query.
    325         for (int i = pathBegin; i < pathEnd; ++i) {
    326             if (spec[i] == '?') {
    327                 parsed.query = URLComponent::fromRange(i + 1, pathEnd);
    328                 pathEnd = i;
    329                 break;
    330             }
    331         }
    332 
    333         // For compatibility with the standard URL parser, treat no path as
    334         // -1, rather than having a length of 0
    335         if (pathBegin == pathEnd)
    336             parsed.path.reset();
    337         else
    338             parsed.path = URLComponent::fromRange(pathBegin, pathEnd);
    339     }
    340 
    341     static int parsePort(const CHAR* spec, const URLComponent& component)
    342     {
    343         // Easy success case when there is no port.
    344         const int maxDigits = 5;
    345         if (component.isEmptyOrInvalid())
    346             return UnspecifiedPort;
    347 
    348         URLComponent nonZeroDigits(component.end(), 0);
    349         for (int i = 0; i < component.length(); ++i) {
    350             if (spec[component.begin() + i] != '0') {
    351                 nonZeroDigits = URLComponent::fromRange(component.begin() + i, component.end());
    352                 break;
    353             }
    354         }
    355         if (!nonZeroDigits.length())
    356             return 0; // All digits were 0.
    357 
    358         if (nonZeroDigits.length() > maxDigits)
    359             return InvalidPort;
    360 
    361         int port = 0;
    362         for (int i = 0; i < nonZeroDigits.length(); ++i) {
    363             CHAR ch = spec[nonZeroDigits.begin() + i];
    364             if (!isPortDigit(ch))
    365                 return InvalidPort;
    366             port *= 10;
    367             port += static_cast<char>(ch) - '0';
    368         }
    369         if (port > 65535)
    370             return InvalidPort;
    371         return port;
    372     }
    373 
    374     static void extractFileName(const CHAR* spec, const URLComponent& path, URLComponent& fileName)
    375     {
    376         // Handle empty paths: they have no file names.
    377         if (path.isEmptyOrInvalid()) {
    378             fileName.reset();
    379             return;
    380         }
    381 
    382         // Search backwards for a parameter, which is a normally unused field
    383         // in a URL delimited by a semicolon. We parse the parameter as part of
    384         // the path, but here, we don't want to count it. The last semicolon is
    385         // the parameter.
    386         int fileEnd = path.end();
    387         for (int i = path.end() - 1; i > path.begin(); --i) {
    388             if (spec[i] == ';') {
    389                 fileEnd = i;
    390                 break;
    391             }
    392         }
    393 
    394         // Now search backwards from the filename end to the previous slash
    395         // to find the beginning of the filename.
    396         for (int i = fileEnd - 1; i >= path.begin(); --i) {
    397             if (isURLSlash(spec[i])) {
    398                 // File name is everything following this character to the end
    399                 fileName = URLComponent::fromRange(i + 1, fileEnd);
    400                 return;
    401             }
    402         }
    403 
    404         // No slash found, this means the input was degenerate (generally paths
    405         // will start with a slash). Let's call everything the file name.
    406         fileName = URLComponent::fromRange(path.begin(), fileEnd);
    407     }
    408 
    409     static bool extractQueryKeyValue(const CHAR* spec, URLComponent& query, URLComponent& key, URLComponent& value)
    410     {
    411         if (query.isEmptyOrInvalid())
    412             return false;
    413 
    414         int start = query.begin();
    415         int current = start;
    416         int end = query.end();
    417 
    418         // We assume the beginning of the input is the beginning of the "key"
    419         // and we skip to the end of it.
    420         key.setBegin(current);
    421         while (current < end && spec[current] != '&' && spec[current] != '=')
    422             ++current;
    423         key.setLength(current - key.begin());
    424 
    425         // Skip the separator after the key (if any).
    426         if (current < end && spec[current] == '=')
    427             ++current;
    428 
    429         // Find the value part.
    430         value.setBegin(current);
    431         while (current < end && spec[current] != '&')
    432             ++current;
    433         value.setLength(current - value.begin());
    434 
    435         // Finally skip the next separator if any
    436         if (current < end && spec[current] == '&')
    437             ++current;
    438 
    439         // Save the new query
    440         query = URLComponent::fromRange(current, end);
    441         return true;
    442     }
    443 
    444 // FIXME: This should be protected or private.
    445 public:
    446     // We treat slashes and backslashes the same for IE compatibility.
    447     static inline bool isURLSlash(CHAR ch)
    448     {
    449         return ch == '/' || ch == '\\';
    450     }
    451 
    452     // Returns true if we should trim this character from the URL because it is
    453     // a space or a control character.
    454     static inline bool shouldTrimFromURL(CHAR ch)
    455     {
    456         return ch <= ' ';
    457     }
    458 
    459     // Given an already-initialized begin index and end index (the index after
    460     // the last CHAR in spec), this shrinks the range to eliminate
    461     // "should-be-trimmed" characters.
    462     static inline void trimURL(const CHAR* spec, int& begin, int& end)
    463     {
    464         // Strip leading whitespace and control characters.
    465         while (begin < end && shouldTrimFromURL(spec[begin]))
    466             ++begin;
    467 
    468         // Strip trailing whitespace and control characters. We need the >i
    469         // test for when the input string is all blanks; we don't want to back
    470         // past the input.
    471         while (end > begin && shouldTrimFromURL(spec[end - 1]))
    472             --end;
    473     }
    474 
    475     // Counts the number of consecutive slashes starting at the given offset
    476     // in the given string of the given length.
    477     static inline int consecutiveSlashes(const CHAR *string, int beginOffset, int stringLength)
    478     {
    479         int count = 0;
    480         while (beginOffset + count < stringLength && isURLSlash(string[beginOffset + count]))
    481             ++count;
    482         return count;
    483     }
    484 
    485 private:
    486     // URLParser cannot be constructed.
    487     URLParser();
    488 
    489     // Returns true if the given character is a valid digit to use in a port.
    490     static inline bool isPortDigit(CHAR ch)
    491     {
    492         return ch >= '0' && ch <= '9';
    493     }
    494 
    495     // Returns the offset of the next authority terminator in the input starting
    496     // from startOffset. If no terminator is found, the return value will be equal
    497     // to specLength.
    498     static int nextAuthorityTerminator(const CHAR* spec, int startOffset, int specLength)
    499     {
    500         for (int i = startOffset; i < specLength; i++) {
    501             if (isPossibleAuthorityTerminator(spec[i]))
    502                 return i;
    503         }
    504         return specLength; // Not found.
    505     }
    506 
    507     static void parseUserInfo(const CHAR* spec, const URLComponent& user, URLComponent& username, URLComponent& password)
    508     {
    509         // Find the first colon in the user section, which separates the
    510         // username and password.
    511         int colonOffset = 0;
    512         while (colonOffset < user.length() && spec[user.begin() + colonOffset] != ':')
    513             ++colonOffset;
    514 
    515         if (colonOffset < user.length()) {
    516             // Found separator: <username>:<password>
    517             username = URLComponent(user.begin(), colonOffset);
    518             password = URLComponent::fromRange(user.begin() + colonOffset + 1, user.begin() + user.length());
    519         } else {
    520             // No separator, treat everything as the username
    521             username = user;
    522             password = URLComponent();
    523         }
    524     }
    525 
    526     static void parseServerInfo(const CHAR* spec, const URLComponent& serverInfo, URLComponent& host, URLComponent& port)
    527     {
    528         if (!serverInfo.length()) {
    529             // No server info, host name is empty.
    530             host.reset();
    531             port.reset();
    532             return;
    533         }
    534 
    535         // If the host starts with a left-bracket, assume the entire host is an
    536         // IPv6 literal.  Otherwise, assume none of the host is an IPv6 literal.
    537         // This assumption will be overridden if we find a right-bracket.
    538         //
    539         // Our IPv6 address canonicalization code requires both brackets to
    540         // exist, but the ability to locate an incomplete address can still be
    541         // useful.
    542         int ipv6Terminator = spec[serverInfo.begin()] == '[' ? serverInfo.end() : -1;
    543         int colon = -1;
    544 
    545         // Find the last right-bracket, and the last colon.
    546         for (int i = serverInfo.begin(); i < serverInfo.end(); i++) {
    547             switch (spec[i]) {
    548             case ']':
    549                 ipv6Terminator = i;
    550                 break;
    551             case ':':
    552                 colon = i;
    553                 break;
    554             default:
    555                 break;
    556             }
    557         }
    558 
    559         if (colon > ipv6Terminator) {
    560             // Found a port number: <hostname>:<port>
    561             host = URLComponent::fromRange(serverInfo.begin(), colon);
    562             if (!host.length())
    563                 host.reset();
    564             port = URLComponent::fromRange(colon + 1, serverInfo.end());
    565         } else {
    566             // No port: <hostname>
    567             host = serverInfo;
    568             port.reset();
    569         }
    570     }
    571 };
    572 
    573 } // namespace WTF
    574 
    575 #endif // URLParser_h
    576