Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 #include "base/logging.h"
     31 #include "googleurl/src/url_file.h"
     32 #include "googleurl/src/url_parse.h"
     33 #include "googleurl/src/url_parse_internal.h"
     34 
     35 // Interesting IE file:isms...
     36 //
     37 //  INPUT                      OUTPUT
     38 //  =========================  ==============================
     39 //  file:/foo/bar              file:///foo/bar
     40 //      The result here seems totally invalid!?!? This isn't UNC.
     41 //
     42 //  file:/
     43 //  file:// or any other number of slashes
     44 //      IE6 doesn't do anything at all if you click on this link. No error:
     45 //      nothing. IE6's history system seems to always color this link, so I'm
     46 //      guessing that it maps internally to the empty URL.
     47 //
     48 //  C:\                        file:///C:/
     49 //      When on a file: URL source page, this link will work. When over HTTP,
     50 //      the file: URL will appear in the status bar but the link will not work
     51 //      (security restriction for all file URLs).
     52 //
     53 //  file:foo/                  file:foo/     (invalid?!?!?)
     54 //  file:/foo/                 file:///foo/  (invalid?!?!?)
     55 //  file://foo/                file://foo/   (UNC to server "foo")
     56 //  file:///foo/               file:///foo/  (invalid, seems to be a file)
     57 //  file:////foo/              file://foo/   (UNC to server "foo")
     58 //      Any more than four slashes is also treated as UNC.
     59 //
     60 //  file:C:/                   file://C:/
     61 //  file:/C:/                  file://C:/
      62 //      The number of slashes after "file:" doesn't matter if the thing
      63 //      following it looks like an absolute drive path. Also, slashes and
      64 //      backslashes are equally valid here.
     65 
     66 namespace url_parse {
     67 
     68 namespace {
     69 
     70 // A subcomponent of DoInitFileURL, the input of this function should be a UNC
     71 // path name, with the index of the first character after the slashes following
     72 // the scheme given in |after_slashes|. This will initialize the host, path,
     73 // query, and ref, and leave the other output components untouched
     74 // (DoInitFileURL handles these for us).
     75 template<typename CHAR>
     76 void DoParseUNC(const CHAR* spec,
     77                 int after_slashes,
     78                 int spec_len,
     79                Parsed* parsed) {
     80   int next_slash = FindNextSlash(spec, after_slashes, spec_len);
     81   if (next_slash == spec_len) {
     82     // No additional slash found, as in "file://foo", treat the text as the
     83     // host with no path (this will end up being UNC to server "foo").
     84     int host_len = spec_len - after_slashes;
     85     if (host_len)
     86       parsed->host = Component(after_slashes, host_len);
     87     else
     88       parsed->host.reset();
     89     parsed->path.reset();
     90     return;
     91   }
     92 
     93 #ifdef WIN32
     94   // See if we have something that looks like a path following the first
     95   // component. As in "file://localhost/c:/", we get "c:/" out. We want to
     96   // treat this as a having no host but the path given. Works on Windows only.
     97   if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
     98     parsed->host.reset();
     99     ParsePathInternal(spec, MakeRange(next_slash, spec_len),
    100                       &parsed->path, &parsed->query, &parsed->ref);
    101     return;
    102   }
    103 #endif
    104 
    105   // Otherwise, everything up until that first slash we found is the host name,
    106   // which will end up being the UNC host. For example "file://foo/bar.txt"
    107   // will get a server name of "foo" and a path of "/bar". Later, on Windows,
    108   // this should be treated as the filename "\\foo\bar.txt" in proper UNC
    109   // notation.
    110   int host_len = next_slash - after_slashes;
    111   if (host_len)
    112     parsed->host = MakeRange(after_slashes, next_slash);
    113   else
    114     parsed->host.reset();
    115   if (next_slash < spec_len) {
    116     ParsePathInternal(spec, MakeRange(next_slash, spec_len),
    117                       &parsed->path, &parsed->query, &parsed->ref);
    118   } else {
    119     parsed->path.reset();
    120   }
    121 }
    122 
    123 // A subcomponent of DoParseFileURL, the input should be a local file, with the
    124 // beginning of the path indicated by the index in |path_begin|. This will
    125 // initialize the host, path, query, and ref, and leave the other output
    126 // components untouched (DoInitFileURL handles these for us).
    127 template<typename CHAR>
    128 void DoParseLocalFile(const CHAR* spec,
    129                       int path_begin,
    130                       int spec_len,
    131                       Parsed* parsed) {
    132   parsed->host.reset();
    133   ParsePathInternal(spec, MakeRange(path_begin, spec_len),
    134                     &parsed->path, &parsed->query, &parsed->ref);
    135 }
    136 
    137 // Backend for the external functions that operates on either char type.
    138 // We are handed the character after the "file:" at the beginning of the spec.
    139 // Usually this is a slash, but needn't be; we allow paths like "file:c:\foo".
    140 template<typename CHAR>
    141 void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
    142   DCHECK(spec_len >= 0);
    143 
    144   // Get the parts we never use for file URLs out of the way.
    145   parsed->username.reset();
    146   parsed->password.reset();
    147   parsed->port.reset();
    148 
    149   // Many of the code paths don't set these, so it's convenient to just clear
    150   // them. We'll write them in those cases we need them.
    151   parsed->query.reset();
    152   parsed->ref.reset();
    153 
    154   // Strip leading & trailing spaces and control characters.
    155   int begin = 0;
    156   TrimURL(spec, &begin, &spec_len);
    157 
    158   // Find the scheme.
    159   int num_slashes;
    160   int after_scheme;
    161   int after_slashes;
    162 #ifdef WIN32
    163   // See how many slashes there are. We want to handle cases like UNC but also
    164   // "/c:/foo". This is when there is no scheme, so we can allow pages to do
    165   // links like "c:/foo/bar" or "//foo/bar". This is also called by the
    166   // relative URL resolver when it determines there is an absolute URL, which
    167   // may give us input like "/c:/foo".
    168   num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
    169   after_slashes = begin + num_slashes;
    170   if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
    171     // Windows path, don't try to extract the scheme (for example, "c:\foo").
    172     parsed->scheme.reset();
    173     after_scheme = after_slashes;
    174   } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
    175     // Windows UNC path: don't try to extract the scheme, but keep the slashes.
    176     parsed->scheme.reset();
    177     after_scheme = begin;
    178   } else
    179 #endif
    180   {
    181     if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
    182       // Offset the results since we gave ExtractScheme a substring.
    183       parsed->scheme.begin += begin;
    184       after_scheme = parsed->scheme.end() + 1;
    185     } else {
    186       // No scheme found, remember that.
    187       parsed->scheme.reset();
    188       after_scheme = begin;
    189     }
    190   }
    191 
    192   // Handle empty specs ones that contain only whitespace or control chars,
    193   // or that are just the scheme (for example "file:").
    194   if (after_scheme == spec_len) {
    195     parsed->host.reset();
    196     parsed->path.reset();
    197     return;
    198   }
    199 
    200   num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
    201 
    202   after_slashes = after_scheme + num_slashes;
    203 #ifdef WIN32
    204   // Check whether the input is a drive again. We checked above for windows
    205   // drive specs, but that's only at the very beginning to see if we have a
    206   // scheme at all. This test will be duplicated in that case, but will
    207   // additionally handle all cases with a real scheme such as "file:///C:/".
    208   if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
    209       num_slashes != 3) {
    210     // Anything not beginning with a drive spec ("c:\") on Windows is treated
    211     // as UNC, with the exception of three slashes which always means a file.
    212     // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
    213     DoParseUNC(spec, after_slashes, spec_len, parsed);
    214     return;
    215   }
    216 #else
    217   // file: URL with exactly 2 slashes is considered to have a host component.
    218   if (num_slashes == 2) {
    219     DoParseUNC(spec, after_slashes, spec_len, parsed);
    220     return;
    221   }
    222 #endif  // WIN32
    223 
    224   // Easy and common case, the full path immediately follows the scheme
    225   // (modulo slashes), as in "file://c:/foo". Just treat everything from
    226   // there to the end as the path. Empty hosts have 0 length instead of -1.
    227   // We include the last slash as part of the path if there is one.
    228   DoParseLocalFile(spec,
    229       num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
    230       spec_len, parsed);
    231 }
    232 
    233 }  // namespace
    234 
    235 void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
    236   DoParseFileURL(url, url_len, parsed);
    237 }
    238 
    239 void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
    240   DoParseFileURL(url, url_len, parsed);
    241 }
    242 
    243 }  // namespace url_parse
    244