Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/logging.h"
      6 #include "url/url_file.h"
      7 #include "url/url_parse.h"
      8 #include "url/url_parse_internal.h"
      9 
     10 // Interesting IE file:isms...
     11 //
     12 //  INPUT                      OUTPUT
     13 //  =========================  ==============================
     14 //  file:/foo/bar              file:///foo/bar
     15 //      The result here seems totally invalid!?!? This isn't UNC.
     16 //
     17 //  file:/
     18 //  file:// or any other number of slashes
     19 //      IE6 doesn't do anything at all if you click on this link. No error:
     20 //      nothing. IE6's history system seems to always color this link, so I'm
     21 //      guessing that it maps internally to the empty URL.
     22 //
     23 //  C:\                        file:///C:/
     24 //      When on a file: URL source page, this link will work. When over HTTP,
     25 //      the file: URL will appear in the status bar but the link will not work
     26 //      (security restriction for all file URLs).
     27 //
     28 //  file:foo/                  file:foo/     (invalid?!?!?)
     29 //  file:/foo/                 file:///foo/  (invalid?!?!?)
     30 //  file://foo/                file://foo/   (UNC to server "foo")
     31 //  file:///foo/               file:///foo/  (invalid, seems to be a file)
     32 //  file:////foo/              file://foo/   (UNC to server "foo")
     33 //      Any more than four slashes is also treated as UNC.
     34 //
     35 //  file:C:/                   file://C:/
     36 //  file:/C:/                  file://C:/
//      The number of slashes after "file:" doesn't matter if the thing
//      following it looks like an absolute drive path. Also, slashes and
//      backslashes are equally valid here.
     40 
     41 namespace url {
     42 
     43 namespace {
     44 
     45 // A subcomponent of DoInitFileURL, the input of this function should be a UNC
     46 // path name, with the index of the first character after the slashes following
     47 // the scheme given in |after_slashes|. This will initialize the host, path,
     48 // query, and ref, and leave the other output components untouched
     49 // (DoInitFileURL handles these for us).
     50 template<typename CHAR>
     51 void DoParseUNC(const CHAR* spec,
     52                 int after_slashes,
     53                 int spec_len,
     54                Parsed* parsed) {
     55   int next_slash = FindNextSlash(spec, after_slashes, spec_len);
     56   if (next_slash == spec_len) {
     57     // No additional slash found, as in "file://foo", treat the text as the
     58     // host with no path (this will end up being UNC to server "foo").
     59     int host_len = spec_len - after_slashes;
     60     if (host_len)
     61       parsed->host = Component(after_slashes, host_len);
     62     else
     63       parsed->host.reset();
     64     parsed->path.reset();
     65     return;
     66   }
     67 
     68 #ifdef WIN32
     69   // See if we have something that looks like a path following the first
     70   // component. As in "file://localhost/c:/", we get "c:/" out. We want to
     71   // treat this as a having no host but the path given. Works on Windows only.
     72   if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
     73     parsed->host.reset();
     74     ParsePathInternal(spec, MakeRange(next_slash, spec_len),
     75                       &parsed->path, &parsed->query, &parsed->ref);
     76     return;
     77   }
     78 #endif
     79 
     80   // Otherwise, everything up until that first slash we found is the host name,
     81   // which will end up being the UNC host. For example "file://foo/bar.txt"
     82   // will get a server name of "foo" and a path of "/bar". Later, on Windows,
     83   // this should be treated as the filename "\\foo\bar.txt" in proper UNC
     84   // notation.
     85   int host_len = next_slash - after_slashes;
     86   if (host_len)
     87     parsed->host = MakeRange(after_slashes, next_slash);
     88   else
     89     parsed->host.reset();
     90   if (next_slash < spec_len) {
     91     ParsePathInternal(spec, MakeRange(next_slash, spec_len),
     92                       &parsed->path, &parsed->query, &parsed->ref);
     93   } else {
     94     parsed->path.reset();
     95   }
     96 }
     97 
     98 // A subcomponent of DoParseFileURL, the input should be a local file, with the
     99 // beginning of the path indicated by the index in |path_begin|. This will
    100 // initialize the host, path, query, and ref, and leave the other output
    101 // components untouched (DoInitFileURL handles these for us).
    102 template<typename CHAR>
    103 void DoParseLocalFile(const CHAR* spec,
    104                       int path_begin,
    105                       int spec_len,
    106                       Parsed* parsed) {
    107   parsed->host.reset();
    108   ParsePathInternal(spec, MakeRange(path_begin, spec_len),
    109                     &parsed->path, &parsed->query, &parsed->ref);
    110 }
    111 
    112 // Backend for the external functions that operates on either char type.
    113 // Handles cases where there is a scheme, but also when handed the first
    114 // character following the "file:" at the beginning of the spec. If so,
    115 // this is usually a slash, but needn't be; we allow paths like "file:c:\foo".
    116 template<typename CHAR>
    117 void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
    118   DCHECK(spec_len >= 0);
    119 
    120   // Get the parts we never use for file URLs out of the way.
    121   parsed->username.reset();
    122   parsed->password.reset();
    123   parsed->port.reset();
    124 
    125   // Many of the code paths don't set these, so it's convenient to just clear
    126   // them. We'll write them in those cases we need them.
    127   parsed->query.reset();
    128   parsed->ref.reset();
    129 
    130   // Strip leading & trailing spaces and control characters.
    131   int begin = 0;
    132   TrimURL(spec, &begin, &spec_len);
    133 
    134   // Find the scheme, if any.
    135   int num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
    136   int after_scheme;
    137   int after_slashes;
    138 #ifdef WIN32
    139   // See how many slashes there are. We want to handle cases like UNC but also
    140   // "/c:/foo". This is when there is no scheme, so we can allow pages to do
    141   // links like "c:/foo/bar" or "//foo/bar". This is also called by the
    142   // relative URL resolver when it determines there is an absolute URL, which
    143   // may give us input like "/c:/foo".
    144   after_slashes = begin + num_slashes;
    145   if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
    146     // Windows path, don't try to extract the scheme (for example, "c:\foo").
    147     parsed->scheme.reset();
    148     after_scheme = after_slashes;
    149   } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
    150     // Windows UNC path: don't try to extract the scheme, but keep the slashes.
    151     parsed->scheme.reset();
    152     after_scheme = begin;
    153   } else
    154 #endif
    155   {
    156     // ExtractScheme doesn't understand the possibility of filenames with
    157     // colons in them, in which case it returns the entire spec up to the
    158     // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as
    159     // the foo.c: scheme.
    160     if (!num_slashes &&
    161         ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
    162       // Offset the results since we gave ExtractScheme a substring.
    163       parsed->scheme.begin += begin;
    164       after_scheme = parsed->scheme.end() + 1;
    165     } else {
    166       // No scheme found, remember that.
    167       parsed->scheme.reset();
    168       after_scheme = begin;
    169     }
    170   }
    171 
    172   // Handle empty specs ones that contain only whitespace or control chars,
    173   // or that are just the scheme (for example "file:").
    174   if (after_scheme == spec_len) {
    175     parsed->host.reset();
    176     parsed->path.reset();
    177     return;
    178   }
    179 
    180   num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
    181   after_slashes = after_scheme + num_slashes;
    182 #ifdef WIN32
    183   // Check whether the input is a drive again. We checked above for windows
    184   // drive specs, but that's only at the very beginning to see if we have a
    185   // scheme at all. This test will be duplicated in that case, but will
    186   // additionally handle all cases with a real scheme such as "file:///C:/".
    187   if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
    188       num_slashes != 3) {
    189     // Anything not beginning with a drive spec ("c:\") on Windows is treated
    190     // as UNC, with the exception of three slashes which always means a file.
    191     // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
    192     DoParseUNC(spec, after_slashes, spec_len, parsed);
    193     return;
    194   }
    195 #else
    196   // file: URL with exactly 2 slashes is considered to have a host component.
    197   if (num_slashes == 2) {
    198     DoParseUNC(spec, after_slashes, spec_len, parsed);
    199     return;
    200   }
    201 #endif  // WIN32
    202 
    203   // Easy and common case, the full path immediately follows the scheme
    204   // (modulo slashes), as in "file://c:/foo". Just treat everything from
    205   // there to the end as the path. Empty hosts have 0 length instead of -1.
    206   // We include the last slash as part of the path if there is one.
    207   DoParseLocalFile(spec,
    208       num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
    209       spec_len, parsed);
    210 }
    211 
    212 }  // namespace
    213 
    214 void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
    215   DoParseFileURL(url, url_len, parsed);
    216 }
    217 
    218 void ParseFileURL(const base::char16* url, int url_len, Parsed* parsed) {
    219   DoParseFileURL(url, url_len, parsed);
    220 }
    221 
    222 }  // namespace url
    223