Home | History | Annotate | Download | only in net
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/net/url_fixer_upper.h"
      6 
      7 #include <algorithm>
      8 
      9 #if defined(OS_POSIX)
     10 #include "base/environment.h"
     11 #endif
     12 #include "base/file_util.h"
     13 #include "base/logging.h"
     14 #include "base/string_util.h"
     15 #include "base/utf_string_conversions.h"
     16 #include "chrome/common/url_constants.h"
     17 #include "googleurl/src/url_file.h"
     18 #include "googleurl/src/url_parse.h"
     19 #include "googleurl/src/url_util.h"
     20 #include "net/base/escape.h"
     21 #include "net/base/net_util.h"
     22 #include "net/base/registry_controlled_domain.h"
     23 
     24 const char* URLFixerUpper::home_directory_override = NULL;
     25 
     26 namespace {
     27 
     28 // TODO(estade): Remove these ugly, ugly functions. They are only used in
     29 // SegmentURL. A url_parse::Parsed object keeps track of a bunch of indices into
     30 // a url string, and these need to be updated when the URL is converted from
     31 // UTF8 to UTF16. Instead of this after-the-fact adjustment, we should parse it
     32 // in the correct string format to begin with.
     33 url_parse::Component UTF8ComponentToUTF16Component(
     34     const std::string& text_utf8,
     35     const url_parse::Component& component_utf8) {
     36   if (component_utf8.len == -1)
     37     return url_parse::Component();
     38 
     39   std::string before_component_string =
     40       text_utf8.substr(0, component_utf8.begin);
     41   std::string component_string = text_utf8.substr(component_utf8.begin,
     42                                                   component_utf8.len);
     43   string16 before_component_string_16 = UTF8ToUTF16(before_component_string);
     44   string16 component_string_16 = UTF8ToUTF16(component_string);
     45   url_parse::Component component_16(before_component_string_16.length(),
     46                                     component_string_16.length());
     47   return component_16;
     48 }
     49 
     50 void UTF8PartsToUTF16Parts(const std::string& text_utf8,
     51                            const url_parse::Parsed& parts_utf8,
     52                            url_parse::Parsed* parts) {
     53   if (IsStringASCII(text_utf8)) {
     54     *parts = parts_utf8;
     55     return;
     56   }
     57 
     58   parts->scheme =
     59       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.scheme);
     60   parts ->username =
     61       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.username);
     62   parts->password =
     63       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.password);
     64   parts->host =
     65       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.host);
     66   parts->port =
     67       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.port);
     68   parts->path =
     69       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.path);
     70   parts->query =
     71       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.query);
     72   parts->ref =
     73       UTF8ComponentToUTF16Component(text_utf8, parts_utf8.ref);
     74 }
     75 
     76 TrimPositions TrimWhitespaceUTF8(const std::string& input,
     77                                  TrimPositions positions,
     78                                  std::string* output) {
     79   // This implementation is not so fast since it converts the text encoding
     80   // twice. Please feel free to file a bug if this function hurts the
     81   // performance of Chrome.
     82   DCHECK(IsStringUTF8(input));
     83   std::wstring input_wide = UTF8ToWide(input);
     84   std::wstring output_wide;
     85   TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);
     86   *output = WideToUTF8(output_wide);
     87   return result;
     88 }
     89 
     90 }  // namespace
     91 
     92 // does some basic fixes for input that we want to test for file-ness
     93 static void PrepareStringForFileOps(const FilePath& text,
     94                                     FilePath::StringType* output) {
     95 #if defined(OS_WIN)
     96   TrimWhitespace(text.value(), TRIM_ALL, output);
     97   replace(output->begin(), output->end(), '/', '\\');
     98 #else
     99   TrimWhitespaceUTF8(text.value(), TRIM_ALL, output);
    100 #endif
    101 }
    102 
    103 // Tries to create a full path from |text|.  If the result is valid and the
    104 // file exists, returns true and sets |full_path| to the result.  Otherwise,
    105 // returns false and leaves |full_path| unchanged.
    106 static bool ValidPathForFile(const FilePath::StringType& text,
    107                              FilePath* full_path) {
    108   FilePath file_path(text);
    109   if (!file_util::AbsolutePath(&file_path))
    110     return false;
    111 
    112   if (!file_util::PathExists(file_path))
    113     return false;
    114 
    115   *full_path = file_path;
    116   return true;
    117 }
    118 
    119 #if defined(OS_POSIX)
    120 // Given a path that starts with ~, return a path that starts with an
    121 // expanded-out /user/foobar directory.
    122 static std::string FixupHomedir(const std::string& text) {
    123   DCHECK(text.length() > 0 && text[0] == '~');
    124 
    125   if (text.length() == 1 || text[1] == '/') {
    126     const char* home = getenv(base::env_vars::kHome);
    127     if (URLFixerUpper::home_directory_override)
    128       home = URLFixerUpper::home_directory_override;
    129     // We'll probably break elsewhere if $HOME is undefined, but check here
    130     // just in case.
    131     if (!home)
    132       return text;
    133     return home + text.substr(1);
    134   }
    135 
    136   // Otherwise, this is a path like ~foobar/baz, where we must expand to
    137   // user foobar's home directory.  Officially, we should use getpwent(),
    138   // but that is a nasty blocking call.
    139 
    140 #if defined(OS_MACOSX)
    141   static const char kHome[] = "/Users/";
    142 #else
    143   static const char kHome[] = "/home/";
    144 #endif
    145   return kHome + text.substr(1);
    146 }
    147 #endif
    148 
    149 // Tries to create a file: URL from |text| if it looks like a filename, even if
    150 // it doesn't resolve as a valid path or to an existing file.  Returns a
    151 // (possibly invalid) file: URL in |fixed_up_url| for input beginning
    152 // with a drive specifier or "\\".  Returns the unchanged input in other cases
    153 // (including file: URLs: these don't look like filenames).
    154 static std::string FixupPath(const std::string& text) {
    155   DCHECK(!text.empty());
    156 
    157   FilePath::StringType filename;
    158 #if defined(OS_WIN)
    159   FilePath input_path(UTF8ToWide(text));
    160   PrepareStringForFileOps(input_path, &filename);
    161 
    162   // Fixup Windows-style drive letters, where "C:" gets rewritten to "C|".
    163   if (filename.length() > 1 && filename[1] == '|')
    164     filename[1] = ':';
    165 #elif defined(OS_POSIX)
    166   FilePath input_path(text);
    167   PrepareStringForFileOps(input_path, &filename);
    168   if (filename.length() > 0 && filename[0] == '~')
    169     filename = FixupHomedir(filename);
    170 #endif
    171 
    172   // Here, we know the input looks like a file.
    173   GURL file_url = net::FilePathToFileURL(FilePath(filename));
    174   if (file_url.is_valid()) {
    175     return UTF16ToUTF8(net::FormatUrl(file_url, std::string(),
    176         net::kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL,
    177         NULL, NULL));
    178   }
    179 
    180   // Invalid file URL, just return the input.
    181   return text;
    182 }
    183 
    184 // Checks |domain| to see if a valid TLD is already present.  If not, appends
    185 // |desired_tld| to the domain, and prepends "www." unless it's already present.
    186 static void AddDesiredTLD(const std::string& desired_tld,
    187                           std::string* domain) {
    188   if (desired_tld.empty() || domain->empty())
    189     return;
    190 
    191   // Check the TLD.  If the return value is positive, we already have a TLD, so
    192   // abort.  If the return value is std::string::npos, there's no valid host,
    193   // but we can try to append a TLD anyway, since the host may become valid once
    194   // the TLD is attached -- for example, "999999999999" is detected as a broken
    195   // IP address and marked invalid, but attaching ".com" makes it legal.  When
    196   // the return value is 0, there's a valid host with no known TLD, so we can
    197   // definitely append the user's TLD.  We disallow unknown registries here so
    198   // users can input "mail.yahoo" and hit ctrl-enter to get
    199   // "www.mail.yahoo.com".
    200   const size_t registry_length =
    201       net::RegistryControlledDomainService::GetRegistryLength(*domain, false);
    202   if ((registry_length != 0) && (registry_length != std::string::npos))
    203     return;
    204 
    205   // Add the suffix at the end of the domain.
    206   const size_t domain_length(domain->length());
    207   DCHECK_GT(domain_length, 0U);
    208   DCHECK_NE(desired_tld[0], '.');
    209   if ((*domain)[domain_length - 1] != '.')
    210     domain->push_back('.');
    211   domain->append(desired_tld);
    212 
    213   // Now, if the domain begins with "www.", stop.
    214   const std::string prefix("www.");
    215   if (domain->compare(0, prefix.length(), prefix) != 0) {
    216     // Otherwise, add www. to the beginning of the URL.
    217     domain->insert(0, prefix);
    218   }
    219 }
    220 
    221 static inline void FixupUsername(const std::string& text,
    222                                  const url_parse::Component& part,
    223                                  std::string* url) {
    224   if (!part.is_valid())
    225     return;
    226 
    227   // We don't fix up the username at the moment.
    228   url->append(text, part.begin, part.len);
    229   // Do not append the trailing '@' because we might need to include the user's
    230   // password.  FixupURL itself will append the '@' for us.
    231 }
    232 
    233 static inline void FixupPassword(const std::string& text,
    234                                  const url_parse::Component& part,
    235                                  std::string* url) {
    236   if (!part.is_valid())
    237     return;
    238 
    239   // We don't fix up the password at the moment.
    240   url->append(":");
    241   url->append(text, part.begin, part.len);
    242 }
    243 
    244 static void FixupHost(const std::string& text,
    245                       const url_parse::Component& part,
    246                       bool has_scheme,
    247                       const std::string& desired_tld,
    248                       std::string* url) {
    249   if (!part.is_valid())
    250     return;
    251 
    252   // Make domain valid.
    253   // Strip all leading dots and all but one trailing dot, unless the user only
    254   // typed dots, in which case their input is totally invalid and we should just
    255   // leave it unchanged.
    256   std::string domain(text, part.begin, part.len);
    257   const size_t first_nondot(domain.find_first_not_of('.'));
    258   if (first_nondot != std::string::npos) {
    259     domain.erase(0, first_nondot);
    260     size_t last_nondot(domain.find_last_not_of('.'));
    261     DCHECK(last_nondot != std::string::npos);
    262     last_nondot += 2;  // Point at second period in ending string
    263     if (last_nondot < domain.length())
    264       domain.erase(last_nondot);
    265   }
    266 
    267   // Add any user-specified TLD, if applicable.
    268   AddDesiredTLD(desired_tld, &domain);
    269 
    270   url->append(domain);
    271 }
    272 
    273 static void FixupPort(const std::string& text,
    274                       const url_parse::Component& part,
    275                       std::string* url) {
    276   if (!part.is_valid())
    277     return;
    278 
    279   // We don't fix up the port at the moment.
    280   url->append(":");
    281   url->append(text, part.begin, part.len);
    282 }
    283 
    284 static inline void FixupPath(const std::string& text,
    285                              const url_parse::Component& part,
    286                              std::string* url) {
    287   if (!part.is_valid() || part.len == 0) {
    288     // We should always have a path.
    289     url->append("/");
    290     return;
    291   }
    292 
    293   // Append the path as is.
    294   url->append(text, part.begin, part.len);
    295 }
    296 
    297 static inline void FixupQuery(const std::string& text,
    298                               const url_parse::Component& part,
    299                               std::string* url) {
    300   if (!part.is_valid())
    301     return;
    302 
    303   // We don't fix up the query at the moment.
    304   url->append("?");
    305   url->append(text, part.begin, part.len);
    306 }
    307 
    308 static inline void FixupRef(const std::string& text,
    309                             const url_parse::Component& part,
    310                             std::string* url) {
    311   if (!part.is_valid())
    312     return;
    313 
    314   // We don't fix up the ref at the moment.
    315   url->append("#");
    316   url->append(text, part.begin, part.len);
    317 }
    318 
    319 static bool HasPort(const std::string& original_text,
    320                     const url_parse::Component& scheme_component) {
    321   // Find the range between the ":" and the "/".
    322   size_t port_start = scheme_component.end() + 1;
    323   size_t port_end = port_start;
    324   while ((port_end < original_text.length()) &&
    325          !url_parse::IsAuthorityTerminator(original_text[port_end]))
    326     ++port_end;
    327   if (port_end == port_start)
    328     return false;
    329 
    330   // Scan the range to see if it is entirely digits.
    331   for (size_t i = port_start; i < port_end; ++i) {
    332     if (!IsAsciiDigit(original_text[i]))
    333       return false;
    334   }
    335 
    336   return true;
    337 }
    338 
    339 // Try to extract a valid scheme from the beginning of |text|.
    340 // If successful, set |scheme_component| to the text range where the scheme
    341 // was located, and fill |canon_scheme| with its canonicalized form.
    342 // Otherwise, return false and leave the outputs in an indeterminate state.
    343 static bool GetValidScheme(const std::string &text,
    344                            url_parse::Component* scheme_component,
    345                            std::string* canon_scheme) {
    346   // Locate everything up to (but not including) the first ':'
    347   if (!url_parse::ExtractScheme(text.data(), static_cast<int>(text.length()),
    348                                 scheme_component))
    349     return false;
    350 
    351   // Make sure the scheme contains only valid characters, and convert
    352   // to lowercase.  This also catches IPv6 literals like [::1], because
    353   // brackets are not in the whitelist.
    354   url_canon::StdStringCanonOutput canon_scheme_output(canon_scheme);
    355   url_parse::Component canon_scheme_component;
    356   if (!url_canon::CanonicalizeScheme(text.data(), *scheme_component,
    357                                      &canon_scheme_output,
    358                                      &canon_scheme_component))
    359     return false;
    360 
    361   // Strip the ':', and any trailing buffer space.
    362   DCHECK_EQ(0, canon_scheme_component.begin);
    363   canon_scheme->erase(canon_scheme_component.len);
    364 
    365   // We need to fix up the segmentation for "www.example.com:/".  For this
    366   // case, we guess that schemes with a "." are not actually schemes.
    367   if (canon_scheme->find('.') != std::string::npos)
    368     return false;
    369 
    370   // We need to fix up the segmentation for "www:123/".  For this case, we
    371   // will add an HTTP scheme later and make the URL parser happy.
    372   // TODO(pkasting): Maybe we should try to use GURL's parser for this?
    373   if (HasPort(text, *scheme_component))
    374     return false;
    375 
    376   // Everything checks out.
    377   return true;
    378 }
    379 
    380 std::string URLFixerUpper::SegmentURL(const std::string& text,
    381                                       url_parse::Parsed* parts) {
    382   // Initialize the result.
    383   *parts = url_parse::Parsed();
    384 
    385   std::string trimmed;
    386   TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed);
    387   if (trimmed.empty())
    388     return std::string();  // Nothing to segment.
    389 
    390 #if defined(OS_WIN)
    391   int trimmed_length = static_cast<int>(trimmed.length());
    392   if (url_parse::DoesBeginWindowsDriveSpec(trimmed.data(), 0, trimmed_length) ||
    393       url_parse::DoesBeginUNCPath(trimmed.data(), 0, trimmed_length, true))
    394     return "file";
    395 #elif defined(OS_POSIX)
    396   if (FilePath::IsSeparator(trimmed.data()[0]) || trimmed.data()[0] == '~')
    397     return "file";
    398 #endif
    399 
    400   // Otherwise, we need to look at things carefully.
    401   std::string scheme;
    402   if (!GetValidScheme(text, &parts->scheme, &scheme)) {
    403     // Couldn't determine the scheme, so just pick one.
    404     parts->scheme.reset();
    405     scheme.assign(StartsWithASCII(text, "ftp.", false) ?
    406         chrome::kFtpScheme : chrome::kHttpScheme);
    407   }
    408 
    409   // Not segmenting file schemes or nonstandard schemes.
    410   if ((scheme == chrome::kFileScheme) ||
    411       !url_util::IsStandard(scheme.c_str(),
    412       url_parse::Component(0, static_cast<int>(scheme.length()))))
    413     return scheme;
    414 
    415   if (parts->scheme.is_valid()) {
    416     // Have the GURL parser do the heavy lifting for us.
    417     url_parse::ParseStandardURL(text.data(), static_cast<int>(text.length()),
    418                                 parts);
    419     return scheme;
    420   }
    421 
    422   // We need to add a scheme in order for ParseStandardURL to be happy.
    423   // Find the first non-whitespace character.
    424   std::string::const_iterator first_nonwhite = text.begin();
    425   while ((first_nonwhite != text.end()) && IsWhitespace(*first_nonwhite))
    426     ++first_nonwhite;
    427 
    428   // Construct the text to parse by inserting the scheme.
    429   std::string inserted_text(scheme);
    430   inserted_text.append("://");
    431   std::string text_to_parse(text.begin(), first_nonwhite);
    432   text_to_parse.append(inserted_text);
    433   text_to_parse.append(first_nonwhite, text.end());
    434 
    435   // Have the GURL parser do the heavy lifting for us.
    436   url_parse::ParseStandardURL(text_to_parse.data(),
    437                               static_cast<int>(text_to_parse.length()),
    438                               parts);
    439 
    440   // Offset the results of the parse to match the original text.
    441   const int offset = -static_cast<int>(inserted_text.length());
    442   OffsetComponent(offset, &parts->scheme);
    443   OffsetComponent(offset, &parts->username);
    444   OffsetComponent(offset, &parts->password);
    445   OffsetComponent(offset, &parts->host);
    446   OffsetComponent(offset, &parts->port);
    447   OffsetComponent(offset, &parts->path);
    448   OffsetComponent(offset, &parts->query);
    449   OffsetComponent(offset, &parts->ref);
    450 
    451   return scheme;
    452 }
    453 
    454 GURL URLFixerUpper::FixupURL(const std::string& text,
    455                              const std::string& desired_tld) {
    456   std::string trimmed;
    457   TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed);
    458   if (trimmed.empty())
    459     return GURL();  // Nothing here.
    460 
    461   // Segment the URL.
    462   url_parse::Parsed parts;
    463   std::string scheme(SegmentURL(trimmed, &parts));
    464 
    465   // For view-source: URLs, we strip "view-source:", do fixup, and stick it back
    466   // on.  This allows us to handle things like "view-source:google.com".
    467   if (scheme == chrome::kViewSourceScheme) {
    468     // Reject "view-source:view-source:..." to avoid deep recursion.
    469     std::string view_source(chrome::kViewSourceScheme + std::string(":"));
    470     if (!StartsWithASCII(text, view_source + view_source, false)) {
    471       return GURL(chrome::kViewSourceScheme + std::string(":") +
    472           FixupURL(trimmed.substr(scheme.length() + 1),
    473                    desired_tld).possibly_invalid_spec());
    474     }
    475   }
    476 
    477   // We handle the file scheme separately.
    478   if (scheme == chrome::kFileScheme)
    479     return GURL(parts.scheme.is_valid() ? text : FixupPath(text));
    480 
    481   // For some schemes whose layouts we understand, we rebuild it.
    482   if (url_util::IsStandard(scheme.c_str(),
    483           url_parse::Component(0, static_cast<int>(scheme.length())))) {
    484     std::string url(scheme);
    485     url.append("://");
    486 
    487     // We need to check whether the |username| is valid because it is our
    488     // responsibility to append the '@' to delineate the user information from
    489     // the host portion of the URL.
    490     if (parts.username.is_valid()) {
    491       FixupUsername(trimmed, parts.username, &url);
    492       FixupPassword(trimmed, parts.password, &url);
    493       url.append("@");
    494     }
    495 
    496     FixupHost(trimmed, parts.host, parts.scheme.is_valid(), desired_tld, &url);
    497     FixupPort(trimmed, parts.port, &url);
    498     FixupPath(trimmed, parts.path, &url);
    499     FixupQuery(trimmed, parts.query, &url);
    500     FixupRef(trimmed, parts.ref, &url);
    501 
    502     return GURL(url);
    503   }
    504 
    505   // In the worst-case, we insert a scheme if the URL lacks one.
    506   if (!parts.scheme.is_valid()) {
    507     std::string fixed_scheme(scheme);
    508     fixed_scheme.append("://");
    509     trimmed.insert(0, fixed_scheme);
    510   }
    511 
    512   return GURL(trimmed);
    513 }
    514 
    515 // The rules are different here than for regular fixup, since we need to handle
    516 // input like "hello.html" and know to look in the current directory.  Regular
    517 // fixup will look for cues that it is actually a file path before trying to
    518 // figure out what file it is.  If our logic doesn't work, we will fall back on
    519 // regular fixup.
    520 GURL URLFixerUpper::FixupRelativeFile(const FilePath& base_dir,
    521                                       const FilePath& text) {
    522   FilePath old_cur_directory;
    523   if (!base_dir.empty()) {
    524     // Save the old current directory before we move to the new one.
    525     file_util::GetCurrentDirectory(&old_cur_directory);
    526     file_util::SetCurrentDirectory(base_dir);
    527   }
    528 
    529   // Allow funny input with extra whitespace and the wrong kind of slashes.
    530   FilePath::StringType trimmed;
    531   PrepareStringForFileOps(text, &trimmed);
    532 
    533   bool is_file = true;
    534   FilePath full_path;
    535   if (!ValidPathForFile(trimmed, &full_path)) {
    536     // Not a path as entered, try unescaping it in case the user has
    537     // escaped things. We need to go through 8-bit since the escaped values
    538     // only represent 8-bit values.
    539 #if defined(OS_WIN)
    540     std::wstring unescaped = UTF8ToWide(UnescapeURLComponent(
    541         WideToUTF8(trimmed),
    542         UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
    543 #elif defined(OS_POSIX)
    544     std::string unescaped = UnescapeURLComponent(
    545         trimmed,
    546         UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
    547 #endif
    548 
    549     if (!ValidPathForFile(unescaped, &full_path))
    550       is_file = false;
    551   }
    552 
    553   // Put back the current directory if we saved it.
    554   if (!base_dir.empty())
    555     file_util::SetCurrentDirectory(old_cur_directory);
    556 
    557   if (is_file) {
    558     GURL file_url = net::FilePathToFileURL(full_path);
    559     if (file_url.is_valid())
    560       return GURL(UTF16ToUTF8(net::FormatUrl(file_url, std::string(),
    561           net::kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL,
    562           NULL, NULL)));
    563     // Invalid files fall through to regular processing.
    564   }
    565 
    566   // Fall back on regular fixup for this input.
    567 #if defined(OS_WIN)
    568   std::string text_utf8 = WideToUTF8(text.value());
    569 #elif defined(OS_POSIX)
    570   std::string text_utf8 = text.value();
    571 #endif
    572   return FixupURL(text_utf8, std::string());
    573 }
    574 
    575 string16 URLFixerUpper::SegmentURL(const string16& text,
    576                                    url_parse::Parsed* parts) {
    577   std::string text_utf8 = UTF16ToUTF8(text);
    578   url_parse::Parsed parts_utf8;
    579   std::string scheme_utf8 = SegmentURL(text_utf8, &parts_utf8);
    580   UTF8PartsToUTF16Parts(text_utf8, parts_utf8, parts);
    581   return UTF8ToUTF16(scheme_utf8);
    582 }
    583 
    584 void URLFixerUpper::OffsetComponent(int offset, url_parse::Component* part) {
    585   DCHECK(part);
    586 
    587   if (part->is_valid()) {
    588     // Offset the location of this component.
    589     part->begin += offset;
    590 
    591     // This part might not have existed in the original text.
    592     if (part->begin < 0)
    593       part->reset();
    594   }
    595 }
    596