Home | History | Annotate | Download | only in autocomplete
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/autocomplete/autocomplete_input.h"
      6 
      7 #include "base/strings/string_util.h"
      8 #include "base/strings/utf_string_conversions.h"
      9 #include "chrome/browser/external_protocol/external_protocol_handler.h"
     10 #include "chrome/browser/profiles/profile_io_data.h"
     11 #include "chrome/common/net/url_fixer_upper.h"
     12 #include "content/public/common/url_constants.h"
     13 #include "net/base/net_util.h"
     14 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
     15 #include "url/url_canon_ip.h"
     16 #include "url/url_util.h"
     17 
     18 namespace {
     19 
     20 void AdjustCursorPositionIfNecessary(size_t num_leading_chars_removed,
     21                                      size_t* cursor_position) {
     22   if (*cursor_position == base::string16::npos)
     23     return;
     24   if (num_leading_chars_removed < *cursor_position)
     25     *cursor_position -= num_leading_chars_removed;
     26   else
     27     *cursor_position = 0;
     28 }
     29 
     30 }  // namespace
     31 
     32 AutocompleteInput::AutocompleteInput()
     33     : cursor_position_(base::string16::npos),
     34       current_page_classification_(AutocompleteInput::INVALID_SPEC),
     35       type_(INVALID),
     36       prevent_inline_autocomplete_(false),
     37       prefer_keyword_(false),
     38       allow_exact_keyword_match_(true),
     39       matches_requested_(ALL_MATCHES) {
     40 }
     41 
     42 AutocompleteInput::AutocompleteInput(
     43     const base::string16& text,
     44     size_t cursor_position,
     45     const base::string16& desired_tld,
     46     const GURL& current_url,
     47     AutocompleteInput::PageClassification current_page_classification,
     48     bool prevent_inline_autocomplete,
     49     bool prefer_keyword,
     50     bool allow_exact_keyword_match,
     51     MatchesRequested matches_requested)
     52     : cursor_position_(cursor_position),
     53       current_url_(current_url),
     54       current_page_classification_(current_page_classification),
     55       prevent_inline_autocomplete_(prevent_inline_autocomplete),
     56       prefer_keyword_(prefer_keyword),
     57       allow_exact_keyword_match_(allow_exact_keyword_match),
     58       matches_requested_(matches_requested) {
     59   DCHECK(cursor_position <= text.length() ||
     60          cursor_position == base::string16::npos)
     61       << "Text: '" << text << "', cp: " << cursor_position;
     62   // None of the providers care about leading white space so we always trim it.
     63   // Providers that care about trailing white space handle trimming themselves.
     64   if ((TrimWhitespace(text, TRIM_LEADING, &text_) & TRIM_LEADING) != 0)
     65     AdjustCursorPositionIfNecessary(text.length() - text_.length(),
     66                                     &cursor_position_);
     67 
     68   GURL canonicalized_url;
     69   type_ = Parse(text_, desired_tld, &parts_, &scheme_, &canonicalized_url);
     70 
     71   if (type_ == INVALID)
     72     return;
     73 
     74   if (((type_ == UNKNOWN) || (type_ == URL)) &&
     75       canonicalized_url.is_valid() &&
     76       (!canonicalized_url.IsStandard() || canonicalized_url.SchemeIsFile() ||
     77        canonicalized_url.SchemeIsFileSystem() ||
     78        !canonicalized_url.host().empty()))
     79     canonicalized_url_ = canonicalized_url;
     80 
     81   size_t chars_removed = RemoveForcedQueryStringIfNecessary(type_, &text_);
     82   AdjustCursorPositionIfNecessary(chars_removed, &cursor_position_);
     83   if (chars_removed) {
     84     // Remove spaces between opening question mark and first actual character.
     85     base::string16 trimmed_text;
     86     if ((TrimWhitespace(text_, TRIM_LEADING, &trimmed_text) & TRIM_LEADING) !=
     87         0) {
     88       AdjustCursorPositionIfNecessary(text_.length() - trimmed_text.length(),
     89                                       &cursor_position_);
     90       text_ = trimmed_text;
     91     }
     92   }
     93 }
     94 
     95 AutocompleteInput::~AutocompleteInput() {
     96 }
     97 
     98 // static
     99 size_t AutocompleteInput::RemoveForcedQueryStringIfNecessary(
    100     Type type,
    101     base::string16* text) {
    102   if (type != FORCED_QUERY || text->empty() || (*text)[0] != L'?')
    103     return 0;
    104   // Drop the leading '?'.
    105   text->erase(0, 1);
    106   return 1;
    107 }
    108 
    109 // static
    110 std::string AutocompleteInput::TypeToString(Type type) {
    111   switch (type) {
    112     case INVALID:       return "invalid";
    113     case UNKNOWN:       return "unknown";
    114     case URL:           return "url";
    115     case QUERY:         return "query";
    116     case FORCED_QUERY:  return "forced-query";
    117 
    118     default:
    119       NOTREACHED();
    120       return std::string();
    121   }
    122 }
    123 
    124 // static
    125 AutocompleteInput::Type AutocompleteInput::Parse(
    126     const base::string16& text,
    127     const base::string16& desired_tld,
    128     url_parse::Parsed* parts,
    129     base::string16* scheme,
    130     GURL* canonicalized_url) {
    131   size_t first_non_white = text.find_first_not_of(base::kWhitespaceUTF16, 0);
    132   if (first_non_white == base::string16::npos)
    133     return INVALID;  // All whitespace.
    134 
    135   if (text.at(first_non_white) == L'?') {
    136     // If the first non-whitespace character is a '?', we magically treat this
    137     // as a query.
    138     return FORCED_QUERY;
    139   }
    140 
    141   // Ask our parsing back-end to help us understand what the user typed.  We
    142   // use the URLFixerUpper here because we want to be smart about what we
    143   // consider a scheme.  For example, we shouldn't consider www.google.com:80
    144   // to have a scheme.
    145   url_parse::Parsed local_parts;
    146   if (!parts)
    147     parts = &local_parts;
    148   const base::string16 parsed_scheme(URLFixerUpper::SegmentURL(text, parts));
    149   if (scheme)
    150     *scheme = parsed_scheme;
    151   if (canonicalized_url) {
    152     *canonicalized_url = URLFixerUpper::FixupURL(UTF16ToUTF8(text),
    153                                                  UTF16ToUTF8(desired_tld));
    154   }
    155 
    156   if (LowerCaseEqualsASCII(parsed_scheme, chrome::kFileScheme)) {
    157     // A user might or might not type a scheme when entering a file URL.  In
    158     // either case, |parsed_scheme| will tell us that this is a file URL, but
    159     // |parts->scheme| might be empty, e.g. if the user typed "C:\foo".
    160     return URL;
    161   }
    162 
    163   if (LowerCaseEqualsASCII(parsed_scheme, chrome::kFileSystemScheme)) {
    164     // This could theoretically be a strange search, but let's check.
    165     // If it's got an inner_url with a scheme, it's a URL, whether it's valid or
    166     // not.
    167     if (parts->inner_parsed() && parts->inner_parsed()->scheme.is_valid())
    168       return URL;
    169   }
    170 
    171   // If the user typed a scheme, and it's HTTP or HTTPS, we know how to parse it
    172   // well enough that we can fall through to the heuristics below.  If it's
    173   // something else, we can just determine our action based on what we do with
    174   // any input of this scheme.  In theory we could do better with some schemes
    175   // (e.g. "ftp" or "view-source") but I'll wait to spend the effort on that
    176   // until I run into some cases that really need it.
    177   if (parts->scheme.is_nonempty() &&
    178       !LowerCaseEqualsASCII(parsed_scheme, content::kHttpScheme) &&
    179       !LowerCaseEqualsASCII(parsed_scheme, content::kHttpsScheme)) {
    180     // See if we know how to handle the URL internally.
    181     if (ProfileIOData::IsHandledProtocol(UTF16ToASCII(parsed_scheme)))
    182       return URL;
    183 
    184     // There are also some schemes that we convert to other things before they
    185     // reach the renderer or else the renderer handles internally without
    186     // reaching the net::URLRequest logic.  We thus won't catch these above, but
    187     // we should still claim to handle them.
    188     if (LowerCaseEqualsASCII(parsed_scheme, content::kViewSourceScheme) ||
    189         LowerCaseEqualsASCII(parsed_scheme, content::kJavaScriptScheme) ||
    190         LowerCaseEqualsASCII(parsed_scheme, chrome::kDataScheme))
    191       return URL;
    192 
    193     // Finally, check and see if the user has explicitly opened this scheme as
    194     // a URL before, or if the "scheme" is actually a username.  We need to do
    195     // this last because some schemes (e.g. "javascript") may be treated as
    196     // "blocked" by the external protocol handler because we don't want pages to
    197     // open them, but users still can.
    198     // TODO(viettrungluu): get rid of conversion.
    199     ExternalProtocolHandler::BlockState block_state =
    200         ExternalProtocolHandler::GetBlockState(UTF16ToUTF8(parsed_scheme));
    201     switch (block_state) {
    202       case ExternalProtocolHandler::DONT_BLOCK:
    203         return URL;
    204 
    205       case ExternalProtocolHandler::BLOCK:
    206         // If we don't want the user to open the URL, don't let it be navigated
    207         // to at all.
    208         return QUERY;
    209 
    210       default: {
    211         // We don't know about this scheme.  It might be that the user typed a
    212         // URL of the form "username:password (at) foo.com".
    213         const base::string16 http_scheme_prefix =
    214             ASCIIToUTF16(std::string(content::kHttpScheme) +
    215                          content::kStandardSchemeSeparator);
    216         url_parse::Parsed http_parts;
    217         base::string16 http_scheme;
    218         GURL http_canonicalized_url;
    219         Type http_type = Parse(http_scheme_prefix + text, desired_tld,
    220                                &http_parts, &http_scheme,
    221                                &http_canonicalized_url);
    222         DCHECK_EQ(std::string(content::kHttpScheme), UTF16ToUTF8(http_scheme));
    223 
    224         if (http_type == URL &&
    225             http_parts.username.is_nonempty() &&
    226             http_parts.password.is_nonempty()) {
    227           // Manually re-jigger the parsed parts to match |text| (without the
    228           // http scheme added).
    229           http_parts.scheme.reset();
    230           url_parse::Component* components[] = {
    231             &http_parts.username,
    232             &http_parts.password,
    233             &http_parts.host,
    234             &http_parts.port,
    235             &http_parts.path,
    236             &http_parts.query,
    237             &http_parts.ref,
    238           };
    239           for (size_t i = 0; i < arraysize(components); ++i) {
    240             URLFixerUpper::OffsetComponent(
    241                 -static_cast<int>(http_scheme_prefix.length()), components[i]);
    242           }
    243 
    244           *parts = http_parts;
    245           if (scheme)
    246             scheme->clear();
    247           if (canonicalized_url)
    248             *canonicalized_url = http_canonicalized_url;
    249 
    250           return http_type;
    251         }
    252 
    253         // We don't know about this scheme and it doesn't look like the user
    254         // typed a username and password.  It's likely to be a search operator
    255         // like "site:" or "link:".  We classify it as UNKNOWN so the user has
    256         // the option of treating it as a URL if we're wrong.
    257         // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or
    258         // "www.example.com:81" in this case.
    259         return UNKNOWN;
    260       }
    261     }
    262   }
    263 
    264   // Either the user didn't type a scheme, in which case we need to distinguish
    265   // between an HTTP URL and a query, or the scheme is HTTP or HTTPS, in which
    266   // case we should reject invalid formulations.
    267 
    268   // If we have an empty host it can't be a URL.
    269   if (!parts->host.is_nonempty())
    270     return QUERY;
    271 
    272   // Likewise, the RCDS can reject certain obviously-invalid hosts.  (We also
    273   // use the registry length later below.)
    274   const base::string16 host(text.substr(parts->host.begin, parts->host.len));
    275   const size_t registry_length =
    276       net::registry_controlled_domains::GetRegistryLength(
    277           UTF16ToUTF8(host),
    278           net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
    279           net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
    280   if (registry_length == std::string::npos) {
    281     // Try to append the desired_tld.
    282     if (!desired_tld.empty()) {
    283       base::string16 host_with_tld(host);
    284       if (host[host.length() - 1] != '.')
    285         host_with_tld += '.';
    286       host_with_tld += desired_tld;
    287       const size_t tld_length =
    288           net::registry_controlled_domains::GetRegistryLength(
    289               UTF16ToUTF8(host_with_tld),
    290               net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
    291               net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
    292       if (tld_length != std::string::npos)
    293         return URL;  // Something like "99999999999" that looks like a bad IP
    294                      // address, but becomes valid on attaching a TLD.
    295     }
    296     return QUERY;  // Could be a broken IP address, etc.
    297   }
    298 
    299 
    300   // See if the hostname is valid.  While IE and GURL allow hostnames to contain
    301   // many other characters (perhaps for weird intranet machines), it's extremely
    302   // unlikely that a user would be trying to type those in for anything other
    303   // than a search query.
    304   url_canon::CanonHostInfo host_info;
    305   const std::string canonicalized_host(net::CanonicalizeHost(UTF16ToUTF8(host),
    306                                                              &host_info));
    307   if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) &&
    308       !net::IsCanonicalizedHostCompliant(canonicalized_host,
    309                                          UTF16ToUTF8(desired_tld))) {
    310     // Invalid hostname.  There are several possible cases:
    311     // * Our checker is too strict and the user pasted in a real-world URL
    312     //   that's "invalid" but resolves.  To catch these, we return UNKNOWN when
    313     //   the user explicitly typed a scheme, so we'll still search by default
    314     //   but we'll show the accidental search infobar if necessary.
    315     // * The user is typing a multi-word query.  If we see a space anywhere in
    316     //   the hostname we assume this is a search and return QUERY.
    317     // * Our checker is too strict and the user is typing a real-world hostname
    318     //   that's "invalid" but resolves.  We return UNKNOWN if the TLD is known.
    319     //   Note that we explicitly excluded hosts with spaces above so that
    320     //   "toys at amazon.com" will be treated as a search.
    321     // * The user is typing some garbage string.  Return QUERY.
    322     //
    323     // Thus we fall down in the following cases:
    324     // * Trying to navigate to a hostname with spaces
    325     // * Trying to navigate to a hostname with invalid characters and an unknown
    326     //   TLD
    327     // These are rare, though probably possible in intranets.
    328     return (parts->scheme.is_nonempty() ||
    329            ((registry_length != 0) &&
    330             (host.find(' ') == base::string16::npos))) ? UNKNOWN : QUERY;
    331   }
    332 
    333   // A port number is a good indicator that this is a URL.  However, it might
    334   // also be a query like "1.66:1" that looks kind of like an IP address and
    335   // port number. So here we only check for "port numbers" that are illegal and
    336   // thus mean this can't be navigated to (e.g. "1.2.3.4:garbage"), and we save
    337   // handling legal port numbers until after the "IP address" determination
    338   // below.
    339   if (url_parse::ParsePort(text.c_str(), parts->port) ==
    340       url_parse::PORT_INVALID)
    341     return QUERY;
    342 
    343   // Now that we've ruled out all schemes other than http or https and done a
    344   // little more sanity checking, the presence of a scheme means this is likely
    345   // a URL.
    346   if (parts->scheme.is_nonempty())
    347     return URL;
    348 
    349   // See if the host is an IP address.
    350   if (host_info.family == url_canon::CanonHostInfo::IPV6)
    351     return URL;
    352   // If the user originally typed a host that looks like an IP address (a
    353   // dotted quad), they probably want to open it.  If the original input was
    354   // something else (like a single number), they probably wanted to search for
    355   // it, unless they explicitly typed a scheme.  This is true even if the URL
    356   // appears to have a path: "1.2/45" is more likely a search (for the answer
    357   // to a math problem) than a URL.  However, if there are more non-host
    358   // components, then maybe this really was intended to be a navigation.  For
    359   // this reason we only check the dotted-quad case here, and save the "other
    360   // IP addresses" case for after we check the number of non-host components
    361   // below.
    362   if ((host_info.family == url_canon::CanonHostInfo::IPV4) &&
    363       (host_info.num_ipv4_components == 4))
    364     return URL;
    365 
    366   // Presence of a password means this is likely a URL.  Note that unless the
    367   // user has typed an explicit "http://" or similar, we'll probably think that
    368   // the username is some unknown scheme, and bail out in the scheme-handling
    369   // code above.
    370   if (parts->password.is_nonempty())
    371     return URL;
    372 
    373   // Trailing slashes force the input to be treated as a URL.
    374   if (parts->path.is_nonempty()) {
    375     char c = text[parts->path.end() - 1];
    376     if ((c == '\\') || (c == '/'))
    377       return URL;
    378   }
    379 
    380   // If there is more than one recognized non-host component, this is likely to
    381   // be a URL, even if the TLD is unknown (in which case this is likely an
    382   // intranet URL).
    383   if (NumNonHostComponents(*parts) > 1)
    384     return URL;
    385 
    386   // If the host has a known TLD or a port, it's probably a URL, with the
    387   // following exceptions:
    388   // * Any "IP addresses" that make it here are more likely searches
    389   //   (see above).
    390   // * If we reach here with a username, our input looks like "user@host[.tld]".
    391   //   Because there is no scheme explicitly specified, we think this is more
    392   //   likely an email address than an HTTP auth attempt.  Hence, we search by
    393   //   default and let users correct us on a case-by-case basis.
    394   // Note that we special-case "localhost" as a known hostname.
    395   if ((host_info.family != url_canon::CanonHostInfo::IPV4) &&
    396       ((registry_length != 0) || (host == ASCIIToUTF16("localhost") ||
    397        parts->port.is_nonempty())))
    398     return parts->username.is_nonempty() ? UNKNOWN : URL;
    399 
    400   // If we reach this point, we know there's no known TLD on the input, so if
    401   // the user wishes to add a desired_tld, the fixup code will oblige; thus this
    402   // is a URL.
    403   if (!desired_tld.empty())
    404     return URL;
    405 
    406   // No scheme, password, port, path, and no known TLD on the host.
    407   // This could be:
    408   // * An "incomplete IP address"; likely a search (see above).
    409   // * An email-like input like "user@host", where "host" has no known TLD.
    410   //   It's not clear what the user means here and searching seems reasonable.
    411   // * A single word "foo"; possibly an intranet site, but more likely a search.
    412   //   This is ideally an UNKNOWN, and we can let the Alternate Nav URL code
    413   //   catch our mistakes.
    414   // * A URL with a valid TLD we don't know about yet.  If e.g. a registrar adds
    415   //   "xxx" as a TLD, then until we add it to our data file, Chrome won't know
    416   //   "foo.xxx" is a real URL.  So ideally this is a URL, but we can't really
    417   //   distinguish this case from:
    418   // * A "URL-like" string that's not really a URL (like
    419   //   "browser.tabs.closeButtons" or "java.awt.event.*").  This is ideally a
    420   //   QUERY.  Since this is indistinguishable from the case above, and this
    421   //   case is much more likely, claim these are UNKNOWN, which should default
    422   //   to the right thing and let users correct us on a case-by-case basis.
    423   return UNKNOWN;
    424 }
    425 
    426 // static
    427 void AutocompleteInput::ParseForEmphasizeComponents(
    428     const base::string16& text,
    429     url_parse::Component* scheme,
    430     url_parse::Component* host) {
    431   url_parse::Parsed parts;
    432   base::string16 scheme_str;
    433   Parse(text, base::string16(), &parts, &scheme_str, NULL);
    434 
    435   *scheme = parts.scheme;
    436   *host = parts.host;
    437 
    438   int after_scheme_and_colon = parts.scheme.end() + 1;
    439   // For the view-source scheme, we should emphasize the scheme and host of the
    440   // URL qualified by the view-source prefix.
    441   if (LowerCaseEqualsASCII(scheme_str, content::kViewSourceScheme) &&
    442       (static_cast<int>(text.length()) > after_scheme_and_colon)) {
    443     // Obtain the URL prefixed by view-source and parse it.
    444     base::string16 real_url(text.substr(after_scheme_and_colon));
    445     url_parse::Parsed real_parts;
    446     AutocompleteInput::Parse(real_url, base::string16(), &real_parts, NULL, NULL);
    447     if (real_parts.scheme.is_nonempty() || real_parts.host.is_nonempty()) {
    448       if (real_parts.scheme.is_nonempty()) {
    449         *scheme = url_parse::Component(
    450             after_scheme_and_colon + real_parts.scheme.begin,
    451             real_parts.scheme.len);
    452       } else {
    453         scheme->reset();
    454       }
    455       if (real_parts.host.is_nonempty()) {
    456         *host = url_parse::Component(
    457             after_scheme_and_colon + real_parts.host.begin,
    458             real_parts.host.len);
    459       } else {
    460         host->reset();
    461       }
    462     }
    463   } else if (LowerCaseEqualsASCII(scheme_str, chrome::kFileSystemScheme) &&
    464              parts.inner_parsed() && parts.inner_parsed()->scheme.is_valid()) {
    465     *host = parts.inner_parsed()->host;
    466   }
    467 }
    468 
    469 // static
    470 base::string16 AutocompleteInput::FormattedStringWithEquivalentMeaning(
    471     const GURL& url,
    472     const base::string16& formatted_url) {
    473   if (!net::CanStripTrailingSlash(url))
    474     return formatted_url;
    475   const base::string16 url_with_path(formatted_url + char16('/'));
    476   return (AutocompleteInput::Parse(formatted_url, base::string16(), NULL, NULL,
    477                                    NULL) ==
    478           AutocompleteInput::Parse(url_with_path, base::string16(), NULL, NULL,
    479                                    NULL)) ?
    480       formatted_url : url_with_path;
    481 }
    482 
    483 // static
    484 int AutocompleteInput::NumNonHostComponents(const url_parse::Parsed& parts) {
    485   int num_nonhost_components = 0;
    486   if (parts.scheme.is_nonempty())
    487     ++num_nonhost_components;
    488   if (parts.username.is_nonempty())
    489     ++num_nonhost_components;
    490   if (parts.password.is_nonempty())
    491     ++num_nonhost_components;
    492   if (parts.port.is_nonempty())
    493     ++num_nonhost_components;
    494   if (parts.path.is_nonempty())
    495     ++num_nonhost_components;
    496   if (parts.query.is_nonempty())
    497     ++num_nonhost_components;
    498   if (parts.ref.is_nonempty())
    499     ++num_nonhost_components;
    500   return num_nonhost_components;
    501 }
    502 
    503 // static
    504 bool AutocompleteInput::HasHTTPScheme(const base::string16& input) {
    505   std::string utf8_input(UTF16ToUTF8(input));
    506   url_parse::Component scheme;
    507   if (url_util::FindAndCompareScheme(utf8_input, content::kViewSourceScheme,
    508                                      &scheme))
    509     utf8_input.erase(0, scheme.end() + 1);
    510   return url_util::FindAndCompareScheme(utf8_input, content::kHttpScheme, NULL);
    511 }
    512 
    513 void AutocompleteInput::UpdateText(const base::string16& text,
    514                                    size_t cursor_position,
    515                                    const url_parse::Parsed& parts) {
    516   DCHECK(cursor_position <= text.length() ||
    517          cursor_position == base::string16::npos)
    518       << "Text: '" << text << "', cp: " << cursor_position;
    519   text_ = text;
    520   cursor_position_ = cursor_position;
    521   parts_ = parts;
    522 }
    523 
    524 void AutocompleteInput::Clear() {
    525   text_.clear();
    526   cursor_position_ = base::string16::npos;
    527   current_url_ = GURL();
    528   current_page_classification_ = AutocompleteInput::INVALID_SPEC;
    529   type_ = INVALID;
    530   parts_ = url_parse::Parsed();
    531   scheme_.clear();
    532   canonicalized_url_ = GURL();
    533   prevent_inline_autocomplete_ = false;
    534   prefer_keyword_ = false;
    535   allow_exact_keyword_match_ = false;
    536   matches_requested_ = ALL_MATCHES;
    537 }
    538