Home | History | Annotate | Download | only in autocomplete
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/autocomplete/bookmark_provider.h"
      6 
      7 #include <algorithm>
      8 #include <functional>
      9 #include <vector>
     10 
     11 #include "base/prefs/pref_service.h"
     12 #include "base/strings/string_util.h"
     13 #include "base/strings/utf_string_conversions.h"
     14 #include "chrome/browser/autocomplete/chrome_autocomplete_scheme_classifier.h"
     15 #include "chrome/browser/autocomplete/history_provider.h"
     16 #include "chrome/browser/bookmarks/bookmark_model_factory.h"
     17 #include "chrome/browser/profiles/profile.h"
     18 #include "chrome/common/pref_names.h"
     19 #include "components/bookmarks/browser/bookmark_match.h"
     20 #include "components/bookmarks/browser/bookmark_model.h"
     21 #include "components/metrics/proto/omnibox_input_type.pb.h"
     22 #include "components/omnibox/autocomplete_result.h"
     23 #include "components/omnibox/url_prefix.h"
     24 #include "net/base/net_util.h"
     25 
     26 using bookmarks::BookmarkMatch;
     27 
     28 typedef std::vector<BookmarkMatch> BookmarkMatches;
     29 
     30 namespace {
     31 
     32 // Removes leading spaces from |title| before displaying, otherwise it looks
     33 // funny.  In the process, corrects |title_match_positions| so the correct
     34 // characters are highlighted.
     35 void CorrectTitleAndMatchPositions(
     36     base::string16* title,
     37     BookmarkMatch::MatchPositions* title_match_positions) {
     38   size_t leading_whitespace_chars = title->length();
     39   base::TrimWhitespace(*title, base::TRIM_LEADING, title);
     40   leading_whitespace_chars-= title->length();
     41   if (leading_whitespace_chars == 0)
     42     return;
     43   for (query_parser::Snippet::MatchPositions::iterator it =
     44            title_match_positions->begin();
     45        it != title_match_positions->end(); ++it) {
     46     (*it) = query_parser::Snippet::MatchPosition(
     47         it->first - leading_whitespace_chars,
     48         it->second - leading_whitespace_chars);
     49   }
     50 }
     51 
     52 }  // namespace
     53 
     54 // BookmarkProvider ------------------------------------------------------------
     55 
     56 BookmarkProvider::BookmarkProvider(Profile* profile)
     57     : AutocompleteProvider(AutocompleteProvider::TYPE_BOOKMARK),
     58       profile_(profile),
     59       bookmark_model_(NULL) {
     60   if (profile) {
     61     bookmark_model_ = BookmarkModelFactory::GetForProfile(profile);
     62     languages_ = profile_->GetPrefs()->GetString(prefs::kAcceptLanguages);
     63   }
     64 }
     65 
     66 void BookmarkProvider::Start(const AutocompleteInput& input,
     67                              bool minimal_changes) {
     68   if (minimal_changes)
     69     return;
     70   matches_.clear();
     71 
     72   if (input.text().empty() ||
     73       (input.type() == metrics::OmniboxInputType::FORCED_QUERY))
     74     return;
     75 
     76   DoAutocomplete(input);
     77 }
     78 
     79 BookmarkProvider::~BookmarkProvider() {}
     80 
     81 void BookmarkProvider::DoAutocomplete(const AutocompleteInput& input) {
     82   // We may not have a bookmark model for some unit tests.
     83   if (!bookmark_model_)
     84     return;
     85 
     86   BookmarkMatches matches;
     87   // Retrieve enough bookmarks so that we have a reasonable probability of
     88   // suggesting the one that the user desires.
     89   const size_t kMaxBookmarkMatches = 50;
     90 
     91   // GetBookmarksMatching returns bookmarks matching the user's
     92   // search terms using the following rules:
     93   //  - The search text is broken up into search terms. Each term is searched
     94   //    for separately.
     95   //  - Term matches are always performed against the start of a word. 'def'
     96   //    will match against 'define' but not against 'indefinite'.
     97   //  - Terms must be at least three characters in length in order to perform
     98   //    partial word matches. Any term of lesser length will only be used as an
     99   //    exact match. 'def' will match against 'define' but 'de' will not match.
    100   //  - A search containing multiple terms will return results with those words
    101   //    occuring in any order.
    102   //  - Terms enclosed in quotes comprises a phrase that must match exactly.
    103   //  - Multiple terms enclosed in quotes will require those exact words in that
    104   //    exact order to match.
    105   //
    106   // Please refer to the code for BookmarkIndex::GetBookmarksMatching for
    107   // complete details of how searches are performed against the user's
    108   // bookmarks.
    109   bookmark_model_->GetBookmarksMatching(input.text(),
    110                                         kMaxBookmarkMatches,
    111                                         &matches);
    112   if (matches.empty())
    113     return;  // There were no matches.
    114   const base::string16 fixed_up_input(FixupUserInput(input).second);
    115   for (BookmarkMatches::const_iterator i = matches.begin(); i != matches.end();
    116        ++i) {
    117     // Create and score the AutocompleteMatch. If its score is 0 then the
    118     // match is discarded.
    119     AutocompleteMatch match(BookmarkMatchToACMatch(input, fixed_up_input, *i));
    120     if (match.relevance > 0)
    121       matches_.push_back(match);
    122   }
    123 
    124   // Sort and clip the resulting matches.
    125   size_t num_matches =
    126       std::min(matches_.size(), AutocompleteProvider::kMaxMatches);
    127   std::partial_sort(matches_.begin(), matches_.begin() + num_matches,
    128                     matches_.end(), AutocompleteMatch::MoreRelevant);
    129   matches_.resize(num_matches);
    130 }
    131 
    132 namespace {
    133 
    134 // for_each helper functor that calculates a match factor for each query term
    135 // when calculating the final score.
    136 //
    137 // Calculate a 'factor' from 0 to the bookmark's title length for a match
    138 // based on 1) how many characters match and 2) where the match is positioned.
    139 class ScoringFunctor {
    140  public:
    141   // |title_length| is the length of the bookmark title against which this
    142   // match will be scored.
    143   explicit ScoringFunctor(size_t title_length)
    144       : title_length_(static_cast<double>(title_length)),
    145         scoring_factor_(0.0) {
    146   }
    147 
    148   void operator()(const query_parser::Snippet::MatchPosition& match) {
    149     double term_length = static_cast<double>(match.second - match.first);
    150     scoring_factor_ += term_length *
    151         (title_length_ - match.first) / title_length_;
    152   }
    153 
    154   double ScoringFactor() { return scoring_factor_; }
    155 
    156  private:
    157   double title_length_;
    158   double scoring_factor_;
    159 };
    160 
    161 }  // namespace
    162 
    163 AutocompleteMatch BookmarkProvider::BookmarkMatchToACMatch(
    164     const AutocompleteInput& input,
    165     const base::string16& fixed_up_input_text,
    166     const BookmarkMatch& bookmark_match) {
    167   // The AutocompleteMatch we construct is non-deletable because the only
    168   // way to support this would be to delete the underlying bookmark, which is
    169   // unlikely to be what the user intends.
    170   AutocompleteMatch match(this, 0, false,
    171                           AutocompleteMatchType::BOOKMARK_TITLE);
    172   base::string16 title(bookmark_match.node->GetTitle());
    173   BookmarkMatch::MatchPositions new_title_match_positions =
    174       bookmark_match.title_match_positions;
    175   CorrectTitleAndMatchPositions(&title, &new_title_match_positions);
    176   const GURL& url(bookmark_match.node->url());
    177   const base::string16& url_utf16 = base::UTF8ToUTF16(url.spec());
    178   size_t inline_autocomplete_offset = URLPrefix::GetInlineAutocompleteOffset(
    179       input.text(), fixed_up_input_text, false, url_utf16);
    180   match.destination_url = url;
    181   const size_t match_start = bookmark_match.url_match_positions.empty() ?
    182       0 : bookmark_match.url_match_positions[0].first;
    183   const bool trim_http = !AutocompleteInput::HasHTTPScheme(input.text()) &&
    184       ((match_start == base::string16::npos) || (match_start != 0));
    185   std::vector<size_t> offsets = BookmarkMatch::OffsetsFromMatchPositions(
    186       bookmark_match.url_match_positions);
    187   // In addition to knowing how |offsets| is transformed, we need to know how
    188   // |inline_autocomplete_offset| is transformed.  We add it to the end of
    189   // |offsets|, compute how everything is transformed, then remove it from the
    190   // end.
    191   offsets.push_back(inline_autocomplete_offset);
    192   match.contents = net::FormatUrlWithOffsets(url, languages_,
    193       net::kFormatUrlOmitAll & ~(trim_http ? 0 : net::kFormatUrlOmitHTTP),
    194       net::UnescapeRule::SPACES, NULL, NULL, &offsets);
    195   inline_autocomplete_offset = offsets.back();
    196   offsets.pop_back();
    197   BookmarkMatch::MatchPositions new_url_match_positions =
    198       BookmarkMatch::ReplaceOffsetsInMatchPositions(
    199           bookmark_match.url_match_positions, offsets);
    200   match.contents_class =
    201       ClassificationsFromMatch(new_url_match_positions,
    202                                match.contents.size(),
    203                                true);
    204   match.fill_into_edit =
    205       AutocompleteInput::FormattedStringWithEquivalentMeaning(
    206           url, match.contents, ChromeAutocompleteSchemeClassifier(profile_));
    207   if (inline_autocomplete_offset != base::string16::npos) {
    208     // |inline_autocomplete_offset| may be beyond the end of the
    209     // |fill_into_edit| if the user has typed an URL with a scheme and the
    210     // last character typed is a slash.  That slash is removed by the
    211     // FormatURLWithOffsets call above.
    212     if (inline_autocomplete_offset < match.fill_into_edit.length()) {
    213       match.inline_autocompletion =
    214           match.fill_into_edit.substr(inline_autocomplete_offset);
    215     }
    216     match.allowed_to_be_default_match = match.inline_autocompletion.empty() ||
    217         !HistoryProvider::PreventInlineAutocomplete(input);
    218   }
    219   match.description = title;
    220   match.description_class =
    221       ClassificationsFromMatch(bookmark_match.title_match_positions,
    222                                match.description.size(),
    223                                false);
    224 
    225   // Summary on how a relevance score is determined for the match:
    226   //
    227   // For each match within the bookmark's title or URL (or both), calculate a
    228   // 'factor', sum up those factors, then use the sum to figure out a value
    229   // between the base score and the maximum score.
    230   //
    231   // The factor for each match is the product of:
    232   //
    233   //  1) how many characters in the bookmark's title/URL are part of this match.
    234   //     This is capped at the length of the bookmark's title
    235   //     to prevent terms that match in both the title and the URL from
    236   //     scoring too strongly.
    237   //
    238   //  2) where the match occurs within the bookmark's title or URL,
    239   //     giving more points for matches that appear earlier in the string:
    240   //       ((string_length - position of match start) / string_length).
    241   //
    242   //  Example: Given a bookmark title of 'abcde fghijklm', with a title length
    243   //     of 14, and two different search terms, 'abcde' and 'fghij', with
    244   //     start positions of 0 and 6, respectively, 'abcde' will score higher
    245   //     (with a a partial factor of (14-0)/14 = 1.000 ) than 'fghij' (with
    246   //     a partial factor of (14-6)/14 = 0.571 ).  (In this example neither
    247   //     term matches in the URL.)
    248   //
    249   // Once all match factors have been calculated they are summed.  If there
    250   // are no URL matches, the resulting sum will never be greater than the
    251   // length of the bookmark title because of the way the bookmark model matches
    252   // and removes overlaps.  (In particular, the bookmark model only
    253   // matches terms to the beginning of words and it removes all overlapping
    254   // matches, keeping only the longest.  Together these mean that each
    255   // character is included in at most one match.)  If there are matches in the
    256   // URL, the sum can be greater.
    257   //
    258   // This sum is then normalized by the length of the bookmark title + 10
    259   // and capped at 1.0.  The +10 is to expand the scoring range so fewer
    260   // bookmarks will hit the 1.0 cap and hence lose all ability to distinguish
    261   // between these high-quality bookmarks.
    262   //
    263   // The normalized value is multiplied against the scoring range available,
    264   // which is 299.  The 299 is calculated by subtracting the minimum possible
    265   // score, 900, from the maximum possible score, 1199.  This product, ranging
    266   // from 0 to 299, is added to the minimum possible score, 900, giving the
    267   // preliminary score.
    268   //
    269   // If the preliminary score is less than the maximum possible score, 1199,
    270   // it can be boosted up to that maximum possible score if the URL referenced
    271   // by the bookmark is also referenced by any of the user's other bookmarks.
    272   // A count of how many times the bookmark's URL is referenced is determined
    273   // and, for each additional reference beyond the one for the bookmark being
    274   // scored up to a maximum of three, the score is boosted by a fixed amount
    275   // given by |kURLCountBoost|, below.
    276   //
    277 
    278   // Pretend empty titles are identical to the URL.
    279   if (title.empty())
    280     title = base::ASCIIToUTF16(url.spec());
    281   ScoringFunctor title_position_functor =
    282       for_each(bookmark_match.title_match_positions.begin(),
    283                bookmark_match.title_match_positions.end(),
    284                ScoringFunctor(title.size()));
    285   ScoringFunctor url_position_functor =
    286       for_each(bookmark_match.url_match_positions.begin(),
    287                bookmark_match.url_match_positions.end(),
    288                ScoringFunctor(bookmark_match.node->url().spec().length()));
    289   const double summed_factors = title_position_functor.ScoringFactor() +
    290       url_position_functor.ScoringFactor();
    291   const double normalized_sum =
    292       std::min(summed_factors / (title.size() + 10), 1.0);
    293   const int kBaseBookmarkScore = 900;
    294   const int kMaxBookmarkScore = 1199;
    295   const double kBookmarkScoreRange =
    296       static_cast<double>(kMaxBookmarkScore - kBaseBookmarkScore);
    297   match.relevance = static_cast<int>(normalized_sum * kBookmarkScoreRange) +
    298       kBaseBookmarkScore;
    299   // Don't waste any time searching for additional referenced URLs if we
    300   // already have a perfect title match.
    301   if (match.relevance >= kMaxBookmarkScore)
    302     return match;
    303   // Boost the score if the bookmark's URL is referenced by other bookmarks.
    304   const int kURLCountBoost[4] = { 0, 75, 125, 150 };
    305   std::vector<const BookmarkNode*> nodes;
    306   bookmark_model_->GetNodesByURL(url, &nodes);
    307   DCHECK_GE(std::min(arraysize(kURLCountBoost), nodes.size()), 1U);
    308   match.relevance +=
    309       kURLCountBoost[std::min(arraysize(kURLCountBoost), nodes.size()) - 1];
    310   match.relevance = std::min(kMaxBookmarkScore, match.relevance);
    311   return match;
    312 }
    313 
    314 // static
    315 ACMatchClassifications BookmarkProvider::ClassificationsFromMatch(
    316     const query_parser::Snippet::MatchPositions& positions,
    317     size_t text_length,
    318     bool is_url) {
    319   ACMatchClassification::Style url_style =
    320       is_url ? ACMatchClassification::URL : ACMatchClassification::NONE;
    321   ACMatchClassifications classifications;
    322   if (positions.empty()) {
    323     if (text_length > 0)
    324       classifications.push_back(ACMatchClassification(0, url_style));
    325     return classifications;
    326   }
    327 
    328   for (query_parser::Snippet::MatchPositions::const_iterator i =
    329            positions.begin();
    330        i != positions.end();
    331        ++i) {
    332     AutocompleteMatch::ACMatchClassifications new_class;
    333     AutocompleteMatch::ClassifyLocationInString(i->first, i->second - i->first,
    334         text_length, url_style, &new_class);
    335     classifications = AutocompleteMatch::MergeClassifications(
    336         classifications, new_class);
    337   }
    338   return classifications;
    339 }
    340