1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_HISTORY_SCORED_HISTORY_MATCH_H_ 6 #define CHROME_BROWSER_HISTORY_SCORED_HISTORY_MATCH_H_ 7 8 #include <map> 9 #include <set> 10 #include <vector> 11 12 #include "base/strings/string16.h" 13 #include "chrome/browser/history/in_memory_url_index_types.h" 14 #include "components/history/core/browser/history_match.h" 15 #include "components/history/core/browser/history_types.h" 16 #include "testing/gtest/include/gtest/gtest_prod.h" 17 18 namespace history { 19 20 class HistoryClient; 21 class ScoredHistoryMatchTest; 22 23 // An HistoryMatch that has a score as well as metrics defining where in the 24 // history item's URL and/or page title matches have occurred. 25 class ScoredHistoryMatch : public history::HistoryMatch { 26 public: 27 // The maximum number of recent visits to examine in GetFrequency(). 28 // Public so url_index_private_data.cc knows how many visits it is 29 // expected to deliver (at minimum) to this class. 30 static const size_t kMaxVisitsToScore; 31 32 ScoredHistoryMatch(); // Required by STL. 33 34 // Creates a new match with a raw score calculated for the history item 35 // given in |row| with recent visits as indicated in |visits|. First 36 // determines if the row qualifies by seeing if all of the terms in 37 // |terms_vector| occur in |row|. If so, calculates a raw score. This raw 38 // score is in part determined by whether the matches occur at word 39 // boundaries, the locations of which are stored in |word_starts|. For some 40 // terms, it's appropriate to look for the word boundary within the term. 41 // For instance, the term ".net" should look for a word boundary at the "n". 42 // These offsets (".net" should have an offset of 1) come from 43 // |terms_to_word_starts_offsets|. |history_client| is used to determine 44 // if the match's URL is referenced by any bookmarks, which can also affect 45 // the raw score. The raw score allows the matches to be ordered and can be 46 // used to influence the final score calculated by the client of this index. 47 // If the row does not qualify the raw score will be 0. |languages| is used 48 // to help parse/format the URL before looking for the terms. 49 ScoredHistoryMatch(const URLRow& row, 50 const VisitInfoVector& visits, 51 const std::string& languages, 52 const base::string16& lower_string, 53 const String16Vector& terms_vector, 54 const WordStarts& terms_to_word_starts_offsets, 55 const RowWordStarts& word_starts, 56 const base::Time now, 57 HistoryClient* history_client); 58 ~ScoredHistoryMatch(); 59 60 // Compares two matches by score. Functor supporting URLIndexPrivateData's 61 // HistoryItemsForTerms function. Looks at particular fields within 62 // with url_info to make tie-breaking a bit smarter. 63 static bool MatchScoreGreater(const ScoredHistoryMatch& m1, 64 const ScoredHistoryMatch& m2); 65 66 // Accessors: 67 int raw_score() const { return raw_score_; } 68 const TermMatches& url_matches() const { return url_matches_; } 69 const TermMatches& title_matches() const { return title_matches_; } 70 bool can_inline() const { return can_inline_; } 71 72 // Returns |term_matches| after removing all matches that are not at a 73 // word break that are in the range [|start_pos|, |end_pos|). 74 // start_pos == string::npos is treated as start_pos = length of string. 75 // (In other words, no matches will be filtered.) 76 // end_pos == string::npos is treated as end_pos = length of string. 77 static TermMatches FilterTermMatchesByWordStarts( 78 const TermMatches& term_matches, 79 const WordStarts& terms_to_word_starts_offsets, 80 const WordStarts& word_starts, 81 size_t start_pos, 82 size_t end_pos); 83 84 private: 85 friend class ScoredHistoryMatchTest; 86 FRIEND_TEST_ALL_PREFIXES(ScoredHistoryMatchTest, ScoringBookmarks); 87 FRIEND_TEST_ALL_PREFIXES(ScoredHistoryMatchTest, ScoringDiscountFrecency); 88 FRIEND_TEST_ALL_PREFIXES(ScoredHistoryMatchTest, ScoringScheme); 89 FRIEND_TEST_ALL_PREFIXES(ScoredHistoryMatchTest, ScoringTLD); 90 91 // The number of days of recency scores to precompute. 92 static const int kDaysToPrecomputeRecencyScoresFor; 93 94 // The number of raw term score buckets use; raw term scores 95 // greater this are capped at the score of the largest bucket. 96 static const int kMaxRawTermScore; 97 98 // Return a topicality score based on how many matches appear in the 99 // url and the page's title and where they are (e.g., at word 100 // boundaries). Revises |url_matches_| and |title_matches_| in the 101 // process so they only reflect matches used for scoring. (For 102 // instance, some mid-word matches are not given credit in scoring.) 103 float GetTopicalityScore(const int num_terms, 104 const base::string16& cleaned_up_url, 105 const WordStarts& terms_to_word_starts_offsets, 106 const RowWordStarts& word_starts); 107 108 // Precalculates raw_term_score_to_topicality_score_, used in 109 // GetTopicalityScore(). 110 static void FillInTermScoreToTopicalityScoreArray(); 111 112 // Returns a recency score based on |last_visit_days_ago|, which is 113 // how many days ago the page was last visited. 114 static float GetRecencyScore(int last_visit_days_ago); 115 116 // Pre-calculates days_ago_to_recency_numerator_, used in 117 // GetRecencyScore(). 118 static void FillInDaysAgoToRecencyScoreArray(); 119 120 // Examines the first kMaxVisitsToScore and return a score (higher is 121 // better) based the rate of visits, whether the page is bookmarked, and 122 // how often those visits are typed navigations (i.e., explicitly 123 // invoked by the user). |now| is passed in to avoid unnecessarily 124 // recomputing it frequently. 125 static float GetFrequency(const base::Time& now, 126 const bool bookmarked, 127 const VisitInfoVector& visits); 128 129 // Combines the two component scores into a final score that's 130 // an appropriate value to use as a relevancy score. 131 static float GetFinalRelevancyScore( 132 float topicality_score, 133 float frequency_score); 134 135 // Sets |also_do_hup_like_scoring_|, 136 // |max_assigned_score_for_non_inlineable_matches_|, |bookmark_value_|, 137 // |allow_tld_matches_|, and |allow_scheme_matches_| based on the field 138 // trial state. 139 static void Init(); 140 141 // An interim score taking into consideration location and completeness 142 // of the match. 143 int raw_score_; 144 145 // Both these TermMatches contain the set of matches that are considered 146 // important. At this time, that means they exclude mid-word matches 147 // except in the hostname of the URL. (Technically, during early 148 // construction of ScoredHistoryMatch, they may contain all matches, but 149 // unimportant matches are eliminated by GetTopicalityScore(), called 150 // during construction.) 151 // Term matches within the URL. 152 TermMatches url_matches_; 153 // Term matches within the page title. 154 TermMatches title_matches_; 155 156 // True if this is a candidate for in-line autocompletion. 157 bool can_inline_; 158 159 // Pre-computed information to speed up calculating recency scores. 160 // |days_ago_to_recency_score_| is a simple array mapping how long 161 // ago a page was visited (in days) to the recency score we should 162 // assign it. This allows easy lookups of scores without requiring 163 // math. This is initialized upon first use of GetRecencyScore(), 164 // which calls FillInDaysAgoToRecencyScoreArray(), 165 static float* days_ago_to_recency_score_; 166 167 // Pre-computed information to speed up calculating topicality 168 // scores. |raw_term_score_to_topicality_score_| is a simple array 169 // mapping how raw terms scores (a weighted sum of the number of 170 // hits for the term, weighted by how important the hit is: 171 // hostname, path, etc.) to the topicality score we should assign 172 // it. This allows easy lookups of scores without requiring math. 173 // This is initialized upon first use of GetTopicalityScore(), 174 // which calls FillInTermScoreToTopicalityScoreArray(). 175 static float* raw_term_score_to_topicality_score_; 176 177 // Used so we initialize static variables only once (on first use). 178 static bool initialized_; 179 180 // Untyped visits to bookmarked pages score this, compared to 1 for 181 // untyped visits to non-bookmarked pages and 20 for typed visits. 182 static int bookmark_value_; 183 184 // If true, we allow input terms to match in the TLD (e.g., .com). 185 static bool allow_tld_matches_; 186 187 // If true, we allow input terms to match in the scheme (e.g., http://). 188 static bool allow_scheme_matches_; 189 190 // If true, assign raw scores to be max(whatever it normally would be, 191 // a score that's similar to the score HistoryURL provider would assign). 192 // This variable is set in the constructor by examining the field trial 193 // state. 194 static bool also_do_hup_like_scoring_; 195 196 // The maximum score that can be assigned to non-inlineable matches. 197 // This is useful because often we want inlineable matches to come 198 // first (even if they don't sometimes score as well as non-inlineable 199 // matches) because if a non-inlineable match comes first than all matches 200 // will get demoted later in HistoryQuickProvider to non-inlineable scores. 201 // Set to -1 to indicate no maximum score. 202 static int max_assigned_score_for_non_inlineable_matches_; 203 }; 204 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; 205 206 } // namespace history 207 208 #endif // CHROME_BROWSER_HISTORY_SCORED_HISTORY_MATCH_H_ 209