1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <algorithm> 6 7 #include "base/strings/string16.h" 8 #include "base/strings/utf_string_conversions.h" 9 #include "chrome/browser/bookmarks/bookmark_service.h" 10 #include "chrome/browser/history/scored_history_match.h" 11 #include "testing/gtest/include/gtest/gtest.h" 12 13 namespace history { 14 15 // Returns a VisitInfoVector that includes |num_visits| spread over the 16 // last |frecency|*|num_visits| days (relative to |now|). A frequency of 17 // one means one visit each day, two means every other day, etc. 18 VisitInfoVector CreateVisitInfoVector(int num_visits, 19 int frequency, 20 base::Time now) { 21 VisitInfoVector visits; 22 for (int i = 0; i < num_visits; ++i) { 23 visits.push_back( 24 std::make_pair(now - base::TimeDelta::FromDays(i * frequency), 25 content::PAGE_TRANSITION_LINK)); 26 } 27 return visits; 28 } 29 30 class ScoredHistoryMatchTest : public testing::Test { 31 protected: 32 // Convenience function to create a URLRow with basic data for |url|, |title|, 33 // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number 34 // of days ago to which to set the URL's last_visit. 35 URLRow MakeURLRow(const char* url, 36 const char* title, 37 int visit_count, 38 int days_since_last_visit, 39 int typed_count); 40 41 // Convenience function to set the word starts information from a URLRow's 42 // URL and title. 43 void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts); 44 45 // Convenience functions for easily creating vectors of search terms. 46 String16Vector Make1Term(const char* term) const; 47 String16Vector Make2Terms(const char* term_1, const char* term_2) const; 48 49 // Convenience function for GetTopicalityScore() that builds the 50 // term match and word break information automatically that are needed 51 // to call GetTopicalityScore(). It only works for scoring a single term, 52 // not multiple terms. 53 float GetTopicalityScoreOfTermAgainstURLAndTitle(const string16& term, 54 const string16& url, 55 const string16& title); 56 }; 57 58 URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url, 59 const char* title, 60 int visit_count, 61 int days_since_last_visit, 62 int typed_count) { 63 URLRow row(GURL(url), 0); 64 row.set_title(ASCIIToUTF16(title)); 65 row.set_visit_count(visit_count); 66 row.set_typed_count(typed_count); 67 row.set_last_visit(base::Time::NowFromSystemTime() - 68 base::TimeDelta::FromDays(days_since_last_visit)); 69 return row; 70 } 71 72 void ScoredHistoryMatchTest::PopulateWordStarts( 73 const URLRow& url_row, RowWordStarts* word_starts) { 74 String16SetFromString16(ASCIIToUTF16(url_row.url().spec()), 75 &word_starts->url_word_starts_); 76 String16SetFromString16(url_row.title(), &word_starts->title_word_starts_); 77 } 78 79 80 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const { 81 String16Vector original_terms; 82 original_terms.push_back(ASCIIToUTF16(term)); 83 return original_terms; 84 } 85 86 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1, 87 const char* term_2) const { 88 String16Vector original_terms; 89 original_terms.push_back(ASCIIToUTF16(term_1)); 90 original_terms.push_back(ASCIIToUTF16(term_2)); 91 return original_terms; 92 } 93 94 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( 95 const string16& term, 96 const string16& url, 97 const string16& title) { 98 TermMatches url_matches = MatchTermInString(term, url, 0); 99 TermMatches title_matches = MatchTermInString(term, title, 0); 100 RowWordStarts word_starts; 101 String16SetFromString16(url, &word_starts.url_word_starts_); 102 String16SetFromString16(title, &word_starts.title_word_starts_); 103 return ScoredHistoryMatch::GetTopicalityScore( 104 1, url, url_matches, title_matches, word_starts); 105 } 106 107 TEST_F(ScoredHistoryMatchTest, Scoring) { 108 // We use NowFromSystemTime() because MakeURLRow uses the same function 109 // to calculate last visit time when building a row. 110 base::Time now = base::Time::NowFromSystemTime(); 111 112 URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1)); 113 RowWordStarts word_starts_a; 114 PopulateWordStarts(row_a, &word_starts_a); 115 VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now); 116 // Mark one visit as typed. 117 visits_a[0].second = content::PAGE_TRANSITION_TYPED; 118 ScoredHistoryMatch scored_a(row_a, visits_a, std::string(), 119 ASCIIToUTF16("abc"), Make1Term("abc"), 120 word_starts_a, now, NULL); 121 122 // Test scores based on visit_count. 123 URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1)); 124 RowWordStarts word_starts_b; 125 PopulateWordStarts(row_b, &word_starts_b); 126 VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now); 127 visits_b[0].second = content::PAGE_TRANSITION_TYPED; 128 ScoredHistoryMatch scored_b(row_b, visits_b, std::string(), 129 ASCIIToUTF16("abc"), Make1Term("abc"), 130 word_starts_b, now, NULL); 131 EXPECT_GT(scored_b.raw_score, scored_a.raw_score); 132 133 // Test scores based on last_visit. 134 URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1)); 135 RowWordStarts word_starts_c; 136 PopulateWordStarts(row_c, &word_starts_c); 137 VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now); 138 visits_c[0].second = content::PAGE_TRANSITION_TYPED; 139 ScoredHistoryMatch scored_c(row_c, visits_c, std::string(), 140 ASCIIToUTF16("abc"), Make1Term("abc"), 141 word_starts_c, now, NULL); 142 EXPECT_GT(scored_c.raw_score, scored_a.raw_score); 143 144 // Test scores based on typed_count. 145 URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3)); 146 RowWordStarts word_starts_d; 147 PopulateWordStarts(row_d, &word_starts_d); 148 VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now); 149 visits_d[0].second = content::PAGE_TRANSITION_TYPED; 150 visits_d[1].second = content::PAGE_TRANSITION_TYPED; 151 visits_d[2].second = content::PAGE_TRANSITION_TYPED; 152 ScoredHistoryMatch scored_d(row_d, visits_d, std::string(), 153 ASCIIToUTF16("abc"), Make1Term("abc"), 154 word_starts_d, now, NULL); 155 EXPECT_GT(scored_d.raw_score, scored_a.raw_score); 156 157 // Test scores based on a terms appearing multiple times. 158 URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi", 159 "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3)); 160 RowWordStarts word_starts_e; 161 PopulateWordStarts(row_e, &word_starts_e); 162 const VisitInfoVector visits_e = visits_d; 163 ScoredHistoryMatch scored_e(row_e, visits_e, std::string(), 164 ASCIIToUTF16("csi"), Make1Term("csi"), 165 word_starts_e, now, NULL); 166 EXPECT_LT(scored_e.raw_score, 1400); 167 168 // Test that a result with only a mid-term match (i.e., not at a word 169 // boundary) scores 0. 170 ScoredHistoryMatch scored_f(row_a, visits_a, std::string(), 171 ASCIIToUTF16("cd"), Make1Term("cd"), 172 word_starts_a, now, NULL); 173 EXPECT_EQ(scored_f.raw_score, 0); 174 } 175 176 TEST_F(ScoredHistoryMatchTest, Inlining) { 177 // We use NowFromSystemTime() because MakeURLRow uses the same function 178 // to calculate last visit time when building a row. 179 base::Time now = base::Time::NowFromSystemTime(); 180 RowWordStarts word_starts; 181 VisitInfoVector visits; 182 183 { 184 URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1)); 185 ScoredHistoryMatch scored_a(row, visits, std::string(), 186 ASCIIToUTF16("g"), Make1Term("g"), 187 word_starts, now, NULL); 188 EXPECT_TRUE(scored_a.can_inline); 189 EXPECT_FALSE(scored_a.match_in_scheme); 190 ScoredHistoryMatch scored_b(row, visits, std::string(), 191 ASCIIToUTF16("w"), Make1Term("w"), 192 word_starts, now, NULL); 193 EXPECT_TRUE(scored_b.can_inline); 194 EXPECT_FALSE(scored_b.match_in_scheme); 195 ScoredHistoryMatch scored_c(row, visits, std::string(), 196 ASCIIToUTF16("h"), Make1Term("h"), 197 word_starts, now, NULL); 198 EXPECT_TRUE(scored_c.can_inline); 199 EXPECT_TRUE(scored_c.match_in_scheme); 200 ScoredHistoryMatch scored_d(row, visits, std::string(), 201 ASCIIToUTF16("o"), Make1Term("o"), 202 word_starts, now, NULL); 203 EXPECT_FALSE(scored_d.can_inline); 204 EXPECT_FALSE(scored_d.match_in_scheme); 205 } 206 207 { 208 URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1)); 209 ScoredHistoryMatch scored_a(row, visits, std::string(), 210 ASCIIToUTF16("t"), Make1Term("t"), 211 word_starts, now, NULL); 212 EXPECT_TRUE(scored_a.can_inline); 213 EXPECT_FALSE(scored_a.match_in_scheme); 214 ScoredHistoryMatch scored_b(row, visits, std::string(), 215 ASCIIToUTF16("f"), Make1Term("f"), 216 word_starts, now, NULL); 217 EXPECT_FALSE(scored_b.can_inline); 218 EXPECT_FALSE(scored_b.match_in_scheme); 219 ScoredHistoryMatch scored_c(row, visits, std::string(), 220 ASCIIToUTF16("o"), Make1Term("o"), 221 word_starts, now, NULL); 222 EXPECT_FALSE(scored_c.can_inline); 223 EXPECT_FALSE(scored_c.match_in_scheme); 224 } 225 226 { 227 URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1)); 228 ScoredHistoryMatch scored_a(row, visits, std::string(), 229 ASCIIToUTF16("t"), Make1Term("t"), 230 word_starts, now, NULL); 231 EXPECT_TRUE(scored_a.can_inline); 232 EXPECT_FALSE(scored_a.match_in_scheme); 233 ScoredHistoryMatch scored_b(row, visits, std::string(), 234 ASCIIToUTF16("h"), Make1Term("h"), 235 word_starts, now, NULL); 236 EXPECT_TRUE(scored_b.can_inline); 237 EXPECT_TRUE(scored_b.match_in_scheme); 238 ScoredHistoryMatch scored_c(row, visits, std::string(), 239 ASCIIToUTF16("w"), Make1Term("w"), 240 word_starts, now, NULL); 241 EXPECT_TRUE(scored_c.can_inline); 242 EXPECT_FALSE(scored_c.match_in_scheme); 243 } 244 } 245 246 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) { 247 const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle( 248 ASCIIToUTF16("def"), 249 ASCIIToUTF16("http://abc.def.com/"), 250 ASCIIToUTF16("Non-Matching Title")); 251 const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle( 252 ASCIIToUTF16("def"), 253 ASCIIToUTF16("http://abc.def.com"), 254 ASCIIToUTF16("Non-Matching Title")); 255 EXPECT_EQ(hostname_no_slash, hostname); 256 } 257 258 // This function only tests scoring of single terms that match exactly 259 // once somewhere in the URL or title. 260 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) { 261 string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?" 262 "arg1=val1&arg2=val2#hash_component"); 263 string16 title = ASCIIToUTF16("here is a title"); 264 const float hostname_score = 265 GetTopicalityScoreOfTermAgainstURLAndTitle( 266 ASCIIToUTF16("abc"), url, title); 267 const float hostname_mid_word_score = 268 GetTopicalityScoreOfTermAgainstURLAndTitle( 269 ASCIIToUTF16("bc"), url, title); 270 const float domain_name_score = 271 GetTopicalityScoreOfTermAgainstURLAndTitle( 272 ASCIIToUTF16("def"), url, title); 273 const float domain_name_mid_word_score = 274 GetTopicalityScoreOfTermAgainstURLAndTitle( 275 ASCIIToUTF16("ef"), url, title); 276 const float tld_score = 277 GetTopicalityScoreOfTermAgainstURLAndTitle( 278 ASCIIToUTF16("com"), url, title); 279 const float tld_mid_word_score = 280 GetTopicalityScoreOfTermAgainstURLAndTitle( 281 ASCIIToUTF16("om"), url, title); 282 const float path_score = 283 GetTopicalityScoreOfTermAgainstURLAndTitle( 284 ASCIIToUTF16("path1"), url, title); 285 const float path_mid_word_score = 286 GetTopicalityScoreOfTermAgainstURLAndTitle( 287 ASCIIToUTF16("ath1"), url, title); 288 const float arg_score = 289 GetTopicalityScoreOfTermAgainstURLAndTitle( 290 ASCIIToUTF16("arg2"), url, title); 291 const float arg_mid_word_score = 292 GetTopicalityScoreOfTermAgainstURLAndTitle( 293 ASCIIToUTF16("rg2"), url, title); 294 const float protocol_score = 295 GetTopicalityScoreOfTermAgainstURLAndTitle( 296 ASCIIToUTF16("htt"), url, title); 297 const float protocol_mid_word_score = 298 GetTopicalityScoreOfTermAgainstURLAndTitle( 299 ASCIIToUTF16("tt"), url, title); 300 const float title_score = 301 GetTopicalityScoreOfTermAgainstURLAndTitle( 302 ASCIIToUTF16("her"), url, title); 303 const float title_mid_word_score = 304 GetTopicalityScoreOfTermAgainstURLAndTitle( 305 ASCIIToUTF16("er"), url, title); 306 // Verify hostname and domain name > path > arg. 307 EXPECT_GT(hostname_score, path_score); 308 EXPECT_GT(domain_name_score, path_score); 309 EXPECT_GT(path_score, arg_score); 310 // Verify that domain name > path and domain name > arg for non-word 311 // boundaries. 312 EXPECT_GT(hostname_mid_word_score, path_mid_word_score); 313 EXPECT_GT(domain_name_mid_word_score, path_mid_word_score); 314 EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score); 315 EXPECT_GT(hostname_mid_word_score, arg_mid_word_score); 316 // Also verify that the matches at non-word-boundaries all score 317 // worse than the matches at word boundaries. These three sets suffice. 318 EXPECT_GT(arg_score, hostname_mid_word_score); 319 EXPECT_GT(arg_score, domain_name_mid_word_score); 320 EXPECT_GT(title_score, title_mid_word_score); 321 // Check that title matches fit somewhere reasonable compared to the 322 // various types of URL matches. 323 EXPECT_GT(title_score, arg_score); 324 EXPECT_GT(arg_score, title_mid_word_score); 325 // Finally, verify that protocol matches and top level domain name 326 // matches (.com, .net, etc.) score worse than some of the mid-word 327 // matches that actually count. 328 EXPECT_GT(hostname_mid_word_score, protocol_score); 329 EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score); 330 EXPECT_GT(hostname_mid_word_score, tld_score); 331 EXPECT_GT(hostname_mid_word_score, tld_mid_word_score); 332 } 333 334 } // namespace history 335