1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <algorithm> 6 7 #include "base/auto_reset.h" 8 #include "base/strings/string16.h" 9 #include "base/strings/utf_string_conversions.h" 10 #include "chrome/browser/history/scored_history_match.h" 11 #include "components/history/core/test/history_client_fake_bookmarks.h" 12 #include "testing/gtest/include/gtest/gtest.h" 13 14 using base::ASCIIToUTF16; 15 16 namespace history { 17 18 // Returns a VisitInfoVector that includes |num_visits| spread over the 19 // last |frequency|*|num_visits| days (relative to |now|). A frequency of 20 // one means one visit each day, two means every other day, etc. 21 VisitInfoVector CreateVisitInfoVector(int num_visits, 22 int frequency, 23 base::Time now) { 24 VisitInfoVector visits; 25 for (int i = 0; i < num_visits; ++i) { 26 visits.push_back( 27 std::make_pair(now - base::TimeDelta::FromDays(i * frequency), 28 content::PAGE_TRANSITION_LINK)); 29 } 30 return visits; 31 } 32 33 class ScoredHistoryMatchTest : public testing::Test { 34 protected: 35 // Convenience function to create a URLRow with basic data for |url|, |title|, 36 // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number 37 // of days ago to which to set the URL's last_visit. 38 URLRow MakeURLRow(const char* url, 39 const char* title, 40 int visit_count, 41 int days_since_last_visit, 42 int typed_count); 43 44 // Convenience function to set the word starts information from a URLRow's 45 // URL and title. 46 void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts); 47 48 // Convenience functions for easily creating vectors of search terms. 49 String16Vector Make1Term(const char* term) const; 50 String16Vector Make2Terms(const char* term_1, const char* term_2) const; 51 52 // Convenience function for GetTopicalityScore() that builds the 53 // term match and word break information automatically that are needed 54 // to call GetTopicalityScore(). It only works for scoring a single term, 55 // not multiple terms. 56 float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term, 57 const base::string16& url, 58 const base::string16& title); 59 }; 60 61 URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url, 62 const char* title, 63 int visit_count, 64 int days_since_last_visit, 65 int typed_count) { 66 URLRow row(GURL(url), 0); 67 row.set_title(ASCIIToUTF16(title)); 68 row.set_visit_count(visit_count); 69 row.set_typed_count(typed_count); 70 row.set_last_visit(base::Time::NowFromSystemTime() - 71 base::TimeDelta::FromDays(days_since_last_visit)); 72 return row; 73 } 74 75 void ScoredHistoryMatchTest::PopulateWordStarts( 76 const URLRow& url_row, RowWordStarts* word_starts) { 77 String16SetFromString16(ASCIIToUTF16(url_row.url().spec()), 78 &word_starts->url_word_starts_); 79 String16SetFromString16(url_row.title(), &word_starts->title_word_starts_); 80 } 81 82 83 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const { 84 String16Vector original_terms; 85 original_terms.push_back(ASCIIToUTF16(term)); 86 return original_terms; 87 } 88 89 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1, 90 const char* term_2) const { 91 String16Vector original_terms; 92 original_terms.push_back(ASCIIToUTF16(term_1)); 93 original_terms.push_back(ASCIIToUTF16(term_2)); 94 return original_terms; 95 } 96 97 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( 98 const base::string16& term, 99 const base::string16& url, 100 const base::string16& title) { 101 // Make an empty match and simply populate the fields we need in order 102 // to call GetTopicalityScore(). 103 ScoredHistoryMatch scored_match; 104 scored_match.url_matches_ = MatchTermInString(term, url, 0); 105 scored_match.title_matches_ = MatchTermInString(term, title, 0); 106 RowWordStarts word_starts; 107 String16SetFromString16(url, &word_starts.url_word_starts_); 108 String16SetFromString16(title, &word_starts.title_word_starts_); 109 WordStarts one_word_no_offset(1, 0u); 110 return scored_match.GetTopicalityScore(1, url, one_word_no_offset, 111 word_starts); 112 } 113 114 TEST_F(ScoredHistoryMatchTest, Scoring) { 115 // We use NowFromSystemTime() because MakeURLRow uses the same function 116 // to calculate last visit time when building a row. 117 base::Time now = base::Time::NowFromSystemTime(); 118 119 URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1)); 120 RowWordStarts word_starts_a; 121 PopulateWordStarts(row_a, &word_starts_a); 122 WordStarts one_word_no_offset(1, 0u); 123 VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now); 124 // Mark one visit as typed. 125 visits_a[0].second = content::PAGE_TRANSITION_TYPED; 126 ScoredHistoryMatch scored_a(row_a, visits_a, std::string(), 127 ASCIIToUTF16("abc"), Make1Term("abc"), 128 one_word_no_offset, word_starts_a, now, NULL); 129 130 // Test scores based on visit_count. 131 URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1)); 132 RowWordStarts word_starts_b; 133 PopulateWordStarts(row_b, &word_starts_b); 134 VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now); 135 visits_b[0].second = content::PAGE_TRANSITION_TYPED; 136 ScoredHistoryMatch scored_b(row_b, visits_b, std::string(), 137 ASCIIToUTF16("abc"), Make1Term("abc"), 138 one_word_no_offset, word_starts_b, now, NULL); 139 EXPECT_GT(scored_b.raw_score(), scored_a.raw_score()); 140 141 // Test scores based on last_visit. 142 URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1)); 143 RowWordStarts word_starts_c; 144 PopulateWordStarts(row_c, &word_starts_c); 145 VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now); 146 visits_c[0].second = content::PAGE_TRANSITION_TYPED; 147 ScoredHistoryMatch scored_c(row_c, visits_c, std::string(), 148 ASCIIToUTF16("abc"), Make1Term("abc"), 149 one_word_no_offset, word_starts_c, now, NULL); 150 EXPECT_GT(scored_c.raw_score(), scored_a.raw_score()); 151 152 // Test scores based on typed_count. 153 URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3)); 154 RowWordStarts word_starts_d; 155 PopulateWordStarts(row_d, &word_starts_d); 156 VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now); 157 visits_d[0].second = content::PAGE_TRANSITION_TYPED; 158 visits_d[1].second = content::PAGE_TRANSITION_TYPED; 159 visits_d[2].second = content::PAGE_TRANSITION_TYPED; 160 ScoredHistoryMatch scored_d(row_d, visits_d, std::string(), 161 ASCIIToUTF16("abc"), Make1Term("abc"), 162 one_word_no_offset, word_starts_d, now, NULL); 163 EXPECT_GT(scored_d.raw_score(), scored_a.raw_score()); 164 165 // Test scores based on a terms appearing multiple times. 166 URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi", 167 "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3)); 168 RowWordStarts word_starts_e; 169 PopulateWordStarts(row_e, &word_starts_e); 170 const VisitInfoVector visits_e = visits_d; 171 ScoredHistoryMatch scored_e(row_e, visits_e, std::string(), 172 ASCIIToUTF16("csi"), Make1Term("csi"), 173 one_word_no_offset, word_starts_e, now, NULL); 174 EXPECT_LT(scored_e.raw_score(), 1400); 175 176 // Test that a result with only a mid-term match (i.e., not at a word 177 // boundary) scores 0. 178 ScoredHistoryMatch scored_f(row_a, visits_a, std::string(), 179 ASCIIToUTF16("cd"), Make1Term("cd"), 180 one_word_no_offset, word_starts_a, now, NULL); 181 EXPECT_EQ(scored_f.raw_score(), 0); 182 } 183 184 TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) { 185 // We use NowFromSystemTime() because MakeURLRow uses the same function 186 // to calculate last visit time when building a row. 187 base::Time now = base::Time::NowFromSystemTime(); 188 189 std::string url_string("http://fedcba"); 190 const GURL url(url_string); 191 URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1)); 192 RowWordStarts word_starts; 193 PopulateWordStarts(row, &word_starts); 194 WordStarts one_word_no_offset(1, 0u); 195 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 196 ScoredHistoryMatch scored(row, visits, std::string(), 197 ASCIIToUTF16("abc"), Make1Term("abc"), 198 one_word_no_offset, word_starts, now, NULL); 199 // Now bookmark that URL and make sure its score increases. 200 base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5); 201 history::HistoryClientFakeBookmarks history_client; 202 history_client.AddBookmark(url); 203 ScoredHistoryMatch scored_with_bookmark( 204 row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"), 205 one_word_no_offset, word_starts, now, &history_client); 206 EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score()); 207 } 208 209 TEST_F(ScoredHistoryMatchTest, ScoringTLD) { 210 // We use NowFromSystemTime() because MakeURLRow uses the same function 211 // to calculate last visit time when building a row. 212 base::Time now = base::Time::NowFromSystemTime(); 213 214 // By default the URL should not be returned for a query that includes "com". 215 std::string url_string("http://fedcba.com/"); 216 const GURL url(url_string); 217 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1)); 218 RowWordStarts word_starts; 219 PopulateWordStarts(row, &word_starts); 220 WordStarts two_words_no_offsets(2, 0u); 221 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 222 ScoredHistoryMatch scored(row, visits, std::string(), 223 ASCIIToUTF16("fed com"), Make2Terms("fed", "com"), 224 two_words_no_offsets, word_starts, now, NULL); 225 EXPECT_EQ(0, scored.raw_score()); 226 227 // Now allow credit for the match in the TLD. 228 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true); 229 ScoredHistoryMatch scored_with_tld( 230 row, visits, std::string(), ASCIIToUTF16("fed com"), 231 Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL); 232 EXPECT_GT(scored_with_tld.raw_score(), 0); 233 } 234 235 TEST_F(ScoredHistoryMatchTest, ScoringScheme) { 236 // We use NowFromSystemTime() because MakeURLRow uses the same function 237 // to calculate last visit time when building a row. 238 base::Time now = base::Time::NowFromSystemTime(); 239 240 // By default the URL should not be returned for a query that includes "http". 241 std::string url_string("http://fedcba/"); 242 const GURL url(url_string); 243 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1)); 244 RowWordStarts word_starts; 245 PopulateWordStarts(row, &word_starts); 246 WordStarts two_words_no_offsets(2, 0u); 247 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now); 248 ScoredHistoryMatch scored(row, visits, std::string(), 249 ASCIIToUTF16("fed http"), Make2Terms("fed", "http"), 250 two_words_no_offsets, word_starts, now, NULL); 251 EXPECT_EQ(0, scored.raw_score()); 252 253 // Now allow credit for the match in the scheme. 254 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true); 255 ScoredHistoryMatch scored_with_scheme( 256 row, visits, std::string(), ASCIIToUTF16("fed http"), 257 Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL); 258 EXPECT_GT(scored_with_scheme.raw_score(), 0); 259 } 260 261 TEST_F(ScoredHistoryMatchTest, Inlining) { 262 // We use NowFromSystemTime() because MakeURLRow uses the same function 263 // to calculate last visit time when building a row. 264 base::Time now = base::Time::NowFromSystemTime(); 265 RowWordStarts word_starts; 266 WordStarts one_word_no_offset(1, 0u); 267 VisitInfoVector visits; 268 269 { 270 URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1)); 271 PopulateWordStarts(row, &word_starts); 272 ScoredHistoryMatch scored_a(row, visits, std::string(), 273 ASCIIToUTF16("g"), Make1Term("g"), 274 one_word_no_offset, word_starts, now, NULL); 275 EXPECT_TRUE(scored_a.can_inline()); 276 EXPECT_FALSE(scored_a.match_in_scheme); 277 ScoredHistoryMatch scored_b(row, visits, std::string(), 278 ASCIIToUTF16("w"), Make1Term("w"), 279 one_word_no_offset, word_starts, now, NULL); 280 EXPECT_TRUE(scored_b.can_inline()); 281 EXPECT_FALSE(scored_b.match_in_scheme); 282 ScoredHistoryMatch scored_c(row, visits, std::string(), 283 ASCIIToUTF16("h"), Make1Term("h"), 284 one_word_no_offset, word_starts, now, NULL); 285 EXPECT_TRUE(scored_c.can_inline()); 286 EXPECT_TRUE(scored_c.match_in_scheme); 287 ScoredHistoryMatch scored_d(row, visits, std::string(), 288 ASCIIToUTF16("o"), Make1Term("o"), 289 one_word_no_offset, word_starts, now, NULL); 290 EXPECT_FALSE(scored_d.can_inline()); 291 EXPECT_FALSE(scored_d.match_in_scheme); 292 } 293 294 { 295 URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1)); 296 PopulateWordStarts(row, &word_starts); 297 ScoredHistoryMatch scored_a(row, visits, std::string(), 298 ASCIIToUTF16("t"), Make1Term("t"), 299 one_word_no_offset, word_starts, now, NULL); 300 EXPECT_TRUE(scored_a.can_inline()); 301 EXPECT_FALSE(scored_a.match_in_scheme); 302 ScoredHistoryMatch scored_b(row, visits, std::string(), 303 ASCIIToUTF16("f"), Make1Term("f"), 304 one_word_no_offset, word_starts, now, NULL); 305 EXPECT_FALSE(scored_b.can_inline()); 306 EXPECT_FALSE(scored_b.match_in_scheme); 307 ScoredHistoryMatch scored_c(row, visits, std::string(), 308 ASCIIToUTF16("o"), Make1Term("o"), 309 one_word_no_offset, word_starts, now, NULL); 310 EXPECT_FALSE(scored_c.can_inline()); 311 EXPECT_FALSE(scored_c.match_in_scheme); 312 } 313 314 { 315 URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1)); 316 PopulateWordStarts(row, &word_starts); 317 ScoredHistoryMatch scored_a(row, visits, std::string(), 318 ASCIIToUTF16("t"), Make1Term("t"), 319 one_word_no_offset, word_starts, now, NULL); 320 EXPECT_TRUE(scored_a.can_inline()); 321 EXPECT_FALSE(scored_a.match_in_scheme); 322 ScoredHistoryMatch scored_b(row, visits, std::string(), 323 ASCIIToUTF16("h"), Make1Term("h"), 324 one_word_no_offset, word_starts, now, NULL); 325 EXPECT_TRUE(scored_b.can_inline()); 326 EXPECT_TRUE(scored_b.match_in_scheme); 327 ScoredHistoryMatch scored_c(row, visits, std::string(), 328 ASCIIToUTF16("w"), Make1Term("w"), 329 one_word_no_offset, word_starts, now, NULL); 330 EXPECT_TRUE(scored_c.can_inline()); 331 EXPECT_FALSE(scored_c.match_in_scheme); 332 } 333 } 334 335 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) { 336 const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle( 337 ASCIIToUTF16("def"), 338 ASCIIToUTF16("http://abc.def.com/"), 339 ASCIIToUTF16("Non-Matching Title")); 340 const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle( 341 ASCIIToUTF16("def"), 342 ASCIIToUTF16("http://abc.def.com"), 343 ASCIIToUTF16("Non-Matching Title")); 344 EXPECT_EQ(hostname_no_slash, hostname); 345 } 346 347 // This function only tests scoring of single terms that match exactly 348 // once somewhere in the URL or title. 349 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) { 350 base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?" 351 "arg1=val1&arg2=val2#hash_component"); 352 base::string16 title = ASCIIToUTF16("here is a title"); 353 const float hostname_score = 354 GetTopicalityScoreOfTermAgainstURLAndTitle( 355 ASCIIToUTF16("abc"), url, title); 356 const float hostname_mid_word_score = 357 GetTopicalityScoreOfTermAgainstURLAndTitle( 358 ASCIIToUTF16("bc"), url, title); 359 const float domain_name_score = 360 GetTopicalityScoreOfTermAgainstURLAndTitle( 361 ASCIIToUTF16("def"), url, title); 362 const float domain_name_mid_word_score = 363 GetTopicalityScoreOfTermAgainstURLAndTitle( 364 ASCIIToUTF16("ef"), url, title); 365 const float tld_score = 366 GetTopicalityScoreOfTermAgainstURLAndTitle( 367 ASCIIToUTF16("com"), url, title); 368 const float tld_mid_word_score = 369 GetTopicalityScoreOfTermAgainstURLAndTitle( 370 ASCIIToUTF16("om"), url, title); 371 const float path_score = 372 GetTopicalityScoreOfTermAgainstURLAndTitle( 373 ASCIIToUTF16("path1"), url, title); 374 const float path_mid_word_score = 375 GetTopicalityScoreOfTermAgainstURLAndTitle( 376 ASCIIToUTF16("ath1"), url, title); 377 const float arg_score = 378 GetTopicalityScoreOfTermAgainstURLAndTitle( 379 ASCIIToUTF16("arg2"), url, title); 380 const float arg_mid_word_score = 381 GetTopicalityScoreOfTermAgainstURLAndTitle( 382 ASCIIToUTF16("rg2"), url, title); 383 const float protocol_score = 384 GetTopicalityScoreOfTermAgainstURLAndTitle( 385 ASCIIToUTF16("htt"), url, title); 386 const float protocol_mid_word_score = 387 GetTopicalityScoreOfTermAgainstURLAndTitle( 388 ASCIIToUTF16("tt"), url, title); 389 const float title_score = 390 GetTopicalityScoreOfTermAgainstURLAndTitle( 391 ASCIIToUTF16("her"), url, title); 392 const float title_mid_word_score = 393 GetTopicalityScoreOfTermAgainstURLAndTitle( 394 ASCIIToUTF16("er"), url, title); 395 // Verify hostname and domain name > path > arg. 396 EXPECT_GT(hostname_score, path_score); 397 EXPECT_GT(domain_name_score, path_score); 398 EXPECT_GT(path_score, arg_score); 399 // Verify that domain name > path and domain name > arg for non-word 400 // boundaries. 401 EXPECT_GT(hostname_mid_word_score, path_mid_word_score); 402 EXPECT_GT(domain_name_mid_word_score, path_mid_word_score); 403 EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score); 404 EXPECT_GT(hostname_mid_word_score, arg_mid_word_score); 405 // Also verify that the matches at non-word-boundaries all score 406 // worse than the matches at word boundaries. These three sets suffice. 407 EXPECT_GT(arg_score, hostname_mid_word_score); 408 EXPECT_GT(arg_score, domain_name_mid_word_score); 409 EXPECT_GT(title_score, title_mid_word_score); 410 // Check that title matches fit somewhere reasonable compared to the 411 // various types of URL matches. 412 EXPECT_GT(title_score, arg_score); 413 EXPECT_GT(arg_score, title_mid_word_score); 414 // Finally, verify that protocol matches and top level domain name 415 // matches (.com, .net, etc.) score worse than some of the mid-word 416 // matches that actually count. 417 EXPECT_GT(hostname_mid_word_score, protocol_score); 418 EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score); 419 EXPECT_GT(hostname_mid_word_score, tld_score); 420 EXPECT_GT(hostname_mid_word_score, tld_mid_word_score); 421 } 422 423 } // namespace history 424