Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <algorithm>
      6 
      7 #include "base/auto_reset.h"
      8 #include "base/strings/string16.h"
      9 #include "base/strings/utf_string_conversions.h"
     10 #include "chrome/browser/history/scored_history_match.h"
     11 #include "components/history/core/test/history_client_fake_bookmarks.h"
     12 #include "testing/gtest/include/gtest/gtest.h"
     13 
     14 using base::ASCIIToUTF16;
     15 
     16 namespace history {
     17 
     18 // Returns a VisitInfoVector that includes |num_visits| spread over the
     19 // last |frequency|*|num_visits| days (relative to |now|).  A frequency of
     20 // one means one visit each day, two means every other day, etc.
     21 VisitInfoVector CreateVisitInfoVector(int num_visits,
     22                                       int frequency,
     23                                       base::Time now) {
     24   VisitInfoVector visits;
     25   for (int i = 0; i < num_visits; ++i) {
     26     visits.push_back(
     27         std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
     28                        ui::PAGE_TRANSITION_LINK));
     29   }
     30   return visits;
     31 }
     32 
     33 class ScoredHistoryMatchTest : public testing::Test {
     34  protected:
     35   // Convenience function to create a URLRow with basic data for |url|, |title|,
     36   // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number
     37   // of days ago to which to set the URL's last_visit.
     38   URLRow MakeURLRow(const char* url,
     39                     const char* title,
     40                     int visit_count,
     41                     int days_since_last_visit,
     42                     int typed_count);
     43 
     44   // Convenience function to set the word starts information from a URLRow's
     45   // URL and title.
     46   void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts);
     47 
     48   // Convenience functions for easily creating vectors of search terms.
     49   String16Vector Make1Term(const char* term) const;
     50   String16Vector Make2Terms(const char* term_1, const char* term_2) const;
     51 
     52   // Convenience function for GetTopicalityScore() that builds the
     53   // term match and word break information automatically that are needed
     54   // to call GetTopicalityScore().  It only works for scoring a single term,
     55   // not multiple terms.
     56   float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
     57                                                    const base::string16& url,
     58                                                    const base::string16& title);
     59 };
     60 
     61 URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
     62                                           const char* title,
     63                                           int visit_count,
     64                                           int days_since_last_visit,
     65                                           int typed_count) {
     66   URLRow row(GURL(url), 0);
     67   row.set_title(ASCIIToUTF16(title));
     68   row.set_visit_count(visit_count);
     69   row.set_typed_count(typed_count);
     70   row.set_last_visit(base::Time::NowFromSystemTime() -
     71                      base::TimeDelta::FromDays(days_since_last_visit));
     72   return row;
     73 }
     74 
     75 void ScoredHistoryMatchTest::PopulateWordStarts(
     76     const URLRow& url_row, RowWordStarts* word_starts) {
     77   String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
     78                           &word_starts->url_word_starts_);
     79   String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
     80 }
     81 
     82 
     83 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
     84   String16Vector original_terms;
     85   original_terms.push_back(ASCIIToUTF16(term));
     86   return original_terms;
     87 }
     88 
     89 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
     90                                                   const char* term_2) const {
     91   String16Vector original_terms;
     92   original_terms.push_back(ASCIIToUTF16(term_1));
     93   original_terms.push_back(ASCIIToUTF16(term_2));
     94   return original_terms;
     95 }
     96 
     97 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
     98     const base::string16& term,
     99     const base::string16& url,
    100     const base::string16& title) {
    101   // Make an empty match and simply populate the fields we need in order
    102   // to call GetTopicalityScore().
    103   ScoredHistoryMatch scored_match;
    104   scored_match.url_matches_ = MatchTermInString(term, url, 0);
    105   scored_match.title_matches_ = MatchTermInString(term, title, 0);
    106   RowWordStarts word_starts;
    107   String16SetFromString16(url, &word_starts.url_word_starts_);
    108   String16SetFromString16(title, &word_starts.title_word_starts_);
    109   WordStarts one_word_no_offset(1, 0u);
    110   return scored_match.GetTopicalityScore(1, url, one_word_no_offset,
    111                                          word_starts);
    112 }
    113 
    114 TEST_F(ScoredHistoryMatchTest, Scoring) {
    115   // We use NowFromSystemTime() because MakeURLRow uses the same function
    116   // to calculate last visit time when building a row.
    117   base::Time now = base::Time::NowFromSystemTime();
    118 
    119   URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
    120   RowWordStarts word_starts_a;
    121   PopulateWordStarts(row_a, &word_starts_a);
    122   WordStarts one_word_no_offset(1, 0u);
    123   VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
    124   // Mark one visit as typed.
    125   visits_a[0].second = ui::PAGE_TRANSITION_TYPED;
    126   ScoredHistoryMatch scored_a(row_a, visits_a, std::string(),
    127                               ASCIIToUTF16("abc"), Make1Term("abc"),
    128                               one_word_no_offset, word_starts_a, now, NULL);
    129 
    130   // Test scores based on visit_count.
    131   URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
    132   RowWordStarts word_starts_b;
    133   PopulateWordStarts(row_b, &word_starts_b);
    134   VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
    135   visits_b[0].second = ui::PAGE_TRANSITION_TYPED;
    136   ScoredHistoryMatch scored_b(row_b, visits_b, std::string(),
    137                               ASCIIToUTF16("abc"), Make1Term("abc"),
    138                               one_word_no_offset, word_starts_b, now, NULL);
    139   EXPECT_GT(scored_b.raw_score(), scored_a.raw_score());
    140 
    141   // Test scores based on last_visit.
    142   URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
    143   RowWordStarts word_starts_c;
    144   PopulateWordStarts(row_c, &word_starts_c);
    145   VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
    146   visits_c[0].second = ui::PAGE_TRANSITION_TYPED;
    147   ScoredHistoryMatch scored_c(row_c, visits_c, std::string(),
    148                               ASCIIToUTF16("abc"), Make1Term("abc"),
    149                               one_word_no_offset, word_starts_c, now, NULL);
    150   EXPECT_GT(scored_c.raw_score(), scored_a.raw_score());
    151 
    152   // Test scores based on typed_count.
    153   URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
    154   RowWordStarts word_starts_d;
    155   PopulateWordStarts(row_d, &word_starts_d);
    156   VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
    157   visits_d[0].second = ui::PAGE_TRANSITION_TYPED;
    158   visits_d[1].second = ui::PAGE_TRANSITION_TYPED;
    159   visits_d[2].second = ui::PAGE_TRANSITION_TYPED;
    160   ScoredHistoryMatch scored_d(row_d, visits_d, std::string(),
    161                               ASCIIToUTF16("abc"), Make1Term("abc"),
    162                               one_word_no_offset, word_starts_d, now, NULL);
    163   EXPECT_GT(scored_d.raw_score(), scored_a.raw_score());
    164 
    165   // Test scores based on a terms appearing multiple times.
    166   URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi",
    167       "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
    168   RowWordStarts word_starts_e;
    169   PopulateWordStarts(row_e, &word_starts_e);
    170   const VisitInfoVector visits_e = visits_d;
    171   ScoredHistoryMatch scored_e(row_e, visits_e, std::string(),
    172                               ASCIIToUTF16("csi"), Make1Term("csi"),
    173                               one_word_no_offset, word_starts_e, now, NULL);
    174   EXPECT_LT(scored_e.raw_score(), 1400);
    175 
    176   // Test that a result with only a mid-term match (i.e., not at a word
    177   // boundary) scores 0.
    178   ScoredHistoryMatch scored_f(row_a, visits_a, std::string(),
    179                               ASCIIToUTF16("cd"), Make1Term("cd"),
    180                               one_word_no_offset, word_starts_a, now, NULL);
    181   EXPECT_EQ(scored_f.raw_score(), 0);
    182 }
    183 
    184 TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) {
    185   // We use NowFromSystemTime() because MakeURLRow uses the same function
    186   // to calculate last visit time when building a row.
    187   base::Time now = base::Time::NowFromSystemTime();
    188 
    189   std::string url_string("http://fedcba");
    190   const GURL url(url_string);
    191   URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1));
    192   RowWordStarts word_starts;
    193   PopulateWordStarts(row, &word_starts);
    194   WordStarts one_word_no_offset(1, 0u);
    195   VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
    196   ScoredHistoryMatch scored(row, visits, std::string(),
    197                             ASCIIToUTF16("abc"), Make1Term("abc"),
    198                             one_word_no_offset, word_starts, now, NULL);
    199   // Now bookmark that URL and make sure its score increases.
    200   base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5);
    201   history::HistoryClientFakeBookmarks history_client;
    202   history_client.AddBookmark(url);
    203   ScoredHistoryMatch scored_with_bookmark(
    204       row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"),
    205       one_word_no_offset, word_starts, now, &history_client);
    206   EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score());
    207 }
    208 
    209 TEST_F(ScoredHistoryMatchTest, ScoringTLD) {
    210   // We use NowFromSystemTime() because MakeURLRow uses the same function
    211   // to calculate last visit time when building a row.
    212   base::Time now = base::Time::NowFromSystemTime();
    213 
    214   // By default the URL should not be returned for a query that includes "com".
    215   std::string url_string("http://fedcba.com/");
    216   const GURL url(url_string);
    217   URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
    218   RowWordStarts word_starts;
    219   PopulateWordStarts(row, &word_starts);
    220   WordStarts two_words_no_offsets(2, 0u);
    221   VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
    222   ScoredHistoryMatch scored(row, visits, std::string(),
    223                             ASCIIToUTF16("fed com"), Make2Terms("fed", "com"),
    224                             two_words_no_offsets, word_starts, now, NULL);
    225   EXPECT_EQ(0, scored.raw_score());
    226 
    227   // Now allow credit for the match in the TLD.
    228   base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true);
    229   ScoredHistoryMatch scored_with_tld(
    230       row, visits, std::string(), ASCIIToUTF16("fed com"),
    231       Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL);
    232   EXPECT_GT(scored_with_tld.raw_score(), 0);
    233 }
    234 
    235 TEST_F(ScoredHistoryMatchTest, ScoringScheme) {
    236   // We use NowFromSystemTime() because MakeURLRow uses the same function
    237   // to calculate last visit time when building a row.
    238   base::Time now = base::Time::NowFromSystemTime();
    239 
    240   // By default the URL should not be returned for a query that includes "http".
    241   std::string url_string("http://fedcba/");
    242   const GURL url(url_string);
    243   URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
    244   RowWordStarts word_starts;
    245   PopulateWordStarts(row, &word_starts);
    246   WordStarts two_words_no_offsets(2, 0u);
    247   VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
    248   ScoredHistoryMatch scored(row, visits, std::string(),
    249                             ASCIIToUTF16("fed http"), Make2Terms("fed", "http"),
    250                             two_words_no_offsets, word_starts, now, NULL);
    251   EXPECT_EQ(0, scored.raw_score());
    252 
    253   // Now allow credit for the match in the scheme.
    254   base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true);
    255   ScoredHistoryMatch scored_with_scheme(
    256       row, visits, std::string(), ASCIIToUTF16("fed http"),
    257       Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL);
    258   EXPECT_GT(scored_with_scheme.raw_score(), 0);
    259 }
    260 
    261 TEST_F(ScoredHistoryMatchTest, Inlining) {
    262   // We use NowFromSystemTime() because MakeURLRow uses the same function
    263   // to calculate last visit time when building a row.
    264   base::Time now = base::Time::NowFromSystemTime();
    265   RowWordStarts word_starts;
    266   WordStarts one_word_no_offset(1, 0u);
    267   VisitInfoVector visits;
    268 
    269   {
    270     URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
    271     PopulateWordStarts(row, &word_starts);
    272     ScoredHistoryMatch scored_a(row, visits, std::string(),
    273                                 ASCIIToUTF16("g"), Make1Term("g"),
    274                                 one_word_no_offset, word_starts, now, NULL);
    275     EXPECT_TRUE(scored_a.can_inline());
    276     EXPECT_FALSE(scored_a.match_in_scheme);
    277     ScoredHistoryMatch scored_b(row, visits, std::string(),
    278                                 ASCIIToUTF16("w"), Make1Term("w"),
    279                                 one_word_no_offset, word_starts, now, NULL);
    280     EXPECT_TRUE(scored_b.can_inline());
    281     EXPECT_FALSE(scored_b.match_in_scheme);
    282     ScoredHistoryMatch scored_c(row, visits, std::string(),
    283                                 ASCIIToUTF16("h"), Make1Term("h"),
    284                                 one_word_no_offset, word_starts, now, NULL);
    285     EXPECT_TRUE(scored_c.can_inline());
    286     EXPECT_TRUE(scored_c.match_in_scheme);
    287     ScoredHistoryMatch scored_d(row, visits, std::string(),
    288                                 ASCIIToUTF16("o"), Make1Term("o"),
    289                                 one_word_no_offset, word_starts, now, NULL);
    290     EXPECT_FALSE(scored_d.can_inline());
    291     EXPECT_FALSE(scored_d.match_in_scheme);
    292   }
    293 
    294   {
    295     URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
    296     PopulateWordStarts(row, &word_starts);
    297     ScoredHistoryMatch scored_a(row, visits, std::string(),
    298                                 ASCIIToUTF16("t"), Make1Term("t"),
    299                                 one_word_no_offset, word_starts, now, NULL);
    300     EXPECT_TRUE(scored_a.can_inline());
    301     EXPECT_FALSE(scored_a.match_in_scheme);
    302     ScoredHistoryMatch scored_b(row, visits, std::string(),
    303                                 ASCIIToUTF16("f"), Make1Term("f"),
    304                                 one_word_no_offset, word_starts, now, NULL);
    305     EXPECT_FALSE(scored_b.can_inline());
    306     EXPECT_FALSE(scored_b.match_in_scheme);
    307     ScoredHistoryMatch scored_c(row, visits, std::string(),
    308                                 ASCIIToUTF16("o"), Make1Term("o"),
    309                                 one_word_no_offset, word_starts, now, NULL);
    310     EXPECT_FALSE(scored_c.can_inline());
    311     EXPECT_FALSE(scored_c.match_in_scheme);
    312   }
    313 
    314   {
    315     URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1));
    316     PopulateWordStarts(row, &word_starts);
    317     ScoredHistoryMatch scored_a(row, visits, std::string(),
    318                                 ASCIIToUTF16("t"), Make1Term("t"),
    319                                 one_word_no_offset, word_starts, now, NULL);
    320     EXPECT_TRUE(scored_a.can_inline());
    321     EXPECT_FALSE(scored_a.match_in_scheme);
    322     ScoredHistoryMatch scored_b(row, visits, std::string(),
    323                                 ASCIIToUTF16("h"), Make1Term("h"),
    324                                 one_word_no_offset, word_starts, now, NULL);
    325     EXPECT_TRUE(scored_b.can_inline());
    326     EXPECT_TRUE(scored_b.match_in_scheme);
    327     ScoredHistoryMatch scored_c(row, visits, std::string(),
    328                                 ASCIIToUTF16("w"), Make1Term("w"),
    329                                 one_word_no_offset, word_starts, now, NULL);
    330     EXPECT_TRUE(scored_c.can_inline());
    331     EXPECT_FALSE(scored_c.match_in_scheme);
    332   }
    333 }
    334 
    335 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
    336   const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
    337       ASCIIToUTF16("def"),
    338       ASCIIToUTF16("http://abc.def.com/"),
    339       ASCIIToUTF16("Non-Matching Title"));
    340   const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
    341       ASCIIToUTF16("def"),
    342       ASCIIToUTF16("http://abc.def.com"),
    343       ASCIIToUTF16("Non-Matching Title"));
    344   EXPECT_EQ(hostname_no_slash, hostname);
    345 }
    346 
    347 // This function only tests scoring of single terms that match exactly
    348 // once somewhere in the URL or title.
    349 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
    350   base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?"
    351       "arg1=val1&arg2=val2#hash_component");
    352   base::string16 title = ASCIIToUTF16("here is a title");
    353   const float hostname_score =
    354       GetTopicalityScoreOfTermAgainstURLAndTitle(
    355           ASCIIToUTF16("abc"), url, title);
    356   const float hostname_mid_word_score =
    357       GetTopicalityScoreOfTermAgainstURLAndTitle(
    358           ASCIIToUTF16("bc"), url, title);
    359   const float domain_name_score =
    360       GetTopicalityScoreOfTermAgainstURLAndTitle(
    361           ASCIIToUTF16("def"), url, title);
    362   const float domain_name_mid_word_score =
    363       GetTopicalityScoreOfTermAgainstURLAndTitle(
    364           ASCIIToUTF16("ef"), url, title);
    365   const float tld_score =
    366       GetTopicalityScoreOfTermAgainstURLAndTitle(
    367           ASCIIToUTF16("com"), url, title);
    368   const float tld_mid_word_score =
    369       GetTopicalityScoreOfTermAgainstURLAndTitle(
    370           ASCIIToUTF16("om"), url, title);
    371   const float path_score =
    372       GetTopicalityScoreOfTermAgainstURLAndTitle(
    373           ASCIIToUTF16("path1"), url, title);
    374   const float path_mid_word_score =
    375       GetTopicalityScoreOfTermAgainstURLAndTitle(
    376           ASCIIToUTF16("ath1"), url, title);
    377   const float arg_score =
    378       GetTopicalityScoreOfTermAgainstURLAndTitle(
    379           ASCIIToUTF16("arg2"), url, title);
    380   const float arg_mid_word_score =
    381       GetTopicalityScoreOfTermAgainstURLAndTitle(
    382           ASCIIToUTF16("rg2"), url, title);
    383   const float protocol_score =
    384       GetTopicalityScoreOfTermAgainstURLAndTitle(
    385           ASCIIToUTF16("htt"), url, title);
    386   const float protocol_mid_word_score =
    387       GetTopicalityScoreOfTermAgainstURLAndTitle(
    388           ASCIIToUTF16("tt"), url, title);
    389   const float title_score =
    390       GetTopicalityScoreOfTermAgainstURLAndTitle(
    391           ASCIIToUTF16("her"), url, title);
    392   const float title_mid_word_score =
    393       GetTopicalityScoreOfTermAgainstURLAndTitle(
    394           ASCIIToUTF16("er"), url, title);
    395   // Verify hostname and domain name > path > arg.
    396   EXPECT_GT(hostname_score, path_score);
    397   EXPECT_GT(domain_name_score, path_score);
    398   EXPECT_GT(path_score, arg_score);
    399   // Verify that domain name > path and domain name > arg for non-word
    400   // boundaries.
    401   EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
    402   EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
    403   EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score);
    404   EXPECT_GT(hostname_mid_word_score, arg_mid_word_score);
    405   // Also verify that the matches at non-word-boundaries all score
    406   // worse than the matches at word boundaries.  These three sets suffice.
    407   EXPECT_GT(arg_score, hostname_mid_word_score);
    408   EXPECT_GT(arg_score, domain_name_mid_word_score);
    409   EXPECT_GT(title_score, title_mid_word_score);
    410   // Check that title matches fit somewhere reasonable compared to the
    411   // various types of URL matches.
    412   EXPECT_GT(title_score, arg_score);
    413   EXPECT_GT(arg_score, title_mid_word_score);
    414   // Finally, verify that protocol matches and top level domain name
    415   // matches (.com, .net, etc.) score worse than some of the mid-word
    416   // matches that actually count.
    417   EXPECT_GT(hostname_mid_word_score, protocol_score);
    418   EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score);
    419   EXPECT_GT(hostname_mid_word_score, tld_score);
    420   EXPECT_GT(hostname_mid_word_score, tld_mid_word_score);
    421 }
    422 
    423 }  // namespace history
    424