Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <algorithm>
      6 
      7 #include "base/strings/string16.h"
      8 #include "base/strings/utf_string_conversions.h"
      9 #include "chrome/browser/bookmarks/bookmark_service.h"
     10 #include "chrome/browser/history/scored_history_match.h"
     11 #include "testing/gtest/include/gtest/gtest.h"
     12 
     13 namespace history {
     14 
     15 // Returns a VisitInfoVector that includes |num_visits| spread over the
     16 // last |frecency|*|num_visits| days (relative to |now|).  A frequency of
     17 // one means one visit each day, two means every other day, etc.
     18 VisitInfoVector CreateVisitInfoVector(int num_visits,
     19                                       int frequency,
     20                                       base::Time now) {
     21   VisitInfoVector visits;
     22   for (int i = 0; i < num_visits; ++i) {
     23     visits.push_back(
     24         std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
     25                        content::PAGE_TRANSITION_LINK));
     26   }
     27   return visits;
     28 }
     29 
     30 class ScoredHistoryMatchTest : public testing::Test {
     31  protected:
     32   // Convenience function to create a URLRow with basic data for |url|, |title|,
     33   // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number
     34   // of days ago to which to set the URL's last_visit.
     35   URLRow MakeURLRow(const char* url,
     36                     const char* title,
     37                     int visit_count,
     38                     int days_since_last_visit,
     39                     int typed_count);
     40 
     41   // Convenience function to set the word starts information from a URLRow's
     42   // URL and title.
     43   void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts);
     44 
     45   // Convenience functions for easily creating vectors of search terms.
     46   String16Vector Make1Term(const char* term) const;
     47   String16Vector Make2Terms(const char* term_1, const char* term_2) const;
     48 
     49   // Convenience function for GetTopicalityScore() that builds the
     50   // term match and word break information automatically that are needed
     51   // to call GetTopicalityScore().  It only works for scoring a single term,
     52   // not multiple terms.
     53   float GetTopicalityScoreOfTermAgainstURLAndTitle(const string16& term,
     54                                                    const string16& url,
     55                                                    const string16& title);
     56 };
     57 
     58 URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
     59                                           const char* title,
     60                                           int visit_count,
     61                                           int days_since_last_visit,
     62                                           int typed_count) {
     63   URLRow row(GURL(url), 0);
     64   row.set_title(ASCIIToUTF16(title));
     65   row.set_visit_count(visit_count);
     66   row.set_typed_count(typed_count);
     67   row.set_last_visit(base::Time::NowFromSystemTime() -
     68                      base::TimeDelta::FromDays(days_since_last_visit));
     69   return row;
     70 }
     71 
     72 void ScoredHistoryMatchTest::PopulateWordStarts(
     73     const URLRow& url_row, RowWordStarts* word_starts) {
     74   String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
     75                           &word_starts->url_word_starts_);
     76   String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
     77 }
     78 
     79 
     80 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
     81   String16Vector original_terms;
     82   original_terms.push_back(ASCIIToUTF16(term));
     83   return original_terms;
     84 }
     85 
     86 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
     87                                                   const char* term_2) const {
     88   String16Vector original_terms;
     89   original_terms.push_back(ASCIIToUTF16(term_1));
     90   original_terms.push_back(ASCIIToUTF16(term_2));
     91   return original_terms;
     92 }
     93 
     94 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
     95     const string16& term,
     96     const string16& url,
     97     const string16& title) {
     98   TermMatches url_matches = MatchTermInString(term, url, 0);
     99   TermMatches title_matches = MatchTermInString(term, title, 0);
    100   RowWordStarts word_starts;
    101   String16SetFromString16(url, &word_starts.url_word_starts_);
    102   String16SetFromString16(title, &word_starts.title_word_starts_);
    103   return ScoredHistoryMatch::GetTopicalityScore(
    104       1, url, url_matches, title_matches, word_starts);
    105 }
    106 
    107 TEST_F(ScoredHistoryMatchTest, Scoring) {
    108   // We use NowFromSystemTime() because MakeURLRow uses the same function
    109   // to calculate last visit time when building a row.
    110   base::Time now = base::Time::NowFromSystemTime();
    111 
    112   URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
    113   RowWordStarts word_starts_a;
    114   PopulateWordStarts(row_a, &word_starts_a);
    115   VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
    116   // Mark one visit as typed.
    117   visits_a[0].second = content::PAGE_TRANSITION_TYPED;
    118   ScoredHistoryMatch scored_a(row_a, visits_a, std::string(),
    119                                ASCIIToUTF16("abc"), Make1Term("abc"),
    120                                word_starts_a, now, NULL);
    121 
    122   // Test scores based on visit_count.
    123   URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
    124   RowWordStarts word_starts_b;
    125   PopulateWordStarts(row_b, &word_starts_b);
    126   VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
    127   visits_b[0].second = content::PAGE_TRANSITION_TYPED;
    128   ScoredHistoryMatch scored_b(row_b, visits_b, std::string(),
    129                               ASCIIToUTF16("abc"), Make1Term("abc"),
    130                               word_starts_b, now, NULL);
    131   EXPECT_GT(scored_b.raw_score, scored_a.raw_score);
    132 
    133   // Test scores based on last_visit.
    134   URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
    135   RowWordStarts word_starts_c;
    136   PopulateWordStarts(row_c, &word_starts_c);
    137   VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
    138   visits_c[0].second = content::PAGE_TRANSITION_TYPED;
    139   ScoredHistoryMatch scored_c(row_c, visits_c, std::string(),
    140                               ASCIIToUTF16("abc"), Make1Term("abc"),
    141                               word_starts_c, now, NULL);
    142   EXPECT_GT(scored_c.raw_score, scored_a.raw_score);
    143 
    144   // Test scores based on typed_count.
    145   URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
    146   RowWordStarts word_starts_d;
    147   PopulateWordStarts(row_d, &word_starts_d);
    148   VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
    149   visits_d[0].second = content::PAGE_TRANSITION_TYPED;
    150   visits_d[1].second = content::PAGE_TRANSITION_TYPED;
    151   visits_d[2].second = content::PAGE_TRANSITION_TYPED;
    152   ScoredHistoryMatch scored_d(row_d, visits_d, std::string(),
    153                               ASCIIToUTF16("abc"), Make1Term("abc"),
    154                               word_starts_d, now, NULL);
    155   EXPECT_GT(scored_d.raw_score, scored_a.raw_score);
    156 
    157   // Test scores based on a terms appearing multiple times.
    158   URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi",
    159       "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
    160   RowWordStarts word_starts_e;
    161   PopulateWordStarts(row_e, &word_starts_e);
    162   const VisitInfoVector visits_e = visits_d;
    163   ScoredHistoryMatch scored_e(row_e, visits_e, std::string(),
    164                               ASCIIToUTF16("csi"), Make1Term("csi"),
    165                               word_starts_e, now, NULL);
    166   EXPECT_LT(scored_e.raw_score, 1400);
    167 
    168   // Test that a result with only a mid-term match (i.e., not at a word
    169   // boundary) scores 0.
    170   ScoredHistoryMatch scored_f(row_a, visits_a, std::string(),
    171                               ASCIIToUTF16("cd"), Make1Term("cd"),
    172                               word_starts_a, now, NULL);
    173   EXPECT_EQ(scored_f.raw_score, 0);
    174 }
    175 
    176 TEST_F(ScoredHistoryMatchTest, Inlining) {
    177   // We use NowFromSystemTime() because MakeURLRow uses the same function
    178   // to calculate last visit time when building a row.
    179   base::Time now = base::Time::NowFromSystemTime();
    180   RowWordStarts word_starts;
    181   VisitInfoVector visits;
    182 
    183   {
    184     URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
    185     ScoredHistoryMatch scored_a(row, visits, std::string(),
    186                                 ASCIIToUTF16("g"), Make1Term("g"),
    187                                 word_starts, now, NULL);
    188     EXPECT_TRUE(scored_a.can_inline);
    189     EXPECT_FALSE(scored_a.match_in_scheme);
    190     ScoredHistoryMatch scored_b(row, visits, std::string(),
    191                                 ASCIIToUTF16("w"), Make1Term("w"),
    192                                 word_starts, now, NULL);
    193     EXPECT_TRUE(scored_b.can_inline);
    194     EXPECT_FALSE(scored_b.match_in_scheme);
    195     ScoredHistoryMatch scored_c(row, visits, std::string(),
    196                                 ASCIIToUTF16("h"), Make1Term("h"),
    197                                 word_starts, now, NULL);
    198     EXPECT_TRUE(scored_c.can_inline);
    199     EXPECT_TRUE(scored_c.match_in_scheme);
    200     ScoredHistoryMatch scored_d(row, visits, std::string(),
    201                                 ASCIIToUTF16("o"), Make1Term("o"),
    202                                 word_starts, now, NULL);
    203     EXPECT_FALSE(scored_d.can_inline);
    204     EXPECT_FALSE(scored_d.match_in_scheme);
    205   }
    206 
    207   {
    208     URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
    209     ScoredHistoryMatch scored_a(row, visits, std::string(),
    210                                 ASCIIToUTF16("t"), Make1Term("t"),
    211                                 word_starts, now, NULL);
    212     EXPECT_TRUE(scored_a.can_inline);
    213     EXPECT_FALSE(scored_a.match_in_scheme);
    214     ScoredHistoryMatch scored_b(row, visits, std::string(),
    215                                 ASCIIToUTF16("f"), Make1Term("f"),
    216                                 word_starts, now, NULL);
    217     EXPECT_FALSE(scored_b.can_inline);
    218     EXPECT_FALSE(scored_b.match_in_scheme);
    219     ScoredHistoryMatch scored_c(row, visits, std::string(),
    220                                 ASCIIToUTF16("o"), Make1Term("o"),
    221                                 word_starts, now, NULL);
    222     EXPECT_FALSE(scored_c.can_inline);
    223     EXPECT_FALSE(scored_c.match_in_scheme);
    224   }
    225 
    226   {
    227     URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1));
    228     ScoredHistoryMatch scored_a(row, visits, std::string(),
    229                                 ASCIIToUTF16("t"), Make1Term("t"),
    230                                 word_starts, now, NULL);
    231     EXPECT_TRUE(scored_a.can_inline);
    232     EXPECT_FALSE(scored_a.match_in_scheme);
    233     ScoredHistoryMatch scored_b(row, visits, std::string(),
    234                                 ASCIIToUTF16("h"), Make1Term("h"),
    235                                 word_starts, now, NULL);
    236     EXPECT_TRUE(scored_b.can_inline);
    237     EXPECT_TRUE(scored_b.match_in_scheme);
    238     ScoredHistoryMatch scored_c(row, visits, std::string(),
    239                                 ASCIIToUTF16("w"), Make1Term("w"),
    240                                 word_starts, now, NULL);
    241     EXPECT_TRUE(scored_c.can_inline);
    242     EXPECT_FALSE(scored_c.match_in_scheme);
    243   }
    244 }
    245 
    246 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
    247   const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
    248       ASCIIToUTF16("def"),
    249       ASCIIToUTF16("http://abc.def.com/"),
    250       ASCIIToUTF16("Non-Matching Title"));
    251   const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
    252       ASCIIToUTF16("def"),
    253       ASCIIToUTF16("http://abc.def.com"),
    254       ASCIIToUTF16("Non-Matching Title"));
    255   EXPECT_EQ(hostname_no_slash, hostname);
    256 }
    257 
    258 // This function only tests scoring of single terms that match exactly
    259 // once somewhere in the URL or title.
    260 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
    261   string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?"
    262       "arg1=val1&arg2=val2#hash_component");
    263   string16 title = ASCIIToUTF16("here is a title");
    264   const float hostname_score =
    265       GetTopicalityScoreOfTermAgainstURLAndTitle(
    266           ASCIIToUTF16("abc"), url, title);
    267   const float hostname_mid_word_score =
    268       GetTopicalityScoreOfTermAgainstURLAndTitle(
    269           ASCIIToUTF16("bc"), url, title);
    270   const float domain_name_score =
    271       GetTopicalityScoreOfTermAgainstURLAndTitle(
    272           ASCIIToUTF16("def"), url, title);
    273   const float domain_name_mid_word_score =
    274       GetTopicalityScoreOfTermAgainstURLAndTitle(
    275           ASCIIToUTF16("ef"), url, title);
    276   const float tld_score =
    277       GetTopicalityScoreOfTermAgainstURLAndTitle(
    278           ASCIIToUTF16("com"), url, title);
    279   const float tld_mid_word_score =
    280       GetTopicalityScoreOfTermAgainstURLAndTitle(
    281           ASCIIToUTF16("om"), url, title);
    282   const float path_score =
    283       GetTopicalityScoreOfTermAgainstURLAndTitle(
    284           ASCIIToUTF16("path1"), url, title);
    285   const float path_mid_word_score =
    286       GetTopicalityScoreOfTermAgainstURLAndTitle(
    287           ASCIIToUTF16("ath1"), url, title);
    288   const float arg_score =
    289       GetTopicalityScoreOfTermAgainstURLAndTitle(
    290           ASCIIToUTF16("arg2"), url, title);
    291   const float arg_mid_word_score =
    292       GetTopicalityScoreOfTermAgainstURLAndTitle(
    293           ASCIIToUTF16("rg2"), url, title);
    294   const float protocol_score =
    295       GetTopicalityScoreOfTermAgainstURLAndTitle(
    296           ASCIIToUTF16("htt"), url, title);
    297   const float protocol_mid_word_score =
    298       GetTopicalityScoreOfTermAgainstURLAndTitle(
    299           ASCIIToUTF16("tt"), url, title);
    300   const float title_score =
    301       GetTopicalityScoreOfTermAgainstURLAndTitle(
    302           ASCIIToUTF16("her"), url, title);
    303   const float title_mid_word_score =
    304       GetTopicalityScoreOfTermAgainstURLAndTitle(
    305           ASCIIToUTF16("er"), url, title);
    306   // Verify hostname and domain name > path > arg.
    307   EXPECT_GT(hostname_score, path_score);
    308   EXPECT_GT(domain_name_score, path_score);
    309   EXPECT_GT(path_score, arg_score);
    310   // Verify that domain name > path and domain name > arg for non-word
    311   // boundaries.
    312   EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
    313   EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
    314   EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score);
    315   EXPECT_GT(hostname_mid_word_score, arg_mid_word_score);
    316   // Also verify that the matches at non-word-boundaries all score
    317   // worse than the matches at word boundaries.  These three sets suffice.
    318   EXPECT_GT(arg_score, hostname_mid_word_score);
    319   EXPECT_GT(arg_score, domain_name_mid_word_score);
    320   EXPECT_GT(title_score, title_mid_word_score);
    321   // Check that title matches fit somewhere reasonable compared to the
    322   // various types of URL matches.
    323   EXPECT_GT(title_score, arg_score);
    324   EXPECT_GT(arg_score, title_mid_word_score);
    325   // Finally, verify that protocol matches and top level domain name
    326   // matches (.com, .net, etc.) score worse than some of the mid-word
    327   // matches that actually count.
    328   EXPECT_GT(hostname_mid_word_score, protocol_score);
    329   EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score);
    330   EXPECT_GT(hostname_mid_word_score, tld_score);
    331   EXPECT_GT(hostname_mid_word_score, tld_mid_word_score);
    332 }
    333 
    334 }  // namespace history
    335