Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/visitsegment_database.h"
      6 
      7 #include <math.h>
      8 
      9 #include <algorithm>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/logging.h"
     14 #include "base/stl_util.h"
     15 #include "base/strings/string_util.h"
     16 #include "base/strings/utf_string_conversions.h"
     17 #include "chrome/browser/history/page_usage_data.h"
     18 #include "sql/statement.h"
     19 #include "sql/transaction.h"
     20 
     21 // The following tables are used to store url segment information.
     22 //
     23 // segments
     24 //   id                 Primary key
     25 //   name               A unique string to represent that segment. (URL derived)
     26 //   url_id             ID of the url currently used to represent this segment.
     27 //
     28 // segment_usage
     29 //   id                 Primary key
     30 //   segment_id         Corresponding segment id
     31 //   time_slot          time stamp identifying for what day this entry is about
     32 //   visit_count        Number of visit in the segment
     33 //
     34 
     35 namespace history {
     36 
     37 VisitSegmentDatabase::VisitSegmentDatabase() {
     38 }
     39 
     40 VisitSegmentDatabase::~VisitSegmentDatabase() {
     41 }
     42 
     43 bool VisitSegmentDatabase::InitSegmentTables() {
     44   // Segments table.
     45   if (!GetDB().DoesTableExist("segments")) {
     46     if (!GetDB().Execute("CREATE TABLE segments ("
     47         "id INTEGER PRIMARY KEY,"
     48         "name VARCHAR,"
     49         "url_id INTEGER NON NULL)")) {
     50       return false;
     51     }
     52 
     53     if (!GetDB().Execute(
     54         "CREATE INDEX segments_name ON segments(name)")) {
     55       return false;
     56     }
     57   }
     58 
     59   // This was added later, so we need to try to create it even if the table
     60   // already exists.
     61   if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON "
     62                        "segments(url_id)"))
     63     return false;
     64 
     65   // Segment usage table.
     66   if (!GetDB().DoesTableExist("segment_usage")) {
     67     if (!GetDB().Execute("CREATE TABLE segment_usage ("
     68         "id INTEGER PRIMARY KEY,"
     69         "segment_id INTEGER NOT NULL,"
     70         "time_slot INTEGER NOT NULL,"
     71         "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
     72       return false;
     73     }
     74     if (!GetDB().Execute(
     75         "CREATE INDEX segment_usage_time_slot_segment_id ON "
     76         "segment_usage(time_slot, segment_id)")) {
     77       return false;
     78     }
     79   }
     80 
     81   // Added in a later version, so we always need to try to creat this index.
     82   if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id "
     83                        "ON segment_usage(segment_id)"))
     84     return false;
     85 
     86   return true;
     87 }
     88 
     89 bool VisitSegmentDatabase::DropSegmentTables() {
     90   // Dropping the tables will implicitly delete the indices.
     91   return GetDB().Execute("DROP TABLE segments") &&
     92          GetDB().Execute("DROP TABLE segment_usage");
     93 }
     94 
     95 // Note: the segment name is derived from the URL but is not a URL. It is
     96 // a string that can be easily recreated from various URLS. Maybe this should
     97 // be an MD5 to limit the length.
     98 //
     99 // static
    100 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
    101   // TODO(brettw) this should probably use the registry controlled
    102   // domains service.
    103   GURL::Replacements r;
    104   const char kWWWDot[] = "www.";
    105   const int kWWWDotLen = arraysize(kWWWDot) - 1;
    106 
    107   std::string host = url.host();
    108   const char* host_c = host.c_str();
    109   // Remove www. to avoid some dups.
    110   if (static_cast<int>(host.size()) > kWWWDotLen &&
    111       LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
    112     r.SetHost(host.c_str(),
    113               url::Component(kWWWDotLen,
    114                              static_cast<int>(host.size()) - kWWWDotLen));
    115   }
    116   // Remove other stuff we don't want.
    117   r.ClearUsername();
    118   r.ClearPassword();
    119   r.ClearQuery();
    120   r.ClearRef();
    121   r.ClearPort();
    122 
    123   return url.ReplaceComponents(r).spec();
    124 }
    125 
    126 SegmentID VisitSegmentDatabase::GetSegmentNamed(
    127     const std::string& segment_name) {
    128   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    129       "SELECT id FROM segments WHERE name = ?"));
    130   statement.BindString(0, segment_name);
    131 
    132   if (statement.Step())
    133     return statement.ColumnInt64(0);
    134   return 0;
    135 }
    136 
    137 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
    138                                                           URLID url_id) {
    139   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    140       "UPDATE segments SET url_id = ? WHERE id = ?"));
    141   statement.BindInt64(0, url_id);
    142   statement.BindInt64(1, segment_id);
    143 
    144   return statement.Run();
    145 }
    146 
    147 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
    148   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    149       "SELECT url_id FROM segments WHERE id = ?"));
    150   statement.BindInt64(0, segment_id);
    151 
    152   if (statement.Step())
    153     return statement.ColumnInt64(0);
    154   return 0;
    155 }
    156 
    157 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
    158                                               const std::string& segment_name) {
    159   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    160       "INSERT INTO segments (name, url_id) VALUES (?,?)"));
    161   statement.BindString(0, segment_name);
    162   statement.BindInt64(1, url_id);
    163 
    164   if (statement.Run())
    165     return GetDB().GetLastInsertRowId();
    166   return 0;
    167 }
    168 
    169 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
    170                                                      base::Time ts,
    171                                                      int amount) {
    172   base::Time t = ts.LocalMidnight();
    173 
    174   sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
    175       "SELECT id, visit_count FROM segment_usage "
    176       "WHERE time_slot = ? AND segment_id = ?"));
    177   select.BindInt64(0, t.ToInternalValue());
    178   select.BindInt64(1, segment_id);
    179 
    180   if (!select.is_valid())
    181     return false;
    182 
    183   if (select.Step()) {
    184     sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
    185         "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
    186     update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
    187     update.BindInt64(1, select.ColumnInt64(0));
    188 
    189     return update.Run();
    190   } else {
    191     sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
    192         "INSERT INTO segment_usage "
    193         "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
    194     insert.BindInt64(0, segment_id);
    195     insert.BindInt64(1, t.ToInternalValue());
    196     insert.BindInt64(2, static_cast<int64>(amount));
    197 
    198     return insert.Run();
    199   }
    200 }
    201 
    202 void VisitSegmentDatabase::QuerySegmentUsage(
    203     base::Time from_time,
    204     int max_result_count,
    205     std::vector<PageUsageData*>* results) {
    206   // This function gathers the highest-ranked segments in two queries.
    207   // The first gathers scores for all segments.
    208   // The second gathers segment data (url, title, etc.) for the highest-ranked
    209   // segments.
    210 
    211   // Gather all the segment scores.
    212   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    213       "SELECT segment_id, time_slot, visit_count "
    214       "FROM segment_usage WHERE time_slot >= ? "
    215       "ORDER BY segment_id"));
    216   if (!statement.is_valid())
    217     return;
    218 
    219   base::Time ts = from_time.LocalMidnight();
    220   statement.BindInt64(0, ts.ToInternalValue());
    221 
    222   base::Time now = base::Time::Now();
    223   SegmentID last_segment_id = 0;
    224   PageUsageData* pud = NULL;
    225   float score = 0;
    226   while (statement.Step()) {
    227     SegmentID segment_id = statement.ColumnInt64(0);
    228     if (segment_id != last_segment_id) {
    229       if (pud) {
    230         pud->SetScore(score);
    231         results->push_back(pud);
    232       }
    233 
    234       pud = new PageUsageData(segment_id);
    235       score = 0;
    236       last_segment_id = segment_id;
    237     }
    238 
    239     base::Time timeslot =
    240         base::Time::FromInternalValue(statement.ColumnInt64(1));
    241     int visit_count = statement.ColumnInt(2);
    242     int days_ago = (now - timeslot).InDays();
    243 
    244     // Score for this day in isolation.
    245     float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
    246     // Recent visits count more than historical ones, so we multiply in a boost
    247     // related to how long ago this day was.
    248     // This boost is a curve that smoothly goes through these values:
    249     // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
    250     // at the limit of how far we reach into the past.
    251     float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
    252     score += recency_boost * day_visits_score;
    253   }
    254 
    255   if (pud) {
    256     pud->SetScore(score);
    257     results->push_back(pud);
    258   }
    259 
    260   // Limit to the top kResultCount results.
    261   std::sort(results->begin(), results->end(), PageUsageData::Predicate);
    262   if (static_cast<int>(results->size()) > max_result_count) {
    263     STLDeleteContainerPointers(results->begin() + max_result_count,
    264                                results->end());
    265     results->resize(max_result_count);
    266   }
    267 
    268   // Now fetch the details about the entries we care about.
    269   sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
    270       "SELECT urls.url, urls.title FROM urls "
    271       "JOIN segments ON segments.url_id = urls.id "
    272       "WHERE segments.id = ?"));
    273 
    274   if (!statement2.is_valid())
    275     return;
    276 
    277   for (size_t i = 0; i < results->size(); ++i) {
    278     PageUsageData* pud = (*results)[i];
    279     statement2.BindInt64(0, pud->GetID());
    280     if (statement2.Step()) {
    281       pud->SetURL(GURL(statement2.ColumnString(0)));
    282       pud->SetTitle(statement2.ColumnString16(1));
    283     }
    284     statement2.Reset(true);
    285   }
    286 }
    287 
    288 bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
    289   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    290       "DELETE FROM segment_usage WHERE time_slot < ?"));
    291   statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
    292 
    293   return statement.Run();
    294 }
    295 
    296 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
    297   sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
    298       "DELETE FROM segment_usage WHERE segment_id IN "
    299       "(SELECT id FROM segments WHERE url_id = ?)"));
    300   delete_usage.BindInt64(0, url_id);
    301 
    302   if (!delete_usage.Run())
    303     return false;
    304 
    305   sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
    306       "DELETE FROM segments WHERE url_id = ?"));
    307   delete_seg.BindInt64(0, url_id);
    308 
    309   return delete_seg.Run();
    310 }
    311 
    312 bool VisitSegmentDatabase::MigratePresentationIndex() {
    313   sql::Transaction transaction(&GetDB());
    314   return transaction.Begin() &&
    315       GetDB().Execute("DROP TABLE presentation") &&
    316       GetDB().Execute("CREATE TABLE segments_tmp ("
    317                       "id INTEGER PRIMARY KEY,"
    318                       "name VARCHAR,"
    319                       "url_id INTEGER NON NULL)") &&
    320       GetDB().Execute("INSERT INTO segments_tmp SELECT "
    321                       "id, name, url_id FROM segments") &&
    322       GetDB().Execute("DROP TABLE segments") &&
    323       GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") &&
    324       transaction.Commit();
    325 }
    326 
    327 }  // namespace history
    328