Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/visitsegment_database.h"
      6 
      7 #include <math.h>
      8 
      9 #include <algorithm>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/command_line.h"
     14 #include "base/logging.h"
     15 #include "base/stl_util.h"
     16 #include "base/strings/string_util.h"
     17 #include "base/strings/utf_string_conversions.h"
     18 #include "chrome/browser/history/page_usage_data.h"
     19 #include "chrome/common/chrome_switches.h"
     20 #include "sql/statement.h"
     21 #include "sql/transaction.h"
     22 
     23 // The following tables are used to store url segment information.
     24 //
     25 // segments
     26 //   id                 Primary key
     27 //   name               A unique string to represent that segment. (URL derived)
     28 //   url_id             ID of the url currently used to represent this segment.
     29 //
     30 // segment_usage
     31 //   id                 Primary key
     32 //   segment_id         Corresponding segment id
     33 //   time_slot          time stamp identifying for what day this entry is about
     34 //   visit_count        Number of visit in the segment
     35 //
     36 // segment_duration
     37 //   id                 Primary key
     38 //   segment_id         Corresponding segment id
     39 //   time_slot          time stamp identifying what day this entry is for
     40 //   duration           Total time during the time_slot the user has been on
     41 //                      the page. This is a serialized TimeDelta value.
     42 // segment_duration is only created if chrome::kTrackActiveVisitTime is set.
     43 
     44 namespace history {
     45 
     46 VisitSegmentDatabase::VisitSegmentDatabase()
     47     : has_duration_table_(CommandLine::ForCurrentProcess()->HasSwitch(
     48                               switches::kTrackActiveVisitTime)) {
     49 }
     50 
     51 VisitSegmentDatabase::~VisitSegmentDatabase() {
     52 }
     53 
     54 bool VisitSegmentDatabase::InitSegmentTables() {
     55   // Segments table.
     56   if (!GetDB().DoesTableExist("segments")) {
     57     if (!GetDB().Execute("CREATE TABLE segments ("
     58         "id INTEGER PRIMARY KEY,"
     59         "name VARCHAR,"
     60         "url_id INTEGER NON NULL)")) {
     61       return false;
     62     }
     63 
     64     if (!GetDB().Execute(
     65         "CREATE INDEX segments_name ON segments(name)")) {
     66       return false;
     67     }
     68   }
     69 
     70   // This was added later, so we need to try to create it even if the table
     71   // already exists.
     72   if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON "
     73                        "segments(url_id)"))
     74     return false;
     75 
     76   // Segment usage table.
     77   if (!GetDB().DoesTableExist("segment_usage")) {
     78     if (!GetDB().Execute("CREATE TABLE segment_usage ("
     79         "id INTEGER PRIMARY KEY,"
     80         "segment_id INTEGER NOT NULL,"
     81         "time_slot INTEGER NOT NULL,"
     82         "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
     83       return false;
     84     }
     85     if (!GetDB().Execute(
     86         "CREATE INDEX segment_usage_time_slot_segment_id ON "
     87         "segment_usage(time_slot, segment_id)")) {
     88       return false;
     89     }
     90   }
     91 
     92   // Added in a later version, so we always need to try to creat this index.
     93   if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id "
     94                        "ON segment_usage(segment_id)"))
     95     return false;
     96 
     97   // TODO(sky): if we decide to keep this feature duration should be added to
     98   // segument_usage.
     99   if (has_duration_table_ && !GetDB().DoesTableExist("segment_duration")) {
    100     if (!GetDB().Execute("CREATE TABLE segment_duration ("
    101                          "id INTEGER PRIMARY KEY,"
    102                          "segment_id INTEGER NOT NULL,"
    103                          "time_slot INTEGER NOT NULL,"
    104                          "duration INTEGER DEFAULT 0 NOT NULL)")) {
    105       return false;
    106     }
    107     if (!GetDB().Execute(
    108             "CREATE INDEX segment_duration_time_slot_segment_id ON "
    109             "segment_duration(time_slot, segment_id)")) {
    110       return false;
    111     }
    112   } else if (!has_duration_table_ &&
    113              !GetDB().Execute("DROP TABLE IF EXISTS segment_duration")) {
    114     return false;
    115   }
    116 
    117   return true;
    118 }
    119 
    120 bool VisitSegmentDatabase::DropSegmentTables() {
    121   // Dropping the tables will implicitly delete the indices.
    122   return GetDB().Execute("DROP TABLE segments") &&
    123          GetDB().Execute("DROP TABLE segment_usage") &&
    124          GetDB().Execute("DROP TABLE IF EXISTS segment_duration");
    125 }
    126 
    127 // Note: the segment name is derived from the URL but is not a URL. It is
    128 // a string that can be easily recreated from various URLS. Maybe this should
    129 // be an MD5 to limit the length.
    130 //
    131 // static
    132 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
    133   // TODO(brettw) this should probably use the registry controlled
    134   // domains service.
    135   GURL::Replacements r;
    136   const char kWWWDot[] = "www.";
    137   const int kWWWDotLen = arraysize(kWWWDot) - 1;
    138 
    139   std::string host = url.host();
    140   const char* host_c = host.c_str();
    141   // Remove www. to avoid some dups.
    142   if (static_cast<int>(host.size()) > kWWWDotLen &&
    143       LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
    144     r.SetHost(host.c_str(),
    145               url_parse::Component(kWWWDotLen,
    146                   static_cast<int>(host.size()) - kWWWDotLen));
    147   }
    148   // Remove other stuff we don't want.
    149   r.ClearUsername();
    150   r.ClearPassword();
    151   r.ClearQuery();
    152   r.ClearRef();
    153   r.ClearPort();
    154 
    155   return url.ReplaceComponents(r).spec();
    156 }
    157 
    158 // static
    159 base::Time VisitSegmentDatabase::SegmentTime(base::Time time) {
    160   return time.LocalMidnight();
    161 }
    162 
    163 SegmentID VisitSegmentDatabase::GetSegmentNamed(
    164     const std::string& segment_name) {
    165   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    166       "SELECT id FROM segments WHERE name = ?"));
    167   statement.BindString(0, segment_name);
    168 
    169   if (statement.Step())
    170     return statement.ColumnInt64(0);
    171   return 0;
    172 }
    173 
    174 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
    175                                                           URLID url_id) {
    176   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    177       "UPDATE segments SET url_id = ? WHERE id = ?"));
    178   statement.BindInt64(0, url_id);
    179   statement.BindInt64(1, segment_id);
    180 
    181   return statement.Run();
    182 }
    183 
    184 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
    185   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    186       "SELECT url_id FROM segments WHERE id = ?"));
    187   statement.BindInt64(0, segment_id);
    188 
    189   if (statement.Step())
    190     return statement.ColumnInt64(0);
    191   return 0;
    192 }
    193 
    194 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
    195                                               const std::string& segment_name) {
    196   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    197       "INSERT INTO segments (name, url_id) VALUES (?,?)"));
    198   statement.BindString(0, segment_name);
    199   statement.BindInt64(1, url_id);
    200 
    201   if (statement.Run())
    202     return GetDB().GetLastInsertRowId();
    203   return 0;
    204 }
    205 
    206 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
    207                                                      base::Time ts,
    208                                                      int amount) {
    209   base::Time t = SegmentTime(ts);
    210 
    211   sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
    212       "SELECT id, visit_count FROM segment_usage "
    213       "WHERE time_slot = ? AND segment_id = ?"));
    214   select.BindInt64(0, t.ToInternalValue());
    215   select.BindInt64(1, segment_id);
    216 
    217   if (!select.is_valid())
    218     return false;
    219 
    220   if (select.Step()) {
    221     sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
    222         "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
    223     update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
    224     update.BindInt64(1, select.ColumnInt64(0));
    225 
    226     return update.Run();
    227   } else {
    228     sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
    229         "INSERT INTO segment_usage "
    230         "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
    231     insert.BindInt64(0, segment_id);
    232     insert.BindInt64(1, t.ToInternalValue());
    233     insert.BindInt64(2, static_cast<int64>(amount));
    234 
    235     return insert.Run();
    236   }
    237 }
    238 
    239 void VisitSegmentDatabase::QuerySegmentUsage(
    240     base::Time from_time,
    241     int max_result_count,
    242     std::vector<PageUsageData*>* result) {
    243   // Gather all the segment scores.
    244   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    245       "SELECT segment_id, time_slot, visit_count "
    246       "FROM segment_usage WHERE time_slot >= ? "
    247       "ORDER BY segment_id"));
    248   if (!statement.is_valid())
    249     return;
    250 
    251   QuerySegmentsCommon(&statement, from_time, max_result_count,
    252                       QUERY_VISIT_COUNT, result);
    253 }
    254 
    255 bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
    256   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    257       "DELETE FROM segment_usage WHERE time_slot < ?"));
    258   statement.BindInt64(0, SegmentTime(older_than).ToInternalValue());
    259 
    260   if (!statement.Run())
    261     return false;
    262 
    263   if (!has_duration_table_)
    264     return true;
    265 
    266   sql::Statement duration_statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    267       "DELETE FROM segment_duration WHERE time_slot < ?"));
    268   duration_statement.BindInt64(0, SegmentTime(older_than).ToInternalValue());
    269 
    270   return duration_statement.Run();
    271 }
    272 
    273 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
    274   sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
    275       "DELETE FROM segment_usage WHERE segment_id IN "
    276       "(SELECT id FROM segments WHERE url_id = ?)"));
    277   delete_usage.BindInt64(0, url_id);
    278 
    279   if (!delete_usage.Run())
    280     return false;
    281 
    282   if (has_duration_table_) {
    283     sql::Statement delete_duration(GetDB().GetCachedStatement(SQL_FROM_HERE,
    284         "DELETE FROM segment_duration WHERE segment_id IN "
    285         "(SELECT id FROM segments WHERE url_id = ?)"));
    286     delete_duration.BindInt64(0, url_id);
    287 
    288     if (!delete_duration.Run())
    289       return false;
    290   }
    291 
    292   sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
    293       "DELETE FROM segments WHERE url_id = ?"));
    294   delete_seg.BindInt64(0, url_id);
    295 
    296   return delete_seg.Run();
    297 }
    298 
    299 SegmentDurationID VisitSegmentDatabase::CreateSegmentDuration(
    300     SegmentID segment_id,
    301     base::Time time,
    302     base::TimeDelta delta) {
    303   if (!has_duration_table_)
    304     return 0;
    305 
    306   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    307       "INSERT INTO segment_duration (segment_id, time_slot, duration) "
    308       "VALUES (?,?,?)"));
    309   statement.BindInt64(0, segment_id);
    310   statement.BindInt64(1, SegmentTime(time).ToInternalValue());
    311   statement.BindInt64(2, delta.ToInternalValue());
    312   return statement.Run() ? GetDB().GetLastInsertRowId() : 0;
    313 }
    314 
    315 bool VisitSegmentDatabase::SetSegmentDuration(SegmentDurationID duration_id,
    316                                               base::TimeDelta time_delta) {
    317   if (!has_duration_table_)
    318     return false;
    319 
    320   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    321       "UPDATE segment_duration SET duration = ? WHERE id = ?"));
    322   statement.BindInt64(0, time_delta.ToInternalValue());
    323   statement.BindInt64(1, duration_id);
    324   return statement.Run();
    325 }
    326 
    327 bool VisitSegmentDatabase::GetSegmentDuration(SegmentID segment_id,
    328                                               base::Time time,
    329                                               SegmentDurationID* duration_id,
    330                                               base::TimeDelta* time_delta) {
    331   if (!has_duration_table_)
    332     return false;
    333 
    334   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    335       "SELECT id, duration FROM segment_duration "
    336       "WHERE segment_id = ? AND time_slot = ? "));
    337   if (!statement.is_valid())
    338     return false;
    339 
    340   statement.BindInt64(0, segment_id);
    341   statement.BindInt64(1, SegmentTime(time).ToInternalValue());
    342 
    343   if (!statement.Step())
    344     return false;
    345 
    346   *duration_id = statement.ColumnInt64(0);
    347   *time_delta = base::TimeDelta::FromInternalValue(statement.ColumnInt64(1));
    348   return true;
    349 }
    350 
    351 void VisitSegmentDatabase::QuerySegmentDuration(
    352     base::Time from_time,
    353     int max_result_count,
    354     std::vector<PageUsageData*>* result) {
    355   if (!has_duration_table_)
    356     return;
    357 
    358   // Gather all the segment scores.
    359   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    360       "SELECT segment_id, time_slot, duration "
    361       "FROM segment_duration WHERE time_slot >= ? "
    362       "ORDER BY segment_id"));
    363   if (!statement.is_valid())
    364     return;
    365 
    366   QuerySegmentsCommon(&statement, from_time, max_result_count, QUERY_DURATION,
    367                       result);
    368 }
    369 
    370 bool VisitSegmentDatabase::MigratePresentationIndex() {
    371   sql::Transaction transaction(&GetDB());
    372   return transaction.Begin() &&
    373       GetDB().Execute("DROP TABLE presentation") &&
    374       GetDB().Execute("CREATE TABLE segments_tmp ("
    375                       "id INTEGER PRIMARY KEY,"
    376                       "name VARCHAR,"
    377                       "url_id INTEGER NON NULL)") &&
    378       GetDB().Execute("INSERT INTO segments_tmp SELECT "
    379                       "id, name, url_id FROM segments") &&
    380       GetDB().Execute("DROP TABLE segments") &&
    381       GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") &&
    382       transaction.Commit();
    383 }
    384 
    385 
    386 void VisitSegmentDatabase::QuerySegmentsCommon(
    387     sql::Statement* statement,
    388     base::Time from_time,
    389     int max_result_count,
    390     QueryType query_type,
    391     std::vector<PageUsageData*>* result) {
    392   // This function gathers the highest-ranked segments in two queries.
    393   // The first gathers scores for all segments.
    394   // The second gathers segment data (url, title, etc.) for the highest-ranked
    395   // segments.
    396 
    397   base::Time ts = SegmentTime(from_time);
    398   statement->BindInt64(0, ts.ToInternalValue());
    399 
    400   base::Time now = base::Time::Now();
    401   SegmentID last_segment_id = 0;
    402   PageUsageData* pud = NULL;
    403   float score = 0;
    404   base::TimeDelta duration;
    405   while (statement->Step()) {
    406     SegmentID segment_id = statement->ColumnInt64(0);
    407     if (segment_id != last_segment_id) {
    408       if (pud) {
    409         pud->SetScore(score);
    410         pud->SetDuration(duration);
    411         result->push_back(pud);
    412       }
    413 
    414       pud = new PageUsageData(segment_id);
    415       score = 0;
    416       last_segment_id = segment_id;
    417       duration = base::TimeDelta();
    418     }
    419 
    420     base::Time timeslot =
    421         base::Time::FromInternalValue(statement->ColumnInt64(1));
    422     int count;
    423     if (query_type == QUERY_VISIT_COUNT) {
    424       count = statement->ColumnInt(2);
    425     } else {
    426       base::TimeDelta current_duration(
    427           base::TimeDelta::FromInternalValue(statement->ColumnInt64(2)));
    428       duration += current_duration;
    429       // Souldn't overflow since we group by day.
    430       count = static_cast<int>(current_duration.InSeconds());
    431     }
    432     float day_score = 1.0f + log(static_cast<float>(count));
    433 
    434     // Recent visits count more than historical ones, so we multiply in a boost
    435     // related to how long ago this day was.
    436     // This boost is a curve that smoothly goes through these values:
    437     // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
    438     // at the limit of how far we reach into the past.
    439     int days_ago = (now - timeslot).InDays();
    440     float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
    441     score += recency_boost * day_score;
    442   }
    443 
    444   if (pud) {
    445     pud->SetScore(score);
    446     pud->SetDuration(duration);
    447     result->push_back(pud);
    448   }
    449 
    450   // Limit to the top kResultCount results.
    451   std::sort(result->begin(), result->end(), PageUsageData::Predicate);
    452   if (static_cast<int>(result->size()) > max_result_count) {
    453     STLDeleteContainerPointers(result->begin() + max_result_count,
    454                                result->end());
    455     result->resize(max_result_count);
    456   }
    457 
    458   // Now fetch the details about the entries we care about.
    459   sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
    460       "SELECT urls.url, urls.title FROM urls "
    461       "JOIN segments ON segments.url_id = urls.id "
    462       "WHERE segments.id = ?"));
    463 
    464   if (!statement2.is_valid())
    465     return;
    466 
    467   for (size_t i = 0; i < result->size(); ++i) {
    468     PageUsageData* pud = (*result)[i];
    469     statement2.BindInt64(0, pud->GetID());
    470     if (statement2.Step()) {
    471       pud->SetURL(GURL(statement2.ColumnString(0)));
    472       pud->SetTitle(statement2.ColumnString16(1));
    473     }
    474     statement2.Reset(true);
    475   }
    476 }
    477 
    478 }  // namespace history
    479