Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/visitsegment_database.h"
      6 
      7 #include <math.h>
      8 
      9 #include <algorithm>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "app/sql/statement.h"
     14 #include "base/logging.h"
     15 #include "base/stl_util-inl.h"
     16 #include "base/string_util.h"
     17 #include "base/utf_string_conversions.h"
     18 #include "chrome/browser/history/page_usage_data.h"
     19 
     20 // The following tables are used to store url segment information.
     21 //
     22 // segments
     23 //   id                 Primary key
     24 //   name               A unique string to represent that segment. (URL derived)
     25 //   url_id             ID of the url currently used to represent this segment.
     26 //   pres_index         index used to store a fixed presentation position.
     27 //
     28 // segment_usage
     29 //   id                 Primary key
     30 //   segment_id         Corresponding segment id
     31 //   time_slot          time stamp identifying for what day this entry is about
     32 //   visit_count        Number of visit in the segment
     33 //
     34 
     35 namespace history {
     36 
     37 VisitSegmentDatabase::VisitSegmentDatabase() {
     38 }
     39 
     40 VisitSegmentDatabase::~VisitSegmentDatabase() {
     41 }
     42 
     43 bool VisitSegmentDatabase::InitSegmentTables() {
     44   // Segments table.
     45   if (!GetDB().DoesTableExist("segments")) {
     46     if (!GetDB().Execute("CREATE TABLE segments ("
     47         "id INTEGER PRIMARY KEY,"
     48         "name VARCHAR,"
     49         "url_id INTEGER NON NULL,"
     50         "pres_index INTEGER DEFAULT -1 NOT NULL)")) {
     51       NOTREACHED();
     52       return false;
     53     }
     54 
     55     if (!GetDB().Execute("CREATE INDEX segments_name ON segments(name)")) {
     56       NOTREACHED();
     57       return false;
     58     }
     59   }
     60 
     61   // This was added later, so we need to try to create it even if the table
     62   // already exists.
     63   GetDB().Execute("CREATE INDEX segments_url_id ON segments(url_id)");
     64 
     65   // Segment usage table.
     66   if (!GetDB().DoesTableExist("segment_usage")) {
     67     if (!GetDB().Execute("CREATE TABLE segment_usage ("
     68         "id INTEGER PRIMARY KEY,"
     69         "segment_id INTEGER NOT NULL,"
     70         "time_slot INTEGER NOT NULL,"
     71         "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
     72       NOTREACHED();
     73       return false;
     74     }
     75     if (!GetDB().Execute(
     76         "CREATE INDEX segment_usage_time_slot_segment_id ON "
     77         "segment_usage(time_slot, segment_id)")) {
     78       NOTREACHED();
     79       return false;
     80     }
     81   }
     82 
     83   // Added in a later version, so we always need to try to creat this index.
     84   GetDB().Execute("CREATE INDEX segments_usage_seg_id "
     85                   "ON segment_usage(segment_id)");
     86 
     87   // Presentation index table.
     88   //
     89   // Important note:
     90   // Right now, this table is only used to store the presentation index.
     91   // If you need to add more columns, keep in mind that rows are currently
     92   // deleted when the presentation index is changed to -1.
     93   // See SetPagePresentationIndex() in this file
     94   if (!GetDB().DoesTableExist("presentation")) {
     95     if (!GetDB().Execute("CREATE TABLE presentation("
     96         "url_id INTEGER PRIMARY KEY,"
     97         "pres_index INTEGER NOT NULL)"))
     98       return false;
     99   }
    100   return true;
    101 }
    102 
    103 bool VisitSegmentDatabase::DropSegmentTables() {
    104   // Dropping the tables will implicitly delete the indices.
    105   return GetDB().Execute("DROP TABLE segments") &&
    106          GetDB().Execute("DROP TABLE segment_usage");
    107 }
    108 
    109 // Note: the segment name is derived from the URL but is not a URL. It is
    110 // a string that can be easily recreated from various URLS. Maybe this should
    111 // be an MD5 to limit the length.
    112 //
    113 // static
    114 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
    115   // TODO(brettw) this should probably use the registry controlled
    116   // domains service.
    117   GURL::Replacements r;
    118   const char kWWWDot[] = "www.";
    119   const int kWWWDotLen = arraysize(kWWWDot) - 1;
    120 
    121   std::string host = url.host();
    122   const char* host_c = host.c_str();
    123   // Remove www. to avoid some dups.
    124   if (static_cast<int>(host.size()) > kWWWDotLen &&
    125       LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
    126     r.SetHost(host.c_str(),
    127               url_parse::Component(kWWWDotLen,
    128                   static_cast<int>(host.size()) - kWWWDotLen));
    129   }
    130   // Remove other stuff we don't want.
    131   r.ClearUsername();
    132   r.ClearPassword();
    133   r.ClearQuery();
    134   r.ClearRef();
    135   r.ClearPort();
    136 
    137   return url.ReplaceComponents(r).spec();
    138 }
    139 
    140 SegmentID VisitSegmentDatabase::GetSegmentNamed(
    141     const std::string& segment_name) {
    142   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    143       "SELECT id FROM segments WHERE name = ?"));
    144   if (!statement)
    145     return 0;
    146 
    147   statement.BindString(0, segment_name);
    148   if (statement.Step())
    149     return statement.ColumnInt64(0);
    150   return 0;
    151 }
    152 
    153 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
    154                                                           URLID url_id) {
    155   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    156       "UPDATE segments SET url_id = ? WHERE id = ?"));
    157   if (!statement)
    158     return false;
    159 
    160   statement.BindInt64(0, url_id);
    161   statement.BindInt64(1, segment_id);
    162   return statement.Run();
    163 }
    164 
    165 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
    166   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    167       "SELECT url_id FROM segments WHERE id = ?"));
    168   if (!statement)
    169     return 0;
    170 
    171   statement.BindInt64(0, segment_id);
    172   if (statement.Step())
    173     return statement.ColumnInt64(0);
    174   return 0;
    175 }
    176 
    177 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
    178                                               const std::string& segment_name) {
    179   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    180       "INSERT INTO segments (name, url_id) VALUES (?,?)"));
    181   if (!statement)
    182     return false;
    183 
    184   statement.BindString(0, segment_name);
    185   statement.BindInt64(1, url_id);
    186   if (statement.Run())
    187     return GetDB().GetLastInsertRowId();
    188   return false;
    189 }
    190 
    191 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
    192                                                      base::Time ts,
    193                                                      int amount) {
    194   base::Time t = ts.LocalMidnight();
    195 
    196   sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
    197       "SELECT id, visit_count FROM segment_usage "
    198       "WHERE time_slot = ? AND segment_id = ?"));
    199   if (!select)
    200     return false;
    201 
    202   select.BindInt64(0, t.ToInternalValue());
    203   select.BindInt64(1, segment_id);
    204   if (select.Step()) {
    205     sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
    206         "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
    207     if (!update)
    208       return false;
    209 
    210     update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
    211     update.BindInt64(1, select.ColumnInt64(0));
    212     return update.Run();
    213 
    214   } else {
    215     sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
    216         "INSERT INTO segment_usage "
    217         "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
    218     if (!insert)
    219       return false;
    220 
    221     insert.BindInt64(0, segment_id);
    222     insert.BindInt64(1, t.ToInternalValue());
    223     insert.BindInt64(2, static_cast<int64>(amount));
    224     return insert.Run();
    225   }
    226 }
    227 
    228 void VisitSegmentDatabase::QuerySegmentUsage(
    229     base::Time from_time,
    230     int max_result_count,
    231     std::vector<PageUsageData*>* results) {
    232   // This function gathers the highest-ranked segments in two queries.
    233   // The first gathers scores for all segments.
    234   // The second gathers segment data (url, title, etc.) for the highest-ranked
    235   // segments.
    236   // TODO(evanm): this disregards the "presentation index", which was what was
    237   // used to lock results into position.  But the rest of our code currently
    238   // does as well.
    239 
    240   // Gather all the segment scores.
    241   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    242       "SELECT segment_id, time_slot, visit_count "
    243       "FROM segment_usage WHERE time_slot >= ? "
    244       "ORDER BY segment_id"));
    245   if (!statement) {
    246     NOTREACHED() << GetDB().GetErrorMessage();
    247     return;
    248   }
    249 
    250   base::Time ts = from_time.LocalMidnight();
    251   statement.BindInt64(0, ts.ToInternalValue());
    252 
    253   base::Time now = base::Time::Now();
    254   SegmentID last_segment_id = 0;
    255   PageUsageData* pud = NULL;
    256   float score = 0;
    257   while (statement.Step()) {
    258     SegmentID segment_id = statement.ColumnInt64(0);
    259     if (segment_id != last_segment_id) {
    260       if (pud) {
    261         pud->SetScore(score);
    262         results->push_back(pud);
    263       }
    264 
    265       pud = new PageUsageData(segment_id);
    266       score = 0;
    267       last_segment_id = segment_id;
    268     }
    269 
    270     base::Time timeslot =
    271         base::Time::FromInternalValue(statement.ColumnInt64(1));
    272     int visit_count = statement.ColumnInt(2);
    273     int days_ago = (now - timeslot).InDays();
    274 
    275     // Score for this day in isolation.
    276     float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
    277     // Recent visits count more than historical ones, so we multiply in a boost
    278     // related to how long ago this day was.
    279     // This boost is a curve that smoothly goes through these values:
    280     // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
    281     // at the limit of how far we reach into the past.
    282     float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
    283     score += recency_boost * day_visits_score;
    284   }
    285 
    286   if (pud) {
    287     pud->SetScore(score);
    288     results->push_back(pud);
    289   }
    290 
    291   // Limit to the top kResultCount results.
    292   sort(results->begin(), results->end(), PageUsageData::Predicate);
    293   if (static_cast<int>(results->size()) > max_result_count) {
    294     STLDeleteContainerPointers(results->begin() + max_result_count,
    295                                results->end());
    296     results->resize(max_result_count);
    297   }
    298 
    299   // Now fetch the details about the entries we care about.
    300   sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
    301       "SELECT urls.url, urls.title FROM urls "
    302       "JOIN segments ON segments.url_id = urls.id "
    303       "WHERE segments.id = ?"));
    304   if (!statement2) {
    305     NOTREACHED() << GetDB().GetErrorMessage();
    306     return;
    307   }
    308   for (size_t i = 0; i < results->size(); ++i) {
    309     PageUsageData* pud = (*results)[i];
    310     statement2.BindInt64(0, pud->GetID());
    311     if (statement2.Step()) {
    312       pud->SetURL(GURL(statement2.ColumnString(0)));
    313       pud->SetTitle(UTF8ToUTF16(statement2.ColumnString(1)));
    314     }
    315     statement2.Reset();
    316   }
    317 }
    318 
    319 void VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
    320   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    321       "DELETE FROM segment_usage WHERE time_slot < ?"));
    322   if (!statement)
    323     return;
    324 
    325   statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
    326   if (!statement.Run())
    327     NOTREACHED();
    328 }
    329 
    330 void VisitSegmentDatabase::SetSegmentPresentationIndex(SegmentID segment_id,
    331                                                        int index) {
    332   sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
    333       "UPDATE segments SET pres_index = ? WHERE id = ?"));
    334   if (!statement)
    335     return;
    336 
    337   statement.BindInt(0, index);
    338   statement.BindInt64(1, segment_id);
    339   if (!statement.Run())
    340     NOTREACHED();
    341   else
    342     DCHECK_EQ(1, GetDB().GetLastChangeCount());
    343 }
    344 
    345 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
    346   sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
    347       "SELECT id FROM segments WHERE url_id = ?"));
    348   if (!select)
    349     return false;
    350 
    351   sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
    352       "DELETE FROM segments WHERE id = ?"));
    353   if (!delete_seg)
    354     return false;
    355 
    356   sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
    357       "DELETE FROM segment_usage WHERE segment_id = ?"));
    358   if (!delete_usage)
    359     return false;
    360 
    361   bool r = true;
    362   select.BindInt64(0, url_id);
    363   // In theory there could not be more than one segment using that URL but we
    364   // loop anyway to cleanup any inconsistency.
    365   while (select.Step()) {
    366     SegmentID segment_id = select.ColumnInt64(0);
    367 
    368     delete_usage.BindInt64(0, segment_id);
    369     if (!delete_usage.Run()) {
    370       NOTREACHED();
    371       r = false;
    372     }
    373 
    374     delete_seg.BindInt64(0, segment_id);
    375     if (!delete_seg.Run()) {
    376       NOTREACHED();
    377       r = false;
    378     }
    379     delete_usage.Reset();
    380     delete_seg.Reset();
    381   }
    382   return r;
    383 }
    384 
    385 }  // namespace history
    386