1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/visitsegment_database.h" 6 7 #include <math.h> 8 9 #include <algorithm> 10 #include <string> 11 #include <vector> 12 13 #include "base/logging.h" 14 #include "base/stl_util.h" 15 #include "base/strings/string_util.h" 16 #include "base/strings/utf_string_conversions.h" 17 #include "chrome/browser/history/page_usage_data.h" 18 #include "sql/statement.h" 19 #include "sql/transaction.h" 20 21 // The following tables are used to store url segment information. 22 // 23 // segments 24 // id Primary key 25 // name A unique string to represent that segment. (URL derived) 26 // url_id ID of the url currently used to represent this segment. 27 // 28 // segment_usage 29 // id Primary key 30 // segment_id Corresponding segment id 31 // time_slot time stamp identifying for what day this entry is about 32 // visit_count Number of visit in the segment 33 // 34 35 namespace history { 36 37 VisitSegmentDatabase::VisitSegmentDatabase() { 38 } 39 40 VisitSegmentDatabase::~VisitSegmentDatabase() { 41 } 42 43 bool VisitSegmentDatabase::InitSegmentTables() { 44 // Segments table. 45 if (!GetDB().DoesTableExist("segments")) { 46 if (!GetDB().Execute("CREATE TABLE segments (" 47 "id INTEGER PRIMARY KEY," 48 "name VARCHAR," 49 "url_id INTEGER NON NULL)")) { 50 return false; 51 } 52 53 if (!GetDB().Execute( 54 "CREATE INDEX segments_name ON segments(name)")) { 55 return false; 56 } 57 } 58 59 // This was added later, so we need to try to create it even if the table 60 // already exists. 61 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON " 62 "segments(url_id)")) 63 return false; 64 65 // Segment usage table. 66 if (!GetDB().DoesTableExist("segment_usage")) { 67 if (!GetDB().Execute("CREATE TABLE segment_usage (" 68 "id INTEGER PRIMARY KEY," 69 "segment_id INTEGER NOT NULL," 70 "time_slot INTEGER NOT NULL," 71 "visit_count INTEGER DEFAULT 0 NOT NULL)")) { 72 return false; 73 } 74 if (!GetDB().Execute( 75 "CREATE INDEX segment_usage_time_slot_segment_id ON " 76 "segment_usage(time_slot, segment_id)")) { 77 return false; 78 } 79 } 80 81 // Added in a later version, so we always need to try to creat this index. 82 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id " 83 "ON segment_usage(segment_id)")) 84 return false; 85 86 return true; 87 } 88 89 bool VisitSegmentDatabase::DropSegmentTables() { 90 // Dropping the tables will implicitly delete the indices. 91 return GetDB().Execute("DROP TABLE segments") && 92 GetDB().Execute("DROP TABLE segment_usage"); 93 } 94 95 // Note: the segment name is derived from the URL but is not a URL. It is 96 // a string that can be easily recreated from various URLS. Maybe this should 97 // be an MD5 to limit the length. 98 // 99 // static 100 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) { 101 // TODO(brettw) this should probably use the registry controlled 102 // domains service. 103 GURL::Replacements r; 104 const char kWWWDot[] = "www."; 105 const int kWWWDotLen = arraysize(kWWWDot) - 1; 106 107 std::string host = url.host(); 108 const char* host_c = host.c_str(); 109 // Remove www. to avoid some dups. 110 if (static_cast<int>(host.size()) > kWWWDotLen && 111 LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) { 112 r.SetHost(host.c_str(), 113 url::Component(kWWWDotLen, 114 static_cast<int>(host.size()) - kWWWDotLen)); 115 } 116 // Remove other stuff we don't want. 117 r.ClearUsername(); 118 r.ClearPassword(); 119 r.ClearQuery(); 120 r.ClearRef(); 121 r.ClearPort(); 122 123 return url.ReplaceComponents(r).spec(); 124 } 125 126 SegmentID VisitSegmentDatabase::GetSegmentNamed( 127 const std::string& segment_name) { 128 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 129 "SELECT id FROM segments WHERE name = ?")); 130 statement.BindString(0, segment_name); 131 132 if (statement.Step()) 133 return statement.ColumnInt64(0); 134 return 0; 135 } 136 137 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id, 138 URLID url_id) { 139 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 140 "UPDATE segments SET url_id = ? WHERE id = ?")); 141 statement.BindInt64(0, url_id); 142 statement.BindInt64(1, segment_id); 143 144 return statement.Run(); 145 } 146 147 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) { 148 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 149 "SELECT url_id FROM segments WHERE id = ?")); 150 statement.BindInt64(0, segment_id); 151 152 if (statement.Step()) 153 return statement.ColumnInt64(0); 154 return 0; 155 } 156 157 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id, 158 const std::string& segment_name) { 159 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 160 "INSERT INTO segments (name, url_id) VALUES (?,?)")); 161 statement.BindString(0, segment_name); 162 statement.BindInt64(1, url_id); 163 164 if (statement.Run()) 165 return GetDB().GetLastInsertRowId(); 166 return 0; 167 } 168 169 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id, 170 base::Time ts, 171 int amount) { 172 base::Time t = ts.LocalMidnight(); 173 174 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE, 175 "SELECT id, visit_count FROM segment_usage " 176 "WHERE time_slot = ? AND segment_id = ?")); 177 select.BindInt64(0, t.ToInternalValue()); 178 select.BindInt64(1, segment_id); 179 180 if (!select.is_valid()) 181 return false; 182 183 if (select.Step()) { 184 sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE, 185 "UPDATE segment_usage SET visit_count = ? WHERE id = ?")); 186 update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount)); 187 update.BindInt64(1, select.ColumnInt64(0)); 188 189 return update.Run(); 190 } else { 191 sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE, 192 "INSERT INTO segment_usage " 193 "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)")); 194 insert.BindInt64(0, segment_id); 195 insert.BindInt64(1, t.ToInternalValue()); 196 insert.BindInt64(2, static_cast<int64>(amount)); 197 198 return insert.Run(); 199 } 200 } 201 202 void VisitSegmentDatabase::QuerySegmentUsage( 203 base::Time from_time, 204 int max_result_count, 205 std::vector<PageUsageData*>* results) { 206 // This function gathers the highest-ranked segments in two queries. 207 // The first gathers scores for all segments. 208 // The second gathers segment data (url, title, etc.) for the highest-ranked 209 // segments. 210 211 // Gather all the segment scores. 212 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 213 "SELECT segment_id, time_slot, visit_count " 214 "FROM segment_usage WHERE time_slot >= ? " 215 "ORDER BY segment_id")); 216 if (!statement.is_valid()) 217 return; 218 219 base::Time ts = from_time.LocalMidnight(); 220 statement.BindInt64(0, ts.ToInternalValue()); 221 222 base::Time now = base::Time::Now(); 223 SegmentID last_segment_id = 0; 224 PageUsageData* pud = NULL; 225 float score = 0; 226 while (statement.Step()) { 227 SegmentID segment_id = statement.ColumnInt64(0); 228 if (segment_id != last_segment_id) { 229 if (pud) { 230 pud->SetScore(score); 231 results->push_back(pud); 232 } 233 234 pud = new PageUsageData(segment_id); 235 score = 0; 236 last_segment_id = segment_id; 237 } 238 239 base::Time timeslot = 240 base::Time::FromInternalValue(statement.ColumnInt64(1)); 241 int visit_count = statement.ColumnInt(2); 242 int days_ago = (now - timeslot).InDays(); 243 244 // Score for this day in isolation. 245 float day_visits_score = 1.0f + log(static_cast<float>(visit_count)); 246 // Recent visits count more than historical ones, so we multiply in a boost 247 // related to how long ago this day was. 248 // This boost is a curve that smoothly goes through these values: 249 // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x 250 // at the limit of how far we reach into the past. 251 float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f))); 252 score += recency_boost * day_visits_score; 253 } 254 255 if (pud) { 256 pud->SetScore(score); 257 results->push_back(pud); 258 } 259 260 // Limit to the top kResultCount results. 261 std::sort(results->begin(), results->end(), PageUsageData::Predicate); 262 if (static_cast<int>(results->size()) > max_result_count) { 263 STLDeleteContainerPointers(results->begin() + max_result_count, 264 results->end()); 265 results->resize(max_result_count); 266 } 267 268 // Now fetch the details about the entries we care about. 269 sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE, 270 "SELECT urls.url, urls.title FROM urls " 271 "JOIN segments ON segments.url_id = urls.id " 272 "WHERE segments.id = ?")); 273 274 if (!statement2.is_valid()) 275 return; 276 277 for (size_t i = 0; i < results->size(); ++i) { 278 PageUsageData* pud = (*results)[i]; 279 statement2.BindInt64(0, pud->GetID()); 280 if (statement2.Step()) { 281 pud->SetURL(GURL(statement2.ColumnString(0))); 282 pud->SetTitle(statement2.ColumnString16(1)); 283 } 284 statement2.Reset(true); 285 } 286 } 287 288 bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) { 289 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 290 "DELETE FROM segment_usage WHERE time_slot < ?")); 291 statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue()); 292 293 return statement.Run(); 294 } 295 296 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) { 297 sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE, 298 "DELETE FROM segment_usage WHERE segment_id IN " 299 "(SELECT id FROM segments WHERE url_id = ?)")); 300 delete_usage.BindInt64(0, url_id); 301 302 if (!delete_usage.Run()) 303 return false; 304 305 sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE, 306 "DELETE FROM segments WHERE url_id = ?")); 307 delete_seg.BindInt64(0, url_id); 308 309 return delete_seg.Run(); 310 } 311 312 bool VisitSegmentDatabase::MigratePresentationIndex() { 313 sql::Transaction transaction(&GetDB()); 314 return transaction.Begin() && 315 GetDB().Execute("DROP TABLE presentation") && 316 GetDB().Execute("CREATE TABLE segments_tmp (" 317 "id INTEGER PRIMARY KEY," 318 "name VARCHAR," 319 "url_id INTEGER NON NULL)") && 320 GetDB().Execute("INSERT INTO segments_tmp SELECT " 321 "id, name, url_id FROM segments") && 322 GetDB().Execute("DROP TABLE segments") && 323 GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") && 324 transaction.Commit(); 325 } 326 327 } // namespace history 328