1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/visitsegment_database.h" 6 7 #include <math.h> 8 9 #include <algorithm> 10 #include <string> 11 #include <vector> 12 13 #include "base/command_line.h" 14 #include "base/logging.h" 15 #include "base/stl_util.h" 16 #include "base/strings/string_util.h" 17 #include "base/strings/utf_string_conversions.h" 18 #include "chrome/browser/history/page_usage_data.h" 19 #include "chrome/common/chrome_switches.h" 20 #include "sql/statement.h" 21 #include "sql/transaction.h" 22 23 // The following tables are used to store url segment information. 24 // 25 // segments 26 // id Primary key 27 // name A unique string to represent that segment. (URL derived) 28 // url_id ID of the url currently used to represent this segment. 29 // 30 // segment_usage 31 // id Primary key 32 // segment_id Corresponding segment id 33 // time_slot time stamp identifying for what day this entry is about 34 // visit_count Number of visit in the segment 35 // 36 // segment_duration 37 // id Primary key 38 // segment_id Corresponding segment id 39 // time_slot time stamp identifying what day this entry is for 40 // duration Total time during the time_slot the user has been on 41 // the page. This is a serialized TimeDelta value. 42 // segment_duration is only created if chrome::kTrackActiveVisitTime is set. 43 44 namespace history { 45 46 VisitSegmentDatabase::VisitSegmentDatabase() 47 : has_duration_table_(CommandLine::ForCurrentProcess()->HasSwitch( 48 switches::kTrackActiveVisitTime)) { 49 } 50 51 VisitSegmentDatabase::~VisitSegmentDatabase() { 52 } 53 54 bool VisitSegmentDatabase::InitSegmentTables() { 55 // Segments table. 56 if (!GetDB().DoesTableExist("segments")) { 57 if (!GetDB().Execute("CREATE TABLE segments (" 58 "id INTEGER PRIMARY KEY," 59 "name VARCHAR," 60 "url_id INTEGER NON NULL)")) { 61 return false; 62 } 63 64 if (!GetDB().Execute( 65 "CREATE INDEX segments_name ON segments(name)")) { 66 return false; 67 } 68 } 69 70 // This was added later, so we need to try to create it even if the table 71 // already exists. 72 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON " 73 "segments(url_id)")) 74 return false; 75 76 // Segment usage table. 77 if (!GetDB().DoesTableExist("segment_usage")) { 78 if (!GetDB().Execute("CREATE TABLE segment_usage (" 79 "id INTEGER PRIMARY KEY," 80 "segment_id INTEGER NOT NULL," 81 "time_slot INTEGER NOT NULL," 82 "visit_count INTEGER DEFAULT 0 NOT NULL)")) { 83 return false; 84 } 85 if (!GetDB().Execute( 86 "CREATE INDEX segment_usage_time_slot_segment_id ON " 87 "segment_usage(time_slot, segment_id)")) { 88 return false; 89 } 90 } 91 92 // Added in a later version, so we always need to try to creat this index. 93 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id " 94 "ON segment_usage(segment_id)")) 95 return false; 96 97 // TODO(sky): if we decide to keep this feature duration should be added to 98 // segument_usage. 99 if (has_duration_table_ && !GetDB().DoesTableExist("segment_duration")) { 100 if (!GetDB().Execute("CREATE TABLE segment_duration (" 101 "id INTEGER PRIMARY KEY," 102 "segment_id INTEGER NOT NULL," 103 "time_slot INTEGER NOT NULL," 104 "duration INTEGER DEFAULT 0 NOT NULL)")) { 105 return false; 106 } 107 if (!GetDB().Execute( 108 "CREATE INDEX segment_duration_time_slot_segment_id ON " 109 "segment_duration(time_slot, segment_id)")) { 110 return false; 111 } 112 } else if (!has_duration_table_ && 113 !GetDB().Execute("DROP TABLE IF EXISTS segment_duration")) { 114 return false; 115 } 116 117 return true; 118 } 119 120 bool VisitSegmentDatabase::DropSegmentTables() { 121 // Dropping the tables will implicitly delete the indices. 122 return GetDB().Execute("DROP TABLE segments") && 123 GetDB().Execute("DROP TABLE segment_usage") && 124 GetDB().Execute("DROP TABLE IF EXISTS segment_duration"); 125 } 126 127 // Note: the segment name is derived from the URL but is not a URL. It is 128 // a string that can be easily recreated from various URLS. Maybe this should 129 // be an MD5 to limit the length. 130 // 131 // static 132 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) { 133 // TODO(brettw) this should probably use the registry controlled 134 // domains service. 135 GURL::Replacements r; 136 const char kWWWDot[] = "www."; 137 const int kWWWDotLen = arraysize(kWWWDot) - 1; 138 139 std::string host = url.host(); 140 const char* host_c = host.c_str(); 141 // Remove www. to avoid some dups. 142 if (static_cast<int>(host.size()) > kWWWDotLen && 143 LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) { 144 r.SetHost(host.c_str(), 145 url_parse::Component(kWWWDotLen, 146 static_cast<int>(host.size()) - kWWWDotLen)); 147 } 148 // Remove other stuff we don't want. 149 r.ClearUsername(); 150 r.ClearPassword(); 151 r.ClearQuery(); 152 r.ClearRef(); 153 r.ClearPort(); 154 155 return url.ReplaceComponents(r).spec(); 156 } 157 158 // static 159 base::Time VisitSegmentDatabase::SegmentTime(base::Time time) { 160 return time.LocalMidnight(); 161 } 162 163 SegmentID VisitSegmentDatabase::GetSegmentNamed( 164 const std::string& segment_name) { 165 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 166 "SELECT id FROM segments WHERE name = ?")); 167 statement.BindString(0, segment_name); 168 169 if (statement.Step()) 170 return statement.ColumnInt64(0); 171 return 0; 172 } 173 174 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id, 175 URLID url_id) { 176 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 177 "UPDATE segments SET url_id = ? WHERE id = ?")); 178 statement.BindInt64(0, url_id); 179 statement.BindInt64(1, segment_id); 180 181 return statement.Run(); 182 } 183 184 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) { 185 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 186 "SELECT url_id FROM segments WHERE id = ?")); 187 statement.BindInt64(0, segment_id); 188 189 if (statement.Step()) 190 return statement.ColumnInt64(0); 191 return 0; 192 } 193 194 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id, 195 const std::string& segment_name) { 196 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 197 "INSERT INTO segments (name, url_id) VALUES (?,?)")); 198 statement.BindString(0, segment_name); 199 statement.BindInt64(1, url_id); 200 201 if (statement.Run()) 202 return GetDB().GetLastInsertRowId(); 203 return 0; 204 } 205 206 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id, 207 base::Time ts, 208 int amount) { 209 base::Time t = SegmentTime(ts); 210 211 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE, 212 "SELECT id, visit_count FROM segment_usage " 213 "WHERE time_slot = ? AND segment_id = ?")); 214 select.BindInt64(0, t.ToInternalValue()); 215 select.BindInt64(1, segment_id); 216 217 if (!select.is_valid()) 218 return false; 219 220 if (select.Step()) { 221 sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE, 222 "UPDATE segment_usage SET visit_count = ? WHERE id = ?")); 223 update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount)); 224 update.BindInt64(1, select.ColumnInt64(0)); 225 226 return update.Run(); 227 } else { 228 sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE, 229 "INSERT INTO segment_usage " 230 "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)")); 231 insert.BindInt64(0, segment_id); 232 insert.BindInt64(1, t.ToInternalValue()); 233 insert.BindInt64(2, static_cast<int64>(amount)); 234 235 return insert.Run(); 236 } 237 } 238 239 void VisitSegmentDatabase::QuerySegmentUsage( 240 base::Time from_time, 241 int max_result_count, 242 std::vector<PageUsageData*>* result) { 243 // Gather all the segment scores. 244 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 245 "SELECT segment_id, time_slot, visit_count " 246 "FROM segment_usage WHERE time_slot >= ? " 247 "ORDER BY segment_id")); 248 if (!statement.is_valid()) 249 return; 250 251 QuerySegmentsCommon(&statement, from_time, max_result_count, 252 QUERY_VISIT_COUNT, result); 253 } 254 255 bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) { 256 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 257 "DELETE FROM segment_usage WHERE time_slot < ?")); 258 statement.BindInt64(0, SegmentTime(older_than).ToInternalValue()); 259 260 if (!statement.Run()) 261 return false; 262 263 if (!has_duration_table_) 264 return true; 265 266 sql::Statement duration_statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 267 "DELETE FROM segment_duration WHERE time_slot < ?")); 268 duration_statement.BindInt64(0, SegmentTime(older_than).ToInternalValue()); 269 270 return duration_statement.Run(); 271 } 272 273 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) { 274 sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE, 275 "DELETE FROM segment_usage WHERE segment_id IN " 276 "(SELECT id FROM segments WHERE url_id = ?)")); 277 delete_usage.BindInt64(0, url_id); 278 279 if (!delete_usage.Run()) 280 return false; 281 282 if (has_duration_table_) { 283 sql::Statement delete_duration(GetDB().GetCachedStatement(SQL_FROM_HERE, 284 "DELETE FROM segment_duration WHERE segment_id IN " 285 "(SELECT id FROM segments WHERE url_id = ?)")); 286 delete_duration.BindInt64(0, url_id); 287 288 if (!delete_duration.Run()) 289 return false; 290 } 291 292 sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE, 293 "DELETE FROM segments WHERE url_id = ?")); 294 delete_seg.BindInt64(0, url_id); 295 296 return delete_seg.Run(); 297 } 298 299 SegmentDurationID VisitSegmentDatabase::CreateSegmentDuration( 300 SegmentID segment_id, 301 base::Time time, 302 base::TimeDelta delta) { 303 if (!has_duration_table_) 304 return 0; 305 306 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 307 "INSERT INTO segment_duration (segment_id, time_slot, duration) " 308 "VALUES (?,?,?)")); 309 statement.BindInt64(0, segment_id); 310 statement.BindInt64(1, SegmentTime(time).ToInternalValue()); 311 statement.BindInt64(2, delta.ToInternalValue()); 312 return statement.Run() ? GetDB().GetLastInsertRowId() : 0; 313 } 314 315 bool VisitSegmentDatabase::SetSegmentDuration(SegmentDurationID duration_id, 316 base::TimeDelta time_delta) { 317 if (!has_duration_table_) 318 return false; 319 320 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 321 "UPDATE segment_duration SET duration = ? WHERE id = ?")); 322 statement.BindInt64(0, time_delta.ToInternalValue()); 323 statement.BindInt64(1, duration_id); 324 return statement.Run(); 325 } 326 327 bool VisitSegmentDatabase::GetSegmentDuration(SegmentID segment_id, 328 base::Time time, 329 SegmentDurationID* duration_id, 330 base::TimeDelta* time_delta) { 331 if (!has_duration_table_) 332 return false; 333 334 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 335 "SELECT id, duration FROM segment_duration " 336 "WHERE segment_id = ? AND time_slot = ? ")); 337 if (!statement.is_valid()) 338 return false; 339 340 statement.BindInt64(0, segment_id); 341 statement.BindInt64(1, SegmentTime(time).ToInternalValue()); 342 343 if (!statement.Step()) 344 return false; 345 346 *duration_id = statement.ColumnInt64(0); 347 *time_delta = base::TimeDelta::FromInternalValue(statement.ColumnInt64(1)); 348 return true; 349 } 350 351 void VisitSegmentDatabase::QuerySegmentDuration( 352 base::Time from_time, 353 int max_result_count, 354 std::vector<PageUsageData*>* result) { 355 if (!has_duration_table_) 356 return; 357 358 // Gather all the segment scores. 359 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 360 "SELECT segment_id, time_slot, duration " 361 "FROM segment_duration WHERE time_slot >= ? " 362 "ORDER BY segment_id")); 363 if (!statement.is_valid()) 364 return; 365 366 QuerySegmentsCommon(&statement, from_time, max_result_count, QUERY_DURATION, 367 result); 368 } 369 370 bool VisitSegmentDatabase::MigratePresentationIndex() { 371 sql::Transaction transaction(&GetDB()); 372 return transaction.Begin() && 373 GetDB().Execute("DROP TABLE presentation") && 374 GetDB().Execute("CREATE TABLE segments_tmp (" 375 "id INTEGER PRIMARY KEY," 376 "name VARCHAR," 377 "url_id INTEGER NON NULL)") && 378 GetDB().Execute("INSERT INTO segments_tmp SELECT " 379 "id, name, url_id FROM segments") && 380 GetDB().Execute("DROP TABLE segments") && 381 GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") && 382 transaction.Commit(); 383 } 384 385 386 void VisitSegmentDatabase::QuerySegmentsCommon( 387 sql::Statement* statement, 388 base::Time from_time, 389 int max_result_count, 390 QueryType query_type, 391 std::vector<PageUsageData*>* result) { 392 // This function gathers the highest-ranked segments in two queries. 393 // The first gathers scores for all segments. 394 // The second gathers segment data (url, title, etc.) for the highest-ranked 395 // segments. 396 397 base::Time ts = SegmentTime(from_time); 398 statement->BindInt64(0, ts.ToInternalValue()); 399 400 base::Time now = base::Time::Now(); 401 SegmentID last_segment_id = 0; 402 PageUsageData* pud = NULL; 403 float score = 0; 404 base::TimeDelta duration; 405 while (statement->Step()) { 406 SegmentID segment_id = statement->ColumnInt64(0); 407 if (segment_id != last_segment_id) { 408 if (pud) { 409 pud->SetScore(score); 410 pud->SetDuration(duration); 411 result->push_back(pud); 412 } 413 414 pud = new PageUsageData(segment_id); 415 score = 0; 416 last_segment_id = segment_id; 417 duration = base::TimeDelta(); 418 } 419 420 base::Time timeslot = 421 base::Time::FromInternalValue(statement->ColumnInt64(1)); 422 int count; 423 if (query_type == QUERY_VISIT_COUNT) { 424 count = statement->ColumnInt(2); 425 } else { 426 base::TimeDelta current_duration( 427 base::TimeDelta::FromInternalValue(statement->ColumnInt64(2))); 428 duration += current_duration; 429 // Souldn't overflow since we group by day. 430 count = static_cast<int>(current_duration.InSeconds()); 431 } 432 float day_score = 1.0f + log(static_cast<float>(count)); 433 434 // Recent visits count more than historical ones, so we multiply in a boost 435 // related to how long ago this day was. 436 // This boost is a curve that smoothly goes through these values: 437 // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x 438 // at the limit of how far we reach into the past. 439 int days_ago = (now - timeslot).InDays(); 440 float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f))); 441 score += recency_boost * day_score; 442 } 443 444 if (pud) { 445 pud->SetScore(score); 446 pud->SetDuration(duration); 447 result->push_back(pud); 448 } 449 450 // Limit to the top kResultCount results. 451 std::sort(result->begin(), result->end(), PageUsageData::Predicate); 452 if (static_cast<int>(result->size()) > max_result_count) { 453 STLDeleteContainerPointers(result->begin() + max_result_count, 454 result->end()); 455 result->resize(max_result_count); 456 } 457 458 // Now fetch the details about the entries we care about. 459 sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE, 460 "SELECT urls.url, urls.title FROM urls " 461 "JOIN segments ON segments.url_id = urls.id " 462 "WHERE segments.id = ?")); 463 464 if (!statement2.is_valid()) 465 return; 466 467 for (size_t i = 0; i < result->size(); ++i) { 468 PageUsageData* pud = (*result)[i]; 469 statement2.BindInt64(0, pud->GetID()); 470 if (statement2.Step()) { 471 pud->SetURL(GURL(statement2.ColumnString(0))); 472 pud->SetTitle(statement2.ColumnString16(1)); 473 } 474 statement2.Reset(true); 475 } 476 } 477 478 } // namespace history 479