1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/visitsegment_database.h" 6 7 #include <math.h> 8 9 #include <algorithm> 10 #include <string> 11 #include <vector> 12 13 #include "app/sql/statement.h" 14 #include "base/logging.h" 15 #include "base/stl_util-inl.h" 16 #include "base/string_util.h" 17 #include "base/utf_string_conversions.h" 18 #include "chrome/browser/history/page_usage_data.h" 19 20 // The following tables are used to store url segment information. 21 // 22 // segments 23 // id Primary key 24 // name A unique string to represent that segment. (URL derived) 25 // url_id ID of the url currently used to represent this segment. 26 // pres_index index used to store a fixed presentation position. 27 // 28 // segment_usage 29 // id Primary key 30 // segment_id Corresponding segment id 31 // time_slot time stamp identifying for what day this entry is about 32 // visit_count Number of visit in the segment 33 // 34 35 namespace history { 36 37 VisitSegmentDatabase::VisitSegmentDatabase() { 38 } 39 40 VisitSegmentDatabase::~VisitSegmentDatabase() { 41 } 42 43 bool VisitSegmentDatabase::InitSegmentTables() { 44 // Segments table. 45 if (!GetDB().DoesTableExist("segments")) { 46 if (!GetDB().Execute("CREATE TABLE segments (" 47 "id INTEGER PRIMARY KEY," 48 "name VARCHAR," 49 "url_id INTEGER NON NULL," 50 "pres_index INTEGER DEFAULT -1 NOT NULL)")) { 51 NOTREACHED(); 52 return false; 53 } 54 55 if (!GetDB().Execute("CREATE INDEX segments_name ON segments(name)")) { 56 NOTREACHED(); 57 return false; 58 } 59 } 60 61 // This was added later, so we need to try to create it even if the table 62 // already exists. 63 GetDB().Execute("CREATE INDEX segments_url_id ON segments(url_id)"); 64 65 // Segment usage table. 66 if (!GetDB().DoesTableExist("segment_usage")) { 67 if (!GetDB().Execute("CREATE TABLE segment_usage (" 68 "id INTEGER PRIMARY KEY," 69 "segment_id INTEGER NOT NULL," 70 "time_slot INTEGER NOT NULL," 71 "visit_count INTEGER DEFAULT 0 NOT NULL)")) { 72 NOTREACHED(); 73 return false; 74 } 75 if (!GetDB().Execute( 76 "CREATE INDEX segment_usage_time_slot_segment_id ON " 77 "segment_usage(time_slot, segment_id)")) { 78 NOTREACHED(); 79 return false; 80 } 81 } 82 83 // Added in a later version, so we always need to try to creat this index. 84 GetDB().Execute("CREATE INDEX segments_usage_seg_id " 85 "ON segment_usage(segment_id)"); 86 87 // Presentation index table. 88 // 89 // Important note: 90 // Right now, this table is only used to store the presentation index. 91 // If you need to add more columns, keep in mind that rows are currently 92 // deleted when the presentation index is changed to -1. 93 // See SetPagePresentationIndex() in this file 94 if (!GetDB().DoesTableExist("presentation")) { 95 if (!GetDB().Execute("CREATE TABLE presentation(" 96 "url_id INTEGER PRIMARY KEY," 97 "pres_index INTEGER NOT NULL)")) 98 return false; 99 } 100 return true; 101 } 102 103 bool VisitSegmentDatabase::DropSegmentTables() { 104 // Dropping the tables will implicitly delete the indices. 105 return GetDB().Execute("DROP TABLE segments") && 106 GetDB().Execute("DROP TABLE segment_usage"); 107 } 108 109 // Note: the segment name is derived from the URL but is not a URL. It is 110 // a string that can be easily recreated from various URLS. Maybe this should 111 // be an MD5 to limit the length. 112 // 113 // static 114 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) { 115 // TODO(brettw) this should probably use the registry controlled 116 // domains service. 117 GURL::Replacements r; 118 const char kWWWDot[] = "www."; 119 const int kWWWDotLen = arraysize(kWWWDot) - 1; 120 121 std::string host = url.host(); 122 const char* host_c = host.c_str(); 123 // Remove www. to avoid some dups. 124 if (static_cast<int>(host.size()) > kWWWDotLen && 125 LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) { 126 r.SetHost(host.c_str(), 127 url_parse::Component(kWWWDotLen, 128 static_cast<int>(host.size()) - kWWWDotLen)); 129 } 130 // Remove other stuff we don't want. 131 r.ClearUsername(); 132 r.ClearPassword(); 133 r.ClearQuery(); 134 r.ClearRef(); 135 r.ClearPort(); 136 137 return url.ReplaceComponents(r).spec(); 138 } 139 140 SegmentID VisitSegmentDatabase::GetSegmentNamed( 141 const std::string& segment_name) { 142 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 143 "SELECT id FROM segments WHERE name = ?")); 144 if (!statement) 145 return 0; 146 147 statement.BindString(0, segment_name); 148 if (statement.Step()) 149 return statement.ColumnInt64(0); 150 return 0; 151 } 152 153 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id, 154 URLID url_id) { 155 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 156 "UPDATE segments SET url_id = ? WHERE id = ?")); 157 if (!statement) 158 return false; 159 160 statement.BindInt64(0, url_id); 161 statement.BindInt64(1, segment_id); 162 return statement.Run(); 163 } 164 165 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) { 166 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 167 "SELECT url_id FROM segments WHERE id = ?")); 168 if (!statement) 169 return 0; 170 171 statement.BindInt64(0, segment_id); 172 if (statement.Step()) 173 return statement.ColumnInt64(0); 174 return 0; 175 } 176 177 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id, 178 const std::string& segment_name) { 179 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 180 "INSERT INTO segments (name, url_id) VALUES (?,?)")); 181 if (!statement) 182 return false; 183 184 statement.BindString(0, segment_name); 185 statement.BindInt64(1, url_id); 186 if (statement.Run()) 187 return GetDB().GetLastInsertRowId(); 188 return false; 189 } 190 191 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id, 192 base::Time ts, 193 int amount) { 194 base::Time t = ts.LocalMidnight(); 195 196 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE, 197 "SELECT id, visit_count FROM segment_usage " 198 "WHERE time_slot = ? AND segment_id = ?")); 199 if (!select) 200 return false; 201 202 select.BindInt64(0, t.ToInternalValue()); 203 select.BindInt64(1, segment_id); 204 if (select.Step()) { 205 sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE, 206 "UPDATE segment_usage SET visit_count = ? WHERE id = ?")); 207 if (!update) 208 return false; 209 210 update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount)); 211 update.BindInt64(1, select.ColumnInt64(0)); 212 return update.Run(); 213 214 } else { 215 sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE, 216 "INSERT INTO segment_usage " 217 "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)")); 218 if (!insert) 219 return false; 220 221 insert.BindInt64(0, segment_id); 222 insert.BindInt64(1, t.ToInternalValue()); 223 insert.BindInt64(2, static_cast<int64>(amount)); 224 return insert.Run(); 225 } 226 } 227 228 void VisitSegmentDatabase::QuerySegmentUsage( 229 base::Time from_time, 230 int max_result_count, 231 std::vector<PageUsageData*>* results) { 232 // This function gathers the highest-ranked segments in two queries. 233 // The first gathers scores for all segments. 234 // The second gathers segment data (url, title, etc.) for the highest-ranked 235 // segments. 236 // TODO(evanm): this disregards the "presentation index", which was what was 237 // used to lock results into position. But the rest of our code currently 238 // does as well. 239 240 // Gather all the segment scores. 241 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 242 "SELECT segment_id, time_slot, visit_count " 243 "FROM segment_usage WHERE time_slot >= ? " 244 "ORDER BY segment_id")); 245 if (!statement) { 246 NOTREACHED() << GetDB().GetErrorMessage(); 247 return; 248 } 249 250 base::Time ts = from_time.LocalMidnight(); 251 statement.BindInt64(0, ts.ToInternalValue()); 252 253 base::Time now = base::Time::Now(); 254 SegmentID last_segment_id = 0; 255 PageUsageData* pud = NULL; 256 float score = 0; 257 while (statement.Step()) { 258 SegmentID segment_id = statement.ColumnInt64(0); 259 if (segment_id != last_segment_id) { 260 if (pud) { 261 pud->SetScore(score); 262 results->push_back(pud); 263 } 264 265 pud = new PageUsageData(segment_id); 266 score = 0; 267 last_segment_id = segment_id; 268 } 269 270 base::Time timeslot = 271 base::Time::FromInternalValue(statement.ColumnInt64(1)); 272 int visit_count = statement.ColumnInt(2); 273 int days_ago = (now - timeslot).InDays(); 274 275 // Score for this day in isolation. 276 float day_visits_score = 1.0f + log(static_cast<float>(visit_count)); 277 // Recent visits count more than historical ones, so we multiply in a boost 278 // related to how long ago this day was. 279 // This boost is a curve that smoothly goes through these values: 280 // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x 281 // at the limit of how far we reach into the past. 282 float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f))); 283 score += recency_boost * day_visits_score; 284 } 285 286 if (pud) { 287 pud->SetScore(score); 288 results->push_back(pud); 289 } 290 291 // Limit to the top kResultCount results. 292 sort(results->begin(), results->end(), PageUsageData::Predicate); 293 if (static_cast<int>(results->size()) > max_result_count) { 294 STLDeleteContainerPointers(results->begin() + max_result_count, 295 results->end()); 296 results->resize(max_result_count); 297 } 298 299 // Now fetch the details about the entries we care about. 300 sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE, 301 "SELECT urls.url, urls.title FROM urls " 302 "JOIN segments ON segments.url_id = urls.id " 303 "WHERE segments.id = ?")); 304 if (!statement2) { 305 NOTREACHED() << GetDB().GetErrorMessage(); 306 return; 307 } 308 for (size_t i = 0; i < results->size(); ++i) { 309 PageUsageData* pud = (*results)[i]; 310 statement2.BindInt64(0, pud->GetID()); 311 if (statement2.Step()) { 312 pud->SetURL(GURL(statement2.ColumnString(0))); 313 pud->SetTitle(UTF8ToUTF16(statement2.ColumnString(1))); 314 } 315 statement2.Reset(); 316 } 317 } 318 319 void VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) { 320 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 321 "DELETE FROM segment_usage WHERE time_slot < ?")); 322 if (!statement) 323 return; 324 325 statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue()); 326 if (!statement.Run()) 327 NOTREACHED(); 328 } 329 330 void VisitSegmentDatabase::SetSegmentPresentationIndex(SegmentID segment_id, 331 int index) { 332 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, 333 "UPDATE segments SET pres_index = ? WHERE id = ?")); 334 if (!statement) 335 return; 336 337 statement.BindInt(0, index); 338 statement.BindInt64(1, segment_id); 339 if (!statement.Run()) 340 NOTREACHED(); 341 else 342 DCHECK_EQ(1, GetDB().GetLastChangeCount()); 343 } 344 345 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) { 346 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE, 347 "SELECT id FROM segments WHERE url_id = ?")); 348 if (!select) 349 return false; 350 351 sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE, 352 "DELETE FROM segments WHERE id = ?")); 353 if (!delete_seg) 354 return false; 355 356 sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE, 357 "DELETE FROM segment_usage WHERE segment_id = ?")); 358 if (!delete_usage) 359 return false; 360 361 bool r = true; 362 select.BindInt64(0, url_id); 363 // In theory there could not be more than one segment using that URL but we 364 // loop anyway to cleanup any inconsistency. 365 while (select.Step()) { 366 SegmentID segment_id = select.ColumnInt64(0); 367 368 delete_usage.BindInt64(0, segment_id); 369 if (!delete_usage.Run()) { 370 NOTREACHED(); 371 r = false; 372 } 373 374 delete_seg.BindInt64(0, segment_id); 375 if (!delete_seg.Run()) { 376 NOTREACHED(); 377 r = false; 378 } 379 delete_usage.Reset(); 380 delete_seg.Reset(); 381 } 382 return r; 383 } 384 385 } // namespace history 386