1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/history/text_database_manager.h" 6 7 #include "base/compiler_specific.h" 8 #include "base/file_util.h" 9 #include "base/metrics/histogram.h" 10 #include "base/logging.h" 11 #include "base/message_loop.h" 12 #include "base/string_util.h" 13 #include "base/utf_string_conversions.h" 14 #include "chrome/browser/history/history_publisher.h" 15 #include "chrome/browser/history/visit_database.h" 16 #include "content/common/mru_cache.h" 17 18 using base::Time; 19 using base::TimeDelta; 20 using base::TimeTicks; 21 22 namespace history { 23 24 namespace { 25 26 // The number of database files we will be attached to at once. 27 const int kCacheDBSize = 5; 28 29 std::string ConvertStringForIndexer(const string16& input) { 30 // TODO(evanm): other transformations here? 31 return UTF16ToUTF8(CollapseWhitespace(input, false)); 32 } 33 34 // Data older than this will be committed to the full text index even if we 35 // haven't gotten a title and/or body. 36 const int kExpirationSec = 20; 37 38 } // namespace 39 40 // TextDatabaseManager::ChangeSet ---------------------------------------------- 41 42 TextDatabaseManager::ChangeSet::ChangeSet() {} 43 44 TextDatabaseManager::ChangeSet::~ChangeSet() {} 45 46 // TextDatabaseManager::PageInfo ----------------------------------------------- 47 48 TextDatabaseManager::PageInfo::PageInfo(URLID url_id, 49 VisitID visit_id, 50 Time visit_time) 51 : url_id_(url_id), 52 visit_id_(visit_id), 53 visit_time_(visit_time) { 54 added_time_ = TimeTicks::Now(); 55 } 56 57 TextDatabaseManager::PageInfo::~PageInfo() {} 58 59 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) { 60 if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet. 61 title_ = ASCIIToUTF16(" "); 62 else 63 title_ = ttl; 64 } 65 66 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) { 67 if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet. 68 body_ = ASCIIToUTF16(" "); 69 else 70 body_ = bdy; 71 } 72 73 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const { 74 return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec); 75 } 76 77 // TextDatabaseManager --------------------------------------------------------- 78 79 TextDatabaseManager::TextDatabaseManager(const FilePath& dir, 80 URLDatabase* url_database, 81 VisitDatabase* visit_database) 82 : dir_(dir), 83 url_database_(url_database), 84 visit_database_(visit_database), 85 recent_changes_(RecentChangeList::NO_AUTO_EVICT), 86 transaction_nesting_(0), 87 db_cache_(DBCache::NO_AUTO_EVICT), 88 present_databases_loaded_(false), 89 ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)), 90 history_publisher_(NULL) { 91 } 92 93 TextDatabaseManager::~TextDatabaseManager() { 94 if (transaction_nesting_) 95 CommitTransaction(); 96 } 97 98 // static 99 TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) { 100 Time::Exploded exploded; 101 time.UTCExplode(&exploded); 102 103 // We combine the month and year into a 6-digit number (200801 for 104 // January, 2008). The month is 1-based. 105 return exploded.year * 100 + exploded.month; 106 } 107 108 // static 109 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) { 110 Time::Exploded exploded; 111 memset(&exploded, 0, sizeof(Time::Exploded)); 112 exploded.year = id / 100; 113 exploded.month = id % 100; 114 return Time::FromUTCExploded(exploded); 115 } 116 117 bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) { 118 history_publisher_ = history_publisher; 119 120 // Start checking recent changes and committing them. 121 ScheduleFlushOldChanges(); 122 return true; 123 } 124 125 void TextDatabaseManager::BeginTransaction() { 126 transaction_nesting_++; 127 } 128 129 void TextDatabaseManager::CommitTransaction() { 130 DCHECK(transaction_nesting_); 131 transaction_nesting_--; 132 if (transaction_nesting_) 133 return; // Still more nesting of transactions before committing. 134 135 // Commit all databases with open transactions on them. 136 for (DBIdentSet::const_iterator i = open_transactions_.begin(); 137 i != open_transactions_.end(); ++i) { 138 DBCache::iterator iter = db_cache_.Get(*i); 139 if (iter == db_cache_.end()) { 140 NOTREACHED() << "All open transactions should be cached."; 141 continue; 142 } 143 iter->second->CommitTransaction(); 144 } 145 open_transactions_.clear(); 146 147 // Now that the transaction is over, we can expire old connections. 148 db_cache_.ShrinkToSize(kCacheDBSize); 149 } 150 151 void TextDatabaseManager::InitDBList() { 152 if (present_databases_loaded_) 153 return; 154 155 present_databases_loaded_ = true; 156 157 // Find files on disk matching our pattern so we can quickly test for them. 158 FilePath::StringType filepattern(TextDatabase::file_base()); 159 filepattern.append(FILE_PATH_LITERAL("*")); 160 file_util::FileEnumerator enumerator( 161 dir_, false, file_util::FileEnumerator::FILES, filepattern); 162 FilePath cur_file; 163 while (!(cur_file = enumerator.Next()).empty()) { 164 // Convert to the number representing this file. 165 TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file); 166 if (id) // Will be 0 on error. 167 present_databases_.insert(id); 168 } 169 } 170 171 void TextDatabaseManager::AddPageURL(const GURL& url, 172 URLID url_id, 173 VisitID visit_id, 174 Time time) { 175 // Delete any existing page info. 176 RecentChangeList::iterator found = recent_changes_.Peek(url); 177 if (found != recent_changes_.end()) 178 recent_changes_.Erase(found); 179 180 // Just save this info for later. We will save it when it expires or when all 181 // the data is complete. 182 recent_changes_.Put(url, PageInfo(url_id, visit_id, time)); 183 } 184 185 void TextDatabaseManager::AddPageTitle(const GURL& url, 186 const string16& title) { 187 RecentChangeList::iterator found = recent_changes_.Peek(url); 188 if (found == recent_changes_.end()) { 189 // This page is not in our cache of recent pages. This is very much an edge 190 // case as normally a title will come in <20 seconds after the page commits, 191 // and TabContents will avoid spamming us with >1 title per page. However, 192 // it could come up if your connection is unhappy, and we don't want to 193 // miss anything. 194 // 195 // To solve this problem, we'll just associate the most recent visit with 196 // the new title and index that using the regular code path. 197 URLRow url_row; 198 if (!url_database_->GetRowForURL(url, &url_row)) 199 return; // URL is unknown, give up. 200 VisitRow visit; 201 if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) 202 return; // No recent visit, give up. 203 204 if (visit.is_indexed) { 205 // If this page was already indexed, we could have a body that came in 206 // first and we don't want to overwrite it. We could go query for the 207 // current body, or have a special setter for only the title, but this is 208 // not worth it for this edge case. 209 // 210 // It will be almost impossible for the title to take longer than 211 // kExpirationSec yet we got a body in less than that time, since the 212 // title should always come in first. 213 return; 214 } 215 216 AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, 217 title, string16()); 218 return; // We don't know about this page, give up. 219 } 220 221 PageInfo& info = found->second; 222 if (info.has_body()) { 223 // This info is complete, write to the database. 224 AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(), 225 title, info.body()); 226 recent_changes_.Erase(found); 227 return; 228 } 229 230 info.set_title(title); 231 } 232 233 void TextDatabaseManager::AddPageContents(const GURL& url, 234 const string16& body) { 235 RecentChangeList::iterator found = recent_changes_.Peek(url); 236 if (found == recent_changes_.end()) { 237 // This page is not in our cache of recent pages. This means that the page 238 // took more than kExpirationSec to load. Often, this will be the result of 239 // a very slow iframe or other resource on the page that makes us think its 240 // still loading. 241 // 242 // As a fallback, set the most recent visit's contents using the input, and 243 // use the last set title in the URL table as the title to index. 244 URLRow url_row; 245 if (!url_database_->GetRowForURL(url, &url_row)) 246 return; // URL is unknown, give up. 247 VisitRow visit; 248 if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) 249 return; // No recent visit, give up. 250 251 // Use the title from the URL row as the title for the indexing. 252 AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, 253 url_row.title(), body); 254 return; 255 } 256 257 PageInfo& info = found->second; 258 if (info.has_title()) { 259 // This info is complete, write to the database. 260 AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(), 261 info.title(), body); 262 recent_changes_.Erase(found); 263 return; 264 } 265 266 info.set_body(body); 267 } 268 269 bool TextDatabaseManager::AddPageData(const GURL& url, 270 URLID url_id, 271 VisitID visit_id, 272 Time visit_time, 273 const string16& title, 274 const string16& body) { 275 TextDatabase* db = GetDBForTime(visit_time, true); 276 if (!db) 277 return false; 278 279 TimeTicks beginning_time = TimeTicks::Now(); 280 281 // First delete any recently-indexed data for this page. This will delete 282 // anything in the main database, but we don't bother looking through the 283 // archived database. 284 VisitVector visits; 285 visit_database_->GetVisitsForURL(url_id, &visits); 286 size_t our_visit_row_index = visits.size(); 287 for (size_t i = 0; i < visits.size(); i++) { 288 // While we're going trough all the visits, also find our row so we can 289 // avoid another DB query. 290 if (visits[i].visit_id == visit_id) { 291 our_visit_row_index = i; 292 } else if (visits[i].is_indexed) { 293 visits[i].is_indexed = false; 294 visit_database_->UpdateVisitRow(visits[i]); 295 DeletePageData(visits[i].visit_time, url, NULL); 296 } 297 } 298 299 if (visit_id) { 300 // We're supposed to update the visit database. 301 if (our_visit_row_index >= visits.size()) { 302 NOTREACHED() << "We should always have found a visit when given an ID."; 303 return false; 304 } 305 306 DCHECK(visit_time == visits[our_visit_row_index].visit_time); 307 308 // Update the visit database to reference our addition. 309 visits[our_visit_row_index].is_indexed = true; 310 if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index])) 311 return false; 312 } 313 314 // Now index the data. 315 std::string url_str = URLDatabase::GURLToDatabaseURL(url); 316 bool success = db->AddPageData(visit_time, url_str, 317 ConvertStringForIndexer(title), 318 ConvertStringForIndexer(body)); 319 320 UMA_HISTOGRAM_TIMES("History.AddFTSData", 321 TimeTicks::Now() - beginning_time); 322 323 if (history_publisher_) 324 history_publisher_->PublishPageContent(visit_time, url, title, body); 325 326 return success; 327 } 328 329 void TextDatabaseManager::DeletePageData(Time time, const GURL& url, 330 ChangeSet* change_set) { 331 TextDatabase::DBIdent db_ident = TimeToID(time); 332 333 // We want to open the database for writing, but only if it exists. To 334 // achieve this, we check whether it exists by saying we're not going to 335 // write to it (avoiding the autocreation code normally called when writing) 336 // and then access it for writing only if it succeeds. 337 TextDatabase* db = GetDB(db_ident, false); 338 if (!db) 339 return; 340 db = GetDB(db_ident, true); 341 342 if (change_set) 343 change_set->Add(db_ident); 344 345 db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url)); 346 } 347 348 void TextDatabaseManager::DeleteFromUncommitted( 349 const std::set<GURL>& restrict_urls, Time begin, Time end) { 350 // First find the beginning of the range to delete. Recall that the list 351 // has the most recent item at the beginning. There won't normally be very 352 // many items, so a brute-force search is fine. 353 RecentChangeList::iterator cur = recent_changes_.begin(); 354 if (!end.is_null()) { 355 // Walk from the beginning of the list backwards in time to find the newest 356 // entry that should be deleted. 357 while (cur != recent_changes_.end() && cur->second.visit_time() >= end) 358 ++cur; 359 } 360 361 // Now delete all visits up to the oldest one we were supposed to delete. 362 // Note that if begin is_null, it will be less than or equal to any other 363 // time. 364 if (restrict_urls.empty()) { 365 while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) 366 cur = recent_changes_.Erase(cur); 367 } else { 368 while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) { 369 if (restrict_urls.find(cur->first) != restrict_urls.end()) 370 cur = recent_changes_.Erase(cur); 371 else 372 ++cur; 373 } 374 } 375 } 376 377 void TextDatabaseManager::DeleteAll() { 378 DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction."; 379 380 InitDBList(); 381 382 // Close all open databases. 383 db_cache_.Clear(); 384 385 // Now go through and delete all the files. 386 for (DBIdentSet::iterator i = present_databases_.begin(); 387 i != present_databases_.end(); ++i) { 388 FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i)); 389 file_util::Delete(file_name, false); 390 } 391 } 392 393 void TextDatabaseManager::OptimizeChangedDatabases( 394 const ChangeSet& change_set) { 395 for (ChangeSet::DBSet::const_iterator i = 396 change_set.changed_databases_.begin(); 397 i != change_set.changed_databases_.end(); ++i) { 398 // We want to open the database for writing, but only if it exists. To 399 // achieve this, we check whether it exists by saying we're not going to 400 // write to it (avoiding the autocreation code normally called when writing) 401 // and then access it for writing only if it succeeds. 402 TextDatabase* db = GetDB(*i, false); 403 if (!db) 404 continue; 405 db = GetDB(*i, true); 406 if (!db) 407 continue; // The file may have changed or something. 408 db->Optimize(); 409 } 410 } 411 412 void TextDatabaseManager::GetTextMatches( 413 const string16& query, 414 const QueryOptions& options, 415 std::vector<TextDatabase::Match>* results, 416 Time* first_time_searched) { 417 results->clear(); 418 419 InitDBList(); 420 if (present_databases_.empty()) { 421 // Nothing to search. 422 *first_time_searched = options.begin_time; 423 return; 424 } 425 426 // Get the query into the proper format for the individual DBs. 427 string16 fts_query16; 428 query_parser_.ParseQuery(query, &fts_query16); 429 std::string fts_query = UTF16ToUTF8(fts_query16); 430 431 // Need a copy of the options so we can modify the max count for each call 432 // to the individual databases. 433 QueryOptions cur_options(options); 434 435 // Compute the minimum and maximum values for the identifiers that could 436 // encompass the input time range. 437 TextDatabase::DBIdent min_ident = options.begin_time.is_null() ? 438 *present_databases_.begin() : 439 TimeToID(options.begin_time); 440 TextDatabase::DBIdent max_ident = options.end_time.is_null() ? 441 *present_databases_.rbegin() : 442 TimeToID(options.end_time); 443 444 // Iterate over the databases from the most recent backwards. 445 bool checked_one = false; 446 TextDatabase::URLSet found_urls; 447 for (DBIdentSet::reverse_iterator i = present_databases_.rbegin(); 448 i != present_databases_.rend(); 449 ++i) { 450 // TODO(brettw) allow canceling the query in the middle. 451 // if (canceled_or_something) 452 // break; 453 454 // This code is stupid, we just loop until we find the correct starting 455 // time range rather than search in an intelligent way. Users will have a 456 // few dozen files at most, so this should not be an issue. 457 if (*i > max_ident) 458 continue; // Haven't gotten to the time range yet. 459 if (*i < min_ident) 460 break; // Covered all the time range. 461 462 TextDatabase* cur_db = GetDB(*i, false); 463 if (!cur_db) 464 continue; 465 466 // Adjust the max count according to how many results we've already got. 467 if (options.max_count) { 468 cur_options.max_count = options.max_count - 469 static_cast<int>(results->size()); 470 } 471 472 // Since we are going backwards in time, it is always OK to pass the 473 // current first_time_searched, since it will always be smaller than 474 // any previous set. 475 cur_db->GetTextMatches(fts_query, cur_options, 476 results, &found_urls, first_time_searched); 477 checked_one = true; 478 479 DCHECK(options.max_count == 0 || 480 static_cast<int>(results->size()) <= options.max_count); 481 if (options.max_count && 482 static_cast<int>(results->size()) >= options.max_count) 483 break; // Got the max number of results. 484 } 485 486 // When there were no databases in the range, we need to fix up the min time. 487 if (!checked_one) 488 *first_time_searched = options.begin_time; 489 } 490 491 TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id, 492 bool for_writing) { 493 DBCache::iterator found_db = db_cache_.Get(id); 494 if (found_db != db_cache_.end()) { 495 if (transaction_nesting_ && for_writing && 496 open_transactions_.find(id) == open_transactions_.end()) { 497 // If we currently have an open transaction, that database is not yet 498 // part of the transaction, and the database will be written to, it needs 499 // to be part of our transaction. 500 found_db->second->BeginTransaction(); 501 open_transactions_.insert(id); 502 } 503 return found_db->second; 504 } 505 506 // Need to make the database. 507 TextDatabase* new_db = new TextDatabase(dir_, id, for_writing); 508 if (!new_db->Init()) { 509 delete new_db; 510 return NULL; 511 } 512 db_cache_.Put(id, new_db); 513 present_databases_.insert(id); 514 515 if (transaction_nesting_ && for_writing) { 516 // If we currently have an open transaction and the new database will be 517 // written to, it needs to be part of our transaction. 518 new_db->BeginTransaction(); 519 open_transactions_.insert(id); 520 } 521 522 // When no transaction is open, allow this new one to kick out an old one. 523 if (!transaction_nesting_) 524 db_cache_.ShrinkToSize(kCacheDBSize); 525 526 return new_db; 527 } 528 529 TextDatabase* TextDatabaseManager::GetDBForTime(Time time, 530 bool create_if_necessary) { 531 return GetDB(TimeToID(time), create_if_necessary); 532 } 533 534 void TextDatabaseManager::ScheduleFlushOldChanges() { 535 factory_.RevokeAll(); 536 MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod( 537 &TextDatabaseManager::FlushOldChanges), 538 kExpirationSec * Time::kMillisecondsPerSecond); 539 } 540 541 void TextDatabaseManager::FlushOldChanges() { 542 FlushOldChangesForTime(TimeTicks::Now()); 543 } 544 545 void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) { 546 // The end of the list is the oldest, so we just start from there committing 547 // things until we get something too new. 548 RecentChangeList::reverse_iterator i = recent_changes_.rbegin(); 549 while (i != recent_changes_.rend() && i->second.Expired(now)) { 550 AddPageData(i->first, i->second.url_id(), i->second.visit_id(), 551 i->second.visit_time(), i->second.title(), i->second.body()); 552 i = recent_changes_.Erase(i); 553 } 554 555 ScheduleFlushOldChanges(); 556 } 557 558 } // namespace history 559