Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/text_database_manager.h"
      6 
      7 #include "base/compiler_specific.h"
      8 #include "base/file_util.h"
      9 #include "base/metrics/histogram.h"
     10 #include "base/logging.h"
     11 #include "base/message_loop.h"
     12 #include "base/string_util.h"
     13 #include "base/utf_string_conversions.h"
     14 #include "chrome/browser/history/history_publisher.h"
     15 #include "chrome/browser/history/visit_database.h"
     16 #include "content/common/mru_cache.h"
     17 
     18 using base::Time;
     19 using base::TimeDelta;
     20 using base::TimeTicks;
     21 
     22 namespace history {
     23 
     24 namespace {
     25 
     26 // The number of database files we will be attached to at once.
     27 const int kCacheDBSize = 5;
     28 
     29 std::string ConvertStringForIndexer(const string16& input) {
     30   // TODO(evanm): other transformations here?
     31   return UTF16ToUTF8(CollapseWhitespace(input, false));
     32 }
     33 
     34 // Data older than this will be committed to the full text index even if we
     35 // haven't gotten a title and/or body.
     36 const int kExpirationSec = 20;
     37 
     38 }  // namespace
     39 
     40 // TextDatabaseManager::ChangeSet ----------------------------------------------
     41 
     42 TextDatabaseManager::ChangeSet::ChangeSet() {}
     43 
     44 TextDatabaseManager::ChangeSet::~ChangeSet() {}
     45 
     46 // TextDatabaseManager::PageInfo -----------------------------------------------
     47 
     48 TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
     49                                         VisitID visit_id,
     50                                         Time visit_time)
     51     : url_id_(url_id),
     52       visit_id_(visit_id),
     53       visit_time_(visit_time) {
     54   added_time_ = TimeTicks::Now();
     55 }
     56 
     57 TextDatabaseManager::PageInfo::~PageInfo() {}
     58 
     59 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) {
     60   if (ttl.empty())  // Make the title nonempty when we set it for EverybodySet.
     61     title_ = ASCIIToUTF16(" ");
     62   else
     63     title_ = ttl;
     64 }
     65 
     66 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) {
     67   if (bdy.empty())  // Make the body nonempty when we set it for EverybodySet.
     68     body_ = ASCIIToUTF16(" ");
     69   else
     70     body_ = bdy;
     71 }
     72 
     73 bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
     74   return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec);
     75 }
     76 
     77 // TextDatabaseManager ---------------------------------------------------------
     78 
     79 TextDatabaseManager::TextDatabaseManager(const FilePath& dir,
     80                                          URLDatabase* url_database,
     81                                          VisitDatabase* visit_database)
     82     : dir_(dir),
     83       url_database_(url_database),
     84       visit_database_(visit_database),
     85       recent_changes_(RecentChangeList::NO_AUTO_EVICT),
     86       transaction_nesting_(0),
     87       db_cache_(DBCache::NO_AUTO_EVICT),
     88       present_databases_loaded_(false),
     89       ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
     90       history_publisher_(NULL) {
     91 }
     92 
     93 TextDatabaseManager::~TextDatabaseManager() {
     94   if (transaction_nesting_)
     95     CommitTransaction();
     96 }
     97 
     98 // static
     99 TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
    100   Time::Exploded exploded;
    101   time.UTCExplode(&exploded);
    102 
    103   // We combine the month and year into a 6-digit number (200801 for
    104   // January, 2008). The month is 1-based.
    105   return exploded.year * 100 + exploded.month;
    106 }
    107 
    108 // static
    109 Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
    110   Time::Exploded exploded;
    111   memset(&exploded, 0, sizeof(Time::Exploded));
    112   exploded.year = id / 100;
    113   exploded.month = id % 100;
    114   return Time::FromUTCExploded(exploded);
    115 }
    116 
    117 bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
    118   history_publisher_ = history_publisher;
    119 
    120   // Start checking recent changes and committing them.
    121   ScheduleFlushOldChanges();
    122   return true;
    123 }
    124 
    125 void TextDatabaseManager::BeginTransaction() {
    126   transaction_nesting_++;
    127 }
    128 
    129 void TextDatabaseManager::CommitTransaction() {
    130   DCHECK(transaction_nesting_);
    131   transaction_nesting_--;
    132   if (transaction_nesting_)
    133     return;  // Still more nesting of transactions before committing.
    134 
    135   // Commit all databases with open transactions on them.
    136   for (DBIdentSet::const_iterator i = open_transactions_.begin();
    137        i != open_transactions_.end(); ++i) {
    138     DBCache::iterator iter = db_cache_.Get(*i);
    139     if (iter == db_cache_.end()) {
    140       NOTREACHED() << "All open transactions should be cached.";
    141       continue;
    142     }
    143     iter->second->CommitTransaction();
    144   }
    145   open_transactions_.clear();
    146 
    147   // Now that the transaction is over, we can expire old connections.
    148   db_cache_.ShrinkToSize(kCacheDBSize);
    149 }
    150 
    151 void TextDatabaseManager::InitDBList() {
    152   if (present_databases_loaded_)
    153     return;
    154 
    155   present_databases_loaded_ = true;
    156 
    157   // Find files on disk matching our pattern so we can quickly test for them.
    158   FilePath::StringType filepattern(TextDatabase::file_base());
    159   filepattern.append(FILE_PATH_LITERAL("*"));
    160   file_util::FileEnumerator enumerator(
    161       dir_, false, file_util::FileEnumerator::FILES, filepattern);
    162   FilePath cur_file;
    163   while (!(cur_file = enumerator.Next()).empty()) {
    164     // Convert to the number representing this file.
    165     TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
    166     if (id)  // Will be 0 on error.
    167       present_databases_.insert(id);
    168   }
    169 }
    170 
    171 void TextDatabaseManager::AddPageURL(const GURL& url,
    172                                      URLID url_id,
    173                                      VisitID visit_id,
    174                                      Time time) {
    175   // Delete any existing page info.
    176   RecentChangeList::iterator found = recent_changes_.Peek(url);
    177   if (found != recent_changes_.end())
    178     recent_changes_.Erase(found);
    179 
    180   // Just save this info for later. We will save it when it expires or when all
    181   // the data is complete.
    182   recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
    183 }
    184 
    185 void TextDatabaseManager::AddPageTitle(const GURL& url,
    186                                        const string16& title) {
    187   RecentChangeList::iterator found = recent_changes_.Peek(url);
    188   if (found == recent_changes_.end()) {
    189     // This page is not in our cache of recent pages. This is very much an edge
    190     // case as normally a title will come in <20 seconds after the page commits,
    191     // and TabContents will avoid spamming us with >1 title per page. However,
    192     // it could come up if your connection is unhappy, and we don't want to
    193     // miss anything.
    194     //
    195     // To solve this problem, we'll just associate the most recent visit with
    196     // the new title and index that using the regular code path.
    197     URLRow url_row;
    198     if (!url_database_->GetRowForURL(url, &url_row))
    199       return;  // URL is unknown, give up.
    200     VisitRow visit;
    201     if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
    202       return;  // No recent visit, give up.
    203 
    204     if (visit.is_indexed) {
    205       // If this page was already indexed, we could have a body that came in
    206       // first and we don't want to overwrite it. We could go query for the
    207       // current body, or have a special setter for only the title, but this is
    208       // not worth it for this edge case.
    209       //
    210       // It will be almost impossible for the title to take longer than
    211       // kExpirationSec yet we got a body in less than that time, since the
    212       // title should always come in first.
    213       return;
    214     }
    215 
    216     AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
    217                 title, string16());
    218     return;  // We don't know about this page, give up.
    219   }
    220 
    221   PageInfo& info = found->second;
    222   if (info.has_body()) {
    223     // This info is complete, write to the database.
    224     AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
    225                 title, info.body());
    226     recent_changes_.Erase(found);
    227     return;
    228   }
    229 
    230   info.set_title(title);
    231 }
    232 
    233 void TextDatabaseManager::AddPageContents(const GURL& url,
    234                                           const string16& body) {
    235   RecentChangeList::iterator found = recent_changes_.Peek(url);
    236   if (found == recent_changes_.end()) {
    237     // This page is not in our cache of recent pages. This means that the page
    238     // took more than kExpirationSec to load. Often, this will be the result of
    239     // a very slow iframe or other resource on the page that makes us think its
    240     // still loading.
    241     //
    242     // As a fallback, set the most recent visit's contents using the input, and
    243     // use the last set title in the URL table as the title to index.
    244     URLRow url_row;
    245     if (!url_database_->GetRowForURL(url, &url_row))
    246       return;  // URL is unknown, give up.
    247     VisitRow visit;
    248     if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
    249       return;  // No recent visit, give up.
    250 
    251     // Use the title from the URL row as the title for the indexing.
    252     AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
    253                 url_row.title(), body);
    254     return;
    255   }
    256 
    257   PageInfo& info = found->second;
    258   if (info.has_title()) {
    259     // This info is complete, write to the database.
    260     AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
    261                 info.title(), body);
    262     recent_changes_.Erase(found);
    263     return;
    264   }
    265 
    266   info.set_body(body);
    267 }
    268 
    269 bool TextDatabaseManager::AddPageData(const GURL& url,
    270                                       URLID url_id,
    271                                       VisitID visit_id,
    272                                       Time visit_time,
    273                                       const string16& title,
    274                                       const string16& body) {
    275   TextDatabase* db = GetDBForTime(visit_time, true);
    276   if (!db)
    277     return false;
    278 
    279   TimeTicks beginning_time = TimeTicks::Now();
    280 
    281   // First delete any recently-indexed data for this page. This will delete
    282   // anything in the main database, but we don't bother looking through the
    283   // archived database.
    284   VisitVector visits;
    285   visit_database_->GetVisitsForURL(url_id, &visits);
    286   size_t our_visit_row_index = visits.size();
    287   for (size_t i = 0; i < visits.size(); i++) {
    288     // While we're going trough all the visits, also find our row so we can
    289     // avoid another DB query.
    290     if (visits[i].visit_id == visit_id) {
    291       our_visit_row_index = i;
    292     } else if (visits[i].is_indexed) {
    293       visits[i].is_indexed = false;
    294       visit_database_->UpdateVisitRow(visits[i]);
    295       DeletePageData(visits[i].visit_time, url, NULL);
    296     }
    297   }
    298 
    299   if (visit_id) {
    300     // We're supposed to update the visit database.
    301     if (our_visit_row_index >= visits.size()) {
    302       NOTREACHED() << "We should always have found a visit when given an ID.";
    303       return false;
    304     }
    305 
    306     DCHECK(visit_time == visits[our_visit_row_index].visit_time);
    307 
    308     // Update the visit database to reference our addition.
    309     visits[our_visit_row_index].is_indexed = true;
    310     if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
    311       return false;
    312   }
    313 
    314   // Now index the data.
    315   std::string url_str = URLDatabase::GURLToDatabaseURL(url);
    316   bool success = db->AddPageData(visit_time, url_str,
    317                                  ConvertStringForIndexer(title),
    318                                  ConvertStringForIndexer(body));
    319 
    320   UMA_HISTOGRAM_TIMES("History.AddFTSData",
    321                       TimeTicks::Now() - beginning_time);
    322 
    323   if (history_publisher_)
    324     history_publisher_->PublishPageContent(visit_time, url, title, body);
    325 
    326   return success;
    327 }
    328 
    329 void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
    330                                          ChangeSet* change_set) {
    331   TextDatabase::DBIdent db_ident = TimeToID(time);
    332 
    333   // We want to open the database for writing, but only if it exists. To
    334   // achieve this, we check whether it exists by saying we're not going to
    335   // write to it (avoiding the autocreation code normally called when writing)
    336   // and then access it for writing only if it succeeds.
    337   TextDatabase* db = GetDB(db_ident, false);
    338   if (!db)
    339     return;
    340   db = GetDB(db_ident, true);
    341 
    342   if (change_set)
    343     change_set->Add(db_ident);
    344 
    345   db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
    346 }
    347 
    348 void TextDatabaseManager::DeleteFromUncommitted(
    349     const std::set<GURL>& restrict_urls, Time begin, Time end) {
    350   // First find the beginning of the range to delete. Recall that the list
    351   // has the most recent item at the beginning. There won't normally be very
    352   // many items, so a brute-force search is fine.
    353   RecentChangeList::iterator cur = recent_changes_.begin();
    354   if (!end.is_null()) {
    355     // Walk from the beginning of the list backwards in time to find the newest
    356     // entry that should be deleted.
    357     while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
    358       ++cur;
    359   }
    360 
    361   // Now delete all visits up to the oldest one we were supposed to delete.
    362   // Note that if begin is_null, it will be less than or equal to any other
    363   // time.
    364   if (restrict_urls.empty()) {
    365     while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
    366       cur = recent_changes_.Erase(cur);
    367   } else {
    368     while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
    369       if (restrict_urls.find(cur->first) != restrict_urls.end())
    370         cur = recent_changes_.Erase(cur);
    371       else
    372         ++cur;
    373     }
    374   }
    375 }
    376 
    377 void TextDatabaseManager::DeleteAll() {
    378   DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";
    379 
    380   InitDBList();
    381 
    382   // Close all open databases.
    383   db_cache_.Clear();
    384 
    385   // Now go through and delete all the files.
    386   for (DBIdentSet::iterator i = present_databases_.begin();
    387        i != present_databases_.end(); ++i) {
    388     FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
    389     file_util::Delete(file_name, false);
    390   }
    391 }
    392 
    393 void TextDatabaseManager::OptimizeChangedDatabases(
    394     const ChangeSet& change_set) {
    395   for (ChangeSet::DBSet::const_iterator i =
    396            change_set.changed_databases_.begin();
    397        i != change_set.changed_databases_.end(); ++i) {
    398     // We want to open the database for writing, but only if it exists. To
    399     // achieve this, we check whether it exists by saying we're not going to
    400     // write to it (avoiding the autocreation code normally called when writing)
    401     // and then access it for writing only if it succeeds.
    402     TextDatabase* db = GetDB(*i, false);
    403     if (!db)
    404       continue;
    405     db = GetDB(*i, true);
    406     if (!db)
    407       continue;  // The file may have changed or something.
    408     db->Optimize();
    409   }
    410 }
    411 
    412 void TextDatabaseManager::GetTextMatches(
    413     const string16& query,
    414     const QueryOptions& options,
    415     std::vector<TextDatabase::Match>* results,
    416     Time* first_time_searched) {
    417   results->clear();
    418 
    419   InitDBList();
    420   if (present_databases_.empty()) {
    421     // Nothing to search.
    422     *first_time_searched = options.begin_time;
    423     return;
    424   }
    425 
    426   // Get the query into the proper format for the individual DBs.
    427   string16 fts_query16;
    428   query_parser_.ParseQuery(query, &fts_query16);
    429   std::string fts_query = UTF16ToUTF8(fts_query16);
    430 
    431   // Need a copy of the options so we can modify the max count for each call
    432   // to the individual databases.
    433   QueryOptions cur_options(options);
    434 
    435   // Compute the minimum and maximum values for the identifiers that could
    436   // encompass the input time range.
    437   TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
    438       *present_databases_.begin() :
    439       TimeToID(options.begin_time);
    440   TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
    441       *present_databases_.rbegin() :
    442       TimeToID(options.end_time);
    443 
    444   // Iterate over the databases from the most recent backwards.
    445   bool checked_one = false;
    446   TextDatabase::URLSet found_urls;
    447   for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
    448        i != present_databases_.rend();
    449        ++i) {
    450     // TODO(brettw) allow canceling the query in the middle.
    451     // if (canceled_or_something)
    452     //   break;
    453 
    454     // This code is stupid, we just loop until we find the correct starting
    455     // time range rather than search in an intelligent way. Users will have a
    456     // few dozen files at most, so this should not be an issue.
    457     if (*i > max_ident)
    458       continue;  // Haven't gotten to the time range yet.
    459     if (*i < min_ident)
    460       break;  // Covered all the time range.
    461 
    462     TextDatabase* cur_db = GetDB(*i, false);
    463     if (!cur_db)
    464       continue;
    465 
    466     // Adjust the max count according to how many results we've already got.
    467     if (options.max_count) {
    468       cur_options.max_count = options.max_count -
    469           static_cast<int>(results->size());
    470     }
    471 
    472     // Since we are going backwards in time, it is always OK to pass the
    473     // current first_time_searched, since it will always be smaller than
    474     // any previous set.
    475     cur_db->GetTextMatches(fts_query, cur_options,
    476                            results, &found_urls, first_time_searched);
    477     checked_one = true;
    478 
    479     DCHECK(options.max_count == 0 ||
    480            static_cast<int>(results->size()) <= options.max_count);
    481     if (options.max_count &&
    482         static_cast<int>(results->size()) >= options.max_count)
    483       break;  // Got the max number of results.
    484   }
    485 
    486   // When there were no databases in the range, we need to fix up the min time.
    487   if (!checked_one)
    488     *first_time_searched = options.begin_time;
    489 }
    490 
    491 TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
    492                                          bool for_writing) {
    493   DBCache::iterator found_db = db_cache_.Get(id);
    494   if (found_db != db_cache_.end()) {
    495     if (transaction_nesting_ && for_writing &&
    496         open_transactions_.find(id) == open_transactions_.end()) {
    497       // If we currently have an open transaction, that database is not yet
    498       // part of the transaction, and the database will be written to, it needs
    499       // to be part of our transaction.
    500       found_db->second->BeginTransaction();
    501       open_transactions_.insert(id);
    502     }
    503     return found_db->second;
    504   }
    505 
    506   // Need to make the database.
    507   TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
    508   if (!new_db->Init()) {
    509     delete new_db;
    510     return NULL;
    511   }
    512   db_cache_.Put(id, new_db);
    513   present_databases_.insert(id);
    514 
    515   if (transaction_nesting_ && for_writing) {
    516     // If we currently have an open transaction and the new database will be
    517     // written to, it needs to be part of our transaction.
    518     new_db->BeginTransaction();
    519     open_transactions_.insert(id);
    520   }
    521 
    522   // When no transaction is open, allow this new one to kick out an old one.
    523   if (!transaction_nesting_)
    524     db_cache_.ShrinkToSize(kCacheDBSize);
    525 
    526   return new_db;
    527 }
    528 
    529 TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
    530                                                 bool create_if_necessary) {
    531   return GetDB(TimeToID(time), create_if_necessary);
    532 }
    533 
    534 void TextDatabaseManager::ScheduleFlushOldChanges() {
    535   factory_.RevokeAll();
    536   MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod(
    537           &TextDatabaseManager::FlushOldChanges),
    538       kExpirationSec * Time::kMillisecondsPerSecond);
    539 }
    540 
    541 void TextDatabaseManager::FlushOldChanges() {
    542   FlushOldChangesForTime(TimeTicks::Now());
    543 }
    544 
    545 void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
    546   // The end of the list is the oldest, so we just start from there committing
    547   // things until we get something too new.
    548   RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
    549   while (i != recent_changes_.rend() && i->second.Expired(now)) {
    550     AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
    551                 i->second.visit_time(), i->second.title(), i->second.body());
    552     i = recent_changes_.Erase(i);
    553   }
    554 
    555   ScheduleFlushOldChanges();
    556 }
    557 
    558 }  // namespace history
    559