Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <limits>
      6 #include <set>
      7 #include <string>
      8 
      9 #include "chrome/browser/history/text_database.h"
     10 
     11 #include "app/sql/statement.h"
     12 #include "app/sql/transaction.h"
     13 #include "base/file_util.h"
     14 #include "base/logging.h"
     15 #include "base/metrics/histogram.h"
     16 #include "base/string_number_conversions.h"
     17 #include "base/string_util.h"
     18 #include "base/utf_string_conversions.h"
     19 #include "chrome/browser/diagnostics/sqlite_diagnostics.h"
     20 
     21 // There are two tables in each database, one full-text search (FTS) table which
     22 // indexes the contents and title of the pages. The other is a regular SQLITE
     23 // table which contains non-indexed information about the page. All columns of
     24 // a FTS table are indexed using the text search algorithm, which isn't what we
     25 // want for things like times. If this were in the FTS table, there would be
     26 // different words in the index for each time number.
     27 //
     28 // "pages" FTS table:
     29 //   url    URL of the page so searches will match the URL.
     30 //   title  Title of the page.
     31 //   body   Body of the page.
     32 //
     33 // "info" regular table:
     34 //   time     Time the corresponding FTS entry was visited.
     35 //
     36 // We do joins across these two tables by using their internal rowids, which we
     37 // keep in sync between the two tables. The internal rowid is the only part of
     38 // an FTS table that is indexed like a normal table, and the index over it is
     39 // free since sqlite always indexes the internal rowid.
     40 
     41 namespace history {
     42 
     43 namespace {
     44 
     45 // Version 1 uses FTS2 for index files.
     46 // Version 2 uses FTS3.
     47 static const int kCurrentVersionNumber = 2;
     48 static const int kCompatibleVersionNumber = 2;
     49 
     50 // Snippet computation relies on the index of the columns in the original
     51 // create statement. These are the 0-based indices (as strings) of the
     52 // corresponding columns.
     53 const char kTitleColumnIndex[] = "1";
     54 const char kBodyColumnIndex[] = "2";
     55 
     56 // The string prepended to the database identifier to generate the filename.
     57 const FilePath::CharType kFilePrefix[] = FILE_PATH_LITERAL("History Index ");
     58 
     59 }  // namespace
     60 
     61 TextDatabase::Match::Match() {}
     62 
     63 TextDatabase::Match::~Match() {}
     64 
     65 TextDatabase::TextDatabase(const FilePath& path,
     66                            DBIdent id,
     67                            bool allow_create)
     68     : path_(path),
     69       ident_(id),
     70       allow_create_(allow_create) {
     71   // Compute the file name.
     72   file_name_ = path_.Append(IDToFileName(ident_));
     73 }
     74 
     75 TextDatabase::~TextDatabase() {
     76 }
     77 
     78 // static
     79 const FilePath::CharType* TextDatabase::file_base() {
     80   return kFilePrefix;
     81 }
     82 
     83 // static
     84 FilePath TextDatabase::IDToFileName(DBIdent id) {
     85   // Identifiers are intended to be a combination of the year and month, for
     86   // example, 200801 for January 2008. We convert this to
     87   // "History Index 2008-01". However, we don't make assumptions about this
     88   // scheme: the caller should assign IDs as it feels fit with the knowledge
     89   // that they will apppear on disk in this form.
     90   FilePath::StringType filename(file_base());
     91   base::StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"),
     92                       id / 100, id % 100);
     93   return FilePath(filename);
     94 }
     95 
     96 // static
     97 TextDatabase::DBIdent TextDatabase::FileNameToID(const FilePath& file_path) {
     98   FilePath::StringType file_name = file_path.BaseName().value();
     99 
    100   // We don't actually check the prefix here. Since the file system could
    101   // be case insensitive in ways we can't predict (NTFS), checking could
    102   // potentially be the wrong thing to do. Instead, we just look for a suffix.
    103   static const size_t kIDStringLength = 7;  // Room for "xxxx-xx".
    104   if (file_name.length() < kIDStringLength)
    105     return 0;
    106   const FilePath::StringType suffix(
    107       &file_name[file_name.length() - kIDStringLength]);
    108 
    109   if (suffix.length() != kIDStringLength ||
    110       suffix[4] != FILE_PATH_LITERAL('-')) {
    111     return 0;
    112   }
    113 
    114   int year, month;
    115   base::StringToInt(suffix.begin(), suffix.begin() + 4, &year);
    116   base::StringToInt(suffix.begin() + 5, suffix.begin() + 7, &month);
    117 
    118   return year * 100 + month;
    119 }
    120 
    121 bool TextDatabase::Init() {
    122   // Make sure, if we're not allowed to create the file, that it exists.
    123   if (!allow_create_) {
    124     if (!file_util::PathExists(file_name_))
    125       return false;
    126   }
    127 
    128   // Set the exceptional sqlite error handler.
    129   db_.set_error_delegate(GetErrorHandlerForTextDb());
    130 
    131   // Set the database page size to something a little larger to give us
    132   // better performance (we're typically seek rather than bandwidth limited).
    133   // This only has an effect before any tables have been created, otherwise
    134   // this is a NOP. Must be a power of 2 and a max of 8192.
    135   db_.set_page_size(4096);
    136 
    137   // The default cache size is 2000 which give >8MB of data. Since we will often
    138   // have 2-3 of these objects, each with their own 8MB, this adds up very fast.
    139   // We therefore reduce the size so when there are multiple objects, we're not
    140   // too big.
    141   db_.set_cache_size(512);
    142 
    143   // Run the database in exclusive mode. Nobody else should be accessing the
    144   // database while we're running, and this will give somewhat improved perf.
    145   db_.set_exclusive_locking();
    146 
    147   // Attach the database to our index file.
    148   if (!db_.Open(file_name_))
    149     return false;
    150 
    151   // Meta table tracking version information.
    152   if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber))
    153     return false;
    154   if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) {
    155     // This version is too new. We don't bother notifying the user on this
    156     // error, and just fail to use the file. Normally if they have version skew,
    157     // they will get it for the main history file and it won't be necessary
    158     // here. If that's not the case, since this is only indexed data, it's
    159     // probably better to just not give FTS results than strange errors when
    160     // everything else is working OK.
    161     LOG(WARNING) << "Text database is too new.";
    162     return false;
    163   }
    164 
    165   return CreateTables();
    166 }
    167 
    168 void TextDatabase::BeginTransaction() {
    169   db_.BeginTransaction();
    170 }
    171 
    172 void TextDatabase::CommitTransaction() {
    173   db_.CommitTransaction();
    174 }
    175 
    176 bool TextDatabase::CreateTables() {
    177   // FTS table of page contents.
    178   if (!db_.DoesTableExist("pages")) {
    179     if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3("
    180                      "TOKENIZE icu,"
    181                      "url LONGVARCHAR,"
    182                      "title LONGVARCHAR,"
    183                      "body LONGVARCHAR)"))
    184       return false;
    185   }
    186 
    187   // Non-FTS table containing URLs and times so we can efficiently find them
    188   // using a regular index (all FTS columns are special and are treated as
    189   // full-text-search, which is not what we want when retrieving this data).
    190   if (!db_.DoesTableExist("info")) {
    191     // Note that there is no point in creating an index over time. Since
    192     // we must always query the entire FTS table (it can not efficiently do
    193     // subsets), we will always end up doing that first, and joining the info
    194     // table off of that.
    195     if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)"))
    196       return false;
    197   }
    198 
    199   // Create the index. This will fail when the index already exists, so we just
    200   // ignore the error.
    201   db_.Execute("CREATE INDEX info_time ON info(time)");
    202   return true;
    203 }
    204 
    205 bool TextDatabase::AddPageData(base::Time time,
    206                                const std::string& url,
    207                                const std::string& title,
    208                                const std::string& contents) {
    209   sql::Transaction committer(&db_);
    210   if (!committer.Begin())
    211     return false;
    212 
    213   // Add to the pages table.
    214   sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE,
    215       "INSERT INTO pages (url, title, body) VALUES (?,?,?)"));
    216   if (!add_to_pages) {
    217     NOTREACHED() << db_.GetErrorMessage();
    218     return false;
    219   }
    220   add_to_pages.BindString(0, url);
    221   add_to_pages.BindString(1, title);
    222   add_to_pages.BindString(2, contents);
    223   if (!add_to_pages.Run()) {
    224     NOTREACHED() << db_.GetErrorMessage();
    225     return false;
    226   }
    227 
    228   int64 rowid = db_.GetLastInsertRowId();
    229 
    230   // Add to the info table with the same rowid.
    231   sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE,
    232       "INSERT INTO info (rowid, time) VALUES (?,?)"));
    233   if (!add_to_info) {
    234     NOTREACHED() << db_.GetErrorMessage();
    235     return false;
    236   }
    237   add_to_info.BindInt64(0, rowid);
    238   add_to_info.BindInt64(1, time.ToInternalValue());
    239   if (!add_to_info.Run()) {
    240     NOTREACHED() << db_.GetErrorMessage();
    241     return false;
    242   }
    243 
    244   return committer.Commit();
    245 }
    246 
    247 void TextDatabase::DeletePageData(base::Time time, const std::string& url) {
    248   // First get all rows that match. Selecing on time (which has an index) allows
    249   // us to avoid brute-force searches on the full-text-index table (there will
    250   // generally be only one match per time).
    251   sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE,
    252       "SELECT info.rowid "
    253       "FROM info JOIN pages ON info.rowid = pages.rowid "
    254       "WHERE info.time=? AND pages.url=?"));
    255   if (!select_ids)
    256     return;
    257   select_ids.BindInt64(0, time.ToInternalValue());
    258   select_ids.BindString(1, url);
    259   std::set<int64> rows_to_delete;
    260   while (select_ids.Step())
    261     rows_to_delete.insert(select_ids.ColumnInt64(0));
    262 
    263   // Delete from the pages table.
    264   sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE,
    265       "DELETE FROM pages WHERE rowid=?"));
    266   if (!delete_page)
    267     return;
    268   for (std::set<int64>::const_iterator i = rows_to_delete.begin();
    269        i != rows_to_delete.end(); ++i) {
    270     delete_page.BindInt64(0, *i);
    271     if (!delete_page.Run()) {
    272       NOTREACHED();
    273       return;
    274     }
    275     delete_page.Reset();
    276   }
    277 
    278   // Delete from the info table.
    279   sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE,
    280       "DELETE FROM info WHERE rowid=?"));
    281   if (!delete_info)
    282     return;
    283   for (std::set<int64>::const_iterator i = rows_to_delete.begin();
    284        i != rows_to_delete.end(); ++i) {
    285     delete_info.BindInt64(0, *i);
    286     if (!delete_info.Run()) {
    287       NOTREACHED();
    288       return;
    289     }
    290     delete_info.Reset();
    291   }
    292 }
    293 
    294 void TextDatabase::Optimize() {
    295   sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
    296       "SELECT OPTIMIZE(pages) FROM pages LIMIT 1"));
    297   if (!statement)
    298     return;
    299   statement.Run();
    300 }
    301 
    302 void TextDatabase::GetTextMatches(const std::string& query,
    303                                   const QueryOptions& options,
    304                                   std::vector<Match>* results,
    305                                   URLSet* found_urls,
    306                                   base::Time* first_time_searched) {
    307   *first_time_searched = options.begin_time;
    308 
    309   sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE,
    310     "SELECT url, title, time, offsets(pages), body "
    311         "FROM pages LEFT OUTER JOIN info ON pages.rowid = info.rowid "
    312         "WHERE pages MATCH ? AND time >= ? AND time < ? "
    313         "ORDER BY time DESC "
    314         "LIMIT ?"));
    315   if (!statement)
    316     return;
    317 
    318   // When their values indicate "unspecified", saturate the numbers to the max
    319   // or min to get the correct result.
    320   int64 effective_begin_time = options.begin_time.is_null() ?
    321       0 : options.begin_time.ToInternalValue();
    322   int64 effective_end_time = options.end_time.is_null() ?
    323       std::numeric_limits<int64>::max() : options.end_time.ToInternalValue();
    324   int effective_max_count = options.max_count ?
    325       options.max_count : std::numeric_limits<int>::max();
    326 
    327   statement.BindString(0, query);
    328   statement.BindInt64(1, effective_begin_time);
    329   statement.BindInt64(2, effective_end_time);
    330   statement.BindInt(3, effective_max_count);
    331 
    332   while (statement.Step()) {
    333     // TODO(brettw) allow canceling the query in the middle.
    334     // if (canceled_or_something)
    335     //   break;
    336 
    337     GURL url(statement.ColumnString(0));
    338     URLSet::const_iterator found_url = found_urls->find(url);
    339     if (found_url != found_urls->end())
    340       continue;  // Don't add this duplicate.
    341 
    342     // Fill the results into the vector (avoid copying the URL with Swap()).
    343     results->resize(results->size() + 1);
    344     Match& match = results->at(results->size() - 1);
    345     match.url.Swap(&url);
    346 
    347     match.title = statement.ColumnString16(1);
    348     match.time = base::Time::FromInternalValue(statement.ColumnInt64(2));
    349 
    350     // Extract any matches in the title.
    351     std::string offsets_str = statement.ColumnString(3);
    352     Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex,
    353                                    &match.title_match_positions);
    354     Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1),
    355                                          &match.title_match_positions);
    356 
    357     // Extract the matches in the body.
    358     Snippet::MatchPositions match_positions;
    359     Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex,
    360                                    &match_positions);
    361 
    362     // Compute the snippet based on those matches.
    363     std::string body = statement.ColumnString(4);
    364     match.snippet.ComputeSnippet(match_positions, body);
    365   }
    366 
    367   // When we have returned all the results possible (or determined that there
    368   // are none), then we have searched all the time requested, so we can
    369   // set the first_time_searched to that value.
    370   if (results->empty() ||
    371       options.max_count == 0 ||  // Special case for wanting all the results.
    372       static_cast<int>(results->size()) < options.max_count) {
    373     *first_time_searched = options.begin_time;
    374   } else {
    375     // Since we got the results in order, we know the last item is the last
    376     // time we considered.
    377     *first_time_searched = results->back().time;
    378   }
    379 
    380   statement.Reset();
    381 }
    382 
    383 }  // namespace history
    384