1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <limits> 6 #include <set> 7 #include <string> 8 9 #include "chrome/browser/history/text_database.h" 10 11 #include "app/sql/statement.h" 12 #include "app/sql/transaction.h" 13 #include "base/file_util.h" 14 #include "base/logging.h" 15 #include "base/metrics/histogram.h" 16 #include "base/string_number_conversions.h" 17 #include "base/string_util.h" 18 #include "base/utf_string_conversions.h" 19 #include "chrome/browser/diagnostics/sqlite_diagnostics.h" 20 21 // There are two tables in each database, one full-text search (FTS) table which 22 // indexes the contents and title of the pages. The other is a regular SQLITE 23 // table which contains non-indexed information about the page. All columns of 24 // a FTS table are indexed using the text search algorithm, which isn't what we 25 // want for things like times. If this were in the FTS table, there would be 26 // different words in the index for each time number. 27 // 28 // "pages" FTS table: 29 // url URL of the page so searches will match the URL. 30 // title Title of the page. 31 // body Body of the page. 32 // 33 // "info" regular table: 34 // time Time the corresponding FTS entry was visited. 35 // 36 // We do joins across these two tables by using their internal rowids, which we 37 // keep in sync between the two tables. The internal rowid is the only part of 38 // an FTS table that is indexed like a normal table, and the index over it is 39 // free since sqlite always indexes the internal rowid. 40 41 namespace history { 42 43 namespace { 44 45 // Version 1 uses FTS2 for index files. 46 // Version 2 uses FTS3. 47 static const int kCurrentVersionNumber = 2; 48 static const int kCompatibleVersionNumber = 2; 49 50 // Snippet computation relies on the index of the columns in the original 51 // create statement. These are the 0-based indices (as strings) of the 52 // corresponding columns. 53 const char kTitleColumnIndex[] = "1"; 54 const char kBodyColumnIndex[] = "2"; 55 56 // The string prepended to the database identifier to generate the filename. 57 const FilePath::CharType kFilePrefix[] = FILE_PATH_LITERAL("History Index "); 58 59 } // namespace 60 61 TextDatabase::Match::Match() {} 62 63 TextDatabase::Match::~Match() {} 64 65 TextDatabase::TextDatabase(const FilePath& path, 66 DBIdent id, 67 bool allow_create) 68 : path_(path), 69 ident_(id), 70 allow_create_(allow_create) { 71 // Compute the file name. 72 file_name_ = path_.Append(IDToFileName(ident_)); 73 } 74 75 TextDatabase::~TextDatabase() { 76 } 77 78 // static 79 const FilePath::CharType* TextDatabase::file_base() { 80 return kFilePrefix; 81 } 82 83 // static 84 FilePath TextDatabase::IDToFileName(DBIdent id) { 85 // Identifiers are intended to be a combination of the year and month, for 86 // example, 200801 for January 2008. We convert this to 87 // "History Index 2008-01". However, we don't make assumptions about this 88 // scheme: the caller should assign IDs as it feels fit with the knowledge 89 // that they will apppear on disk in this form. 90 FilePath::StringType filename(file_base()); 91 base::StringAppendF(&filename, FILE_PATH_LITERAL("%d-%02d"), 92 id / 100, id % 100); 93 return FilePath(filename); 94 } 95 96 // static 97 TextDatabase::DBIdent TextDatabase::FileNameToID(const FilePath& file_path) { 98 FilePath::StringType file_name = file_path.BaseName().value(); 99 100 // We don't actually check the prefix here. Since the file system could 101 // be case insensitive in ways we can't predict (NTFS), checking could 102 // potentially be the wrong thing to do. Instead, we just look for a suffix. 103 static const size_t kIDStringLength = 7; // Room for "xxxx-xx". 104 if (file_name.length() < kIDStringLength) 105 return 0; 106 const FilePath::StringType suffix( 107 &file_name[file_name.length() - kIDStringLength]); 108 109 if (suffix.length() != kIDStringLength || 110 suffix[4] != FILE_PATH_LITERAL('-')) { 111 return 0; 112 } 113 114 int year, month; 115 base::StringToInt(suffix.begin(), suffix.begin() + 4, &year); 116 base::StringToInt(suffix.begin() + 5, suffix.begin() + 7, &month); 117 118 return year * 100 + month; 119 } 120 121 bool TextDatabase::Init() { 122 // Make sure, if we're not allowed to create the file, that it exists. 123 if (!allow_create_) { 124 if (!file_util::PathExists(file_name_)) 125 return false; 126 } 127 128 // Set the exceptional sqlite error handler. 129 db_.set_error_delegate(GetErrorHandlerForTextDb()); 130 131 // Set the database page size to something a little larger to give us 132 // better performance (we're typically seek rather than bandwidth limited). 133 // This only has an effect before any tables have been created, otherwise 134 // this is a NOP. Must be a power of 2 and a max of 8192. 135 db_.set_page_size(4096); 136 137 // The default cache size is 2000 which give >8MB of data. Since we will often 138 // have 2-3 of these objects, each with their own 8MB, this adds up very fast. 139 // We therefore reduce the size so when there are multiple objects, we're not 140 // too big. 141 db_.set_cache_size(512); 142 143 // Run the database in exclusive mode. Nobody else should be accessing the 144 // database while we're running, and this will give somewhat improved perf. 145 db_.set_exclusive_locking(); 146 147 // Attach the database to our index file. 148 if (!db_.Open(file_name_)) 149 return false; 150 151 // Meta table tracking version information. 152 if (!meta_table_.Init(&db_, kCurrentVersionNumber, kCompatibleVersionNumber)) 153 return false; 154 if (meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber) { 155 // This version is too new. We don't bother notifying the user on this 156 // error, and just fail to use the file. Normally if they have version skew, 157 // they will get it for the main history file and it won't be necessary 158 // here. If that's not the case, since this is only indexed data, it's 159 // probably better to just not give FTS results than strange errors when 160 // everything else is working OK. 161 LOG(WARNING) << "Text database is too new."; 162 return false; 163 } 164 165 return CreateTables(); 166 } 167 168 void TextDatabase::BeginTransaction() { 169 db_.BeginTransaction(); 170 } 171 172 void TextDatabase::CommitTransaction() { 173 db_.CommitTransaction(); 174 } 175 176 bool TextDatabase::CreateTables() { 177 // FTS table of page contents. 178 if (!db_.DoesTableExist("pages")) { 179 if (!db_.Execute("CREATE VIRTUAL TABLE pages USING fts3(" 180 "TOKENIZE icu," 181 "url LONGVARCHAR," 182 "title LONGVARCHAR," 183 "body LONGVARCHAR)")) 184 return false; 185 } 186 187 // Non-FTS table containing URLs and times so we can efficiently find them 188 // using a regular index (all FTS columns are special and are treated as 189 // full-text-search, which is not what we want when retrieving this data). 190 if (!db_.DoesTableExist("info")) { 191 // Note that there is no point in creating an index over time. Since 192 // we must always query the entire FTS table (it can not efficiently do 193 // subsets), we will always end up doing that first, and joining the info 194 // table off of that. 195 if (!db_.Execute("CREATE TABLE info(time INTEGER NOT NULL)")) 196 return false; 197 } 198 199 // Create the index. This will fail when the index already exists, so we just 200 // ignore the error. 201 db_.Execute("CREATE INDEX info_time ON info(time)"); 202 return true; 203 } 204 205 bool TextDatabase::AddPageData(base::Time time, 206 const std::string& url, 207 const std::string& title, 208 const std::string& contents) { 209 sql::Transaction committer(&db_); 210 if (!committer.Begin()) 211 return false; 212 213 // Add to the pages table. 214 sql::Statement add_to_pages(db_.GetCachedStatement(SQL_FROM_HERE, 215 "INSERT INTO pages (url, title, body) VALUES (?,?,?)")); 216 if (!add_to_pages) { 217 NOTREACHED() << db_.GetErrorMessage(); 218 return false; 219 } 220 add_to_pages.BindString(0, url); 221 add_to_pages.BindString(1, title); 222 add_to_pages.BindString(2, contents); 223 if (!add_to_pages.Run()) { 224 NOTREACHED() << db_.GetErrorMessage(); 225 return false; 226 } 227 228 int64 rowid = db_.GetLastInsertRowId(); 229 230 // Add to the info table with the same rowid. 231 sql::Statement add_to_info(db_.GetCachedStatement(SQL_FROM_HERE, 232 "INSERT INTO info (rowid, time) VALUES (?,?)")); 233 if (!add_to_info) { 234 NOTREACHED() << db_.GetErrorMessage(); 235 return false; 236 } 237 add_to_info.BindInt64(0, rowid); 238 add_to_info.BindInt64(1, time.ToInternalValue()); 239 if (!add_to_info.Run()) { 240 NOTREACHED() << db_.GetErrorMessage(); 241 return false; 242 } 243 244 return committer.Commit(); 245 } 246 247 void TextDatabase::DeletePageData(base::Time time, const std::string& url) { 248 // First get all rows that match. Selecing on time (which has an index) allows 249 // us to avoid brute-force searches on the full-text-index table (there will 250 // generally be only one match per time). 251 sql::Statement select_ids(db_.GetCachedStatement(SQL_FROM_HERE, 252 "SELECT info.rowid " 253 "FROM info JOIN pages ON info.rowid = pages.rowid " 254 "WHERE info.time=? AND pages.url=?")); 255 if (!select_ids) 256 return; 257 select_ids.BindInt64(0, time.ToInternalValue()); 258 select_ids.BindString(1, url); 259 std::set<int64> rows_to_delete; 260 while (select_ids.Step()) 261 rows_to_delete.insert(select_ids.ColumnInt64(0)); 262 263 // Delete from the pages table. 264 sql::Statement delete_page(db_.GetCachedStatement(SQL_FROM_HERE, 265 "DELETE FROM pages WHERE rowid=?")); 266 if (!delete_page) 267 return; 268 for (std::set<int64>::const_iterator i = rows_to_delete.begin(); 269 i != rows_to_delete.end(); ++i) { 270 delete_page.BindInt64(0, *i); 271 if (!delete_page.Run()) { 272 NOTREACHED(); 273 return; 274 } 275 delete_page.Reset(); 276 } 277 278 // Delete from the info table. 279 sql::Statement delete_info(db_.GetCachedStatement(SQL_FROM_HERE, 280 "DELETE FROM info WHERE rowid=?")); 281 if (!delete_info) 282 return; 283 for (std::set<int64>::const_iterator i = rows_to_delete.begin(); 284 i != rows_to_delete.end(); ++i) { 285 delete_info.BindInt64(0, *i); 286 if (!delete_info.Run()) { 287 NOTREACHED(); 288 return; 289 } 290 delete_info.Reset(); 291 } 292 } 293 294 void TextDatabase::Optimize() { 295 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE, 296 "SELECT OPTIMIZE(pages) FROM pages LIMIT 1")); 297 if (!statement) 298 return; 299 statement.Run(); 300 } 301 302 void TextDatabase::GetTextMatches(const std::string& query, 303 const QueryOptions& options, 304 std::vector<Match>* results, 305 URLSet* found_urls, 306 base::Time* first_time_searched) { 307 *first_time_searched = options.begin_time; 308 309 sql::Statement statement(db_.GetCachedStatement(SQL_FROM_HERE, 310 "SELECT url, title, time, offsets(pages), body " 311 "FROM pages LEFT OUTER JOIN info ON pages.rowid = info.rowid " 312 "WHERE pages MATCH ? AND time >= ? AND time < ? " 313 "ORDER BY time DESC " 314 "LIMIT ?")); 315 if (!statement) 316 return; 317 318 // When their values indicate "unspecified", saturate the numbers to the max 319 // or min to get the correct result. 320 int64 effective_begin_time = options.begin_time.is_null() ? 321 0 : options.begin_time.ToInternalValue(); 322 int64 effective_end_time = options.end_time.is_null() ? 323 std::numeric_limits<int64>::max() : options.end_time.ToInternalValue(); 324 int effective_max_count = options.max_count ? 325 options.max_count : std::numeric_limits<int>::max(); 326 327 statement.BindString(0, query); 328 statement.BindInt64(1, effective_begin_time); 329 statement.BindInt64(2, effective_end_time); 330 statement.BindInt(3, effective_max_count); 331 332 while (statement.Step()) { 333 // TODO(brettw) allow canceling the query in the middle. 334 // if (canceled_or_something) 335 // break; 336 337 GURL url(statement.ColumnString(0)); 338 URLSet::const_iterator found_url = found_urls->find(url); 339 if (found_url != found_urls->end()) 340 continue; // Don't add this duplicate. 341 342 // Fill the results into the vector (avoid copying the URL with Swap()). 343 results->resize(results->size() + 1); 344 Match& match = results->at(results->size() - 1); 345 match.url.Swap(&url); 346 347 match.title = statement.ColumnString16(1); 348 match.time = base::Time::FromInternalValue(statement.ColumnInt64(2)); 349 350 // Extract any matches in the title. 351 std::string offsets_str = statement.ColumnString(3); 352 Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex, 353 &match.title_match_positions); 354 Snippet::ConvertMatchPositionsToWide(statement.ColumnString(1), 355 &match.title_match_positions); 356 357 // Extract the matches in the body. 358 Snippet::MatchPositions match_positions; 359 Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex, 360 &match_positions); 361 362 // Compute the snippet based on those matches. 363 std::string body = statement.ColumnString(4); 364 match.snippet.ComputeSnippet(match_positions, body); 365 } 366 367 // When we have returned all the results possible (or determined that there 368 // are none), then we have searched all the time requested, so we can 369 // set the first_time_searched to that value. 370 if (results->empty() || 371 options.max_count == 0 || // Special case for wanting all the results. 372 static_cast<int>(results->size()) < options.max_count) { 373 *first_time_searched = options.begin_time; 374 } else { 375 // Since we got the results in order, we know the last item is the last 376 // time we considered. 377 *first_time_searched = results->back().time; 378 } 379 380 statement.Reset(); 381 } 382 383 } // namespace history 384