1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_HISTORY_TEXT_DATABASE_MANAGER_H_ 6 #define CHROME_BROWSER_HISTORY_TEXT_DATABASE_MANAGER_H_ 7 #pragma once 8 9 #include <set> 10 #include <vector> 11 12 #include "base/basictypes.h" 13 #include "base/file_path.h" 14 #include "base/gtest_prod_util.h" 15 #include "base/string16.h" 16 #include "base/task.h" 17 #include "chrome/browser/history/history_types.h" 18 #include "chrome/browser/history/text_database.h" 19 #include "chrome/browser/history/query_parser.h" 20 #include "chrome/browser/history/url_database.h" 21 #include "content/common/mru_cache.h" 22 23 namespace history { 24 25 class HistoryPublisher; 26 class VisitDatabase; 27 28 // Manages a set of text databases representing different time periods. This 29 // will page them in and out as necessary, and will manage queries for times 30 // spanning multiple databases. 31 // 32 // It will also keep a list of partial changes, such as page adds and title and 33 // body sets, all of which come in at different times for a given page. When 34 // all data is received or enough time has elapsed since adding, the indexed 35 // data will be comitted. 36 // 37 // This allows us to minimize inserts and modifications, which are slow for the 38 // full text database, since each page's information is added exactly once. 39 // 40 // Note: be careful to delete the relevant entries from this uncommitted list 41 // when clearing history or this information may get added to the database soon 42 // after the clear. 43 class TextDatabaseManager { 44 public: 45 // Tracks a set of changes (only deletes need to be supported now) to the 46 // databases. This is opaque to the caller, but allows it to pass back a list 47 // of all database that it has caused a change to. 48 // 49 // This is necessary for the feature where we optimize full text databases 50 // which have changed as a result of the user deleting history via 51 // OptimizeChangedDatabases. We want to do each affected database only once at 52 // the end of the delete, but we don't want the caller to have to worry about 53 // our internals. 54 class ChangeSet { 55 public: 56 ChangeSet(); 57 ~ChangeSet(); 58 59 private: 60 friend class TextDatabaseManager; 61 62 typedef std::set<TextDatabase::DBIdent> DBSet; 63 64 void Add(TextDatabase::DBIdent id) { changed_databases_.insert(id); } 65 66 DBSet changed_databases_; 67 }; 68 69 // You must call Init() to complete initialization. 70 // 71 // |dir| is the directory that will hold the full text database files (there 72 // will be many files named by their date ranges). 73 // 74 // The visit database is a pointer owned by the caller for the main database 75 // (of recent visits). The visit database will be updated to refer to the 76 // added text database entries. 77 explicit TextDatabaseManager(const FilePath& dir, 78 URLDatabase* url_database, 79 VisitDatabase* visit_database); 80 ~TextDatabaseManager(); 81 82 // Must call before using other functions. If it returns false, no other 83 // functions should be called. 84 bool Init(const HistoryPublisher* history_publisher); 85 86 // Returns the directory that holds the full text database files. 87 const FilePath& GetDir() { return dir_; } 88 89 // Allows scoping updates. This also allows things to go faster since every 90 // page add doesn't need to be committed to disk (slow). Note that files will 91 // still get created during a transaction. 92 void BeginTransaction(); 93 void CommitTransaction(); 94 95 // Sets specific information for the given page to be added to the database. 96 // In normal operation, URLs will be added as the user visits them, the titles 97 // and bodies will come in some time after that. These changes will be 98 // automatically coalesced and added to the database some time in the future 99 // using AddPageData(). 100 // 101 // AddPageURL must be called for a given URL (+ its corresponding ID) before 102 // either the title or body set. The visit ID specifies the visit that will 103 // get updated to refer to the full text indexed information. The visit time 104 // should be the time corresponding to that visit in the database. 105 void AddPageURL(const GURL& url, URLID url_id, VisitID visit_id, 106 base::Time visit_time); 107 void AddPageTitle(const GURL& url, const string16& title); 108 void AddPageContents(const GURL& url, const string16& body); 109 110 // Adds the given data to the appropriate database file, returning true on 111 // success. The visit database row identified by |visit_id| will be updated 112 // to refer to the full text index entry. If the visit ID is 0, the visit 113 // database will not be updated. 114 bool AddPageData(const GURL& url, 115 URLID url_id, 116 VisitID visit_id, 117 base::Time visit_time, 118 const string16& title, 119 const string16& body); 120 121 // Deletes the instance of indexed data identified by the given time and URL. 122 // Any changes will be tracked in the optional change set for use when calling 123 // OptimizeChangedDatabases later. change_set can be NULL. 124 void DeletePageData(base::Time time, const GURL& url, 125 ChangeSet* change_set); 126 127 // The text database manager keeps a list of changes that are made to the 128 // file AddPageURL/Title/Body that may not be committed to the database yet. 129 // This function removes entires from this list happening between the given 130 // time range. It is called when the user clears their history for a time 131 // range, and we don't want any of our data to "leak." If restrict_urls is 132 // not empty, only changes on those URLs are deleted. 133 // 134 // Either or both times my be is_null to be unbounded in that direction. When 135 // non-null, the range is [begin, end). 136 void DeleteFromUncommitted(const std::set<GURL>& restrict_urls, 137 base::Time begin, base::Time end); 138 139 // Deletes all full text search data by removing the files from the disk. 140 // This must be called OUTSIDE of a transaction since it actually deletes the 141 // files rather than messing with the database. 142 void DeleteAll(); 143 144 // Calls optimize on all the databases identified in a given change set (see 145 // the definition of ChangeSet above for more). Optimizing means that old data 146 // will be removed rather than marked unused. 147 void OptimizeChangedDatabases(const ChangeSet& change_set); 148 149 // Executes the given query. See QueryOptions for more info on input. 150 // 151 // The results are filled into |results|, and the first time considered for 152 // the output is in |first_time_searched| (see QueryResults for more). 153 // 154 // This function will return more than one match per URL if there is more than 155 // one entry for that URL in the database. 156 void GetTextMatches(const string16& query, 157 const QueryOptions& options, 158 std::vector<TextDatabase::Match>* results, 159 base::Time* first_time_searched); 160 161 private: 162 // These tests call ExpireRecentChangesForTime to force expiration. 163 FRIEND_TEST_ALL_PREFIXES(TextDatabaseManagerTest, InsertPartial); 164 FRIEND_TEST_ALL_PREFIXES(TextDatabaseManagerTest, PartialComplete); 165 FRIEND_TEST_ALL_PREFIXES(ExpireHistoryTest, DeleteURLAndFavicon); 166 FRIEND_TEST_ALL_PREFIXES(ExpireHistoryTest, FlushRecentURLsUnstarred); 167 FRIEND_TEST_ALL_PREFIXES(ExpireHistoryTest, 168 FlushRecentURLsUnstarredRestricted); 169 170 // Stores "recent stuff" that has happened with the page, since the page 171 // visit, title, and body all come in at different times. 172 class PageInfo { 173 public: 174 PageInfo(URLID url_id, VisitID visit_id, base::Time visit_time); 175 ~PageInfo(); 176 177 // Getters. 178 URLID url_id() const { return url_id_; } 179 VisitID visit_id() const { return visit_id_; } 180 base::Time visit_time() const { return visit_time_; } 181 const string16& title() const { return title_; } 182 const string16& body() const { return body_; } 183 184 // Setters, we can only update the title and body. 185 void set_title(const string16& ttl); 186 void set_body(const string16& bdy); 187 188 // Returns true if both the title or body of the entry has been set. Since 189 // both the title and body setters will "fix" empty strings to be a space, 190 // these indicate if the setter was ever called. 191 bool has_title() const { return !title_.empty(); } 192 bool has_body() { return !body_.empty(); } 193 194 // Returns true if this entry was added too long ago and we should give up 195 // waiting for more data. The current time is passed in as an argument so we 196 // can check many without re-querying the timer. 197 bool Expired(base::TimeTicks now) const; 198 199 private: 200 URLID url_id_; 201 VisitID visit_id_; 202 203 // Time of the visit of the URL. This will be the value stored in the URL 204 // and visit tables for the entry. 205 base::Time visit_time_; 206 207 // When this page entry was created. We have a cap on the maximum time that 208 // an entry will be in the queue before being flushed to the database. 209 base::TimeTicks added_time_; 210 211 // Will be the string " " when they are set to distinguish set and unset. 212 string16 title_; 213 string16 body_; 214 }; 215 216 // Converts the given time to a database identifier or vice-versa. 217 static TextDatabase::DBIdent TimeToID(base::Time time); 218 static base::Time IDToTime(TextDatabase::DBIdent id); 219 220 // Returns a text database for the given identifier or time. This file will 221 // be created if it doesn't exist and |for_writing| is set. On error, 222 // including the case where the file doesn't exist and |for_writing| 223 // is false, it will return NULL. 224 // 225 // When |for_writing| is set, a transaction on the database will be opened 226 // if there is a transaction open on this manager. 227 // 228 // The pointer will be tracked in the cache. The caller should not store it 229 // or delete it since it will get automatically deleted as necessary. 230 TextDatabase* GetDB(TextDatabase::DBIdent id, bool for_writing); 231 TextDatabase* GetDBForTime(base::Time time, bool for_writing); 232 233 // Populates the present_databases_ list based on which files are on disk. 234 // When the list is already initialized, this will do nothing, so you can 235 // call it whenever you want to ensure the present_databases_ set is filled. 236 void InitDBList(); 237 238 // Schedules a call to ExpireRecentChanges in the future. 239 void ScheduleFlushOldChanges(); 240 241 // Checks the recent_changes_ list and commits partial data that has been 242 // around too long. 243 void FlushOldChanges(); 244 245 // Given "now," this will expire old things from the recent_changes_ list. 246 // This is used as the backend for FlushOldChanges and is called directly 247 // by the unit tests with fake times. 248 void FlushOldChangesForTime(base::TimeTicks now); 249 250 // Directory holding our index files. 251 const FilePath dir_; 252 253 // Non-owning pointers to the recent history databases for URLs and visits. 254 URLDatabase* url_database_; 255 VisitDatabase* visit_database_; 256 257 // Lists recent additions that we have not yet filled out with the title and 258 // body. Sorted by time, we will flush them when they are complete or have 259 // been in the queue too long without modification. 260 // 261 // We kind of abuse the MRUCache because we never move things around in it 262 // using Get. Instead, we keep them in the order they were inserted, since 263 // this is the metric we use to measure age. The MRUCache gives us an ordered 264 // list with fast lookup by URL. 265 typedef MRUCache<GURL, PageInfo> RecentChangeList; 266 RecentChangeList recent_changes_; 267 268 // Nesting levels of transactions. Since sqlite only allows one open 269 // transaction, we simulate nested transactions by mapping the outermost one 270 // to a real transaction. Since this object never needs to do ROLLBACK, losing 271 // the ability for all transactions to rollback is inconsequential. 272 int transaction_nesting_; 273 274 // The cache owns the TextDatabase pointers, they will be automagically 275 // deleted when the cache entry is removed or expired. 276 typedef OwningMRUCache<TextDatabase::DBIdent, TextDatabase*> DBCache; 277 DBCache db_cache_; 278 279 // Tells us about the existence of database files on disk. All existing 280 // databases will be in here, and non-existant ones will not, so we don't 281 // have to check the disk every time. 282 // 283 // This set is populated LAZILY by InitDBList(), you should call that function 284 // before accessing the list. 285 // 286 // Note that iterators will work on the keys in-order. Normally, reverse 287 // iterators will be used to iterate the keys in reverse-order. 288 typedef std::set<TextDatabase::DBIdent> DBIdentSet; 289 DBIdentSet present_databases_; 290 bool present_databases_loaded_; // Set by InitDBList when populated. 291 292 // Lists all databases with open transactions. These will have to be closed 293 // when the transaction is committed. 294 DBIdentSet open_transactions_; 295 296 QueryParser query_parser_; 297 298 // Generates tasks for our periodic checking of expired "recent changes". 299 ScopedRunnableMethodFactory<TextDatabaseManager> factory_; 300 301 // This object is created and managed by the history backend. We maintain an 302 // opaque pointer to the object for our use. 303 // This can be NULL if there are no indexers registered to receive indexing 304 // data from us. 305 const HistoryPublisher* history_publisher_; 306 307 DISALLOW_COPY_AND_ASSIGN(TextDatabaseManager); 308 }; 309 310 } // namespace history 311 312 #endif // CHROME_BROWSER_HISTORY_TEXT_DATABASE_MANAGER_H_ 313