1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ 6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ 7 8 #include <set> 9 #include <string> 10 11 #include "base/files/file_path.h" 12 #include "base/gtest_prod_util.h" 13 #include "base/memory/ref_counted.h" 14 #include "chrome/browser/common/cancelable_request.h" 15 #include "chrome/browser/history/history_service.h" 16 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" 17 #include "chrome/browser/history/in_memory_url_index_types.h" 18 #include "chrome/browser/history/scored_history_match.h" 19 20 class HistoryQuickProviderTest; 21 22 namespace in_memory_url_index { 23 class InMemoryURLIndexCacheItem; 24 } 25 26 namespace history { 27 28 namespace imui = in_memory_url_index; 29 30 class HistoryClient; 31 class HistoryDatabase; 32 class InMemoryURLIndex; 33 class RefCountedBool; 34 35 // Current version of the cache file. 36 static const int kCurrentCacheFileVersion = 5; 37 38 // A structure private to InMemoryURLIndex describing its internal data and 39 // providing for restoring, rebuilding and updating that internal data. As 40 // this class is for exclusive use by the InMemoryURLIndex class there should 41 // be no calls from any other class. 42 // 43 // All public member functions are called on the main thread unless otherwise 44 // annotated. 45 class URLIndexPrivateData 46 : public base::RefCountedThreadSafe<URLIndexPrivateData> { 47 public: 48 URLIndexPrivateData(); 49 50 // Given a base::string16 in |term_string|, scans the history index and 51 // returns a vector with all scored, matching history items. The 52 // |term_string| is broken down into individual terms (words), each of which 53 // must occur in the candidate history item's URL or page title for the item 54 // to qualify; however, the terms do not necessarily have to be adjacent. We 55 // also allow breaking |term_string| at |cursor_position| (if 56 // set). Once we have a set of candidates, they are filtered to ensure 57 // that all |term_string| terms, as separated by whitespace and the 58 // cursor (if set), occur within the candidate's URL or page title. 59 // Scores are then calculated on no more than |kItemsToScoreLimit| 60 // candidates, as the scoring of such a large number of candidates may 61 // cause perceptible typing response delays in the omnibox. This is 62 // likely to occur for short omnibox terms such as 'h' and 'w' which 63 // will be found in nearly all history candidates. Results are sorted by 64 // descending score. The full results set (i.e. beyond the 65 // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls 66 // to this function. |history_client| is used to boost a result's score if 67 // its URL is referenced by one or more of the user's bookmarks. |languages| 68 // is used to help parse/format the URLs in the history index. In total, 69 // |max_matches| of items will be returned in the |ScoredHistoryMatches| 70 // vector. 71 ScoredHistoryMatches HistoryItemsForTerms(base::string16 term_string, 72 size_t cursor_position, 73 size_t max_matches, 74 const std::string& languages, 75 HistoryClient* history_client); 76 77 // Adds the history item in |row| to the index if it does not already already 78 // exist and it meets the minimum 'quick' criteria. If the row already exists 79 // in the index then the index will be updated if the row still meets the 80 // criteria, otherwise the row will be removed from the index. Returns true 81 // if the index was actually updated. |languages| gives a list of language 82 // encodings by which the URLs and page titles are broken down into words and 83 // characters. |scheme_whitelist| is used to filter non-qualifying schemes. 84 // |history_service| is used to schedule an update to the recent visits 85 // component of this URL's entry in the index. 86 bool UpdateURL(HistoryService* history_service, 87 const URLRow& row, 88 const std::string& languages, 89 const std::set<std::string>& scheme_whitelist); 90 91 // Updates the entry for |url_id| in the index, replacing its 92 // recent visits information with |recent_visits|. If |url_id| 93 // is not in the index, does nothing. 94 void UpdateRecentVisits(URLID url_id, 95 const VisitVector& recent_visits); 96 97 // Using |history_service| schedules an update (using the historyDB 98 // thread) for the recent visits information for |url_id|. Unless 99 // something unexpectedly goes wrong, UdpateRecentVisits() should 100 // eventually be called from a callback. 101 void ScheduleUpdateRecentVisits(HistoryService* history_service, 102 URLID url_id); 103 104 // Deletes index data for the history item with the given |url|. 105 // The item may not have actually been indexed, which is the case if it did 106 // not previously meet minimum 'quick' criteria. Returns true if the index 107 // was actually updated. 108 bool DeleteURL(const GURL& url); 109 110 // Constructs a new object by restoring its contents from the cache file 111 // at |path|. Returns the new URLIndexPrivateData which on success will 112 // contain the restored data but upon failure will be empty. |languages| 113 // is used to break URLs and page titles into words. This function 114 // should be run on the the file thread. 115 static scoped_refptr<URLIndexPrivateData> RestoreFromFile( 116 const base::FilePath& path, 117 const std::string& languages); 118 119 // Constructs a new object by rebuilding its contents from the history 120 // database in |history_db|. Returns the new URLIndexPrivateData which on 121 // success will contain the rebuilt data but upon failure will be empty. 122 // |languages| gives a list of language encodings by which the URLs and page 123 // titles are broken down into words and characters. 124 static scoped_refptr<URLIndexPrivateData> RebuildFromHistory( 125 HistoryDatabase* history_db, 126 const std::string& languages, 127 const std::set<std::string>& scheme_whitelist); 128 129 // Writes |private_data| as a cache file to |file_path| and returns success. 130 static bool WritePrivateDataToCacheFileTask( 131 scoped_refptr<URLIndexPrivateData> private_data, 132 const base::FilePath& file_path); 133 134 // Stops all pending updates to recent visits fields. This should be 135 // called during shutdown. 136 void CancelPendingUpdates(); 137 138 // Creates a copy of ourself. 139 scoped_refptr<URLIndexPrivateData> Duplicate() const; 140 141 // Returns true if there is no data in the index. 142 bool Empty() const; 143 144 // Initializes all index data members in preparation for restoring the index 145 // from the cache or a complete rebuild from the history database. 146 void Clear(); 147 148 private: 149 friend class base::RefCountedThreadSafe<URLIndexPrivateData>; 150 ~URLIndexPrivateData(); 151 152 friend class AddHistoryMatch; 153 friend class ::HistoryQuickProviderTest; 154 friend class InMemoryURLIndexTest; 155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); 156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet); 157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory); 158 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld); 159 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); 160 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); 161 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); 162 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs); 163 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); 164 165 // Support caching of term results so that we can optimize searches which 166 // build upon a previous search. Each entry in this map represents one 167 // search term from the most recent search. For example, if the user had 168 // typed "google blog trans" and then typed an additional 'l' (at the end, 169 // of course) then there would be four items in the cache: 'blog', 'google', 170 // 'trans', and 'transl'. All would be marked as being in use except for the 171 // 'trans' item; its cached data would have been used when optimizing the 172 // construction of the search results candidates for 'transl' but then would 173 // no longer needed. 174 // 175 // Items stored in the search term cache. If a search term exactly matches one 176 // in the cache then we can quickly supply the proper |history_id_set_| (and 177 // marking the cache item as being |used_|. If we find a prefix for a search 178 // term in the cache (which is very likely to occur as the user types each 179 // term into the omnibox) then we can short-circuit the index search for those 180 // characters in the prefix by returning the |word_id_set|. In that case we do 181 // not mark the item as being |used_|. 182 struct SearchTermCacheItem { 183 SearchTermCacheItem(const WordIDSet& word_id_set, 184 const HistoryIDSet& history_id_set); 185 // Creates a cache item for a term which has no results. 186 SearchTermCacheItem(); 187 188 ~SearchTermCacheItem(); 189 190 WordIDSet word_id_set_; 191 HistoryIDSet history_id_set_; 192 bool used_; // True if this item has been used for the current term search. 193 }; 194 typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap; 195 196 // A helper class which performs the final filter on each candidate 197 // history URL match, inserting accepted matches into |scored_matches_|. 198 class AddHistoryMatch : public std::unary_function<HistoryID, void> { 199 public: 200 AddHistoryMatch(const URLIndexPrivateData& private_data, 201 const std::string& languages, 202 HistoryClient* history_client, 203 const base::string16& lower_string, 204 const String16Vector& lower_terms, 205 const base::Time now); 206 ~AddHistoryMatch(); 207 208 void operator()(const HistoryID history_id); 209 210 ScoredHistoryMatches ScoredMatches() const { return scored_matches_; } 211 212 private: 213 const URLIndexPrivateData& private_data_; 214 const std::string& languages_; 215 HistoryClient* history_client_; 216 ScoredHistoryMatches scored_matches_; 217 const base::string16& lower_string_; 218 const String16Vector& lower_terms_; 219 WordStarts lower_terms_to_word_starts_offsets_; 220 const base::Time now_; 221 }; 222 223 // A helper predicate class used to filter excess history items when the 224 // candidate results set is too large. 225 class HistoryItemFactorGreater 226 : public std::binary_function<HistoryID, HistoryID, void> { 227 public: 228 explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map); 229 ~HistoryItemFactorGreater(); 230 231 bool operator()(const HistoryID h1, const HistoryID h2); 232 233 private: 234 const history::HistoryInfoMap& history_info_map_; 235 }; 236 237 // URL History indexing support functions. 238 239 // Composes a set of history item IDs by intersecting the set for each word 240 // in |unsorted_words|. 241 HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words); 242 243 // Helper function to HistoryIDSetFromWords which composes a set of history 244 // ids for the given term given in |term|. 245 HistoryIDSet HistoryIDsForTerm(const base::string16& term); 246 247 // Given a set of Char16s, finds words containing those characters. 248 WordIDSet WordIDSetForTermChars(const Char16Set& term_chars); 249 250 // Indexes one URL history item as described by |row|. Returns true if the 251 // row was actually indexed. |languages| gives a list of language encodings by 252 // which the URLs and page titles are broken down into words and characters. 253 // |scheme_whitelist| is used to filter non-qualifying schemes. If 254 // |history_db| is not NULL then this function uses the history database 255 // synchronously to get the URL's recent visits information. This mode should 256 // only be used on the historyDB thread. If |history_db| is NULL, then 257 // this function uses |history_service| to schedule a task on the 258 // historyDB thread to fetch and update the recent visits 259 // information. 260 bool IndexRow(HistoryDatabase* history_db, 261 HistoryService* history_service, 262 const URLRow& row, 263 const std::string& languages, 264 const std::set<std::string>& scheme_whitelist); 265 266 // Parses and indexes the words in the URL and page title of |row| and 267 // calculate the word starts in each, saving the starts in |word_starts|. 268 // |languages| gives a list of language encodings by which the URLs and page 269 // titles are broken down into words and characters. 270 void AddRowWordsToIndex(const URLRow& row, 271 RowWordStarts* word_starts, 272 const std::string& languages); 273 274 // Given a single word in |uni_word|, adds a reference for the containing 275 // history item identified by |history_id| to the index. 276 void AddWordToIndex(const base::string16& uni_word, HistoryID history_id); 277 278 // Creates a new entry in the word/history map for |word_id| and add 279 // |history_id| as the initial element of the word's set. 280 void AddWordHistory(const base::string16& uni_word, HistoryID history_id); 281 282 // Updates an existing entry in the word/history index by adding the 283 // |history_id| to set for |word_id| in the word_id_history_map_. 284 void UpdateWordHistory(WordID word_id, HistoryID history_id); 285 286 // Adds |word_id| to |history_id|'s entry in the history/word map, 287 // creating a new entry if one does not already exist. 288 void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id); 289 290 // Removes |row| and all associated words and characters from the index. 291 void RemoveRowFromIndex(const URLRow& row); 292 293 // Removes all words and characters associated with |row| from the index. 294 void RemoveRowWordsFromIndex(const URLRow& row); 295 296 // Clears |used_| for each item in the search term cache. 297 void ResetSearchTermCache(); 298 299 // Caches the index private data and writes the cache file to the profile 300 // directory. Called by WritePrivateDataToCacheFileTask. 301 bool SaveToFile(const base::FilePath& file_path); 302 303 // Encode a data structure into the protobuf |cache|. 304 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const; 305 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const; 306 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const; 307 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const; 308 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const; 309 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const; 310 void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const; 311 312 // Decode a data structure from the protobuf |cache|. Return false if there 313 // is any kind of failure. |languages| will be used to break URLs and page 314 // titles into words 315 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache, 316 const std::string& languages); 317 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache); 318 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache); 319 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache); 320 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache); 321 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache); 322 bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache, 323 const std::string& languages); 324 325 // Determines if |gurl| has a whitelisted scheme and returns true if so. 326 static bool URLSchemeIsWhitelisted(const GURL& gurl, 327 const std::set<std::string>& whitelist); 328 329 // Cache of search terms. 330 SearchTermCacheMap search_term_cache_; 331 332 // Allows canceling pending requests to update recent visits information. 333 CancelableRequestConsumer recent_visits_consumer_; 334 335 // Start of data members that are cached ------------------------------------- 336 337 // The version of the cache file most recently used to restore this instance 338 // of the private data. If the private data was rebuilt from the history 339 // database this will be 0. 340 int restored_cache_version_; 341 342 // The last time the data was rebuilt from the history database. 343 base::Time last_time_rebuilt_from_history_; 344 345 // A list of all of indexed words. The index of a word in this list is the 346 // ID of the word in the word_map_. It reduces the memory overhead by 347 // replacing a potentially long and repeated string with a simple index. 348 String16Vector word_list_; 349 350 // A list of available words slots in |word_list_|. An available word slot 351 // is the index of a unused word in word_list_ vector, also referred to as 352 // a WordID. As URL visits are added or modified new words may be added to 353 // the index, in which case any available words are used, if any, and then 354 // words are added to the end of the word_list_. When URL visits are 355 // modified or deleted old words may be removed from the index, in which 356 // case the slots for those words are added to available_words_ for resuse 357 // by future URL updates. 358 WordIDSet available_words_; 359 360 // A one-to-one mapping from the a word string to its slot number (i.e. 361 // WordID) in the |word_list_|. 362 WordMap word_map_; 363 364 // A one-to-many mapping from a single character to all WordIDs of words 365 // containing that character. 366 CharWordIDMap char_word_map_; 367 368 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as 369 // used in the history database) of history items in which the word occurs. 370 WordIDHistoryMap word_id_history_map_; 371 372 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur 373 // in the URL and/or page title of the history item referenced by that 374 // HistoryID. 375 HistoryIDWordMap history_id_word_map_; 376 377 // A one-to-one mapping from HistoryID to the history item data governing 378 // index inclusion and relevance scoring. 379 HistoryInfoMap history_info_map_; 380 381 // A one-to-one mapping from HistoryID to the word starts detected in each 382 // item's URL and page title. 383 WordStartsMap word_starts_map_; 384 385 // End of data members that are cached --------------------------------------- 386 387 // For unit testing only. Specifies the version of the cache file to be saved. 388 // Used only for testing upgrading of an older version of the cache upon 389 // restore. 390 int saved_cache_version_; 391 392 // Used for unit testing only. Records the number of candidate history items 393 // at three stages in the index searching process. 394 size_t pre_filter_item_count_; // After word index is queried. 395 size_t post_filter_item_count_; // After trimming large result set. 396 size_t post_scoring_item_count_; // After performing final filter/scoring. 397 }; 398 399 } // namespace history 400 401 #endif // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ 402