Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
      6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
      7 
      8 #include <set>
      9 #include <string>
     10 
     11 #include "base/files/file_path.h"
     12 #include "base/gtest_prod_util.h"
     13 #include "base/memory/ref_counted.h"
     14 #include "chrome/browser/common/cancelable_request.h"
     15 #include "chrome/browser/history/history_service.h"
     16 #include "chrome/browser/history/in_memory_url_index_cache.pb.h"
     17 #include "chrome/browser/history/in_memory_url_index_types.h"
     18 #include "chrome/browser/history/scored_history_match.h"
     19 
     20 class HistoryQuickProviderTest;
     21 
     22 namespace in_memory_url_index {
     23 class InMemoryURLIndexCacheItem;
     24 }
     25 
     26 namespace history {
     27 
     28 namespace imui = in_memory_url_index;
     29 
     30 class HistoryClient;
     31 class HistoryDatabase;
     32 class InMemoryURLIndex;
     33 class RefCountedBool;
     34 
     35 // Current version of the cache file.
     36 static const int kCurrentCacheFileVersion = 5;
     37 
     38 // A structure private to InMemoryURLIndex describing its internal data and
     39 // providing for restoring, rebuilding and updating that internal data. As
     40 // this class is for exclusive use by the InMemoryURLIndex class there should
     41 // be no calls from any other class.
     42 //
     43 // All public member functions are called on the main thread unless otherwise
     44 // annotated.
     45 class URLIndexPrivateData
     46     : public base::RefCountedThreadSafe<URLIndexPrivateData> {
     47  public:
     48   URLIndexPrivateData();
     49 
     50   // Given a base::string16 in |term_string|, scans the history index and
     51   // returns a vector with all scored, matching history items. The
     52   // |term_string| is broken down into individual terms (words), each of which
     53   // must occur in the candidate history item's URL or page title for the item
     54   // to qualify; however, the terms do not necessarily have to be adjacent. We
     55   // also allow breaking |term_string| at |cursor_position| (if
     56   // set). Once we have a set of candidates, they are filtered to ensure
     57   // that all |term_string| terms, as separated by whitespace and the
     58   // cursor (if set), occur within the candidate's URL or page title.
     59   // Scores are then calculated on no more than |kItemsToScoreLimit|
     60   // candidates, as the scoring of such a large number of candidates may
     61   // cause perceptible typing response delays in the omnibox. This is
     62   // likely to occur for short omnibox terms such as 'h' and 'w' which
     63   // will be found in nearly all history candidates. Results are sorted by
     64   // descending score. The full results set (i.e. beyond the
     65   // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls
     66   // to this function. |history_client| is used to boost a result's score if
     67   // its URL is referenced by one or more of the user's bookmarks.  |languages|
     68   // is used to help parse/format the URLs in the history index.  In total,
     69   // |max_matches| of items will be returned in the |ScoredHistoryMatches|
     70   // vector.
     71   ScoredHistoryMatches HistoryItemsForTerms(base::string16 term_string,
     72                                             size_t cursor_position,
     73                                             size_t max_matches,
     74                                             const std::string& languages,
     75                                             HistoryClient* history_client);
     76 
     77   // Adds the history item in |row| to the index if it does not already already
     78   // exist and it meets the minimum 'quick' criteria. If the row already exists
     79   // in the index then the index will be updated if the row still meets the
     80   // criteria, otherwise the row will be removed from the index. Returns true
     81   // if the index was actually updated. |languages| gives a list of language
     82   // encodings by which the URLs and page titles are broken down into words and
     83   // characters. |scheme_whitelist| is used to filter non-qualifying schemes.
     84   // |history_service| is used to schedule an update to the recent visits
     85   // component of this URL's entry in the index.
     86   bool UpdateURL(HistoryService* history_service,
     87                  const URLRow& row,
     88                  const std::string& languages,
     89                  const std::set<std::string>& scheme_whitelist);
     90 
     91   // Updates the entry for |url_id| in the index, replacing its
     92   // recent visits information with |recent_visits|.  If |url_id|
     93   // is not in the index, does nothing.
     94   void UpdateRecentVisits(URLID url_id,
     95                           const VisitVector& recent_visits);
     96 
     97   // Using |history_service| schedules an update (using the historyDB
     98   // thread) for the recent visits information for |url_id|.  Unless
     99   // something unexpectedly goes wrong, UdpateRecentVisits() should
    100   // eventually be called from a callback.
    101   void ScheduleUpdateRecentVisits(HistoryService* history_service,
    102                                   URLID url_id);
    103 
    104   // Deletes index data for the history item with the given |url|.
    105   // The item may not have actually been indexed, which is the case if it did
    106   // not previously meet minimum 'quick' criteria. Returns true if the index
    107   // was actually updated.
    108   bool DeleteURL(const GURL& url);
    109 
    110   // Constructs a new object by restoring its contents from the cache file
    111   // at |path|. Returns the new URLIndexPrivateData which on success will
    112   // contain the restored data but upon failure will be empty.  |languages|
    113   // is used to break URLs and page titles into words.  This function
    114   // should be run on the the file thread.
    115   static scoped_refptr<URLIndexPrivateData> RestoreFromFile(
    116       const base::FilePath& path,
    117       const std::string& languages);
    118 
    119   // Constructs a new object by rebuilding its contents from the history
    120   // database in |history_db|. Returns the new URLIndexPrivateData which on
    121   // success will contain the rebuilt data but upon failure will be empty.
    122   // |languages| gives a list of language encodings by which the URLs and page
    123   // titles are broken down into words and characters.
    124   static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(
    125       HistoryDatabase* history_db,
    126       const std::string& languages,
    127       const std::set<std::string>& scheme_whitelist);
    128 
    129   // Writes |private_data| as a cache file to |file_path| and returns success.
    130   static bool WritePrivateDataToCacheFileTask(
    131       scoped_refptr<URLIndexPrivateData> private_data,
    132       const base::FilePath& file_path);
    133 
    134   // Stops all pending updates to recent visits fields.  This should be
    135   // called during shutdown.
    136   void CancelPendingUpdates();
    137 
    138   // Creates a copy of ourself.
    139   scoped_refptr<URLIndexPrivateData> Duplicate() const;
    140 
    141   // Returns true if there is no data in the index.
    142   bool Empty() const;
    143 
    144   // Initializes all index data members in preparation for restoring the index
    145   // from the cache or a complete rebuild from the history database.
    146   void Clear();
    147 
    148  private:
    149   friend class base::RefCountedThreadSafe<URLIndexPrivateData>;
    150   ~URLIndexPrivateData();
    151 
    152   friend class AddHistoryMatch;
    153   friend class ::HistoryQuickProviderTest;
    154   friend class InMemoryURLIndexTest;
    155   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
    156   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);
    157   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);
    158   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);
    159   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
    160   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
    161   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
    162   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);
    163   FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
    164 
    165   // Support caching of term results so that we can optimize searches which
    166   // build upon a previous search. Each entry in this map represents one
    167   // search term from the most recent search. For example, if the user had
    168   // typed "google blog trans" and then typed an additional 'l' (at the end,
    169   // of course) then there would be four items in the cache: 'blog', 'google',
    170   // 'trans', and 'transl'. All would be marked as being in use except for the
    171   // 'trans' item; its cached data would have been used when optimizing the
    172   // construction of the search results candidates for 'transl' but then would
    173   // no longer needed.
    174   //
    175   // Items stored in the search term cache. If a search term exactly matches one
    176   // in the cache then we can quickly supply the proper |history_id_set_| (and
    177   // marking the cache item as being |used_|. If we find a prefix for a search
    178   // term in the cache (which is very likely to occur as the user types each
    179   // term into the omnibox) then we can short-circuit the index search for those
    180   // characters in the prefix by returning the |word_id_set|. In that case we do
    181   // not mark the item as being |used_|.
    182   struct SearchTermCacheItem {
    183     SearchTermCacheItem(const WordIDSet& word_id_set,
    184                         const HistoryIDSet& history_id_set);
    185     // Creates a cache item for a term which has no results.
    186     SearchTermCacheItem();
    187 
    188     ~SearchTermCacheItem();
    189 
    190     WordIDSet word_id_set_;
    191     HistoryIDSet history_id_set_;
    192     bool used_;  // True if this item has been used for the current term search.
    193   };
    194   typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;
    195 
    196   // A helper class which performs the final filter on each candidate
    197   // history URL match, inserting accepted matches into |scored_matches_|.
    198   class AddHistoryMatch : public std::unary_function<HistoryID, void> {
    199    public:
    200     AddHistoryMatch(const URLIndexPrivateData& private_data,
    201                     const std::string& languages,
    202                     HistoryClient* history_client,
    203                     const base::string16& lower_string,
    204                     const String16Vector& lower_terms,
    205                     const base::Time now);
    206     ~AddHistoryMatch();
    207 
    208     void operator()(const HistoryID history_id);
    209 
    210     ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
    211 
    212    private:
    213     const URLIndexPrivateData& private_data_;
    214     const std::string& languages_;
    215     HistoryClient* history_client_;
    216     ScoredHistoryMatches scored_matches_;
    217     const base::string16& lower_string_;
    218     const String16Vector& lower_terms_;
    219     WordStarts lower_terms_to_word_starts_offsets_;
    220     const base::Time now_;
    221   };
    222 
    223   // A helper predicate class used to filter excess history items when the
    224   // candidate results set is too large.
    225   class HistoryItemFactorGreater
    226       : public std::binary_function<HistoryID, HistoryID, void> {
    227    public:
    228     explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);
    229     ~HistoryItemFactorGreater();
    230 
    231     bool operator()(const HistoryID h1, const HistoryID h2);
    232 
    233    private:
    234     const history::HistoryInfoMap& history_info_map_;
    235   };
    236 
    237   // URL History indexing support functions.
    238 
    239   // Composes a set of history item IDs by intersecting the set for each word
    240   // in |unsorted_words|.
    241   HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words);
    242 
    243   // Helper function to HistoryIDSetFromWords which composes a set of history
    244   // ids for the given term given in |term|.
    245   HistoryIDSet HistoryIDsForTerm(const base::string16& term);
    246 
    247   // Given a set of Char16s, finds words containing those characters.
    248   WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);
    249 
    250   // Indexes one URL history item as described by |row|. Returns true if the
    251   // row was actually indexed. |languages| gives a list of language encodings by
    252   // which the URLs and page titles are broken down into words and characters.
    253   // |scheme_whitelist| is used to filter non-qualifying schemes.  If
    254   // |history_db| is not NULL then this function uses the history database
    255   // synchronously to get the URL's recent visits information.  This mode should
    256   // only be used on the historyDB thread.  If |history_db| is NULL, then
    257   // this function uses |history_service| to schedule a task on the
    258   // historyDB thread to fetch and update the recent visits
    259   // information.
    260   bool IndexRow(HistoryDatabase* history_db,
    261                 HistoryService* history_service,
    262                 const URLRow& row,
    263                 const std::string& languages,
    264                 const std::set<std::string>& scheme_whitelist);
    265 
    266   // Parses and indexes the words in the URL and page title of |row| and
    267   // calculate the word starts in each, saving the starts in |word_starts|.
    268   // |languages| gives a list of language encodings by which the URLs and page
    269   // titles are broken down into words and characters.
    270   void AddRowWordsToIndex(const URLRow& row,
    271                           RowWordStarts* word_starts,
    272                           const std::string& languages);
    273 
    274   // Given a single word in |uni_word|, adds a reference for the containing
    275   // history item identified by |history_id| to the index.
    276   void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);
    277 
    278   // Creates a new entry in the word/history map for |word_id| and add
    279   // |history_id| as the initial element of the word's set.
    280   void AddWordHistory(const base::string16& uni_word, HistoryID history_id);
    281 
    282   // Updates an existing entry in the word/history index by adding the
    283   // |history_id| to set for |word_id| in the word_id_history_map_.
    284   void UpdateWordHistory(WordID word_id, HistoryID history_id);
    285 
    286   // Adds |word_id| to |history_id|'s entry in the history/word map,
    287   // creating a new entry if one does not already exist.
    288   void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);
    289 
    290   // Removes |row| and all associated words and characters from the index.
    291   void RemoveRowFromIndex(const URLRow& row);
    292 
    293   // Removes all words and characters associated with |row| from the index.
    294   void RemoveRowWordsFromIndex(const URLRow& row);
    295 
    296   // Clears |used_| for each item in the search term cache.
    297   void ResetSearchTermCache();
    298 
    299   // Caches the index private data and writes the cache file to the profile
    300   // directory.  Called by WritePrivateDataToCacheFileTask.
    301   bool SaveToFile(const base::FilePath& file_path);
    302 
    303   // Encode a data structure into the protobuf |cache|.
    304   void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;
    305   void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;
    306   void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
    307   void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
    308   void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;
    309   void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;
    310   void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const;
    311 
    312   // Decode a data structure from the protobuf |cache|. Return false if there
    313   // is any kind of failure. |languages| will be used to break URLs and page
    314   // titles into words
    315   bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache,
    316                           const std::string& languages);
    317   bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);
    318   bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);
    319   bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);
    320   bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);
    321   bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);
    322   bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache,
    323                             const std::string& languages);
    324 
    325   // Determines if |gurl| has a whitelisted scheme and returns true if so.
    326   static bool URLSchemeIsWhitelisted(const GURL& gurl,
    327                                      const std::set<std::string>& whitelist);
    328 
    329   // Cache of search terms.
    330   SearchTermCacheMap search_term_cache_;
    331 
    332   // Allows canceling pending requests to update recent visits information.
    333   CancelableRequestConsumer recent_visits_consumer_;
    334 
    335   // Start of data members that are cached -------------------------------------
    336 
    337   // The version of the cache file most recently used to restore this instance
    338   // of the private data. If the private data was rebuilt from the history
    339   // database this will be 0.
    340   int restored_cache_version_;
    341 
    342   // The last time the data was rebuilt from the history database.
    343   base::Time last_time_rebuilt_from_history_;
    344 
    345   // A list of all of indexed words. The index of a word in this list is the
    346   // ID of the word in the word_map_. It reduces the memory overhead by
    347   // replacing a potentially long and repeated string with a simple index.
    348   String16Vector word_list_;
    349 
    350   // A list of available words slots in |word_list_|. An available word slot
    351   // is the index of a unused word in word_list_ vector, also referred to as
    352   // a WordID. As URL visits are added or modified new words may be added to
    353   // the index, in which case any available words are used, if any, and then
    354   // words are added to the end of the word_list_. When URL visits are
    355   // modified or deleted old words may be removed from the index, in which
    356   // case the slots for those words are added to available_words_ for resuse
    357   // by future URL updates.
    358   WordIDSet available_words_;
    359 
    360   // A one-to-one mapping from the a word string to its slot number (i.e.
    361   // WordID) in the |word_list_|.
    362   WordMap word_map_;
    363 
    364   // A one-to-many mapping from a single character to all WordIDs of words
    365   // containing that character.
    366   CharWordIDMap char_word_map_;
    367 
    368   // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
    369   // used in the history database) of history items in which the word occurs.
    370   WordIDHistoryMap word_id_history_map_;
    371 
    372   // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
    373   // in the URL and/or page title of the history item referenced by that
    374   // HistoryID.
    375   HistoryIDWordMap history_id_word_map_;
    376 
    377   // A one-to-one mapping from HistoryID to the history item data governing
    378   // index inclusion and relevance scoring.
    379   HistoryInfoMap history_info_map_;
    380 
    381   // A one-to-one mapping from HistoryID to the word starts detected in each
    382   // item's URL and page title.
    383   WordStartsMap word_starts_map_;
    384 
    385   // End of data members that are cached ---------------------------------------
    386 
    387   // For unit testing only. Specifies the version of the cache file to be saved.
    388   // Used only for testing upgrading of an older version of the cache upon
    389   // restore.
    390   int saved_cache_version_;
    391 
    392   // Used for unit testing only. Records the number of candidate history items
    393   // at three stages in the index searching process.
    394   size_t pre_filter_item_count_;    // After word index is queried.
    395   size_t post_filter_item_count_;   // After trimming large result set.
    396   size_t post_scoring_item_count_;  // After performing final filter/scoring.
    397 };
    398 
    399 }  // namespace history
    400 
    401 #endif  // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
    402