Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
      6 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
      7 #pragma once
      8 
      9 #include <functional>
     10 #include <map>
     11 #include <set>
     12 #include <string>
     13 #include <vector>
     14 
     15 #include "app/sql/connection.h"
     16 #include "base/basictypes.h"
     17 #include "base/file_path.h"
     18 #include "base/gtest_prod_util.h"
     19 #include "base/memory/linked_ptr.h"
     20 #include "base/memory/scoped_ptr.h"
     21 #include "base/string16.h"
     22 #include "chrome/browser/autocomplete/autocomplete_match.h"
     23 #include "chrome/browser/autocomplete/history_provider_util.h"
     24 #include "chrome/browser/history/history_types.h"
     25 #include "chrome/browser/history/in_memory_url_index_cache.pb.h"
     26 #include "testing/gtest/include/gtest/gtest_prod.h"
     27 
     28 class Profile;
     29 
     30 namespace base {
     31 class Time;
     32 }
     33 
     34 namespace in_memory_url_index {
     35 class InMemoryURLIndexCacheItem;
     36 }
     37 
     38 namespace history {
     39 
     40 namespace imui = in_memory_url_index;
     41 
     42 class URLDatabase;
     43 
     44 // Specifies where an omnibox term occurs within a string. Used for specifying
     45 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
     46 // scoring a result.
     47 struct TermMatch {
     48   TermMatch(int term_num, size_t offset, size_t length)
     49       : term_num(term_num),
     50         offset(offset),
     51         length(length) {}
     52 
     53   int term_num;  // The index of the term in the original search string.
     54   size_t offset;  // The starting offset of the substring match.
     55   size_t length;  // The length of the substring match.
     56 };
     57 typedef std::vector<TermMatch> TermMatches;
     58 
     59 // Used for intermediate history result operations.
     60 struct ScoredHistoryMatch : public HistoryMatch {
     61   ScoredHistoryMatch();  // Required by STL.
     62   explicit ScoredHistoryMatch(const URLRow& url_info);
     63   ~ScoredHistoryMatch();
     64 
     65   static bool MatchScoreGreater(const ScoredHistoryMatch& m1,
     66                                 const ScoredHistoryMatch& m2);
     67 
     68   // An interim score taking into consideration location and completeness
     69   // of the match.
     70   int raw_score;
     71   TermMatches url_matches;  // Term matches within the URL.
     72   TermMatches title_matches;  // Term matches within the page title.
     73   size_t prefix_adjust;  // The length of a prefix which should be ignored.
     74 };
     75 typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches;
     76 
     77 // The URL history source.
     78 // Holds portions of the URL database in memory in an indexed form.  Used to
     79 // quickly look up matching URLs for a given query string.  Used by
     80 // the HistoryURLProvider for inline autocomplete and to provide URL
     81 // matches to the omnibox.
     82 //
     83 // Note about multi-byte codepoints and the data structures in the
     84 // InMemoryURLIndex class: One will quickly notice that no effort is made to
     85 // insure that multi-byte character boundaries are detected when indexing the
     86 // words and characters in the URL history database except when converting
     87 // URL strings to lowercase. Multi-byte-edness makes no difference when
     88 // indexing or when searching the index as the final filtering of results
     89 // is dependent on the comparison of a string of bytes, not individual
     90 // characters. While the lookup of those bytes during a search in the
     91 // |char_word_map_| could serve up words in which the individual char16
     92 // occurs as a portion of a composite character the next filtering step
     93 // will eliminate such words except in the case where a single character
     94 // is being searched on and which character occurs as the second char16 of a
     95 // multi-char16 instance.
     96 class InMemoryURLIndex {
     97  public:
     98   // |history_dir| is a path to the directory containing the history database
     99   // within the profile wherein the cache and transaction journals will be
    100   // stored.
    101   explicit InMemoryURLIndex(const FilePath& history_dir);
    102   ~InMemoryURLIndex();
    103 
    104   // Convenience types
    105   typedef std::vector<string16> String16Vector;
    106 
    107   // Opens and indexes the URL history database.
    108   // |languages| gives a list of language encodings with which the history
    109   // URLs and omnibox searches are interpreted, i.e. when each is broken
    110   // down into words and each word is broken down into characters.
    111   bool Init(URLDatabase* history_db, const std::string& languages);
    112 
    113   // Reloads the history index. Attempts to reload from the cache unless
    114   // |clear_cache| is true. If the cache is unavailable then reload the
    115   // index from |history_db|.
    116   bool ReloadFromHistory(URLDatabase* history_db, bool clear_cache);
    117 
    118   // Signals that any outstanding initialization should be canceled and
    119   // flushes the cache to disk.
    120   void ShutDown();
    121 
    122   // Restores the index's private data from the cache file stored in the
    123   // profile directory and returns true if successful.
    124   bool RestoreFromCacheFile();
    125 
    126   // Caches the index private data and writes the cache file to the profile
    127   // directory.
    128   bool SaveToCacheFile();
    129 
    130   // Given a vector containing one or more words as string16s, scans the
    131   // history index and return a vector with all scored, matching history items.
    132   // Each term must occur somewhere in the history item's URL or page title for
    133   // the item to qualify; however, the terms do not necessarily have to be
    134   // adjacent. Results are sorted with higher scoring items first. Each term
    135   // from |terms| may contain punctuation but should not contain spaces.
    136   // A search request which results in more than |kItemsToScoreLimit| total
    137   // candidate items returns no matches (though the results set will be
    138   // retained and used for subsequent calls to this function) as the scoring
    139   // of such a large number of candidates may cause perceptible typing response
    140   // delays in the omnibox. This is likely to occur for short omnibox terms
    141   // such as 'h' and 'w' which will be found in nearly all history candidates.
    142   ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms);
    143 
    144   // Updates or adds an history item to the index if it meets the minimum
    145   // 'quick' criteria.
    146   void UpdateURL(URLID row_id, const URLRow& row);
    147 
    148   // Deletes indexing data for an history item. The item may not have actually
    149   // been indexed (which is the case if it did not previously meet minimum
    150   // 'quick' criteria).
    151   void DeleteURL(URLID row_id);
    152 
    153   // Breaks the |uni_string| string down into individual words and return
    154   // a vector with the individual words in their original order. If
    155   // |break_on_space| is false then the resulting list will contain only words
    156   // containing alpha-numeric characters. If |break_on_space| is true then the
    157   // resulting list will contain strings broken at whitespace.
    158   //
    159   // Example:
    160   //   Given: |uni_string|: "http://www.google.com/ harry the rabbit."
    161   //   With |break_on_space| false the returned list will contain:
    162   //    "http", "www", "google", "com", "harry", "the", "rabbit"
    163   //   With |break_on_space| true the returned list will contain:
    164   //    "http://", "www.google.com/", "harry", "the", "rabbit."
    165   static String16Vector WordVectorFromString16(const string16& uni_string,
    166                                                bool break_on_space);
    167 
    168   // Extract and return the offsets from |matches|.
    169   static std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);
    170 
    171   // Replace the offsets in |matches| with those given in |offsets|, deleting
    172   // any which are npos, and return the updated list of matches.
    173   static TermMatches ReplaceOffsetsInTermMatches(
    174       const TermMatches& matches,
    175       const std::vector<size_t>& offsets);
    176 
    177  private:
    178   friend class AddHistoryMatch;
    179   FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
    180   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath);
    181   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
    182   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities);
    183   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
    184   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions);
    185   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
    186   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
    187 
    188   // Signals that there are no previously cached results for the typed term.
    189   static const size_t kNoCachedResultForTerm;
    190 
    191   // Creating one of me without a history path is not allowed (tests excepted).
    192   InMemoryURLIndex();
    193 
    194   // Convenience types.
    195   typedef std::set<string16> String16Set;
    196   typedef std::set<char16> Char16Set;
    197   typedef std::vector<char16> Char16Vector;
    198 
    199   // An index into list of all of the words we have indexed.
    200   typedef int WordID;
    201 
    202   // A map allowing a WordID to be determined given a word.
    203   typedef std::map<string16, WordID> WordMap;
    204 
    205   // A map from character to word_ids.
    206   typedef std::set<WordID> WordIDSet;  // An index into the WordList.
    207   typedef std::map<char16, WordIDSet> CharWordIDMap;
    208 
    209   // A map from word_id to history item.
    210   // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit.
    211   // Consider using a smaller type.
    212   typedef URLID HistoryID;
    213   typedef std::set<HistoryID> HistoryIDSet;
    214   typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
    215 
    216   // Support caching of term character results so that we can optimize
    217   // searches which build upon a previous search. Each entry in this vector
    218   // represents a progressive reduction of the result set for each unique
    219   // character found in the search term, with each character being taken as
    220   // initially encountered. For example, once the results for the search
    221   // term of "mextexarkana" have been fully determined, this vector will
    222   // contain one entry for the characters: m, e, x, t, a, r, k, & n, in
    223   // that order. The result sets will naturally shrink in size for each
    224   // subsequent character as the sets intersections are taken in an
    225   // incremental manner.
    226   struct TermCharWordSet;
    227   typedef std::vector<TermCharWordSet> TermCharWordSetVector;
    228 
    229   // TODO(rohitrao): Probably replace this with QueryResults.
    230   typedef std::vector<URLRow> URLRowVector;
    231 
    232   // A map from history_id to the history's URL and title.
    233   typedef std::map<HistoryID, URLRow> HistoryInfoMap;
    234 
    235   // A helper class which performs the final filter on each candidate
    236   // history URL match, inserting accepted matches into |scored_matches_|
    237   // and trimming the maximum number of matches to 10.
    238   class AddHistoryMatch : public std::unary_function<HistoryID, void> {
    239    public:
    240     AddHistoryMatch(const InMemoryURLIndex& index,
    241                     const String16Vector& lower_terms);
    242     ~AddHistoryMatch();
    243 
    244     void operator()(const HistoryID history_id);
    245 
    246     ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
    247 
    248    private:
    249     const InMemoryURLIndex& index_;
    250     ScoredHistoryMatches scored_matches_;
    251     const String16Vector& lower_terms_;
    252   };
    253 
    254   // Initializes all index data members in preparation for restoring the index
    255   // from the cache or a complete rebuild from the history database.
    256   void ClearPrivateData();
    257 
    258   // Breaks a string down into individual words.
    259   static String16Set WordSetFromString16(const string16& uni_string);
    260 
    261   // Given a vector of Char16s, representing the characters the user has typed
    262   // into the omnibox, finds words containing those characters. If any
    263   // existing, cached set is a proper subset then starts with that cached
    264   // set. Updates the previously-typed-character cache.
    265   WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars);
    266 
    267   // Given a vector of Char16s in |uni_chars|, compare those characters, in
    268   // order, with the previously searched term, returning the index of the
    269   // cached results in the term_char_word_set_cache_ of the set with best
    270   // matching number of characters. Returns kNoCachedResultForTerm if there
    271   // was no match at all (i.e. the first character of |uni-chars| is not the
    272   // same as the character of the first entry in term_char_word_set_cache_).
    273   size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars);
    274 
    275   // Creates a TermMatches which has an entry for each occurrence of the string
    276   // |term| found in the string |string|. Mark each match with |term_num| so
    277   // that the resulting TermMatches can be merged with other TermMatches for
    278   // other terms.
    279   static TermMatches MatchTermInString(const string16& term,
    280                                        const string16& string,
    281                                        int term_num);
    282 
    283   // URL History indexing support functions.
    284 
    285   // Indexes one URL history item.
    286   bool IndexRow(const URLRow& row);
    287 
    288   // Breaks a string down into unique, individual characters in the order
    289   // in which the characters are first encountered in the |uni_word| string.
    290   static Char16Vector Char16VectorFromString16(const string16& uni_word);
    291 
    292   // Breaks the |uni_word| string down into its individual characters.
    293   // Note that this is temporarily intended to work on a single word, but
    294   // _will_ work on a string of words, perhaps with unexpected results.
    295   // TODO(mrossetti): Lots of optimizations possible here for not restarting
    296   // a search if the user is just typing along. Also, change this to uniString
    297   // and properly handle substring matches, scoring and sorting the results
    298   // by score. Also, provide the metrics for where the matches occur so that
    299   // the UI can highlight the matched sections.
    300   static Char16Set Char16SetFromString16(const string16& uni_word);
    301 
    302   // Given a single word in |uni_word|, adds a reference for the containing
    303   // history item identified by |history_id| to the index.
    304   void AddWordToIndex(const string16& uni_word, HistoryID history_id);
    305 
    306   // Updates an existing entry in the word/history index by adding the
    307   // |history_id| to set for |word_id| in the word_id_history_map_.
    308   void UpdateWordHistory(WordID word_id, HistoryID history_id);
    309 
    310   // Creates a new entry in the word/history map for |word_id| and add
    311   // |history_id| as the initial element of the word's set.
    312   void AddWordHistory(const string16& uni_word, HistoryID history_id);
    313 
    314   // Clears the search term cache. This cache holds on to the intermediate
    315   // word results for each previously typed character to eliminate the need
    316   // to re-perform set operations for previously typed characters.
    317   void ResetTermCharWordSetCache();
    318 
    319   // Composes a set of history item IDs by intersecting the set for each word
    320   // in |uni_string|.
    321   HistoryIDSet HistoryIDSetFromWords(const string16& uni_string);
    322 
    323   // Helper function to HistoryIDSetFromWords which composes a set of history
    324   // ids for the given term given in |uni_word|.
    325   HistoryIDSet HistoryIDsForTerm(const string16& uni_word);
    326 
    327   // Calculates a raw score for this history item by first determining
    328   // if all of the terms in |terms_vector| occur in |row| and, if so,
    329   // calculating a raw score based on 1) starting position of each term
    330   // in the user input, 2) completeness of each term's match, 3) ordering
    331   // of the occurrence of each term (i.e. they appear in order), 4) last
    332   // visit time, and 5) number of visits.
    333   // This raw score allows the results to be ordered and can be used
    334   // to influence the final score calculated by the client of this
    335   // index. Returns a ScoredHistoryMatch structure with the raw score and
    336   // substring matching metrics.
    337   static ScoredHistoryMatch ScoredMatchForURL(
    338       const URLRow& row,
    339       const String16Vector& terms_vector);
    340 
    341   // Calculates a component score based on position, ordering and total
    342   // substring match size using metrics recorded in |matches|. |max_length|
    343   // is the length of the string against which the terms are being searched.
    344   static int ScoreComponentForMatches(const TermMatches& matches,
    345                                       size_t max_length);
    346 
    347   // Sorts and removes overlapping substring matches from |matches| and
    348   // returns the cleaned up matches.
    349   static TermMatches SortAndDeoverlap(const TermMatches& matches);
    350 
    351   // Utility functions supporting RestoreFromCache and SaveToCache.
    352 
    353   // Construct a file path for the cache file within the same directory where
    354   // the history database is kept and saves that path to |file_path|. Returns
    355   // true if |file_path| can be successfully constructed. (This function
    356   // provided as a hook for unit testing.)
    357   bool GetCacheFilePath(FilePath* file_path);
    358 
    359   // Encode a data structure into the protobuf |cache|.
    360   void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;
    361   void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;
    362   void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
    363   void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
    364   void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;
    365   void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;
    366 
    367   // Decode a data structure from the protobuf |cache|. Return false if there
    368   // is any kind of failure.
    369   bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache);
    370   bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);
    371   bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);
    372   bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);
    373   bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);
    374   bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);
    375 
    376   // Directory where cache file resides. This is, except when unit testing,
    377   // the same directory in which the profile's history database is found. It
    378   // should never be empty.
    379   FilePath history_dir_;
    380 
    381   // The timestamp of when the cache was last saved. This is used to validate
    382   // the transaction journal's applicability to the cache. The timestamp is
    383   // initialized to the NULL time, indicating that the cache was not used with
    384   // the InMemoryURLIndex was last populated.
    385   base::Time last_saved_;
    386 
    387   // A list of all of indexed words. The index of a word in this list is the
    388   // ID of the word in the word_map_. It reduces the memory overhead by
    389   // replacing a potentially long and repeated string with a simple index.
    390   // NOTE: A word will _never_ be removed from this vector thus insuring
    391   // the immutability of the word_id throughout the session, reducing
    392   // maintenance complexity.
    393   // TODO(mrossetti): Profile the vector allocation and determine if judicious
    394   // 'reserve' calls are called for.
    395   String16Vector word_list_;
    396 
    397   int history_item_count_;
    398   WordMap word_map_;
    399   CharWordIDMap char_word_map_;
    400   WordIDHistoryMap word_id_history_map_;
    401   TermCharWordSetVector term_char_word_set_cache_;
    402   HistoryInfoMap history_info_map_;
    403   std::string languages_;
    404 
    405   DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex);
    406 };
    407 
    408 }  // namespace history
    409 
    410 #endif  // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
    411