Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_HISTORY_TEXT_DATABASE_MANAGER_H_
      6 #define CHROME_BROWSER_HISTORY_TEXT_DATABASE_MANAGER_H_
      7 #pragma once
      8 
      9 #include <set>
     10 #include <vector>
     11 
     12 #include "base/basictypes.h"
     13 #include "base/file_path.h"
     14 #include "base/gtest_prod_util.h"
     15 #include "base/string16.h"
     16 #include "base/task.h"
     17 #include "chrome/browser/history/history_types.h"
     18 #include "chrome/browser/history/text_database.h"
     19 #include "chrome/browser/history/query_parser.h"
     20 #include "chrome/browser/history/url_database.h"
     21 #include "content/common/mru_cache.h"
     22 
     23 namespace history {
     24 
     25 class HistoryPublisher;
     26 class VisitDatabase;
     27 
     28 // Manages a set of text databases representing different time periods. This
     29 // will page them in and out as necessary, and will manage queries for times
     30 // spanning multiple databases.
     31 //
     32 // It will also keep a list of partial changes, such as page adds and title and
     33 // body sets, all of which come in at different times for a given page. When
     34 // all data is received or enough time has elapsed since adding, the indexed
     35 // data will be committed.
     36 //
     37 // This allows us to minimize inserts and modifications, which are slow for the
     38 // full text database, since each page's information is added exactly once.
     39 //
     40 // Note: be careful to delete the relevant entries from this uncommitted list
     41 // when clearing history or this information may get added to the database soon
     42 // after the clear.
     43 class TextDatabaseManager {
     44  public:
     45   // Tracks a set of changes (only deletes need to be supported now) to the
     46   // databases. This is opaque to the caller, but allows it to pass back a list
     47   // of all database that it has caused a change to.
     48   //
     49   // This is necessary for the feature where we optimize full text databases
     50   // which have changed as a result of the user deleting history via
     51   // OptimizeChangedDatabases. We want to do each affected database only once at
     52   // the end of the delete, but we don't want the caller to have to worry about
     53   // our internals.
     54   class ChangeSet {
     55    public:
     56     ChangeSet();
     57     ~ChangeSet();
     58 
     59    private:
     60     friend class TextDatabaseManager;
     61 
     62     typedef std::set<TextDatabase::DBIdent> DBSet;
     63 
     64     void Add(TextDatabase::DBIdent id) { changed_databases_.insert(id); }
     65 
     66     DBSet changed_databases_;
     67   };
     68 
     69   // You must call Init() to complete initialization.
     70   //
     71   // |dir| is the directory that will hold the full text database files (there
     72   // will be many files named by their date ranges).
     73   //
     74   // The visit database is a pointer owned by the caller for the main database
     75   // (of recent visits). The visit database will be updated to refer to the
     76   // added text database entries.
     77   explicit TextDatabaseManager(const FilePath& dir,
     78                                URLDatabase* url_database,
     79                                VisitDatabase* visit_database);
     80   ~TextDatabaseManager();
     81 
     82   // Must call before using other functions. If it returns false, no other
     83   // functions should be called.
     84   bool Init(const HistoryPublisher* history_publisher);
     85 
     86   // Returns the directory that holds the full text database files.
     87   const FilePath& GetDir() { return dir_; }
     88 
     89   // Allows scoping updates. This also allows things to go faster since every
     90   // page add doesn't need to be committed to disk (slow). Note that files will
     91   // still get created during a transaction.
     92   void BeginTransaction();
     93   void CommitTransaction();
     94 
     95   // Sets specific information for the given page to be added to the database.
     96   // In normal operation, URLs will be added as the user visits them, the titles
     97   // and bodies will come in some time after that. These changes will be
     98   // automatically coalesced and added to the database some time in the future
     99   // using AddPageData().
    100   //
    101   // AddPageURL must be called for a given URL (+ its corresponding ID) before
    102   // either the title or body set. The visit ID specifies the visit that will
    103   // get updated to refer to the full text indexed information. The visit time
    104   // should be the time corresponding to that visit in the database.
    105   void AddPageURL(const GURL& url, URLID url_id, VisitID visit_id,
    106                   base::Time visit_time);
    107   void AddPageTitle(const GURL& url, const string16& title);
    108   void AddPageContents(const GURL& url, const string16& body);
    109 
    110   // Adds the given data to the appropriate database file, returning true on
    111   // success. The visit database row identified by |visit_id| will be updated
    112   // to refer to the full text index entry. If the visit ID is 0, the visit
    113   // database will not be updated.
    114   bool AddPageData(const GURL& url,
    115                    URLID url_id,
    116                    VisitID visit_id,
    117                    base::Time visit_time,
    118                    const string16& title,
    119                    const string16& body);
    120 
    121   // Deletes the instance of indexed data identified by the given time and URL.
    122   // Any changes will be tracked in the optional change set for use when calling
    123   // OptimizeChangedDatabases later. change_set can be NULL.
    124   void DeletePageData(base::Time time, const GURL& url,
    125                       ChangeSet* change_set);
    126 
    127   // The text database manager keeps a list of changes that are made to the
    128   // file AddPageURL/Title/Body that may not be committed to the database yet.
    129   // This function removes entires from this list happening between the given
    130   // time range. It is called when the user clears their history for a time
    131   // range, and we don't want any of our data to "leak." If restrict_urls is
    132   // not empty, only changes on those URLs are deleted.
    133   //
    134   // Either or both times my be is_null to be unbounded in that direction. When
    135   // non-null, the range is [begin, end).
    136   void DeleteFromUncommitted(const std::set<GURL>& restrict_urls,
    137                              base::Time begin, base::Time end);
    138 
    139   // Deletes all full text search data by removing the files from the disk.
    140   // This must be called OUTSIDE of a transaction since it actually deletes the
    141   // files rather than messing with the database.
    142   void DeleteAll();
    143 
    144   // Calls optimize on all the databases identified in a given change set (see
    145   // the definition of ChangeSet above for more). Optimizing means that old data
    146   // will be removed rather than marked unused.
    147   void OptimizeChangedDatabases(const ChangeSet& change_set);
    148 
    149   // Executes the given query. See QueryOptions for more info on input.
    150   //
    151   // The results are filled into |results|, and the first time considered for
    152   // the output is in |first_time_searched| (see QueryResults for more).
    153   //
    154   // This function will return more than one match per URL if there is more than
    155   // one entry for that URL in the database.
    156   void GetTextMatches(const string16& query,
    157                       const QueryOptions& options,
    158                       std::vector<TextDatabase::Match>* results,
    159                       base::Time* first_time_searched);
    160 
    161  private:
    162   // These tests call ExpireRecentChangesForTime to force expiration.
    163   FRIEND_TEST_ALL_PREFIXES(TextDatabaseManagerTest, InsertPartial);
    164   FRIEND_TEST_ALL_PREFIXES(TextDatabaseManagerTest, PartialComplete);
    165   FRIEND_TEST_ALL_PREFIXES(ExpireHistoryTest, DeleteURLAndFavicon);
    166   FRIEND_TEST_ALL_PREFIXES(ExpireHistoryTest, FlushRecentURLsUnstarred);
    167   FRIEND_TEST_ALL_PREFIXES(ExpireHistoryTest,
    168                            FlushRecentURLsUnstarredRestricted);
    169 
    170   // Stores "recent stuff" that has happened with the page, since the page
    171   // visit, title, and body all come in at different times.
    172   class PageInfo {
    173    public:
    174     PageInfo(URLID url_id, VisitID visit_id, base::Time visit_time);
    175     ~PageInfo();
    176 
    177     // Getters.
    178     URLID url_id() const { return url_id_; }
    179     VisitID visit_id() const { return visit_id_; }
    180     base::Time visit_time() const { return visit_time_; }
    181     const string16& title() const { return title_; }
    182     const string16& body() const { return body_; }
    183 
    184     // Setters, we can only update the title and body.
    185     void set_title(const string16& ttl);
    186     void set_body(const string16& bdy);
    187 
    188     // Returns true if both the title or body of the entry has been set. Since
    189     // both the title and body setters will "fix" empty strings to be a space,
    190     // these indicate if the setter was ever called.
    191     bool has_title() const { return !title_.empty(); }
    192     bool has_body() { return !body_.empty(); }
    193 
    194     // Returns true if this entry was added too long ago and we should give up
    195     // waiting for more data. The current time is passed in as an argument so we
    196     // can check many without re-querying the timer.
    197     bool Expired(base::TimeTicks now) const;
    198 
    199    private:
    200     URLID url_id_;
    201     VisitID visit_id_;
    202 
    203     // Time of the visit of the URL. This will be the value stored in the URL
    204     // and visit tables for the entry.
    205     base::Time visit_time_;
    206 
    207     // When this page entry was created. We have a cap on the maximum time that
    208     // an entry will be in the queue before being flushed to the database.
    209     base::TimeTicks added_time_;
    210 
    211     // Will be the string " " when they are set to distinguish set and unset.
    212     string16 title_;
    213     string16 body_;
    214   };
    215 
    216   // Converts the given time to a database identifier or vice-versa.
    217   static TextDatabase::DBIdent TimeToID(base::Time time);
    218   static base::Time IDToTime(TextDatabase::DBIdent id);
    219 
    220   // Returns a text database for the given identifier or time. This file will
    221   // be created if it doesn't exist and |for_writing| is set. On error,
    222   // including the case where the file doesn't exist and |for_writing|
    223   // is false, it will return NULL.
    224   //
    225   // When |for_writing| is set, a transaction on the database will be opened
    226   // if there is a transaction open on this manager.
    227   //
    228   // The pointer will be tracked in the cache. The caller should not store it
    229   // or delete it since it will get automatically deleted as necessary.
    230   TextDatabase* GetDB(TextDatabase::DBIdent id, bool for_writing);
    231   TextDatabase* GetDBForTime(base::Time time, bool for_writing);
    232 
    233   // Populates the present_databases_ list based on which files are on disk.
    234   // When the list is already initialized, this will do nothing, so you can
    235   // call it whenever you want to ensure the present_databases_ set is filled.
    236   void InitDBList();
    237 
    238   // Schedules a call to ExpireRecentChanges in the future.
    239   void ScheduleFlushOldChanges();
    240 
    241   // Checks the recent_changes_ list and commits partial data that has been
    242   // around too long.
    243   void FlushOldChanges();
    244 
    245   // Given "now," this will expire old things from the recent_changes_ list.
    246   // This is used as the backend for FlushOldChanges and is called directly
    247   // by the unit tests with fake times.
    248   void FlushOldChangesForTime(base::TimeTicks now);
    249 
    250   // Directory holding our index files.
    251   const FilePath dir_;
    252 
    253   // Non-owning pointers to the recent history databases for URLs and visits.
    254   URLDatabase* url_database_;
    255   VisitDatabase* visit_database_;
    256 
    257   // Lists recent additions that we have not yet filled out with the title and
    258   // body. Sorted by time, we will flush them when they are complete or have
    259   // been in the queue too long without modification.
    260   //
    261   // We kind of abuse the MRUCache because we never move things around in it
    262   // using Get. Instead, we keep them in the order they were inserted, since
    263   // this is the metric we use to measure age. The MRUCache gives us an ordered
    264   // list with fast lookup by URL.
    265   typedef MRUCache<GURL, PageInfo> RecentChangeList;
    266   RecentChangeList recent_changes_;
    267 
    268   // Nesting levels of transactions. Since sqlite only allows one open
    269   // transaction, we simulate nested transactions by mapping the outermost one
    270   // to a real transaction. Since this object never needs to do ROLLBACK, losing
    271   // the ability for all transactions to rollback is inconsequential.
    272   int transaction_nesting_;
    273 
    274   // The cache owns the TextDatabase pointers, they will be automagically
    275   // deleted when the cache entry is removed or expired.
    276   typedef OwningMRUCache<TextDatabase::DBIdent, TextDatabase*> DBCache;
    277   DBCache db_cache_;
    278 
    279   // Tells us about the existence of database files on disk. All existing
    280   // databases will be in here, and non-existant ones will not, so we don't
    281   // have to check the disk every time.
    282   //
    283   // This set is populated LAZILY by InitDBList(), you should call that function
    284   // before accessing the list.
    285   //
    286   // Note that iterators will work on the keys in-order. Normally, reverse
    287   // iterators will be used to iterate the keys in reverse-order.
    288   typedef std::set<TextDatabase::DBIdent> DBIdentSet;
    289   DBIdentSet present_databases_;
    290   bool present_databases_loaded_;  // Set by InitDBList when populated.
    291 
    292   // Lists all databases with open transactions. These will have to be closed
    293   // when the transaction is committed.
    294   DBIdentSet open_transactions_;
    295 
    296   QueryParser query_parser_;
    297 
    298   // Generates tasks for our periodic checking of expired "recent changes".
    299   ScopedRunnableMethodFactory<TextDatabaseManager> factory_;
    300 
    301   // This object is created and managed by the history backend. We maintain an
    302   // opaque pointer to the object for our use.
    303   // This can be NULL if there are no indexers registered to receive indexing
    304   // data from us.
    305   const HistoryPublisher* history_publisher_;
    306 
    307   DISALLOW_COPY_AND_ASSIGN(TextDatabaseManager);
    308 };
    309 
    310 }  // namespace history
    311 
    312 #endif  // CHROME_BROWSER_HISTORY_TEXT_DATABASE_MANAGER_H_
    313