Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
      7 #pragma once
      8 
      9 #include <set>
     10 #include <vector>
     11 
     12 #include "base/file_path.h"
     13 #include "base/memory/scoped_ptr.h"
     14 #include "base/synchronization/lock.h"
     15 #include "base/task.h"
     16 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
     17 #include "testing/gtest/include/gtest/gtest_prod.h"
     18 
     19 namespace base {
     20   class Time;
     21 }
     22 
     23 namespace safe_browsing {
     24 class PrefixSet;
     25 }
     26 
     27 class BloomFilter;
     28 class GURL;
     29 class MessageLoop;
     30 class SafeBrowsingDatabase;
     31 
     32 // Factory for creating SafeBrowsingDatabase. Tests implement this factory
     33 // to create fake Databases for testing.
     34 class SafeBrowsingDatabaseFactory {
     35  public:
     36   SafeBrowsingDatabaseFactory() { }
     37   virtual ~SafeBrowsingDatabaseFactory() { }
          // Creates a SafeBrowsingDatabase.  Takes the same two feature flags as
          // SafeBrowsingDatabase::Create().
          // NOTE(review): the returned raw pointer is presumably owned by the
          // caller -- confirm against the implementations.
     38   virtual SafeBrowsingDatabase* CreateSafeBrowsingDatabase(
     39       bool enable_download_protection,
     40       bool enable_client_side_whitelist) = 0;
     41  private:
     42   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingDatabaseFactory);
     43 };
     44 
     45 
     46 // Encapsulates the on-disk databases used for safebrowsing. There are
     47 // three databases: browse, download and client-side detection (csd)
     48 // whitelist databases. The browse database contains information
     49 // about phishing and malware urls. The download database contains
     50 // URLs for bad binaries (e.g. those containing a virus) and hashes of
     51 // these downloaded contents. The csd whitelist database contains URLs
     52 // that will never be considered as phishing by the client-side
     53 // phishing detection. These on-disk databases are shared among all
     54 // profiles, as they don't contain user-specific data. This object is not
     55 // thread-safe: unless a method's comment says otherwise, all its methods
     56 // should be used on the same thread that it was created on.
     57 class SafeBrowsingDatabase {
     58  public:
     59   // Factory method for obtaining a SafeBrowsingDatabase implementation.
     60   // It is not thread safe.
     61   // |enable_download_protection| is used to control the download database
     62   // feature.
     63   // |enable_client_side_whitelist| is used to control the csd whitelist
     64   // database feature.
     65   static SafeBrowsingDatabase* Create(bool enable_download_protection,
     66                                       bool enable_client_side_whitelist);
     67 
     68   // Makes the passed |factory| the factory used to instantiate
     69   // a SafeBrowsingDatabase. This is used for tests.
          // Only stores the pointer in the static |factory_|; no locking is done
          // here.
     70   static void RegisterFactory(SafeBrowsingDatabaseFactory* factory) {
     71     factory_ = factory;
     72   }
     73 
     74   virtual ~SafeBrowsingDatabase();
     75 
     76   // Initializes the database with the given filename.
     77   virtual void Init(const FilePath& filename) = 0;
     78 
     79   // Deletes the current database and creates a new one.
     80   virtual bool ResetDatabase() = 0;
     81 
     82   // Returns false if |url| is not in the browse database.  If it
     83   // returns true, then either |matching_list| is the name of the matching
     84   // list, or |prefix_hits| and |full_hits| contain the matching hash
     85   // prefixes.  This function is safe to call from threads other than
     86   // the creation thread.
     87   virtual bool ContainsBrowseUrl(const GURL& url,
     88                                  std::string* matching_list,
     89                                  std::vector<SBPrefix>* prefix_hits,
     90                                  std::vector<SBFullHashResult>* full_hits,
     91                                  base::Time last_update) = 0;
     92 
     93   // Returns false if none of |urls| are in the download database. If it
     94   // returns true, |prefix_hits| should contain the prefixes for the URLs that
     95   // were in the database.  May ONLY be called from the creation thread.
     96   virtual bool ContainsDownloadUrl(const std::vector<GURL>& urls,
     97                                    std::vector<SBPrefix>* prefix_hits) = 0;
     98 
     99   // Returns false if |prefix| is not in the download database.
    100   // This function may ONLY be called from the creation thread.
    101   virtual bool ContainsDownloadHashPrefix(const SBPrefix& prefix) = 0;
    102 
    103   // Returns false if |url| is not on the client-side phishing detection
    104   // whitelist.  Otherwise, this function returns true.  Note: the whitelist
    105   // only contains full-length hashes so we don't return any prefix hit.
    106   // This function should only be called from the IO thread.
    107   virtual bool ContainsCsdWhitelistedUrl(const GURL& url) = 0;
    108 
    109   // A database transaction should look like:
    110   //
    111   // std::vector<SBListChunkRanges> lists;
    112   // if (db.UpdateStarted(&lists)) {
    113   //   // Do something with |lists|.
    114   //
    115   //   // Process add/sub commands.
    116   //   db.InsertChunks(list_name, chunks);
    117   //
    118   //   // Process adddel/subdel commands.
    119   //   db.DeleteChunks(chunks_deletes);
    120   //
    121   //   // If passed true, processes the collected chunk info and
    122   //   // rebuilds the bloom filter.  If passed false, rolls everything
    123   //   // back.
    124   //   db.UpdateFinished(success);
    125   // }
    126   //
    127   // If UpdateStarted() returns true, the caller MUST eventually call
    128   // UpdateFinished().  If it returns false, the caller MUST NOT call
    129   // the other functions.
    130   virtual bool UpdateStarted(std::vector<SBListChunkRanges>* lists) = 0;
    131   virtual void InsertChunks(const std::string& list_name,
    132                             const SBChunkList& chunks) = 0;
    133   virtual void DeleteChunks(
    134       const std::vector<SBChunkDelete>& chunk_deletes) = 0;
    135   virtual void UpdateFinished(bool update_succeeded) = 0;
    136 
    137   // Store the results of a GetHash response. In the case of empty results, we
    138   // cache the prefixes until the next update so that we don't have to issue
    139   // further GetHash requests we know will be empty.
    140   virtual void CacheHashResults(
    141       const std::vector<SBPrefix>& prefixes,
    142       const std::vector<SBFullHashResult>& full_hits) = 0;
    143 
    144   // The name of the bloom-filter file for the given database file.
    145   static FilePath BloomFilterForFilename(const FilePath& db_filename);
    146 
    147   // Filename for malware and phishing URL database.
    148   static FilePath BrowseDBFilename(const FilePath& db_base_filename);
    149 
    150   // Filename for download URL and download binary hash database.
    151   static FilePath DownloadDBFilename(const FilePath& db_base_filename);
    152 
    153   // Filename for client-side phishing detection whitelist database.
    154   static FilePath CsdWhitelistDBFilename(
    155       const FilePath& csd_whitelist_base_filename);
    156 
    157   // Enumerate failures for histogramming purposes.  DO NOT CHANGE THE
    158   // ORDERING OF THESE VALUES.
    159   enum FailureType {
    160     FAILURE_DATABASE_CORRUPT,
    161     FAILURE_DATABASE_CORRUPT_HANDLER,
    162     FAILURE_BROWSE_DATABASE_UPDATE_BEGIN,
    163     FAILURE_BROWSE_DATABASE_UPDATE_FINISH,
    164     FAILURE_DATABASE_FILTER_MISSING,
    165     FAILURE_DATABASE_FILTER_READ,
    166     FAILURE_DATABASE_FILTER_WRITE,
    167     FAILURE_DATABASE_FILTER_DELETE,
    168     FAILURE_DATABASE_STORE_MISSING,
    169     FAILURE_DATABASE_STORE_DELETE,
    170     FAILURE_DOWNLOAD_DATABASE_UPDATE_BEGIN,
    171     FAILURE_DOWNLOAD_DATABASE_UPDATE_FINISH,
    172     FAILURE_CSD_WHITELIST_DATABASE_UPDATE_BEGIN,
    173     FAILURE_CSD_WHITELIST_DATABASE_UPDATE_FINISH,
    174 
    175     // Memory space for histograms is determined by the max.  ALWAYS
    176     // ADD NEW VALUES BEFORE THIS ONE.
    177     FAILURE_DATABASE_MAX
    178   };
    179 
          // Records an occurrence of |failure_type|.
          // NOTE(review): presumably reported to a histogram, per the FailureType
          // comment -- confirm in the .cc file.
    180   static void RecordFailure(FailureType failure_type);
    181 
    182  private:
    183   // The factory used to instantiate a SafeBrowsingDatabase object.
    184   // Useful for tests, so they can provide their own implementation of
    185   // SafeBrowsingDatabase.
    186   static SafeBrowsingDatabaseFactory* factory_;
    187 };
    188 
        // Concrete SafeBrowsingDatabase that keeps its data in up to three
        // SafeBrowsingStore instances (browse, download, csd whitelist).
    189 class SafeBrowsingDatabaseNew : public SafeBrowsingDatabase {
    190  public:
    191   // Create a database with a browse store, download store and
    192   // csd_whitelist_store. Takes ownership of browse_store, download_store and
    193   // csd_whitelist_store. When |download_store| is NULL, the database
    194   // will ignore any operations related to downloads (url hashes and
    195   // binary hashes).  Same for the |csd_whitelist_store|.
    196   SafeBrowsingDatabaseNew(SafeBrowsingStore* browse_store,
    197                           SafeBrowsingStore* download_store,
    198                           SafeBrowsingStore* csd_whitelist_store);
    199 
    200   // Create a database with a browse store. This is a legacy interface that
    201   // uses Sqlite.
    202   SafeBrowsingDatabaseNew();
    203 
    204   virtual ~SafeBrowsingDatabaseNew();
    205 
    206   // Implement SafeBrowsingDatabase interface.
    207   virtual void Init(const FilePath& filename);
    208   virtual bool ResetDatabase();
    209   virtual bool ContainsBrowseUrl(const GURL& url,
    210                                  std::string* matching_list,
    211                                  std::vector<SBPrefix>* prefix_hits,
    212                                  std::vector<SBFullHashResult>* full_hits,
    213                                  base::Time last_update);
    214   virtual bool ContainsDownloadUrl(const std::vector<GURL>& urls,
    215                                    std::vector<SBPrefix>* prefix_hits);
    216   virtual bool ContainsDownloadHashPrefix(const SBPrefix& prefix);
    217   virtual bool ContainsCsdWhitelistedUrl(const GURL& url);
    218   virtual bool UpdateStarted(std::vector<SBListChunkRanges>* lists);
    219   virtual void InsertChunks(const std::string& list_name,
    220                             const SBChunkList& chunks);
    221   virtual void DeleteChunks(const std::vector<SBChunkDelete>& chunk_deletes);
    222   virtual void UpdateFinished(bool update_succeeded);
    223   virtual void CacheHashResults(const std::vector<SBPrefix>& prefixes,
    224                                 const std::vector<SBFullHashResult>& full_hits);
    225 
    226  private:
    227   friend class SafeBrowsingDatabaseTest;
    228   FRIEND_TEST(SafeBrowsingDatabaseTest, HashCaching);
    229 
    230   // Return the browse_store_, download_store_ or csd_whitelist_store_
    231   // based on list_id.
    232   SafeBrowsingStore* GetStore(int list_id);
    233 
    234   // Deletes the files on disk.
    235   bool Delete();
    236 
    237   // Loads the bloom filter off disk, or generates one if it doesn't exist.
    238   void LoadBloomFilter();
    239 
    240   // Writes the current bloom filter to disk.
    241   void WriteBloomFilter();
    242 
    243   // Loads the given full-length hashes to the csd whitelist.  If the number
    244   // of hashes is too large or if the kill switch URL is on the whitelist
    245   // we will whitelist all URLs.
    246   void LoadCsdWhitelist(const std::vector<SBAddFullHash>& full_hashes);
    247 
    248   // Call this method if an error occurred with the csd whitelist.  This will
    249   // result in all calls to ContainsCsdWhitelistedUrl() returning true.
    250   void CsdWhitelistAllUrls();
    251 
    252   // Helpers for handling database corruption.
    253   // |OnHandleCorruptDatabase()| runs |ResetDatabase()| and sets
    254   // |corruption_detected_|, |HandleCorruptDatabase()| posts
    255   // |OnHandleCorruptDatabase()| to the current thread, to be run
    256   // after the current task completes.
    257   // TODO(shess): Wire things up to entirely abort the update
    258   // transaction when this happens.
    259   void HandleCorruptDatabase();
    260   void OnHandleCorruptDatabase();
    261 
    262   // Helpers for InsertChunks().
    263   void InsertAdd(int chunk, SBPrefix host, const SBEntry* entry, int list_id);
    264   void InsertAddChunks(int list_id, const SBChunkList& chunks);
    265   void InsertSub(int chunk, SBPrefix host, const SBEntry* entry, int list_id);
    266   void InsertSubChunks(int list_id, const SBChunkList& chunks);
    267 
          // One helper per underlying store.
          // NOTE(review): presumably called from UpdateFinished() to commit each
          // store's pending update -- confirm in the .cc file.
    268   void UpdateDownloadStore();
    269   void UpdateBrowseStore();
    270   void UpdateCsdWhitelistStore();
    271 
    272   // Helper function to compare addprefixes in download_store_ with |prefixes|.
    273   // The |list_bit| indicates which list (download url or download hash)
    274   // to compare.
    275   // Returns true if there is a match, |*prefix_hits| will contain the actual
    276   // matching prefixes.
    277   bool MatchDownloadAddPrefixes(int list_bit,
    278                                 const std::vector<SBPrefix>& prefixes,
    279                                 std::vector<SBPrefix>* prefix_hits);
    280 
    281   // Used to verify that various calls are made from the thread the
    282   // object was created on.
    283   MessageLoop* creation_loop_;
    284 
    285   // Lock for protecting access to variables that may be used on the
    286   // IO thread.  This includes |browse_bloom_filter_|, |full_browse_hashes_|,
    287   // |pending_browse_hashes_|, |prefix_miss_cache_|, |csd_whitelist_|, and
    288   // |csd_whitelist_all_urls_|.
    289   base::Lock lookup_lock_;
    290 
    291   // Underlying persistent store for chunk data.
    292   // For browsing related (phishing and malware URLs) chunks and prefixes.
    293   FilePath browse_filename_;
    294   scoped_ptr<SafeBrowsingStore> browse_store_;
    295 
    296   // For download related (download URL and binary hash) chunks and prefixes.
    297   FilePath download_filename_;
    298   scoped_ptr<SafeBrowsingStore> download_store_;
    299 
    300   // For the client-side phishing detection whitelist chunks and full-length
    301   // hashes.  This list only contains 256 bit hashes.
    302   FilePath csd_whitelist_filename_;
    303   scoped_ptr<SafeBrowsingStore> csd_whitelist_store_;
    304 
    305   // All the client-side phishing detection whitelist entries are loaded in
    306   // a sorted vector.
    307   std::vector<SBFullHash> csd_whitelist_;
    308 
    309   // If true, ContainsCsdWhitelistedUrl will always return true for all URLs.
    310   // This is set to true if the csd whitelist is too large to be stored in
    311   // memory, if the kill switch URL is on the csd whitelist or if there was
    312   // an error during the most recent update.
    313   bool csd_whitelist_all_urls_;
    314 
    315   // Bloom filter generated from the add-prefixes in |browse_store_|.
    316   // Only browse_store_ requires the BloomFilter for fast query.
    317   FilePath bloom_filter_filename_;
    318   scoped_refptr<BloomFilter> browse_bloom_filter_;
    319 
    320   // Cached browse store related full-hash items, ordered by prefix for
    321   // efficient scanning.
    322   // |full_browse_hashes_| are items from |browse_store_|,
    323   // |pending_browse_hashes_| are items from |CacheHashResults()|, which
    324   // will be pushed to the store on the next update.
    325   std::vector<SBAddFullHash> full_browse_hashes_;
    326   std::vector<SBAddFullHash> pending_browse_hashes_;
    327 
    328   // Cache of prefixes that returned empty results (no full hash
    329   // match) to |CacheHashResults()|.  Cached to prevent asking for
    330   // them every time.  Cleared on next update.
    331   std::set<SBPrefix> prefix_miss_cache_;
    332 
    333   // Used to schedule resetting the database because of corruption.
    334   ScopedRunnableMethodFactory<SafeBrowsingDatabaseNew> reset_factory_;
    335 
    336   // Set if corruption is detected during the course of an update.
    337   // Causes the update functions to fail with no side effects, until
    338   // the next call to |UpdateStarted()|.
    339   bool corruption_detected_;
    340 
    341   // Set to true if any chunks are added or deleted during an update.
    342   // Used to optimize away database update.
    343   bool change_detected_;
    344 
    345   // Used to check if a prefix was in the database.
    346   scoped_ptr<safe_browsing::PrefixSet> prefix_set_;
    347 };
    348 
    349 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
    350