Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
      7 
      8 #include <map>
      9 #include <set>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/containers/hash_tables.h"
     14 #include "base/files/file_path.h"
     15 #include "base/gtest_prod_util.h"
     16 #include "base/memory/scoped_ptr.h"
     17 #include "base/memory/weak_ptr.h"
     18 #include "base/synchronization/lock.h"
     19 #include "base/time/time.h"
     20 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
     21 
     22 namespace base {
     23 class MessageLoop;
     24 }
     25 
     26 namespace safe_browsing {
     27 class PrefixSet;
     28 }
     29 
     30 class GURL;
     31 class SafeBrowsingDatabase;
     32 
     33 // Abstract factory for SafeBrowsingDatabase objects. Tests implement it and
     34 // install it via SafeBrowsingDatabase::RegisterFactory() to supply fakes.
     35 class SafeBrowsingDatabaseFactory {
     36  public:
     37   SafeBrowsingDatabaseFactory() { }
     38   virtual ~SafeBrowsingDatabaseFactory() { }
     39   virtual SafeBrowsingDatabase* CreateSafeBrowsingDatabase(
     40       bool enable_download_protection,
     41       bool enable_client_side_whitelist,
     42       bool enable_download_whitelist,
     43       bool enable_extension_blacklist,
     44       bool enable_side_effect_free_whitelist,
     45       bool enable_ip_blacklist) = 0;
     46  private:
     47   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingDatabaseFactory);
     48 };
     49 
     50 // Encapsulates the on-disk databases used for safebrowsing: browse, download,
     51 // download whitelist, client-side detection (csd) whitelist, extension
     52 // blacklist, side-effect free whitelist and csd malware IP blacklist.  The
     53 // browse database contains information about phishing and malware urls.  The
     54 // download database contains URLs for bad binaries (e.g. those containing a
     55 // virus) and hashes of the downloaded contents.  The download whitelist
     56 // contains whitelisted download hosting sites as well as whitelisted binary
     57 // signing certificates etc.  The csd whitelist database contains URLs that
     58 // will never be considered as phishing by the client-side phishing detection.
     59 // These on-disk databases are shared among all profiles, as they don't contain
     60 // user-specific data.  Unless a method's comment says otherwise, this object is
     61 // not thread-safe: use it on the thread that it was created on.
     62 class SafeBrowsingDatabase {
     63  public:
     64   // Factory method for obtaining a SafeBrowsingDatabase implementation.
     65   // It is not thread safe.
     66   // |enable_download_protection| controls the download database feature.
     67   // |enable_client_side_whitelist| controls the csd whitelist database
     68   // feature.  |enable_download_whitelist| controls the download whitelist
     69   // database feature.  |enable_ip_blacklist| controls the csd malware IP
     70   // blacklist database feature.  |enable_extension_blacklist| and
     71   // |side_effect_free_whitelist| presumably gate the extension blacklist and
     72   // side-effect free whitelist databases the same way -- TODO confirm.
     73   // NOTE(review): the factory calls it |enable_side_effect_free_whitelist|.
     74   static SafeBrowsingDatabase* Create(bool enable_download_protection,
     75                                       bool enable_client_side_whitelist,
     76                                       bool enable_download_whitelist,
     77                                       bool enable_extension_blacklist,
     78                                       bool side_effect_free_whitelist,
     79                                       bool enable_ip_blacklist);
     80 
     81   // Makes the passed |factory| the factory used to instantiate
     82   // a SafeBrowsingDatabase. This is used for tests.
     83   static void RegisterFactory(SafeBrowsingDatabaseFactory* factory) {
     84     factory_ = factory;
     85   }
     86 
     87   virtual ~SafeBrowsingDatabase();
     88 
     89   // Initializes the database with the given filename.
     90   virtual void Init(const base::FilePath& filename) = 0;
     91 
     92   // Deletes the current database and creates a new one.
     93   virtual bool ResetDatabase() = 0;
     94 
     95   // Returns false if |url| is not in the browse database or already was cached
     96   // as a miss.  If it returns true, |prefix_hits| contains matching hash
     97   // prefixes which had no cached results and |cache_hits| contains any matching
     98   // cached gethash results.  This function is safe to call from any thread.
     99   virtual bool ContainsBrowseUrl(
    100       const GURL& url,
    101       std::vector<SBPrefix>* prefix_hits,
    102       std::vector<SBFullHashResult>* cache_hits) = 0;
    103 
    104   // Returns false if none of |urls| are in Download database. If it returns
    105   // true, |prefix_hits| should contain the prefixes for the URLs that were in
    106   // the database.  This function can ONLY be called from the creation thread.
    107   virtual bool ContainsDownloadUrl(const std::vector<GURL>& urls,
    108                                    std::vector<SBPrefix>* prefix_hits) = 0;
    109 
    110   // Returns false if |url| is not on the client-side phishing detection
    111   // whitelist.  Otherwise, this function returns true.  Note: the whitelist
    112   // only contains full-length hashes so we don't return any prefix hit.
    113   // This function should only be called from the IO thread.
    114   virtual bool ContainsCsdWhitelistedUrl(const GURL& url) = 0;
    115 
    116   // The download whitelist is used for two purposes: a white-domain list of
    117   // sites that are considered to host only harmless binaries as well as a
    118   // whitelist of arbitrary strings such as hashed certificate authorities that
    119   // are considered to be trusted.  The two methods below let you look up
    120   // the whitelist either for a URL or an arbitrary string.  These methods will
    121   // return false if no match is found and true otherwise.
    122   // These functions can ONLY be called from the IO thread.
    123   virtual bool ContainsDownloadWhitelistedUrl(const GURL& url) = 0;
    124   virtual bool ContainsDownloadWhitelistedString(const std::string& str) = 0;
    125 
    126   // Populates |prefix_hits| with any prefixes in |prefixes| that have matches
    127   // in the database.
    128   //
    129   // This function can ONLY be accessed from the creation thread.
    130   virtual bool ContainsExtensionPrefixes(
    131       const std::vector<SBPrefix>& prefixes,
    132       std::vector<SBPrefix>* prefix_hits) = 0;
    133 
    134   // Returns false unless the hash of |url| is on the side-effect free
    135   // whitelist.
    136   virtual bool ContainsSideEffectFreeWhitelistUrl(const GURL& url) = 0;
    137 
    138   // Returns true iff the given IP is currently on the csd malware IP blacklist.
    139   virtual bool ContainsMalwareIP(const std::string& ip_address) = 0;
    140 
    141   // A database transaction should look like:
    142   //
    143   // std::vector<SBListChunkRanges> lists;
    144   // if (db.UpdateStarted(&lists)) {
    145   //   // Do something with |lists|.
    146   //
    147   //   // Process add/sub commands.
    148   //   db.InsertChunks(list_name, chunks);
    149   //
    150   //   // Process adddel/subdel commands.
    151   //   db.DeleteChunks(chunks_deletes);
    152   //
    153   //   // If passed true, processes the collected chunk info and
    154   //   // rebuilds the filter.  If passed false, rolls everything
    155   //   // back.
    156   //   db.UpdateFinished(success);
    157   // }
    158   //
    159   // If UpdateStarted() returns true, the caller MUST eventually call
    160   // UpdateFinished().  If it returns false, the caller MUST NOT call
    161   // the other functions.
    162   virtual bool UpdateStarted(std::vector<SBListChunkRanges>* lists) = 0;
    163   virtual void InsertChunks(const std::string& list_name,
    164                             const std::vector<SBChunkData*>& chunks) = 0;
    165   virtual void DeleteChunks(
    166       const std::vector<SBChunkDelete>& chunk_deletes) = 0;
    167   virtual void UpdateFinished(bool update_succeeded) = 0;
    168 
    169   // Store the results of a GetHash response. In the case of empty results, we
    170   // cache the prefixes until the next update so that we don't have to issue
    171   // further GetHash requests we know will be empty.
    172   virtual void CacheHashResults(
    173       const std::vector<SBPrefix>& prefixes,
    174       const std::vector<SBFullHashResult>& full_hits,
    175       const base::TimeDelta& cache_lifetime) = 0;
    176 
    177   // Returns true if the malware IP blacklisting killswitch URL is present
    178   // in the csd whitelist.
    179   virtual bool IsMalwareIPMatchKillSwitchOn() = 0;
    180 
    181   // Returns true if the whitelist killswitch URL is present in the csd
    182   // whitelist.
    183   virtual bool IsCsdWhitelistKillSwitchOn() = 0;
    184 
    185   // The name of the bloom-filter file for the given database file.
    186   // NOTE(shess): OBSOLETE.  Present for deleting stale files.
    187   static base::FilePath BloomFilterForFilename(
    188       const base::FilePath& db_filename);
    189 
    190   // The name of the prefix set file for the given database file.
    191   static base::FilePath PrefixSetForFilename(const base::FilePath& db_filename);
    192 
    193   // Filename for malware and phishing URL database.
    194   static base::FilePath BrowseDBFilename(
    195       const base::FilePath& db_base_filename);
    196 
    197   // Filename for download URL and download binary hash database.
    198   static base::FilePath DownloadDBFilename(
    199       const base::FilePath& db_base_filename);
    200 
    201   // Filename for client-side phishing detection whitelist database.
    202   static base::FilePath CsdWhitelistDBFilename(
    203       const base::FilePath& csd_whitelist_base_filename);
    204 
    205   // Filename for download whitelist database.
    206   static base::FilePath DownloadWhitelistDBFilename(
    207       const base::FilePath& download_whitelist_base_filename);
    208 
    209   // Filename for extension blacklist database.
    210   static base::FilePath ExtensionBlacklistDBFilename(
    211       const base::FilePath& extension_blacklist_base_filename);
    212 
    213   // Filename for side-effect free whitelist database.
    214   static base::FilePath SideEffectFreeWhitelistDBFilename(
    215       const base::FilePath& side_effect_free_whitelist_base_filename);
    216 
    217   // Filename for the csd malware IP blacklist database.
    218   static base::FilePath IpBlacklistDBFilename(
    219       const base::FilePath& ip_blacklist_base_filename);
    220 
    221   // Enumerate failures for histogramming purposes.  DO NOT CHANGE THE
    222   // ORDERING OF THESE VALUES.
    223   enum FailureType {
    224     FAILURE_DATABASE_CORRUPT,
    225     FAILURE_DATABASE_CORRUPT_HANDLER,
    226     FAILURE_BROWSE_DATABASE_UPDATE_BEGIN,
    227     FAILURE_BROWSE_DATABASE_UPDATE_FINISH,
    228     FAILURE_DATABASE_FILTER_MISSING_OBSOLETE,
    229     FAILURE_DATABASE_FILTER_READ_OBSOLETE,
    230     FAILURE_DATABASE_FILTER_WRITE_OBSOLETE,
    231     FAILURE_DATABASE_FILTER_DELETE,
    232     FAILURE_DATABASE_STORE_MISSING,
    233     FAILURE_DATABASE_STORE_DELETE,
    234     FAILURE_DOWNLOAD_DATABASE_UPDATE_BEGIN,
    235     FAILURE_DOWNLOAD_DATABASE_UPDATE_FINISH,
    236     FAILURE_WHITELIST_DATABASE_UPDATE_BEGIN,
    237     FAILURE_WHITELIST_DATABASE_UPDATE_FINISH,
    238     FAILURE_BROWSE_PREFIX_SET_MISSING,
    239     FAILURE_BROWSE_PREFIX_SET_READ,
    240     FAILURE_BROWSE_PREFIX_SET_WRITE,
    241     FAILURE_BROWSE_PREFIX_SET_DELETE,
    242     FAILURE_EXTENSION_BLACKLIST_UPDATE_BEGIN,
    243     FAILURE_EXTENSION_BLACKLIST_UPDATE_FINISH,
    244     FAILURE_EXTENSION_BLACKLIST_DELETE,
    245     FAILURE_SIDE_EFFECT_FREE_WHITELIST_UPDATE_BEGIN,
    246     FAILURE_SIDE_EFFECT_FREE_WHITELIST_UPDATE_FINISH,
    247     FAILURE_SIDE_EFFECT_FREE_WHITELIST_DELETE,
    248     FAILURE_SIDE_EFFECT_FREE_WHITELIST_PREFIX_SET_READ,
    249     FAILURE_SIDE_EFFECT_FREE_WHITELIST_PREFIX_SET_WRITE,
    250     FAILURE_SIDE_EFFECT_FREE_WHITELIST_PREFIX_SET_DELETE,
    251     FAILURE_IP_BLACKLIST_UPDATE_BEGIN,
    252     FAILURE_IP_BLACKLIST_UPDATE_FINISH,
    253     FAILURE_IP_BLACKLIST_UPDATE_INVALID,
    254     FAILURE_IP_BLACKLIST_DELETE,
    255 
    256     // Memory space for histograms is determined by the max.  ALWAYS
    257     // ADD NEW VALUES BEFORE THIS ONE.
    258     FAILURE_DATABASE_MAX
    259   };
    260 
    261   static void RecordFailure(FailureType failure_type);
    262 
    263  private:
    264   // The factory used to instantiate a SafeBrowsingDatabase object.
    265   // Useful for tests, so they can provide their own implementation of
    266   // SafeBrowsingDatabase.
    267   static SafeBrowsingDatabaseFactory* factory_;
    268 };
    269 
    270 class SafeBrowsingDatabaseNew : public SafeBrowsingDatabase {
    271  public:
    272   // SafeBrowsingStore-backed implementation of SafeBrowsingDatabase.  This
    273   // constructor takes ownership of all the store objects.  When
    274   // |download_store| is NULL, the database will ignore any operations related
    275   // to downloads (url hashes and binary hashes).  The same is true for the
    276   // |csd_whitelist_store|, |download_whitelist_store| and
    277   // |ip_blacklist_store|.
    278   SafeBrowsingDatabaseNew(SafeBrowsingStore* browse_store,
    279                           SafeBrowsingStore* download_store,
    280                           SafeBrowsingStore* csd_whitelist_store,
    281                           SafeBrowsingStore* download_whitelist_store,
    282                           SafeBrowsingStore* extension_blacklist_store,
    283                           SafeBrowsingStore* side_effect_free_whitelist_store,
    284                           SafeBrowsingStore* ip_blacklist_store);
    285 
    286   // Create a database with a browse store. This is a legacy interface that
    287   // uses Sqlite.
    288   SafeBrowsingDatabaseNew();
    289 
    290   virtual ~SafeBrowsingDatabaseNew();
    291 
    292   // Implement SafeBrowsingDatabase interface.
    293   virtual void Init(const base::FilePath& filename) OVERRIDE;
    294   virtual bool ResetDatabase() OVERRIDE;
    295   virtual bool ContainsBrowseUrl(
    296       const GURL& url,
    297       std::vector<SBPrefix>* prefix_hits,
    298       std::vector<SBFullHashResult>* cache_hits) OVERRIDE;
    299   virtual bool ContainsDownloadUrl(const std::vector<GURL>& urls,
    300                                    std::vector<SBPrefix>* prefix_hits) OVERRIDE;
    301   virtual bool ContainsCsdWhitelistedUrl(const GURL& url) OVERRIDE;
    302   virtual bool ContainsDownloadWhitelistedUrl(const GURL& url) OVERRIDE;
    303   virtual bool ContainsDownloadWhitelistedString(
    304       const std::string& str) OVERRIDE;
    305   virtual bool ContainsExtensionPrefixes(
    306       const std::vector<SBPrefix>& prefixes,
    307       std::vector<SBPrefix>* prefix_hits) OVERRIDE;
    308   virtual bool ContainsSideEffectFreeWhitelistUrl(const GURL& url)  OVERRIDE;
    309   virtual bool ContainsMalwareIP(const std::string& ip_address) OVERRIDE;
    310   virtual bool UpdateStarted(std::vector<SBListChunkRanges>* lists) OVERRIDE;
    311   virtual void InsertChunks(const std::string& list_name,
    312                             const std::vector<SBChunkData*>& chunks) OVERRIDE;
    313   virtual void DeleteChunks(
    314       const std::vector<SBChunkDelete>& chunk_deletes) OVERRIDE;
    315   virtual void UpdateFinished(bool update_succeeded) OVERRIDE;
    316   virtual void CacheHashResults(
    317       const std::vector<SBPrefix>& prefixes,
    318       const std::vector<SBFullHashResult>& full_hits,
    319       const base::TimeDelta& cache_lifetime) OVERRIDE;
    320 
    321   // Returns true if the malware IP killswitch URL is in the csd whitelist.
    322   virtual bool IsMalwareIPMatchKillSwitchOn() OVERRIDE;
    323 
    324   // Returns true if the CSD whitelist has everything whitelisted.
    325   virtual bool IsCsdWhitelistKillSwitchOn() OVERRIDE;
    326 
    327  private:
    328   friend class SafeBrowsingDatabaseTest;
    329   FRIEND_TEST_ALL_PREFIXES(SafeBrowsingDatabaseTest, HashCaching);
    330   FRIEND_TEST_ALL_PREFIXES(SafeBrowsingDatabaseTest, CachedFullMiss);
    331   FRIEND_TEST_ALL_PREFIXES(SafeBrowsingDatabaseTest, CachedPrefixHitFullMiss);
    332   FRIEND_TEST_ALL_PREFIXES(SafeBrowsingDatabaseTest, BrowseFullHashMatching);
    333   FRIEND_TEST_ALL_PREFIXES(SafeBrowsingDatabaseTest,
    334                            BrowseFullHashAndPrefixMatching);
    335 
    336   // A SafeBrowsing whitelist contains a list of whitelisted full-hashes (stored
    337   // in a sorted vector) as well as a boolean flag indicating whether all
    338   // lookups in the whitelist should be considered matches for safety.
    339   typedef std::pair<std::vector<SBFullHash>, bool> SBWhitelist;
    340 
    341   // This map holds a csd malware IP blacklist which maps a prefix mask
    342   // to a set of hashed blacklisted IP prefixes.  Each IP prefix is a hashed
    343   // IPv6 IP prefix using SHA-1.
    344   typedef std::map<std::string, base::hash_set<std::string> > IPBlacklist;
    345 
    346   // Helper for ContainsBrowseUrl, exposed for testing.
    347   bool ContainsBrowseUrlHashes(const std::vector<SBFullHash>& full_hashes,
    348                                std::vector<SBPrefix>* prefix_hits,
    349                                std::vector<SBFullHashResult>* cache_hits);
    350 
    351   // Returns true if the whitelist is disabled or if any of the given hashes
    352   // matches the whitelist.
    353   bool ContainsWhitelistedHashes(const SBWhitelist& whitelist,
    354                                  const std::vector<SBFullHash>& hashes);
    355 
    356   // Return the browse_store_, download_store_, download_whitelist_store_ or
    357   // csd_whitelist_store_ based on list_id.
    358   SafeBrowsingStore* GetStore(int list_id);
    359 
    360   // Deletes the files on disk.
    361   bool Delete();
    362 
    363   // Load the prefix set off disk, if available.
    364   void LoadPrefixSet();
    365 
    366   // Writes the current prefix set to disk.
    367   void WritePrefixSet();
    368 
    369   // Loads the given full-length hashes to the given whitelist.  If the number
    370   // of hashes is too large or if the kill switch URL is on the whitelist
    371   // we will whitelist everything.
    372   void LoadWhitelist(const std::vector<SBAddFullHash>& full_hashes,
    373                      SBWhitelist* whitelist);
    374 
    375   // Call this method if an error occurred with the given whitelist.  This will
    376   // cause all lookups in the whitelist to return true.
    377   void WhitelistEverything(SBWhitelist* whitelist);
    378 
    379   // Parses the IP blacklist from the given full-length hashes.
    380   void LoadIpBlacklist(const std::vector<SBAddFullHash>& full_hashes);
    381 
    382   // Helpers for handling database corruption.
    383   // |OnHandleCorruptDatabase()| runs |ResetDatabase()| and sets
    384   // |corruption_detected_|, |HandleCorruptDatabase()| posts
    385   // |OnHandleCorruptDatabase()| to the current thread, to be run
    386   // after the current task completes.
    387   // TODO(shess): Wire things up to entirely abort the update
    388   // transaction when this happens.
    389   void HandleCorruptDatabase();
    390   void OnHandleCorruptDatabase();
    391 
    392   // Helpers for InsertChunks().
    393   void InsertAddChunk(SafeBrowsingStore* store,
    394                       safe_browsing_util::ListType list_id,
    395                       const SBChunkData& chunk);
    396   void InsertSubChunk(SafeBrowsingStore* store,
    397                       safe_browsing_util::ListType list_id,
    398                       const SBChunkData& chunk);
    399 
    400   // Returns the size in bytes of the store after the update.
    401   int64 UpdateHashPrefixStore(const base::FilePath& store_filename,
    402                                SafeBrowsingStore* store,
    403                                FailureType failure_type);
    404   void UpdateBrowseStore();
    405   void UpdateSideEffectFreeWhitelistStore();
    406   void UpdateWhitelistStore(const base::FilePath& store_filename,
    407                             SafeBrowsingStore* store,
    408                             SBWhitelist* whitelist);
    409   void UpdateIpBlacklistStore();
    410 
    411   // Used to verify that various calls are made from the thread the
    412   // object was created on.
    413   base::MessageLoop* creation_loop_;
    414 
    415   // Lock for protecting access to variables that may be used on the IO thread,
    416   // e.g. |browse_prefix_set_|, |browse_gethash_cache_| and |csd_whitelist_|.
    417   base::Lock lookup_lock_;
    418 
    419   // The base filename passed to Init(), used to generate the store and prefix
    420   // set filenames used to store data on disk.
    421   base::FilePath filename_base_;
    422 
    423   // Underlying persistent store for chunk data.
    424   // For browsing related (phishing and malware URLs) chunks and prefixes.
    425   scoped_ptr<SafeBrowsingStore> browse_store_;
    426 
    427   // For download related (download URL and binary hash) chunks and prefixes.
    428   scoped_ptr<SafeBrowsingStore> download_store_;
    429 
    430   // For the client-side phishing detection whitelist chunks and full-length
    431   // hashes.  This list only contains 256 bit hashes.
    432   scoped_ptr<SafeBrowsingStore> csd_whitelist_store_;
    433 
    434   // For the download whitelist chunks and full-length hashes.  This list only
    435   // contains 256 bit hashes.
    436   scoped_ptr<SafeBrowsingStore> download_whitelist_store_;
    437 
    438   // For extension IDs.
    439   scoped_ptr<SafeBrowsingStore> extension_blacklist_store_;
    440 
    441   // For side-effect free whitelist.
    442   scoped_ptr<SafeBrowsingStore> side_effect_free_whitelist_store_;
    443 
    444   // For IP blacklist.
    445   scoped_ptr<SafeBrowsingStore> ip_blacklist_store_;
    446 
    447   SBWhitelist csd_whitelist_;
    448   SBWhitelist download_whitelist_;
    449   SBWhitelist extension_blacklist_;
    450 
    451   // The IP blacklist should be small.  At most a couple hundred IPs.
    452   IPBlacklist ip_blacklist_;
    453 
    454   // Cache of gethash results for browse store. Entries should not be used if
    455   // they are older than their expire_after field.  Cached misses will have
    456   // empty full_hashes field.  Cleared on each update.
    457   std::map<SBPrefix, SBCachedFullHashResult> browse_gethash_cache_;
    458 
    459   // Set if corruption is detected during the course of an update.
    460   // Causes the update functions to fail with no side effects, until
    461   // the next call to |UpdateStarted()|.
    462   bool corruption_detected_;
    463 
    464   // Set to true if any chunks are added or deleted during an update.
    465   // Used to optimize away database update.
    466   bool change_detected_;
    467 
    468   // Used to check if a prefix was in the browse database.
    469   scoped_ptr<safe_browsing::PrefixSet> browse_prefix_set_;
    470 
    471   // Used to check if a prefix was in the side-effect free whitelist database.
    472   scoped_ptr<safe_browsing::PrefixSet> side_effect_free_whitelist_prefix_set_;
    473 
    474   // Used to schedule resetting the database because of corruption.
    475   base::WeakPtrFactory<SafeBrowsingDatabaseNew> reset_factory_;
    476 };
    477 
    478 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
    479