Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
      7 
      8 #include <map>
      9 #include <set>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/containers/hash_tables.h"
     14 #include "base/files/file_path.h"
     15 #include "base/gtest_prod_util.h"
     16 #include "base/memory/scoped_ptr.h"
     17 #include "base/memory/weak_ptr.h"
     18 #include "base/synchronization/lock.h"
     19 #include "base/time/time.h"
     20 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
     21 
        // Forward declarations.  In this header these types are only named through
        // pointers, references or scoped_ptr<> members, so their full definitions
        // are not required here.
     22 namespace base {
     23 class MessageLoop;
     24 }  // namespace base
     25 
     26 namespace safe_browsing {
     27 class PrefixSet;
     28 }  // namespace safe_browsing
     29 
     30 class GURL;
        // Declared early because SafeBrowsingDatabaseFactory returns a pointer to it
        // before the class itself is defined.
     31 class SafeBrowsingDatabase;
     32 
     33 // Factory for creating SafeBrowsingDatabase. Tests implement this factory
     34 // to create fake Databases for testing.
        //
        // An instance is installed with SafeBrowsingDatabase::RegisterFactory().
        // Copying is disabled by DISALLOW_COPY_AND_ASSIGN below.
     35 class SafeBrowsingDatabaseFactory {
     36  public:
     37   SafeBrowsingDatabaseFactory() { }
     38   virtual ~SafeBrowsingDatabaseFactory() { }
          // Creates a database instance.  The six flags correspond, in order, to
          // the parameters of SafeBrowsingDatabase::Create(); each one names an
          // optional sub-database.
          // NOTE(review): ownership of the returned raw pointer is not stated in
          // this header -- presumably the caller owns it; confirm.
     39   virtual SafeBrowsingDatabase* CreateSafeBrowsingDatabase(
     40       bool enable_download_protection,
     41       bool enable_client_side_whitelist,
     42       bool enable_download_whitelist,
     43       bool enable_extension_blacklist,
     44       bool enable_side_effect_free_whitelist,
     45       bool enable_ip_blacklist) = 0;
     46  private:
     47   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingDatabaseFactory);
     48 };
     49 
     50 // Contains full_hash elements which are cached in memory.  Differs from
     51 // SBAddFullHash in deriving |list_id| from |chunk_id|.  Differs from
     52 // SBFullHashResult in adding |expire_after| for later expiration.
     53 // TODO(shess): Remove/refactor this as part of converting to v2.3 caching
     54 // semantics.
     55 struct SBFullHashCached {
          // The cached full-length hash.
     56   SBFullHash hash;
          // Identifies the list the hash belongs to.
     57   int list_id;  // TODO(shess): Use safe_browsing_util::ListType.
          // NOTE(review): presumably the time after which this entry must no longer
          // be used -- confirm against the code that reads it.
     58   base::Time expire_after;
     59 };
     60 
     61 // Encapsulates the on-disk databases used for safebrowsing. There are
     62 // four databases: browse, download, download whitelist and
     63 // client-side detection (csd) whitelist databases. The browse database contains
     64 // information about phishing and malware urls. The download database contains
     65 // URLs for bad binaries (e.g: those containing virus) and hash of
     66 // these downloaded contents. The download whitelist contains whitelisted
     67 // download hosting sites as well as whitelisted binary signing certificates
     68 // etc.  The csd whitelist database contains URLs that will never be considered
     69 // as phishing by the client-side phishing detection. These on-disk databases
     70 // are shared among all profiles, as they don't contain user-specific data. This
     71 // object is not thread-safe, i.e. all its methods should be used on the same
     72 // thread that it was created on, unless a method's own comment says otherwise.
        //
        // NOTE(review): the interface below also exposes lookups for an extension
        // blacklist, a side-effect free whitelist and a csd malware IP blacklist, so
        // the "four databases" count above looks out of date.
     73 class SafeBrowsingDatabase {
     74  public:
     75   // Factory method for obtaining a SafeBrowsingDatabase implementation.
     76   // It is not thread safe.
     77   // |enable_download_protection| is used to control the download database
     78   // feature.
     79   // |enable_client_side_whitelist| is used to control the csd whitelist
     80   // database feature.
     81   // |enable_download_whitelist| is used to control the download whitelist
     82   // database feature.
          // |enable_extension_blacklist| is used to control the extension blacklist
          // database feature.
          // |side_effect_free_whitelist| is used to control the side-effect free
          // whitelist database feature.
     83   // |enable_ip_blacklist| is used to control the csd malware IP blacklist
     84   // database feature.
          // NOTE(review): presumably defers to |factory_| when one has been
          // registered; the definition is not in this header -- confirm.
     85   static SafeBrowsingDatabase* Create(bool enable_download_protection,
     86                                       bool enable_client_side_whitelist,
     87                                       bool enable_download_whitelist,
     88                                       bool enable_extension_blacklist,
     89                                       bool side_effect_free_whitelist,
     90                                       bool enable_ip_blacklist);
     91 
     92   // Makes the passed |factory| the factory used to instantiate
     93   // a SafeBrowsingDatabase. This is used for tests.
          // Only the raw pointer is stored in |factory_|; nothing in this header
          // deletes it.
     94   static void RegisterFactory(SafeBrowsingDatabaseFactory* factory) {
     95     factory_ = factory;
     96   }
     97 
     98   virtual ~SafeBrowsingDatabase();
     99 
    100   // Initializes the database with the given filename.
    101   virtual void Init(const base::FilePath& filename) = 0;
    102 
    103   // Deletes the current database and creates a new one.
          // NOTE(review): the meaning of the bool result is not documented here --
          // presumably true on success; confirm.
    104   virtual bool ResetDatabase() = 0;
    105 
    106   // Returns false if |url| is not in the browse database.  If it returns true,
    107   // then |prefix_hits| contains the list of prefix matches, and |cache_hits|
    108   // contains the cached gethash results for those prefixes (if any).  This
    109   // function is safe to call from threads other than the creation thread.
    110   virtual bool ContainsBrowseUrl(
    111       const GURL& url,
    112       std::vector<SBPrefix>* prefix_hits,
    113       std::vector<SBFullHashResult>* cache_hits) = 0;
    114 
    115   // Returns false if none of |urls| are in Download database. If it returns
    116   // true, |prefix_hits| should contain the prefixes for the URLs that were in
    117   // the database.  This function may ONLY be called from the creation thread.
    118   virtual bool ContainsDownloadUrl(const std::vector<GURL>& urls,
    119                                    std::vector<SBPrefix>* prefix_hits) = 0;
    120 
    121   // Returns false if |url| is not on the client-side phishing detection
    122   // whitelist.  Otherwise, this function returns true.  Note: the whitelist
    123   // only contains full-length hashes so we don't return any prefix hit.
    124   // This function should only be called from the IO thread.
    125   virtual bool ContainsCsdWhitelistedUrl(const GURL& url) = 0;
    126 
    127   // The download whitelist is used for two purposes: a white-domain list of
    128   // sites that are considered to host only harmless binaries as well as a
    129   // whitelist of arbitrary strings such as hashed certificate authorities that
    130   // are considered to be trusted.  The two methods below let you lookup
    131   // the whitelist either for a URL or an arbitrary string.  These methods will
    132   // return false if no match is found and true otherwise.
    133   // These functions may ONLY be called from the IO thread.
    134   virtual bool ContainsDownloadWhitelistedUrl(const GURL& url) = 0;
    135   virtual bool ContainsDownloadWhitelistedString(const std::string& str) = 0;
    136 
    137   // Populates |prefix_hits| with any prefixes in |prefixes| that have matches
    138   // in the database.
          // NOTE(review): the bool result is not documented -- presumably true when
          // at least one prefix matched; confirm.
    139   //
    140   // This function can ONLY be accessed from the creation thread.
    141   virtual bool ContainsExtensionPrefixes(
    142       const std::vector<SBPrefix>& prefixes,
    143       std::vector<SBPrefix>* prefix_hits) = 0;
    144 
    145   // Returns false unless the hash of |url| is on the side-effect free
    146   // whitelist.
    147   virtual bool ContainsSideEffectFreeWhitelistUrl(const GURL& url) = 0;
    148 
    149   // Returns true iff the given IP is currently on the csd malware IP blacklist.
    150   virtual bool ContainsMalwareIP(const std::string& ip_address) = 0;
    151 
    152   // A database transaction should look like:
    153   //
    154   // std::vector<SBListChunkRanges> lists;
    155   // if (db.UpdateStarted(&lists)) {
    156   //   // Do something with |lists|.
    157   //
    158   //   // Process add/sub commands.
    159   //   db.InsertChunks(list_name, chunks);
    160   //
    161   //   // Process adddel/subdel commands.
    162   //   db.DeleteChunks(chunks_deletes);
    163   //
    164   //   // If passed true, processes the collected chunk info and
    165   //   // rebuilds the filter.  If passed false, rolls everything
    166   //   // back.
    167   //   db.UpdateFinished(success);
    168   // }
    169   //
    170   // If UpdateStarted() returns true, the caller MUST eventually call
    171   // UpdateFinished().  If it returns false, the caller MUST NOT call
    172   // the other functions.
    173   virtual bool UpdateStarted(std::vector<SBListChunkRanges>* lists) = 0;
    174   virtual void InsertChunks(const std::string& list_name,
    175                             const std::vector<SBChunkData*>& chunks) = 0;
    176   virtual void DeleteChunks(
    177       const std::vector<SBChunkDelete>& chunk_deletes) = 0;
    178   virtual void UpdateFinished(bool update_succeeded) = 0;
    179 
    180   // Store the results of a GetHash response. In the case of empty results, we
    181   // cache the prefixes until the next update so that we don't have to issue
    182   // further GetHash requests we know will be empty.
          // NOTE(review): |cache_lifetime| is presumably how long |full_hits| stay
          // valid -- confirm against the implementation.
    183   virtual void CacheHashResults(
    184       const std::vector<SBPrefix>& prefixes,
    185       const std::vector<SBFullHashResult>& full_hits,
    186       const base::TimeDelta& cache_lifetime) = 0;
    187 
    188   // Returns true if the malware IP blacklisting killswitch URL is present
    189   // in the csd whitelist.
    190   virtual bool IsMalwareIPMatchKillSwitchOn() = 0;
    191 
    192   // Returns true if the whitelist killswitch URL is present in the csd
    193   // whitelist.
    194   virtual bool IsCsdWhitelistKillSwitchOn() = 0;
    195 
    196   // The name of the bloom-filter file for the given database file.
    197   // NOTE(shess): OBSOLETE.  Present for deleting stale files.
    198   static base::FilePath BloomFilterForFilename(
    199       const base::FilePath& db_filename);
    200 
    201   // The name of the prefix set file for the given database file.
    202   static base::FilePath PrefixSetForFilename(const base::FilePath& db_filename);
    203 
    204   // Filename for malware and phishing URL database.
    205   static base::FilePath BrowseDBFilename(
    206       const base::FilePath& db_base_filename);
    207 
    208   // Filename for download URL and download binary hash database.
    209   static base::FilePath DownloadDBFilename(
    210       const base::FilePath& db_base_filename);
    211 
    212   // Filename for client-side phishing detection whitelist database.
    213   static base::FilePath CsdWhitelistDBFilename(
    214       const base::FilePath& csd_whitelist_base_filename);
    215 
    216   // Filename for download whitelist database.
    217   static base::FilePath DownloadWhitelistDBFilename(
    218       const base::FilePath& download_whitelist_base_filename);
    219 
    220   // Filename for extension blacklist database.
    221   static base::FilePath ExtensionBlacklistDBFilename(
    222       const base::FilePath& extension_blacklist_base_filename);
    223 
    224   // Filename for side-effect free whitelist database.
    225   static base::FilePath SideEffectFreeWhitelistDBFilename(
    226       const base::FilePath& side_effect_free_whitelist_base_filename);
    227 
    228   // Filename for the csd malware IP blacklist database.
    229   static base::FilePath IpBlacklistDBFilename(
    230       const base::FilePath& ip_blacklist_base_filename);
    231 
    232   // Enumerate failures for histogramming purposes.  DO NOT CHANGE THE
    233   // ORDERING OF THESE VALUES.
          // The enumerators have no explicit initializers, so each value is defined
          // by its position in this list; inserting or removing an entry renumbers
          // everything after it.
    234   enum FailureType {
    235     FAILURE_DATABASE_CORRUPT,
    236     FAILURE_DATABASE_CORRUPT_HANDLER,
    237     FAILURE_BROWSE_DATABASE_UPDATE_BEGIN,
    238     FAILURE_BROWSE_DATABASE_UPDATE_FINISH,
    239     FAILURE_DATABASE_FILTER_MISSING_OBSOLETE,
    240     FAILURE_DATABASE_FILTER_READ_OBSOLETE,
    241     FAILURE_DATABASE_FILTER_WRITE_OBSOLETE,
    242     FAILURE_DATABASE_FILTER_DELETE,
    243     FAILURE_DATABASE_STORE_MISSING,
    244     FAILURE_DATABASE_STORE_DELETE,
    245     FAILURE_DOWNLOAD_DATABASE_UPDATE_BEGIN,
    246     FAILURE_DOWNLOAD_DATABASE_UPDATE_FINISH,
    247     FAILURE_WHITELIST_DATABASE_UPDATE_BEGIN,
    248     FAILURE_WHITELIST_DATABASE_UPDATE_FINISH,
    249     FAILURE_BROWSE_PREFIX_SET_MISSING,
    250     FAILURE_BROWSE_PREFIX_SET_READ,
    251     FAILURE_BROWSE_PREFIX_SET_WRITE,
    252     FAILURE_BROWSE_PREFIX_SET_DELETE,
    253     FAILURE_EXTENSION_BLACKLIST_UPDATE_BEGIN,
    254     FAILURE_EXTENSION_BLACKLIST_UPDATE_FINISH,
    255     FAILURE_EXTENSION_BLACKLIST_DELETE,
    256     FAILURE_SIDE_EFFECT_FREE_WHITELIST_UPDATE_BEGIN,
    257     FAILURE_SIDE_EFFECT_FREE_WHITELIST_UPDATE_FINISH,
    258     FAILURE_SIDE_EFFECT_FREE_WHITELIST_DELETE,
    259     FAILURE_SIDE_EFFECT_FREE_WHITELIST_PREFIX_SET_READ,
    260     FAILURE_SIDE_EFFECT_FREE_WHITELIST_PREFIX_SET_WRITE,
    261     FAILURE_SIDE_EFFECT_FREE_WHITELIST_PREFIX_SET_DELETE,
    262     FAILURE_IP_BLACKLIST_UPDATE_BEGIN,
    263     FAILURE_IP_BLACKLIST_UPDATE_FINISH,
    264     FAILURE_IP_BLACKLIST_UPDATE_INVALID,
    265     FAILURE_IP_BLACKLIST_DELETE,
    266 
    267     // Memory space for histograms is determined by the max.  ALWAYS
    268     // ADD NEW VALUES BEFORE THIS ONE.
    269     FAILURE_DATABASE_MAX
    270   };
    271 
          // Records |failure_type|.
          // NOTE(review): presumably reported to a histogram, given the enum comment
          // above -- the definition is not in this header; confirm.
    272   static void RecordFailure(FailureType failure_type);
    273 
    274  private:
    275   // The factory used to instantiate a SafeBrowsingDatabase object.
    276   // Useful for tests, so they can provide their own implementation of
    277   // SafeBrowsingDatabase.  Set by RegisterFactory().
    278   static SafeBrowsingDatabaseFactory* factory_;
    279 };
    280 
        // Implementation of the SafeBrowsingDatabase interface that keeps one
        // SafeBrowsingStore per list (held in the scoped_ptr members declared below).
    281 class SafeBrowsingDatabaseNew : public SafeBrowsingDatabase {
    282  public:
    283   // Create a database with a browse, download, download whitelist and
    284   // csd whitelist store objects. Takes ownership of all the store objects.
    285   // When |download_store| is NULL, the database will ignore any operations
    286   // related download (url hashes and binary hashes).  The same is true for
    287   // the |csd_whitelist_store|, |download_whitelist_store| and
    288   // |ip_blacklist_store|.
          // NOTE(review): |extension_blacklist_store| and
          // |side_effect_free_whitelist_store| are not mentioned above; whether NULL
          // is accepted for them is not stated in this header -- confirm.
    289   SafeBrowsingDatabaseNew(SafeBrowsingStore* browse_store,
    290                           SafeBrowsingStore* download_store,
    291                           SafeBrowsingStore* csd_whitelist_store,
    292                           SafeBrowsingStore* download_whitelist_store,
    293                           SafeBrowsingStore* extension_blacklist_store,
    294                           SafeBrowsingStore* side_effect_free_whitelist_store,
    295                           SafeBrowsingStore* ip_blacklist_store);
    296 
    297   // Create a database with a browse store. This is a legacy interface that
    298   // uses Sqlite.
    299   SafeBrowsingDatabaseNew();
    300 
    301   virtual ~SafeBrowsingDatabaseNew();
    302 
    303   // Implement SafeBrowsingDatabase interface.
    304   virtual void Init(const base::FilePath& filename) OVERRIDE;
    305   virtual bool ResetDatabase() OVERRIDE;
    306   virtual bool ContainsBrowseUrl(
    307       const GURL& url,
    308       std::vector<SBPrefix>* prefix_hits,
    309       std::vector<SBFullHashResult>* cache_hits) OVERRIDE;
    310   virtual bool ContainsDownloadUrl(const std::vector<GURL>& urls,
    311                                    std::vector<SBPrefix>* prefix_hits) OVERRIDE;
    312   virtual bool ContainsCsdWhitelistedUrl(const GURL& url) OVERRIDE;
    313   virtual bool ContainsDownloadWhitelistedUrl(const GURL& url) OVERRIDE;
    314   virtual bool ContainsDownloadWhitelistedString(
    315       const std::string& str) OVERRIDE;
    316   virtual bool ContainsExtensionPrefixes(
    317       const std::vector<SBPrefix>& prefixes,
    318       std::vector<SBPrefix>* prefix_hits) OVERRIDE;
    319   virtual bool ContainsSideEffectFreeWhitelistUrl(const GURL& url)  OVERRIDE;
    320   virtual bool ContainsMalwareIP(const std::string& ip_address) OVERRIDE;
    321   virtual bool UpdateStarted(std::vector<SBListChunkRanges>* lists) OVERRIDE;
    322   virtual void InsertChunks(const std::string& list_name,
    323                             const std::vector<SBChunkData*>& chunks) OVERRIDE;
    324   virtual void DeleteChunks(
    325       const std::vector<SBChunkDelete>& chunk_deletes) OVERRIDE;
    326   virtual void UpdateFinished(bool update_succeeded) OVERRIDE;
    327   virtual void CacheHashResults(
    328       const std::vector<SBPrefix>& prefixes,
    329       const std::vector<SBFullHashResult>& full_hits,
    330       const base::TimeDelta& cache_lifetime) OVERRIDE;
    331 
    332   // See SafeBrowsingDatabase::IsMalwareIPMatchKillSwitchOn().
          // NOTE(review): this comment used to read "Returns the value of
          // malware_kill_switch_", but no such member is declared in this class --
          // confirm what the implementation actually consults.
    333   virtual bool IsMalwareIPMatchKillSwitchOn() OVERRIDE;
    334 
    335   // Returns true if the CSD whitelist has everything whitelisted.
    336   virtual bool IsCsdWhitelistKillSwitchOn() OVERRIDE;
    337 
    338  private:
          // Grant the unit tests access to private members.
    339   friend class SafeBrowsingDatabaseTest;
    340   FRIEND_TEST_ALL_PREFIXES(SafeBrowsingDatabaseTest, HashCaching);
    341 
    342   // A SafeBrowsing whitelist contains a list of whitelisted full-hashes (stored
    343   // in a sorted vector) as well as a boolean flag indicating whether all
    344   // lookups in the whitelist should be considered matches for safety.
          // |first| is the hash vector and |second| is the match-everything flag.
    345   typedef std::pair<std::vector<SBFullHash>, bool> SBWhitelist;
    346 
    347   // This map holds a csd malware IP blacklist which maps a prefix mask
    348   // to a set of hashed blacklisted IP prefixes.  Each IP prefix is a hashed
    349   // IPv6 IP prefix using SHA-1.
    350   typedef std::map<std::string, base::hash_set<std::string> > IPBlacklist;
    351 
    352   // Returns true if the whitelist is disabled or if any of the given hashes
    353   // matches the whitelist.
    354   bool ContainsWhitelistedHashes(const SBWhitelist& whitelist,
    355                                  const std::vector<SBFullHash>& hashes);
    356 
    357   // Return the browse_store_, download_store_, download_whitelist_store_ or
    358   // csd_whitelist_store_ based on list_id.
          // NOTE(review): the other store members declared below are not listed
          // here; whether they can be returned is not visible in this header.
    359   SafeBrowsingStore* GetStore(int list_id);
    360 
    361   // Deletes the files on disk.
    362   bool Delete();
    363 
    364   // Load the prefix set off disk, if available.
    365   void LoadPrefixSet();
    366 
    367   // Writes the current prefix set to disk.
    368   void WritePrefixSet();
    369 
    370   // Loads the given full-length hashes to the given whitelist.  If the number
    371   // of hashes is too large or if the kill switch URL is on the whitelist
    372   // we will whitelist everything.
    373   void LoadWhitelist(const std::vector<SBAddFullHash>& full_hashes,
    374                      SBWhitelist* whitelist);
    375 
    376   // Call this method if an error occurred with the given whitelist.  This will
    377   // result in all lookups to the whitelist to return true.
    378   void WhitelistEverything(SBWhitelist* whitelist);
    379 
    380   // Parses the IP blacklist from the given full-length hashes.
    381   void LoadIpBlacklist(const std::vector<SBAddFullHash>& full_hashes);
    382 
    383   // Helpers for handling database corruption.
    384   // |OnHandleCorruptDatabase()| runs |ResetDatabase()| and sets
    385   // |corruption_detected_|, |HandleCorruptDatabase()| posts
    386   // |OnHandleCorruptDatabase()| to the current thread, to be run
    387   // after the current task completes.
    388   // TODO(shess): Wire things up to entirely abort the update
    389   // transaction when this happens.
    390   void HandleCorruptDatabase();
    391   void OnHandleCorruptDatabase();
    392 
    393   // Helpers for InsertChunks().
          // One handles add chunks and the other sub chunks; both receive the
          // destination |store|, the |list_id| and the |chunk| to insert.
    394   void InsertAddChunk(SafeBrowsingStore* store,
    395                       safe_browsing_util::ListType list_id,
    396                       const SBChunkData& chunk);
    397   void InsertSubChunk(SafeBrowsingStore* store,
    398                       safe_browsing_util::ListType list_id,
    399                       const SBChunkData& chunk);
    400 
    401   // Returns the size in bytes of the store after the update.
          // NOTE(review): |failure_type| is presumably what gets passed to
          // RecordFailure() if the update fails -- confirm.
    402   int64 UpdateHashPrefixStore(const base::FilePath& store_filename,
    403                                SafeBrowsingStore* store,
    404                                FailureType failure_type);
          // Per-store update helpers.
          // NOTE(review): presumably invoked from UpdateFinished(); confirm.
    405   void UpdateBrowseStore();
    406   void UpdateSideEffectFreeWhitelistStore();
    407   void UpdateWhitelistStore(const base::FilePath& store_filename,
    408                             SafeBrowsingStore* store,
    409                             SBWhitelist* whitelist);
    410   void UpdateIpBlacklistStore();
    411 
    412   // Used to verify that various calls are made from the thread the
    413   // object was created on.
    414   base::MessageLoop* creation_loop_;
    415 
    416   // Lock for protecting access to variables that may be used on the
    417   // IO thread.  This includes |prefix_set_|, |cached_browse_hashes_|,
    418   // |prefix_miss_cache_|, |csd_whitelist_|.
          // NOTE(review): no member named |prefix_set_| is declared in this class;
          // this most likely refers to |browse_prefix_set_| -- confirm.
    419   base::Lock lookup_lock_;
    420 
    421   // The base filename passed to Init(), used to generate the store and prefix
    422   // set filenames used to store data on disk.
    423   base::FilePath filename_base_;
    424 
    425   // Underlying persistent store for chunk data.
    426   // For browsing related (phishing and malware URLs) chunks and prefixes.
    427   scoped_ptr<SafeBrowsingStore> browse_store_;
    428 
    429   // For download related (download URL and binary hash) chunks and prefixes.
    430   scoped_ptr<SafeBrowsingStore> download_store_;
    431 
    432   // For the client-side phishing detection whitelist chunks and full-length
    433   // hashes.  This list only contains 256 bit hashes.
    434   scoped_ptr<SafeBrowsingStore> csd_whitelist_store_;
    435 
    436   // For the download whitelist chunks and full-length hashes.  This list only
    437   // contains 256 bit hashes.
    438   scoped_ptr<SafeBrowsingStore> download_whitelist_store_;
    439 
    440   // For extension IDs.
    441   scoped_ptr<SafeBrowsingStore> extension_blacklist_store_;
    442 
    443   // For side-effect free whitelist.
    444   scoped_ptr<SafeBrowsingStore> side_effect_free_whitelist_store_;
    445 
    446   // For IP blacklist.
    447   scoped_ptr<SafeBrowsingStore> ip_blacklist_store_;
    448 
          // In-memory lists of type SBWhitelist (see the typedef above).  Note that
          // |extension_blacklist_| uses the same pair-of-hashes-and-flag type as
          // the two whitelists.
    449   SBWhitelist csd_whitelist_;
    450   SBWhitelist download_whitelist_;
    451   SBWhitelist extension_blacklist_;
    452 
    453   // The IP blacklist should be small.  At most a couple hundred IPs.
    454   IPBlacklist ip_blacklist_;
    455 
    456   // Store items from CacheHashResults(), ordered by hash for efficient
    457   // scanning.  Discarded on next update.
    458   std::vector<SBFullHashCached> cached_browse_hashes_;
    459 
    460   // Cache of prefixes that returned empty results (no full hash
    461   // match) to |CacheHashResults()|.  Cached to prevent asking for
    462   // them every time.  Cleared on next update.
    463   std::set<SBPrefix> prefix_miss_cache_;
    464 
    465   // Used to schedule resetting the database because of corruption.
    466   base::WeakPtrFactory<SafeBrowsingDatabaseNew> reset_factory_;
    467 
    468   // Set if corruption is detected during the course of an update.
    469   // Causes the update functions to fail with no side effects, until
    470   // the next call to |UpdateStarted()|.
    471   bool corruption_detected_;
    472 
    473   // Set to true if any chunks are added or deleted during an update.
    474   // Used to optimize away database update.
    475   bool change_detected_;
    476 
    477   // Used to check if a prefix was in the browse database.
    478   scoped_ptr<safe_browsing::PrefixSet> browse_prefix_set_;
    479 
    480   // NOTE(review): presumably used to check if a prefix was in the side-effect
          // free whitelist; this comment previously duplicated the one on
          // |browse_prefix_set_| -- confirm.
    481   scoped_ptr<safe_browsing::PrefixSet> side_effect_free_whitelist_prefix_set_;
    482 };
    483 
    484 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_DATABASE_H_
    485