Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
      7 
      8 #include <set>
      9 #include <vector>
     10 
     11 #include "base/basictypes.h"
     12 #include "base/callback_forward.h"
     13 #include "base/containers/hash_tables.h"
     14 #include "base/time/time.h"
     15 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
     16 
     17 namespace base {
     18 class FilePath;
     19 }
     20 
     21 // SafeBrowsingStore provides a storage abstraction for the
     22 // safe-browsing data used to build the bloom filter.  The items
     23 // stored are:
     24 //   The set of add and sub chunks seen.
     25 //   List of SBAddPrefix (chunk_id and SBPrefix).
     26 //   List of SBSubPrefix (chunk_id and the target SBAddPrefix).
     27 //   List of SBAddFullHash (SBAddPrefix, time received and an SBFullHash).
     28 //   List of SBSubFullHash (chunk_id, target SBAddPrefix, and an SBFullHash).
     29 //
     30 // The store is geared towards updating the data, not runtime access
     31 // to the data (that is handled by SafeBrowsingDatabase).  Updates are
     32 // handled similar to a SQL transaction cycle, with the new data being
     33 // returned from FinishUpdate() (the COMMIT).  Data is not persistent
     34 // until FinishUpdate() returns successfully.
     35 //
     36 // FinishUpdate() also handles dropping items who's chunk has been
     37 // deleted, and netting out the add/sub lists (when a sub matches an
     38 // add, both are dropped).
     39 
     40 // GetAddChunkId(), GetAddPrefix() and GetFullHash() are exposed so
     41 // that these items can be generically compared with each other by
     42 // SBAddPrefixLess() and SBAddPrefixHashLess().
     43 
     44 struct SBAddPrefix {
     45   int32 chunk_id;
     46   SBPrefix prefix;
     47 
     48   SBAddPrefix(int32 id, SBPrefix p) : chunk_id(id), prefix(p) {}
     49   SBAddPrefix() : chunk_id(), prefix() {}
     50 
     51   int32 GetAddChunkId() const { return chunk_id; }
     52   SBPrefix GetAddPrefix() const { return prefix; }
     53 };
     54 
     55 typedef std::deque<SBAddPrefix> SBAddPrefixes;
     56 
     57 struct SBSubPrefix {
     58   int32 chunk_id;
     59   int32 add_chunk_id;
     60   SBPrefix add_prefix;
     61 
     62   SBSubPrefix(int32 id, int32 add_id, int prefix)
     63       : chunk_id(id), add_chunk_id(add_id), add_prefix(prefix) {}
     64   SBSubPrefix() : chunk_id(), add_chunk_id(), add_prefix() {}
     65 
     66   int32 GetAddChunkId() const { return add_chunk_id; }
     67   SBPrefix GetAddPrefix() const { return add_prefix; }
     68 };
     69 
     70 typedef std::deque<SBSubPrefix> SBSubPrefixes;
     71 
     72 struct SBAddFullHash {
     73   int32 chunk_id;
     74   int32 received;
     75   SBFullHash full_hash;
     76 
     77   SBAddFullHash(int32 id, base::Time r, const SBFullHash& h)
     78       : chunk_id(id),
     79         received(static_cast<int32>(r.ToTimeT())),
     80         full_hash(h) {
     81   }
     82 
     83   // Provided for ReadAddHashes() implementations, which already have
     84   // an int32 for the time.
     85   SBAddFullHash(int32 id, int32 r, const SBFullHash& h)
     86       : chunk_id(id), received(r), full_hash(h) {}
     87 
     88   SBAddFullHash() : chunk_id(), received(), full_hash() {}
     89 
     90   int32 GetAddChunkId() const { return chunk_id; }
     91   SBPrefix GetAddPrefix() const { return full_hash.prefix; }
     92 };
     93 
     94 struct SBSubFullHash {
     95   int32 chunk_id;
     96   int32 add_chunk_id;
     97   SBFullHash full_hash;
     98 
     99   SBSubFullHash(int32 id, int32 add_id, const SBFullHash& h)
    100       : chunk_id(id), add_chunk_id(add_id), full_hash(h) {}
    101   SBSubFullHash() : chunk_id(), add_chunk_id(), full_hash() {}
    102 
    103   int32 GetAddChunkId() const { return add_chunk_id; }
    104   SBPrefix GetAddPrefix() const { return full_hash.prefix; }
    105 };
    106 
    107 // Determine less-than based on add chunk and prefix.
    108 template <class T, class U>
    109 bool SBAddPrefixLess(const T& a, const U& b) {
    110   if (a.GetAddChunkId() != b.GetAddChunkId())
    111     return a.GetAddChunkId() < b.GetAddChunkId();
    112 
    113   return a.GetAddPrefix() < b.GetAddPrefix();
    114 }
    115 
    116 // Determine less-than based on add chunk, prefix, and full hash.
    117 // Prefix can compare differently than hash due to byte ordering,
    118 // so it must take precedence.
    119 template <class T, class U>
    120 bool SBAddPrefixHashLess(const T& a, const U& b) {
    121   if (SBAddPrefixLess(a, b))
    122     return true;
    123 
    124   if (SBAddPrefixLess(b, a))
    125     return false;
    126 
    127   return memcmp(a.full_hash.full_hash, b.full_hash.full_hash,
    128                 sizeof(a.full_hash.full_hash)) < 0;
    129 }
    130 
    131 // Process the lists for subs which knock out adds.  For any item in
    132 // |sub_prefixes| which has a match in |add_prefixes|, knock out the
    133 // matched items from all vectors.  Additionally remove items from
    134 // deleted chunks.
    135 //
    136 // TODO(shess): Since the prefixes are uniformly-distributed hashes,
    137 // there aren't many ways to organize the inputs for efficient
    138 // processing.  For this reason, the vectors are sorted and processed
    139 // in parallel.  At this time this code does the sorting internally,
    140 // but it might make sense to make sorting an API requirement so that
    141 // the storage can optimize for it.
    142 void SBProcessSubs(SBAddPrefixes* add_prefixes,
    143                    SBSubPrefixes* sub_prefixes,
    144                    std::vector<SBAddFullHash>* add_full_hashes,
    145                    std::vector<SBSubFullHash>* sub_full_hashes,
    146                    const base::hash_set<int32>& add_chunks_deleted,
    147                    const base::hash_set<int32>& sub_chunks_deleted);
    148 
    149 // Records a histogram of the number of items in |prefix_misses| which
    150 // are not in |add_prefixes|.
    151 void SBCheckPrefixMisses(const SBAddPrefixes& add_prefixes,
    152                          const std::set<SBPrefix>& prefix_misses);
    153 
    154 // TODO(shess): This uses int32 rather than int because it's writing
    155 // specifically-sized items to files.  SBPrefix should likewise be
    156 // explicitly sized.
    157 
    158 // Abstract interface for storing data.
    159 class SafeBrowsingStore {
    160  public:
    161   SafeBrowsingStore() {}
    162   virtual ~SafeBrowsingStore() {}
    163 
    164   // Sets up the information for later use, but does not necessarily
    165   // check whether the underlying file exists, or is valid.  If
    166   // |curruption_callback| is non-NULL it will be called if corruption
    167   // is detected, which could happen as part of any call other than
    168   // Delete().  The appropriate action is to use Delete() to clear the
    169   // store.
    170   virtual void Init(const base::FilePath& filename,
    171                     const base::Closure& corruption_callback) = 0;
    172 
    173   // Deletes the files which back the store, returning true if
    174   // successful.
    175   virtual bool Delete() = 0;
    176 
    177   // Get all Add prefixes out from the store.
    178   virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) = 0;
    179 
    180   // Get all add full-length hashes.
    181   virtual bool GetAddFullHashes(
    182       std::vector<SBAddFullHash>* add_full_hashes) = 0;
    183 
    184   // Start an update.  None of the following methods should be called
    185   // unless this returns true.  If this returns true, the update
    186   // should be terminated by FinishUpdate() or CancelUpdate().
    187   virtual bool BeginUpdate() = 0;
    188 
    189   // Start a chunk of data.  None of the methods through FinishChunk()
    190   // should be called unless this returns true.
    191   // TODO(shess): Would it make sense for this to accept |chunk_id|?
    192   // Possibly not, because of possible confusion between sub_chunk_id
    193   // and add_chunk_id.
    194   virtual bool BeginChunk() = 0;
    195 
    196   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) = 0;
    197   virtual bool WriteAddHash(int32 chunk_id,
    198                             base::Time receive_time,
    199                             const SBFullHash& full_hash) = 0;
    200   virtual bool WriteSubPrefix(int32 chunk_id,
    201                               int32 add_chunk_id, SBPrefix prefix) = 0;
    202   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
    203                             const SBFullHash& full_hash) = 0;
    204 
    205   // Collect the chunk data and preferrably store it on disk to
    206   // release memory.  Shoul not modify the data in-place.
    207   virtual bool FinishChunk() = 0;
    208 
    209   // Track the chunks which have been seen.
    210   virtual void SetAddChunk(int32 chunk_id) = 0;
    211   virtual bool CheckAddChunk(int32 chunk_id) = 0;
    212   virtual void GetAddChunks(std::vector<int32>* out) = 0;
    213   virtual void SetSubChunk(int32 chunk_id) = 0;
    214   virtual bool CheckSubChunk(int32 chunk_id) = 0;
    215   virtual void GetSubChunks(std::vector<int32>* out) = 0;
    216 
    217   // Delete the indicated chunk_id.  The chunk will continue to be
    218   // visible until the end of the transaction.
    219   virtual void DeleteAddChunk(int32 chunk_id) = 0;
    220   virtual void DeleteSubChunk(int32 chunk_id) = 0;
    221 
    222   // May be called during update to verify that the storage is valid.
    223   // Return true if the store seems valid.  If corruption is detected,
    224   // calls the corruption callback and return false.
    225   // NOTE(shess): When storage was SQLite, there was no guarantee that
    226   // a structurally sound database actually contained valid data,
    227   // whereas SafeBrowsingStoreFile checksums the data.  For now, this
    228   // distinction doesn't matter.
    229   virtual bool CheckValidity() = 0;
    230 
    231   // Pass the collected chunks through SBPRocessSubs() and commit to
    232   // permanent storage.  The resulting add prefixes and hashes will be
    233   // stored in |add_prefixes_result| and |add_full_hashes_result|.
    234   // |pending_adds| is the set of full hashes which have been received
    235   // since the previous update, and is provided as a convenience
    236   // (could be written via WriteAddHash(), but that would flush the
    237   // chunk to disk).  |prefix_misses| is the set of prefixes where the
    238   // |GetHash()| request returned no full hashes, used for diagnostic
    239   // purposes.
    240   virtual bool FinishUpdate(
    241       const std::vector<SBAddFullHash>& pending_adds,
    242       const std::set<SBPrefix>& prefix_misses,
    243       SBAddPrefixes* add_prefixes_result,
    244       std::vector<SBAddFullHash>* add_full_hashes_result) = 0;
    245 
    246   // Cancel the update in process and remove any temporary disk
    247   // storage, leaving the original data unmodified.
    248   virtual bool CancelUpdate() = 0;
    249 
    250  private:
    251   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStore);
    252 };
    253 
    254 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
    255