Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
      6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
      7 #pragma once
      8 
      9 #include <set>
     10 #include <vector>
     11 
     12 #include "base/basictypes.h"
     13 #include "base/callback.h"
     14 #include "base/hash_tables.h"
     15 #include "base/task.h"
     16 #include "base/time.h"
     17 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
     18 
     19 class FilePath;
     20 
     21 // SafeBrowsingStore provides a storage abstraction for the
     22 // safe-browsing data used to build the bloom filter.  The items
     23 // stored are:
     24 //   The set of add and sub chunks seen.
     25 //   List of SBAddPrefix (chunk_id and SBPrefix).
     26 //   List of SBSubPrefix (chunk_id and the target SBAddPrefix).
     27 //   List of SBAddFullHash (SBAddPrefix, time received and an SBFullHash).
     28 //   List of SBSubFullHash (chunk_id, target SBAddPrefix, and an SBFullHash).
     29 //
     30 // The store is geared towards updating the data, not runtime access
     31 // to the data (that is handled by SafeBrowsingDatabase).  Updates are
     32 // handled similar to a SQL transaction cycle, with the new data being
     33 // returned from FinishUpdate() (the COMMIT).  Data is not persistent
     34 // until FinishUpdate() returns successfully.
     35 //
     36 // FinishUpdate() also handles dropping items who's chunk has been
     37 // deleted, and netting out the add/sub lists (when a sub matches an
     38 // add, both are dropped).
     39 
     40 // GetAddChunkId(), GetAddPrefix() and GetFullHash() are exposed so
     41 // that these items can be generically compared with each other by
     42 // SBAddPrefixLess() and SBAddPrefixHashLess().
     43 
     44 struct SBAddPrefix {
     45   int32 chunk_id;
     46   SBPrefix prefix;
     47 
     48   SBAddPrefix(int32 id, SBPrefix p) : chunk_id(id), prefix(p) {}
     49   SBAddPrefix() : chunk_id(), prefix() {}
     50 
     51   int32 GetAddChunkId() const { return chunk_id; }
     52   SBPrefix GetAddPrefix() const { return prefix; }
     53 };
     54 
     55 struct SBSubPrefix {
     56   int32 chunk_id;
     57   int32 add_chunk_id;
     58   SBPrefix add_prefix;
     59 
     60   SBSubPrefix(int32 id, int32 add_id, int prefix)
     61       : chunk_id(id), add_chunk_id(add_id), add_prefix(prefix) {}
     62   SBSubPrefix() : chunk_id(), add_chunk_id(), add_prefix() {}
     63 
     64   int32 GetAddChunkId() const { return add_chunk_id; }
     65   SBPrefix GetAddPrefix() const { return add_prefix; }
     66 };
     67 
     68 struct SBAddFullHash {
     69   int32 chunk_id;
     70   int32 received;
     71   SBFullHash full_hash;
     72 
     73   SBAddFullHash(int32 id, base::Time r, const SBFullHash& h)
     74       : chunk_id(id),
     75         received(static_cast<int32>(r.ToTimeT())),
     76         full_hash(h) {
     77   }
     78 
     79   // Provided for ReadAddHashes() implementations, which already have
     80   // an int32 for the time.
     81   SBAddFullHash(int32 id, int32 r, const SBFullHash& h)
     82       : chunk_id(id), received(r), full_hash(h) {}
     83 
     84   SBAddFullHash() : chunk_id(), received(), full_hash() {}
     85 
     86   int32 GetAddChunkId() const { return chunk_id; }
     87   SBPrefix GetAddPrefix() const { return full_hash.prefix; }
     88 };
     89 
     90 struct SBSubFullHash {
     91   int32 chunk_id;
     92   int32 add_chunk_id;
     93   SBFullHash full_hash;
     94 
     95   SBSubFullHash(int32 id, int32 add_id, const SBFullHash& h)
     96       : chunk_id(id), add_chunk_id(add_id), full_hash(h) {}
     97   SBSubFullHash() : chunk_id(), add_chunk_id(), full_hash() {}
     98 
     99   int32 GetAddChunkId() const { return add_chunk_id; }
    100   SBPrefix GetAddPrefix() const { return full_hash.prefix; }
    101 };
    102 
    103 // Determine less-than based on add chunk and prefix.
    104 template <class T, class U>
    105 bool SBAddPrefixLess(const T& a, const U& b) {
    106   if (a.GetAddChunkId() != b.GetAddChunkId())
    107     return a.GetAddChunkId() < b.GetAddChunkId();
    108 
    109   return a.GetAddPrefix() < b.GetAddPrefix();
    110 }
    111 
    112 // Determine less-than based on add chunk, prefix, and full hash.
    113 // Prefix can compare differently than hash due to byte ordering,
    114 // so it must take precedence.
    115 template <class T, class U>
    116 bool SBAddPrefixHashLess(const T& a, const U& b) {
    117   if (SBAddPrefixLess(a, b))
    118     return true;
    119 
    120   if (SBAddPrefixLess(b, a))
    121     return false;
    122 
    123   return memcmp(a.full_hash.full_hash, b.full_hash.full_hash,
    124                 sizeof(a.full_hash.full_hash)) < 0;
    125 }
    126 
    127 // Process the lists for subs which knock out adds.  For any item in
    128 // |sub_prefixes| which has a match in |add_prefixes|, knock out the
    129 // matched items from all vectors.  Additionally remove items from
    130 // deleted chunks.
    131 //
    132 // TODO(shess): Since the prefixes are uniformly-distributed hashes,
    133 // there aren't many ways to organize the inputs for efficient
    134 // processing.  For this reason, the vectors are sorted and processed
    135 // in parallel.  At this time this code does the sorting internally,
    136 // but it might make sense to make sorting an API requirement so that
    137 // the storage can optimize for it.
    138 //
    139 // TODO(shess): The original code did not process |sub_full_hashes|
    140 // for matches in |add_full_hashes|, so this code doesn't, either.  I
    141 // think this is probably a bug.
    142 void SBProcessSubs(std::vector<SBAddPrefix>* add_prefixes,
    143                    std::vector<SBSubPrefix>* sub_prefixes,
    144                    std::vector<SBAddFullHash>* add_full_hashes,
    145                    std::vector<SBSubFullHash>* sub_full_hashes,
    146                    const base::hash_set<int32>& add_chunks_deleted,
    147                    const base::hash_set<int32>& sub_chunks_deleted);
    148 
    149 // Records a histogram of the number of items in |prefix_misses| which
    150 // are not in |add_prefixes|.
    151 void SBCheckPrefixMisses(const std::vector<SBAddPrefix>& add_prefixes,
    152                          const std::set<SBPrefix>& prefix_misses);
    153 
    154 // TODO(shess): This uses int32 rather than int because it's writing
    155 // specifically-sized items to files.  SBPrefix should likewise be
    156 // explicitly sized.
    157 
    158 // Abstract interface for storing data.
    159 class SafeBrowsingStore {
    160  public:
    161   SafeBrowsingStore() {}
    162   virtual ~SafeBrowsingStore() {}
    163 
    164   // Sets up the information for later use, but does not necessarily
    165   // check whether the underlying file exists, or is valid.  If
    166   // |curruption_callback| is non-NULL it will be called if corruption
    167   // is detected, which could happen as part of any call other than
    168   // Delete().  The appropriate action is to use Delete() to clear the
    169   // store.
    170   virtual void Init(const FilePath& filename,
    171                     Callback0::Type* corruption_callback) = 0;
    172 
    173   // Deletes the files which back the store, returning true if
    174   // successful.
    175   virtual bool Delete() = 0;
    176 
    177   // Get all Add prefixes out from the store.
    178   virtual bool GetAddPrefixes(std::vector<SBAddPrefix>* add_prefixes) = 0;
    179 
    180   // Get all add full-length hashes.
    181   virtual bool GetAddFullHashes(
    182       std::vector<SBAddFullHash>* add_full_hashes) = 0;
    183 
    184   // Start an update.  None of the following methods should be called
    185   // unless this returns true.  If this returns true, the update
    186   // should be terminated by FinishUpdate() or CancelUpdate().
    187   virtual bool BeginUpdate() = 0;
    188 
    189   // Start a chunk of data.  None of the methods through FinishChunk()
    190   // should be called unless this returns true.
    191   // TODO(shess): Would it make sense for this to accept |chunk_id|?
    192   // Possibly not, because of possible confusion between sub_chunk_id
    193   // and add_chunk_id.
    194   virtual bool BeginChunk() = 0;
    195 
    196   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) = 0;
    197   virtual bool WriteAddHash(int32 chunk_id,
    198                             base::Time receive_time,
    199                             const SBFullHash& full_hash) = 0;
    200   virtual bool WriteSubPrefix(int32 chunk_id,
    201                               int32 add_chunk_id, SBPrefix prefix) = 0;
    202   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
    203                             const SBFullHash& full_hash) = 0;
    204 
    205   // Collect the chunk data and preferrably store it on disk to
    206   // release memory.  Shoul not modify the data in-place.
    207   virtual bool FinishChunk() = 0;
    208 
    209   // Track the chunks which have been seen.
    210   virtual void SetAddChunk(int32 chunk_id) = 0;
    211   virtual bool CheckAddChunk(int32 chunk_id) = 0;
    212   virtual void GetAddChunks(std::vector<int32>* out) = 0;
    213   virtual void SetSubChunk(int32 chunk_id) = 0;
    214   virtual bool CheckSubChunk(int32 chunk_id) = 0;
    215   virtual void GetSubChunks(std::vector<int32>* out) = 0;
    216 
    217   // Delete the indicated chunk_id.  The chunk will continue to be
    218   // visible until the end of the transaction.
    219   virtual void DeleteAddChunk(int32 chunk_id) = 0;
    220   virtual void DeleteSubChunk(int32 chunk_id) = 0;
    221 
    222   // Pass the collected chunks through SBPRocessSubs() and commit to
    223   // permanent storage.  The resulting add prefixes and hashes will be
    224   // stored in |add_prefixes_result| and |add_full_hashes_result|.
    225   // |pending_adds| is the set of full hashes which have been received
    226   // since the previous update, and is provided as a convenience
    227   // (could be written via WriteAddHash(), but that would flush the
    228   // chunk to disk).  |prefix_misses| is the set of prefixes where the
    229   // |GetHash()| request returned no full hashes, used for diagnostic
    230   // purposes.
    231   virtual bool FinishUpdate(
    232       const std::vector<SBAddFullHash>& pending_adds,
    233       const std::set<SBPrefix>& prefix_misses,
    234       std::vector<SBAddPrefix>* add_prefixes_result,
    235       std::vector<SBAddFullHash>* add_full_hashes_result) = 0;
    236 
    237   // Cancel the update in process and remove any temporary disk
    238   // storage, leaving the original data unmodified.
    239   virtual bool CancelUpdate() = 0;
    240 
    241  private:
    242   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStore);
    243 };
    244 
    245 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
    246